Skip to content

Latest commit

 

History

History
829 lines (678 loc) · 19.3 KB

example_exploring_border_crossing_data.md

File metadata and controls

829 lines (678 loc) · 19.3 KB

Wrangling DHS Border Crossing Data

  • Author: Chris Albon, @ChrisAlbon

  • Date: -

  • Repo: Python 3 code snippets for data science

  • Note:

    import pandas as pd import os %matplotlib inline import matplotlib.pyplot as plt import numpy as np

    cur_dir = os.path.dirname(os.path.realpath('file'))

    df = pd.read_csv(cur_dir + '/data/cross_raw_data/' + 'bc_full_crossing_data.csv')

    df.head()

Port Name Year Month Trucks Loaded Truck Containers Empty Truck Containers Trains Loaded Rail Containers Empty Rail Containers Train Passengers Buses Bus Passengers Personal Vehicles Personal Vehicle Passengers Pedestrians City State
0 AZ: Douglas ... 1995 01 - JAN 2890 0 0 0 0 0 0 270 270 164803 379047 40735 Douglas ... AZ
1 AZ: Douglas ... 1995 02 - FEB 2849 0 0 0 0 0 0 270 270 146925 337926 40463 Douglas ... AZ
2 AZ: Douglas ... 1995 03 - MAR 3421 0 0 0 0 0 0 270 270 139060 319838 55370 Douglas ... AZ
3 AZ: Douglas ... 1995 04 - APR 3676 0 0 0 0 0 0 270 270 144163 331575 46279 Douglas ... AZ
4 AZ: Douglas ... 1995 05 - MAY 3560 0 0 0 0 0 0 270 270 154597 355573 50160 Douglas ... AZ
df['Port Name'][1]




'AZ: Douglas                                                 '




df_douglas = df[df['Port Name'] == 'AZ: Douglas                                                 '][:]


df_douglas.head()
Port Name Year Month Trucks Loaded Truck Containers Empty Truck Containers Trains Loaded Rail Containers Empty Rail Containers Train Passengers Buses Bus Passengers Personal Vehicles Personal Vehicle Passengers Pedestrians City State
0 AZ: Douglas ... 1995 01 - JAN 2890 0 0 0 0 0 0 270 270 164803 379047 40735 Douglas ... AZ
1 AZ: Douglas ... 1995 02 - FEB 2849 0 0 0 0 0 0 270 270 146925 337926 40463 Douglas ... AZ
2 AZ: Douglas ... 1995 03 - MAR 3421 0 0 0 0 0 0 270 270 139060 319838 55370 Douglas ... AZ
3 AZ: Douglas ... 1995 04 - APR 3676 0 0 0 0 0 0 270 270 144163 331575 46279 Douglas ... AZ
4 AZ: Douglas ... 1995 05 - MAY 3560 0 0 0 0 0 0 270 270 154597 355573 50160 Douglas ... AZ
df_douglas['Total Crossers'] = df_douglas['Train Passengers'] + df_douglas['Bus Passengers'] + df_douglas['Personal Vehicle Passengers'] + df_douglas['Pedestrians']


df_douglas_annual_sum = df_douglas.groupby(df_douglas['Year']).sum()


df_douglas_annual_sum.head()
Trucks Loaded Truck Containers Empty Truck Containers Trains Loaded Rail Containers Empty Rail Containers Train Passengers Buses Bus Passengers Personal Vehicles Personal Vehicle Passengers Pedestrians Total Crossers
Year
1995 36272 0 0 0 0 0 0 3249 3249 1827277 4202735 567030 4773014
1996 38089 8703 13811 0 0 0 0 3353 3433 1915119 4404773 547742 4955948
1997 35718 10186 13119 0 0 0 0 3651 3651 1991904 4803469 599082 5406202
1998 35656 14952 14106 0 49 57 0 3650 3650 2028032 5577088 641181 6221919
1999 32568 14745 11720 0 0 0 0 3650 3650 2150092 5912753 704973 6621376
df_douglas_annual_sum['Bus Passengers'].plot(kind='area', grid=False)




<matplotlib.axes._subplots.AxesSubplot at 0x1083fd410>

png

df_douglas_annual_sum['Personal Vehicle Passengers'].plot(kind='area', grid=False)




<matplotlib.axes._subplots.AxesSubplot at 0x1087cfe50>

png

df_douglas_annual_sum['Pedestrians'].plot(kind='area', grid=False)




<matplotlib.axes._subplots.AxesSubplot at 0x1088b8590>

png

# Create a figure with a single subplot
f, ax = plt.subplots(1, figsize=(10,5))

# Set bar width at 1
bar_width = 1

# positions of the left bar-boundaries
bar_l = [i for i in range(len(df_douglas_annual_sum['Bus Passengers']))] 

# positions of the x-axis ticks (center of the bars as bar labels)
tick_pos = [i+(bar_width/2) for i in bar_l] 

# Create the total score for each participant
totals = [i+j+k for i,j,k in zip(df_douglas_annual_sum['Bus Passengers'], 
                                 df_douglas_annual_sum['Personal Vehicle Passengers'], 
                                 df_douglas_annual_sum['Pedestrians'])]

# Create the percentage of the total score the pre_score value for each participant was
pre_rel = [i / j * 100 for  i,j in zip(df_douglas_annual_sum['Bus Passengers'], totals)]

# Create the percentage of the total score the mid_score value for each participant was
mid_rel = [i / j * 100 for  i,j in zip(df_douglas_annual_sum['Personal Vehicle Passengers'], totals)]

# Create the percentage of the total score the post_score value for each participant was
post_rel = [i / j * 100 for  i,j in zip(df_douglas_annual_sum['Pedestrians'], totals)]

# Create a bar chart in position bar_1
ax.bar(bar_l, 
       # using pre_rel data
       pre_rel, 
       # labeled 
       label='Bus Passengers', 
       # with alpha
       alpha=0.9, 
       # with color
       color='#019600',
       # with bar width
       width=bar_width,
       # with border color
       edgecolor='white'
       )

# Create a bar chart in position bar_1
ax.bar(bar_l, 
       # using mid_rel data
       mid_rel, 
       # with pre_rel
       bottom=pre_rel, 
       # labeled 
       label='Personal Vehicle Passengers', 
       # with alpha
       alpha=0.9, 
       # with color
       color='#3C5F5A', 
       # with bar width
       width=bar_width,
       # with border color
       edgecolor='white'
       )

# Create a bar chart in position bar_1
ax.bar(bar_l, 
       # using post_rel data
       post_rel, 
       # with pre_rel and mid_rel on bottom
       bottom=[i+j for i,j in zip(pre_rel, mid_rel)], 
       # labeled 
       label='Pedestrians',
       # with alpha
       alpha=0.9, 
       # with color
       color='#219AD8', 
       # with bar width
       width=bar_width,
       # with border color
       edgecolor='white'
       )

# Set the ticks to be first names
plt.xticks(tick_pos, df_douglas_annual_sum.index)
ax.set_ylabel("Percent Of Total")
ax.set_xlabel("Year")

# Let the borders of the graphic
plt.xlim([min(tick_pos)-bar_width, max(tick_pos)+bar_width])
plt.ylim(-10, 110)

# rotate axis labels
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')

# shot plot
plt.show()

png

# Create a figure with a single subplot
f, ax = plt.subplots(1, figsize=(10,5))

# Create a scatterplot of,
            # x axis: Latency, and 
plt.bar(df_douglas_annual_sum.index, 
            # y axis: Download, with
            df_douglas_annual_sum['Personal Vehicle Passengers'], 
            # the color assigned as blue
            color='b')

plt.bar(df_douglas_annual_sum.index, 
            # y axis: Download, with
            df_douglas_annual_sum['Pedestrians'], 
            # the color assigned as blue
            color='r')

plt.bar(df_douglas_annual_sum.index, 
            # y axis: Download, with
            df_douglas_annual_sum['Bus Passengers'], 
            # the color assigned as blue
            color='g')

# Chart title
plt.title('Types Of Border Crossers')

# y label
plt.ylabel('Total People')

# x label
plt.xlabel('Year')




<matplotlib.text.Text at 0x109016e10>

png

df.columns




Index(['Port Name', 'Year', 'Month', 'Trucks', 'Loaded Truck Containers', 'Empty Truck Containers', 'Trains', 'Loaded Rail Containers', 'Empty Rail Containers', 'Train Passengers', 'Buses', 'Bus Passengers', 'Personal Vehicles', 'Personal Vehicle Passengers', 'Pedestrians', 'City', 'State'], dtype='object')




df_crossing = df.groupby('Port Name').sum()


df_crossing.sort('Trains', ascending=False).head()
Year Trucks Loaded Truck Containers Empty Truck Containers Trains Loaded Rail Containers Empty Rail Containers Train Passengers Buses Bus Passengers Personal Vehicles Personal Vehicle Passengers Pedestrians
Port Name
TX: Laredo 456912 27331291 16555501 9484814 62322 3060584 2141567 0 670433 13725607 117969118 273240177 82942171
TX: Eagle Pass 456912 1774624 1054386 676904 31437 803534 1365715 83048 42050 340218 57603495 136747990 12947972
TX: El Paso 456912 13050522 6504160 5548367 23793 660056 842385 49831 311136 5624071 261185949 567649753 128299884
TX: Brownsville 456912 4503798 2249740 2031792 14299 155970 1243865 2592 180687 1591315 119779453 263794440 55892176
AZ: Nogales 456912 5076159 3812053 1044586 11647 539083 357639 35902 136006 2464601 65971957 164483990 97174209
df_crossing['container_total'] = df_crossing['Loaded Truck Containers'] + df_crossing['Empty Truck Containers']

df_crossing = df_crossing.sort(columns='container_total')


# input data, specifically the second and 
# third rows, skipping the first column
x1 = df_crossing['Loaded Truck Containers']
x2 = df_crossing['Empty Truck Containers']

# Create the bar labels
bar_labels = df_crossing.index

# Create a figure
fig = plt.figure(figsize=(10,8))

# Set the y position
y_pos = np.arange(len(x1))
y_pos = [x for x in y_pos]
plt.yticks(y_pos, bar_labels, fontsize=10)

# Create a horizontal bar in the position y_pos
plt.barh(y_pos, 
         # using x1 data
         x1, 
         # that is centered
         align='center', 
         # with alpha 0.4
         alpha=0.4, 
         # and color green
         color='#263F13')

# Create a horizontal bar in the position y_pos
plt.barh(y_pos, 
         # using NEGATIVE x2 data
         -x2,
         # that is centered
         align='center', 
         # with alpha 0.4
         alpha=0.4, 
         # and color green
         color='#77A61D')

# annotation and labels
plt.xlabel('Empty Containers: Light Green. Full Containers: Dark Green')
t = plt.title('Comparison of Empty And Full Truck Containers Per Border Crossing')
plt.ylim([-1,len(x1)+0.1])
plt.grid()

plt.show()

png