Data Processing Exercise

In [15]:
import os
from six.moves import urllib
import pandas as pd
import numpy as np

DOWNLOAD_ROOT = "http://s3.amazonaws.com/gamma-datasets/"
TRAFFIC_PATH = "datasets"
TRAFFIC_URL = DOWNLOAD_ROOT + "Speed_Camera_Violations.csv"


## Use following codes for fetching data

def fetch_traffic_data(traffic_url=TRAFFIC_URL, traffic_path=TRAFFIC_PATH):
    if not os.path.isdir(traffic_path):
        os.makedirs(traffic_path)
    csv_path = os.path.join(traffic_path, "Speed_Camera_Violations.csv")
    urllib.request.urlretrieve(traffic_url, csv_path)

    return pd.read_csv(csv_path, parse_dates=[2], index_col=[2])
In [16]:
traffic = fetch_traffic_data()

What is the average citation per date

In [17]:
traffic.sort_index(inplace=True)
traffic.groupby(level=0).mean()
Out[17]:
VIOLATIONS X COORDINATE Y COORDINATE LATITUDE LONGITUDE
VIOLATION DATE
2014-07-01 52.168421 1.157331e+06 1.895851e+06 41.869966 -87.697890
2014-07-02 43.228261 1.157172e+06 1.896513e+06 41.871785 -87.698460
2014-07-03 51.617021 1.157533e+06 1.896956e+06 41.872993 -87.697119
2014-07-04 59.596774 1.159785e+06 1.900856e+06 41.883649 -87.688748
2014-07-05 55.380952 1.160100e+06 1.900902e+06 41.883770 -87.687592
2014-07-06 59.983333 1.159726e+06 1.901113e+06 41.884357 -87.688963
2014-07-07 49.237113 1.157365e+06 1.895910e+06 41.870129 -87.697765
2014-07-08 48.316327 1.157691e+06 1.895995e+06 41.870354 -87.696563
2014-07-09 49.824742 1.157944e+06 1.895733e+06 41.869632 -87.695640
2014-07-10 47.309278 1.157290e+06 1.897096e+06 41.873383 -87.698004
2014-07-11 51.686747 1.158252e+06 1.894460e+06 41.866131 -87.694548
2014-07-12 42.203390 1.160162e+06 1.899432e+06 41.879735 -87.687409
2014-07-13 57.645161 1.160090e+06 1.900417e+06 41.882440 -87.687643
2014-07-14 46.145833 1.157698e+06 1.894575e+06 41.866455 -87.696585
2014-07-15 46.857143 1.158040e+06 1.895050e+06 41.867752 -87.695312
2014-07-16 48.646465 1.158067e+06 1.895417e+06 41.868761 -87.695204
2014-07-17 48.103093 1.158137e+06 1.895128e+06 41.867964 -87.694953
2014-07-18 52.376471 1.158751e+06 1.894366e+06 41.865862 -87.692724
2014-07-19 58.274194 1.160721e+06 1.899518e+06 41.879958 -87.685355
2014-07-20 52.904762 1.160850e+06 1.900023e+06 41.881340 -87.684866
2014-07-21 49.322222 1.159274e+06 1.895277e+06 41.868352 -87.690772
2014-07-22 50.811111 1.159185e+06 1.895274e+06 41.868345 -87.691100
2014-07-23 51.318681 1.159495e+06 1.895639e+06 41.869340 -87.689954
2014-07-24 52.321839 1.159703e+06 1.896104e+06 41.870612 -87.689177
2014-07-25 48.236842 1.159034e+06 1.896310e+06 41.871189 -87.691635
2014-07-26 56.777778 1.161149e+06 1.899982e+06 41.881223 -87.683770
2014-07-27 57.903226 1.160853e+06 1.899920e+06 41.881059 -87.684861
2014-07-28 46.666667 1.159703e+06 1.896104e+06 41.870612 -87.689177
2014-07-29 44.534247 1.160348e+06 1.898468e+06 41.877085 -87.686749
2014-07-30 47.000000 1.160617e+06 1.898516e+06 41.877210 -87.685761
... ... ... ... ... ...
2018-11-24 35.171053 1.159490e+06 1.894617e+06 41.866529 -87.690028
2018-11-25 22.191781 1.158107e+06 1.898191e+06 41.876368 -87.694995
2018-11-26 8.547009 1.159749e+06 1.893199e+06 41.862637 -87.689100
2018-11-27 14.200000 1.159642e+06 1.891924e+06 41.859141 -87.689530
2018-11-28 15.267606 1.159012e+06 1.891869e+06 41.859001 -87.691843
2018-11-29 18.236111 1.158425e+06 1.893154e+06 41.862540 -87.693959
2018-11-30 20.423841 1.158600e+06 1.892246e+06 41.860047 -87.693342
2018-12-01 20.613333 1.159016e+06 1.895342e+06 41.868530 -87.691742
2018-12-02 25.848101 1.159554e+06 1.895372e+06 41.868602 -87.689768
2018-12-03 18.751724 1.158749e+06 1.892040e+06 41.859477 -87.692801
2018-12-04 18.297872 1.159058e+06 1.892202e+06 41.859915 -87.691666
2018-12-05 17.839161 1.158522e+06 1.893111e+06 41.862422 -87.693607
2018-12-06 18.197183 1.158819e+06 1.892125e+06 41.859710 -87.692544
2018-12-07 19.897436 1.159356e+06 1.893968e+06 41.864754 -87.690526
2018-12-08 34.746835 1.159553e+06 1.895374e+06 41.868605 -87.689775
2018-12-09 32.975000 1.159798e+06 1.895477e+06 41.868884 -87.688871
2018-12-10 18.577586 1.159191e+06 1.893608e+06 41.863769 -87.691146
2018-12-11 17.264957 1.159350e+06 1.894265e+06 41.865570 -87.690540
2018-12-12 17.478632 1.158634e+06 1.894483e+06 41.866183 -87.693162
2018-12-13 16.660714 1.158779e+06 1.895139e+06 41.867979 -87.692615
2018-12-14 17.464912 1.158872e+06 1.895000e+06 41.867596 -87.692276
2018-12-15 32.986842 1.159668e+06 1.894936e+06 41.867400 -87.689364
2018-12-16 35.184211 1.159870e+06 1.894878e+06 41.867237 -87.688627
2018-12-17 19.377193 1.158933e+06 1.892744e+06 41.861405 -87.692111
2018-12-18 18.218487 1.159433e+06 1.895158e+06 41.868017 -87.690214
2018-12-19 19.854545 1.159134e+06 1.893288e+06 41.862893 -87.691366
2018-12-20 16.330275 1.157975e+06 1.894762e+06 41.866962 -87.695577
2018-12-21 18.811321 1.157903e+06 1.896157e+06 41.870791 -87.695799
2018-12-22 34.670886 1.159684e+06 1.895026e+06 41.867647 -87.689303
2018-12-23 26.040541 1.158972e+06 1.896172e+06 41.870809 -87.691882

1637 rows × 5 columns

Let's see the average citation per camera

In [18]:
traffic.set_index("CAMERA ID", inplace=True)
traffic.sort_index(inplace=True)
traffic.groupby(level=0).mean()
Out[18]:
VIOLATIONS X COORDINATE Y COORDINATE LATITUDE LONGITUDE
CAMERA ID
CHI003 116.335859 1.147853e+06 1.934275e+06 41.975605 -87.731670
CHI004 34.781192 1.148759e+06 1.933818e+06 41.974333 -87.728347
CHI005 12.046569 1.163048e+06 1.878843e+06 41.823189 -87.677349
CHI007 59.697837 1.161038e+06 1.878964e+06 41.823564 -87.684721
CHI008 19.142241 1.151781e+06 1.898395e+06 41.877071 -87.718168
CHI009 46.374004 1.151845e+06 1.899805e+06 41.880938 -87.717898
CHI010 55.978475 NaN NaN NaN NaN
CHI011 15.503542 1.155438e+06 1.906531e+06 41.899325 -87.704522
CHI013 43.584568 1.182453e+06 1.869712e+06 41.797704 -87.606445
CHI014 58.330670 1.156276e+06 1.858583e+06 41.767732 -87.702738
CHI015 13.998763 1.154378e+06 1.857352e+06 41.764391 -87.709728
CHI018 41.265596 1.156183e+06 1.859145e+06 41.769276 -87.703063
CHI019 40.185117 1.156992e+06 1.894531e+06 41.866364 -87.699143
CHI020 24.795818 1.157040e+06 1.894612e+06 41.866585 -87.698962
CHI021 116.066912 1.157136e+06 1.892362e+06 41.860408 -87.698672
CHI022 9.982166 1.167029e+06 1.830594e+06 41.690702 -87.664122
CHI023 20.836609 1.166994e+06 1.830711e+06 41.691025 -87.664248
CHI024 43.347826 1.154183e+06 1.939669e+06 41.990282 -87.708245
CHI025 17.594213 1.155158e+06 1.934429e+06 41.975884 -87.704801
CHI026 29.622984 1.159788e+06 1.923418e+06 41.945574 -87.688078
CHI027 42.721631 1.159704e+06 1.923559e+06 41.945963 -87.688384
CHI028 67.848101 1.158541e+06 1.923785e+06 41.946608 -87.692652
CHI029 77.253676 1.180985e+06 1.868165e+06 41.793493 -87.611876
CHI030 24.646341 1.150574e+06 1.871490e+06 41.803265 -87.723304
CHI031 58.457074 1.150513e+06 1.870811e+06 41.801402 -87.723545
CHI032 21.004773 1.150881e+06 1.871374e+06 41.802939 -87.722182
CHI033 20.791209 1.176498e+06 1.897411e+06 41.873850 -87.627448
CHI034 15.169788 1.176418e+06 1.897401e+06 41.873824 -87.627741
CHI035 5.924515 1.178587e+06 1.841967e+06 41.721657 -87.621463
CHI036 11.588529 1.178616e+06 1.842048e+06 41.721878 -87.621354
... ... ... ... ... ...
CHI154 4.730534 1.161996e+06 1.836307e+06 41.706487 -87.682391
CHI156 16.366848 1.132581e+06 1.938004e+06 41.986119 -87.787741
CHI157 8.686924 1.149395e+06 1.911608e+06 41.913375 -87.726589
CHI158 29.301351 1.131748e+06 1.936860e+06 41.982995 -87.790832
CHI159 12.723206 1.148867e+06 1.910282e+06 41.909746 -87.728563
CHI160 30.117093 1.149437e+06 1.894438e+06 41.866257 -87.726879
CHI161 12.437132 1.149006e+06 1.910365e+06 41.909972 -87.728050
CHI162 18.089552 1.149923e+06 1.894858e+06 41.867402 -87.725084
CHI163 20.076613 1.149841e+06 1.894931e+06 41.867603 -87.725383
CHI164 6.827181 1.155954e+06 1.867614e+06 41.792520 -87.703676
CHI165 2.479060 1.161996e+06 1.836227e+06 41.706268 -87.682391
CHI166 1.718563 1.155768e+06 1.867930e+06 41.793392 -87.704348
CHI167 2.097500 1.155816e+06 1.868012e+06 41.793615 -87.704170
CHI168 26.908591 1.167170e+06 1.849194e+06 41.741742 -87.663073
CHI169 32.552524 1.167083e+06 1.849460e+06 41.742474 -87.663385
CHI170 6.366040 1.167695e+06 1.849654e+06 41.742993 -87.661138
CHI171 4.818966 1.165145e+06 1.920791e+06 41.938253 -87.668463
CHI172 14.719225 1.165058e+06 1.920991e+06 41.938805 -87.668779
CHI173 40.184932 1.161334e+06 1.871365e+06 41.802706 -87.683845
CHI174 114.339130 1.121895e+06 1.922706e+06 41.944319 -87.827378
CHI175 8.948276 1.121638e+06 1.922907e+06 41.944874 -87.828317
CHI176 11.000000 1.122009e+06 1.923002e+06 41.945129 -87.826952
CHI177 2.961538 1.149783e+06 1.918290e+06 41.931705 -87.724990
CHI178 14.756098 1.148258e+06 1.873075e+06 41.807658 -87.731755
CHI179 5.363636 1.148661e+06 1.873165e+06 41.807896 -87.730276
CHI180 29.145833 1.183268e+06 1.857719e+06 41.764775 -87.603828
CHI181 27.836735 1.182997e+06 1.858058e+06 41.765711 -87.604810
CHI182 58.250000 1.166307e+06 1.908050e+06 41.903268 -87.664557
CHI183 24.083333 1.166381e+06 1.908132e+06 41.903490 -87.664285
CHI184 74.446809 1.182951e+06 1.857908e+06 41.765301 -87.604985

162 rows × 5 columns

Friday gave nightmare for cameras

In [19]:
traffic2 = fetch_traffic_data()
In [20]:
import datetime as dt
traffic2.sort_index(inplace=True)
# for future reference: below gives month and year
# traffic.index.to_series().apply(lambda x: dt.datetime.strftime(x, '%b %Y')) 
# A: weekday, B: month, d: date, Y: year, m: month

traffic2["weekday"] = traffic2.index.to_series().apply(lambda x: dt.datetime.strftime(x, '%A')) 
traffic2.groupby(['weekday']).sum()
Out[20]:
VIOLATIONS X COORDINATE Y COORDINATE LATITUDE LONGITUDE
weekday
Friday 753468 2.969780e+10 4.849572e+10 1.072352e+06 -2.246282e+06
Monday 686724 2.976620e+10 4.860353e+10 1.074738e+06 -2.251272e+06
Saturday 676715 1.893861e+10 3.090150e+10 6.827700e+05 -1.429940e+06
Sunday 663801 1.892335e+10 3.087836e+10 6.822298e+05 -1.428801e+06
Thursday 730603 3.063207e+10 5.002385e+10 1.106136e+06 -2.317053e+06
Tuesday 708276 3.147777e+10 5.140274e+10 1.136724e+06 -2.381161e+06
Wednesday 705136 3.070936e+10 5.014615e+10 1.108929e+06 -2.322930e+06
In [21]:
# Did the number of unique camera increase? it seems yes by a marginal number
traffic2["Year"] = traffic2.index.to_series().apply(lambda x: dt.datetime.strftime(x, '%Y'))
traffic2.groupby(['Year']).agg('nunique')["CAMERA ID"].sort_index #['min', 'max', 'count', 'nunique'] possible
Out[21]:
<bound method Series.sort_index of Year
2014    143
2015    150
2016    150
2017    150
2018    162
Name: CAMERA ID, dtype: int64>

Any interesing pattern? Number of violations significantly increased in 2015 and began to decrease. However, still the number of violations is much larger than it was the case in 2014. The bottom figure shows the location of traffic violations. The size of the circle represents the number of violations and the color highlights the date of violations. The darker the circle is, the newer is the violation. The fact that most of the circles are dark means that the majority of past violations are a subset of recent violations. A few circles with lighter boundary suggests that, at least in some regions, the number of violation has decreased.

In [22]:
# Any interesting pattern? Let's find out
traffic2.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 170521 entries, 2014-07-01 to 2018-12-23
Data columns (total 10 columns):
ADDRESS         170521 non-null object
CAMERA ID       170521 non-null object
VIOLATIONS      170521 non-null int64
X COORDINATE    163959 non-null float64
Y COORDINATE    163959 non-null float64
LATITUDE        163959 non-null float64
LONGITUDE       163959 non-null float64
LOCATION        163959 non-null object
weekday         170521 non-null object
Year            170521 non-null object
dtypes: float64(4), int64(1), object(5)
memory usage: 19.3+ MB
In [23]:
traffic2.describe()
Out[23]:
VIOLATIONS X COORDINATE Y COORDINATE LATITUDE LONGITUDE
count 170521.000000 1.639590e+05 1.639590e+05 163959.000000 163959.000000
mean 28.880449 1.159712e+06 1.893473e+06 41.863389 -87.689229
std 36.636151 1.570560e+04 3.212393e+04 0.088338 0.057135
min 1.000000 1.121638e+06 1.820629e+06 41.663174 -87.828317
25% 7.000000 1.149783e+06 1.868078e+06 41.793493 -87.725084
50% 16.000000 1.159169e+06 1.898488e+06 41.877243 -87.689803
75% 37.000000 1.167170e+06 1.920991e+06 41.939040 -87.662810
max 479.000000 1.203645e+06 1.943342e+06 42.000260 -87.529848
In [24]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
location_plot = traffic2['VIOLATIONS'].hist(bins=20)
location_plot.set_title("Frequency and Number of Violations")
location_plot.set_xlabel("Number of Violations per Case")
location_plot.set_ylabel("Frequency of Cases")
Out[24]:
Text(0,0.5,'Frequency of Cases')
In [53]:
traffic2["Year"] = traffic2.index.to_series().apply(lambda x: dt.datetime.strftime(x, '%Y')) 
traffic2.groupby(['Year']).sum()
Out[53]:
VIOLATIONS X COORDINATE Y COORDINATE LATITUDE LONGITUDE
Year
2014 659424 1.892601e+10 3.093007e+10 6.832156e+05 -1.430905e+06
2015 1201698 4.221439e+10 6.889669e+10 1.523463e+06 -3.191149e+06
2016 1116805 4.352645e+10 7.105770e+10 1.571194e+06 -3.291152e+06
2017 1015490 4.276128e+10 6.981971e+10 1.543780e+06 -3.233736e+06
2018 931306 4.271703e+10 6.974767e+10 1.542226e+06 -3.230496e+06
In [55]:
pd.to_datetime(traffic2.index.values.astype(float))
traffic2.plot(kind="scatter", x="LONGITUDE", y="LATITUDE", alpha=0.4, label="Traffic Violations per Case", 
              figsize=(10,7),s = traffic2["VIOLATIONS"], c=traffic2.index, cmap=plt.get_cmap('Reds'),
              colorbar=True, sharex=False)
Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cf12320>