import os
from six.moves import urllib
import pandas as pd
import numpy as np
# Location of the hosted dataset and the local directory it is cached in.
DOWNLOAD_ROOT = "http://s3.amazonaws.com/gamma-datasets/"
TRAFFIC_PATH = "datasets"
TRAFFIC_URL = DOWNLOAD_ROOT + "Speed_Camera_Violations.csv"


def fetch_traffic_data(traffic_url=TRAFFIC_URL, traffic_path=TRAFFIC_PATH):
    """Download the speed-camera CSV and load it into a DataFrame.

    The file at ``traffic_url`` is saved as
    ``<traffic_path>/Speed_Camera_Violations.csv`` (the directory is created
    when missing, and the file is downloaded again on every call), then read
    with pandas.

    Args:
        traffic_url: URL of the CSV file to download.
        traffic_path: Local directory the CSV is written to.

    Returns:
        A DataFrame whose index is the CSV column at position 2, parsed as
        dates.

    Raises:
        OSError: If ``traffic_path`` cannot be created as a directory.
    """
    # Create the directory without a separate existence check: a prior
    # isdir() test can race with another process creating the same path.
    # (try/except rather than exist_ok=True keeps Python 2 compatibility.)
    try:
        os.makedirs(traffic_path)
    except OSError:
        if not os.path.isdir(traffic_path):
            raise
    csv_path = os.path.join(traffic_path, "Speed_Camera_Violations.csv")
    urllib.request.urlretrieve(traffic_url, csv_path)
    # Column 2 is both parsed as a date and used as the index.
    return pd.read_csv(csv_path, parse_dates=[2], index_col=[2])
# Load the violations, indexed by violation date, and order them by date.
traffic = fetch_traffic_data()
traffic.sort_index(inplace=True)
# Per-date mean of the numeric columns. numeric_only=True skips the text
# columns, which pandas >= 2.0 no longer drops silently.
traffic.groupby(level=0).mean(numeric_only=True)

# Keep a date-indexed copy for the time-based analysis below: set_index()
# replaces the date index on `traffic`, and copying avoids downloading the
# same file a second time.
traffic2 = traffic.copy()

# Per-camera mean of the numeric columns.
traffic.set_index("CAMERA ID", inplace=True)
traffic.sort_index(inplace=True)
traffic.groupby(level=0).mean(numeric_only=True)

import datetime as dt  # not used in this section; kept in case later code relies on `dt`
traffic2.sort_index(inplace=True)
# For future reference: traffic2.index.strftime('%b %Y') gives month and year.
# strftime codes -- %A: weekday, %B: month, %d: date, %Y: year, %m: month
traffic2["weekday"] = traffic2.index.strftime('%A')
traffic2.groupby(['weekday']).sum(numeric_only=True)
# Did the number of unique cameras increase? It seems so, by a marginal number.
traffic2["Year"] = traffic2.index.strftime('%Y')
# Other aggregations are possible here, e.g. 'min', 'max', 'count', 'nunique'.
traffic2.groupby('Year')["CAMERA ID"].nunique().sort_index()
# Any interesting pattern? The number of violations increased significantly in
# 2015 and then began to decrease. However, the number of violations is still
# much larger than it was in 2014. The bottom figure shows the locations of
# traffic violations. The size of each circle represents the number of
# violations, and its color highlights the date of the violations: the darker
# the circle, the newer the violation. The fact that most of the circles are
# dark means that the majority of past violations are a subset of recent
# violations. A few circles with a lighter boundary suggest that, at least in
# some regions, the number of violations has decreased.
# Any interesting pattern? Let's find out
traffic2.info()
traffic2.describe()

# Enable inline figures when running under IPython/Jupyter; the bare
# `%matplotlib inline` magic is not valid syntax in a plain Python run.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass
import matplotlib
import matplotlib.pyplot as plt

# Histogram of the violation count recorded per row.
location_plot = traffic2['VIOLATIONS'].hist(bins=20)
location_plot.set_title("Frequency and Number of Violations")
location_plot.set_xlabel("Number of Violations per Case")
location_plot.set_ylabel("Frequency of Cases")

# Yearly totals of the numeric columns. The Year column is (re)derived here
# so this section does not depend on it having been added earlier.
traffic2["Year"] = traffic2.index.strftime('%Y')
traffic2.groupby(['Year']).sum(numeric_only=True)

# Round-trips the index through float values and back to datetimes; the
# result is only displayed, never stored.
# NOTE(review): presumably a check of the numeric values behind the colour
# scale used below -- confirm whether this is still needed.
pd.to_datetime(traffic2.index.values.astype(float))

# Map of violations: marker size is the violation count, colour is the
# index value (the violation date).
traffic2.plot(
    kind="scatter",
    x="LONGITUDE",
    y="LATITUDE",
    alpha=0.4,
    label="Traffic Violations per Case",
    figsize=(10, 7),
    s=traffic2["VIOLATIONS"],
    c=traffic2.index,
    cmap=plt.get_cmap('Reds'),
    colorbar=True,
    sharex=False,
)