This notebook contains the code necessary to create the SFPD Incident and Crime Rate Reporter.
import pandas as pd
from sodapy import Socrata
import folium
import json
import plotly.express as px
from datetime import datetime, timedelta
import sys
sys.path.append("..")
# The config file is hidden since it contains the api key
# needed to get data from the sfpd database
import src.config as config
Via data.sfgov.org, we can access many databases, including the SFPD database, which records all incidents occurring in the different police districts. To query it, we need to instantiate our client:
client = Socrata("data.sfgov.org",
config.api_key,
username=config.username,
password=config.password)
We can now get the latest 15,000 reports:
results = client.get("wg3w-h783", limit=15000, order="incident_date DESC")
wg3w-h783 is the database ID.
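The same endpoint also accepts SoQL parameters as keyword arguments, so narrower queries are possible. A sketch (the date literal is an arbitrary example):
# Hypothetical narrower query: two columns only, restricted to recent incidents
recent = client.get("wg3w-h783",
                    select="incident_datetime, police_district",
                    where="incident_date > '2024-01-01'",
                    limit=1000)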
# First we convert the response to a pandas dataframe, then format the datetime
results_df = pd.DataFrame.from_records(results)
results_df['incident_datetime'] = pd.to_datetime(results_df['incident_datetime'])
Having loaded the 15,000 incidents into our dataframe, we can group the data by police district. This will allow us to create a crime rate graph for each district.
# Since we only care about crime rate, we just need the time and district of each event;
# .copy() avoids the SettingWithCopyWarning raised when mutating a slice
district_df = results_df[["incident_datetime", "police_district"]].copy()
# Each row represents an event
district_df["Count"] = 1
# Reduce the timestamp to a calendar date
district_df["incident_datetime"] = district_df.incident_datetime.dt.date
# Grouping the data by date and district
district_df = district_df.groupby(['incident_datetime','police_district'], as_index=False)['Count'].sum()
district_df.incident_datetime = district_df.incident_datetime.astype(str)
Similarly, we can group the main database by date to get a sense of the total crime rate per day.
# Again, .copy() avoids the SettingWithCopyWarning
total_df = results_df[["incident_datetime"]].copy()
total_df["Count"] = 1
# This groups the dataset by day
total_df = total_df.groupby(total_df.incident_datetime.dt.date)[['Count']].sum()
# Drop the first day, which is incomplete since only the latest reports were pulled
total_df = total_df.iloc[1:]
total_df = total_df.reset_index()
total_df.incident_datetime = total_df.incident_datetime.astype(str)
With the total crime rate in hand, it is now possible to compute the rolling mean. In this case we select a five-day window and backfill the leading NaNs.
mean = total_df.rolling(5).mean().fillna(method="backfill")
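If backfilling the first four values feels artificial, an alternative is to let the window grow at the start instead (a sketch):
# Alternative: shrink the window at the start rather than backfilling
mean_alt = total_df[["Count"]].rolling(5, min_periods=1).mean()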
In order to predict the future, rows containing future dates must be created:
# Here we extract the last day available in the dataset
d = datetime.strptime(total_df.incident_datetime.values[-1].replace("-", "/"), "%Y/%m/%d")
# Then we create rows for the two days after the last available date
rng = pd.date_range(d, periods=3, freq='d')
rng = pd.to_datetime(rng, format='%Y%m%d')
# We transform the new dates into a dataframe
to_append = pd.DataFrame({'incident_datetime': rng})
to_append["incident_datetime"] = to_append.incident_datetime.dt.date
# Drop the first generated date, which duplicates the last existing one
to_append = to_append.iloc[1:]
# We fill the new values with "null" so that Chart.js does not plot them
total_df = total_df.append(to_append).fillna("null")
total_df.incident_datetime = total_df.incident_datetime.astype(str)
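Note that DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; on newer versions the same step can be written with pd.concat (a sketch that replaces the append line above, not an additional step):
# pandas >= 2.0 equivalent of the append above
total_df = pd.concat([total_df, to_append], ignore_index=True).fillna("null")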
Similarly, we need to add null values to the rolling average:
mean = list(mean.Count.values)
mean.extend(to_append.shape[0] * ["null"])
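At this point the date axis, the daily totals, and the rolling average must all have the same length, since Chart.js treats them as parallel arrays. A quick check (not in the original notebook):
# All three series are plotted element-by-element, so the lengths must match
assert len(mean) == len(total_df)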
Using the template.html file to create the new index.html
There are various ways to achieve this. There is certainly a better way using JavaScript, but for the purposes of this project we will stick with Python:
# Read in the template html file
with open('../template.html', 'r') as file:
filedata = file.read()
# Inserting the dates
filedata = filedata.replace(' labels: [], // INCLUDE X AXIS DATES',
' labels: {}, // INCLUDE X AXIS DATES'.format(
list(total_df.incident_datetime.values)))
# Inserting the total crime rate information
filedata = filedata.replace(' data: [], // Total',
' data: {}, // Total'.format(
list(total_df.Count.values)))
# Inserting the rolling average values
filedata = filedata.replace(' data: [], // Average',
' data: {}, // Average'.format(
mean))
# Inserting district-dependent crime rate information
for i in district_df.police_district.unique():
if i != "Out of SF":
district_info = district_df[district_df.police_district == i]
filedata = filedata.replace(' data: [], // {}'.format(i),
' data: {}, // {}'.format(list(district_info.Count.values), i))
# Write the file out again
with open('../index.html', 'w') as file:
file.write(filedata)
latest_reports = results_df.copy()
latest_reports = latest_reports[["incident_datetime", "police_district", "incident_category",
"incident_subcategory", "incident_description", "resolution",
"intersection"]]
latest_reports.columns = ["Incident Date", "District", "Incident Category", "Subcategory",
"Description", "Resolution", "Intersection"]
# We save the dataframe as html with the table and table-hover class
latest_reports.head(5).to_html(open('../table.html', 'w'), classes=["table", "table-hover"],
border=0, index=False, justify="center")
We can now insert the HTML table into the updated index.html file:
with open("../index.html", "r") as f1:
t1 = f1.readlines()
with open("../table.html", "r") as f2:
t2 = f2.readlines()
# Line 78 is where the table belongs in the current index.html layout;
# see the marker-based alternative sketched below
initial = 78
for i in range(0, len(t2)):
    t1.insert(initial, t2[i])
    initial = initial + 1
with open("../index.html", "w") as f2:
f2.writelines(t1)
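The hard-coded line number is fragile: any edit to the template shifts it. A more robust variant (a sketch; it assumes a hypothetical <!-- TABLE --> marker comment placed in template.html where the table belongs):
# Hypothetical marker-based insertion; "<!-- TABLE -->" is an assumed
# placeholder, not part of the original template
with open("../index.html", "r") as f1:
    t1 = f1.readlines()
with open("../table.html", "r") as f2:
    t2 = f2.readlines()
marker = next(i for i, line in enumerate(t1) if "<!-- TABLE -->" in line)
t1[marker + 1:marker + 1] = t2
with open("../index.html", "w") as f:
    f.writelines(t1)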
For aesthetics, we give our table head the thead-dark class.
with open('../index.html','r') as file:
filedata = file.read()
filedata = filedata.replace(' <thead>',' <thead class="thead-dark">')
with open('../index.html','w') as file:
file.write(filedata)
To create the choropleth map showing the latest weekly crime rate, we need to resample the dataframe.
# Resampling the dataframe; .copy() avoids the SettingWithCopyWarning
weekly_df = results_df[["incident_datetime", "police_district"]].copy()
weekly_df["Count"] = 1
weekly_df = weekly_df.set_index(weekly_df["incident_datetime"]).drop(columns=["incident_datetime"])
weekly_df = weekly_df.groupby('police_district').resample('1w').sum()
weekly_df = weekly_df.reset_index()
Having grouped the dataset into weekly bins, we can extract the second-to-last row for each district. The dataset is updated daily, but incidents only appear once they are approved, which does not always happen promptly; this is why the second-to-last entry is taken.
last_week = pd.DataFrame(columns=weekly_df.columns)
for i in weekly_df.police_district.unique():
last_week = last_week.append(weekly_df[weekly_df.police_district == i].iloc[[-2]])
last_week = last_week[["police_district", "Count"]]
last_week.columns = ["Neighborhood", "Count"]
last_week.Neighborhood = last_week.Neighborhood.apply(lambda x: x.upper())
last_week.Count = last_week.Count.astype(int)
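The same extraction can be done without an explicit loop (a sketch; it assumes every district has at least two weekly bins):
# Take the last two weekly bins per district, then keep the earlier of the two
last_week_alt = (weekly_df.groupby("police_district").tail(2)
                          .groupby("police_district").head(1))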
The GeoJSON file contains the coordinates needed to plot the different districts of the San Francisco area. Such a file can be consumed by several plotting libraries, including folium and plotly; in this notebook we use plotly.
# San Francisco latitude and longitude values
latitude = 37.77
longitude = -122.42
# loading up json file
with open("../src/sf.geojson") as f:
data = json.load(f)
# Create an "id" key holding the district name, which plotly uses to
# match geojson features to dataframe rows
for feature in data["features"]:
    feature["id"] = feature["properties"]["DISTRICT"]
The plotly map is saved as an html file and inserted into our updated index.html file.
fig = px.choropleth_mapbox(last_week, geojson=data, locations='Neighborhood', color='Count',
color_continuous_scale="Turbo",
range_color=(0, last_week.Count.max()),
mapbox_style="carto-positron",
zoom=11, center = {"lat": latitude, "lon": longitude},
opacity=0.5,
labels={"Count":"Weekly Crime Rate"}
)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.write_html("../plotly_map.html")
We can also create a bar plot to visualize the category distribution of the latest 15,000 incidents. For this we group the data by category rather than by date or district.
results_df["Count"] = 1
incident_category = (results_df.groupby("incident_category")[["Count"]].sum()
                     .reset_index()
                     .sort_values(by="Count", ascending=False)
                     .head(10))
fig = px.bar(incident_category, x='incident_category', y='Count',
labels={"incident_category":"Incident Category"}, color="Count", opacity=1.0)
fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)', 'paper_bgcolor': 'rgba(0, 0, 0, 0)'})
fig.write_html("../incident_category.html")
In this section we create a simple decision tree model to predict the crime rate over the next couple of days. At the beginning we extracted only the latest 15,000 incidents; to train our model we need more data, so we query the database for 500,000 rows, which is more than the number of available datapoints.
# This cell was run once and the data was saved into a csv
# results_ml = client.get("wg3w-h783", limit=500000, order="incident_date DESC")
# df = pd.DataFrame.from_records(results_ml)
# df.to_csv("sfpd_reports.csv", index=False)
df = pd.read_csv("sfpd_reports.csv")
First we remove any rows with missing data:
df['incident_datetime'] = pd.to_datetime(df['incident_datetime'])
df = df[~df['incident_category'].isnull()]
df = df[~df['latitude'].isnull()]
df = df[~df['police_district'].isnull()]
df = df[["incident_datetime", "incident_day_of_week", "police_district", "incident_category"]]
We want to predict the daily crime rate, so we group the response by date; in other words, we resample the dataframe into one-day bins.
group_by_pd = df.copy()
group_by_pd["Count"] = 1
group_by_pd = group_by_pd.set_index(group_by_pd["incident_datetime"]).drop(columns=["incident_datetime"])
group_by_pd = group_by_pd.groupby(['police_district']).resample('1D').sum()
group_by_pd = group_by_pd.reset_index()
X = group_by_pd.copy()
Here, we one-hot encode the police district feature and create a feature for both the day and month:
X = pd.concat([X, pd.get_dummies(X["police_district"])], axis=1).drop(columns=["police_district"])
# X['year'] = pd.to_datetime(X['incident_datetime']).dt.year
X['month'] = pd.to_datetime(X['incident_datetime']).dt.month
X['day'] = pd.to_datetime(X['incident_datetime']).dt.day
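The incident_day_of_week column selected earlier goes unused by these features; a numeric equivalent could be derived from the timestamp if desired (a sketch, not part of the original model, shown on a copy so X is untouched):
# Optional extra feature: numeric day of week, 0 = Monday ... 6 = Sunday
X_alt = X.copy()
X_alt["dayofweek"] = pd.to_datetime(X_alt["incident_datetime"]).dt.dayofweek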
Because the latest incidents are only reported once approved, the most recent days are incomplete and cannot be used as training data.
# Drop the trailing rows. Note that after reset_index the frame is ordered by
# district and then by date, so this trims only the last district's most recent days
X = X.iloc[:-10]
y = X[["Count"]]
X = X.drop(['incident_datetime', "Count"], axis=1)
Here we use the scikit-learn implementation to train a decision tree regressor.
from sklearn.tree import DecisionTreeRegressor
regr_1 = DecisionTreeRegressor()
regr_1.fit(X, y)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best')
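The model above is fit on all available rows. As a quick sanity check, not part of the original pipeline, one could hold out a fraction of the data and look at the mean absolute error:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Hypothetical holdout check: train on 80% of the rows, score on the rest
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,
                                                  random_state=0)
check = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
print(mean_absolute_error(y_val, check.predict(X_val)))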
We will take the last ten days for inference, plus two more days into the future:
pd_districts = ['Bayview', 'Central', 'Ingleside', 'Mission', 'Northern',
'Out of SF', 'Park', 'Richmond', 'Southern', 'Taraval',
'Tenderloin']
d = datetime.today() - timedelta(days=10)
rng = pd.date_range(d, periods=10+2, freq='d')
rng = pd.to_datetime(rng, format='%Y%m%d')
dates_to_query = pd.DataFrame({ 'Date': rng})
dates_to_query["year"] = dates_to_query.Date.dt.year
dates_to_query["day"] = dates_to_query.Date.dt.day
dates_to_query["month"] = dates_to_query.Date.dt.month
dates_to_query["Date"] = dates_to_query.Date.dt.date
predictions_dates = dates_to_query.Date.astype(str).values
The following function lets us get the total predicted crime rate for each queried date. Since our model predicts crime per district, we need to sum all district predictions to get the total crime rate.
def get_prediction_for_dates_df(model, dates):
    # Iterate through the police districts and predict each district's daily counts
    for i in pd_districts:
        testing = pd.DataFrame(columns=X.columns)
        data = pd.DataFrame({i: 1,
                             # "year": dates.year.values,
                             "month": dates.month.values,
                             "day": dates.day.values})
        # The one-hot column for district i is 1; all other districts are filled with 0
        testing = testing.append(data).fillna(0)
        y = model.predict(testing)
        dates[i] = y.astype(int)
    return dates
dates_to_query = get_prediction_for_dates_df(regr_1, dates_to_query)
dates_to_query["total_crime_rate"] = dates_to_query.iloc[:, 4:].sum(axis=1)
dates_to_query = dates_to_query[["Date", "total_crime_rate"]]
dates_to_query.columns = ["incident_datetime", "tcr"]
dates_to_query.incident_datetime = dates_to_query.incident_datetime.astype(str)
# Align the predictions with the dates already plotted (dropping the two appended future rows)
ai_predictions = total_df[["incident_datetime"]].iloc[:-2]
ai_predictions = pd.merge(ai_predictions, dates_to_query, on="incident_datetime", how="outer").fillna("null")
# Anchor the prediction series to the observed count at the first predicted date
# so the two lines connect on the chart (index arithmetic specific to this run)
ai_predictions.loc[ai_predictions.index[50-11], 'tcr'] = total_df.loc[total_df.index[50-11], 'Count']
Inserting the predictions into the index.html file
# Read in the file
with open('../index.html', 'r') as file:
filedata = file.read()
filedata = filedata.replace(' data: [], // AI',
' data: {}, // AI'.format(
list(ai_predictions.tcr.values)))
# Write the file out again
with open('../index.html', 'w') as file:
file.write(filedata)