# Dataframe manipulation libraries
import pandas as pd
import numpy as np
from scipy import stats # to detect outliers
# Graph Libraries
from matplotlib import pyplot as plt
import seaborn as sns
import data_prep as dp
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import std
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
# Load data
features_train = pd.read_csv('./data/dengue_features_train.csv')
labels_train = pd.read_csv('./data/dengue_labels_train.csv')
features_test = pd.read_csv('./data/dengue_features_test.csv')
Zoshua Colah, Arsalan Ahmed
Hi, welcome to our Modelling Dengue Notebook. We are two rookie undergraduate students at the University of Washington in an introductory ML class applying Machine Learning models to predict Dengue in San Juan & Iquitos as part of the Driven Data Challenge.
Our notebook is extremely long due to the amount of EDA we did and the number of datasets we created and tested with various Machine Learning models.
To help you navigate this document better, we recommend taking a look at our Table of Contents:
1. Setup & Importing Data
1. Rolling Averages - New Features Added
2. San Juan
0. Data Preparation
1. Understanding our Data
2. Missing Values
3. Exploratory Data Analysis
4. Outlier Engineering
3. Iquitos
0. Data Preparation
1. Understanding our Data
2. Missing Values
3. Exploratory Data Analysis
4. Outlier Engineering
4. Machine Learning Models
1. KNN
2. XG Boost
3. Random Forest
4. Decision Tree
5. Comparison
We have created a separate data_prep.py file to help us easily generate our datasets without any clutter. We have introduced the following new columns as part of the feature generation process:
There can be a lot of fluctuation in our variables which can cause bias in our model. To help reduce the bias we have introduced Rolling Averages to help provide a better understanding of the overall current scenario.
We have added rolling average columns for all the climate variables and vegetation indexes
A simple rolling average (also called a moving average) is the unweighted mean of the last n values. In simple words: an average of the last n values in a data set, applied row-by-row, so that you get a series of averages.
One year has 52 weeks on average, so we initially set n to 52.
However, after looping over candidate window sizes to find the one with the lowest margin of error and best fit, we found that n = 50 works better. Hence n is 50.
Please refer to our supplement files to see this.
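The actual rolling-average columns are created in data_prep.py; the cell below is only a minimal sketch of the idea on a made-up column (toy_df and its values are illustrative), assuming pandas' rolling window API.
import pandas as pd

# Illustrative only: the real columns are built in data_prep.py.
toy_df = pd.DataFrame({'station_avg_temp_c': [25.1, 25.4, 26.0, 25.8, 26.3, 27.0]})

n = 50  # window size chosen above; min_periods=1 keeps the first rows defined instead of NaN
toy_df['station_avg_temp_c_rolling_avg'] = (
    toy_df['station_avg_temp_c'].rolling(window=n, min_periods=1).mean()
)
print(toy_df)
The window-size search mentioned above simply repeats this with different values of n and keeps the value that gives the lowest validation error.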
We have followed the steps below.
Below we can see the count of missing values for each city and visualize exactly where each column has missing values.
df_sj = features_train[features_train['city'] == 'sj']
missing_values_count = df_sj.isnull().sum()
missing_values_count
From the visualization below we notice that the vegetation index column for the northeast (ndvi_ne) has a lot of missing values.
fig, ax = plt.subplots(figsize=(6,8))
sns.heatmap(df_sj.isnull().reset_index(drop=True),ax=ax, cbar = False, yticklabels = 50)
plt.ylabel("Row number", size = 22)
plt.xlabel("Feature name", size = 22)
plt.title("San Juan Missing Data", size = 32)
df_iq = features_train[features_train['city'] == 'iq']
missing_values_count = df_iq.isnull().sum()
missing_values_count
We notice that Iquitos has missing values over particular short intervals of time.
fig, ax = plt.subplots(figsize=(6,8))
sns.heatmap(df_iq.isnull().reset_index(drop=True),ax=ax, cbar = False, yticklabels = 50)
plt.ylabel("Row number", size = 22)
plt.xlabel("Feature name", size = 22)
plt.title("Iquitos Missing Data", size = 32)
We handled missing values by using Forward Fill.
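The cell below is only a minimal sketch of that fill, assuming the rows are already in chronological order; df_sj_filled is our own illustrative name, and the real fill happens inside data_prep.py.
# Forward fill: each missing value is replaced by the most recent observed value.
df_sj_filled = df_sj.sort_values('week_start_date').fillna(method='ffill')
print(df_sj_filled.isnull().sum().sum())  # 0, apart from any gaps at the very start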
We never look at the test data before model validation, as doing so would influence our modelling decisions.
print("Training Features")
features_train.sample(3)
print("Training Labels")
labels_train.sample(3)
City and Date Indicators
Satellite vegetation - Normalized difference vegetation index (NDVI) - NOAA's CDR Normalized Difference Vegetation Index (0.5x0.5 degree scale) measurements
PERSIANN satellite precipitation measurements (0.25x0.25 degree scale)
NOAA's NCEP Climate Forecast System Reanalysis measurements (0.5x0.5 degree scale)
NOAA's GHCN daily climate data weather station measurements
Note: to avoid clutter, we have not listed the column names of our rolling average columns here.
data_sj = dp.features_train(features_train, labels_train, 'sj')
data_sj_n = dp.normalize(data_sj)
data_test_sj = dp.features_test(features_test, features_train, 'sj')
data_test_sj_n = dp.normalize(data_test_sj)
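dp.normalize comes from our data_prep.py helper. As a rough, hedged sketch, it could be something like min-max scaling of the numeric columns; the real implementation may differ.
def normalize(df):
    # Min-max scale every numeric column to [0, 1]; illustrative only.
    numeric = df.select_dtypes(include='number')
    return (numeric - numeric.min()) / (numeric.max() - numeric.min())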
data_sj.columns
Displaying the first 5 rows
Note: We have added Day of the Year, Odd Year, and Month as additional columns
data_sj.head(5)
data_sj.describe().T
From the box plot below we notice that there are outliers in total cases over the years, roughly between 90 and 500 cases.
sns.boxplot(x=data_sj['total_cases'])
Looking at the plot below, we can see that most data points lie towards the bottom left, but there are points far from the rest of the population, such as in the top-left and bottom-right corners.
This also indicates that higher rainfall does not necessarily lead to a higher total number of cases for the week.
fig, ax = plt.subplots(figsize=(16,8))
ax.scatter(data_sj['precipitation_amt_mm'],data_sj['total_cases'])
ax.set_xlabel('Precipitation in mm')
ax.set_ylabel('Total Cases for the week')
plt.show()
We will handle our outliers after exploring our data further. This is so that we get a better understanding of the domain before we remove or reset outliers.
Below is the same scatter plot with each dot colored by month
# Use the 'hue' argument to provide a factor variable
sns.lmplot( x="precipitation_amt_mm", y="total_cases", data=data_sj, fit_reg=False,
hue='month', legend=False, palette="Set2")
# Move the legend to an empty part of the plot
plt.legend(loc='upper right')
Below is a scatter plot with each dot colored by year
# Use the 'hue' argument to provide a factor variable
sns.lmplot( x="precipitation_amt_mm", y="total_cases", data=data_sj, fit_reg=False,
hue='year', legend=False)
# Move the legend to an empty part of the plot
plt.legend(loc='upper right')
Below is a bar chart showing the average number of cases in each week for each year. We notice that the averages for 1994 and 1998 are extremely high; this may be due to outliers in those years, which we will see in the EDA.
weekly_avg_sj = pd.DataFrame(data_sj.groupby(['year'])['total_cases'].mean().reset_index(name='avg_weekly_cases'))
plt.subplots(figsize=(11,7))
sns.barplot(x='year', y='avg_weekly_cases', data=weekly_avg_sj)
Our first step in exploring the data was to see how dengue changed with time and whether there was any seasonality.
As Alex Freemain points out in his EDA on Github, the spikes in the time-series are obvious outbreaks. It will be important to predict these outbreaks for health reasons and hence predicting just the general cyclic trend of Dengue will not be enough.
Below we notice that the number of cases for each week is not consistent and varies depending on the time of year. This suggests a seasonal trend: the rainfall received also changes with time, and during wet periods the mosquitoes come out, bite, and spread dengue.
data_sj.plot(x='week_start_date', y='total_cases', figsize = (15,5))
While the facet scatter plot has helped us identify the change, it will help to map this out through a simple wide scatter plot for San Juan
sns.set(style="ticks", palette="muted")
g = sns.FacetGrid(data_sj, col="weekofyear", hue="city", col_wrap=5, size=4)
g = (g.map(plt.scatter, "year", "total_cases", edgecolor="w").add_legend())
Key Insights:
g = sns.lmplot(x="weekofyear", y="total_cases", hue="city", col="city", data=data_sj, aspect= 2, size = 7, x_jitter=.1)
sns.set(style="ticks", palette="muted")
g = sns.FacetGrid(data_sj, col="year", hue="city", col_wrap=5, size=4)
g = (g.map(plt.scatter, "month", "total_cases", edgecolor="w").add_legend())
sns.set(style="ticks", palette="muted")
g = sns.FacetGrid(data_sj, col="year", hue="city", col_wrap=5, size=4)
g = (g.map(plt.scatter, "weekofyear", "total_cases", edgecolor="w").add_legend())
Key Insight: We notice that the temperature range for all years lies between 296 K and 302 K
sns.set(style="ticks", palette="muted")
g = sns.FacetGrid(data_sj, col="year", hue="city", col_wrap=5, size=4)
g = (g.map(plt.scatter, "weekofyear", "reanalysis_avg_temp_k", edgecolor="w").add_legend())
g = sns.lmplot(x="weekofyear", y="precipitation_amt_mm", hue="city", col="city", data=data_sj, aspect= 2, size = 7, x_jitter=.1)
Key Insights:
sns.factorplot(x="weekofyear", y="total_cases", hue="city", size=8, aspect=3,data=data_sj)
plt.title("Growth of Dengue with Time - Weekly Breakdown for all years in San Juan")
Key Insights from both charts below:
sns.factorplot(x="weekofyear", y="precipitation_amt_mm", hue="city", size=6, aspect=4,data=data_sj)
plt.title("Precipitation Change with Time - Weekly Breakdown - for all years in San Juan ")
plt.figure(figsize=(25,20))
plt.title('Correlation matrix')
sns.heatmap(df_sj.corr(), cmap="YlGnBu", annot = True)
As mentioned in the Benchmark file by Driven Data:
Many of the temperature variables are strongly correlated, which is expected, but the total_cases variable doesn't have many obvious strong correlations; it seems to have only weak correlations with the other variables. Many of the climate variables are much more strongly correlated with each other. Interestingly, the vegetation index also has only a weak correlation with the other variables. These correlations may give us some hints as to how to improve our model.
The graphs below represent the correlation of the dependent variable, weekly dengue cases, with the environmental and climate variables in San Juan. We checked the correlation of the given variables with the percentage of dengue cases each week with respect to the year; standardizing the data using these weekly percentage values resulted in better correlation with the variables. Since the regions and climates of the two cities are different, we can see a significant difference in the correlation behavior of our variables between the two cities.
#Code to generate correlation graphs below for the two cities
corr_sj = data_sj.corr(method='pearson')
corr_sj = corr_sj['total_cases'].to_frame(name = 'corr_with_cases_sj')
corr_sj = corr_sj.sort_values(by=['corr_with_cases_sj'])
corr_sj = (corr_sj.drop('total_cases')
.drop('year')
.drop('month')
.drop('weekofyear')
.drop('odd_year'))
corr_sj.plot(kind='barh', title='Dengue Correlation - San Juan, Puerto Rico', xlim=(-.40,.30), grid = True, legend = False, color = '#4B7ACC', figsize=(12,12))
The correlation strengths differ for each city, but it looks like reanalysis_specific_humidity_g_per_kg and reanalysis_dew_point_temp_k are the most strongly correlated with total_cases. This makes sense: we know mosquitoes thrive in wet climates, the wetter the better!
As we all know, "cold and humid" is not a thing. So it's not surprising that as minimum temperatures, maximum temperatures, and average temperatures rise, the total_cases of dengue fever tend to rise as well.
Interestingly, the precipitation measurements bear little to no correlation to total_cases, despite strong correlations to the humidity measurements, as is evident from the heatmap above.
As we noticed earlier, there are outliers in our dataset; let us see how many we have. However, after testing our models we found that removing them hurts us, because we would then be unable to predict the outbreaks. This is why we will not remove them.
data = data_sj['total_cases']
# calculate summary statistics
data_mean, data_std = mean(data), std(data)
# identify outliers
cut_off = data_std * 3
lower, upper = data_mean - cut_off, data_mean + cut_off
# identify outliers
outliers = [x for x in data if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))
# remove outliers
outliers_removed = [x for x in data if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))
From above we notice that we have 20 outliers (outside of 3 S.D. for total cases in San Juan) and we have 916 non-outlier observations for total cases
dp.remove_outliers(data_sj).head(4)
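dp.remove_outliers is defined in data_prep.py; a hedged sketch of the same three-standard-deviation rule used above could look like the cell below (the real helper may differ).
def remove_outliers(df, col='total_cases', num_sd=3):
    # Keep only rows whose value lies within num_sd standard deviations of the mean.
    mu, sd = df[col].mean(), df[col].std()
    return df[(df[col] >= mu - num_sd * sd) & (df[col] <= mu + num_sd * sd)]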
# Iquitos
data_iq = dp.features_train(features_train, labels_train, 'iq')
data_iq_n = dp.normalize(data_iq)
data_test_iq = dp.features_test(features_test, features_train, 'iq')
data_test_iq_n = dp.normalize(data_test_iq)
Displaying the first 5 rows
Note: We have added Day of the Year, Odd Year, and Month as additional columns
data_iq.head(5)
data_iq.describe().T
From the box plot below we notice that there are outliers in total cases for Iquitos.
sns.boxplot(x=data_iq['total_cases'])
Looking at the plot below, we can see that most data points lie towards the bottom left, but there are points far from the rest of the population, such as in the top-left and bottom-right corners.
This also indicates that higher rainfall does not necessarily lead to a higher total number of cases for the week.
fig, ax = plt.subplots(figsize=(16,8))
ax.scatter(data_iq['precipitation_amt_mm'],data_iq['total_cases'])
ax.set_xlabel('Precipitation in mm')
ax.set_ylabel('Total Cases for the week')
plt.show()
We will handle our outliers after exploring our data further. This is so that we get a better understanding of the domain before we remove or reset outliers.
Below is the same scatter plot with each dot colored by month
# Use the 'hue' argument to provide a factor variable
sns.lmplot( x="precipitation_amt_mm", y="total_cases", data=data_iq, fit_reg=False,
hue='month', legend=False, palette="Set2")
# Move the legend to an empty part of the plot
plt.legend(loc='upper right')
# Use the 'hue' argument to provide a factor variable
sns.lmplot( x="precipitation_amt_mm", y="total_cases", data=data_iq, fit_reg=False,
hue='year', legend=False)
# Move the legend to an empty part of the plot
plt.legend(loc='upper right')
Below is a bar chart showing the average number of cases in each week for each year. We notice that the average for the year 2003 is really low.
weekly_avg_iq = pd.DataFrame(data_iq.groupby(['year'])['total_cases'].mean().reset_index(name='avg_weekly_cases'))
plt.subplots(figsize=(11,7))
sns.barplot(x='year', y='avg_weekly_cases', data=weekly_avg_iq)
We notice that there have been outbreaks on some occasions, with a sudden drop in weekly cases in some years.
data_iq.plot(x='week_start_date', y='total_cases', figsize = (15,5))
sns.set(style="ticks", palette="muted")
g = sns.FacetGrid(data_iq, col="weekofyear", hue="city", col_wrap=5, size=4)
g = (g.map(plt.scatter, "year", "total_cases", edgecolor="w").add_legend())
g = sns.lmplot(x="weekofyear", y="total_cases", hue="city", col="city", data=data_iq, aspect= 2, size = 7, x_jitter=.1)
sns.set(style="ticks", palette="muted")
g = sns.FacetGrid(data_iq, col="year", hue="city", col_wrap=5, size=4)
g = (g.map(plt.scatter, "month", "total_cases", edgecolor="w").add_legend())
sns.set(style="ticks", palette="muted")
g = sns.FacetGrid(data_iq, col="year", hue="city", col_wrap=5, size=4)
g = (g.map(plt.scatter, "weekofyear", "total_cases", edgecolor="w").add_legend())
sns.set(style="ticks", palette="muted")
g = sns.FacetGrid(data_iq, col="year", hue="city", col_wrap=5, size=4)
g = (g.map(plt.scatter, "weekofyear", "reanalysis_avg_temp_k", edgecolor="w").add_legend())
g = sns.lmplot(x="weekofyear", y="precipitation_amt_mm", hue="city", col="city", data=data_iq, aspect= 2, size = 7, x_jitter=.1)
sns.factorplot(x="weekofyear", y="total_cases", hue="city", size=8, aspect=3,data=data_iq)
plt.title("Growth of Dengue with Time - Weekly Breakdown for all years in Iquitos")
From the graphs below we notice that the total number of dengue cases decreases as the total amount of precipitation decreases.
sns.factorplot(x="month", y="total_cases", hue="city", size=8, aspect=2,data=data_iq)
plt.title("Growth of Dengue with Time - Monthly Breakdown for all years in Iquitos")
sns.factorplot(x="weekofyear", y="precipitation_amt_mm", hue="city", size=6, aspect=3,data=data_iq)
plt.title("Precipitation Change with Time - Weekly Breakdown - for all years in Iquitos ")
sns.factorplot(x="month", y="precipitation_amt_mm", hue="city", size=6, aspect=3,data=data_iq)
plt.title("Precipitation Change with Time - Monthly Breakdown - for all years in Iquitos")
plt.figure(figsize=(25,20))
plt.title('Correlation matrix')
sns.heatmap(df_iq.corr(), cmap="YlGnBu", annot = True)
#Code to generate correlation graphs below for the two cities
corr_iq = data_iq.corr(method='pearson')
corr_iq = corr_iq['total_cases'].to_frame(name = 'corr_with_cases_iq')
corr_iq = corr_iq.sort_values(by=['corr_with_cases_iq'])
corr_iq = (corr_iq.drop('total_cases')
.drop('year')
.drop('month')
.drop('weekofyear')
.drop('odd_year'))
corr_iq.plot(kind='barh', title='Dengue Correlation - Iquitos', xlim=(-.40,.30), grid = True, legend = False, color = '#4B7ACC', figsize=(12,12))
data = data_iq['total_cases']
# calculate summary statistics
data_mean, data_std = mean(data), std(data)
# identify outliers
cut_off = data_std * 3
lower, upper = data_mean - cut_off, data_mean + cut_off
# identify outliers
outliers = [x for x in data if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))
# remove outliers
outliers_removed = [x for x in data if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))
We will be testing our models to see which gives us the lowest error and the best fit.
Since we are predicting a continuous-valued target, we will need to use regression models from scikit-learn.
As part of our modelling process, we did:
For each machine learning model, we will mention:
At the end of this discussion we will shift to selecting the best model via a collection of comparison visualizations, and then add our concluding thoughts.
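For every model and city below we report the same three scikit-learn scores: mean absolute error, median absolute error, and explained variance. A small helper capturing that pattern could look like the cell below (illustrative only, not part of our pipeline).
from sklearn.metrics import (mean_absolute_error, median_absolute_error,
                             explained_variance_score)

def score_regression(y_true, y_pred):
    # The three scores we quote for each model/city combination.
    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'mdae': median_absolute_error(y_true, y_pred),
        'evs': explained_variance_score(y_true, y_pred),
    }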
In K-Nearest Neighbors regression, the target is predicted by local interpolation of the targets associated with the nearest neighbors in the training set.
We decided to use KNN because it is a commonly recommended algorithm for large sets of time series data.
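Below is a toy example of that local interpolation with made-up numbers (the feature values and case counts are invented): weights='distance' makes closer neighbors count more, exactly as in the grid searches further down.
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

# Five fictional weeks described by a single feature (say, relative humidity).
toy_X = np.array([[60.0], [62.0], [65.0], [70.0], [75.0]])
toy_y = np.array([10, 12, 20, 45, 80])  # weekly dengue cases (made up)

knr_toy = KNeighborsRegressor(n_neighbors=3, weights='distance')
knr_toy.fit(toy_X, toy_y)
# The prediction is a distance-weighted average of the 3 nearest training targets.
print(knr_toy.predict([[66.0]]))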
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
from sklearn.feature_selection import RFE
for n in range(1,20,1):
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
rfe = RFE(ExtraTreesRegressor(), n)
fit = rfe.fit(test_features_sj, test_outcomes_sj)
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj[data_sj_n.columns[fit.ranking_ == 1]],
data_sj['total_cases'],
test_size = 0.3
)
knr_reg = KNeighborsRegressor(n_neighbors = 5, weights = 'distance')
knr_preds_sj = knr_reg.fit(train_features_sj, train_outcomes_sj).predict(test_features_sj)
print('Features:', n, ', MAE:', mean_absolute_error(test_outcomes_sj, knr_preds_sj))
model = ExtraTreesRegressor()
feature_imp = pd.DataFrame({'Feature' : [], 'Importance' : []})
for i in range(1,10):
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
for i in range(1,10):
model.fit(train_features_sj, train_outcomes_sj)
imp = pd.DataFrame({'Feature': data_sj_n.columns, 'Importance':model.feature_importances_})
frames = [feature_imp, imp]
feature_imp = pd.concat(frames).reset_index(drop = True)
feature_imp = feature_imp.groupby(['Feature'])['Importance'].mean().to_frame(name = 'Importance').reset_index()
feature_imp = feature_imp.set_index('Feature')
feature_imp.sort_values(by='Importance').plot(kind='barh', title='Feature Importance - San Juan, Puerto Rico', grid = True, legend = False, figsize=(10,10))
# San Juan
# ['month','reanalysis_relative_humidity_percent', 'reanalysis_dew_point_temp_k', 'station_avg_temp_c', 'reanalysis_tdtr_k']
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n[['month',
'odd_year',
'ndvi_sw_rolling_avg',
'precipitation_amt_mm_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg',
'reanalysis_precip_amt_kg_per_m2_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg',
'station_diur_temp_rng_c_rolling_avg',
'station_max_temp_c_rolling_avg']],
data_sj['total_cases'],
test_size = 0.3
)
params = {'n_neighbors':range(2, 30), 'weights':['uniform', 'distance']}
folds = KFold(n_splits = 10, shuffle=True)
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid=params, cv=folds, scoring='neg_mean_absolute_error')
knr_preds_sj = grid_search.fit(train_features_sj, train_outcomes_sj).predict(test_features_sj)
knr_mae_sj = mean_absolute_error(test_outcomes_sj, knr_preds_sj)
knr_mdae_sj = median_absolute_error(test_outcomes_sj, knr_preds_sj)
knr_evs_sj = explained_variance_score(test_outcomes_sj, knr_preds_sj)
print(knr_mae_sj)
grid_search.cv_results_['params'][grid_search.best_index_]
plt.subplots(figsize=(11,7))
plt.title('Actual vs Predicted Dengue Cases (San Juan, Puerto Rico)')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(test_outcomes_sj, knr_preds_sj, edgecolors = '#1e1e1e', color='#bae1ff', alpha=0.8)
plt.plot([0, 400], [0, 400], 'red', alpha=0.7)
knr_preds_final_sj = knr_reg.fit(train_features_sj, train_outcomes_sj).predict(
data_test_sj_n[['month',
'odd_year',
'ndvi_sw_rolling_avg',
'precipitation_amt_mm_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg',
'reanalysis_precip_amt_kg_per_m2_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg',
'station_diur_temp_rng_c_rolling_avg',
'station_max_temp_c_rolling_avg']]
)
submission_sj = data_test_sj[['city', 'year', 'weekofyear']].copy()
submission_sj['total_cases'] = np.round(knr_preds_final_sj).astype(int)
knn_preds_week_sj = pd.DataFrame(test_features_sj)
knn_preds_week_sj['Actual'] = test_outcomes_sj.values
knn_preds_week_sj['Predicted'] = knr_preds_sj
knn_preds_week_sj = pd.merge(data_sj, knn_preds_week_sj, left_index = True, right_index = True)
knn_preds_week_sj = knn_preds_week_sj.assign(residual=knn_preds_week_sj.Actual - knn_preds_week_sj.Predicted)
sns.factorplot(x="year", y="residual", hue="city", size=8, aspect=3,data=knn_preds_week_sj)
plt.title("Residuals")
knn_preds_week_sj = knn_preds_week_sj.melt(id_vars=['city', 'year', 'weekofyear', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
'reanalysis_precip_amt_kg_per_m2',
'reanalysis_relative_humidity_percent', 'reanalysis_tdtr_k',
'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
'station_min_temp_c', 'station_precip_mm', 'total_cases', 'month_x',
'odd_year_x', 'ndvi_mean', 'ndvi_mean_rolling_avg',
'ndvi_ne_rolling_avg', 'ndvi_nw_rolling_avg', 'ndvi_se_rolling_avg',
'ndvi_sw_rolling_avg_x', 'precipitation_amt_mm_rolling_avg_x',
'reanalysis_air_temp_k_rolling_avg',
'reanalysis_avg_temp_k_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg_x',
'reanalysis_max_air_temp_k_rolling_avg',
'reanalysis_min_air_temp_k_rolling_avg',
'reanalysis_precip_amt_kg_per_m2_rolling_avg_x',
'reanalysis_relative_humidity_percent_rolling_avg_x',
'reanalysis_tdtr_k_rolling_avg', 'station_avg_temp_c_rolling_avg',
'station_diur_temp_rng_c_rolling_avg_x',
'station_max_temp_c_rolling_avg_x', 'station_min_temp_c_rolling_avg',
'station_precip_mm_rolling_avg', 'month_y', 'odd_year_y',
'ndvi_sw_rolling_avg_y', 'precipitation_amt_mm_rolling_avg_y',
'reanalysis_dew_point_temp_k_rolling_avg_y',
'reanalysis_precip_amt_kg_per_m2_rolling_avg_y',
'reanalysis_relative_humidity_percent_rolling_avg_y',
'station_diur_temp_rng_c_rolling_avg_y',
'station_max_temp_c_rolling_avg_y', 'residual'], var_name='type')
sns.factorplot(x='week_start_date', y="value", hue="type", data=knn_preds_week_sj, size = 6, aspect =3)
plt.title("Actual vs Predicted for each year")
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n,
data_iq['total_cases'],
test_size = 0.3
)
from sklearn.feature_selection import RFE
for n in range(1,20,1):
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n,
data_iq['total_cases'],
test_size = 0.3
)
rfe = RFE(ExtraTreesRegressor(), n)
fit = rfe.fit(test_features_iq, test_outcomes_iq)
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq[data_iq_n.columns[fit.ranking_ == 1]],
data_iq['total_cases'],
test_size = 0.3
)
knr_reg = KNeighborsRegressor(n_neighbors = 5, weights = 'distance')
knr_preds_iq = knr_reg.fit(train_features_iq, train_outcomes_iq).predict(test_features_iq)
print('Features:', n, ', MAE:', mean_absolute_error(test_outcomes_iq, knr_preds_iq))
model = ExtraTreesRegressor()
feature_imp = pd.DataFrame({'Feature' : [], 'Importance' : []})
for i in range(1,10):
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n,
data_iq['total_cases'],
test_size = 0.3
)
for i in range(1,10):
model.fit(train_features_iq, train_outcomes_iq)
imp = pd.DataFrame({'Feature': data_iq_n.columns, 'Importance':model.feature_importances_})
frames = [feature_imp, imp]
feature_imp = pd.concat(frames).reset_index(drop = True)
feature_imp = feature_imp.groupby(['Feature'])['Importance'].mean().to_frame(name = 'Importance').reset_index()
feature_imp = feature_imp.set_index('Feature')
feature_imp.sort_values(by='Importance').plot(kind='barh', title='Feature Importance - Iquitos, Peru', grid = True, legend = False, color = '#9b0a0a', figsize=(10,10))
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n[['reanalysis_avg_temp_k',
'month',
'odd_year',
'ndvi_nw_rolling_avg',
'ndvi_sw_rolling_avg',
'reanalysis_max_air_temp_k_rolling_avg',
'reanalysis_tdtr_k_rolling_avg',
'station_diur_temp_rng_c_rolling_avg',
'station_max_temp_c_rolling_avg']],
data_iq['total_cases'],
test_size = 0.3
)
params = {'n_neighbors':range(2, 10), 'weights':['uniform', 'distance']}
folds = KFold(n_splits = 10, shuffle=True)
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid=params, cv=folds, scoring='neg_mean_absolute_error')
knr_preds_iq = grid_search.fit(train_features_iq, train_outcomes_iq).predict(test_features_iq)
knr_mae_iq = mean_absolute_error(test_outcomes_iq, knr_preds_iq)
knr_mdae_iq = median_absolute_error(test_outcomes_iq, knr_preds_iq)
knr_evs_iq = explained_variance_score(test_outcomes_iq, knr_preds_iq)
print(knr_mae_iq)
grid_search.cv_results_['params'][grid_search.best_index_]
plt.subplots(figsize=(11,7))
plt.title('Actual vs Predicted Dengue Cases (Iquitos, Peru)')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(test_outcomes_iq, knr_preds_iq, edgecolors = '#1e1e1e', color='#bae1ff', alpha=0.8)
plt.plot([0, 80], [0, 80], 'red', alpha=0.7)
knr_preds_final_iq = knr_reg.fit(train_features_iq, train_outcomes_iq).predict(
data_test_iq_n[['reanalysis_avg_temp_k',
'month',
'odd_year',
'ndvi_nw_rolling_avg',
'ndvi_sw_rolling_avg',
'reanalysis_max_air_temp_k_rolling_avg',
'reanalysis_tdtr_k_rolling_avg',
'station_diur_temp_rng_c_rolling_avg',
'station_max_temp_c_rolling_avg']]
)
submission_iq = data_test_iq[['city', 'year', 'weekofyear']].copy()
submission_iq['total_cases'] = np.round(knr_preds_final_iq).astype(int)
knn_preds_week_iq = pd.DataFrame(test_features_iq)
knn_preds_week_iq['Actual'] = test_outcomes_iq.values
knn_preds_week_iq['Predicted'] = knr_preds_iq
knn_preds_week_iq = pd.merge(data_iq, knn_preds_week_iq, left_index = True, right_index = True)
knn_preds_week_iq = knn_preds_week_iq.assign(residual=knn_preds_week_iq.Actual - knn_preds_week_iq.Predicted)
sns.factorplot(x="year", y="residual", hue="city", size=8, aspect=3,data=knn_preds_week_iq)
plt.title("Residuals")
knn_preds_week_iq = knn_preds_week_iq.melt(id_vars=['city', 'year', 'weekofyear', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
'reanalysis_avg_temp_k_x', 'reanalysis_dew_point_temp_k',
'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
'reanalysis_precip_amt_kg_per_m2',
'reanalysis_relative_humidity_percent', 'reanalysis_tdtr_k',
'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
'station_min_temp_c', 'station_precip_mm', 'total_cases', 'month_x',
'odd_year_x', 'ndvi_mean', 'ndvi_mean_rolling_avg',
'ndvi_ne_rolling_avg', 'ndvi_nw_rolling_avg_x', 'ndvi_se_rolling_avg',
'ndvi_sw_rolling_avg_x', 'precipitation_amt_mm_rolling_avg',
'reanalysis_air_temp_k_rolling_avg',
'reanalysis_avg_temp_k_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg',
'reanalysis_max_air_temp_k_rolling_avg_x',
'reanalysis_min_air_temp_k_rolling_avg',
'reanalysis_precip_amt_kg_per_m2_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg',
'reanalysis_tdtr_k_rolling_avg_x', 'station_avg_temp_c_rolling_avg',
'station_diur_temp_rng_c_rolling_avg_x',
'station_max_temp_c_rolling_avg_x', 'station_min_temp_c_rolling_avg',
'station_precip_mm_rolling_avg', 'reanalysis_avg_temp_k_y', 'month_y',
'odd_year_y', 'ndvi_nw_rolling_avg_y', 'ndvi_sw_rolling_avg_y',
'reanalysis_max_air_temp_k_rolling_avg_y',
'reanalysis_tdtr_k_rolling_avg_y',
'station_diur_temp_rng_c_rolling_avg_y',
'station_max_temp_c_rolling_avg_y','residual'], var_name='type')
sns.factorplot(x='week_start_date', y="value", hue="type", data=knn_preds_week_iq, size = 6, aspect =3)
plt.title("Actual vs Predicted for each year")
frames = [submission_sj, submission_iq]
submission = pd.concat(frames)
submission.to_csv('knn_best.csv', index = False)
XGBoost is an implementation of gradient boosted decision trees designed for speed and performance that has come to dominate competitive machine learning.
We used it because it reduces the error of plain decision trees, and the XGBoost implementation offers several advanced features for model tuning, computing environments, and algorithm enhancements.
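One example of such a feature, shown purely as a hedged illustration on synthetic data and assuming the pre-2.0 xgboost fit signature (we do not use it in the cells below), is early stopping against a validation set.
import numpy as np
from xgboost import XGBRegressor

# Synthetic data, illustrative only.
rng = np.random.RandomState(0)
X_demo = rng.rand(200, 5)
y_demo = X_demo[:, 0] * 30 + rng.randn(200)
X_tr, X_val, y_tr, y_val = X_demo[:150], X_demo[150:], y_demo[:150], y_demo[150:]

xgb_demo = XGBRegressor(n_estimators = 1000, learning_rate = 0.03, max_depth = 10, subsample = 0.8, colsample_bytree = 0.701)
# Stop adding trees once validation MAE has not improved for 10 rounds.
xgb_demo.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
             eval_metric='mae', early_stopping_rounds=10, verbose=False)
print(xgb_demo.best_iteration)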
# Testing & Training Data
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
from sklearn.feature_selection import RFE
for n in range(1,20,1):
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
rfe = RFE(XGBRegressor(), n)
fit = rfe.fit(test_features_sj, test_outcomes_sj)
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj[data_sj_n.columns[fit.ranking_ == 1]],
data_sj['total_cases'],
test_size = 0.3
)
xgb_reg = XGBRegressor(n_estimators = 1000, learning_rate = 0.03, max_depth = 10, subsample = 0.8, colsample_bytree = 0.701)
xgb_preds_sj = xgb_reg.fit(train_features_sj, train_outcomes_sj).predict(test_features_sj)
print('Features:', n, ', MAE:', mean_absolute_error(test_outcomes_sj, xgb_preds_sj))
model = XGBRegressor()
feature_imp = pd.DataFrame()
for i in range(1,10):
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
for i in range(1,10):
model.fit(train_features_sj, train_outcomes_sj)
imp = pd.DataFrame({'Feature': data_sj_n.columns, 'Importance':model.feature_importances_})
frames = [feature_imp, imp]
feature_imp = pd.concat(frames).reset_index(drop = True)
feature_imp = feature_imp.groupby(['Feature'])['Importance'].mean().to_frame(name = 'Importance').reset_index()
feature_imp = feature_imp.set_index('Feature')
feature_imp.sort_values(by='Importance').plot(kind='barh', title='Feature Importance - San Juan, Puerto Rico', grid = True, legend = False, figsize=(10,10))
# San Juan
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n[['precipitation_amt_mm_rolling_avg',
'weekofyear',
'station_precip_mm_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg',
'station_max_temp_c_rolling_avg',
'reanalysis_air_temp_k_rolling_avg',
'ndvi_mean_rolling_avg']],
data_sj['total_cases'],
test_size = 0.3
)
params = {'n_estimators':[1000], 'learning_rate':[0.03], 'max_depth':[10], 'subsample':[0.8], 'colsample_bytree':[0.701]}
folds = KFold(n_splits = 10, shuffle=True)
grid_search = GridSearchCV(XGBRegressor(), param_grid=params, cv=folds, scoring='neg_mean_absolute_error')
xgb_preds_sj = grid_search.fit(train_features_sj, train_outcomes_sj).predict(test_features_sj)
xgb_mae_sj = mean_absolute_error(test_outcomes_sj, xgb_preds_sj)
xgb_mdae_sj = median_absolute_error(test_outcomes_sj, xgb_preds_sj)
xgb_evs_sj = explained_variance_score(test_outcomes_sj, xgb_preds_sj)
print(xgb_mae_sj)
plt.subplots(figsize=(11,7))
plt.title('Actual vs Predicted Dengue Cases (San Juan, Puerto Rico)')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(test_outcomes_sj, xgb_preds_sj, edgecolors = '#1e1e1e', color='#bae1ff', alpha=0.8)
plt.plot([0, 400], [0, 400], 'red', alpha=0.7)
xgb_preds_week_sj = pd.DataFrame(test_features_sj)
xgb_preds_week_sj['Actual'] = test_outcomes_sj.values
xgb_preds_week_sj['Predicted'] = xgb_preds_sj
xgb_preds_week_sj = pd.merge(data_sj, xgb_preds_week_sj, left_index = True, right_index = True)
plot_d = xgb_preds_week_sj.assign(residual=xgb_preds_week_sj.Actual - xgb_preds_week_sj .Predicted)
sns.factorplot(x="year", y="residual", hue="city", size=8, aspect=3,data=plot_d)
plt.title("Residuals")
plot_d = plot_d.melt(id_vars=['city', 'year', 'weekofyear_x', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
'reanalysis_precip_amt_kg_per_m2',
'reanalysis_relative_humidity_percent', 'reanalysis_tdtr_k',
'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
'station_min_temp_c', 'station_precip_mm', 'total_cases', 'month',
'odd_year', 'ndvi_mean', 'ndvi_mean_rolling_avg_x',
'ndvi_ne_rolling_avg', 'ndvi_nw_rolling_avg', 'ndvi_se_rolling_avg',
'ndvi_sw_rolling_avg', 'precipitation_amt_mm_rolling_avg_x',
'reanalysis_air_temp_k_rolling_avg_x',
'reanalysis_avg_temp_k_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg_x',
'reanalysis_max_air_temp_k_rolling_avg',
'reanalysis_min_air_temp_k_rolling_avg',
'reanalysis_precip_amt_kg_per_m2_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg_x',
'reanalysis_tdtr_k_rolling_avg', 'station_avg_temp_c_rolling_avg',
'station_diur_temp_rng_c_rolling_avg',
'station_max_temp_c_rolling_avg_x', 'station_min_temp_c_rolling_avg',
'station_precip_mm_rolling_avg_x', 'precipitation_amt_mm_rolling_avg_y',
'weekofyear_y', 'station_precip_mm_rolling_avg_y',
'reanalysis_dew_point_temp_k_rolling_avg_y',
'reanalysis_relative_humidity_percent_rolling_avg_y',
'station_max_temp_c_rolling_avg_y',
'reanalysis_air_temp_k_rolling_avg_y', 'ndvi_mean_rolling_avg_y', 'residual'], var_name='type')
sns.factorplot(x='week_start_date', y="value", hue="type", data=plot_d, size = 6, aspect =3)
plt.title("Actual vs Predicted for each year")
xgb_preds_final_sj = xgb_reg.fit(train_features_sj, train_outcomes_sj).predict(
data_test_sj_n[['precipitation_amt_mm_rolling_avg',
'weekofyear',
'station_precip_mm_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg',
'station_max_temp_c_rolling_avg',
'reanalysis_air_temp_k_rolling_avg',
'ndvi_mean_rolling_avg']]
)
submission_sj = data_test_sj[['city', 'year', 'weekofyear']].copy()
submission_sj['total_cases'] = np.round(xgb_preds_final_sj).astype(int)
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n,
data_iq['total_cases'],
test_size = 0.3
)
from sklearn.feature_selection import RFE
for n in range(1,20,1):
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n,
data_iq['total_cases'],
test_size = 0.3
)
rfe = RFE(XGBRegressor(), n)
fit = rfe.fit(test_features_iq, test_outcomes_iq)
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq[data_iq_n.columns[fit.ranking_ == 1]],
data_iq['total_cases'],
test_size = 0.3
)
xgb_reg = XGBRegressor(n_estimators = 1000, learning_rate = 0.03, max_depth = 10, subsample = 0.8, colsample_bytree = 0.701)
xgb_preds_iq = xgb_reg.fit(train_features_iq, train_outcomes_iq).predict(test_features_iq)
print('Features:', n, ', MAE:', mean_absolute_error(test_outcomes_iq, xgb_preds_iq))
model = XGBRegressor()
feature_imp = pd.DataFrame()
for i in range(1,10):
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n,
data_iq['total_cases'],
test_size = 0.3
)
for i in range(1,10):
model.fit(train_features_iq, train_outcomes_iq)
imp = pd.DataFrame({'Feature': data_iq_n.columns, 'Importance':model.feature_importances_})
frames = [feature_imp, imp]
feature_imp = pd.concat(frames).reset_index(drop = True)
feature_imp = feature_imp.groupby(['Feature'])['Importance'].mean().to_frame(name = 'Importance').reset_index()
feature_imp = feature_imp.set_index('Feature')
feature_imp.sort_values(by='Importance').plot(kind='barh', title='Feature Importance - Iquitos, Peru', grid = True, legend = False, figsize=(10,10))
# ['month','odd_year','reanalysis_relative_humidity_percent', 'reanalysis_dew_point_temp_k', 'station_avg_temp_c', 'reanalysis_tdtr_k']
#Iquitos
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n[['ndvi_se_rolling_avg',
'ndvi_ne_rolling_avg',
'ndvi_nw_rolling_avg',
'weekofyear',
'station_max_temp_c_rolling_avg',
'reanalysis_precip_amt_kg_per_m2_rolling_avg',
'station_min_temp_c_rolling_avg']],
data_iq['total_cases'],
test_size = 0.3
)
params = {'n_estimators':[1000], 'learning_rate':[0.03], 'max_depth':[10], 'subsample':[0.8], 'colsample_bytree':[0.701]}
folds = KFold(n_splits = 10, shuffle=True)
grid_search = GridSearchCV(XGBRegressor(), param_grid=params, cv=folds, scoring='neg_mean_absolute_error')
xgb_preds_iq = grid_search.fit(train_features_iq, train_outcomes_iq).predict(test_features_iq)
xgb_mae_iq = mean_absolute_error(test_outcomes_iq, xgb_preds_iq)
xgb_mdae_iq = median_absolute_error(test_outcomes_iq, xgb_preds_iq)
xgb_evs_iq = explained_variance_score(test_outcomes_iq, xgb_preds_iq)
print(xgb_mae_iq)
plt.subplots(figsize=(11,7))
plt.title('Actual vs Predicted Dengue Cases (Iquitos, Peru)')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(test_outcomes_iq, xgb_preds_iq, edgecolors = '#1e1e1e', color='#bae1ff', alpha=0.8)
plt.plot([0, 80], [0, 80], 'red', alpha=0.7)
xgb_preds_week_iq = pd.DataFrame(test_features_iq)
xgb_preds_week_iq['Actual'] = test_outcomes_iq.values
xgb_preds_week_iq['Predicted'] = xgb_preds_iq
xgb_preds_week_iq = pd.merge(data_iq, xgb_preds_week_iq, left_index = True, right_index = True)
plot_d = xgb_preds_week_iq.assign(residual=xgb_preds_week_iq.Actual - xgb_preds_week_iq .Predicted)
sns.factorplot(x="year", y="residual", hue="city", size=8, aspect=3,data=plot_d)
plt.title("Residuals")
plot_d = plot_d.melt(id_vars=['city', 'year', 'weekofyear_x', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
'reanalysis_precip_amt_kg_per_m2',
'reanalysis_relative_humidity_percent', 'reanalysis_tdtr_k',
'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
'station_min_temp_c', 'station_precip_mm', 'total_cases', 'month',
'odd_year', 'ndvi_mean', 'ndvi_mean_rolling_avg',
'ndvi_ne_rolling_avg_x', 'ndvi_nw_rolling_avg_x',
'ndvi_se_rolling_avg_x', 'ndvi_sw_rolling_avg',
'precipitation_amt_mm_rolling_avg', 'reanalysis_air_temp_k_rolling_avg',
'reanalysis_avg_temp_k_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg',
'reanalysis_max_air_temp_k_rolling_avg',
'reanalysis_min_air_temp_k_rolling_avg',
'reanalysis_precip_amt_kg_per_m2_rolling_avg_x',
'reanalysis_relative_humidity_percent_rolling_avg',
'reanalysis_tdtr_k_rolling_avg', 'station_avg_temp_c_rolling_avg',
'station_diur_temp_rng_c_rolling_avg',
'station_max_temp_c_rolling_avg_x', 'station_min_temp_c_rolling_avg_x',
'station_precip_mm_rolling_avg', 'ndvi_se_rolling_avg_y',
'ndvi_ne_rolling_avg_y', 'ndvi_nw_rolling_avg_y', 'weekofyear_y',
'station_max_temp_c_rolling_avg_y',
'reanalysis_precip_amt_kg_per_m2_rolling_avg_y',
'station_min_temp_c_rolling_avg_y','residual'], var_name='type')
sns.factorplot(x='week_start_date', y="value", hue="type", data=plot_d, size = 6, aspect =3)
plt.title("Actual vs Predicted for each year")
xgb_preds_final_iq = xgb_reg.fit(train_features_iq, train_outcomes_iq).predict(
data_test_iq_n[['ndvi_se_rolling_avg',
'ndvi_ne_rolling_avg',
'ndvi_nw_rolling_avg',
'weekofyear',
'station_max_temp_c_rolling_avg',
'reanalysis_precip_amt_kg_per_m2_rolling_avg',
'station_min_temp_c_rolling_avg']]
)
submission_iq = data_test_iq[['city', 'year', 'weekofyear']].copy()
submission_iq['total_cases'] = np.round(xgb_preds_final_iq).astype(int)
frames = [submission_sj, submission_iq]
submission = pd.concat(frames)
submission.to_csv('xgb.csv', index = False)
After reading this article we were inspired to try using decision trees.
We wanted to see whether a simple decision tree gave us a lower mean absolute error and to validate the idea of using XGBoost. Hence we used it.
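As a hedged illustration on synthetic data of the one hyperparameter we grid-search below, max_depth controls how closely a single tree can fit the training weeks (deeper trees fit more closely but overfit more easily); all values here are made up.
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Synthetic data, illustrative only.
rng = np.random.RandomState(0)
X_demo = rng.rand(300, 3)
y_demo = 20 * X_demo[:, 0] + rng.randn(300)
X_tr, X_te, y_tr, y_te = X_demo[:200], X_demo[200:], y_demo[:200], y_demo[200:]

for depth in (5, 15, 25):
    tree = DecisionTreeRegressor(max_depth=depth).fit(X_tr, y_tr)
    print(depth, mean_absolute_error(y_te, tree.predict(X_te)))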
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
for n in range(1,20,1):
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
rfe = RFE(DecisionTreeRegressor(), n)
fit = rfe.fit(test_features_sj, test_outcomes_sj)
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj[data_sj_n.columns[fit.ranking_ == 1]],
data_sj['total_cases'],
test_size = 0.3
)
dt_reg = DecisionTreeRegressor()
dt_preds_sj = dt_reg.fit(train_features_sj, train_outcomes_sj).predict(test_features_sj)
print('Features:', n, ', MAE:', mean_absolute_error(test_outcomes_sj, dt_preds_sj))
model = DecisionTreeRegressor()
feature_imp = pd.DataFrame({'Feature' : [], 'Importance' : []})
for i in range(1,20):
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
for i in range(1,20):
model.fit(train_features_sj, train_outcomes_sj)
imp = pd.DataFrame({'Feature': data_sj_n.columns, 'Importance':model.feature_importances_})
frames = [feature_imp, imp]
feature_imp = pd.concat(frames).reset_index(drop = True)
feature_imp = feature_imp.groupby(['Feature'])['Importance'].mean().to_frame(name = 'Importance').reset_index()
feature_imp = feature_imp.set_index('Feature')
feature_imp.sort_values(by='Importance').plot(kind='barh', title='Feature Importance - San Juan, Puerto Rico', grid = True, legend = False, figsize=(10,10))
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n[['precipitation_amt_mm_rolling_avg',
'weekofyear',
'station_min_temp_c_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg',
'station_precip_mm_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg',
'ndvi_nw_rolling_avg']],
data_sj['total_cases'],
test_size = 0.3
)
params = {'max_depth':range(5, 30)}
folds = KFold(n_splits = 10, shuffle=True)
grid_search = GridSearchCV(DecisionTreeRegressor(), param_grid=params, cv=folds, scoring='neg_mean_absolute_error')
dt_preds_sj = grid_search.fit(train_features_sj, train_outcomes_sj).predict(test_features_sj)
dt_mae_sj = mean_absolute_error(test_outcomes_sj, dt_preds_sj)
dt_mdae_sj = median_absolute_error(test_outcomes_sj, dt_preds_sj)
dt_evs_sj = explained_variance_score(test_outcomes_sj, dt_preds_sj)
print(dt_mae_sj)
plt.subplots(figsize=(11,7))
plt.title('Actual vs Predicted Dengue Cases (San Juan, Puerto Rico)')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(test_outcomes_sj, dt_preds_sj, edgecolors = '#1e1e1e', color='#bae1ff', alpha=0.8)
plt.plot([0, 400], [0, 400], 'red', alpha=0.7)
dt_preds_week_sj = pd.DataFrame(test_features_sj)
dt_preds_week_sj['Actual'] = test_outcomes_sj.values
dt_preds_week_sj['Predicted'] = dt_preds_sj
dt_preds_week_sj = pd.merge(data_sj, dt_preds_week_sj, left_index = True, right_index = True)
plot_d = dt_preds_week_sj.assign(residual=dt_preds_week_sj.Actual - dt_preds_week_sj .Predicted)
sns.factorplot(x="year", y="residual", hue="city", size=8, aspect=3,data=plot_d)
plt.title("Residuals")
plot_d = plot_d.melt(id_vars=['city', 'year', 'weekofyear_x', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
'reanalysis_precip_amt_kg_per_m2',
'reanalysis_relative_humidity_percent', 'reanalysis_tdtr_k',
'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
'station_min_temp_c', 'station_precip_mm', 'total_cases', 'month',
'odd_year', 'ndvi_mean', 'ndvi_mean_rolling_avg', 'ndvi_ne_rolling_avg',
'ndvi_nw_rolling_avg_x', 'ndvi_se_rolling_avg', 'ndvi_sw_rolling_avg',
'precipitation_amt_mm_rolling_avg_x',
'reanalysis_air_temp_k_rolling_avg',
'reanalysis_avg_temp_k_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg_x',
'reanalysis_max_air_temp_k_rolling_avg',
'reanalysis_min_air_temp_k_rolling_avg',
'reanalysis_precip_amt_kg_per_m2_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg_x',
'reanalysis_tdtr_k_rolling_avg', 'station_avg_temp_c_rolling_avg',
'station_diur_temp_rng_c_rolling_avg', 'station_max_temp_c_rolling_avg',
'station_min_temp_c_rolling_avg_x', 'station_precip_mm_rolling_avg_x',
'precipitation_amt_mm_rolling_avg_y', 'weekofyear_y',
'station_min_temp_c_rolling_avg_y',
'reanalysis_dew_point_temp_k_rolling_avg_y',
'station_precip_mm_rolling_avg_y',
'reanalysis_relative_humidity_percent_rolling_avg_y',
'ndvi_nw_rolling_avg_y','residual'], var_name='type')
sns.factorplot(x='week_start_date', y="value", hue="type", data=plot_d, size = 6, aspect =3)
plt.title("Actual vs Predicted for each year")
dt_preds_final_sj = dt_reg.fit(train_features_sj, train_outcomes_sj).predict(
data_test_sj_n[['precipitation_amt_mm_rolling_avg',
'weekofyear',
'station_min_temp_c_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg',
'station_precip_mm_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg',
'ndvi_nw_rolling_avg']]
)
submission_sj = data_test_sj[['city', 'year', 'weekofyear']].copy()
submission_sj['total_cases'] = np.round(dt_preds_final_sj).astype(int)
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n,
data_iq['total_cases'],
test_size = 0.3
)
for n in range(1,20,1):
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n,
data_iq['total_cases'],
test_size = 0.3
)
rfe = RFE(DecisionTreeRegressor(), n)
fit = rfe.fit(test_features_iq, test_outcomes_iq)
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq[data_iq_n.columns[fit.ranking_ == 1]],
data_iq['total_cases'],
test_size = 0.3
)
dt_reg = DecisionTreeRegressor()
dt_preds_iq = dt_reg.fit(train_features_iq, train_outcomes_iq).predict(test_features_iq)
print('Features:', n, ', MAE:', mean_absolute_error(test_outcomes_iq, dt_preds_iq))
model = DecisionTreeRegressor()
feature_imp = pd.DataFrame({'Feature' : [], 'Importance' : []})
for i in range(1,20):
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n,
data_iq['total_cases'],
test_size = 0.3
)
for i in range(1,20):
model.fit(train_features_iq, train_outcomes_iq)
imp = pd.DataFrame({'Feature': data_iq_n.columns, 'Importance':model.feature_importances_})
frames = [feature_imp, imp]
feature_imp = pd.concat(frames).reset_index(drop = True)
feature_imp = feature_imp.groupby(['Feature'])['Importance'].mean().to_frame(name = 'Importance').reset_index()
feature_imp = feature_imp.set_index('Feature')
feature_imp.sort_values(by='Importance').plot(kind='barh', title='Feature Importance - Iquitos', grid = True, legend = False, figsize=(10,10))
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n[['ndvi_se_rolling_avg',
'reanalysis_avg_temp_k_rolling_avg',
'weekofyear',
'station_avg_temp_c_rolling_avg',
'reanalysis_max_air_temp_k_rolling_avg',
'odd_year',
'station_max_temp_c_rolling_avg',
'reanalysis_min_air_temp_k_rolling_avg']],
data_iq['total_cases'],
test_size = 0.3
)
params = {'max_depth':range(5, 30)}
folds = KFold(n_splits = 10, shuffle=True)
grid_search = GridSearchCV(DecisionTreeRegressor(), param_grid=params, cv=folds, scoring='neg_mean_absolute_error')
dt_preds_iq = grid_search.fit(train_features_iq, train_outcomes_iq).predict(test_features_iq)
dt_mae_iq = mean_absolute_error(test_outcomes_iq, dt_preds_iq)
dt_mdae_iq = median_absolute_error(test_outcomes_iq, dt_preds_iq)
dt_evs_iq = explained_variance_score(test_outcomes_iq, dt_preds_iq)
print(dt_mae_iq)
plt.subplots(figsize=(11,7))
plt.title('Actual vs Predicted Dengue Cases (Iquitos, Peru)')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(test_outcomes_iq, dt_preds_iq, edgecolors = '#1e1e1e', color='#bae1ff', alpha=0.8)
plt.plot([0, 80], [0, 80], 'red', alpha=0.7)
dt_preds_week_iq = pd.DataFrame(test_features_iq)
dt_preds_week_iq['Actual'] = test_outcomes_iq.values
dt_preds_week_iq['Predicted'] = dt_preds_iq
dt_preds_week_iq = pd.merge(data_iq, dt_preds_week_iq, left_index = True, right_index = True)
plot_d = dt_preds_week_iq.assign(residual=dt_preds_week_iq.Actual - dt_preds_week_iq .Predicted)
sns.factorplot(x="year", y="residual", hue="city", size=8, aspect=3,data=plot_d)
plt.title("Residuals")
plot_d = plot_d.melt(id_vars=['city', 'year', 'weekofyear_x', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
'reanalysis_precip_amt_kg_per_m2',
'reanalysis_relative_humidity_percent', 'reanalysis_tdtr_k',
'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
'station_min_temp_c', 'station_precip_mm', 'total_cases', 'month',
'odd_year_x', 'ndvi_mean', 'ndvi_mean_rolling_avg',
'ndvi_ne_rolling_avg', 'ndvi_nw_rolling_avg', 'ndvi_se_rolling_avg_x',
'ndvi_sw_rolling_avg', 'precipitation_amt_mm_rolling_avg',
'reanalysis_air_temp_k_rolling_avg',
'reanalysis_avg_temp_k_rolling_avg_x',
'reanalysis_dew_point_temp_k_rolling_avg',
'reanalysis_max_air_temp_k_rolling_avg_x',
'reanalysis_min_air_temp_k_rolling_avg_x',
'reanalysis_precip_amt_kg_per_m2_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg',
'reanalysis_tdtr_k_rolling_avg', 'station_avg_temp_c_rolling_avg_x',
'station_diur_temp_rng_c_rolling_avg',
'station_max_temp_c_rolling_avg_x', 'station_min_temp_c_rolling_avg',
'station_precip_mm_rolling_avg', 'ndvi_se_rolling_avg_y',
'reanalysis_avg_temp_k_rolling_avg_y', 'weekofyear_y',
'station_avg_temp_c_rolling_avg_y',
'reanalysis_max_air_temp_k_rolling_avg_y', 'odd_year_y',
'station_max_temp_c_rolling_avg_y',
'reanalysis_min_air_temp_k_rolling_avg_y','residual'], var_name='type')
sns.factorplot(x='week_start_date', y="value", hue="type", data=plot_d, size = 6, aspect =3)
plt.title("Actual vs Predicted for each year")
dt_preds_final_iq = dt_reg.fit(train_features_iq, train_outcomes_iq).predict(
data_test_iq_n[['ndvi_se_rolling_avg',
'reanalysis_avg_temp_k_rolling_avg',
'weekofyear',
'station_avg_temp_c_rolling_avg',
'reanalysis_max_air_temp_k_rolling_avg',
'odd_year',
'station_max_temp_c_rolling_avg',
'reanalysis_min_air_temp_k_rolling_avg']]
)
submission_iq = data_test_iq[['city', 'year', 'weekofyear']].copy()
submission_iq['total_cases'] = np.round(dt_preds_final_iq).astype(int)
frames = [submission_sj, submission_iq]
submission = pd.concat(frames)
submission.to_csv('randomforest.csv', index = False)
This article helped introduce us to how random forest regression is used for time series forecasting. Since our task is forecasting the total cases, we decided to try using random forest due to its advantages over a simple decision tree.
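A hedged toy comparison of that idea on synthetic data: a forest averages many randomized trees (n_estimators of them, the parameter we grid-search below), which usually smooths out the variance of a single deep tree. The data here is invented.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Synthetic data, illustrative only.
rng = np.random.RandomState(0)
X_demo = rng.rand(300, 4)
y_demo = 10 * X_demo[:, 0] + rng.randn(300)
X_tr, X_te, y_tr, y_te = X_demo[:200], X_demo[200:], y_demo[:200], y_demo[200:]

tree = DecisionTreeRegressor().fit(X_tr, y_tr)
forest = RandomForestRegressor(n_estimators=25).fit(X_tr, y_tr)
print('single tree MAE:', mean_absolute_error(y_te, tree.predict(X_te)))
print('forest MAE:', mean_absolute_error(y_te, forest.predict(X_te)))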
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
from sklearn.feature_selection import RFE
for n in range(1,20,1):
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
rfe = RFE(RandomForestRegressor(), n)
fit = rfe.fit(test_features_sj, test_outcomes_sj)
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj[data_sj_n.columns[fit.ranking_ == 1]],
data_sj['total_cases'],
test_size = 0.3
)
rf_reg = RandomForestRegressor()
rf_preds_sj = rf_reg.fit(train_features_sj, train_outcomes_sj).predict(test_features_sj)
print('Features:', n, ', MAE:', mean_absolute_error(test_outcomes_sj, rf_preds_sj))
model = RandomForestRegressor()
feature_imp = pd.DataFrame({'Feature' : [], 'Importance' : []})
for i in range(1,10):
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n,
data_sj['total_cases'],
test_size = 0.3
)
for i in range(1,10):
model.fit(train_features_sj, train_outcomes_sj)
imp = pd.DataFrame({'Feature': data_sj_n.columns, 'Importance':model.feature_importances_})
frames = [feature_imp, imp]
feature_imp = pd.concat(frames).reset_index(drop = True)
feature_imp = feature_imp.groupby(['Feature'])['Importance'].mean().to_frame(name = 'Importance').reset_index()
feature_imp = feature_imp.set_index('Feature')
feature_imp.sort_values(by='Importance').plot(kind='barh', title='Feature Importance - San Juan, Puerto Rico', grid = True, legend = False, figsize=(10,10))
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
data_sj_n[['precipitation_amt_mm_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg',
'month',
'ndvi_nw_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg',
'ndvi_se_rolling_avg']],
data_sj['total_cases'],
test_size = 0.3
)
params = {'n_estimators':range(5, 30)}
folds = KFold(n_splits = 10, shuffle=True)
grid_search = GridSearchCV(RandomForestRegressor(), param_grid=params, cv=folds, scoring='neg_mean_absolute_error')
rf_preds_sj = grid_search.fit(train_features_sj, train_outcomes_sj).predict(test_features_sj)
rf_mae_sj = mean_absolute_error(test_outcomes_sj, rf_preds_sj)
rf_mdae_sj = median_absolute_error(test_outcomes_sj, rf_preds_sj)
rf_evs_sj = explained_variance_score(test_outcomes_sj, rf_preds_sj)
print(rf_mae_sj)
plt.subplots(figsize=(11,7))
plt.title('Actual vs Predicted Dengue Cases (San Juan, Puerto Rico)')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(test_outcomes_sj, rf_preds_sj, edgecolors = '#1e1e1e', color='#bae1ff', alpha=0.8)
plt.plot([0, 400], [0, 400], 'red', alpha=0.7)
rf_preds_week_sj = pd.DataFrame(test_features_sj)
rf_preds_week_sj['Actual'] = test_outcomes_sj.values
rf_preds_week_sj['Predicted'] = rf_preds_sj
rf_preds_week_sj = pd.merge(data_sj, rf_preds_week_sj, left_index = True, right_index = True)
plot_d = rf_preds_week_sj.assign(residual=rf_preds_week_sj.Actual - rf_preds_week_sj .Predicted)
sns.factorplot(x="year", y="residual", hue="city", size=8, aspect=3,data=plot_d)
plt.title("Residuals")
plot_d = plot_d.melt(id_vars=['city', 'year', 'weekofyear', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
'reanalysis_precip_amt_kg_per_m2',
'reanalysis_relative_humidity_percent', 'reanalysis_tdtr_k',
'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
'station_min_temp_c', 'station_precip_mm', 'total_cases', 'month_x',
'odd_year', 'ndvi_mean', 'ndvi_mean_rolling_avg', 'ndvi_ne_rolling_avg',
'ndvi_nw_rolling_avg_x', 'ndvi_se_rolling_avg_x', 'ndvi_sw_rolling_avg',
'precipitation_amt_mm_rolling_avg_x',
'reanalysis_air_temp_k_rolling_avg',
'reanalysis_avg_temp_k_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg_x',
'reanalysis_max_air_temp_k_rolling_avg',
'reanalysis_min_air_temp_k_rolling_avg',
'reanalysis_precip_amt_kg_per_m2_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg_x',
'reanalysis_tdtr_k_rolling_avg', 'station_avg_temp_c_rolling_avg',
'station_diur_temp_rng_c_rolling_avg', 'station_max_temp_c_rolling_avg',
'station_min_temp_c_rolling_avg', 'station_precip_mm_rolling_avg',
'precipitation_amt_mm_rolling_avg_y',
'reanalysis_dew_point_temp_k_rolling_avg_y', 'month_y',
'ndvi_nw_rolling_avg_y',
'reanalysis_relative_humidity_percent_rolling_avg_y',
'ndvi_se_rolling_avg_y','residual'], var_name='type')
sns.factorplot(x='week_start_date', y="value", hue="type", data=plot_d, size = 6, aspect =3)
plt.title("Actual vs Predicted for each year")
rf_preds_final_sj = rf_reg.fit(train_features_sj, train_outcomes_sj).predict(
data_test_sj_n[['precipitation_amt_mm_rolling_avg',
'reanalysis_dew_point_temp_k_rolling_avg',
'month',
'ndvi_nw_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg',
'ndvi_se_rolling_avg']]
)
submission_sj = data_test_sj[['city', 'year', 'weekofyear']].copy()
submission_sj['total_cases'] = np.round(rf_preds_final_sj).astype(int)
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n,
data_iq['total_cases'],
test_size = 0.3
)
from sklearn.feature_selection import RFE
# Recursive feature elimination for Iquitos: for each n, rank the features with
# a random forest, keep the n top-ranked columns, retrain, and report the MAE.
for n in range(1, 20, 1):
    train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
        data_iq_n,
        data_iq['total_cases'],
        test_size = 0.3
    )
    rfe = RFE(RandomForestRegressor(), n)
    fit = rfe.fit(test_features_iq, test_outcomes_iq)
    train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
        data_iq[data_iq_n.columns[fit.ranking_ == 1]],
        data_iq['total_cases'],
        test_size = 0.3
    )
    rf_reg = RandomForestRegressor()
    rf_preds_iq = rf_reg.fit(train_features_iq, train_outcomes_iq).predict(test_features_iq)
    print('Features:', n, ', MAE:', mean_absolute_error(test_outcomes_iq, rf_preds_iq))
model = RandomForestRegressor()
feature_imp = pd.DataFrame({'Feature' : [], 'Importance' : []})
# Average the forest's feature importances over several random splits and
# repeated fits, as we did for San Juan.
for i in range(1, 10):
    train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
        data_iq_n,
        data_iq['total_cases'],
        test_size = 0.3
    )
    for i in range(1, 10):
        model.fit(train_features_iq, train_outcomes_iq)
        imp = pd.DataFrame({'Feature': data_iq_n.columns, 'Importance': model.feature_importances_})
        frames = [feature_imp, imp]
        feature_imp = pd.concat(frames).reset_index(drop = True)
feature_imp = feature_imp.groupby(['Feature'])['Importance'].mean().to_frame(name = 'Importance').reset_index()
feature_imp = feature_imp.set_index('Feature')
feature_imp.sort_values(by='Importance').plot(kind='barh', title='Feature Importance - Iquitos, Peru', grid = True, legend = False, color = '#9b0a0a', figsize=(10,10))
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
data_iq_n[['ndvi_se_rolling_avg',
'reanalysis_avg_temp_k_rolling_avg',
'weekofyear',
'station_avg_temp_c_rolling_avg',
'reanalysis_max_air_temp_k_rolling_avg',
'odd_year',
'station_max_temp_c_rolling_avg',
'reanalysis_min_air_temp_k_rolling_avg']],
data_iq['total_cases'],
test_size = 0.3
)
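# Tune the number of trees with 10-fold cross-validation (negative MAE scoring), then predict on the held-out split.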
params = {'n_estimators':range(5, 30)}
folds = KFold(n_splits = 10, shuffle=True)
grid_search = GridSearchCV(RandomForestRegressor(), param_grid=params, cv=folds, scoring='neg_mean_absolute_error')
rf_preds_iq = grid_search.fit(train_features_iq, train_outcomes_iq).predict(test_features_iq)
rf_mae_iq = mean_absolute_error(test_outcomes_iq, rf_preds_iq)
rf_mdae_iq = median_absolute_error(test_outcomes_iq, rf_preds_iq)
rf_evs_iq = explained_variance_score(test_outcomes_iq, rf_preds_iq)
print(rf_mae_iq)
plt.subplots(figsize=(11,7))
plt.title('Actual vs Predicted Dengue Cases (Iquitos, Peru)')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.scatter(test_outcomes_iq, rf_preds_iq, edgecolors = '#1e1e1e', color='#bae1ff', alpha=0.8)
plt.plot([0, 80], [0, 80], 'red', alpha=0.7)
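# Attach the actual and predicted cases to the held-out weeks and merge back with the full Iquitos data so we can inspect residuals over time.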
rf_preds_week_iq = pd.DataFrame(test_features_iq)
rf_preds_week_iq['Actual'] = test_outcomes_iq.values
rf_preds_week_iq['Predicted'] = rf_preds_iq
rf_preds_week_iq = pd.merge(data_iq, rf_preds_week_iq, left_index = True, right_index = True)
plot_d = rf_preds_week_iq.assign(residual=rf_preds_week_iq.Actual - rf_preds_week_iq.Predicted)
sns.factorplot(x="year", y="residual", hue="city", size=8, aspect=3,data=plot_d)
plt.title("Residuals")
plot_d = plot_d.melt(id_vars=['city', 'year', 'weekofyear_x', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
'reanalysis_precip_amt_kg_per_m2',
'reanalysis_relative_humidity_percent', 'reanalysis_tdtr_k',
'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
'station_min_temp_c', 'station_precip_mm', 'total_cases', 'month',
'odd_year_x', 'ndvi_mean', 'ndvi_mean_rolling_avg',
'ndvi_ne_rolling_avg', 'ndvi_nw_rolling_avg', 'ndvi_se_rolling_avg_x',
'ndvi_sw_rolling_avg', 'precipitation_amt_mm_rolling_avg',
'reanalysis_air_temp_k_rolling_avg',
'reanalysis_avg_temp_k_rolling_avg_x',
'reanalysis_dew_point_temp_k_rolling_avg',
'reanalysis_max_air_temp_k_rolling_avg_x',
'reanalysis_min_air_temp_k_rolling_avg_x',
'reanalysis_precip_amt_kg_per_m2_rolling_avg',
'reanalysis_relative_humidity_percent_rolling_avg',
'reanalysis_tdtr_k_rolling_avg', 'station_avg_temp_c_rolling_avg_x',
'station_diur_temp_rng_c_rolling_avg',
'station_max_temp_c_rolling_avg_x', 'station_min_temp_c_rolling_avg',
'station_precip_mm_rolling_avg', 'ndvi_se_rolling_avg_y',
'reanalysis_avg_temp_k_rolling_avg_y', 'weekofyear_y',
'station_avg_temp_c_rolling_avg_y',
'reanalysis_max_air_temp_k_rolling_avg_y', 'odd_year_y',
'station_max_temp_c_rolling_avg_y',
'reanalysis_min_air_temp_k_rolling_avg_y','residual'], var_name='type')
sns.factorplot(x='week_start_date', y="value", hue="type", data=plot_d, size = 6, aspect =3)
plt.title("Actual vs Predicted for each year")
rf_preds_final_iq = rf_reg.fit(train_features_iq, train_outcomes_iq).predict(
data_test_iq_n[['ndvi_se_rolling_avg',
'reanalysis_avg_temp_k_rolling_avg',
'weekofyear',
'station_avg_temp_c_rolling_avg',
'reanalysis_max_air_temp_k_rolling_avg',
'odd_year',
'station_max_temp_c_rolling_avg',
'reanalysis_min_air_temp_k_rolling_avg']]
)
submission_iq = data_test_iq[['city', 'year', 'weekofyear']].copy()
submission_iq['total_cases'] = np.round(rf_preds_final_iq).astype(int)
frames = [submission_sj, submission_iq]
submission = pd.concat(frames)
submission.to_csv('rf.csv', index = False)
We will encode the categorical integer features using a one-hot (also known as one-of-K) scheme. One-hot encoding "binarizes" each category and includes the result as features for training the model: a 1 in a particular column tells the model the correct category for that row. In other words, we create an additional binary column for each category.
Referred to:
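As a tiny illustration of what this produces (toy values, not our dengue data), pd.get_dummies turns one categorical column into one binary column per category:

import pandas as pd

# Hypothetical mini-frame with a single categorical 'month' column
toy = pd.DataFrame({'month': [1, 2, 2, 3]})
print(pd.get_dummies(toy, prefix=['month'], columns=['month']))
# Each row ends up with a 1 in exactly one of month_1, month_2, month_3.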
sj = data_sj.copy()
sj_dum = pd.get_dummies(sj, prefix=['weekofyear', 'month'], columns=['weekofyear', 'month'])
sj_dum = sj_dum.drop(['city', 'week_start_date'], axis=1)
iq = data_iq.copy()
iq_dum = pd.get_dummies(iq, prefix=['weekofyear', 'month'], columns=['weekofyear', 'month'])
iq_dum = iq_dum.drop(['city', 'week_start_date'], axis=1)
test_sj = data_test_sj.copy()
test_sj_dum = pd.get_dummies(test_sj, prefix=['weekofyear', 'month'], columns=['weekofyear', 'month'])
test_sj_dum = test_sj_dum.drop(['city', 'week_start_date'], axis=1)
train_features_sj, test_features_sj, train_outcomes_sj, test_outcomes_sj = train_test_split(
sj_dum,
data_sj['total_cases'],
test_size = 0.3
)
reg = Pipeline([
('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))),
('classification', KNeighborsRegressor(n_neighbors = 4, weights = 'distance'))
])
reg = reg.fit(train_features_sj, train_outcomes_sj).predict(test_features_sj)
print(mean_absolute_error(test_outcomes_sj, reg))
train_features_iq, test_features_iq, train_outcomes_iq, test_outcomes_iq = train_test_split(
iq_dum,
data_iq['total_cases'],
test_size = 0.3
)
reg = Pipeline([
('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))),
('classification', KNeighborsRegressor(n_neighbors = 4, weights = 'distance'))
])
reg = reg.fit(train_features_iq, train_outcomes_iq).predict(test_features_iq)
print(mean_absolute_error(test_outcomes_iq, reg))
algs_mae = pd.DataFrame({'San Juan': [knr_mae_sj, xgb_mae_sj, dt_mae_sj, rf_mae_sj],
                         'Iquitos': [knr_mae_iq, xgb_mae_iq, dt_mae_iq, rf_mae_iq]},
                        index=['KNN', 'XGBoost', 'DecisionTree', 'RandomForest'])
algs_mdae = pd.DataFrame({'San Juan': [knr_mdae_sj, xgb_mdae_sj, dt_mdae_sj, rf_mdae_sj],
                          'Iquitos': [knr_mdae_iq, xgb_mdae_iq, dt_mdae_iq, rf_mdae_iq]},
                         index=['KNN', 'XGBoost', 'DecisionTree', 'RandomForest'])
algs_evs = pd.DataFrame({'San Juan': [knr_evs_sj, xgb_evs_sj, dt_evs_sj, rf_evs_sj],
                         'Iquitos': [knr_evs_iq, xgb_evs_iq, dt_evs_iq, rf_evs_iq]},
                        index=['KNN', 'XGBoost', 'DecisionTree', 'RandomForest'])
print("Mean Absolute Error")
print("XG Boost has the lowest MAE")
algs_mae
print("Median Absolute Error")
print("Random Forest has the lowest MDAE")
algs_mdae
print("Explained Variance Score")
print("Decision Tree has the best Explained Variance Score")
algs_evs
Let us visualize this and see how the models compare to each other.
plt.subplots(figsize=(6,4))
plt.plot(algs_mae)
plt.title('Mean Absolute Error Comparison')
plt.xlabel('Algorithm')
plt.ylabel('Mean Abs Error')
plt.subplots(figsize=(6,4))
plt.plot(algs_mdae)
plt.title('Median Absolute Error Comparison')
plt.xlabel('Algorithm')
plt.ylabel('Median Abs Error')
plt.subplots(figsize=(6,4))
plt.plot(algs_evs)
plt.title('Explained Variance Score Comparison')
plt.xlabel('Algorithm')
plt.ylabel('Explained Variance Score')
Thank You
Best Model: KNN (Score: 19.4533)