from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import plotly.express as px
import plotly.io as pio
= "plotly_white"
pio.templates.default
#check against other studies, check p values
Final Project: Urban Air Pollution Predicted by Income and Racial Demographics
import warnings
'ignore')
warnings.filterwarnings(
%load_ext autoreload
%autoreload 2
Part One: Cleaning Data
from dataCleaning import DatasetClass
= DatasetClass()
ds
= pd.read_csv("pollution_income_race.csv")
dataset
= ds.train_test_data(dataset)
X_train, X_test, y_train, y_test
= y_test.mean()
base_rate
base_rate
0.8605200945626478
X_train
State Code | County Code | Date Local | Median Household Income | Total Pop | Total Male % | Total Female % | White Alone (M) % | White Alone (F) % | Black Alone (M) % | Black Alone (F) % | Am Indian+AK Native Alone (M) % | Am Indian+AK Native Alone (F) % | Asian Alone (M) % | Asian Alone (F) % | Nat. HI and PI (M) % | Nat. HI and PI (F) % | TOM_MALE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
66 | 6 | 1 | 2002 | 113518.0 | 10021506 | 0.495390 | 0.504610 | 0.244432 | 0.238668 | 0.051896 | 0.056041 | 0.005613 | 0.005458 | 0.161241 | 0.171451 | 0.004578 | 0.004916 | 0.027631 |
835 | 32 | 3 | 2001 | 62496.0 | 13662646 | 0.500808 | 0.499192 | 0.349270 | 0.334776 | 0.065935 | 0.068644 | 0.006304 | 0.006135 | 0.048658 | 0.058518 | 0.004651 | 0.004758 | 0.025988 |
67 | 6 | 1 | 2003 | 113518.0 | 10021506 | 0.495390 | 0.504610 | 0.244432 | 0.238668 | 0.051896 | 0.056041 | 0.005613 | 0.005458 | 0.161241 | 0.171451 | 0.004578 | 0.004916 | 0.027631 |
447 | 6 | 95 | 2001 | 83678.0 | 2716962 | 0.501149 | 0.498851 | 0.300509 | 0.288954 | 0.074948 | 0.072824 | 0.006718 | 0.006442 | 0.076265 | 0.088328 | 0.005441 | 0.005261 | 0.037269 |
217 | 6 | 37 | 2012 | 75624.0 | 59665436 | 0.495714 | 0.504286 | 0.353743 | 0.349500 | 0.042930 | 0.046823 | 0.007591 | 0.007220 | 0.073381 | 0.082331 | 0.001828 | 0.001838 | 0.016241 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1133 | 42 | 69 | 2002 | 56911.0 | 1294150 | 0.489517 | 0.510483 | 0.437606 | 0.461313 | 0.024152 | 0.021254 | 0.001740 | 0.001697 | 0.016049 | 0.016100 | 0.000323 | 0.000383 | 0.009646 |
1184 | 42 | 101 | 2005 | 55102.0 | 9562106 | 0.475856 | 0.524144 | 0.218985 | 0.226435 | 0.200090 | 0.235551 | 0.004473 | 0.004774 | 0.037813 | 0.041240 | 0.000867 | 0.000898 | 0.013629 |
1219 | 42 | 133 | 2000 | 71655.0 | 2743046 | 0.497499 | 0.502501 | 0.438321 | 0.443527 | 0.037588 | 0.036162 | 0.002079 | 0.001843 | 0.007190 | 0.008613 | 0.000395 | 0.000416 | 0.011926 |
924 | 36 | 103 | 2002 | 109084.0 | 9152726 | 0.495513 | 0.504487 | 0.416442 | 0.422438 | 0.043497 | 0.045976 | 0.003550 | 0.003241 | 0.021259 | 0.022082 | 0.000557 | 0.000519 | 0.010207 |
1215 | 42 | 129 | 2005 | 59349.0 | 2123436 | 0.493617 | 0.506383 | 0.464779 | 0.479903 | 0.014620 | 0.012278 | 0.000807 | 0.000656 | 0.004862 | 0.005320 | 0.000122 | 0.000111 | 0.008428 |
858 rows × 18 columns
Part Two: Determining Feature Importance
Feature importance determines how much each feature impacts the AQI when increased by one factor with all other features held constant. What we find below determining feature importance in our data set is that each feature has a relatively low impact on AQI. As we continue training models, however, it is clear that these features together have a significant impact on AQI as they can predict AQI outcome with remarkable accuracy.
from sklearn.ensemble import RandomForestClassifier
def feature_importance(X_train, y_train):
"""
This function determines the feature importance of each feature in the dataset and sorts them
by importance using Random Forest and Gini import.
"""
= RandomForestClassifier()
RF
RF.fit(X_train, y_train)
#get importance
= pd.DataFrame(data =
importances 'Attribute': X_train.columns,
{'Importance': RF.feature_importances_
})
'Importance')
importances.sort_values(
return importances
= feature_importance(X_train, y_train) importances
Below we see the results of the first 10 most important features; Date Local is the feature with the most impact on the AQI, followed by Median Household Income, State Code, Total Male % and then Total Female %. However, it is important to note that most feature importances are below approximately .005.
#plotting the importances in two batches
#first batch
= importances[0:10]
importances10 =importances10['Attribute'], width=importances10['Importance'], color='#087E8B')
plt.barh(y"Top 10 Features by Importance in Predicting AQI") plt.title(
Text(0.5, 1.0, 'Top 10 Features by Importance in Predicting AQI')
Below we see the results of the final 8 features and their importances, all of which have very miniscule feature importance values. However, the next most important features all have to do with racial and ethnic demographics.
#second 8
= importances[10:18]
importances20 =importances20['Attribute'], width=importances20['Importance'], color='#087E8B')
plt.barh(y"11th-18th Most Important Features in Predicting AQI") plt.title(
Text(0.5, 1.0, '11th-18th Most Important Features in Predicting AQI')
Part Three: Looking at Accuracies of Different Models
Logistic Regression
from sklearn.linear_model import LogisticRegression
= LogisticRegression()
LR
LR.fit(X_train, y_train)
print("Training Accuracy:")
print(LR.score(X_train, y_train))
print("Testing Accuracy:")
print(LR.score(X_test, y_test))
Training Accuracy:
0.8916083916083916
Testing Accuracy:
0.8605200945626478
Logistic Regression with Polynomial Features: Including Coefficients
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
def poly_LR(degree, **kwargs):
= Pipeline([("poly", PolynomialFeatures(degree = degree)),
plr "LR", LogisticRegression(**kwargs))])
(return plr
= poly_LR(degree = 2, max_iter = 1000000)
plr
plr.fit(X_train, y_train)print("Training accuracy")
print(plr.score(X_train, y_train))
print("Testing accuracy")
print(plr.score(X_test, y_test))
# check p values for statistical significance
Training accuracy
0.8916083916083916
Testing accuracy
0.8605200945626478
print("coefficients of model:")
print()
print(plr.named_steps['LR'].coef_)
print("biases of model:")
print(plr.named_steps['LR'].intercept_)
coefficients of model:
[[ 1.11276049e-23 2.70910398e-22 8.13314987e-22 2.23670684e-20
8.26684574e-19 5.39082056e-17 5.49742047e-24 5.63018441e-24
4.02195097e-24 4.05052126e-24 7.24190483e-25 7.90383926e-25
1.21744623e-25 1.22740132e-25 4.17892468e-25 4.50642155e-25
1.62466672e-26 1.60424146e-26 1.95395261e-25 9.71212566e-21
2.50700099e-20 5.44539274e-19 1.86454545e-17 1.45297020e-15
1.33430573e-22 1.37479824e-22 9.84446728e-23 9.97724228e-23
1.94709841e-23 2.14792962e-23 3.46827391e-24 3.52705503e-24
7.51825966e-24 8.05513764e-24 3.01250434e-25 2.97962788e-25
4.22713223e-24 1.43266830e-19 1.63502000e-18 5.78155511e-17
4.15345765e-15 4.01429417e-22 4.11885570e-22 2.88552987e-22
2.89759055e-22 6.61993339e-23 7.31387587e-23 6.18024253e-24
6.09656620e-24 2.60848784e-23 2.81342168e-23 9.93723991e-25
9.77824669e-25 1.34182512e-23 4.49591622e-17 1.66155303e-15
1.08558323e-13 1.10500506e-20 1.13170178e-20 8.08389662e-21
8.14129212e-21 1.45638400e-21 1.58959645e-21 2.44551325e-22
2.46546663e-22 8.39891803e-22 9.05697398e-22 3.26519447e-23
3.22418227e-23 3.92674887e-22 6.61244745e-14 4.29264003e-12
4.08815969e-19 4.17868605e-19 2.95706079e-19 2.96932342e-19
5.12802683e-20 5.58143629e-20 7.55101537e-21 7.53315405e-21
3.78513109e-20 4.08672204e-20 1.33712125e-21 1.32296402e-21
1.50901738e-20 -5.28658863e-15 2.66261809e-17 2.72820247e-17
1.83622140e-17 1.83963709e-17 4.25957653e-18 4.72170679e-18
3.68358727e-19 3.65098769e-19 2.58360385e-18 2.72716918e-18
9.34460019e-20 9.33720194e-20 9.58981750e-19 2.71681158e-24
2.78060890e-24 1.99107317e-24 2.00439402e-24 3.53545372e-25
3.85272718e-25 6.03240610e-26 6.07964433e-26 2.07110314e-25
2.23334739e-25 8.07456117e-27 7.96878746e-27 9.66840967e-26
2.84957551e-24 2.03087780e-24 2.04612724e-24 3.70645111e-25
4.05111208e-25 6.14205623e-26 6.19436882e-26 2.10782154e-25
2.27307417e-25 8.17210604e-27 8.07362716e-27 9.87111639e-26
1.51557445e-24 1.52678208e-24 2.25714898e-25 2.42076340e-25
4.01434546e-26 4.02884157e-26 1.35841746e-25 1.46686648e-25
5.65389432e-27 5.54558986e-27 6.81447289e-26 1.53949921e-24
2.28391354e-25 2.45193625e-25 4.05067222e-26 4.06905004e-26
1.34738394e-25 1.45478580e-25 5.61679316e-27 5.51176904e-27
6.83586772e-26 8.57859440e-26 9.78042487e-26 4.18078631e-27
4.14999729e-27 2.56684716e-26 2.75557490e-26 8.56095847e-28
8.67239011e-28 1.13391764e-26 1.11912957e-25 4.30807725e-27
4.27218307e-27 2.79523239e-26 2.99685240e-26 9.02381923e-28
9.16969423e-28 1.22293461e-26 9.30551427e-27 9.66277486e-27
2.89481010e-27 3.11549022e-27 1.72160789e-28 1.58426833e-28
3.62733494e-27 1.00428990e-26 2.81972503e-27 3.03328287e-27
1.70767871e-28 1.56821478e-28 3.70476254e-27 3.27900863e-26
3.52132882e-26 9.65928808e-28 9.75541416e-28 8.94927094e-27
3.79065517e-26 1.04978000e-27 1.05971914e-27 9.71378340e-27
5.19867671e-29 5.17342386e-29 3.74494641e-28 5.17066089e-29
3.70256095e-28 4.24909098e-27]]
biases of model:
[1.11276049e-23]
Support Vector Machine
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
= SVC(gamma = "auto")
svc
svc.fit(X_train, y_train)
print("Training Accuracy:")
print(svc.score(X_train, y_train))
print("Test accuracy")
print(svc.score(X_test, y_test))
Training Accuracy:
0.9463869463869464
Test accuracy
0.9054373522458629
= -np.inf
best_score = np.inf
best_gamma
for gammas in 6**(np.arange(-4,4, dtype = float)):
= SVC(gamma = gammas)
svc = cross_val_score(svc, X_train, y_train,cv=8)
scores if scores.mean()>best_score:
= scores.mean()
best_score = gammas
best_gamma
print(best_score, best_gamma)
0.9114096573208723 0.027777777777777776
= SVC(gamma = .027777777777777776)
svc_model
svc_model.fit(X_train,y_train)
print("training accuracy")
print(svc_model.score(X_train,y_train))
print("testing")
print(svc_model.score(X_test,y_test))
training accuracy
0.9358974358974359
testing
0.9054373522458629
Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
= -np.inf
best_score = 0
maximum
for depth in range(2,25):
= DecisionTreeClassifier(max_depth = depth)
tree #scores = cross_val_score(tree, X_train[subset], y_train, cv = 11)
= cross_val_score(tree, X_train, y_train, cv = 9)
scores if scores.mean() > best_score:
= scores.mean()
best_score = depth
maximum print(best_score, maximum)
0.8996954191033139 5
= DecisionTreeClassifier(max_depth = 5)
tree
tree.fit(X_train,y_train)print("training")
print(tree.score(X_train,y_train))
print("testing")
print(tree.score(X_test,y_test))
training
0.9335664335664335
testing
0.900709219858156
Part 4: Visualization of Data
First, let’s explore our data a bit by visualizing the AQI, Median Income, and some racial demographics using maps. (might want this at the begninng of the blog post)
import pandas as pd
= pd.read_csv("pollution_income_race.csv")
df = df.dropna()
df
from geomapping import Mapping
= Mapping()
mp "AQI Total") mp.plot_df(df,
"Median Household Income") mp.plot_df(df,
"black_alone_percent") mp.plot_df(df,
Our testing data
Next, let’s visualize our binary predictions compared to the actual AQI (want to label these, zoom, etc)
#creating dataframe with test data, labels, and predictions
from copy import deepcopy
= deepcopy(X_test)
test_combined
"Predicted AQI Binary"] = svc_model.predict(X_test)
test_combined["Actual AQI Binary"] = y_test
test_combined[
X_test.head()#plotting predictions
"Predicted AQI Binary") mp.plot_df(test_combined,
# #plotting actual
"Actual AQI Binary") mp.plot_df(test_combined,
Part 5: Analysis
The overall accuracy of the model was 0.9054373522458629
To find the PPV of our model we first need to calculate the number of true positives and false positives. To do so, we compare the predicted labels to the test labels. If the predicted label is 1 and the acutal label is 1, then that is a true positive. If the predicted label is 1 and the true label is 0 that is a false positive.
= len((np.where((test_combined["Predicted AQI Binary"]==1) & (test_combined["Actual AQI Binary"]==1))[0]))
TP = len((np.where((test_combined["Predicted AQI Binary"]==1) & (test_combined["Actual AQI Binary"]==0))[0]))
FP = round((TP)/(TP+FP),4)
PPV print("The positive predictive value of this model is: " + str(PPV))
The positive predictive value of this model is: 0.9112
The FPR represents the rate at which a test label of 0 is incorrectly predicted to be 1; or, when a high risk county is predicted to be a low risk count.
The FNR represents the rate at which a test label of 1 is incorrectly predicted to be 0.
from sklearn.metrics import confusion_matrix
= confusion_matrix(test_combined["Actual AQI Binary"],test_combined["Predicted AQI Binary"], normalize = 'true')
conf_matrix print("FPR = " + str(conf_matrix[0][1]))
print("FNR = " + str(conf_matrix[1][0]))
FPR = 0.5932203389830508
FNR = 0.013736263736263736
By Group Metrics: Income
First we will divide the data set into high income and low income counties. Counties where the median household incomw is more than $73,125 (mean) will be marked as high income (1) and under this threshold will be marked a lower income (0)
"Median Household Income"] <=73125,"High Income"]=0
test_combined.loc[test_combined["Median Household Income"] >73125,"High Income"]=1
test_combined.loc[test_combined[= test_combined["High Income"] medIncome
print("The accuracy for high income counties is " + str((test_combined["Predicted AQI Binary"]==test_combined["Actual AQI Binary"])[medIncome==1].mean().round(4)))
print("The accuracy for low income counties is " + str((test_combined["Predicted AQI Binary"]==test_combined["Actual AQI Binary"])[medIncome==0].mean().round(4)))
The accuracy for high income counties is 0.9653
The accuracy for low income counties is 0.864
Now we will compare some of the accuracy metrics such as the false positive rate and the false negative rate. Ideally, the values for each group will be about equal.
= confusion_matrix(test_combined["Actual AQI Binary"][medIncome==1],test_combined["Predicted AQI Binary"][medIncome==1], normalize = 'true')
conf_matrix_high_norm print("FPR for high income counties = " + str(conf_matrix_high_norm[0][1].round(4) * 100))
print("FNR for high income counties = " + str(conf_matrix_high_norm[1][0].round(4) *100))
= confusion_matrix(test_combined["Actual AQI Binary"][medIncome==0],test_combined["Predicted AQI Binary"][medIncome==0], normalize = 'true')
conf_matrix_low_norm print("FPR for low income counties = " + str(conf_matrix_low_norm[0][1].round(4) *100))
print("FNR for low income counties = " + str(conf_matrix_low_norm[1][0].round(4) *100))
FPR for high income counties = 50.0
FNR for high income counties = 0.61
FPR for low income counties = 61.22
FNR for low income counties = 1.9900000000000002
Bias measures for income
Calibration means that the fraction of predicted counties to have a AQI binary score of 1 (low risk) is the same across all income groups. So we will calculate this metric for both income groups. This metric can be calculated by dividing the number of true positives by the total number of predicted positives.
= confusion_matrix(test_combined["Actual AQI Binary"][medIncome==1],test_combined["Predicted AQI Binary"][medIncome==1])[0][0]/(confusion_matrix(test_combined["Actual AQI Binary"][medIncome==1],test_combined["Predicted AQI Binary"][medIncome==1])[0].sum())
high_income_calibration = confusion_matrix(test_combined["Actual AQI Binary"][medIncome==0],test_combined["Predicted AQI Binary"][medIncome==0])[0][0]/(confusion_matrix(test_combined["Actual AQI Binary"][medIncome==0],test_combined["Predicted AQI Binary"][medIncome==0])[0].sum())
low_income_calibration
print("The percentage of high income counties predicted to have a low risk AQI who actually had a low risk AQI is " + str(high_income_calibration.round(4)*100))
print("The percentage of low income counties predicted to have a low risk AQI who actually had a low risk AQI is " + str(low_income_calibration.round(4)*100))
The percentage of high income counties predicted to have a low risk AQI who actually had a low risk AQI is 50.0
The percentage of low income counties predicted to have a low risk AQI who actually had a low risk AQI is 38.78
This model is not calibrated as the percentages are very different. There is a 11.2% difference
A model satisfies error rate balance if the false positive and false negative rates are equal across groups. Looking at the previously calculated FPR anf FNR, I would say that this model does not satisfy error rate balance. The FPR for high income counties is 50%, while the FPR for low income counties is 61.22%. The difference between the two rates is 10.1%. In general, the high FPR may stem from a lack of positive observation in the training data itself.
A model satisifes statistical parity if the proportion of counties classified as having a low risk AQI is the same for each group. So we compare the total number of predicted positives.
print("The proportion of low income counties classified as having a low risk AQI is " + str((confusion_matrix(test_combined["Actual AQI Binary"][medIncome==0],test_combined["Actual AQI Binary"][medIncome==0])[0].sum())/(medIncome==0).sum()))
print("The proportion of high income counties classified as having a low risk AQI is " + str((confusion_matrix(test_combined["Actual AQI Binary"][medIncome==1],test_combined["Actual AQI Binary"][medIncome==1])[0].sum())/(medIncome==1).sum()))
The proportion of low income counties classified as having a low risk AQI is 0.196
The proportion of high income counties classified as having a low risk AQI is 0.057803468208092484
This model does not satisfy statistical parity. The proportion of counties predictied to have a low risk AQI is not the same for low and high income areas. The is a 14% difference between the two groups.
By Group Measures: Race
The data set will first be split into two halves. One set of counties is designated (1), where white people make up the majority of its residents. Additionally, counties without a significant white majority are denoted with a 0.
"White Alone (M) %"]+test_combined["White Alone (F) %"])<=.50,"Majority White"]=0
test_combined.loc[(test_combined["White Alone (M) %"]+test_combined["White Alone (F) %"])>.50,"Majority White"]=1
test_combined.loc[(test_combined[
= test_combined["Majority White"]
majority_white
print("The accuracy for counties with a white majority population is " + str((test_combined["Predicted AQI Binary"]==test_combined["Actual AQI Binary"])[majority_white==1].mean().round(4)))
print("The accuracy for counties without a white majority population is " + str((test_combined["Predicted AQI Binary"]==test_combined["Actual AQI Binary"])[majority_white==0].mean().round(4)))
The accuracy for counties with a white majority population is 0.9122
The accuracy for counties without a white majority population is 0.8511
Now we will compare some of the accuracy metrics such as the false positive rate and the false negative rate. Ideally the metrics will be about equal across the different demographic groups
= confusion_matrix(test_combined["Actual AQI Binary"][majority_white==1],test_combined["Predicted AQI Binary"][majority_white==1], normalize = 'true')
conf_matrix_major_white_norm print("FPR for counties with a majority white population = " + str(conf_matrix_major_white_norm[0][1].round(4) ))
print("FNR for counties with a majority white population = " + str(conf_matrix_major_white_norm[1][0].round(4)))
= confusion_matrix(test_combined["Actual AQI Binary"][majority_white==0],test_combined["Predicted AQI Binary"][majority_white==0], normalize = 'true')
conf_matrix_nonmajor_white_norm print("FPR for counties without a majority white population = " + str(conf_matrix_low_norm[0][1].round(4)))
print("FNR for counties without a majority white population = " + str(conf_matrix_low_norm[1][0].round(4)))
FPR for counties with a majority white population = 0.58
FNR for counties with a majority white population = 0.0123
FPR for counties without a majority white population = 0.6122
FNR for counties without a majority white population = 0.0199
Bias measures for race
Calibration means that the fraction of predicted counties to have a AQI binary score of 1 (low risk) is the same across all demographic groups. So we will calculate this metric for both demographics groups. This metric can be calculated by dividing the number of true positives by the total number of predicted positives.
= confusion_matrix(test_combined["Actual AQI Binary"][majority_white==1],test_combined["Predicted AQI Binary"][majority_white==1])[0][0]/(confusion_matrix(test_combined["Actual AQI Binary"][majority_white==1],test_combined["Predicted AQI Binary"][majority_white==1])[0].sum())
majority_calibration = confusion_matrix(test_combined["Actual AQI Binary"][majority_white==0],test_combined["Predicted AQI Binary"][majority_white==0])[0][0]/(confusion_matrix(test_combined["Actual AQI Binary"][majority_white==0],test_combined["Predicted AQI Binary"][majority_white==0])[0].sum())
nonmajority_calibration
print("The percentage of counties without a majority white population predicted have a low risk AQI who actually had a low risk AQI is " + str(nonmajority_calibration.round(4)*100))
print("The percentage of counties with a majority white population predicted have a low risk AQI who actually had a low risk AQI is " + str(majority_calibration.round(4)*100))
The percentage of counties without a majority white population predicted have a low risk AQI who actually had a low risk AQI is 33.33
The percentage of counties with a majority white population predicted have a low risk AQI who actually had a low risk AQI is 42.0
This model is not calibrated as the percentages are very different. There is a 9% difference
A model satisfies error rate balance if the false positive and false negative rates are equal across groups. Looking at the previously calculated FPR anf FNR, I would say that this model does not satisfy error rate balance. The FPR for counties with a majority white population is 58%, while the FPR for counties without a majority white population is 61%.
A model satisifes statistical parity if the proportion of counties classified as having a low risk AQI is the same for each group. So we compare the total number of predicted positives.
print("The proportion of counties with a majority white population classified as having a low risk AQI is " + str((confusion_matrix(test_combined["Actual AQI Binary"][majority_white==1],test_combined["Actual AQI Binary"][majority_white==1])[0].sum())/(majority_white==1).sum()))
print("The proportion of counties without a majority white population classified as having a low risk AQI is " + str((confusion_matrix(test_combined["Actual AQI Binary"][majority_white==0],test_combined["Actual AQI Binary"][majority_white==0])[0].sum())/(majority_white==0).sum()))
The proportion of counties with a majority white population classified as having a low risk AQI is 0.13297872340425532
The proportion of counties without a majority white population classified as having a low risk AQI is 0.19148936170212766
These values are roughly the same, so I would say that this model satisfies statistical parity.