4. Feature Importances#

# Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import tree
# Loading cleaned data
df_features_combined = pd.read_csv('data/df_features_combined.csv')
df_features_combined.shape
(19387, 16)
# plotting correlation heatmap of the numeric features
plt.figure(figsize=(10, 10))
dataplot = sns.heatmap(df_features_combined.corr(numeric_only=True), cmap="YlGnBu", annot=True)

# displaying heatmap
plt.show()
[Figure: correlation heatmap of the combined features]
# random 75/25 split into train and test sets
train, test = train_test_split(df_features_combined, test_size = 0.25, random_state=1)
x_train = train.drop(['holdtime', 'pack_name'], axis=1)
y_train = train['holdtime']
x_test = test.drop(['holdtime', 'pack_name'], axis = 1)
y_test = test['holdtime']

4.1 Benchmark Model#

# Benchmark: always predict the mean hold time from the training set
train_holdtime_mean = y_train.mean()

pred1 = np.full(len(y_test), train_holdtime_mean)  # constant baseline prediction

mean_absolute_error(y_test, pred1)
12.62234436965736
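
For reference, the same baseline can be expressed with scikit-learn's DummyRegressor; a minimal sketch (not part of the original run) that should reproduce the MAE above:

# equivalent mean baseline via sklearn's DummyRegressor
from sklearn.dummy import DummyRegressor

dummy = DummyRegressor(strategy='mean')  # always predicts the training-set mean
dummy.fit(x_train, y_train)
mean_absolute_error(y_test, dummy.predict(x_test))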

4.2 Linear Regression#

# one-hot encode the categorical columns; x_test may contain categories unseen in training
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)
x_train.shape, x_test.shape
((14540, 32), (4847, 32))
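
Because get_dummies is applied to train and test separately, a category present in only one split would make the column sets diverge. Here the shapes match, so nothing needed fixing, but a hedged one-line guard for the general case (an assumption, not in the original run):

# align test columns to train: drop unseen dummies, add missing ones as 0
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)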
# simple linear regression (note: the `normalize` argument was removed in scikit-learn 1.2;
# scale features beforehand, e.g. with StandardScaler, if normalization is needed)
model_lr = LinearRegression()
model_lr.fit(x_train, y_train)  #fit the model
pred2 = model_lr.predict(x_test) #make prediction on test set
mean_absolute_error(y_test, pred2) #calculate mae
12.422053647064944
coefficients = pd.DataFrame({'columns': x_train.columns, 'coefficients': model_lr.coef_})
coefficients.sort_values(by = 'coefficients').head(15)
columns coefficients
0 piece_count_1 -5.427657e+11
1 piece_count_2 -5.427657e+11
29 brand_2_Stave -4.353840e+00
16 brand_1_Nautilus -2.782904e+00
18 brand_1_Other-Laser-cut -2.748214e+00
27 brand_2_Other-Hand-cut -1.944098e+00
13 brand_1_DaVici -1.835529e+00
4 num_puzzles -1.622190e+00
15 brand_1_Liberty -1.421600e+00
20 brand_1_Wentworth -1.149477e+00
22 brand_2_Artifact -1.019384e+00
21 brand_1_unknown -9.244420e-01
2 difficulty_rating_1 -2.347115e-01
3 difficulty_rating_2 -1.867834e-01
17 brand_1_Other-Hand-cut 4.988677e-02

The coefficients can't be interpreted yet: the enormous, offsetting values for piece_count_1 and piece_count_2 point to multicollinearity, and scikit-learn reports no p-values to judge significance. Building an OLS model with statsmodels to get them.

4.3 OLS Model#

import statsmodels.api as sma

X = x_train
y = y_train
X2 = sma.add_constant(X)
est = sma.OLS(y, X2.astype(float))
est2 = est.fit()
print(est2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               holdtime   R-squared:                       0.025
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     13.29
Date:                Wed, 15 Mar 2023   Prob (F-statistic):           6.21e-61
Time:                        00:14:09   Log-Likelihood:                -63076.
No. Observations:               14540   AIC:                         1.262e+05
Df Residuals:                   14511   BIC:                         1.264e+05
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
=============================================================================================
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                        10.5168      2.925      3.596      0.000       4.784      16.250
piece_count_1                 0.0016      0.001      1.216      0.224      -0.001       0.004
piece_count_2                 0.0034      0.002      2.226      0.026       0.000       0.006
difficulty_rating_1          -0.2139      0.532     -0.402      0.688      -1.257       0.829
difficulty_rating_2          -0.2403      0.454     -0.529      0.597      -1.130       0.650
num_puzzles                  -1.5342      0.847     -1.811      0.070      -3.195       0.126
pieces_d1                     0.0014      0.001      1.044      0.297      -0.001       0.004
pieces_d2                    -0.0005      0.001     -0.756      0.449      -0.002       0.001
pieces_d3                    -0.0011      0.001     -1.180      0.238      -0.003       0.001
pieces_d4                     0.0052      0.002      2.568      0.010       0.001       0.009
piece_count_pack              0.0050      0.002      2.839      0.005       0.002       0.008
difficulty_rating_pack        0.3769      0.646      0.583      0.560      -0.890       1.644
w_pieces_diff_transformed     1.3435      0.974      1.379      0.168      -0.566       3.252
brand_1_Artifact              4.5053      0.958      4.705      0.000       2.628       6.382
brand_1_DaVici               -0.0230      2.007     -0.011      0.991      -3.957       3.911
brand_1_Ecru                  2.3095      0.986      2.342      0.019       0.377       4.242
brand_1_Liberty               0.3715      1.121      0.331      0.740      -1.826       2.570
brand_1_Nautilus             -0.9866      1.546     -0.638      0.523      -4.017       2.044
brand_1_Other-Hand-cut        1.8460      1.233      1.497      0.134      -0.571       4.263
brand_1_Other-Laser-cut      -0.8987      1.351     -0.665      0.506      -3.548       1.750
brand_1_Stave                 1.8465      1.407      1.312      0.189      -0.912       4.605
brand_1_Wentworth             0.6735      1.239      0.543      0.587      -1.756       3.103
brand_1_unknown               0.8729      1.526      0.572      0.567      -2.118       3.864
brand_2_Artifact              0.2771      0.901      0.308      0.758      -1.488       2.042
brand_2_DaVici                1.8875      1.955      0.965      0.334      -1.945       5.720
brand_2_Ecru                  1.9655      0.914      2.151      0.032       0.174       3.757
brand_2_Liberty               1.8286      1.128      1.620      0.105      -0.383       4.041
brand_2_Nautilus              3.6361      1.610      2.258      0.024       0.480       6.792
brand_2_Other-Hand-cut       -0.6691      1.178     -0.568      0.570      -2.979       1.641
brand_2_Other-Laser-cut       1.4615      1.260      1.160      0.246      -1.009       3.932
brand_2_Stave                -3.2194      2.467     -1.305      0.192      -8.056       1.617
brand_2_Wentworth             1.5982      1.318      1.213      0.225      -0.985       4.181
brand_2_unknown               1.7507      1.341      1.306      0.192      -0.878       4.379
==============================================================================
Omnibus:                     9068.435   Durbin-Watson:                   1.999
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            97048.061
Skew:                           2.910   Prob(JB):                         0.00
Kurtosis:                      14.239   Cond. No.                     1.14e+16
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 9.68e-23. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.


4.4 Decision Tree#

def dtree_grid_search(X, y, nfolds):
    # dictionary of hyperparameter values to test
    param_grid = {'max_depth': [3, 5, 7, 9, 11, 15, 17]}
    # decision tree model
    dtree_model = DecisionTreeRegressor()
    # use grid search to test all values
    dtree_gscv = GridSearchCV(dtree_model, param_grid, cv=nfolds)
    # fit model to data
    dtree_gscv.fit(X, y)
    return dtree_gscv.best_params_
dtree_grid_search(X=x_train, y=y_train, nfolds=5)
{'max_depth': 3}
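
GridSearchCV scores regressors with R² by default, while the models here are compared on MAE; a sketch of the same search selecting by MAE instead (not re-run, so the chosen depth could differ):

# same grid search, but selecting the depth that minimizes MAE
dtree_gscv_mae = GridSearchCV(DecisionTreeRegressor(random_state=0),
                              {'max_depth': [3, 5, 7, 9, 11, 15, 17]},
                              cv=5, scoring='neg_mean_absolute_error')
dtree_gscv_mae.fit(x_train, y_train)
dtree_gscv_mae.best_params_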
model_dt = DecisionTreeRegressor(max_depth=3, random_state=0)
model_dt.fit(x_train, y_train)  #fit the model
pred3 = model_dt.predict(x_test) #make prediction on test set
mean_absolute_error(y_test, pred3) #calculate mae
12.447865314968272
feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(x_train.columns, model_dt.feature_importances_):
    feats[feature] = importance #add the name/value pair 

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
# importances.sort_values(by='Gini-importance').plot(kind='bar', rot=45)
importances.sort_values(by = 'Gini-importance', ascending=False).head(15)
Gini-importance
piece_count_pack 0.607699
w_pieces_diff_transformed 0.258662
brand_1_Artifact 0.088650
pieces_d3 0.044989
brand_2_Ecru 0.000000
brand_1_Wentworth 0.000000
brand_1_unknown 0.000000
brand_2_Artifact 0.000000
brand_2_DaVici 0.000000
piece_count_1 0.000000
brand_1_Other-Laser-cut 0.000000
brand_2_Liberty 0.000000
brand_2_Nautilus 0.000000
brand_2_Other-Hand-cut 0.000000
brand_2_Other-Laser-cut 0.000000
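
With max_depth=3 the fitted tree is small enough to read directly, and tree was already imported above; a quick sketch to see which thresholds drive the splits:

# visualize the shallow tree
plt.figure(figsize=(20, 8))
tree.plot_tree(model_dt, feature_names=list(x_train.columns), filled=True, fontsize=8)
plt.show()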

4.5 Random Forest#

def rf_grid_search(X, y, nfolds):
    # hyperparameter values to test
    param_grid = {'max_depth': [3, 5, 7, 9], 'n_estimators': [50, 70, 100, 150, 200]}
    rf_model = RandomForestRegressor()
    rf_gscv = GridSearchCV(rf_model, param_grid, cv=nfolds)
    rf_gscv.fit(X, y)
    return rf_gscv.best_params_
rf_grid_search(X=x_train, y=y_train, nfolds=5)
{'max_depth': 3, 'n_estimators': 100}
# note: the grid search selected n_estimators=100; 200 estimators are used here
model_rf = RandomForestRegressor(max_depth=3, n_estimators=200, random_state=0)
model_rf.fit(x_train, y_train)  #fit the model
pred4 = model_rf.predict(x_test) #make prediction on test set
mean_absolute_error(y_test, pred4) #calculate mae
12.436204414656602
feats_rf = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(x_train.columns, model_rf.feature_importances_):
    feats_rf[feature] = importance #add the name/value pair 

importances_rf = pd.DataFrame.from_dict(feats_rf, orient='index').rename(columns={0: 'feature_importance'})
# importances.sort_values(by='Gini-importance').plot(kind='bar', rot=45)
importances_rf.sort_values(by='feature_importance', ascending=False).head(15)
feature_importance
piece_count_pack 0.575958
w_pieces_diff_transformed 0.213131
brand_1_Artifact 0.046877
piece_count_1 0.043080
pieces_d4 0.033324
piece_count_2 0.016813
pieces_d3 0.015915
pieces_d1 0.015663
pieces_d2 0.011825
difficulty_rating_1 0.007307
brand_2_Ecru 0.004491
difficulty_rating_pack 0.003580
brand_2_Artifact 0.002352
brand_1_Stave 0.001416
brand_2_Other-Laser-cut 0.001191

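Impurity-based importances can favor high-cardinality or correlated features, so a common cross-check is permutation importance on the held-out set; a hedged sketch using sklearn's permutation_importance:

# average increase in MAE when each feature is shuffled on the test set
from sklearn.inspection import permutation_importance

perm = permutation_importance(model_rf, x_test, y_test,
                              scoring='neg_mean_absolute_error',
                              n_repeats=10, random_state=0)
pd.Series(perm.importances_mean, index=x_test.columns).sort_values(ascending=False).head(10)
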
4.6 Merging All Feature Importances#

#