5. Final Models per Person#

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, RobustScaler
import pickle
import os
import glob
import seaborn as sns
data = pd.read_csv('data/df_cleaned.csv', index_col=0)
data.head()
#data = data[data.notna().all(axis=1)]
| memberID | holdtime | puzzlepack | pack_name | piece_count_1 | piece_count_2 | difficulty_rating_1 | difficulty_rating_2 | brand_1 | brand_2 | num_puzzles |
|---|---|---|---|---|---|---|---|---|---|---|
| member1 | 2.939411 | Artifact Puzzles Justin Hillgrove Word Travels... | Artifact Puzzles Justin Hillgrove Word Travels... | 456 | 548 | 1 | 2 | Artifact | Artifact | 2 |
| member1 | 0.998885 | DaVici Puzzles Full Moon Feast DaVici Puzzles ... | DaVici Puzzles Full Moon Feast DaVici Puzzles ... | 195 | 220 | 1 | 3 | DaVici | DaVici | 2 |
| member1 | 10.865032 | DaVici Puzzles Flying Frigate DaVici Puzzles H... | DaVici Puzzles Flying Frigate DaVici Puzzles H... | 332 | 164 | 1 | 1 | DaVici | DaVici | 2 |
| member1 | 22.083971 | Liberty Puzzles Haeckel Hummingbirds Nautilus ... | Liberty Puzzles Haeckel Hummingbirds Nautilus ... | 485 | 222 | 2 | 2 | Liberty | Nautilus | 2 |
| member1 | 5.077603 | DaVici Puzzles Diana Zimens City Of Cats | DaVici Puzzles Diana Zimens City Of Cats | 700 | 0 | 2 | 2 | DaVici | DaVici | 1 |

5.1 Linear Regression per Member#

def member_train_test_split(df, test_size=0.25):
    """
    Creates train and test sets per member based on test_size.
    Assumes rows are in time order, so does not shuffle.
    """
    train_size = 1 - test_size
    g = df.groupby('memberID')
    # Transform a single column: calling .transform('size') on the whole frame
    # raises a TypeError on newer pandas because non-numeric columns are included
    group_sizes = g['holdtime'].transform('size')
    train_flags = (g.cumcount() + 1) <= group_sizes * train_size
    test_flags = ~train_flags
    
    # Split X and y (holdtime)
    X_train = df[train_flags].drop('holdtime', axis=1)
    X_test = df[test_flags].drop('holdtime', axis=1)
    y_train = df[train_flags].holdtime
    y_test = df[test_flags].holdtime
    
    return X_train, X_test, y_train, y_test
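As a quick illustration of the ordered split, here is a tiny hypothetical frame (names and values made up for the example):

# Hypothetical mini-frame: 4 rows for memberA, 2 for memberB, in time order
demo = pd.DataFrame({'memberID': ['memberA'] * 4 + ['memberB'] * 2,
                     'holdtime': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}).set_index('memberID')
Xtr, Xte, ytr, yte = member_train_test_split(demo, test_size=0.25)
# With test_size=0.25, memberA keeps its first 3 rows for training and its
# last row for test; memberB keeps its first row for training (1 <= 2 * 0.75)
print(ytr.index.value_counts(), yte.index.value_counts())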
# Split train/test
X_train, X_test, y_train, y_test = member_train_test_split(data)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
"""
Replaced with order based split to account for time
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(data.drop('hold_time', axis=1), data['hold_time'], random_state = 123)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
"""
"\nReplaced with order based split to account for time\n# Split train/test\nX_train, X_test, y_train, y_test = train_test_split(data.drop('hold_time', axis=1), data['hold_time'], random_state = 123)\nprint(X_train.shape, y_train.shape, X_test.shape, y_test.shape)\n"
# Drop the difficulty columns; difficulty is already encoded by splitting the piece counts up by difficulty
X_train = X_train.drop(['diff_0', 'diff_1'], axis=1)
X_test = X_test.drop(['diff_0', 'diff_1'], axis=1)
print(len(X_train))
print(len(X_train[X_train.isna().any(axis=1)]))
print(len(X_train[X_train.notna().all(axis=1)]))
13132
1094
12038
print(len(X_test))
print(len(X_test[X_test.isna().any(axis=1)]))
print(len(X_test[X_test.notna().all(axis=1)]))
4703
298
4405
# Handle puzzles with no info; this is quite common.
# Drop them from training so as not to introduce non-existent signal
y_train = y_train[X_train.notna().all(axis=1)]
X_train = X_train[X_train.notna().all(axis=1)]

# Can't drop from test as we still need to predict something for every test row
# Fill with the training-set mean for now (median is another option to try)
X_test_not_missing = X_test.notna().all(axis=1)
X_test['pieces_d1'] = X_test.pieces_d1.fillna(X_train.pieces_d1.mean())
X_test['pieces_d2'] = X_test.pieces_d2.fillna(X_train.pieces_d2.mean())
X_test['pieces_d3'] = X_test.pieces_d3.fillna(X_train.pieces_d3.mean())
X_test['pieces_d4'] = X_test.pieces_d4.fillna(X_train.pieces_d4.mean())
X_test.num_puzzles = X_test.num_puzzles.fillna(2)  # filled manually since basically every pack has 2 puzzles
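For reference, the same fill can be done with scikit-learn's SimpleImputer fit on the training rows — a sketch of the median variant floated above (`piece_cols` is shorthand introduced here; this is an alternative, not what the rest of the notebook uses):

from sklearn.impute import SimpleImputer

piece_cols = ['pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4']

# Learn fill values from training data only, then apply them to the test set
imputer = SimpleImputer(strategy='median')  # strategy='mean' would match the fills above
imputer.fit(X_train[piece_cols])
X_test[piece_cols] = imputer.transform(X_test[piece_cols])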
# Calculate global distribution info for each puzzle
full_training_set = X_train.copy()
full_training_set['hold_time'] = y_train.copy()
hold_summary_by_pack = full_training_set.groupby(by=['pack_name'])['hold_time'].describe()

# Some packs only have 1 data point so std dev is NaN, fill with avg std dev from the entire set
hold_summary_by_pack['std'] = hold_summary_by_pack['std'].fillna(hold_summary_by_pack['std'].mean())

# There are likely going to be instances in the test set where we don't have data for a pack, use the global averages for now
# TODO come up with a more sophisticated way to handle packs we don't have data for
X_test[X_test.isna().any(axis=1)]
(empty — no rows; columns: member, pack_name, pieces_d1, pieces_d2, pieces_d3, pieces_d4, num_puzzles)
# Join the training data with the per-pack info: count, mean, and std for now
# TODO try out other variations on pack hold-time distribution information
X_train = pd.merge(X_train, hold_summary_by_pack[['count','std', 'mean']], left_on='pack_name', right_index=True, how='left')
X_train['pack_hold_time_std'] = X_train['std']
X_train['pack_hold_time_mean'] = X_train['mean']
X_train['pack_hold_time_count'] = X_train['count']
X_train = X_train.drop(['std', 'mean', 'count'], axis=1)
X_train[X_train.isna().any(axis=1)]
(empty — no rows; columns: member, pack_name, pieces_d1, pieces_d2, pieces_d3, pieces_d4, num_puzzles, pack_hold_time_std, pack_hold_time_mean, pack_hold_time_count)
# Add pack hold_time count, mean, and std dev from the training set to the test set data
X_test = pd.merge(X_test, hold_summary_by_pack[['count', 'std', 'mean']], left_on='pack_name', right_index=True, how='left')
X_test['pack_hold_time_std'] = X_test['std']
X_test['pack_hold_time_mean'] = X_test['mean']
X_test['pack_hold_time_count'] = X_test['count']
X_test = X_test.drop(['mean', 'std', 'count'], axis=1)

# For packs in the test set without pack hold-time data, fill with the means from the hold_time summary created using only training data
X_test['pack_hold_time_mean'] = X_test['pack_hold_time_mean'].fillna(hold_summary_by_pack['mean'].mean())
X_test['pack_hold_time_std'] = X_test['pack_hold_time_std'].fillna(hold_summary_by_pack['std'].mean())
X_test['pack_hold_time_count'] = X_test['pack_hold_time_count'].fillna(hold_summary_by_pack['count'].mean())
# Add similar hold-time distribution info per member; if a member has no hold-time data, use the global average
hold_summary_by_member = full_training_set.groupby(by='member')['hold_time'].describe()
X_train = pd.merge(X_train, hold_summary_by_member[['count', 'mean', 'std']].rename(columns={'count': 'member_hold_time_count', 'mean': 'member_hold_time_mean', 'std': 'member_hold_time_std'}), left_on='member', right_index=True, how='left')
X_test = pd.merge(X_test, hold_summary_by_member[['count', 'mean', 'std']].rename(columns={'count': 'member_hold_time_count', 'mean': 'member_hold_time_mean', 'std': 'member_hold_time_std'}), left_on='member', right_index=True, how='left')

# For missing hold-time summary data, impute with the global averages from the training set
# Filling train
X_train['member_hold_time_mean'] = X_train['member_hold_time_mean'].fillna(hold_summary_by_member['mean'].mean())
X_train['member_hold_time_std'] = X_train['member_hold_time_std'].fillna(hold_summary_by_member['std'].mean())
X_train['member_hold_time_count'] = X_train['member_hold_time_count'].fillna(hold_summary_by_member['count'].mean())
# Filling test
X_test['member_hold_time_mean'] = X_test['member_hold_time_mean'].fillna(hold_summary_by_member['mean'].mean())
X_test['member_hold_time_std'] = X_test['member_hold_time_std'].fillna(hold_summary_by_member['std'].mean())
X_test['member_hold_time_count'] = X_test['member_hold_time_count'].fillna(hold_summary_by_member['count'].mean())
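The pack and member joins above repeat the same merge/rename/fillna pattern; a small helper could express both (a sketch — `add_group_stats` is a name introduced here and is not used elsewhere in the notebook):

def add_group_stats(df, summary, on, prefix):
    """Left-join count/mean/std from a describe() summary onto df, prefixing the
    column names, and fill missing groups with the summary's global averages."""
    cols = {'count': f'{prefix}_count', 'mean': f'{prefix}_mean', 'std': f'{prefix}_std'}
    out = pd.merge(df, summary[['count', 'mean', 'std']].rename(columns=cols),
                   left_on=on, right_index=True, how='left')
    for src, dst in cols.items():
        out[dst] = out[dst].fillna(summary[src].mean())
    return out

# e.g. X_test = add_group_stats(X_test, hold_summary_by_member, 'member', 'member_hold_time')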
hold_summary_by_member.hist('mean', bins=100)
hold_summary_by_pack.hist('mean', bins=100)
array([[<AxesSubplot: title={'center': 'mean'}>]], dtype=object)
_images/Notebook5-perMemberLinReg_17_1.png _images/Notebook5-perMemberLinReg_17_2.png
# plotting correlation heatmap
X_train_numeric = pd.merge(X_train, full_training_set.groupby(by='member')['hold_time'].skew(), left_on='member', right_index=True, how='left')
X_train_numeric = X_train_numeric.drop(['member'], axis=1)
X_train_numeric = X_train_numeric.rename({'hold_time': 'member_skew'}, axis=1)
X_train_numeric['skew_std'] = X_train_numeric['member_skew'] * X_train_numeric.member_hold_time_std
X_train_numeric['pack_v_member'] = X_train_numeric.pack_hold_time_mean - X_train_numeric.member_hold_time_mean
X_train_numeric['member_dev'] = (y_train - X_train_numeric['member_hold_time_mean'])
X_train_numeric['puzzle_dev'] = (y_train - X_train_numeric['pack_hold_time_mean'])
X_train_numeric['hold_time'] = y_train
plt.figure(figsize=(15,15))
# numeric_only=True skips the non-numeric pack_name column and silences the pandas FutureWarning
dataplot = sns.heatmap(X_train_numeric.corr(numeric_only=True), cmap="YlGnBu", annot=True)

# displaying heatmap
plt.show()
_images/Notebook5-perMemberLinReg_18_1.png
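To read the heatmap numerically, the correlations with the target can also be ranked directly (a small sketch using the same X_train_numeric frame):

# Rank features by absolute correlation with hold_time
corr_with_target = (X_train_numeric.corr(numeric_only=True)['hold_time']
                    .drop('hold_time').abs().sort_values(ascending=False))
print(corr_with_target)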
X_train_numeric.member_dev.hist(bins=100)
<AxesSubplot: >
_images/Notebook5-perMemberLinReg_19_1.png
X_train_numeric.plot(x='member_skew', y='member_dev', kind='scatter')
<AxesSubplot: xlabel='member_skew', ylabel='member_dev'>
_images/Notebook5-perMemberLinReg_20_1.png
# Dropping some feature groups to see how they affect error
#X_train = X_train.drop(['member_hold_time_mean', 'member_hold_time_std', 'member_hold_time_count', 'pack_hold_time_count'], axis=1)
#X_test = X_test.drop(['member_hold_time_mean', 'member_hold_time_std', 'member_hold_time_count', 'pack_hold_time_count'], axis=1)

X_train = X_train.drop(['pack_hold_time_mean', 'pack_hold_time_std', 'pack_hold_time_count', 'num_puzzles'], axis=1)
X_test = X_test.drop(['pack_hold_time_mean', 'pack_hold_time_std', 'pack_hold_time_count', 'num_puzzles'], axis=1)
record_counts = X_train.member.value_counts()
record_counts
member557    132
member474    106
member40      87
member414     85
member292     85
            ... 
member13       1
member642      1
member645      1
member649      1
member520      1
Name: member, Length: 619, dtype: int64
MAX_STD = 5000
MIN_HISTORY = 75 # Minimum number of data points for a member to get their own model
# MAE from earlier tuning runs (per-member models / default model) at each MIN_HISTORY:
# 3.6 / 13.2 @ 100
# 3.5 / 12.9 @ 75
# 5.5 / 14.2 @ 50
# 8.5 / 17.2 @ 25
# 11.6 / 22.3 @ 10
# Fit Scaler on all training data
# TODO Try out other scalers (see the RobustScaler sketch below)
scaler = StandardScaler()
scaler = scaler.fit(X_train.drop(['member', 'pack_name'], axis=1))
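For the TODO above: RobustScaler is already imported, and swapping it in is a one-line change (an untested sketch; hold times are long-tailed, so median/IQR scaling may behave better than mean/std):

# Median/IQR scaling is less sensitive to outliers than StandardScaler's mean/std
robust_scaler = RobustScaler().fit(X_train.drop(['member', 'pack_name'], axis=1))
# ...then pass robust_scaler wherever `scaler` is used below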
def train_linear_regression(X, y, scaler):
    '''
    Trains linear regression model using the given X, y, and scaler
    
    Returns the trained model
    '''
    
    lr = linear_model.LinearRegression()
    
    # Scale X
    X_s = scaler.transform(X)
    #X_s = X.to_numpy()
    # Fit linear regression model
    lr.fit(X_s, y)
    
    return lr
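Equivalently, the scaler and regressor can be bundled into a scikit-learn Pipeline so they always travel together (a sketch; note it fits the scaler per call, whereas the notebook deliberately shares one scaler fit on all training data):

from sklearn.pipeline import make_pipeline

def train_linear_regression_pipeline(X, y):
    '''Scaler + regression as one estimator; predict() then takes unscaled X.'''
    pipe = make_pipeline(StandardScaler(), linear_model.LinearRegression())
    pipe.fit(X, y)
    return pipe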
# For each member with at least MIN_HISTORY data points, train a linear regression model for them and save it
os.makedirs('user_models', exist_ok=True)  # make sure the model directory exists

# Filter to members with at least MIN_HISTORY points
members = record_counts[(record_counts >= MIN_HISTORY)]
for member, c in members.items():
    # Require a max std dev in hold time; only learn for members whose behavior seems stable
    member_stats = hold_summary_by_member.loc[member]
    if member_stats['std'] > MAX_STD:
        continue
        
    X_train_m = X_train[X_train['member'] == member]
    y_train_m = y_train[X_train['member'] == member]
    
    m = train_linear_regression(X_train_m.drop(['member', 'pack_name'], axis=1), y_train_m, scaler)
    
    # Save model for later use
    with open(f'user_models/{member}.pkl', 'wb') as f:
        pickle.dump(m, f)
# Create an overall model on all data points for use on users without MIN_HISTORY data points
overall_m = train_linear_regression(X_train.drop(['member', 'pack_name'], axis=1), y_train, scaler)
with open(f'user_models/universal.pkl', 'wb') as f:
    pickle.dump(overall_m, f)
# Create a model just on users with fewer than MIN_HISTORY data points as a "new puzzler" model
newps = record_counts[record_counts < MIN_HISTORY].index
X_train_newps = X_train[X_train['member'].isin(newps)]
y_train_newps = y_train[X_train['member'].isin(newps)]
newps_m = train_linear_regression(X_train_newps.drop(['member', 'pack_name'], axis=1), y_train_newps, scaler)
with open(f'user_models/newps.pkl', 'wb') as f:
    pickle.dump(newps_m, f)
def make_pred(X, member, default):
    '''
    args:
        - X - scaled input data
        - member - member ID string
        - default - default model to fall back on
        
    Looks up the member's trained model and uses it if it exists;
    otherwise uses the default model.
    
    Returns (predicted hold time, name of the model used)
    '''
    
    # Check if trained model for member exists
    path = f'user_models/{member}.pkl'

    if os.path.exists(path):
        m = pickle.load(open(path, 'rb'))
        
        return (m.predict(X), member)
        
    else:
        # Use default model
        return (default.predict(X), "default")
# Go through the test set and use either the per-user model or the universal model
univ_model = pickle.load(open('user_models/universal.pkl', 'rb'))
newps_model = pickle.load(open('user_models/newps.pkl', 'rb'))
print(X_train.drop(['member', 'pack_name'], axis=1).columns)
print(f"Universal Coef {univ_model.coef_}")
print(f"Newps Coef {newps_model.coef_}")
# Scale test data using scaler fit on the training data

X_test_members = X_test['member']
X_test_packs = X_test['pack_name']
X_test_scaled = scaler.transform(X_test.drop(['member', 'pack_name'], axis=1))
#X_test_scaled = X_test.drop(['member', 'pack_name'], axis=1).to_numpy()
Index(['pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4',
       'member_hold_time_count', 'member_hold_time_mean',
       'member_hold_time_std'],
      dtype='object')
Universal Coef [ 1.96268797  2.39112375  2.62206506  1.53420493  0.1002531  10.32316364
  0.17189966]
Newps Coef [ 2.03889872  2.4608197   2.69484494  1.56316826  0.13466524 10.33005481
  0.1730354 ]
y_pred_universal, y_pred_universal_models = zip(*[make_pred(x.reshape(1, -1), m, univ_model) for x, m in zip(X_test_scaled, X_test_members)])
y_pred_universal = np.array(list(y_pred_universal))
y_pred_universal_models = np.array(list(y_pred_universal_models))
mse_univ = mean_squared_error(y_test, y_pred_universal)
mae_univ = mean_absolute_error(y_test, y_pred_universal)
print(f'Using universal default combined mse: {mse_univ}, mae: {mae_univ}')

mse_univ_usermodel = mean_squared_error(y_test[y_pred_universal_models != "default"], y_pred_universal[y_pred_universal_models != "default"])
mae_univ_usermodel = mean_absolute_error(y_test[y_pred_universal_models != "default"], y_pred_universal[y_pred_universal_models != "default"])
print(f'Using universal, user mse: {mse_univ_usermodel}, mae: {mae_univ_usermodel}')

mse_univ_default = mean_squared_error(y_test[y_pred_universal_models == "default"], y_pred_universal[y_pred_universal_models == "default"])
mae_univ_default = mean_absolute_error(y_test[y_pred_universal_models == "default"], y_pred_universal[y_pred_universal_models == "default"])
print(f'Using universal, default mse: {mse_univ_default}, mae: {mae_univ_default}')
Using universal default combined mse: 522.1756476380621, mae: 13.0212957291373
Using universal, user mse: 62.98943021215402, mae: 4.625958495474126
Using universal, default mse: 560.4676104826092, mae: 13.721390656247658
y_pred_newps, y_pred_newps_models = zip(*[make_pred(x.reshape(1, -1), m, newps_model) for x, m in zip(X_test_scaled, X_test_members)])
y_pred_newps = np.array(list(y_pred_newps))
y_pred_newps_models = np.array(list(y_pred_newps_models))
mse_newps = mean_squared_error(y_test, y_pred_newps)
mae_newps = mean_absolute_error(y_test, y_pred_newps)
print(f'Using newps combined mse: {mse_newps}, mae: {mae_newps}')

mse_newps_usermodel = mean_squared_error(y_test[y_pred_newps_models != "default"], y_pred_newps[y_pred_newps_models != "default"])
mae_newps_usermodel = mean_absolute_error(y_test[y_pred_newps_models != "default"], y_pred_newps[y_pred_newps_models != "default"])
print(f'Using newps, user mse: {mse_newps_usermodel}, mae: {mae_newps_usermodel}')

mse_newps_default = mean_squared_error(y_test[y_pred_newps_models == "default"], y_pred_newps[y_pred_newps_models == "default"])
mae_newps_default = mean_absolute_error(y_test[y_pred_newps_models == "default"], y_pred_newps[y_pred_newps_models == "default"])
print(f'Using newps, default mse: {mse_newps_default}, mae: {mae_newps_default}')
Using newps combined mse: 522.0700482259101, mae: 13.021581398932065
Using newps, user mse: 62.98943021215402, mae: 4.625958495474126
Using newps, default mse: 560.3532050379303, mae: 13.721700148310497
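The two evaluation blocks above are identical apart from the prediction arrays; a helper keeps them in sync (a sketch — `report_mae_mse` is a name introduced here):

def report_mae_mse(label, y_true, y_pred, models_used):
    '''Print combined, per-user-model, and default-model MSE/MAE.'''
    masks = {'combined': np.ones(len(models_used), dtype=bool),
             'user': models_used != 'default',
             'default': models_used == 'default'}
    for name, mask in masks.items():
        print(f'{label}, {name} mse: {mean_squared_error(y_true[mask], y_pred[mask])}, '
              f'mae: {mean_absolute_error(y_true[mask], y_pred[mask])}')

# e.g. report_mae_mse('Using newps', y_test, y_pred_newps, y_pred_newps_models)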
set(y_pred_universal_models[y_pred_universal_models != "default"])
{'member117',
 'member279',
 'member292',
 'member324',
 'member363',
 'member40',
 'member402',
 'member414',
 'member474',
 'member557',
 'member608'}
print(len(y_test))
print(len(y_test[y_pred_newps_models != "default"]))
print(len(y_test[(y_pred_newps_models != "default") & (X_test_not_missing)]))
mean_absolute_error(y_test[(y_pred_newps_models != "default") & (X_test_not_missing)], y_pred_newps[(y_pred_newps_models != "default") & (X_test_not_missing)])
4703
362
340
4.59828444364085
# Clean up the saved per-member models
files = glob.glob('user_models/*')
for f in files:
    os.remove(f)
approaches = ("Universal Deafult", "New User Specific")
model_mae = {
    "Combined": (mae_univ, mae_newps),
    "New Users": (mae_univ_default, mae_newps_default),
    "Established Users": (mae_univ_usermodel, mae_newps_usermodel)
}

x = np.arange(len(approaches))  # the label locations
width = 0.25  # the width of the bars
multiplier = 0

fig, ax = plt.subplots(layout='constrained')

for model, mae in model_mae.items():
    offset = width * multiplier
    rects = ax.bar(x + offset, mae, width, label=model)
    ax.bar_label(rects, padding=3)
    multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('MAE (days)')
ax.set_title('MAE by User Type and Default Model')
ax.set_xticks(x + width, approaches)
ax.legend(loc='upper left', ncols=3)
ax.set_ylim(0, 20)

plt.show()
_images/Notebook5-perMemberLinReg_36_0.png