3. Feature Engineering#

import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")
df_cleaned = pd.read_csv('data/df_cleaned.csv')
# member_info = pd.read_csv('data/member_holdtime_df.csv')
member_info = df_cleaned[['memberID','holdtime','puzzlepack']]
print(member_info.info())
print(member_info.dtypes)
member_info.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19387 entries, 0 to 19386
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   memberID    19387 non-null  object 
 1   holdtime    19387 non-null  float64
 2   puzzlepack  19387 non-null  object 
dtypes: float64(1), object(2)
memory usage: 454.5+ KB
None
memberID       object
holdtime      float64
puzzlepack     object
dtype: object
memberID holdtime puzzlepack
0 member1 2.939411 Artifact Puzzles Justin Hillgrove Word Travels...
1 member1 0.998885 DaVici Puzzles Full Moon Feast DaVici Puzzles ...
2 member1 10.865032 DaVici Puzzles Flying Frigate DaVici Puzzles H...
3 member1 22.083971 Liberty Puzzles Haeckel Hummingbirds Nautilus ...
4 member1 5.077603 DaVici Puzzles Diana Zimens City Of Cats

First feature is number of pieces for each difficulty d1, d2, d3, d4#

# taking code from jon's branch
pieces_by_difficulty = []

for i, row in df_cleaned.iterrows():
    out = {
        'name': row['pack_name'],
        'pieces_d1': 0,
        'pieces_d2': 0,
        'pieces_d3': 0,
        'pieces_d4': 0,
        'num_puzzles': row['num_puzzles']
    }

    if not math.isnan(row['piece_count_1']):
        out[f'pieces_d{int(row["difficulty_rating_1"])}'] += row['piece_count_1']

    if (row['num_puzzles'] == 2 and math.isnan(row['piece_count_1']) == False):
        out[f'pieces_d{int(row["difficulty_rating_2"])}'] += row['piece_count_2']

    pieces_by_difficulty.append(out)
puzzle_piece_info = pd.DataFrame(pieces_by_difficulty)
puzzle_piece_info.to_csv('data/pieces_by_puzzle_and_difficulty.csv',index=None)
pieces_d1 pieces_d2 pieces_d3 pieces_d4 num_puzzles puzzlepack
0 456 548 0 0 2 Artifact Puzzles Justin Hillgrove Word Travels...
1 195 0 220 0 2 DaVici Puzzles Full Moon Feast DaVici Puzzles ...
2 496 0 0 0 2 DaVici Puzzles Flying Frigate DaVici Puzzles H...
3 0 707 0 0 2 Liberty Puzzles Haeckel Hummingbirds Nautilus ...
4 0 700 0 0 1 DaVici Puzzles Diana Zimens City Of Cats
hold_times_joined = member_info.merge(puzzle_piece_info, left_on='puzzlepack', right_on='puzzlepack',how='left')
hold_times_joined.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 713169 entries, 0 to 713168
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   memberID     713169 non-null  object 
 1   holdtime     713169 non-null  float64
 2   puzzlepack   713169 non-null  object 
 3   pieces_d1    713169 non-null  int64  
 4   pieces_d2    713169 non-null  int64  
 5   pieces_d3    713169 non-null  int64  
 6   pieces_d4    713169 non-null  int64  
 7   num_puzzles  713169 non-null  int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 49.0+ MB
hold_times_joined[hold_times_joined.num_puzzles.isna()]
hold_times_joined[hold_times_joined[['pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4', 'num_puzzles']].isna().any(axis=1)]
memberID holdtime puzzlepack pieces_d1 pieces_d2 pieces_d3 pieces_d4 num_puzzles
#packs_info = pd.read_csv(r'data/packs_cleaned_dropna.csv', index_col = 0)
packs_info = df_cleaned[['puzzlepack', 'brand_1',	'brand_2',	'piece_count_1', 'piece_count_2',
                        'difficulty_rating_1',	'difficulty_rating_2',	'num_puzzles']]
packs_info.head()
puzzlepack brand_1 brand_2 piece_count_1 piece_count_2 difficulty_rating_1 difficulty_rating_2 num_puzzles
0 Artifact Puzzles Justin Hillgrove Word Travels... Artifact Artifact 456 548 1 2 2
1 DaVici Puzzles Full Moon Feast DaVici Puzzles ... DaVici DaVici 195 220 1 3 2
2 DaVici Puzzles Flying Frigate DaVici Puzzles H... DaVici DaVici 332 164 1 1 2
3 Liberty Puzzles Haeckel Hummingbirds Nautilus ... Liberty Nautilus 485 222 2 2 2
4 DaVici Puzzles Diana Zimens City Of Cats DaVici DaVici 700 0 2 2 1
hold_times_diff = hold_times_joined.merge(packs_info[['puzzlepack', 'difficulty_rating_1', 'difficulty_rating_2']], left_on='puzzlepack', right_on='puzzlepack', how='left')
hold_times_diff.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 36641275 entries, 0 to 36641274
Data columns (total 10 columns):
 #   Column               Dtype  
---  ------               -----  
 0   memberID             object 
 1   holdtime             float64
 2   puzzlepack           object 
 3   pieces_d1            int64  
 4   pieces_d2            int64  
 5   pieces_d3            int64  
 6   pieces_d4            int64  
 7   num_puzzles          int64  
 8   difficulty_rating_1  int64  
 9   difficulty_rating_2  int64  
dtypes: float64(1), int64(7), object(2)
memory usage: 3.0+ GB
hold_times_diff[hold_times_diff.difficulty_rating_1.isna() | hold_times_diff.difficulty_rating_2.isna()]
memberID holdtime puzzlepack pieces_d1 pieces_d2 pieces_d3 pieces_d4 num_puzzles difficulty_rating_1 difficulty_rating_2
#hold_times_diff.to_csv('data/joined_hold_times_data_all.csv')
hold_times_diff[hold_times_diff.isna().any(axis=1)]
memberID holdtime puzzlepack pieces_d1 pieces_d2 pieces_d3 pieces_d4 num_puzzles difficulty_rating_1 difficulty_rating_2

Second Feature is counting total number of pieces at pack level#

# making things at pack level
df_cleaned['piece_count_pack'] = 0
df_cleaned['difficulty_rating_pack'] = 0
for i in range(len(df_cleaned)):
    if df_cleaned['num_puzzles'][i] == 1:
        df_cleaned['piece_count_pack'][i] = df_cleaned['piece_count_1'][i]
        df_cleaned['difficulty_rating_pack'][i] = df_cleaned['difficulty_rating_1'][i]
    else:
        df_cleaned['piece_count_pack'][i] = df_cleaned['piece_count_1'][i] + df_cleaned['piece_count_2'][i]
        df_cleaned['difficulty_rating_pack'][i] = (df_cleaned['difficulty_rating_1'][i] + df_cleaned['difficulty_rating_2'][i])//2

Third feautre is combining difficulty and piece count into a single column values#

# taking code from madalyn's branch
pieces_df = pd.read_csv('data/pieces_by_puzzle_and_difficulty.csv')
pieces_df['w_pieces_diff'] = pieces_df['pieces_d1'] + pieces_df['pieces_d2']*2 + pieces_df['pieces_d3']*3 + pieces_df['pieces_d4']*4

# use box cox method to transform weighted pieces by difficulty
from sklearn.preprocessing import power_transform 
pieces_df['w_pieces_diff_transformed'] = power_transform(pieces_df[['w_pieces_diff']], method='box-cox')
pieces_df
name pieces_d1 pieces_d2 pieces_d3 pieces_d4 num_puzzles w_pieces_diff w_pieces_diff_transformed
0 Artifact Puzzles Justin Hillgrove Word Travels... 456 548 0 0 2 1552 0.476563
1 DaVici Puzzles Full Moon Feast DaVici Puzzles ... 195 0 220 0 2 855 -0.550938
2 DaVici Puzzles Flying Frigate DaVici Puzzles H... 496 0 0 0 2 496 -1.335665
3 Liberty Puzzles Haeckel Hummingbirds Nautilus ... 0 707 0 0 2 1414 0.303165
4 DaVici Puzzles Diana Zimens City Of Cats 0 700 0 0 1 1400 0.284934
... ... ... ... ... ... ... ... ...
19382 Wentworth Puzzles Lars Stewart Rainbow Washi T... 0 0 260 280 2 1900 0.871166
19383 Liberty Puzzles Jean Jans The Younger The Batt... 0 0 602 0 1 1806 0.769838
19384 Wentworth Puzzles Christopher Rogers The Cove ... 500 0 500 0 2 2000 0.975250
19385 Puzzlewood Birds and Flowers JC Ayer Mystery P... 300 0 484 0 2 1752 0.709980
19386 Liberty Puzzles Plants And Insects 0 0 568 0 1 1704 0.655701

19387 rows × 8 columns

# creat list of bin names and label names
# I just picked an arbitrary number of 20 bins, to group the piece difficulty by, this can be improved on
step = (pieces_df['w_pieces_diff_transformed'].max()-pieces_df['w_pieces_diff_transformed'].min())/20
bins = np.arange(pieces_df['w_pieces_diff_transformed'].min(), pieces_df['w_pieces_diff_transformed'].max()+step, step)
labels = [f'bin{s}' for s in map(str, list(range(len(bins)-1)))]

# label each puzzle by binned piece & difficulty
pieces_df['bin_label'] = pd.cut(x = pieces_df['w_pieces_diff_transformed'], bins = bins, labels = labels, include_lowest = True)
pieces_df.head()
name pieces_d1 pieces_d2 pieces_d3 pieces_d4 num_puzzles w_pieces_diff w_pieces_diff_transformed bin_label
0 Artifact Puzzles Justin Hillgrove Word Travels... 456 548 0 0 2 1552 0.476563 bin9
1 DaVici Puzzles Full Moon Feast DaVici Puzzles ... 195 0 220 0 2 855 -0.550938 bin6
2 DaVici Puzzles Flying Frigate DaVici Puzzles H... 496 0 0 0 2 496 -1.335665 bin3
3 Liberty Puzzles Haeckel Hummingbirds Nautilus ... 0 707 0 0 2 1414 0.303165 bin9
4 DaVici Puzzles Diana Zimens City Of Cats 0 700 0 0 1 1400 0.284934 bin8
df_cleaned.shape, pieces_df.shape
((19387, 13), (19387, 9))
df_cleaned.columns
Index(['memberID', 'holdtime', 'puzzlepack', 'pack_name', 'piece_count_1',
       'piece_count_2', 'difficulty_rating_1', 'difficulty_rating_2',
       'brand_1', 'brand_2', 'num_puzzles', 'piece_count_pack',
       'difficulty_rating_pack'],
      dtype='object')
concat_df1 = df_cleaned[['pack_name', 'holdtime', 'piece_count_1', 'piece_count_2', 'difficulty_rating_1', 
                         'difficulty_rating_2',  'brand_1','piece_count_pack', 'difficulty_rating_pack',
                         'brand_2', 'num_puzzles']]
concat_df2 = pieces_df[['pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4', 'w_pieces_diff_transformed']]
df_features_combined = pd.concat([concat_df1, concat_df2], axis=1)
# setting order and saving file
df_features_combined = df_features_combined[['pack_name', 'piece_count_1', 'piece_count_2', 'difficulty_rating_1', 
                                             'difficulty_rating_2', 'brand_1', 'brand_2', 'num_puzzles', 
                                             'pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4','piece_count_pack',
                                             'difficulty_rating_pack', 'w_pieces_diff_transformed', 'holdtime']]
df_features_combined
pack_name piece_count_1 piece_count_2 difficulty_rating_1 difficulty_rating_2 brand_1 brand_2 num_puzzles pieces_d1 pieces_d2 pieces_d3 pieces_d4 piece_count_pack difficulty_rating_pack w_pieces_diff_transformed holdtime
0 Artifact Puzzles Justin Hillgrove Word Travels... 456 548 1 2 Artifact Artifact 2 456 548 0 0 1004 1 0.476563 2.939411
1 DaVici Puzzles Full Moon Feast DaVici Puzzles ... 195 220 1 3 DaVici DaVici 2 195 0 220 0 415 2 -0.550938 0.998885
2 DaVici Puzzles Flying Frigate DaVici Puzzles H... 332 164 1 1 DaVici DaVici 2 496 0 0 0 496 1 -1.335665 10.865032
3 Liberty Puzzles Haeckel Hummingbirds Nautilus ... 485 222 2 2 Liberty Nautilus 2 0 707 0 0 707 2 0.303165 22.083971
4 DaVici Puzzles Diana Zimens City Of Cats 700 0 2 2 DaVici DaVici 1 0 700 0 0 700 2 0.284934 5.077603
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19382 Wentworth Puzzles Lars Stewart Rainbow Washi T... 280 260 4 3 Wentworth Wentworth 2 0 0 260 280 540 3 0.871166 14.738673
19383 Liberty Puzzles Jean Jans The Younger The Batt... 602 0 3 2 Liberty Liberty 1 0 0 602 0 602 3 0.769838 32.657524
19384 Wentworth Puzzles Christopher Rogers The Cove ... 500 500 1 3 Wentworth Wentworth 2 500 0 500 0 1000 2 0.975250 12.775870
19385 Puzzlewood Birds and Flowers JC Ayer Mystery P... 484 300 3 1 Other-Hand-cut Other-Hand-cut 2 300 0 484 0 784 2 0.709980 6.884502
19386 Liberty Puzzles Plants And Insects 568 0 3 2 Liberty Liberty 1 0 0 568 0 568 3 0.655701 13.694302

19387 rows × 16 columns

df_features_combined.to_csv('data/df_features_combined.csv', index=False)