Contents

3. Feature Engineering

Contents

3. Feature Engineering#

import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

df_cleaned = pd.read_csv('data/df_cleaned.csv')
# member_info = pd.read_csv('data/member_holdtime_df.csv')
member_info = df_cleaned[['memberID','holdtime','puzzlepack']]
print(member_info.info())
print(member_info.dtypes)
member_info.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19387 entries, 0 to 19386
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   memberID    19387 non-null  object 
 1   holdtime    19387 non-null  float64
 2   puzzlepack  19387 non-null  object 
dtypes: float64(1), object(2)
memory usage: 454.5+ KB
None
memberID       object
holdtime      float64
puzzlepack     object
dtype: object

	memberID	holdtime	puzzlepack
0	member1	2.939411	Artifact Puzzles Justin Hillgrove Word Travels...
1	member1	0.998885	DaVici Puzzles Full Moon Feast DaVici Puzzles ...
2	member1	10.865032	DaVici Puzzles Flying Frigate DaVici Puzzles H...
3	member1	22.083971	Liberty Puzzles Haeckel Hummingbirds Nautilus ...
4	member1	5.077603	DaVici Puzzles Diana Zimens City Of Cats

First feature is number of pieces for each difficulty d1, d2, d3, d4#

# taking code from jon's branch
pieces_by_difficulty = []

for i, row in df_cleaned.iterrows():
    out = {
        'name': row['pack_name'],
        'pieces_d1': 0,
        'pieces_d2': 0,
        'pieces_d3': 0,
        'pieces_d4': 0,
        'num_puzzles': row['num_puzzles']
    }

    if not math.isnan(row['piece_count_1']):
        out[f'pieces_d{int(row["difficulty_rating_1"])}'] += row['piece_count_1']

    if (row['num_puzzles'] == 2 and math.isnan(row['piece_count_1']) == False):
        out[f'pieces_d{int(row["difficulty_rating_2"])}'] += row['piece_count_2']

    pieces_by_difficulty.append(out)

puzzle_piece_info = pd.DataFrame(pieces_by_difficulty)
puzzle_piece_info.to_csv('data/pieces_by_puzzle_and_difficulty.csv',index=None)

	pieces_d1	pieces_d2	pieces_d3	pieces_d4	num_puzzles	puzzlepack
0	456	548	0	0	2	Artifact Puzzles Justin Hillgrove Word Travels...
1	195	0	220	0	2	DaVici Puzzles Full Moon Feast DaVici Puzzles ...
2	496	0	0	0	2	DaVici Puzzles Flying Frigate DaVici Puzzles H...
3	0	707	0	0	2	Liberty Puzzles Haeckel Hummingbirds Nautilus ...
4	0	700	0	0	1	DaVici Puzzles Diana Zimens City Of Cats

hold_times_joined = member_info.merge(puzzle_piece_info, left_on='puzzlepack', right_on='puzzlepack',how='left')
hold_times_joined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 713169 entries, 0 to 713168
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   memberID     713169 non-null  object 
 1   holdtime     713169 non-null  float64
 2   puzzlepack   713169 non-null  object 
 3   pieces_d1    713169 non-null  int64  
 4   pieces_d2    713169 non-null  int64  
 5   pieces_d3    713169 non-null  int64  
 6   pieces_d4    713169 non-null  int64  
 7   num_puzzles  713169 non-null  int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 49.0+ MB

hold_times_joined[hold_times_joined.num_puzzles.isna()]
hold_times_joined[hold_times_joined[['pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4', 'num_puzzles']].isna().any(axis=1)]

	memberID	holdtime	puzzlepack	pieces_d1	pieces_d2	pieces_d3	pieces_d4	num_puzzles

#packs_info = pd.read_csv(r'data/packs_cleaned_dropna.csv', index_col = 0)
packs_info = df_cleaned[['puzzlepack', 'brand_1',	'brand_2',	'piece_count_1', 'piece_count_2',
                        'difficulty_rating_1',	'difficulty_rating_2',	'num_puzzles']]
packs_info.head()

	puzzlepack	brand_1	brand_2	piece_count_1	piece_count_2	difficulty_rating_1	difficulty_rating_2	num_puzzles
0	Artifact Puzzles Justin Hillgrove Word Travels...	Artifact	Artifact	456	548	1	2	2
1	DaVici Puzzles Full Moon Feast DaVici Puzzles ...	DaVici	DaVici	195	220	1	3	2
2	DaVici Puzzles Flying Frigate DaVici Puzzles H...	DaVici	DaVici	332	164	1	1	2
3	Liberty Puzzles Haeckel Hummingbirds Nautilus ...	Liberty	Nautilus	485	222	2	2	2
4	DaVici Puzzles Diana Zimens City Of Cats	DaVici	DaVici	700	0	2	2	1

hold_times_diff = hold_times_joined.merge(packs_info[['puzzlepack', 'difficulty_rating_1', 'difficulty_rating_2']], left_on='puzzlepack', right_on='puzzlepack', how='left')
hold_times_diff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36641275 entries, 0 to 36641274
Data columns (total 10 columns):
 #   Column               Dtype  
---  ------               -----  
 0   memberID             object 
 1   holdtime             float64
 2   puzzlepack           object 
 3   pieces_d1            int64  
 4   pieces_d2            int64  
 5   pieces_d3            int64  
 6   pieces_d4            int64  
 7   num_puzzles          int64  
 8   difficulty_rating_1  int64  
 9   difficulty_rating_2  int64  
dtypes: float64(1), int64(7), object(2)
memory usage: 3.0+ GB

hold_times_diff[hold_times_diff.difficulty_rating_1.isna() | hold_times_diff.difficulty_rating_2.isna()]

	memberID	holdtime	puzzlepack	pieces_d1	pieces_d2	pieces_d3	pieces_d4	num_puzzles	difficulty_rating_1	difficulty_rating_2

#hold_times_diff.to_csv('data/joined_hold_times_data_all.csv')

hold_times_diff[hold_times_diff.isna().any(axis=1)]

	memberID	holdtime	puzzlepack	pieces_d1	pieces_d2	pieces_d3	pieces_d4	num_puzzles	difficulty_rating_1	difficulty_rating_2

Second Feature is counting total number of pieces at pack level#

# making things at pack level
df_cleaned['piece_count_pack'] = 0
df_cleaned['difficulty_rating_pack'] = 0
for i in range(len(df_cleaned)):
    if df_cleaned['num_puzzles'][i] == 1:
        df_cleaned['piece_count_pack'][i] = df_cleaned['piece_count_1'][i]
        df_cleaned['difficulty_rating_pack'][i] = df_cleaned['difficulty_rating_1'][i]
    else:
        df_cleaned['piece_count_pack'][i] = df_cleaned['piece_count_1'][i] + df_cleaned['piece_count_2'][i]
        df_cleaned['difficulty_rating_pack'][i] = (df_cleaned['difficulty_rating_1'][i] + df_cleaned['difficulty_rating_2'][i])//2

Third feautre is combining difficulty and piece count into a single column values#

# taking code from madalyn's branch
pieces_df = pd.read_csv('data/pieces_by_puzzle_and_difficulty.csv')
pieces_df['w_pieces_diff'] = pieces_df['pieces_d1'] + pieces_df['pieces_d2']*2 + pieces_df['pieces_d3']*3 + pieces_df['pieces_d4']*4

# use box cox method to transform weighted pieces by difficulty
from sklearn.preprocessing import power_transform 
pieces_df['w_pieces_diff_transformed'] = power_transform(pieces_df[['w_pieces_diff']], method='box-cox')

pieces_df

	name	pieces_d1	pieces_d2	pieces_d3	pieces_d4	num_puzzles	w_pieces_diff	w_pieces_diff_transformed
0	Artifact Puzzles Justin Hillgrove Word Travels...	456	548	0	0	2	1552	0.476563
1	DaVici Puzzles Full Moon Feast DaVici Puzzles ...	195	0	220	0	2	855	-0.550938
2	DaVici Puzzles Flying Frigate DaVici Puzzles H...	496	0	0	0	2	496	-1.335665
3	Liberty Puzzles Haeckel Hummingbirds Nautilus ...	0	707	0	0	2	1414	0.303165
4	DaVici Puzzles Diana Zimens City Of Cats	0	700	0	0	1	1400	0.284934
...	...	...	...	...	...	...	...	...
19382	Wentworth Puzzles Lars Stewart Rainbow Washi T...	0	0	260	280	2	1900	0.871166
19383	Liberty Puzzles Jean Jans The Younger The Batt...	0	0	602	0	1	1806	0.769838
19384	Wentworth Puzzles Christopher Rogers The Cove ...	500	0	500	0	2	2000	0.975250
19385	Puzzlewood Birds and Flowers JC Ayer Mystery P...	300	0	484	0	2	1752	0.709980
19386	Liberty Puzzles Plants And Insects	0	0	568	0	1	1704	0.655701

19387 rows × 8 columns

# creat list of bin names and label names
# I just picked an arbitrary number of 20 bins, to group the piece difficulty by, this can be improved on
step = (pieces_df['w_pieces_diff_transformed'].max()-pieces_df['w_pieces_diff_transformed'].min())/20
bins = np.arange(pieces_df['w_pieces_diff_transformed'].min(), pieces_df['w_pieces_diff_transformed'].max()+step, step)
labels = [f'bin{s}' for s in map(str, list(range(len(bins)-1)))]

# label each puzzle by binned piece & difficulty
pieces_df['bin_label'] = pd.cut(x = pieces_df['w_pieces_diff_transformed'], bins = bins, labels = labels, include_lowest = True)
pieces_df.head()

	name	pieces_d1	pieces_d2	pieces_d3	pieces_d4	num_puzzles	w_pieces_diff	w_pieces_diff_transformed	bin_label
0	Artifact Puzzles Justin Hillgrove Word Travels...	456	548	0	0	2	1552	0.476563	bin9
1	DaVici Puzzles Full Moon Feast DaVici Puzzles ...	195	0	220	0	2	855	-0.550938	bin6
2	DaVici Puzzles Flying Frigate DaVici Puzzles H...	496	0	0	0	2	496	-1.335665	bin3
3	Liberty Puzzles Haeckel Hummingbirds Nautilus ...	0	707	0	0	2	1414	0.303165	bin9
4	DaVici Puzzles Diana Zimens City Of Cats	0	700	0	0	1	1400	0.284934	bin8

df_cleaned.shape, pieces_df.shape

((19387, 13), (19387, 9))

df_cleaned.columns

Index(['memberID', 'holdtime', 'puzzlepack', 'pack_name', 'piece_count_1',
       'piece_count_2', 'difficulty_rating_1', 'difficulty_rating_2',
       'brand_1', 'brand_2', 'num_puzzles', 'piece_count_pack',
       'difficulty_rating_pack'],
      dtype='object')

concat_df1 = df_cleaned[['pack_name', 'holdtime', 'piece_count_1', 'piece_count_2', 'difficulty_rating_1', 
                         'difficulty_rating_2',  'brand_1','piece_count_pack', 'difficulty_rating_pack',
                         'brand_2', 'num_puzzles']]
concat_df2 = pieces_df[['pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4', 'w_pieces_diff_transformed']]

df_features_combined = pd.concat([concat_df1, concat_df2], axis=1)

# setting order and saving file
df_features_combined = df_features_combined[['pack_name', 'piece_count_1', 'piece_count_2', 'difficulty_rating_1', 
                                             'difficulty_rating_2', 'brand_1', 'brand_2', 'num_puzzles', 
                                             'pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4','piece_count_pack',
                                             'difficulty_rating_pack', 'w_pieces_diff_transformed', 'holdtime']]

df_features_combined

	pack_name	piece_count_1	piece_count_2	difficulty_rating_1	difficulty_rating_2	brand_1	brand_2	num_puzzles	pieces_d1	pieces_d2	pieces_d3	pieces_d4	piece_count_pack	difficulty_rating_pack	w_pieces_diff_transformed	holdtime
0	Artifact Puzzles Justin Hillgrove Word Travels...	456	548	1	2	Artifact	Artifact	2	456	548	0	0	1004	1	0.476563	2.939411
1	DaVici Puzzles Full Moon Feast DaVici Puzzles ...	195	220	1	3	DaVici	DaVici	2	195	0	220	0	415	2	-0.550938	0.998885
2	DaVici Puzzles Flying Frigate DaVici Puzzles H...	332	164	1	1	DaVici	DaVici	2	496	0	0	0	496	1	-1.335665	10.865032
3	Liberty Puzzles Haeckel Hummingbirds Nautilus ...	485	222	2	2	Liberty	Nautilus	2	0	707	0	0	707	2	0.303165	22.083971
4	DaVici Puzzles Diana Zimens City Of Cats	700	0	2	2	DaVici	DaVici	1	0	700	0	0	700	2	0.284934	5.077603
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
19382	Wentworth Puzzles Lars Stewart Rainbow Washi T...	280	260	4	3	Wentworth	Wentworth	2	0	0	260	280	540	3	0.871166	14.738673
19383	Liberty Puzzles Jean Jans The Younger The Batt...	602	0	3	2	Liberty	Liberty	1	0	0	602	0	602	3	0.769838	32.657524
19384	Wentworth Puzzles Christopher Rogers The Cove ...	500	500	1	3	Wentworth	Wentworth	2	500	0	500	0	1000	2	0.975250	12.775870
19385	Puzzlewood Birds and Flowers JC Ayer Mystery P...	484	300	3	1	Other-Hand-cut	Other-Hand-cut	2	300	0	484	0	784	2	0.709980	6.884502
19386	Liberty Puzzles Plants And Insects	568	0	3	2	Liberty	Liberty	1	0	0	568	0	568	3	0.655701	13.694302

19387 rows × 16 columns

df_features_combined.to_csv('data/df_features_combined.csv', index=False)