3. Feature Engineering
Contents
3. Feature Engineering#
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
df_cleaned = pd.read_csv('data/df_cleaned.csv')
# member_info = pd.read_csv('data/member_holdtime_df.csv')
member_info = df_cleaned[['memberID','holdtime','puzzlepack']]
print(member_info.info())
print(member_info.dtypes)
member_info.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19387 entries, 0 to 19386
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 memberID 19387 non-null object
1 holdtime 19387 non-null float64
2 puzzlepack 19387 non-null object
dtypes: float64(1), object(2)
memory usage: 454.5+ KB
None
memberID object
holdtime float64
puzzlepack object
dtype: object
memberID | holdtime | puzzlepack | |
---|---|---|---|
0 | member1 | 2.939411 | Artifact Puzzles Justin Hillgrove Word Travels... |
1 | member1 | 0.998885 | DaVici Puzzles Full Moon Feast DaVici Puzzles ... |
2 | member1 | 10.865032 | DaVici Puzzles Flying Frigate DaVici Puzzles H... |
3 | member1 | 22.083971 | Liberty Puzzles Haeckel Hummingbirds Nautilus ... |
4 | member1 | 5.077603 | DaVici Puzzles Diana Zimens City Of Cats |
First feature is number of pieces for each difficulty d1, d2, d3, d4#
# taking code from jon's branch
pieces_by_difficulty = []
for i, row in df_cleaned.iterrows():
out = {
'name': row['pack_name'],
'pieces_d1': 0,
'pieces_d2': 0,
'pieces_d3': 0,
'pieces_d4': 0,
'num_puzzles': row['num_puzzles']
}
if not math.isnan(row['piece_count_1']):
out[f'pieces_d{int(row["difficulty_rating_1"])}'] += row['piece_count_1']
if (row['num_puzzles'] == 2 and math.isnan(row['piece_count_1']) == False):
out[f'pieces_d{int(row["difficulty_rating_2"])}'] += row['piece_count_2']
pieces_by_difficulty.append(out)
puzzle_piece_info = pd.DataFrame(pieces_by_difficulty)
puzzle_piece_info.to_csv('data/pieces_by_puzzle_and_difficulty.csv',index=None)
pieces_d1 | pieces_d2 | pieces_d3 | pieces_d4 | num_puzzles | puzzlepack | |
---|---|---|---|---|---|---|
0 | 456 | 548 | 0 | 0 | 2 | Artifact Puzzles Justin Hillgrove Word Travels... |
1 | 195 | 0 | 220 | 0 | 2 | DaVici Puzzles Full Moon Feast DaVici Puzzles ... |
2 | 496 | 0 | 0 | 0 | 2 | DaVici Puzzles Flying Frigate DaVici Puzzles H... |
3 | 0 | 707 | 0 | 0 | 2 | Liberty Puzzles Haeckel Hummingbirds Nautilus ... |
4 | 0 | 700 | 0 | 0 | 1 | DaVici Puzzles Diana Zimens City Of Cats |
hold_times_joined = member_info.merge(puzzle_piece_info, left_on='puzzlepack', right_on='puzzlepack',how='left')
hold_times_joined.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 713169 entries, 0 to 713168
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 memberID 713169 non-null object
1 holdtime 713169 non-null float64
2 puzzlepack 713169 non-null object
3 pieces_d1 713169 non-null int64
4 pieces_d2 713169 non-null int64
5 pieces_d3 713169 non-null int64
6 pieces_d4 713169 non-null int64
7 num_puzzles 713169 non-null int64
dtypes: float64(1), int64(5), object(2)
memory usage: 49.0+ MB
hold_times_joined[hold_times_joined.num_puzzles.isna()]
hold_times_joined[hold_times_joined[['pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4', 'num_puzzles']].isna().any(axis=1)]
memberID | holdtime | puzzlepack | pieces_d1 | pieces_d2 | pieces_d3 | pieces_d4 | num_puzzles |
---|
#packs_info = pd.read_csv(r'data/packs_cleaned_dropna.csv', index_col = 0)
packs_info = df_cleaned[['puzzlepack', 'brand_1', 'brand_2', 'piece_count_1', 'piece_count_2',
'difficulty_rating_1', 'difficulty_rating_2', 'num_puzzles']]
packs_info.head()
puzzlepack | brand_1 | brand_2 | piece_count_1 | piece_count_2 | difficulty_rating_1 | difficulty_rating_2 | num_puzzles | |
---|---|---|---|---|---|---|---|---|
0 | Artifact Puzzles Justin Hillgrove Word Travels... | Artifact | Artifact | 456 | 548 | 1 | 2 | 2 |
1 | DaVici Puzzles Full Moon Feast DaVici Puzzles ... | DaVici | DaVici | 195 | 220 | 1 | 3 | 2 |
2 | DaVici Puzzles Flying Frigate DaVici Puzzles H... | DaVici | DaVici | 332 | 164 | 1 | 1 | 2 |
3 | Liberty Puzzles Haeckel Hummingbirds Nautilus ... | Liberty | Nautilus | 485 | 222 | 2 | 2 | 2 |
4 | DaVici Puzzles Diana Zimens City Of Cats | DaVici | DaVici | 700 | 0 | 2 | 2 | 1 |
hold_times_diff = hold_times_joined.merge(packs_info[['puzzlepack', 'difficulty_rating_1', 'difficulty_rating_2']], left_on='puzzlepack', right_on='puzzlepack', how='left')
hold_times_diff.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 36641275 entries, 0 to 36641274
Data columns (total 10 columns):
# Column Dtype
--- ------ -----
0 memberID object
1 holdtime float64
2 puzzlepack object
3 pieces_d1 int64
4 pieces_d2 int64
5 pieces_d3 int64
6 pieces_d4 int64
7 num_puzzles int64
8 difficulty_rating_1 int64
9 difficulty_rating_2 int64
dtypes: float64(1), int64(7), object(2)
memory usage: 3.0+ GB
hold_times_diff[hold_times_diff.difficulty_rating_1.isna() | hold_times_diff.difficulty_rating_2.isna()]
memberID | holdtime | puzzlepack | pieces_d1 | pieces_d2 | pieces_d3 | pieces_d4 | num_puzzles | difficulty_rating_1 | difficulty_rating_2 |
---|
#hold_times_diff.to_csv('data/joined_hold_times_data_all.csv')
hold_times_diff[hold_times_diff.isna().any(axis=1)]
memberID | holdtime | puzzlepack | pieces_d1 | pieces_d2 | pieces_d3 | pieces_d4 | num_puzzles | difficulty_rating_1 | difficulty_rating_2 |
---|
Second Feature is counting total number of pieces at pack level#
# making things at pack level
df_cleaned['piece_count_pack'] = 0
df_cleaned['difficulty_rating_pack'] = 0
for i in range(len(df_cleaned)):
if df_cleaned['num_puzzles'][i] == 1:
df_cleaned['piece_count_pack'][i] = df_cleaned['piece_count_1'][i]
df_cleaned['difficulty_rating_pack'][i] = df_cleaned['difficulty_rating_1'][i]
else:
df_cleaned['piece_count_pack'][i] = df_cleaned['piece_count_1'][i] + df_cleaned['piece_count_2'][i]
df_cleaned['difficulty_rating_pack'][i] = (df_cleaned['difficulty_rating_1'][i] + df_cleaned['difficulty_rating_2'][i])//2
Third feautre is combining difficulty and piece count into a single column values#
# taking code from madalyn's branch
pieces_df = pd.read_csv('data/pieces_by_puzzle_and_difficulty.csv')
pieces_df['w_pieces_diff'] = pieces_df['pieces_d1'] + pieces_df['pieces_d2']*2 + pieces_df['pieces_d3']*3 + pieces_df['pieces_d4']*4
# use box cox method to transform weighted pieces by difficulty
from sklearn.preprocessing import power_transform
pieces_df['w_pieces_diff_transformed'] = power_transform(pieces_df[['w_pieces_diff']], method='box-cox')
pieces_df
name | pieces_d1 | pieces_d2 | pieces_d3 | pieces_d4 | num_puzzles | w_pieces_diff | w_pieces_diff_transformed | |
---|---|---|---|---|---|---|---|---|
0 | Artifact Puzzles Justin Hillgrove Word Travels... | 456 | 548 | 0 | 0 | 2 | 1552 | 0.476563 |
1 | DaVici Puzzles Full Moon Feast DaVici Puzzles ... | 195 | 0 | 220 | 0 | 2 | 855 | -0.550938 |
2 | DaVici Puzzles Flying Frigate DaVici Puzzles H... | 496 | 0 | 0 | 0 | 2 | 496 | -1.335665 |
3 | Liberty Puzzles Haeckel Hummingbirds Nautilus ... | 0 | 707 | 0 | 0 | 2 | 1414 | 0.303165 |
4 | DaVici Puzzles Diana Zimens City Of Cats | 0 | 700 | 0 | 0 | 1 | 1400 | 0.284934 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
19382 | Wentworth Puzzles Lars Stewart Rainbow Washi T... | 0 | 0 | 260 | 280 | 2 | 1900 | 0.871166 |
19383 | Liberty Puzzles Jean Jans The Younger The Batt... | 0 | 0 | 602 | 0 | 1 | 1806 | 0.769838 |
19384 | Wentworth Puzzles Christopher Rogers The Cove ... | 500 | 0 | 500 | 0 | 2 | 2000 | 0.975250 |
19385 | Puzzlewood Birds and Flowers JC Ayer Mystery P... | 300 | 0 | 484 | 0 | 2 | 1752 | 0.709980 |
19386 | Liberty Puzzles Plants And Insects | 0 | 0 | 568 | 0 | 1 | 1704 | 0.655701 |
19387 rows × 8 columns
# creat list of bin names and label names
# I just picked an arbitrary number of 20 bins, to group the piece difficulty by, this can be improved on
step = (pieces_df['w_pieces_diff_transformed'].max()-pieces_df['w_pieces_diff_transformed'].min())/20
bins = np.arange(pieces_df['w_pieces_diff_transformed'].min(), pieces_df['w_pieces_diff_transformed'].max()+step, step)
labels = [f'bin{s}' for s in map(str, list(range(len(bins)-1)))]
# label each puzzle by binned piece & difficulty
pieces_df['bin_label'] = pd.cut(x = pieces_df['w_pieces_diff_transformed'], bins = bins, labels = labels, include_lowest = True)
pieces_df.head()
name | pieces_d1 | pieces_d2 | pieces_d3 | pieces_d4 | num_puzzles | w_pieces_diff | w_pieces_diff_transformed | bin_label | |
---|---|---|---|---|---|---|---|---|---|
0 | Artifact Puzzles Justin Hillgrove Word Travels... | 456 | 548 | 0 | 0 | 2 | 1552 | 0.476563 | bin9 |
1 | DaVici Puzzles Full Moon Feast DaVici Puzzles ... | 195 | 0 | 220 | 0 | 2 | 855 | -0.550938 | bin6 |
2 | DaVici Puzzles Flying Frigate DaVici Puzzles H... | 496 | 0 | 0 | 0 | 2 | 496 | -1.335665 | bin3 |
3 | Liberty Puzzles Haeckel Hummingbirds Nautilus ... | 0 | 707 | 0 | 0 | 2 | 1414 | 0.303165 | bin9 |
4 | DaVici Puzzles Diana Zimens City Of Cats | 0 | 700 | 0 | 0 | 1 | 1400 | 0.284934 | bin8 |
df_cleaned.shape, pieces_df.shape
((19387, 13), (19387, 9))
df_cleaned.columns
Index(['memberID', 'holdtime', 'puzzlepack', 'pack_name', 'piece_count_1',
'piece_count_2', 'difficulty_rating_1', 'difficulty_rating_2',
'brand_1', 'brand_2', 'num_puzzles', 'piece_count_pack',
'difficulty_rating_pack'],
dtype='object')
concat_df1 = df_cleaned[['pack_name', 'holdtime', 'piece_count_1', 'piece_count_2', 'difficulty_rating_1',
'difficulty_rating_2', 'brand_1','piece_count_pack', 'difficulty_rating_pack',
'brand_2', 'num_puzzles']]
concat_df2 = pieces_df[['pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4', 'w_pieces_diff_transformed']]
df_features_combined = pd.concat([concat_df1, concat_df2], axis=1)
# setting order and saving file
df_features_combined = df_features_combined[['pack_name', 'piece_count_1', 'piece_count_2', 'difficulty_rating_1',
'difficulty_rating_2', 'brand_1', 'brand_2', 'num_puzzles',
'pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4','piece_count_pack',
'difficulty_rating_pack', 'w_pieces_diff_transformed', 'holdtime']]
df_features_combined
pack_name | piece_count_1 | piece_count_2 | difficulty_rating_1 | difficulty_rating_2 | brand_1 | brand_2 | num_puzzles | pieces_d1 | pieces_d2 | pieces_d3 | pieces_d4 | piece_count_pack | difficulty_rating_pack | w_pieces_diff_transformed | holdtime | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Artifact Puzzles Justin Hillgrove Word Travels... | 456 | 548 | 1 | 2 | Artifact | Artifact | 2 | 456 | 548 | 0 | 0 | 1004 | 1 | 0.476563 | 2.939411 |
1 | DaVici Puzzles Full Moon Feast DaVici Puzzles ... | 195 | 220 | 1 | 3 | DaVici | DaVici | 2 | 195 | 0 | 220 | 0 | 415 | 2 | -0.550938 | 0.998885 |
2 | DaVici Puzzles Flying Frigate DaVici Puzzles H... | 332 | 164 | 1 | 1 | DaVici | DaVici | 2 | 496 | 0 | 0 | 0 | 496 | 1 | -1.335665 | 10.865032 |
3 | Liberty Puzzles Haeckel Hummingbirds Nautilus ... | 485 | 222 | 2 | 2 | Liberty | Nautilus | 2 | 0 | 707 | 0 | 0 | 707 | 2 | 0.303165 | 22.083971 |
4 | DaVici Puzzles Diana Zimens City Of Cats | 700 | 0 | 2 | 2 | DaVici | DaVici | 1 | 0 | 700 | 0 | 0 | 700 | 2 | 0.284934 | 5.077603 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
19382 | Wentworth Puzzles Lars Stewart Rainbow Washi T... | 280 | 260 | 4 | 3 | Wentworth | Wentworth | 2 | 0 | 0 | 260 | 280 | 540 | 3 | 0.871166 | 14.738673 |
19383 | Liberty Puzzles Jean Jans The Younger The Batt... | 602 | 0 | 3 | 2 | Liberty | Liberty | 1 | 0 | 0 | 602 | 0 | 602 | 3 | 0.769838 | 32.657524 |
19384 | Wentworth Puzzles Christopher Rogers The Cove ... | 500 | 500 | 1 | 3 | Wentworth | Wentworth | 2 | 500 | 0 | 500 | 0 | 1000 | 2 | 0.975250 | 12.775870 |
19385 | Puzzlewood Birds and Flowers JC Ayer Mystery P... | 484 | 300 | 3 | 1 | Other-Hand-cut | Other-Hand-cut | 2 | 300 | 0 | 484 | 0 | 784 | 2 | 0.709980 | 6.884502 |
19386 | Liberty Puzzles Plants And Insects | 568 | 0 | 3 | 2 | Liberty | Liberty | 1 | 0 | 0 | 568 | 0 | 568 | 3 | 0.655701 | 13.694302 |
19387 rows × 16 columns
df_features_combined.to_csv('data/df_features_combined.csv', index=False)