Contents

2. Data Cleaning

Contents

2. Data Cleaning#

2.1 Members Data#

member_holdtime_df=pd.read_csv('data/member_holdtime_df.csv')
member_holdtime_df.isnull().sum()

memberID      0
holdtime      0
puzzlepack    0
dtype: int64

# removing outliers
member_holdtime_df = member_holdtime_df[(member_holdtime_df['holdtime'] >= 0.1) & 
                                        (member_holdtime_df['holdtime'] <= 150)]

2.2 Packs Data#

packs_df=pd.read_csv('data/packs_df.csv')
packs_df.head()

	pack_name	brand	piece_count	difficulty_rating	piece_count_1	piece_count_2	difficulty_rating_1	difficulty_rating_2
0	Anne Belle Thompson The Mikado Anne Belle Thom...	Other-Hand-cut	387,242	A-Easy,A-Easy	387.0	242.0	A-Easy	A-Easy
1	Anthology Puzzles Alphonse Mucha La Plume Anth...	NaN	320,160	Average,Average	320.0	160.0	Average	Average
2	Anthology Puzzles Colorful Cat Anthology Puzzl...	NaN	150,170	Really-Hard,A-Easy	150.0	170.0	Really-Hard	A-Easy
3	Anthology Puzzles Framed American Gothic Antho...	NaN	320,300	Average,Average	320.0	300.0	Average	Average
4	Anthology Puzzles Over The Moon Anthology Puzz...	NaN	278,177	Average,A-Easy	278.0	177.0	Average	A-Easy

# splitting brand name
packs_df['brand_2'] = packs_df['brand'].str.split(',', expand=True)[1]
packs_df['brand_1'] = packs_df['brand'].str.split(',', expand=True)[0]

# adding number of puzzles feature
packs_df['num_puzzles'] = packs_df['pack_name'].map(lambda n: 1 if (n[-1] == ' ') else 2, na_action='ignore')

# fixing datatype
packs_df = packs_df.astype({'piece_count_1': 'int64', 'piece_count_2': 'int64'}, errors='ignore')

# dropping the initial variables
packs_df.drop(['brand', 'piece_count', 'difficulty_rating'], axis=1, inplace=True)

packs_df.isnull().sum()

pack_name                0
piece_count_1            8
piece_count_2          181
difficulty_rating_1      4
difficulty_rating_2      4
brand_2                775
brand_1                 25
num_puzzles              0
dtype: int64

# making the brand_2 same as brand_1
# ------ must be done only for puzzle = 2, right?
packs_df.loc[(packs_df['brand_2'].isna()), 'brand_2'] = packs_df['brand_1']

packs_df[['brand_1', 'brand_2']] = packs_df[['brand_1', 'brand_2']].fillna('unknown')

_images/Notebook2-DataCleaning_14_0.png

# Some 1 puzzle packs have a second number for pieces, this seems to be legit, updating to be 2 puzzle packs
packs_df.loc[((packs_df.num_puzzles == 1) & (packs_df.piece_count_2.notna())), 'num_puzzles'] = 2

packs_df.piece_count_1 = packs_df.piece_count_1.astype('float').astype('Int64')
packs_df.piece_count_2 = packs_df.piece_count_2.astype('float').astype('Int64')

avg_pc1 = packs_df['piece_count_1'].median()
avg_pc2 = packs_df['piece_count_2'].median()

# puzzles which have no value for piece_1 is replaced with avg values
packs_df['piece_count_1'].fillna(avg_pc1, inplace=True)

# if 2 puzzles and second piececount is not available, replace with average
packs_df.loc[((packs_df['piece_count_2'].isna() ) &(packs_df['num_puzzles'] ==2)), 'piece_count_2'] = int(avg_pc2)

# puzzles with no second piece_count, make second piece count 0 
# (because num_puzzle = 2 was dealt with earlier)
packs_df.loc[
    (packs_df['piece_count_2'].isna()) & (packs_df['num_puzzles'] == 1),
    'piece_count_2',
] = 0

packs_df.isnull().sum()

pack_name              0
piece_count_1          0
piece_count_2          0
difficulty_rating_1    4
difficulty_rating_2    4
brand_2                0
brand_1                0
num_puzzles            0
dtype: int64

packs_df['difficulty_rating_1'].fillna('Average', inplace=True)
packs_df['difficulty_rating_2'].fillna('Average', inplace=True)

2.3 Merged data#

df = member_holdtime_df.merge(packs_df, left_on='puzzlepack', right_on='pack_name', how='left')
df.head(2)

	memberID	holdtime	puzzlepack	pack_name	piece_count_1	piece_count_2	difficulty_rating_1	difficulty_rating_2	brand_2	brand_1	num_puzzles
0	member1	2.939411	Artifact Puzzles Justin Hillgrove Word Travels...	Artifact Puzzles Justin Hillgrove Word Travels...	456	548	A-Easy	Average	Artifact	Artifact	2.0
1	member1	0.998885	DaVici Puzzles Full Moon Feast DaVici Puzzles ...	DaVici Puzzles Full Moon Feast DaVici Puzzles ...	195	220	A-Easy	Hard	DaVici	DaVici	2.0

df.shape, member_holdtime_df.shape

((19387, 11), (19387, 3))

member_holdtime_df.puzzlepack.nunique(), packs_df.pack_name.nunique(), 

(968, 920)

df.isnull().sum()
# about 5% of the data is missing

memberID                  0
holdtime                  0
puzzlepack                0
pack_name              1085
piece_count_1          1085
piece_count_2          1085
difficulty_rating_1    1085
difficulty_rating_2    1085
brand_2                1085
brand_1                1085
num_puzzles            1085
dtype: int64

Tip: # Ideally we should have information about all the packs. Since we do not have this information and it is recommended that we do not drop these rows, we will impute these with the average values.

2.4 Dealing with Nulls!#

Methodology

num_puzzles \(\longrightarrow\) Count number of times ‘Puzzle’ appears in puzzlepack
brand_1, brand_2 \(\longrightarrow\) Take word before ‘Puzzle’ as brand name
difficulty_rating \(\longrightarrow\) Take average at brand level from known data
piece_count \(\longrightarrow\) Take average at brand level from known data

missing_pack_data = df.loc[df['pack_name'].isna()][['puzzlepack', 'piece_count_1', 'piece_count_2',
                               'difficulty_rating_1', 'difficulty_rating_2', 'brand_1', 
                                'brand_2', 'num_puzzles']]
missing_pack_data.reset_index(drop=True, inplace=True)

# getting number of puzzles
missing_pack_data['num_puzzles'] = missing_pack_data['puzzlepack'].apply(lambda x: (x.count('Puzzles')))

# packs can have only 1 or two puzzles
for i in range(len(missing_pack_data)):
    if missing_pack_data['num_puzzles'][i] < 1:
        missing_pack_data['num_puzzles'][i] = 1
    elif missing_pack_data['num_puzzles'][i] > 2:
        missing_pack_data['num_puzzles'][i] = 2

## to limit the number of brand names we will only use the brands that we know
known_brands = list(set(packs_df['brand_1'].unique()).union(set(packs_df['brand_2'].unique())))

# getting brand_1
for i in range(len(missing_pack_data)):
    words = missing_pack_data['puzzlepack'][i].split()[:2]

    if 'Puzzles' in words[1:]:  # if its the second or third word
        if words[words.index('Puzzles')-1] in (known_brands):
            missing_pack_data['brand_1'][i] = words[words.index('Puzzles')-1]
        else:
            missing_pack_data['brand_1'][i] = 'unknown'

# getting brand_2
for i in range(len(missing_pack_data)):
    words = missing_pack_data['puzzlepack'][i].split()[2:]
    if 'Puzzles' in words[1:]:  # if its the 4th word onwards
        if words[words.index('Puzzles')-1] in (known_brands):
            missing_pack_data['brand_2'][i] = words[words.index('Puzzles')-1]
        else:
            missing_pack_data['brand_2'][i] = 'unknown'

missing_pack_data[['num_puzzles', 'brand_1', 'brand_2']].isnull().sum()

num_puzzles      0
brand_1        139
brand_2        333
dtype: int64

# if anything still missing in num_puzzles, brands
missing_pack_data['num_puzzles'].fillna(1, inplace=True)
missing_pack_data['brand_1'].fillna('unknown', inplace=True)
missing_pack_data['brand_2'].fillna('unknown', inplace=True)

# brand level average piece count values -- KNOWN data
groupby_brand_pieces_1 = packs_df.groupby("brand_1")["piece_count_1"].mean()
groupby_brand_pieces_2 = packs_df.groupby("brand_2")["piece_count_2"].mean()

# getting piece_count_1
for i in range(0, len(missing_pack_data)):
    try:
        missing_pack_data['piece_count_1'][i] = int(groupby_brand_pieces_1[missing_pack_data['brand_1'][i]])
        # getting piece_count_2 if needed
        if missing_pack_data['num_puzzles'][i] == 1:
            missing_pack_data['piece_count_2'][i] = 0
        else:
            missing_pack_data['piece_count_2'][i] = int(groupby_brand_pieces_2[missing_pack_data['brand_2'][i]])
    except Exception as e:
        pass
#         print(e)

# brand level most common difficulty values -- KNOWN data
groupby_brand_diff_1 = packs_df.groupby("brand_1")["difficulty_rating_1"].agg(pd.Series.mode)
groupby_brand_diff_2 = packs_df.groupby("brand_2")["difficulty_rating_2"].agg(pd.Series.mode)

# getting difficulty_rating_1
for i in range(0, len(missing_pack_data)):
    try:
        missing_pack_data['difficulty_rating_1'][i] = groupby_brand_diff_1[missing_pack_data['brand_1'][i]]
        # getting difficulty_rating_2 if needed
        if missing_pack_data['num_puzzles'][i] == 1:
            missing_pack_data['difficulty_rating_2'][i] = 'Average'
        else:
            missing_pack_data['difficulty_rating_2'][i] = (groupby_brand_diff_2[missing_pack_data['brand_2'][i]])
    except Exception as e:
        pass

missing_pack_data.isnull().sum()

puzzlepack             0
piece_count_1          0
piece_count_2          0
difficulty_rating_1    0
difficulty_rating_2    0
brand_1                0
brand_2                0
num_puzzles            0
dtype: int64

Comments:

No missing values in the data
We have information about 675 members and 910 unique puzzle packs

Making a final packs data#

# setting order same as missing pack to concat easily
packs_df = packs_df[['pack_name', 'piece_count_1', 'piece_count_2', 'difficulty_rating_1', 
                             'difficulty_rating_2', 'brand_1', 'brand_2', 'num_puzzles']]

# setting same column names to concat easily
missing_pack_data = missing_pack_data.rename(columns = {'puzzlepack':'pack_name'})

packs_df.shape, missing_pack_data.shape

((920, 8), (1085, 8))

packs_updated = pd.concat([packs_df, missing_pack_data])
packs_updated.reset_index(inplace=True, drop = True)
packs_updated.shape

(2005, 8)

df_cleaned = member_holdtime_df.merge(packs_updated, left_on='puzzlepack', right_on='pack_name', how='left')
df_cleaned.head(2)

	memberID	holdtime	puzzlepack	pack_name	piece_count_1	piece_count_2	difficulty_rating_1	difficulty_rating_2	brand_1	brand_2	num_puzzles
0	member1	2.939411	Artifact Puzzles Justin Hillgrove Word Travels...	Artifact Puzzles Justin Hillgrove Word Travels...	456	548	A-Easy	Average	Artifact	Artifact	2
1	member1	0.998885	DaVici Puzzles Full Moon Feast DaVici Puzzles ...	DaVici Puzzles Full Moon Feast DaVici Puzzles ...	195	220	A-Easy	Hard	DaVici	DaVici	2

df_cleaned.drop_duplicates(inplace=True)
df_cleaned.reset_index(drop=True, inplace=True)
df_cleaned.to_csv('data/joined_hold_times_data_all.csv')

difficulty_mapping = {'A-Easy': 1, 'Average': 2, 'Hard': 3, 'Really-Hard': 4}

df_cleaned['difficulty_rating_1'] = df_cleaned['difficulty_rating_1'].map(lambda x: difficulty_mapping[x], na_action='ignore')
df_cleaned['difficulty_rating_2'] = df_cleaned['difficulty_rating_2'].map(lambda x: difficulty_mapping[x], na_action='ignore')

df_cleaned.shape, df.shape, member_holdtime_df.shape

((19387, 11), (19387, 11), (19387, 3))

df_cleaned.isnull().sum()

memberID               0
holdtime               0
puzzlepack             0
pack_name              0
piece_count_1          0
piece_count_2          0
difficulty_rating_1    0
difficulty_rating_2    0
brand_1                0
brand_2                0
num_puzzles            0
dtype: int64

df_cleaned.to_csv('data/df_cleaned.csv', index=False)