2. Data Cleaning#

2.1 Members Data#

member_holdtime_df=pd.read_csv('data/member_holdtime_df.csv')
member_holdtime_df.isnull().sum()
memberID      0
holdtime      0
puzzlepack    0
dtype: int64
# removing outliers
member_holdtime_df = member_holdtime_df[(member_holdtime_df['holdtime'] >= 0.1) & 
                                        (member_holdtime_df['holdtime'] <= 150)]

2.2 Packs Data#

packs_df=pd.read_csv('data/packs_df.csv')
packs_df.head()
pack_name brand piece_count difficulty_rating piece_count_1 piece_count_2 difficulty_rating_1 difficulty_rating_2
0 Anne Belle Thompson The Mikado Anne Belle Thom... Other-Hand-cut 387,242 A-Easy,A-Easy 387.0 242.0 A-Easy A-Easy
1 Anthology Puzzles Alphonse Mucha La Plume Anth... NaN 320,160 Average,Average 320.0 160.0 Average Average
2 Anthology Puzzles Colorful Cat Anthology Puzzl... NaN 150,170 Really-Hard,A-Easy 150.0 170.0 Really-Hard A-Easy
3 Anthology Puzzles Framed American Gothic Antho... NaN 320,300 Average,Average 320.0 300.0 Average Average
4 Anthology Puzzles Over The Moon Anthology Puzz... NaN 278,177 Average,A-Easy 278.0 177.0 Average A-Easy
# splitting brand name
packs_df['brand_2'] = packs_df['brand'].str.split(',', expand=True)[1]
packs_df['brand_1'] = packs_df['brand'].str.split(',', expand=True)[0]
# adding number of puzzles feature
packs_df['num_puzzles'] = packs_df['pack_name'].map(lambda n: 1 if (n[-1] == ' ') else 2, na_action='ignore')
# fixing datatype
packs_df = packs_df.astype({'piece_count_1': 'int64', 'piece_count_2': 'int64'}, errors='ignore')
# dropping the initial variables
packs_df.drop(['brand', 'piece_count', 'difficulty_rating'], axis=1, inplace=True)
packs_df.isnull().sum()
pack_name                0
piece_count_1            8
piece_count_2          181
difficulty_rating_1      4
difficulty_rating_2      4
brand_2                775
brand_1                 25
num_puzzles              0
dtype: int64
# making the brand_2 same as brand_1
# ------ must be done only for puzzle = 2, right?
packs_df.loc[(packs_df['brand_2'].isna()), 'brand_2'] = packs_df['brand_1']
packs_df[['brand_1', 'brand_2']] = packs_df[['brand_1', 'brand_2']].fillna('unknown')
_images/Notebook2-DataCleaning_14_0.png
# Some 1 puzzle packs have a second number for pieces, this seems to be legit, updating to be 2 puzzle packs
packs_df.loc[((packs_df.num_puzzles == 1) & (packs_df.piece_count_2.notna())), 'num_puzzles'] = 2
packs_df.piece_count_1 = packs_df.piece_count_1.astype('float').astype('Int64')
packs_df.piece_count_2 = packs_df.piece_count_2.astype('float').astype('Int64')

avg_pc1 = packs_df['piece_count_1'].median()
avg_pc2 = packs_df['piece_count_2'].median()
# puzzles which have no value for piece_1 is replaced with avg values
packs_df['piece_count_1'].fillna(avg_pc1, inplace=True)
# if 2 puzzles and second piececount is not available, replace with average
packs_df.loc[((packs_df['piece_count_2'].isna() ) &(packs_df['num_puzzles'] ==2)), 'piece_count_2'] = int(avg_pc2)
# puzzles with no second piece_count, make second piece count 0 
# (because num_puzzle = 2 was dealt with earlier)
packs_df.loc[
    (packs_df['piece_count_2'].isna()) & (packs_df['num_puzzles'] == 1),
    'piece_count_2',
] = 0
packs_df.isnull().sum()
pack_name              0
piece_count_1          0
piece_count_2          0
difficulty_rating_1    4
difficulty_rating_2    4
brand_2                0
brand_1                0
num_puzzles            0
dtype: int64
packs_df['difficulty_rating_1'].fillna('Average', inplace=True)
packs_df['difficulty_rating_2'].fillna('Average', inplace=True)

2.3 Merged data#

df = member_holdtime_df.merge(packs_df, left_on='puzzlepack', right_on='pack_name', how='left')
df.head(2)
memberID holdtime puzzlepack pack_name piece_count_1 piece_count_2 difficulty_rating_1 difficulty_rating_2 brand_2 brand_1 num_puzzles
0 member1 2.939411 Artifact Puzzles Justin Hillgrove Word Travels... Artifact Puzzles Justin Hillgrove Word Travels... 456 548 A-Easy Average Artifact Artifact 2.0
1 member1 0.998885 DaVici Puzzles Full Moon Feast DaVici Puzzles ... DaVici Puzzles Full Moon Feast DaVici Puzzles ... 195 220 A-Easy Hard DaVici DaVici 2.0
df.shape, member_holdtime_df.shape
((19387, 11), (19387, 3))
member_holdtime_df.puzzlepack.nunique(), packs_df.pack_name.nunique(), 
(968, 920)
df.isnull().sum()
# about 5% of the data is missing
memberID                  0
holdtime                  0
puzzlepack                0
pack_name              1085
piece_count_1          1085
piece_count_2          1085
difficulty_rating_1    1085
difficulty_rating_2    1085
brand_2                1085
brand_1                1085
num_puzzles            1085
dtype: int64
Tip: # Ideally we should have information about all the packs. Since we do not have this information and it is recommended that we do not drop these rows, we will impute these with the average values.

2.4 Dealing with Nulls!#

Methodology

  • num_puzzles \(\longrightarrow\) Count number of times ‘Puzzle’ appears in puzzlepack

  • brand_1, brand_2 \(\longrightarrow\) Take word before ‘Puzzle’ as brand name

  • difficulty_rating \(\longrightarrow\) Take average at brand level from known data

  • piece_count \(\longrightarrow\) Take average at brand level from known data

missing_pack_data = df.loc[df['pack_name'].isna()][['puzzlepack', 'piece_count_1', 'piece_count_2',
                               'difficulty_rating_1', 'difficulty_rating_2', 'brand_1', 
                                'brand_2', 'num_puzzles']]
missing_pack_data.reset_index(drop=True, inplace=True)
# getting number of puzzles
missing_pack_data['num_puzzles'] = missing_pack_data['puzzlepack'].apply(lambda x: (x.count('Puzzles')))
# packs can have only 1 or two puzzles
for i in range(len(missing_pack_data)):
    if missing_pack_data['num_puzzles'][i] < 1:
        missing_pack_data['num_puzzles'][i] = 1
    elif missing_pack_data['num_puzzles'][i] > 2:
        missing_pack_data['num_puzzles'][i] = 2
## to limit the number of brand names we will only use the brands that we know
known_brands = list(set(packs_df['brand_1'].unique()).union(set(packs_df['brand_2'].unique())))
# getting brand_1
for i in range(len(missing_pack_data)):
    words = missing_pack_data['puzzlepack'][i].split()[:2]

    if 'Puzzles' in words[1:]:  # if its the second or third word
        if words[words.index('Puzzles')-1] in (known_brands):
            missing_pack_data['brand_1'][i] = words[words.index('Puzzles')-1]
        else:
            missing_pack_data['brand_1'][i] = 'unknown'
# getting brand_2
for i in range(len(missing_pack_data)):
    words = missing_pack_data['puzzlepack'][i].split()[2:]
    if 'Puzzles' in words[1:]:  # if its the 4th word onwards
        if words[words.index('Puzzles')-1] in (known_brands):
            missing_pack_data['brand_2'][i] = words[words.index('Puzzles')-1]
        else:
            missing_pack_data['brand_2'][i] = 'unknown'
missing_pack_data[['num_puzzles', 'brand_1', 'brand_2']].isnull().sum()
num_puzzles      0
brand_1        139
brand_2        333
dtype: int64
# if anything still missing in num_puzzles, brands
missing_pack_data['num_puzzles'].fillna(1, inplace=True)
missing_pack_data['brand_1'].fillna('unknown', inplace=True)
missing_pack_data['brand_2'].fillna('unknown', inplace=True)
# brand level average piece count values -- KNOWN data
groupby_brand_pieces_1 = packs_df.groupby("brand_1")["piece_count_1"].mean()
groupby_brand_pieces_2 = packs_df.groupby("brand_2")["piece_count_2"].mean()
# getting piece_count_1
for i in range(0, len(missing_pack_data)):
    try:
        missing_pack_data['piece_count_1'][i] = int(groupby_brand_pieces_1[missing_pack_data['brand_1'][i]])
        # getting piece_count_2 if needed
        if missing_pack_data['num_puzzles'][i] == 1:
            missing_pack_data['piece_count_2'][i] = 0
        else:
            missing_pack_data['piece_count_2'][i] = int(groupby_brand_pieces_2[missing_pack_data['brand_2'][i]])
    except Exception as e:
        pass
#         print(e)
# brand level most common difficulty values -- KNOWN data
groupby_brand_diff_1 = packs_df.groupby("brand_1")["difficulty_rating_1"].agg(pd.Series.mode)
groupby_brand_diff_2 = packs_df.groupby("brand_2")["difficulty_rating_2"].agg(pd.Series.mode)
# getting difficulty_rating_1
for i in range(0, len(missing_pack_data)):
    try:
        missing_pack_data['difficulty_rating_1'][i] = groupby_brand_diff_1[missing_pack_data['brand_1'][i]]
        # getting difficulty_rating_2 if needed
        if missing_pack_data['num_puzzles'][i] == 1:
            missing_pack_data['difficulty_rating_2'][i] = 'Average'
        else:
            missing_pack_data['difficulty_rating_2'][i] = (groupby_brand_diff_2[missing_pack_data['brand_2'][i]])
    except Exception as e:
        pass
missing_pack_data.isnull().sum()
puzzlepack             0
piece_count_1          0
piece_count_2          0
difficulty_rating_1    0
difficulty_rating_2    0
brand_1                0
brand_2                0
num_puzzles            0
dtype: int64

Comments:

  • No missing values in the data

  • We have information about 675 members and 910 unique puzzle packs

Making a final packs data#

# setting order same as missing pack to concat easily
packs_df = packs_df[['pack_name', 'piece_count_1', 'piece_count_2', 'difficulty_rating_1', 
                             'difficulty_rating_2', 'brand_1', 'brand_2', 'num_puzzles']]
# setting same column names to concat easily
missing_pack_data = missing_pack_data.rename(columns = {'puzzlepack':'pack_name'})
packs_df.shape, missing_pack_data.shape
((920, 8), (1085, 8))
packs_updated = pd.concat([packs_df, missing_pack_data])
packs_updated.reset_index(inplace=True, drop = True)
packs_updated.shape
(2005, 8)
df_cleaned = member_holdtime_df.merge(packs_updated, left_on='puzzlepack', right_on='pack_name', how='left')
df_cleaned.head(2)
memberID holdtime puzzlepack pack_name piece_count_1 piece_count_2 difficulty_rating_1 difficulty_rating_2 brand_1 brand_2 num_puzzles
0 member1 2.939411 Artifact Puzzles Justin Hillgrove Word Travels... Artifact Puzzles Justin Hillgrove Word Travels... 456 548 A-Easy Average Artifact Artifact 2
1 member1 0.998885 DaVici Puzzles Full Moon Feast DaVici Puzzles ... DaVici Puzzles Full Moon Feast DaVici Puzzles ... 195 220 A-Easy Hard DaVici DaVici 2
df_cleaned.drop_duplicates(inplace=True)
df_cleaned.reset_index(drop=True, inplace=True)
df_cleaned.to_csv('data/joined_hold_times_data_all.csv')
difficulty_mapping = {'A-Easy': 1, 'Average': 2, 'Hard': 3, 'Really-Hard': 4}

df_cleaned['difficulty_rating_1'] = df_cleaned['difficulty_rating_1'].map(lambda x: difficulty_mapping[x], na_action='ignore')
df_cleaned['difficulty_rating_2'] = df_cleaned['difficulty_rating_2'].map(lambda x: difficulty_mapping[x], na_action='ignore')
df_cleaned.shape, df.shape, member_holdtime_df.shape
((19387, 11), (19387, 11), (19387, 3))
df_cleaned.isnull().sum()
memberID               0
holdtime               0
puzzlepack             0
pack_name              0
piece_count_1          0
piece_count_2          0
difficulty_rating_1    0
difficulty_rating_2    0
brand_1                0
brand_2                0
num_puzzles            0
dtype: int64
df_cleaned.to_csv('data/df_cleaned.csv', index=False)