# Copyright 2022 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Each user is responsible for checking the content of datasets and the
# applicable licenses and determining if suitable for the intended use.

Preliminary Preprocessing

Read and Process E-Commerce data

In this notebook, we use a subset of a publicly available eCommerce dataset. The full dataset contains seven months of data (October 2019 to April 2020) from a large multi-category online store. Each row in the file represents an event; all events are related to products and users, so an event can be thought of as a many-to-many relation between products and users. The data was collected by the Open CDP project, and the source of the dataset is the REES46 Marketing Platform.

We use only the 2019-Oct.csv file for training our models, so please visit this page and download that CSV file: https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store.
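
If you want to confirm the download before moving on, a quick check along these lines can help (the /workspace/data/ path is only the default used later in this notebook; adjust it to wherever you stored the file):

import os

# Hypothetical location; change this to wherever you downloaded the Kaggle CSV
csv_path = "/workspace/data/2019-Oct.csv"
assert os.path.isfile(csv_path), "Expected to find 2019-Oct.csv at " + csv_path
print("File size: {:.1f} GB".format(os.path.getsize(csv_path) / 1e9))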

Import the required libraries

import os
import numpy as np 
import gc
import shutil
import glob

import cudf
import nvtabular as nvt

Read Data via cuDF from CSV

At this point we expect that you have already downloaded the 2019-Oct.csv dataset and stored it in the INPUT_DATA_DIR defined below. Note that the raw CSV is roughly 6 GB, so a single GPU with 16 GB of memory or less might run out of memory while loading it.
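
If your GPU has less memory than that, one possible workaround (a sketch only, assuming dask_cudf is installed; the rest of this notebook assumes the data fits in GPU memory as a single cuDF DataFrame) is to read the CSV lazily in partitions:

import os
import dask_cudf

INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/workspace/data/")

# Read the CSV into several partitions instead of one in-memory cuDF frame
raw_ddf = dask_cudf.read_csv(os.path.join(INPUT_DATA_DIR, "2019-Oct.csv"))
print(raw_ddf.npartitions)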

# define some information about where to get our data
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/workspace/data/")
%%time
raw_df = cudf.read_csv(os.path.join(INPUT_DATA_DIR, '2019-Oct.csv')) 
raw_df.head()
CPU times: user 3.2 s, sys: 1.5 s, total: 4.69 s
Wall time: 5.32 s
event_time event_type product_id category_id category_code brand price user_id user_session
0 2019-10-01 00:00:00 UTC view 44600062 2103807459595387724 <NA> shiseido 35.79 541312140 72d76fde-8bb3-4e00-8c23-a032dfed738c
1 2019-10-01 00:00:00 UTC view 3900821 2053013552326770905 appliances.environment.water_heater aqua 33.20 554748717 9333dfbd-b87a-4708-9857-6336556b0fcc
2 2019-10-01 00:00:01 UTC view 17200506 2053013559792632471 furniture.living_room.sofa <NA> 543.10 519107250 566511c2-e2e3-422b-b695-cf8e6e792ca8
3 2019-10-01 00:00:01 UTC view 1307067 2053013558920217191 computers.notebook lenovo 251.74 550050854 7c90fc70-0e80-4590-96f3-13c02c18c713
4 2019-10-01 00:00:04 UTC view 1004237 2053013555631882655 electronics.smartphone apple 1081.98 535871217 c6bd7419-2748-4c56-95b4-8cec9ff8b80d
raw_df.shape
(42448764, 9)

Convert the event timestamp to a datetime and an epoch-seconds integer

# Parse the raw timestamp strings into datetimes, then derive Unix epoch seconds
raw_df['event_time_dt'] = raw_df['event_time'].astype('datetime64[s]')
raw_df['event_time_ts'] = raw_df['event_time_dt'].astype('int')
raw_df.head()
event_time event_type product_id category_id category_code brand price user_id user_session event_time_dt event_time_ts
0 2019-10-01 00:00:00 UTC view 44600062 2103807459595387724 <NA> shiseido 35.79 541312140 72d76fde-8bb3-4e00-8c23-a032dfed738c 2019-10-01 00:00:00 1569888000
1 2019-10-01 00:00:00 UTC view 3900821 2053013552326770905 appliances.environment.water_heater aqua 33.20 554748717 9333dfbd-b87a-4708-9857-6336556b0fcc 2019-10-01 00:00:00 1569888000
2 2019-10-01 00:00:01 UTC view 17200506 2053013559792632471 furniture.living_room.sofa <NA> 543.10 519107250 566511c2-e2e3-422b-b695-cf8e6e792ca8 2019-10-01 00:00:01 1569888001
3 2019-10-01 00:00:01 UTC view 1307067 2053013558920217191 computers.notebook lenovo 251.74 550050854 7c90fc70-0e80-4590-96f3-13c02c18c713 2019-10-01 00:00:01 1569888001
4 2019-10-01 00:00:04 UTC view 1004237 2053013555631882655 electronics.smartphone apple 1081.98 535871217 c6bd7419-2748-4c56-95b4-8cec9ff8b80d 2019-10-01 00:00:04 1569888004
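
The event_time_ts column holds Unix epoch seconds. As a quick sanity check (using only the standard library), the first value shown above converts back to the expected UTC datetime:

import datetime

# 1569888000 seconds since the Unix epoch corresponds to 2019-10-01 00:00:00 UTC
print(datetime.datetime.fromtimestamp(1569888000, tz=datetime.timezone.utc))
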
# check out the columns with nulls
raw_df.isnull().any()
event_time       False
event_type       False
product_id       False
category_id      False
category_code     True
brand             True
price            False
user_id          False
user_session      True
event_time_dt    False
event_time_ts    False
dtype: bool
# Remove rows where `user_session` is null.
raw_df = raw_df[~raw_df['user_session'].isnull()]
len(raw_df)
42448762

We no longer need the event_time column.

raw_df = raw_df.drop(['event_time'],  axis=1)

Categorify user_session column

Although user_session is not used as an input feature for the model, it is useful to convert those raw long strings to int values, to avoid potential failures when grouping interactions by user_session in the next notebook.

cols = list(raw_df.columns)
cols.remove('user_session')
cols
['event_type',
 'product_id',
 'category_id',
 'category_code',
 'brand',
 'price',
 'user_id',
 'event_time_dt',
 'event_time_ts']
# load data 
df_event = nvt.Dataset(raw_df) 

# categorify user_session 
cat_feats = ['user_session'] >> nvt.ops.Categorify()

workflow = nvt.Workflow(cols + cat_feats)
workflow.fit(df_event)
df = workflow.transform(df_event).to_ddf().compute()
df.head()
user_session event_type product_id category_id category_code brand price user_id event_time_dt event_time_ts
0 5126085 view 44600062 2103807459595387724 <NA> shiseido 35.79 541312140 2019-10-01 00:00:00 1569888000
1 7854470 view 3900821 2053013552326770905 appliances.environment.water_heater aqua 33.20 554748717 2019-10-01 00:00:00 1569888000
2 730655 view 17200506 2053013559792632471 furniture.living_room.sofa <NA> 543.10 519107250 2019-10-01 00:00:01 1569888001
3 1637332 view 1307067 2053013558920217191 computers.notebook lenovo 251.74 550050854 2019-10-01 00:00:01 1569888001
4 4202155 view 1004237 2053013555631882655 electronics.smartphone apple 1081.98 535871217 2019-10-01 00:00:04 1569888004
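
As an optional sanity check, the user_session column is now an integer dtype, and the number of distinct encoded ids reflects the number of distinct raw session strings (Categorify may also reserve ids for nulls and out-of-vocabulary values):

# Optional check: encoded session ids are integers
print(df['user_session'].dtype)
print(df['user_session'].nunique())
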
raw_df = None
del(raw_df)
gc.collect()
145

Removing consecutive repeated (user, item) interactions

We keep repeated interactions on the same items when they are not consecutive, since those may reflect genuinely different interaction types (e.g. click, add-to-cart, purchase). Consecutive repetitions, which are often caused by browser tab refreshes, are removed.

%%time
df = df.sort_values(['user_session', 'event_time_ts']).reset_index(drop=True)

print("Count with in-session repeated interactions: {}".format(len(df)))
# Compare each row with the previous interaction (same session, same product)
df['product_id_past'] = df['product_id'].shift(1).fillna(0)
df['session_id_past'] = df['user_session'].shift(1).fillna(0)
# Keep only interactions that are not consecutive repetitions within a session
df = df[~((df['user_session'] == df['session_id_past']) & \
             (df['product_id'] == df['product_id_past']))]
print("Count after removing in-session repeated interactions: {}".format(len(df)))
del(df['product_id_past'])
del(df['session_id_past'])

gc.collect()
Count with in-session repeated interactions: 42448762
Count after removing in-session repeated interactions: 30733301
CPU times: user 789 ms, sys: 120 ms, total: 909 ms
Wall time: 1.16 s
0
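
For intuition, here is a minimal toy illustration of the same shift-based trick (written with pandas only so it runs anywhere; the column names mirror the real dataframe):

import pandas as pd

# Toy frame: the second (1, 10) row is a consecutive repetition and should be dropped
toy = pd.DataFrame({'user_session': [1, 1, 1, 2], 'product_id': [10, 10, 20, 10]})
prev_item = toy['product_id'].shift(1).fillna(0)
prev_sess = toy['user_session'].shift(1).fillna(0)
kept = toy[~((toy['user_session'] == prev_sess) & (toy['product_id'] == prev_item))]
print(kept)  # rows 0, 2 and 3 remain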

Include the item first time seen feature (for recency calculation)

We create the prod_first_event_time_ts column, which holds the timestamp at which each item was seen for the first time.

item_first_interaction_df = df.groupby('product_id').agg({'event_time_ts': 'min'}) \
            .reset_index().rename(columns={'event_time_ts': 'prod_first_event_time_ts'})
item_first_interaction_df.head()
gc.collect()
0
df = df.merge(item_first_interaction_df, on=['product_id'], how='left').reset_index(drop=True)
df.head()
user_session event_type product_id category_id category_code brand price user_id event_time_dt event_time_ts prod_first_event_time_ts
0 94 view 26202560 2053013563693335403 <NA> <NA> 388.49 512892706 2019-10-15 17:21:59 1571160119 1569925682
1 94 view 26203994 2053013563693335403 <NA> <NA> 157.79 512892706 2019-10-15 17:22:17 1571160137 1569941460
2 94 view 26204036 2053013563693335403 <NA> sokolov 471.70 512892706 2019-10-15 17:22:29 1571160149 1569897265
3 94 view 26203994 2053013563693335403 <NA> <NA> 157.79 512892706 2019-10-15 17:22:58 1571160178 1569941460
4 94 view 26203727 2053013563693335403 <NA> lucente 317.38 512892706 2019-10-15 17:23:19 1571160199 1569901056
del(item_first_interaction_df)
item_first_interaction_df=None
gc.collect()
0
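
This column is used for the recency calculation later on; as a minimal sketch (not part of this notebook's pipeline) of how a recency feature could be computed from it, the elapsed time between an interaction and the first time the item was ever seen is:

# Sketch only: item recency in days since the item was first seen
recency_days = (df['event_time_ts'] - df['prod_first_event_time_ts']) / (24 * 60 * 60)
recency_days.describe()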

In this tutorial, we only use one week of data from the Oct-2019 dataset.

# check the min date
df['event_time_dt'].min()
numpy.datetime64('2019-10-01T00:00:00')
# Filters only the first week of the data.
df = df[df['event_time_dt'] < np.datetime64('2019-10-08')].reset_index(drop=True)

We verify that we only have the first week of the Oct-2019 dataset.

df['event_time_dt'].max()
numpy.datetime64('2019-10-07T23:59:59')

We drop the event_time_dt column as it will not be used anymore.

df = df.drop(['event_time_dt'],  axis=1)
df.head()
user_session event_type product_id category_id category_code brand price user_id event_time_ts prod_first_event_time_ts
0 43 view 5300797 2053013563173241677 <NA> panasonic 39.90 513903572 1570460611 1569948287
1 43 view 5300798 2053013563173241677 <NA> panasonic 32.18 513903572 1570460616 1569934097
2 43 view 5300284 2053013563173241677 <NA> rowenta 30.86 513903572 1570460621 1569927253
3 43 view 5300382 2053013563173241677 <NA> remington 28.22 513903572 1570460636 1570026747
4 43 view 5300366 2053013563173241677 <NA> polaris 26.46 513903572 1570460650 1570097085

Save the data as a single parquet file to be used in the ETL notebook.

# save df as parquet files on disk
df.to_parquet(os.path.join(INPUT_DATA_DIR, 'Oct-2019.parquet'))
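
As an optional check (a sketch, not part of the pipeline), the file can be read back to confirm it round-trips:

# Optional: read the parquet file back and confirm the row count matches len(df)
check = cudf.read_parquet(os.path.join(INPUT_DATA_DIR, 'Oct-2019.parquet'))
print(len(check))
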
Shut down the kernel

import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)