# Copyright 2022 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
Preliminary Preprocessing
Read and Process E-Commerce data
In this notebook, we use a subset of a publicly available eCommerce dataset. The full dataset contains seven months of data (October 2019 to April 2020) from a large multi-category online store. Each row in the file represents an event; all events are related to products and users, and each event is a many-to-many relation between products and users. The data was collected by the Open CDP project, and the source of the dataset is the REES46 Marketing Platform.
We use only the 2019-Oct.csv file for training our models. You can download the csv file from https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store.
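If you prefer to download the file from within the notebook, one possible approach is the Kaggle CLI. The lines below are an illustrative sketch only; they assume the kaggle package is installed and authenticated with an API token, and the exact name of the downloaded archive may differ.
# illustrative sketch -- requires a configured Kaggle API token; adjust paths as needed
!kaggle datasets download -d mkechinov/ecommerce-behavior-data-from-multi-category-store -f 2019-Oct.csv -p /workspace/data/
!unzip -o /workspace/data/2019-Oct.csv.zip -d /workspace/data/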
Import the required libraries
import os
import numpy as np
import gc
import shutil
import glob
import cudf
import nvtabular as nvt
Read Data via cuDF from CSV
At this point we expect that you have already downloaded the 2019-Oct.csv dataset and stored it in the INPUT_DATA_DIR defined below. It is worth mentioning that the raw dataset is ~6 GB, so a single GPU with 16 GB of memory or less might run out of memory while loading it; a lower-memory alternative is sketched after the shape check below.
# define some information about where to get our data
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/workspace/data/")
%%time
raw_df = cudf.read_csv(os.path.join(INPUT_DATA_DIR, '2019-Oct.csv'))
raw_df.head()
CPU times: user 3.2 s, sys: 1.5 s, total: 4.69 s
Wall time: 5.32 s
 | event_time | event_type | product_id | category_id | category_code | brand | price | user_id | user_session
---|---|---|---|---|---|---|---|---|---
0 | 2019-10-01 00:00:00 UTC | view | 44600062 | 2103807459595387724 | <NA> | shiseido | 35.79 | 541312140 | 72d76fde-8bb3-4e00-8c23-a032dfed738c |
1 | 2019-10-01 00:00:00 UTC | view | 3900821 | 2053013552326770905 | appliances.environment.water_heater | aqua | 33.20 | 554748717 | 9333dfbd-b87a-4708-9857-6336556b0fcc |
2 | 2019-10-01 00:00:01 UTC | view | 17200506 | 2053013559792632471 | furniture.living_room.sofa | <NA> | 543.10 | 519107250 | 566511c2-e2e3-422b-b695-cf8e6e792ca8 |
3 | 2019-10-01 00:00:01 UTC | view | 1307067 | 2053013558920217191 | computers.notebook | lenovo | 251.74 | 550050854 | 7c90fc70-0e80-4590-96f3-13c02c18c713 |
4 | 2019-10-01 00:00:04 UTC | view | 1004237 | 2053013555631882655 | electronics.smartphone | apple | 1081.98 | 535871217 | c6bd7419-2748-4c56-95b4-8cec9ff8b80d |
raw_df.shape
(42448764, 9)
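If the read above exhausts GPU memory, a lower-memory alternative is to load the CSV as a partitioned dask_cudf DataFrame. This is a minimal sketch and assumes dask_cudf is installed; note that the rest of this notebook operates on an in-memory cudf DataFrame, so you would still need enough total GPU memory to materialize it.
# sketch only: lazily read the CSV into partitions to reduce peak memory
import dask_cudf
raw_ddf = dask_cudf.read_csv(os.path.join(INPUT_DATA_DIR, '2019-Oct.csv'))
print(raw_ddf.npartitions)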
Convert the event_time column to datetime and Unix timestamp (seconds)
# parse the raw string timestamps into datetime, then cast to epoch seconds
raw_df['event_time_dt'] = raw_df['event_time'].astype('datetime64[s]')
raw_df['event_time_ts'] = raw_df['event_time_dt'].astype('int')
raw_df.head()
 | event_time | event_type | product_id | category_id | category_code | brand | price | user_id | user_session | event_time_dt | event_time_ts
---|---|---|---|---|---|---|---|---|---|---|---
0 | 2019-10-01 00:00:00 UTC | view | 44600062 | 2103807459595387724 | <NA> | shiseido | 35.79 | 541312140 | 72d76fde-8bb3-4e00-8c23-a032dfed738c | 2019-10-01 00:00:00 | 1569888000 |
1 | 2019-10-01 00:00:00 UTC | view | 3900821 | 2053013552326770905 | appliances.environment.water_heater | aqua | 33.20 | 554748717 | 9333dfbd-b87a-4708-9857-6336556b0fcc | 2019-10-01 00:00:00 | 1569888000 |
2 | 2019-10-01 00:00:01 UTC | view | 17200506 | 2053013559792632471 | furniture.living_room.sofa | <NA> | 543.10 | 519107250 | 566511c2-e2e3-422b-b695-cf8e6e792ca8 | 2019-10-01 00:00:01 | 1569888001 |
3 | 2019-10-01 00:00:01 UTC | view | 1307067 | 2053013558920217191 | computers.notebook | lenovo | 251.74 | 550050854 | 7c90fc70-0e80-4590-96f3-13c02c18c713 | 2019-10-01 00:00:01 | 1569888001 |
4 | 2019-10-01 00:00:04 UTC | view | 1004237 | 2053013555631882655 | electronics.smartphone | apple | 1081.98 | 535871217 | c6bd7419-2748-4c56-95b4-8cec9ff8b80d | 2019-10-01 00:00:04 | 1569888004 |
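As a quick illustration of the conversion above: casting a datetime64[s] value to an integer yields Unix epoch seconds. The toy check below is illustrative only and assumes cuDF parses the trailing 'UTC' suffix the same way it did for the full file.
# toy sanity check: 2019-10-01 00:00:00 UTC corresponds to 1569888000 epoch seconds
toy_ts = cudf.Series(['2019-10-01 00:00:00 UTC']).astype('datetime64[s]')
print(toy_ts.astype('int'))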
# check out the columns with nulls
raw_df.isnull().any()
event_time False
event_type False
product_id False
category_id False
category_code True
brand True
price False
user_id False
user_session True
event_time_dt False
event_time_ts False
dtype: bool
# Remove rows where `user_session` is null.
raw_df = raw_df[~raw_df['user_session'].isnull()]
len(raw_df)
42448762
We no longer need the event_time column.
raw_df = raw_df.drop(['event_time'], axis=1)
Categorify the user_session column
Although user_session is not used as an input feature for the model, it is useful to convert those raw long strings to int values to avoid potential failures when grouping interactions by user_session in the next notebook.
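To make the effect of Categorify concrete, here is a minimal toy sketch (illustrative only; it uses the same NVTabular API as the cells below) that maps a handful of made-up session strings to integer ids:
# toy illustration: Categorify replaces each unique string with a contiguous integer id
toy = cudf.DataFrame({'user_session': ['sess-a', 'sess-b', 'sess-a', 'sess-c']})
toy_workflow = nvt.Workflow(['user_session'] >> nvt.ops.Categorify())
print(toy_workflow.fit_transform(nvt.Dataset(toy)).to_ddf().compute())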
cols = list(raw_df.columns)
cols.remove('user_session')
cols
['event_type',
'product_id',
'category_id',
'category_code',
'brand',
'price',
'user_id',
'event_time_dt',
'event_time_ts']
# load data
df_event = nvt.Dataset(raw_df)
# categorify user_session
cat_feats = ['user_session'] >> nvt.ops.Categorify()
workflow = nvt.Workflow(cols + cat_feats)
workflow.fit(df_event)
df = workflow.transform(df_event).to_ddf().compute()
df.head()
 | user_session | event_type | product_id | category_id | category_code | brand | price | user_id | event_time_dt | event_time_ts
---|---|---|---|---|---|---|---|---|---|---
0 | 5126085 | view | 44600062 | 2103807459595387724 | <NA> | shiseido | 35.79 | 541312140 | 2019-10-01 00:00:00 | 1569888000 |
1 | 7854470 | view | 3900821 | 2053013552326770905 | appliances.environment.water_heater | aqua | 33.20 | 554748717 | 2019-10-01 00:00:00 | 1569888000 |
2 | 730655 | view | 17200506 | 2053013559792632471 | furniture.living_room.sofa | <NA> | 543.10 | 519107250 | 2019-10-01 00:00:01 | 1569888001 |
3 | 1637332 | view | 1307067 | 2053013558920217191 | computers.notebook | lenovo | 251.74 | 550050854 | 2019-10-01 00:00:01 | 1569888001 |
4 | 4202155 | view | 1004237 | 2053013555631882655 | electronics.smartphone | apple | 1081.98 | 535871217 | 2019-10-01 00:00:04 | 1569888004 |
# release the raw dataframe to free GPU memory
del raw_df
gc.collect()
145
Removing consecutive repeated (user, item) interactions
We keep repeated interactions on the same items and remove only consecutive repetitions, since those are often artifacts of browser tab refreshes or of different interaction types on the same product (e.g. click, add-to-cart, purchase).
%%time
df = df.sort_values(['user_session', 'event_time_ts']).reset_index(drop=True)
print("Count with in-session repeated interactions: {}".format(len(df)))
# the dataframe is now sorted by session and timestamp, so a one-row shift
# gives the previous interaction in that ordering
df['product_id_past'] = df['product_id'].shift(1).fillna(0)
df['session_id_past'] = df['user_session'].shift(1).fillna(0)
# keep an interaction only if it does not repeat the previous product within the same session
df = df[~((df['user_session'] == df['session_id_past']) & \
          (df['product_id'] == df['product_id_past']))]
print("Count after removing consecutive repeated interactions: {}".format(len(df)))
del(df['product_id_past'])
del(df['session_id_past'])
gc.collect()
Count with in-session repeated interactions: 42448762
Count after removing consecutive repeated interactions: 30733301
CPU times: user 789 ms, sys: 120 ms, total: 909 ms
Wall time: 1.16 s
0
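To see the filtering logic in isolation, here is a toy example on hypothetical data: the shift compares each row with the previous one, so an immediate repeat of the same product in the same session is dropped, while a later, non-consecutive repeat is kept.
# toy illustration of the consecutive-repeat filter: row 1 repeats product 10
# immediately within session 1 and is dropped; the non-consecutive repeat in row 3 is kept
toy = cudf.DataFrame({'user_session': [1, 1, 1, 1], 'product_id': [10, 10, 20, 10]})
prev_sess = toy['user_session'].shift(1).fillna(0)
prev_prod = toy['product_id'].shift(1).fillna(0)
print(toy[~((toy['user_session'] == prev_sess) & (toy['product_id'] == prev_prod))])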
Include the item first-seen timestamp (for recency calculation)
We create the prod_first_event_time_ts column, which stores the timestamp at which each item was first seen in the dataset.
item_first_interaction_df = df.groupby('product_id').agg({'event_time_ts': 'min'}) \
.reset_index().rename(columns={'event_time_ts': 'prod_first_event_time_ts'})
item_first_interaction_df.head()
gc.collect()
0
df = df.merge(item_first_interaction_df, on=['product_id'], how='left').reset_index(drop=True)
df.head()
 | user_session | event_type | product_id | category_id | category_code | brand | price | user_id | event_time_dt | event_time_ts | prod_first_event_time_ts
---|---|---|---|---|---|---|---|---|---|---|---
0 | 94 | view | 26202560 | 2053013563693335403 | <NA> | <NA> | 388.49 | 512892706 | 2019-10-15 17:21:59 | 1571160119 | 1569925682 |
1 | 94 | view | 26203994 | 2053013563693335403 | <NA> | <NA> | 157.79 | 512892706 | 2019-10-15 17:22:17 | 1571160137 | 1569941460 |
2 | 94 | view | 26204036 | 2053013563693335403 | <NA> | sokolov | 471.70 | 512892706 | 2019-10-15 17:22:29 | 1571160149 | 1569897265 |
3 | 94 | view | 26203994 | 2053013563693335403 | <NA> | <NA> | 157.79 | 512892706 | 2019-10-15 17:22:58 | 1571160178 | 1569941460 |
4 | 94 | view | 26203727 | 2053013563693335403 | <NA> | lucente | 317.38 | 512892706 | 2019-10-15 17:23:19 | 1571160199 | 1569901056 |
# release the lookup dataframe to free memory
del item_first_interaction_df
gc.collect()
0
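The prod_first_event_time_ts column can later be combined with event_time_ts to derive a recency-style feature, e.g. the item's age in seconds at the time of each interaction. The lines below are illustrative only and not part of this notebook's pipeline.
# illustrative only: item age (in seconds) at the time of each interaction
item_age_seconds = df['event_time_ts'] - df['prod_first_event_time_ts']
print(item_age_seconds.head())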
In this tutorial, we use only one week of data from the Oct-2019 dataset.
# check the min date
df['event_time_dt'].min()
numpy.datetime64('2019-10-01T00:00:00')
# Filters only the first week of the data.
df = df[df['event_time_dt'] < np.datetime64('2019-10-08')].reset_index(drop=True)
We verify that only the first week of the Oct-2019 dataset remains.
df['event_time_dt'].max()
numpy.datetime64('2019-10-07T23:59:59')
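Optionally, you can check how the retained interactions are distributed over the seven days; this quick sanity check is not required for the rest of the pipeline.
# optional check: number of interactions per calendar day in the retained week
print(df['event_time_dt'].dt.day.value_counts().sort_index())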
We drop the event_time_dt column as it will no longer be used.
df = df.drop(['event_time_dt'], axis=1)
df.head()
 | user_session | event_type | product_id | category_id | category_code | brand | price | user_id | event_time_ts | prod_first_event_time_ts
---|---|---|---|---|---|---|---|---|---|---
0 | 43 | view | 5300797 | 2053013563173241677 | <NA> | panasonic | 39.90 | 513903572 | 1570460611 | 1569948287 |
1 | 43 | view | 5300798 | 2053013563173241677 | <NA> | panasonic | 32.18 | 513903572 | 1570460616 | 1569934097 |
2 | 43 | view | 5300284 | 2053013563173241677 | <NA> | rowenta | 30.86 | 513903572 | 1570460621 | 1569927253 |
3 | 43 | view | 5300382 | 2053013563173241677 | <NA> | remington | 28.22 | 513903572 | 1570460636 | 1570026747 |
4 | 43 | view | 5300366 | 2053013563173241677 | <NA> | polaris | 26.46 | 513903572 | 1570460650 | 1570097085 |
Save the data as a single parquet file to be used in the ETL notebook.
# save df as parquet files on disk
df.to_parquet(os.path.join(INPUT_DATA_DIR, 'Oct-2019.parquet'))
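As an optional check, the file can be read back to confirm it was written correctly and to inspect its shape.
# optional: read the parquet file back and confirm the row count
check_df = cudf.read_parquet(os.path.join(INPUT_DATA_DIR, 'Oct-2019.parquet'))
print(check_df.shape)
del check_df
gc.collect()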
Shut down the kernel
import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(True)