# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

Preliminary Preprocessing

Read and Process E-Commerce data

In this notebook, we are going to use a subset of a publicly available eCommerce dataset. The full dataset contains seven months of data (from October 2019 to April 2020) from a large multi-category online store. Each row in the file represents an event; all events are related to products and users, and each event expresses a many-to-many relation between products and users. The data was collected by the Open CDP project, and the source of the dataset is the REES46 Marketing Platform.

We use only the 2019-Oct.csv file for training our models; you can visit this page and download the csv file: https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store.

Import the required libraries

import os
import numpy as np 
import gc
import shutil
import glob

import cudf
import nvtabular as nvt

Read Data via cuDF from CSV

At this point we expect that you have already downloaded the 2019-Oct.csv dataset and stored it in the INPUT_DATA_DIR defined below. It is worth mentioning that the raw dataset is ~6 GB, so a single GPU with 16 GB of memory or less might run out of memory while processing it. To avoid that, you can start directly from the second notebook, 02-ETL_with_NVTabular, using the Oct-2019.parquet file provided there.
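
If the full read below exhausts device memory on a smaller GPU, one option is to pass narrower dtypes for the numeric columns to cudf.read_csv. This is only a sketch, not the notebook's default path, and it assumes int32/float32 are wide enough for the ids and prices in this file (INPUT_DATA_DIR is defined in the next cell):

# Optional, lower-memory variant (a sketch; verify the narrower dtypes fit your data)
narrow_dtypes = {'product_id': 'int32', 'user_id': 'int32', 'price': 'float32'}
raw_df = cudf.read_csv(os.path.join(INPUT_DATA_DIR, '2019-Oct.csv'), dtype=narrow_dtypes)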

# define some information about where to get our data
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/workspace/data/")
%%time
raw_df = cudf.read_csv(os.path.join(INPUT_DATA_DIR, '2019-Oct.csv')) 
raw_df.head()
CPU times: user 3.2 s, sys: 1.5 s, total: 4.69 s
Wall time: 5.32 s
event_time event_type product_id category_id category_code brand price user_id user_session
0 2019-10-01 00:00:00 UTC view 44600062 2103807459595387724 <NA> shiseido 35.79 541312140 72d76fde-8bb3-4e00-8c23-a032dfed738c
1 2019-10-01 00:00:00 UTC view 3900821 2053013552326770905 appliances.environment.water_heater aqua 33.20 554748717 9333dfbd-b87a-4708-9857-6336556b0fcc
2 2019-10-01 00:00:01 UTC view 17200506 2053013559792632471 furniture.living_room.sofa <NA> 543.10 519107250 566511c2-e2e3-422b-b695-cf8e6e792ca8
3 2019-10-01 00:00:01 UTC view 1307067 2053013558920217191 computers.notebook lenovo 251.74 550050854 7c90fc70-0e80-4590-96f3-13c02c18c713
4 2019-10-01 00:00:04 UTC view 1004237 2053013555631882655 electronics.smartphone apple 1081.98 535871217 c6bd7419-2748-4c56-95b4-8cec9ff8b80d
raw_df.shape
(42448764, 9)

Convert timestamp from datetime

raw_df['event_time_dt'] = raw_df['event_time'].astype('datetime64[s]')
raw_df['event_time_ts'] = raw_df['event_time_dt'].astype('int')  # seconds since the Unix epoch
raw_df.head()
event_time event_type product_id category_id category_code brand price user_id user_session event_time_dt event_time_ts
0 2019-10-01 00:00:00 UTC view 44600062 2103807459595387724 <NA> shiseido 35.79 541312140 72d76fde-8bb3-4e00-8c23-a032dfed738c 2019-10-01 00:00:00 1569888000
1 2019-10-01 00:00:00 UTC view 3900821 2053013552326770905 appliances.environment.water_heater aqua 33.20 554748717 9333dfbd-b87a-4708-9857-6336556b0fcc 2019-10-01 00:00:00 1569888000
2 2019-10-01 00:00:01 UTC view 17200506 2053013559792632471 furniture.living_room.sofa <NA> 543.10 519107250 566511c2-e2e3-422b-b695-cf8e6e792ca8 2019-10-01 00:00:01 1569888001
3 2019-10-01 00:00:01 UTC view 1307067 2053013558920217191 computers.notebook lenovo 251.74 550050854 7c90fc70-0e80-4590-96f3-13c02c18c713 2019-10-01 00:00:01 1569888001
4 2019-10-01 00:00:04 UTC view 1004237 2053013555631882655 electronics.smartphone apple 1081.98 535871217 c6bd7419-2748-4c56-95b4-8cec9ff8b80d 2019-10-01 00:00:04 1569888004
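
As an optional sanity check (not part of the original flow), casting the integer column back to a datetime confirms that event_time_ts holds Unix seconds:

# Optional check: event_time_ts should round-trip back to event_time_dt,
# since the integers are seconds since the Unix epoch.
raw_df['event_time_ts'].astype('datetime64[s]').head()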
# check out the columns with nulls
raw_df.isnull().any()
event_time       False
event_type       False
product_id       False
category_id      False
category_code     True
brand             True
price            False
user_id          False
user_session      True
event_time_dt    False
event_time_ts    False
dtype: bool
# Remove rows where `user_session` is null.
raw_df = raw_df[raw_df['user_session'].notnull()]
len(raw_df)
42448762

We no longer need the event_time column.

raw_df = raw_df.drop(['event_time'],  axis=1)

Categorify user_session column

Although user_session is not used as an input feature for the model, it is useful to convert those long raw strings to int values to avoid potential failures when we group interactions by user_session in the next notebook.

cols = list(raw_df.columns)
cols.remove('user_session')
cols
['event_type',
 'product_id',
 'category_id',
 'category_code',
 'brand',
 'price',
 'user_id',
 'event_time_dt',
 'event_time_ts']
# load data 
df_event = nvt.Dataset(raw_df) 

# categorify user_session 
cat_feats = ['user_session'] >> nvt.ops.Categorify()

# the remaining columns are passed through unchanged
workflow = nvt.Workflow(cols + cat_feats)
workflow.fit(df_event)
df = workflow.transform(df_event).to_ddf().compute()
df.head()
user_session event_type product_id category_id category_code brand price user_id event_time_dt event_time_ts
0 5126085 view 44600062 2103807459595387724 <NA> shiseido 35.79 541312140 2019-10-01 00:00:00 1569888000
1 7854470 view 3900821 2053013552326770905 appliances.environment.water_heater aqua 33.20 554748717 2019-10-01 00:00:00 1569888000
2 730655 view 17200506 2053013559792632471 furniture.living_room.sofa <NA> 543.10 519107250 2019-10-01 00:00:01 1569888001
3 1637332 view 1307067 2053013558920217191 computers.notebook lenovo 251.74 550050854 2019-10-01 00:00:01 1569888001
4 4202155 view 1004237 2053013555631882655 electronics.smartphone apple 1081.98 535871217 2019-10-01 00:00:04 1569888004
raw_df = None
del(raw_df)
gc.collect()
145
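
Side note: depending on the NVTabular version, Categorify typically persists its string-to-integer mapping as a parquet file under a categories/ directory in the workflow's output path (the exact location below is an assumption), which can be handy if you ever need to map the encoded user_session values back:

# Optional: inspect the persisted Categorify mapping, if present
# (the path is an assumption and may differ across NVTabular versions).
for path in glob.glob('./categories/unique.user_session.parquet'):
    print(path)
    print(cudf.read_parquet(path).head())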

Removing consecutive repeated (user, item) interactions

We keep repeated interactions on the same items, removing only consecutive repetitions, because these might be due to browser tab refreshes or different interaction types (e.g. click, add-to-cart, purchase).

%%time
df = df.sort_values(['user_session', 'event_time_ts']).reset_index(drop=True)

print("Count with in-session repeated interactions: {}".format(len(df)))
# Shift product_id and user_session to compare each interaction with the previous row
df['product_id_past'] = df['product_id'].shift(1).fillna(0)
df['session_id_past'] = df['user_session'].shift(1).fillna(0)
# Keep only interactions that are not consecutive repetitions within the same session
df = df[~((df['user_session'] == df['session_id_past']) & \
             (df['product_id'] == df['product_id_past']))]
print("Count after removed in-session repeated interactions: {}".format(len(df)))
del(df['product_id_past'])
del(df['session_id_past'])

gc.collect()
Count with in-session repeated interactions: 42448762
Count after removed in-session repeated interactions: 30733301
CPU times: user 789 ms, sys: 120 ms, total: 909 ms
Wall time: 1.16 s
0
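
To make the filter above concrete, here is a minimal, self-contained illustration of the same shift-based logic on a tiny hand-made frame (the values are made up purely for illustration):

# Toy illustration of the consecutive-repetition filter (made-up values).
toy = cudf.DataFrame({
    'user_session': [1, 1, 1, 2, 2],
    'product_id':   [10, 10, 11, 10, 10],
})
prev_sess = toy['user_session'].shift(1).fillna(0)
prev_item = toy['product_id'].shift(1).fillna(0)
# Rows 1 and 4 are dropped: same session and same item as the previous row.
toy[~((toy['user_session'] == prev_sess) & (toy['product_id'] == prev_item))]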

Include the item first-seen time feature (for recency calculation)

We create the prod_first_event_time_ts column, which indicates the timestamp at which an item was seen for the first time.

item_first_interaction_df = df.groupby('product_id').agg({'event_time_ts': 'min'}) \
            .reset_index().rename(columns={'event_time_ts': 'prod_first_event_time_ts'})
item_first_interaction_df.head()
gc.collect()
0
df = df.merge(item_first_interaction_df, on=['product_id'], how='left').reset_index(drop=True)
df.head()
user_session event_type product_id category_id category_code brand price user_id event_time_dt event_time_ts prod_first_event_time_ts
0 94 view 26202560 2053013563693335403 <NA> <NA> 388.49 512892706 2019-10-15 17:21:59 1571160119 1569925682
1 94 view 26203994 2053013563693335403 <NA> <NA> 157.79 512892706 2019-10-15 17:22:17 1571160137 1569941460
2 94 view 26204036 2053013563693335403 <NA> sokolov 471.70 512892706 2019-10-15 17:22:29 1571160149 1569897265
3 94 view 26203994 2053013563693335403 <NA> <NA> 157.79 512892706 2019-10-15 17:22:58 1571160178 1569941460
4 94 view 26203727 2053013563693335403 <NA> lucente 317.38 512892706 2019-10-15 17:23:19 1571160199 1569901056
del(item_first_interaction_df)
item_first_interaction_df=None
gc.collect()
0
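
The column added above is just the raw ingredient for an item recency signal; the actual feature engineering happens in the next notebook. As a rough illustration of what such a calculation could look like (prod_age_days is a hypothetical name, not a column created by this tutorial):

# Illustrative only: age of each item (in days) at the time of the interaction.
prod_age_days = (df['event_time_ts'] - df['prod_first_event_time_ts']) / (60 * 60 * 24)
prod_age_days.describe()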

In this tutorial, we only use one week of data from the Oct 2019 dataset.

# check the min date
df['event_time_dt'].min()
numpy.datetime64('2019-10-01T00:00:00')
# Filters only the first week of the data.
df = df[df['event_time_dt'] < np.datetime64('2019-10-08')].reset_index(drop=True)

We verify that we only have the first week of the Oct-2019 dataset.

df['event_time_dt'].max()
numpy.datetime64('2019-10-07T23:59:59')
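
Optionally (this check is not in the original flow), a per-day row count confirms that exactly seven distinct days remain:

# Optional check: interactions per calendar day after the filter (expect 7 days).
df['event_time_dt'].dt.day.value_counts().sort_index()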

We drop the event_time_dt column as it will not be used anymore.

df = df.drop(['event_time_dt'],  axis=1)
df.head()
user_session event_type product_id category_id category_code brand price user_id event_time_ts prod_first_event_time_ts
0 43 view 5300797 2053013563173241677 <NA> panasonic 39.90 513903572 1570460611 1569948287
1 43 view 5300798 2053013563173241677 <NA> panasonic 32.18 513903572 1570460616 1569934097
2 43 view 5300284 2053013563173241677 <NA> rowenta 30.86 513903572 1570460621 1569927253
3 43 view 5300382 2053013563173241677 <NA> remington 28.22 513903572 1570460636 1570026747
4 43 view 5300366 2053013563173241677 <NA> polaris 26.46 513903572 1570460650 1570097085

Save the data as a single parquet file to be used in the ETL notebook.

# save df as parquet files on disk
df.to_parquet(os.path.join(INPUT_DATA_DIR, 'Oct-2019.parquet'))
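
As an optional final check (not part of the original flow), the file can be read back to confirm that the row count and columns survived the round trip:

# Optional: read the parquet file back and confirm its shape and columns.
check_df = cudf.read_parquet(os.path.join(INPUT_DATA_DIR, 'Oct-2019.parquet'))
print(check_df.shape)
print(check_df.columns)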
Shut down the kernel

import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)