# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
NVTabular demo on Rossmann data - FastAI
Overview
NVTabular is a feature engineering and preprocessing library for tabular data designed to quickly and easily manipulate terabyte-scale datasets used to train deep-learning-based recommender systems. It provides a high-level abstraction to simplify code and accelerates computation on the GPU using the RAPIDS cuDF library.
Learning objectives
In the previous notebooks (01-Download-Convert.ipynb and 02-ETL-with-NVTabular.ipynb), we downloaded, preprocessed and created features for the dataset. Now, we are ready to train our deep learning model on the dataset. In this notebook, we use FastAI with the NVTabular data loader for PyTorch to accelerate the training pipeline. FastAI uses PyTorch as a backend, so we can combine the NVTabular data loader for PyTorch with the FastAI library.
import os
import math
import json
import nvtabular as nvt
import glob
Loading NVTabular workflow
This time, we only need to define our data directories. We can load the data schema from the NVTabular workflow.
DATA_DIR = os.environ.get("OUTPUT_DATA_DIR", os.path.expanduser("~/nvt-examples/data/"))
PREPROCESS_DIR = os.path.join(DATA_DIR, "ross_pre/")
PREPROCESS_DIR_TRAIN = os.path.join(PREPROCESS_DIR, "train")
PREPROCESS_DIR_VALID = os.path.join(PREPROCESS_DIR, "valid")
What files are available to train on in our directories?
!ls $PREPROCESS_DIR
stats.json train valid
!ls $PREPROCESS_DIR_TRAIN
0.1136d38916184bd39bf3d0cc6af8aecc.parquet _metadata
_file_list.txt _metadata.json
!ls $PREPROCESS_DIR_VALID
0.bcd2404e802640f29b1427feaacbd24a.parquet _metadata
_file_list.txt _metadata.json
We load the data schema and statistics from stats.json. We created the file in the previous notebook, 02-ETL-with-NVTabular.
stats = json.load(open(os.path.join(PREPROCESS_DIR, "stats.json"), "r"))
CATEGORICAL_COLUMNS = stats["CATEGORICAL_COLUMNS"]
CONTINUOUS_COLUMNS = stats["CONTINUOUS_COLUMNS"]
LABEL_COLUMNS = stats["LABEL_COLUMNS"]
COLUMNS = CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS + LABEL_COLUMNS
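As a quick, purely illustrative sanity check, we can confirm what the workflow produced by printing the number of columns of each type that we just loaded:
# Illustrative inspection of the schema loaded from stats.json.
print(f"categorical columns: {len(CATEGORICAL_COLUMNS)}")
print(f"continuous columns:  {len(CONTINUOUS_COLUMNS)}")
print(f"label columns:       {LABEL_COLUMNS}")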
The embedding table shows the cardinality of each categorical variable along with its associated embedding size. Each entry is of the form (cardinality, embedding_size).
EMBEDDING_TABLE_SHAPES = stats["EMBEDDING_TABLE_SHAPES"]
EMBEDDING_TABLE_SHAPES
{'Assortment': [4, 16],
'CompetitionMonthsOpen': [26, 16],
'CompetitionOpenSinceYear': [24, 16],
'Day': [32, 16],
'DayOfWeek': [8, 16],
'Events': [22, 16],
'Month': [13, 16],
'Promo2SinceYear': [9, 16],
'Promo2Weeks': [27, 16],
'PromoInterval': [4, 16],
'Promo_bw': [7, 16],
'Promo_fw': [7, 16],
'SchoolHoliday_bw': [9, 16],
'SchoolHoliday_fw': [9, 16],
'State': [13, 16],
'StateHoliday': [3, 16],
'StateHoliday_bw': [4, 16],
'StateHoliday_fw': [4, 16],
'Store': [1116, 81],
'StoreType': [5, 16],
'Week': [53, 16],
'Year': [4, 16]}
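The widths above follow a cardinality-based rule of thumb: wider embeddings for high-cardinality columns such as Store, with a floor for everything else. The sketch below is an illustrative approximation of such a rule (not necessarily the exact formula NVTabular uses) that happens to reproduce the sizes in the table:
# Hedged sketch: a clipped power-law heuristic for embedding widths.
# Illustration only; NVTabular's exact rule may differ between versions.
def emb_size_heuristic(cardinality, minimum=16, maximum=512):
    return min(max(minimum, round(1.6 * cardinality ** 0.56)), maximum)

print(emb_size_heuristic(1116))  # Store -> 81, matching the table above
print(emb_size_heuristic(53))    # Week  -> 16 (clipped to the floor)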
Training a Network
Now that our data is preprocessed and saved out, we can leverage datasets to read through the preprocessed parquet files in an online fashion to train neural networks.
We’ll start by setting some universal hyperparameters for our model and optimizer. These settings will be shared across all of the frameworks that we explore below.
EMBEDDING_DROPOUT_RATE = 0.04
DROPOUT_RATES = [0.001, 0.01]
HIDDEN_DIMS = [1000, 500]
BATCH_SIZE = 65536
LEARNING_RATE = 0.001
EPOCHS = 25
# TODO: Calculate on the fly rather than recalling from previous analysis.
MAX_SALES_IN_TRAINING_SET = 38722.0
MAX_LOG_SALES_PREDICTION = 1.2 * math.log(MAX_SALES_IN_TRAINING_SET + 1.0)
TRAIN_PATHS = sorted(glob.glob(os.path.join(PREPROCESS_DIR_TRAIN, "*.parquet")))
VALID_PATHS = sorted(glob.glob(os.path.join(PREPROCESS_DIR_VALID, "*.parquet")))
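Recall that Sales was log-transformed in the ETL notebook, so the network predicts log(Sales + 1). Capping the output at 1.2 times the largest log-sale seen in training gives the model a bounded range to squash into; a quick back-of-the-envelope check (values are approximate):
# Illustrative check of the prediction cap derived above.
print(math.log(MAX_SALES_IN_TRAINING_SET + 1.0))  # ~10.56, largest log-sale in training
print(MAX_LOG_SALES_PREDICTION)                   # ~12.68, upper bound passed to y_range below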
fast.ai
fast.ai: Preparing Datasets
TorchAsyncItr maps a symbolic dataset object to cat_features, cont_features, and labels PyTorch tensors by iterating through the dataset and concatenating the results.
import fastai
fastai.__version__
'2.2.5'
import torch
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader
from nvtabular.framework_utils.torch.utils import FastaiTransform
from fastai.tabular.data import TabularDataLoaders
from fastai.tabular.model import TabularModel
from fastai.basics import Learner
from fastai.basics import MSELossFlat
from fastai.callback.progress import ProgressCallback
def make_batched_dataloader(paths, columns, batch_size):
dataset = nvt.Dataset(paths)
ds_batch_sets = TorchAsyncItr(
dataset,
batch_size=batch_size,
cats=CATEGORICAL_COLUMNS,
conts=CONTINUOUS_COLUMNS,
labels=LABEL_COLUMNS,
)
return DLDataLoader(ds_batch_sets,
batch_size=None,
pin_memory=False,
collate_fn=FastaiTransform(ds_batch_sets).transform,
num_workers=0)
train_dataset = make_batched_dataloader(TRAIN_PATHS, COLUMNS, BATCH_SIZE)
valid_dataset = make_batched_dataloader(VALID_PATHS, COLUMNS, BATCH_SIZE * 4)
databunch = TabularDataLoaders(train_dataset, valid_dataset)
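Because the data loader hands whole GPU-sized batches to the model, an epoch is only a handful of optimizer steps. A rough, illustrative estimate (the exact counts are reported after training below):
# Rough illustration: ~800K training rows in 65,536-row batches is only ~13 steps per epoch.
approx_train_rows = 800_000  # rough assumption; the combined train+valid count reported below is 844,338
print(math.ceil(approx_train_rows / BATCH_SIZE))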
fast.ai: Defining a Model
Next we’ll need to define the inputs that will feed our model and build an architecture on top of them. For now, we’ll just stick to a simple MLP model.
Using FastAI’s TabularModel, we can build an MLP under the hood by defining its high-level characteristics.
pt_model = TabularModel(
emb_szs=list(EMBEDDING_TABLE_SHAPES.values()),
n_cont=len(CONTINUOUS_COLUMNS),
out_sz=1,
layers=HIDDEN_DIMS,
ps=DROPOUT_RATES,
use_bn=True,
embed_p=EMBEDDING_DROPOUT_RATE,
y_range=torch.tensor([0.0, MAX_LOG_SALES_PREDICTION]),
).cuda()
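As a quick check of what we just built, we can count the model's trainable parameters; this is plain PyTorch and independent of NVTabular:
# Count the trainable parameters of the TabularModel (embedding tables plus MLP layers).
n_params = sum(p.numel() for p in pt_model.parameters() if p.requires_grad)
print(f"{n_params:,} trainable parameters")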
fast.ai: Training
from fastai.torch_core import flatten_check
from time import time
def exp_rmspe(pred, targ):
"Exp RMSE between `pred` and `targ`."
pred, targ = flatten_check(pred, targ)
pred, targ = torch.exp(pred) - 1, torch.exp(targ) - 1
pct_var = (targ - pred) / targ
return torch.sqrt((pct_var ** 2).mean())
loss_func = MSELossFlat()
learner = Learner(
databunch, pt_model, loss_func=loss_func, metrics=[exp_rmspe], cbs=ProgressCallback()
)
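Since the label is stored as log(Sales + 1), exp_rmspe first undoes that transform and then computes the root mean squared percentage error, the metric used for the Rossmann competition. A quick check on synthetic tensors (illustrative values only) confirms the behaviour:
# Illustrative check: predictions 10% above the targets give an RMSPE of ~0.1.
targ = torch.log(torch.tensor([100.0, 200.0, 400.0]) + 1)
pred = torch.log(torch.tensor([110.0, 220.0, 440.0]) + 1)
print(exp_rmspe(pred, targ))  # tensor(0.1000)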
start = time()
learner.fit(EPOCHS, LEARNING_RATE)
t_final = time() - start
total_rows = train_dataset.dataset.num_rows_processed + valid_dataset.dataset.num_rows_processed
print(
f"run_time: {t_final} - rows: {total_rows} - epochs: {EPOCHS} - dl_thru: { (EPOCHS * total_rows) / t_final}"
)
epoch | train_loss | valid_loss | exp_rmspe | time |
---|---|---|---|---|
0 | 0.870130 | 1.443352 | 0.668771 | 00:02 |
1 | 0.451688 | 0.238232 | 0.396827 | 00:01 |
2 | 0.290309 | 0.147549 | 0.484601 | 00:01 |
3 | 0.204925 | 0.151387 | 0.537334 | 00:01 |
4 | 0.153039 | 0.105131 | 0.404711 | 00:01 |
5 | 0.118746 | 0.060419 | 0.265174 | 00:01 |
6 | 0.094846 | 0.037690 | 0.197737 | 00:01 |
7 | 0.077555 | 0.034608 | 0.192175 | 00:01 |
8 | 0.064673 | 0.032721 | 0.187379 | 00:01 |
9 | 0.054867 | 0.032081 | 0.185658 | 00:01 |
10 | 0.047228 | 0.031881 | 0.189285 | 00:01 |
11 | 0.041429 | 0.030871 | 0.177481 | 00:01 |
12 | 0.037166 | 0.031383 | 0.185052 | 00:01 |
13 | 0.033308 | 0.029781 | 0.180436 | 00:01 |
14 | 0.030054 | 0.028644 | 0.175005 | 00:01 |
15 | 0.028240 | 0.029979 | 0.181500 | 00:01 |
16 | 0.026032 | 0.026933 | 0.168009 | 00:01 |
17 | 0.024473 | 0.028919 | 0.177013 | 00:01 |
18 | 0.022900 | 0.028642 | 0.163702 | 00:01 |
19 | 0.021604 | 0.027488 | 0.162483 | 00:01 |
20 | 0.020489 | 0.026400 | 0.161136 | 00:01 |
21 | 0.019540 | 0.026042 | 0.166578 | 00:01 |
22 | 0.018842 | 0.028341 | 0.162945 | 00:01 |
23 | 0.018130 | 0.025699 | 0.159816 | 00:01 |
24 | 0.017510 | 0.024265 | 0.162697 | 00:01 |
run_time: 49.69642353057861 - rows: 844338 - epochs: 25 - dl_thru: 424747.8691703398
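After training, the weights can be persisted for later inference with plain PyTorch; the file name below is an arbitrary choice for illustration, not something the other notebooks expect:
# Minimal sketch: save the trained TabularModel weights.
MODEL_PATH = os.path.join(DATA_DIR, "rossmann_fastai_weights.pth")  # arbitrary, illustrative path
torch.save(pt_model.state_dict(), MODEL_PATH)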