# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Each user is responsible for checking the content of datasets and the
# applicable licenses and determining if suitable for the intended use.

Embedding Training Cache Example

Overview

Embedding Training Cache enables you to train huge models whose embedding tables cannot fit into GPU memory all at once. In this example, we go through an end-to-end training procedure using the embedding training cache feature of HugeCTR. We use a synthetic dataset that follows the Criteo dataset schema, produced with the HugeCTR data generator.

Setup

To set up the environment, refer to HugeCTR Example Notebooks and follow the instructions there before running the steps below.

Data Preparation

First, make a folder to store our data:

!mkdir etc_data

Second, write a script that uses the HugeCTR Data Generator to create the training and validation datasets:

%%writefile generate_data.py

import hugectr
from hugectr.tools import DataGenerator, DataGeneratorParams
from mpi4py import MPI
import argparse
parser = argparse.ArgumentParser(description="Data Generation")

parser.add_argument("--num_files", type=int, help="number of files in training data", default=8)
parser.add_argument("--eval_num_files", type=int, help="number of files in validation data", default=2)
parser.add_argument("--num_samples_per_file", type=int, help="number of samples per file", default=1000000)
parser.add_argument("--dir_name", type=str, required=True, help="data directory name (required)")
args = parser.parse_args()

data_generator_params = DataGeneratorParams(
  format = hugectr.DataReaderType_t.Parquet,
  label_dim = 1,
  dense_dim = 13,
  num_slot = 26,
  num_files = args.num_files,
  eval_num_files = args.eval_num_files,
  i64_input_key = True,
  num_samples_per_file = args.num_samples_per_file,
  source = "./etc_data/" + args.dir_name + "/file_list.txt",
  eval_source = "./etc_data/" + args.dir_name + "/file_list_test.txt",
  slot_size_array = [12988, 7129, 8720, 5820, 15196, 4, 4914, 1020, 30, 14274, 10220, 15088, 10, 1518, 3672, 48, 4, 820, 15, 12817, 13908, 13447, 9447, 5867, 45, 33],
  # for parquet, check_type doesn't make any difference
  check_type = hugectr.Check_t.Non,
  dist_type = hugectr.Distribution_t.PowerLaw,
  power_law_type = hugectr.PowerLaw_t.Short)
data_generator = DataGenerator(data_generator_params)
data_generator.generate()
Overwriting generate_data.py
!python generate_data.py --dir_name "file0"
[HCTR][09:00:01][INFO][RK0][main]: Generate Parquet dataset
[HCTR][09:00:01][INFO][RK0][main]: train data folder: ./etc_data/file0, eval data folder: ./etc_data/file0, slot_size_array: 12988, 7129, 8720, 5820, 15196, 4, 4914, 1020, 30, 14274, 10220, 15088, 10, 1518, 3672, 48, 4, 820, 15, 12817, 13908, 13447, 9447, 5867, 45, 33, nnz array: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, #files for train: 8, #files for eval: 2, #samples per file: 1000000, Use power law distribution: 1, alpha of power law: 1.3
[HCTR][09:00:01][INFO][RK0][main]: ./etc_data/file0 exist
[HCTR][09:00:01][INFO][RK0][main]: ./etc_data/file0/train/gen_0.parquet
[HCTR][09:00:05][INFO][RK0][main]: ./etc_data/file0/train/gen_1.parquet
[HCTR][09:00:08][INFO][RK0][main]: ./etc_data/file0/train/gen_2.parquet
[HCTR][09:00:11][INFO][RK0][main]: ./etc_data/file0/train/gen_3.parquet
[HCTR][09:00:14][INFO][RK0][main]: ./etc_data/file0/train/gen_4.parquet
[HCTR][09:00:17][INFO][RK0][main]: ./etc_data/file0/train/gen_5.parquet
[HCTR][09:00:20][INFO][RK0][main]: ./etc_data/file0/train/gen_6.parquet
[HCTR][09:00:23][INFO][RK0][main]: ./etc_data/file0/train/gen_7.parquet
[HCTR][09:00:26][INFO][RK0][main]: ./etc_data/file0/file_list.txt done!
[HCTR][09:00:26][INFO][RK0][main]: ./etc_data/file0/val/gen_0.parquet
[HCTR][09:00:29][INFO][RK0][main]: ./etc_data/file0/val/gen_1.parquet
[HCTR][09:00:32][INFO][RK0][main]: ./etc_data/file0/file_list_test.txt done!
!python generate_data.py --dir_name "file1"
[HCTR][09:01:09][INFO][RK0][main]: Generate Parquet dataset
[HCTR][09:01:09][INFO][RK0][main]: train data folder: ./etc_data/file1, eval data folder: ./etc_data/file1, slot_size_array: 12988, 7129, 8720, 5820, 15196, 4, 4914, 1020, 30, 14274, 10220, 15088, 10, 1518, 3672, 48, 4, 820, 15, 12817, 13908, 13447, 9447, 5867, 45, 33, nnz array: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, #files for train: 8, #files for eval: 2, #samples per file: 1000000, Use power law distribution: 1, alpha of power law: 1.3
[HCTR][09:01:09][INFO][RK0][main]: ./etc_data/file1 exist
[HCTR][09:01:09][INFO][RK0][main]: ./etc_data/file1/train/gen_0.parquet
[HCTR][09:01:13][INFO][RK0][main]: ./etc_data/file1/train/gen_1.parquet
[HCTR][09:01:16][INFO][RK0][main]: ./etc_data/file1/train/gen_2.parquet
[HCTR][09:01:19][INFO][RK0][main]: ./etc_data/file1/train/gen_3.parquet
[HCTR][09:01:22][INFO][RK0][main]: ./etc_data/file1/train/gen_4.parquet
[HCTR][09:01:26][INFO][RK0][main]: ./etc_data/file1/train/gen_5.parquet
[HCTR][09:01:29][INFO][RK0][main]: ./etc_data/file1/train/gen_6.parquet
[HCTR][09:01:32][INFO][RK0][main]: ./etc_data/file1/train/gen_7.parquet
[HCTR][09:01:35][INFO][RK0][main]: ./etc_data/file1/file_list.txt done!
[HCTR][09:01:35][INFO][RK0][main]: ./etc_data/file1/val/gen_0.parquet
[HCTR][09:01:38][INFO][RK0][main]: ./etc_data/file1/val/gen_1.parquet
[HCTR][09:01:41][INFO][RK0][main]: ./etc_data/file1/file_list_test.txt done!

Extract Keyset

The HugeCTR repository on GitHub includes a keyset generator script for Parquet datasets. See the generate_keyset.py file in the keyset_scripts directory of the repository. We can use the script to generate a keyset for each of our training datasets.
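
For intuition, here is a minimal sketch of what such a keyset extraction could look like. It is a hypothetical approximation, not the repository script: it assumes the 26 categorical columns are the last columns of each generated Parquet file, that each slot's keys become globally unique after adding the cumulative offsets derived from slot_size_array, and that the keyset file is the unique keys serialized as int64. Use generate_keyset.py from the repository for real runs.

import glob

import numpy as np
import pandas as pd

# Same cardinalities that were passed to the data generator.
slot_size_array = [12988, 7129, 8720, 5820, 15196, 4, 4914, 1020, 30, 14274,
                   10220, 15088, 10, 1518, 3672, 48, 4, 820, 15, 12817,
                   13908, 13447, 9447, 5867, 45, 33]
# Assumption: keys of slot i are local indices that become global keys
# after adding the cumulative size of the preceding slots.
offsets = np.cumsum([0] + slot_size_array[:-1])

keys = set()
for path in sorted(glob.glob("./etc_data/file0/train/gen_*.parquet")):
    df = pd.read_parquet(path)
    # Assumption: the 26 categorical columns are the last 26 columns.
    for i, col in enumerate(df.columns[-26:]):
        keys.update((df[col].to_numpy(dtype=np.int64) + offsets[i]).tolist())

# Assumption: the keyset file is the unique keys serialized as int64.
np.asarray(sorted(keys), dtype=np.int64).tofile("./etc_data/file0/train/_hugectr.keyset")

Now run the actual script for the first dataset: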

!python generate_keyset.py --src_dir_path ./etc_data/file0/train --keyset_path ./etc_data/file0/train/_hugectr.keyset  --slot_size_array 12988 7129 8720 5820 15196 4 4914 1020 30 14274 10220 15088 10 1518 3672 48 4 820 15 12817 13908 13447 9447 5867 45 33
2022-06-06 09:01:54,758 Extracted keyset from ./etc_data/file0/train

Do the same thing for file1:

!python generate_keyset.py --src_dir_path ./etc_data/file1/train --keyset_path ./etc_data/file1/train/_hugectr.keyset  --slot_size_array 12988 7129 8720 5820 15196 4 4914 1020 30 14274 10220 15088 10 1518 3672 48 4 820 15 12817 13908 13447 9447 5867 45 33
2022-06-06 09:02:01,163 Extracted keyset from ./etc_data/file1/train

Run ls -l ./etc_data/file0/train and ls -l ./etc_data/file1/train to make sure the data and keyset files are ready:

!ls -l ./etc_data/file0/train
total 801387
-rw-r--r-- 1 root dip  1256424 Jun  6 09:00 _hugectr.keyset
-rw-r--r-- 1 root dip     1959 Jun  6 08:58 _metadata.json
-rw-r--r-- 1 root dip 91956719 Jun  6 08:58 gen_0.parquet
-rw-r--r-- 1 root dip 91951983 Jun  6 08:58 gen_1.parquet
-rw-r--r-- 1 root dip 91956559 Jun  6 08:58 gen_2.parquet
-rw-r--r-- 1 root dip 91954535 Jun  6 08:58 gen_3.parquet
-rw-r--r-- 1 root dip 91951501 Jun  6 08:58 gen_4.parquet
-rw-r--r-- 1 root dip 91963545 Jun  6 08:58 gen_5.parquet
-rw-r--r-- 1 root dip 91961051 Jun  6 08:58 gen_6.parquet
-rw-r--r-- 1 root dip 91955276 Jun  6 08:58 gen_7.parquet
!ls -l ./etc_data/file1/train
total 801387
-rw-r--r-- 1 root dip  1256432 Jun  6 09:00 _hugectr.keyset
-rw-r--r-- 1 root dip     1959 Jun  6 08:59 _metadata.json
-rw-r--r-- 1 root dip 91959333 Jun  6 08:59 gen_0.parquet
-rw-r--r-- 1 root dip 91962190 Jun  6 08:59 gen_1.parquet
-rw-r--r-- 1 root dip 91960276 Jun  6 08:59 gen_2.parquet
-rw-r--r-- 1 root dip 91951335 Jun  6 08:59 gen_3.parquet
-rw-r--r-- 1 root dip 91957041 Jun  6 08:59 gen_4.parquet
-rw-r--r-- 1 root dip 91959877 Jun  6 08:59 gen_5.parquet
-rw-r--r-- 1 root dip 91975033 Jun  6 08:59 gen_6.parquet
-rw-r--r-- 1 root dip 91962975 Jun  6 08:59 gen_7.parquet
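
Optionally, assuming the keyset file is simply a flat binary array of int64 keys (an assumption made for this illustrative check, not an official format description), you can count the extracted keys with NumPy:

import numpy as np

# Illustrative check only: interpret the keyset file as raw int64 keys.
keys = np.fromfile("./etc_data/file0/train/_hugectr.keyset", dtype=np.int64)
print(f"file0 keyset contains {keys.size} keys")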

Training using HugeCTR
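
The script below defines a DCN model and trains it with the embedding training cache. The ETC-specific pieces are: repeat_dataset = False in the solver (epoch mode), the keyset list passed to DataReaderParams, the host-memory cache and cached parameter server created with CreateHMemCache and CreateETC and passed to the Model, and the model.set_source() call that switches to the second dataset and its keyset for another training pass.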

%%writefile etc_sample.py
import hugectr
from mpi4py import MPI
solver = hugectr.CreateSolver(max_eval_batches = 5000,
                              batchsize_eval = 1024,
                              batchsize = 1024,
                              lr = 0.001,
                              vvgpu = [[0]],
                              i64_input_key = True,
                              use_mixed_precision = False,
                              repeat_dataset = False)
reader = hugectr.DataReaderParams(data_reader_type = hugectr.DataReaderType_t.Parquet,
                          source = ["./etc_data/file0/file_list.txt"],
                          keyset = ["./etc_data/file0/train/_hugectr.keyset"],
                          eval_source = "./etc_data/file0/file_list_test.txt",
                          slot_size_array = [12988, 7129, 8720, 5820, 15196, 4, 4914, 1020, 30, 14274, 10220, 15088, 10, 1518, 3672, 48, 4, 820, 15, 12817, 13908, 13447, 9447, 5867, 45, 33],
                          check_type = hugectr.Check_t.Non)
optimizer = hugectr.CreateOptimizer(optimizer_type = hugectr.Optimizer_t.Adam)
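# Embedding training cache configuration:
# CreateHMemCache configures the host-memory cache used by the cached
# parameter server, and CreateETC selects the Cached PS type, the folder
# that stores the sparse model (./dcn_sparse_model), and a local staging
# path. Refer to the Embedding Training Cache documentation for the exact
# meaning of num_blocks, target_hit_rate, and max_num_evict.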
hc_cnfg = hugectr.CreateHMemCache(num_blocks = 1, target_hit_rate = 0.5, max_num_evict = 0)
etc = hugectr.CreateETC(ps_types = [hugectr.TrainPSType_t.Cached],
                       sparse_models = ["./dcn_sparse_model"],
                       local_paths = ["./"], hmem_cache_configs = [hc_cnfg])
model = hugectr.Model(solver, reader, optimizer, etc)
model.add(hugectr.Input(label_dim = 1, label_name = "label",
                        dense_dim = 13, dense_name = "dense",
                        data_reader_sparse_param_array = 
                        [hugectr.DataReaderSparseParam("data1", 1, True, 26)]))
model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, 
                            workspace_size_per_gpu_in_mb = 5000,
                            embedding_vec_size = 16,
                            combiner = "sum",
                            sparse_embedding_name = "sparse_embedding1",
                            bottom_name = "data1",
                            optimizer = optimizer))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["sparse_embedding1"],
                            top_names = ["reshape1"],
                            leading_dim=416))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat,
                            bottom_names = ["reshape1", "dense"], top_names = ["concat1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.MultiCross,
                            bottom_names = ["concat1"],
                            top_names = ["multicross1"],
                            num_layers=6))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["concat1"],
                            top_names = ["fc1"],
                            num_output=1024))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc1"],
                            top_names = ["relu1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu1"],
                            top_names = ["dropout1"],
                            dropout_rate=0.5))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["dropout1"],
                            top_names = ["fc2"],
                            num_output=1024))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc2"],
                            top_names = ["relu2"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu2"],
                            top_names = ["dropout2"],
                            dropout_rate=0.5))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat,
                            bottom_names = ["dropout2", "multicross1"],
                            top_names = ["concat2"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["concat2"],
                            top_names = ["fc3"],
                            num_output=1))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.BinaryCrossEntropyLoss,
                            bottom_names = ["fc3", "label"],
                            top_names = ["loss"]))
model.compile()
model.summary()
model.graph_to_json(graph_config_file = "dcn.json")
model.fit(num_epochs = 1, display = 500, eval_interval = 1000)

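# Switch the training source, keyset, and evaluation source to the second
# dataset and train for another pass; the embedding table trained so far
# stays in the cached parameter server.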
model.set_source(source = ["etc_data/file1/file_list.txt"], keyset = ["etc_data/file1/train/_hugectr.keyset"], eval_source = "etc_data/file1/file_list_test.txt")
model.fit(num_epochs = 1, display = 500, eval_interval = 1000)

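# Save the trained parameters: the dense weights and optimizer states are
# dumped to files, and the cached embedding blocks are synced back to the
# sparse model on disk (see the log below).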
model.save_params_to_files("dcn_etc")
Overwriting etc_sample.py
!python3 etc_sample.py
[HCTR][09:02:26][INFO][RK0][main]: Empty embedding, trained table will be stored in ./dcn_sparse_model
HugeCTR Version: 3.5
====================================================Model Init=====================================================
[HCTR][09:02:26][WARNING][RK0][main]: The model name is not specified when creating the solver.
[HCTR][09:02:26][WARNING][RK0][main]: MPI was already initialized somewhere elese. Lifetime service disabled.
[HCTR][09:02:26][INFO][RK0][main]: Global seed is 1968709516
[HCTR][09:02:26][INFO][RK0][main]: Device to NUMA mapping:
  GPU 0 ->  node 0
[HCTR][09:02:27][WARNING][RK0][main]: Peer-to-peer access cannot be fully enabled.
[HCTR][09:02:27][INFO][RK0][main]: Start all2all warmup
[HCTR][09:02:27][INFO][RK0][main]: End all2all warmup
[HCTR][09:02:27][INFO][RK0][main]: Using All-reduce algorithm: NCCL
[HCTR][09:02:27][INFO][RK0][main]: Device 0: Tesla V100-SXM2-32GB
[HCTR][09:02:27][INFO][RK0][main]: num of DataReader workers: 1
[HCTR][09:02:27][INFO][RK0][main]: Vocabulary size: 157054
[HCTR][09:02:27][INFO][RK0][main]: max_vocabulary_size_per_gpu_=27306666
[HCTR][09:02:27][INFO][RK0][main]: Graph analysis to resolve tensor dependency
[HCTR][09:02:27][INFO][RK0][main]: Add Slice layer for tensor: concat1, creating 2 copies
===================================================Model Compile===================================================
[HCTR][09:02:31][INFO][RK0][main]: gpu0 start to init embedding
[HCTR][09:02:31][INFO][RK0][main]: gpu0 init embedding done
[HCTR][09:02:31][INFO][RK0][main]: Enable HMemCache-Based Parameter Server
[HCTR][09:02:31][INFO][RK0][main]: ./dcn_sparse_model/key doesn't exist, created
[HCTR][09:02:31][INFO][RK0][main]: ./dcn_sparse_model/emb_vector doesn't exist, created
[HCTR][09:02:31][INFO][RK0][main]: ./dcn_sparse_model/Adam.m doesn't exist, created
[HCTR][09:02:31][INFO][RK0][main]: ./dcn_sparse_model/Adam.v doesn't exist, created
[HCTR][09:02:36][INFO][RK0][main]: Starting AUC NCCL warm-up
[HCTR][09:02:36][INFO][RK0][main]: Warm-up done
===================================================Model Summary===================================================
[HCTR][09:02:36][INFO][RK0][main]: label                                   Dense                         Sparse                        
label                                   dense                          data1                         
(None, 1)                               (None, 13)                              
——————————————————————————————————————————————————————————————————————————————————————————————————————————————————
Layer Type                              Input Name                    Output Name                   Output Shape                  
——————————————————————————————————————————————————————————————————————————————————————————————————————————————————
DistributedSlotSparseEmbeddingHash      data1                         sparse_embedding1             (None, 26, 16)                
------------------------------------------------------------------------------------------------------------------
Reshape                                 sparse_embedding1             reshape1                      (None, 416)                   
------------------------------------------------------------------------------------------------------------------
Concat                                  reshape1                      concat1                       (None, 429)                   
                                        dense                                                                                     
------------------------------------------------------------------------------------------------------------------
Slice                                   concat1                       concat1_slice0                (None, 429)                   
                                                                      concat1_slice1                (None, 429)                   
------------------------------------------------------------------------------------------------------------------
MultiCross                              concat1_slice0                multicross1                   (None, 429)                   
------------------------------------------------------------------------------------------------------------------
InnerProduct                            concat1_slice1                fc1                           (None, 1024)                  
------------------------------------------------------------------------------------------------------------------
ReLU                                    fc1                           relu1                         (None, 1024)                  
------------------------------------------------------------------------------------------------------------------
Dropout                                 relu1                         dropout1                      (None, 1024)                  
------------------------------------------------------------------------------------------------------------------
InnerProduct                            dropout1                      fc2                           (None, 1024)                  
------------------------------------------------------------------------------------------------------------------
ReLU                                    fc2                           relu2                         (None, 1024)                  
------------------------------------------------------------------------------------------------------------------
Dropout                                 relu2                         dropout2                      (None, 1024)                  
------------------------------------------------------------------------------------------------------------------
Concat                                  dropout2                      concat2                       (None, 1453)                  
                                        multicross1                                                                               
------------------------------------------------------------------------------------------------------------------
InnerProduct                            concat2                       fc3                           (None, 1)                     
------------------------------------------------------------------------------------------------------------------
BinaryCrossEntropyLoss                  fc3                           loss                                                        
                                        label                                                                                     
------------------------------------------------------------------------------------------------------------------
[HCTR][09:02:36][INFO][RK0][main]: Save the model graph to dcn.json successfully
=====================================================Model Fit=====================================================
[HCTR][09:02:36][INFO][RK0][main]: Use embedding training cache mode with number of training sources: 1, number of epochs: 1
[HCTR][09:02:36][INFO][RK0][main]: Training batchsize: 1024, evaluation batchsize: 1024
[HCTR][09:02:36][INFO][RK0][main]: Evaluation interval: 1000, snapshot interval: 10000
[HCTR][09:02:36][INFO][RK0][main]: Dense network trainable: True
[HCTR][09:02:36][INFO][RK0][main]: Sparse embedding sparse_embedding1 trainable: True
[HCTR][09:02:36][INFO][RK0][main]: Use mixed precision: False, scaler: 1.000000, use cuda graph: True
[HCTR][09:02:36][INFO][RK0][main]: lr: 0.001000, warmup_steps: 1, end_lr: 0.000000
[HCTR][09:02:36][INFO][RK0][main]: decay_start: 0, decay_steps: 1, decay_power: 2.000000
[HCTR][09:02:36][INFO][RK0][main]: Evaluation source file: ./etc_data/file0/file_list_test.txt
[HCTR][09:02:36][INFO][RK0][main]: --------------------Epoch 0, source file: ./etc_data/file0/file_list.txt--------------------
[HCTR][09:02:36][INFO][RK0][main]: Preparing embedding table for next pass
[HCTR][09:02:36][INFO][RK0][main]: HMEM-Cache PS: Hit rate [load]: 0 %
[HCTR][09:02:47][INFO][RK0][main]: Iter: 500 Time(500 iters): 10.3413s Loss: 0.692548 lr:0.001
[HCTR][09:02:56][INFO][RK0][main]: Iter: 1000 Time(500 iters): 9.39275s Loss: 0.692917 lr:0.001
[HCTR][09:02:56][INFO][RK0][main]: eval drop incomplete batch. batchsize:168
[HCTR][09:02:56][INFO][RK0][main]: Evaluation, AUC: 0.499922
[HCTR][09:02:56][INFO][RK0][main]: Eval Time for 5000 iters: 0.280399s
[HCTR][09:03:06][INFO][RK0][main]: Iter: 1500 Time(500 iters): 9.65627s Loss: 0.693724 lr:0.001
[HCTR][09:03:14][INFO][RK0][main]: train drop incomplete batch. batchsize:672
=====================================================Model Fit=====================================================
[HCTR][09:03:14][INFO][RK0][main]: Use embedding training cache mode with number of training sources: 1, number of epochs: 1
[HCTR][09:03:14][INFO][RK0][main]: Training batchsize: 1024, evaluation batchsize: 1024
[HCTR][09:03:14][INFO][RK0][main]: Evaluation interval: 1000, snapshot interval: 10000
[HCTR][09:03:14][INFO][RK0][main]: Dense network trainable: True
[HCTR][09:03:14][INFO][RK0][main]: Sparse embedding sparse_embedding1 trainable: True
[HCTR][09:03:14][INFO][RK0][main]: Use mixed precision: False, scaler: 1.000000, use cuda graph: True
[HCTR][09:03:14][INFO][RK0][main]: lr: 0.001000, warmup_steps: 1, end_lr: 0.000000
[HCTR][09:03:14][INFO][RK0][main]: decay_start: 0, decay_steps: 1, decay_power: 2.000000
[HCTR][09:03:14][INFO][RK0][main]: Evaluation source file: etc_data/file1/file_list_test.txt
[HCTR][09:03:14][INFO][RK0][main]: --------------------Epoch 0, source file: etc_data/file1/file_list.txt--------------------
[HCTR][09:03:14][INFO][RK0][main]: Preparing embedding table for next pass
[HCTR][09:03:15][INFO][RK0][main]: HMEM-Cache PS: Hit rate [dump]: 0 %
[HCTR][09:03:15][INFO][RK0][main]: HMEM-Cache PS: Hit rate [load]: 0 %
[HCTR][09:03:15][INFO][RK0][main]: HMEM-Cache PS: Hit rate [dump]: 0 %
[HCTR][09:03:15][INFO][RK0][main]: Sync blocks from HMEM-Cache to SSD
  ████████████████████████████████████████▏ 100.0% [   1/   1 | 53.8 Hz | 0s<0s]
[HCTR][09:03:15][INFO][RK0][main]: Dumping dense weights to file, successful
[HCTR][09:03:15][INFO][RK0][main]: Dumping dense optimizer states to file, successful