HPS TensorRT Plugin Demo for TensorFlow Trained Model

Overview

This notebook demonstrates how to build and deploy the HPS-integrated TensorRT engine for the model trained with TensorFlow.

For more details about HPS, please refer to HugeCTR Hierarchical Parameter Server (HPS).

Installation

Use NGC

The HPS TensorRT plugin is preinstalled in the 23.09 and later Merlin TensorFlow Container: nvcr.io/nvidia/merlin/merlin-tensorflow:23.09.

You can check the existence of the required libraries by running the following Python code after launching this container.

import ctypes
plugin_lib_name = "/usr/local/hps_trt/lib/libhps_plugin.so"
plugin_handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)

Configurations

First of all we specify the required configurations, e.g., the arguments needed for generating the dataset, the model parameters and the paths to save the model. We will use DLRM model which has one embedding table, bottom MLP layers, interaction layer and top MLP layers. Please note that the input to the embedding layer will be a dense key tensor of int32.

import os
import numpy as np
import tensorflow as tf
import struct

args = dict()

args["gpu_num"] = 1                               # the number of available GPUs
args["iter_num"] = 50                             # the number of training iteration
args["slot_num"] = 26                             # the number of feature fields in this embedding layer
args["embed_vec_size"] = 128                      # the dimension of embedding vectors
args["dense_dim"] = 13                            # the dimension of dense features
args["global_batch_size"] = 1024                  # the globally batchsize for all GPUs
args["max_vocabulary_size"] = 260000
args["vocabulary_range_per_slot"] = [[i*10000, (i+1)*10000] for i in range(26)]
args["combiner"] = "mean"

args["ps_config_file"] = "dlrm_tf.json"
args["embedding_table_path"] = "dlrm_tf_sparse.model"
args["saved_path"] = "dlrm_tf_saved_model"
args["np_key_type"] = np.int32
args["np_vector_type"] = np.float32
args["tf_key_type"] = tf.int32
args["tf_vector_type"] = tf.float32

os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(args["gpu_num"])))

2023-08-21 03:16:46.032517: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.

def generate_random_samples(num_samples, vocabulary_range_per_slot, dense_dim, key_dtype = args["np_key_type"]):
    keys = list()
    for vocab_range in vocabulary_range_per_slot:
        keys_per_slot = np.random.randint(low=vocab_range[0], high=vocab_range[1], size=(num_samples, 1), dtype=key_dtype)
        keys.append(keys_per_slot)
    keys = np.concatenate(np.array(keys), axis = 1)
    numerical_features = np.random.random((num_samples, dense_dim)).astype(np.float32)
    labels = np.random.randint(low=0, high=2, size=(num_samples, 1))
    return keys, numerical_features, labels

def tf_dataset(keys, numerical_features, labels, batchsize):
    dataset = tf.data.Dataset.from_tensor_slices((keys, numerical_features, labels))
    dataset = dataset.batch(batchsize, drop_remainder=True)
    return dataset

Train with native TF layers

We define the model graph for training with native TF layers, i.e., tf.nn.embedding_lookup, tf.keras.layers.Dense and so on. We can then train the model and extract the trained weights of the embedding table.

class MLP(tf.keras.layers.Layer):
    def __init__(self,
                arch,
                activation='relu',
                out_activation=None,
                **kwargs):
        super(MLP, self).__init__(**kwargs)
        self.layers = []
        index = 0
        for units in arch[:-1]:
            self.layers.append(tf.keras.layers.Dense(units, activation=activation, name="{}_{}".format(kwargs['name'], index)))
            index+=1
        self.layers.append(tf.keras.layers.Dense(arch[-1], activation=out_activation, name="{}_{}".format(kwargs['name'], index)))

            
    def call(self, inputs, training=True):
        x = self.layers[0](inputs)
        for layer in self.layers[1:]:
            x = layer(x)
        return x

class SecondOrderFeatureInteraction(tf.keras.layers.Layer):
    def __init__(self):
        super(SecondOrderFeatureInteraction, self).__init__()

    def call(self, inputs, num_feas):
        dot_products = tf.reshape(tf.matmul(inputs, inputs, transpose_b=True), (-1, num_feas * num_feas))
        indices = tf.constant([i * num_feas + j for j in range(1, num_feas) for i in range(j)])
        flat_interactions = tf.gather(dot_products, indices, axis=1)
        return flat_interactions

class DLRM(tf.keras.models.Model):
    def __init__(self,
                 init_tensors,
                 embed_vec_size,
                 slot_num,
                 dense_dim,
                 arch_bot,
                 arch_top,
                 **kwargs):
        super(DLRM, self).__init__(**kwargs)
        

        self.init_tensors = init_tensors
        self.params = tf.Variable(initial_value=tf.concat(self.init_tensors, axis=0))
        
        self.embed_vec_size = embed_vec_size
        self.slot_num = slot_num
        self.dense_dim = dense_dim
    
        self.bot_nn = MLP(arch_bot, name = "bottom", out_activation='relu')
        self.top_nn = MLP(arch_top, name = "top", out_activation='sigmoid')
        self.interaction_op = SecondOrderFeatureInteraction()

        self.interaction_out_dim = self.slot_num * (self.slot_num+1) // 2
        self.reshape_layer1 = tf.keras.layers.Reshape((1, arch_bot[-1]), name = "reshape1")
        self.concat1 = tf.keras.layers.Concatenate(axis=1, name = "concat1")
        self.concat2 = tf.keras.layers.Concatenate(axis=1, name = "concat2")
            
    def call(self, inputs, training=True):
        categorical_features = inputs["keys"]
        numerical_features = inputs["numerical_features"]
        
        embedding_vector = tf.nn.embedding_lookup(params=self.params, ids=categorical_features)
        dense_x = self.bot_nn(numerical_features)
        concat_features = self.concat1([embedding_vector, self.reshape_layer1(dense_x)])
        
        Z = self.interaction_op(concat_features, self.slot_num+1)
        z = self.concat2([dense_x, Z])
        logit = self.top_nn(z)
        return logit

    def summary(self):
        inputs = {"keys": tf.keras.Input(shape=(self.slot_num, ), dtype=args["tf_key_type"], name="keys"), 
                  "numerical_features": tf.keras.Input(shape=(self.dense_dim, ), dtype=tf.float32, name="numrical_features")}
        model = tf.keras.models.Model(inputs=inputs, outputs=self.call(inputs))
        return model.summary()

 def train(args):
    init_tensors = np.ones(shape=[args["max_vocabulary_size"], args["embed_vec_size"]], dtype=args["np_vector_type"])
    
    model = DLRM(init_tensors, args["embed_vec_size"], args["slot_num"], args["dense_dim"],
                arch_bot = [512, 256, args["embed_vec_size"]],
                arch_top = [1024, 1024, 512, 256, 1],
                name = "dlrm")
    model.summary()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
    loss_fn = tf.keras.losses.BinaryCrossentropy()
    
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit = model(inputs)
            loss = loss_fn(labels, logit)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss, logit

    keys, numerical_features, labels = generate_random_samples(args["global_batch_size"]  * args["iter_num"], args["vocabulary_range_per_slot"], args["dense_dim"], args["np_key_type"])
    dataset = tf_dataset(keys, numerical_features, labels, args["global_batch_size"])
    for i, (keys, numerical_features, labels) in enumerate(dataset):
        inputs = {"keys": keys, "numerical_features": numerical_features}
        loss, logit = _train_step(inputs, labels)
        print("-"*20, "Step {}, loss: {}".format(i, loss),  "-"*20)

    return model

trained_model = train(args)
weights_list = trained_model.get_weights()
embedding_weights = weights_list[-1]
trained_model.save(args["saved_path"])

2023-08-21 03:16:55.963734: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30974 MB memory:  -> device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:06:00.0, compute capability: 7.0

WARNING:tensorflow:The following Variables were used in a Lambda layer's call (tf.compat.v1.nn.embedding_lookup), but are not present in its tracked objects:   <tf.Variable 'Variable:0' shape=(260000, 128) dtype=float32>. This is a strong indication that the Lambda layer should be rewritten as a subclassed Layer.
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
==================================================================================================
 numrical_features (InputLayer)  [(None, 13)]        0           []                               
                                                                                                  
 bottom (MLP)                   (None, 128)          171392      ['numrical_features[0][0]']      
                                                                                                  
 keys (InputLayer)              [(None, 26)]         0           []                               
                                                                                                  
 tf.compat.v1.nn.embedding_look  (None, 26, 128)     0           ['keys[0][0]']                   
 up (TFOpLambda)                                                                                  
                                                                                                  
 reshape1 (Reshape)             (None, 1, 128)       0           ['bottom[0][0]']                 
                                                                                                  
 concat1 (Concatenate)          (None, 27, 128)      0           ['tf.compat.v1.nn.embedding_looku
                                                                 p[0][0]',                        
                                                                  'reshape1[0][0]']               
                                                                                                  
 second_order_feature_interacti  (None, 351)         0           ['concat1[0][0]']                
 on (SecondOrderFeatureInteract                                                                   
 ion)                                                                                             
                                                                                                  
 concat2 (Concatenate)          (None, 479)          0           ['bottom[0][0]',                 
                                                                  'second_order_feature_interactio
                                                                 n[0][0]']                        
                                                                                                  
 top (MLP)                      (None, 1)            2197505     ['concat2[0][0]']                
                                                                                                  
==================================================================================================
Total params: 2,368,897
Trainable params: 2,368,897
Non-trainable params: 0
__________________________________________________________________________________________________

2023-08-21 03:16:57.578464: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int64 and shape [51200,1]
	 [[{{node Placeholder/_2}}]]
2023-08-21 03:16:58.892396: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x55e0fdfeb330 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-08-21 03:16:58.892450: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): Tesla V100-SXM2-32GB, Compute Capability 7.0
2023-08-21 03:16:58.897903: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-08-21 03:16:59.379151: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8902
2023-08-21 03:16:59.502058: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.

WARNING:tensorflow:5 out of the last 5 calls to <function _BaseOptimizer._update_step_xla at 0x7fa9660adab0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
WARNING:tensorflow:6 out of the last 6 calls to <function _BaseOptimizer._update_step_xla at 0x7fa9660adab0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
-------------------- Step 0, loss: 39.68028259277344 --------------------
-------------------- Step 1, loss: 2571352064.0 --------------------
-------------------- Step 2, loss: 639234.5 --------------------
-------------------- Step 3, loss: 4132346.75 --------------------
-------------------- Step 4, loss: 20792958.0 --------------------
-------------------- Step 5, loss: 5957.8994140625 --------------------
-------------------- Step 6, loss: 231005.96875 --------------------
-------------------- Step 7, loss: 185315.3125 --------------------
-------------------- Step 8, loss: 151740.75 --------------------
-------------------- Step 9, loss: 43695.6640625 --------------------
-------------------- Step 10, loss: 45556.24609375 --------------------
-------------------- Step 11, loss: 131654.78125 --------------------
-------------------- Step 12, loss: 1.8805829286575317 --------------------
-------------------- Step 13, loss: 49121.47265625 --------------------
-------------------- Step 14, loss: 60609.62109375 --------------------
-------------------- Step 15, loss: 676294.375 --------------------
-------------------- Step 16, loss: 31208.66015625 --------------------
-------------------- Step 17, loss: 156789.65625 --------------------
-------------------- Step 18, loss: 103213.1015625 --------------------
-------------------- Step 19, loss: 22.394046783447266 --------------------
-------------------- Step 20, loss: 10789.5703125 --------------------
-------------------- Step 21, loss: 2716.05859375 --------------------
-------------------- Step 22, loss: 139559.96875 --------------------
-------------------- Step 23, loss: 130419.9453125 --------------------
-------------------- Step 24, loss: 13583.6923828125 --------------------
-------------------- Step 25, loss: 7378.22802734375 --------------------
-------------------- Step 26, loss: 81185.40625 --------------------
-------------------- Step 27, loss: 18370.255859375 --------------------
-------------------- Step 28, loss: 3314.90478515625 --------------------
-------------------- Step 29, loss: 15871.3154296875 --------------------
-------------------- Step 30, loss: 545.2841796875 --------------------
-------------------- Step 31, loss: 1281.3038330078125 --------------------
-------------------- Step 32, loss: 52890.65625 --------------------
-------------------- Step 33, loss: 2550.232177734375 --------------------
-------------------- Step 34, loss: 4526.03759765625 --------------------
-------------------- Step 35, loss: 25.5832462310791 --------------------
-------------------- Step 36, loss: 22.22301483154297 --------------------
-------------------- Step 37, loss: 17.7525691986084 --------------------
-------------------- Step 38, loss: 9.034607887268066 --------------------
-------------------- Step 39, loss: 1.6510401964187622 --------------------
-------------------- Step 40, loss: 6.275766372680664 --------------------
-------------------- Step 41, loss: 3.707094430923462 --------------------
-------------------- Step 42, loss: 0.7623991966247559 --------------------
-------------------- Step 43, loss: 1.5783321857452393 --------------------
-------------------- Step 44, loss: 0.8166252374649048 --------------------
-------------------- Step 45, loss: 0.885994553565979 --------------------
-------------------- Step 46, loss: 0.912842869758606 --------------------
-------------------- Step 47, loss: 0.7323049902915955 --------------------
-------------------- Step 48, loss: 0.7469371557235718 --------------------
-------------------- Step 49, loss: 0.8475004434585571 --------------------
WARNING:tensorflow:Model's `__init__()` arguments contain non-serializable objects. Please implement a `get_config()` method in the subclassed Model for proper saving and loading. Defaulting to empty config.
WARNING:tensorflow:Model's `__init__()` arguments contain non-serializable objects. Please implement a `get_config()` method in the subclassed Model for proper saving and loading. Defaulting to empty config.

2023-08-21 03:17:12.248789: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,128]
	 [[{{node inputs}}]]
2023-08-21 03:17:12.721088: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,128]
	 [[{{node inputs}}]]
WARNING:absl:Found untraced functions such as bottom_0_layer_call_fn, bottom_0_layer_call_and_return_conditional_losses, bottom_1_layer_call_fn, bottom_1_layer_call_and_return_conditional_losses, bottom_2_layer_call_fn while saving (showing 5 of 16). These functions will not be directly callable after loading.

INFO:tensorflow:Assets written to: dlrm_tf_saved_model/assets

INFO:tensorflow:Assets written to: dlrm_tf_saved_model/assets

WARNING:tensorflow:Model's `__init__()` arguments contain non-serializable objects. Please implement a `get_config()` method in the subclassed Model for proper saving and loading. Defaulting to empty config.

WARNING:tensorflow:Model's `__init__()` arguments contain non-serializable objects. Please implement a `get_config()` method in the subclassed Model for proper saving and loading. Defaulting to empty config.

WARNING:tensorflow:Model's `__init__()` arguments contain non-serializable objects. Please implement a `get_config()` method in the subclassed Model for proper saving and loading. Defaulting to empty config.

WARNING:tensorflow:Model's `__init__()` arguments contain non-serializable objects. Please implement a `get_config()` method in the subclassed Model for proper saving and loading. Defaulting to empty config.

# Release the occupied GPU memory by TensorFlow and Keras
from numba import cuda
cuda.select_device(0)
cuda.close()

Build the HPS-integrated TensorRT engine

In order to use HPS in the inference stage, we need to convert the embedding weights to the formats required by HPS first and create JSON configuration file for HPS.

Then we convert the TF saved model to ONNX, and employ the ONNX GraphSurgoen tool to replace the native TF embedding lookup layer with the placeholder of HPS TensorRT plugin layer.

After that, we can build the TensorRT engine, which is comprised of the HPS TensorRT plugin layer and the dense network.

Step1: Prepare sparse model and JSON configuration file for HPS

Please note that the storage format in the dlrm_tf_sparse.model/key file is int64, while the HPS TensorRT plugin currently only support int32 when loading the keys into memory. There is no overflow since the key value range is 0~260000.

def convert_to_sparse_model(embeddings_weights, embedding_table_path, embedding_vec_size):
    os.system("mkdir -p {}".format(embedding_table_path))
    with open("{}/key".format(embedding_table_path), 'wb') as key_file, \
        open("{}/emb_vector".format(embedding_table_path), 'wb') as vec_file:
      for key in range(embeddings_weights.shape[0]):
        vec = embeddings_weights[key]
        key_struct = struct.pack('q', key)
        vec_struct = struct.pack(str(embedding_vec_size) + "f", *vec)
        key_file.write(key_struct)
        vec_file.write(vec_struct)

convert_to_sparse_model(embedding_weights, args["embedding_table_path"], args["embed_vec_size"])

%%writefile dlrm_tf.json
{
    "supportlonglong": false,
    "models": [{
        "model": "dlrm",
        "sparse_files": ["dlrm_tf_sparse.model"],
        "num_of_worker_buffer_in_pool": 3,
        "embedding_table_names":["sparse_embedding0"],
        "embedding_vecsize_per_table": [128],
        "maxnum_catfeature_query_per_table_per_sample": [26],
        "default_value_for_each_table": [1.0],
        "deployed_device_list": [0],
        "max_batch_size": 1024,
        "cache_refresh_percentage_per_iteration": 0.2,
        "hit_rate_threshold": 1.0,
        "gpucacheper": 1.0,
        "gpucache": true
        }
    ]
}

Writing dlrm_tf.json

Step2: Convert to ONNX and do ONNX graph surgery

# convert TF SavedModel to ONNX
!python -m tf2onnx.convert --saved-model dlrm_tf_saved_model --output dlrm_tf.onnx

/usr/lib/python3.10/runpy.py:126: RuntimeWarning: 'tf2onnx.convert' found in sys.modules after import of package 'tf2onnx', but prior to execution of 'tf2onnx.convert'; this may result in unpredictable behaviour
  warn(RuntimeWarning(msg))
2023-08-21 03:17:49,926 - WARNING - ***IMPORTANT*** Installed protobuf is not cpp accelerated. Conversion will be extremely slow. See https://github.com/onnx/tensorflow-onnx/issues/1557
2023-08-21 03:17:50,868 - WARNING - '--tag' not specified for saved_model. Using --tag serve
2023-08-21 03:17:56,302 - INFO - Signatures found in model: [serving_default].
2023-08-21 03:17:56,302 - WARNING - '--signature_def' not specified, using first signature: serving_default
2023-08-21 03:17:56,302 - INFO - Output names: ['output_1']
2023-08-21 03:18:02,064 - INFO - Using tensorflow=2.12.0, onnx=1.14.0, tf2onnx=1.14.0/8f8d49
2023-08-21 03:18:02,064 - INFO - Using opset <onnx, 15>
2023-08-21 03:18:03,255 - INFO - Computed 0 values for constant folding
2023-08-21 03:18:04,203 - INFO - Optimizing ONNX model
2023-08-21 03:18:04,624 - INFO - After optimization: Cast -3 (3->0), Concat -1 (3->2), Const -15 (35->20), Identity -2 (2->0), Shape -1 (1->0), Slice -1 (1->0), Squeeze -1 (1->0), Unsqueeze -3 (3->0)
2023-08-21 03:18:07,745 - INFO - 
2023-08-21 03:18:07,745 - INFO - Successfully converted TensorFlow model dlrm_tf_saved_model to ONNX
2023-08-21 03:18:07,745 - INFO - Model inputs: ['keys', 'numerical_features']
2023-08-21 03:18:07,745 - INFO - Model outputs: ['output_1']
2023-08-21 03:18:07,745 - INFO - ONNX model is saved at dlrm_tf.onnx

# ONNX graph surgery to insert HPS the TensorRT plugin placeholder
import onnx_graphsurgeon as gs
from onnx import  shape_inference
import numpy as np
import onnx

graph = gs.import_onnx(onnx.load("dlrm_tf.onnx"))
saved = []

for node in graph.nodes:
    if node.name == "StatefulPartitionedCall/dlrm/embedding_lookup":
        categorical_features = gs.Variable(name="categorical_features", dtype=np.int32, shape=("unknown", 26))
        hps_node = gs.Node(op="HPS_TRT", attrs={"ps_config_file": "dlrm_tf.json\0", "model_name": "dlrm\0", "table_id": 0, "emb_vec_size": 128}, 
                           inputs=[categorical_features], outputs=[node.outputs[0]])
        graph.nodes.append(hps_node)
        saved.append(categorical_features)
        node.outputs.clear()
for i in graph.inputs:
    if i.name == "numerical_features":
        saved.append(i)
graph.inputs = saved

graph.cleanup().toposort()
onnx.save(gs.export_onnx(graph), "dlrm_tf_with_hps.onnx")

Step3: Build the TensorRT engine

# build the TensorRT engine based on dlrm_tf_with_hps.onnx
import tensorrt as trt
import ctypes

plugin_lib_name = "/usr/local/hps_trt/lib/libhps_plugin.so"
handle = ctypes.CDLL(plugin_lib_name, mode=ctypes.RTLD_GLOBAL)

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine_from_onnx(onnx_model_path):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser, builder.create_builder_config() as builder_config:        
        model = open(onnx_model_path, 'rb')
        parser.parse(model.read())

        profile = builder.create_optimization_profile()        
        profile.set_shape("categorical_features", (1, 26), (1024, 26), (1024, 26))    
        profile.set_shape("numerical_features", (1, 13), (1024, 13), (1024, 13))
        builder_config.add_optimization_profile(profile)
        engine = builder.build_serialized_network(network, builder_config)
        return engine

serialized_engine = build_engine_from_onnx("dlrm_tf_with_hps.onnx")
with open("dlrm_tf_with_hps.trt", "wb") as fout:
    fout.write(serialized_engine)
print("Successfully build the TensorRT engine")

[08/21/2023-03:18:16] [TRT] [I] [MemUsageChange] Init CUDA: CPU +2013, GPU +0, now: CPU 4018, GPU 721 (MiB)
[08/21/2023-03:18:22] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +421, GPU +72, now: CPU 4516, GPU 793 (MiB)
[08/21/2023-03:18:22] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading
[08/21/2023-03:18:22] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[08/21/2023-03:18:22] [TRT] [I] No importer registered for op: HPS_TRT. Attempting to import as plugin.
[08/21/2023-03:18:22] [TRT] [I] Searching for plugin: HPS_TRT, plugin_version: 1, plugin_namespace: 
=====================================================HPS Parse====================================================
[HCTR][03:18:22.774][INFO][RK0][main]: fuse_embedding_table is not specified using default: 0
[HCTR][03:18:22.774][INFO][RK0][main]: dense_file is not specified using default: 
[HCTR][03:18:22.774][INFO][RK0][main]: num_of_refresher_buffer_in_pool is not specified using default: 1
[HCTR][03:18:22.774][INFO][RK0][main]: maxnum_des_feature_per_sample is not specified using default: 26
[HCTR][03:18:22.774][INFO][RK0][main]: refresh_delay is not specified using default: 0
[HCTR][03:18:22.774][INFO][RK0][main]: refresh_interval is not specified using default: 0
[HCTR][03:18:22.774][INFO][RK0][main]: use_static_table is not specified using default: 0
[HCTR][03:18:22.774][INFO][RK0][main]: use_context_stream is not specified using default: 1
[HCTR][03:18:22.774][INFO][RK0][main]: use_hctr_cache_implementation is not specified using default: 1
[HCTR][03:18:22.774][INFO][RK0][main]: thread_pool_size is not specified using default: 16
[HCTR][03:18:22.774][INFO][RK0][main]: init_ec is not specified using default: 1
[HCTR][03:18:22.774][INFO][RK0][main]: enable_pagelock is not specified using default: 0
[HCTR][03:18:22.774][INFO][RK0][main]: fp8_quant is not specified using default: 0
[HCTR][03:18:22.774][INFO][RK0][main]: HPS plugin uses context stream for model dlrm: True
====================================================HPS Create====================================================
[HCTR][03:18:22.775][INFO][RK0][main]: Creating HashMap CPU database backend...
[HCTR][03:18:22.775][DEBUG][RK0][main]: Created blank database backend in local memory!
[HCTR][03:18:22.775][INFO][RK0][main]: Volatile DB: initial cache rate = 1
[HCTR][03:18:22.775][INFO][RK0][main]: Volatile DB: cache missed embeddings = 0
[HCTR][03:18:22.775][DEBUG][RK0][main]: Created raw model loader in local memory!
[HCTR][03:18:24.860][INFO][RK0][main]: Table: hps_et.dlrm.sparse_embedding0; cached 260000 / 260000 embeddings in volatile database (HashMapBackend); load: 260000 / 18446744073709551615 (0.00%).
[HCTR][03:18:24.863][DEBUG][RK0][main]: Real-time subscribers created!
[HCTR][03:18:24.864][INFO][RK0][main]: Creating embedding cache in device 0.
[HCTR][03:18:24.869][INFO][RK0][main]: Model name: dlrm
[HCTR][03:18:24.869][INFO][RK0][main]: Max batch size: 1024
[HCTR][03:18:24.869][INFO][RK0][main]: Fuse embedding tables: False
[HCTR][03:18:24.869][INFO][RK0][main]: Number of embedding tables: 1
[HCTR][03:18:24.869][INFO][RK0][main]: Use GPU embedding cache: True, cache size percentage: 1.000000
[HCTR][03:18:24.869][INFO][RK0][main]: Embedding cache type: dynamic
[HCTR][03:18:24.869][INFO][RK0][main]: Use I64 input key: False
[HCTR][03:18:24.869][INFO][RK0][main]: Configured cache hit rate threshold: 1.000000
[HCTR][03:18:24.869][INFO][RK0][main]: The size of thread pool: 80
[HCTR][03:18:24.869][INFO][RK0][main]: The size of worker memory pool: 3
[HCTR][03:18:24.869][INFO][RK0][main]: The size of refresh memory pool: 1
[HCTR][03:18:24.869][INFO][RK0][main]: The refresh percentage : 0.200000
[HCTR][03:18:24.902][INFO][RK0][main]: Initialize the embedding cache by by inserting the same size model file with embedding cache from beginning
[HCTR][03:18:24.902][DEBUG][RK0][main]: Created raw model loader in local memory!
[HCTR][03:18:24.902][INFO][RK0][main]: EC initialization on device 0 for hps_et.dlrm.sparse_embedding0
[HCTR][03:18:24.947][INFO][RK0][main]: Initialize the embedding table 0 for iteration 0 with number of 51968 keys.
[HCTR][03:18:24.992][INFO][RK0][main]: Initialize the embedding table 0 for iteration 1 with number of 51968 keys.
[HCTR][03:18:25.018][INFO][RK0][main]: Initialize the embedding table 0 for iteration 2 with number of 51968 keys.
[HCTR][03:18:25.047][INFO][RK0][main]: Initialize the embedding table 0 for iteration 3 with number of 51968 keys.
[HCTR][03:18:25.069][INFO][RK0][main]: Initialize the embedding table 0 for iteration 4 with number of 51968 keys.
[HCTR][03:18:25.077][INFO][RK0][main]: LookupSession i64_input_key: False
[HCTR][03:18:25.077][INFO][RK0][main]: Creating lookup session for dlrm on device: 0
[08/21/2023-03:18:25] [TRT] [I] Successfully created plugin: HPS_TRT
[08/21/2023-03:18:25] [TRT] [I] BuilderFlag::kTF32 is set but hardware does not support TF32. Disabling TF32.
[08/21/2023-03:18:25] [TRT] [I] Graph optimization time: 0.034216 seconds.
[08/21/2023-03:18:25] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +1, GPU +8, now: CPU 8710, GPU 1051 (MiB)
[08/21/2023-03:18:25] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +1, GPU +10, now: CPU 8711, GPU 1061 (MiB)
[08/21/2023-03:18:25] [TRT] [I] BuilderFlag::kTF32 is set but hardware does not support TF32. Disabling TF32.
[08/21/2023-03:18:25] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[08/21/2023-03:18:27] [TRT] [I] Detected 2 inputs and 1 output network tensors.
[08/21/2023-03:18:27] [TRT] [I] Total Host Persistent Memory: 144
[08/21/2023-03:18:27] [TRT] [I] Total Device Persistent Memory: 0
[08/21/2023-03:18:27] [TRT] [I] Total Scratch Memory: 18350080
[08/21/2023-03:18:27] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 41 MiB
[08/21/2023-03:18:27] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 2 steps to complete.
[08/21/2023-03:18:27] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 0.007954ms to assign 2 blocks to 2 nodes requiring 31981568 bytes.
[08/21/2023-03:18:27] [TRT] [I] Total Activation Memory: 31981568
[08/21/2023-03:18:27] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 8764, GPU 1091 (MiB)
[08/21/2023-03:18:27] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +10, now: CPU 8764, GPU 1101 (MiB)
[08/21/2023-03:18:27] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in building engine: CPU +0, GPU +16, now: CPU 0, GPU 16 (MiB)
Successfully build the TensorRT engine

Deploy HPS-integrated TensorRT engine with Triton on multiple GPUs

In order to deploy the TensorRT engine with the Triton TensorRT backend, we need to create the model repository and define the config.pbtxt first. Since we are deploy model instances on multiple GPUs, we need to modify the "deployed_device_list" entry in dlrm_tf.json accordingly.

!mkdir -p model_repo/dlrm_tf_with_hps/1
!mv dlrm_tf_with_hps.trt model_repo/dlrm_tf_with_hps/1

%%writefile model_repo/dlrm_tf_with_hps/config.pbtxt

platform: "tensorrt_plan"
default_model_filename: "dlrm_tf_with_hps.trt"
backend: "tensorrt"
max_batch_size: 0
input [
  {
    name: "categorical_features"
    data_type: TYPE_INT32
    dims: [-1,26]
  },
  {
    name: "numerical_features"
    data_type: TYPE_FP32
    dims: [-1,13]
  }
]
output [
  {
      name: "output_1"
      data_type: TYPE_FP32
      dims: [-1,1]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_GPU
    gpus:[0,1,2,3]
  }
]

Overwriting model_repo/dlrm_tf_with_hps/config.pbtxt

%%writefile dlrm_tf.json
{
    "supportlonglong": false,
    "models": [{
        "model": "dlrm",
        "sparse_files": ["dlrm_tf_sparse.model"],
        "num_of_worker_buffer_in_pool": 3,
        "embedding_table_names":["sparse_embedding0"],
        "embedding_vecsize_per_table": [128],
        "maxnum_catfeature_query_per_table_per_sample": [26],
        "default_value_for_each_table": [1.0],
        "deployed_device_list": [0,1,2,3],
        "max_batch_size": 1024,
        "cache_refresh_percentage_per_iteration": 0.2,
        "hit_rate_threshold": 1.0,
        "gpucacheper": 1.0,
        "gpucache": true
        }
    ]
}

Overwriting dlrm_tf.json

!tree model_repo/dlrm_tf_with_hps

model_repo/dlrm_tf_with_hps
├── 1
│   └── dlrm_tf_with_hps.trt
└── config.pbtxt

1 directory, 2 files

We can then launch the Triton inference server using the TensorRT backend. Please note that LD_PRELOAD is utilized to load the custom TensorRT plugin (i.e., HPS TensorRT plugin) into Triton.

Note: Since Background processes not supported by Jupyter, please launch the Triton Server according to the following command independently in the background.

LD_PRELOAD=/usr/local/hps_trt/lib/libhps_plugin.so tritonserver –model-repository=/hugectr/hps_trt/notebooks/model_repo/ –load-model=dlrm_tf_with_hps –model-control-mode=explicit

If you successfully started tritonserver, you should see a log similar to following:

TRITONBACKEND_ModelInstanceInitialize: dlrm_tf_with_hps_0 (GPU device 0)
TRITONBACKEND_ModelInstanceInitialize: dlrm_tf_with_hps_0 (GPU device 1)
TRITONBACKEND_ModelInstanceInitialize: dlrm_tf_with_hps_0 (GPU device 2)
TRITONBACKEND_ModelInstanceInitialize: dlrm_tf_with_hps_0 (GPU device 3)


+----------+--------------------------------+--------------------------------+
| Backend  | Path                           | Config                         |
+----------+--------------------------------+--------------------------------+
| tensorrt | /opt/tritonserver/backends/ten | {"cmdline":{"auto-complete-con |
|          | sorrt/libtriton_tensorrt.so    | fig":"true","min-compute-capab |
|          |                                | ility":"6.000000","backend-dir |
|          |                                | ectory":"/opt/tritonserver/bac |
|          |                                | kends","default-max-batch-size |
|          |                                | ":"4"}}                        |
|          |                                |                                |
+----------+--------------------------------+--------------------------------+

+------------------+---------+--------+
| Model            | Version | Status |
+------------------+---------+--------+
| dlrm_tf_with_hps | 1       | READY  |
+------------------+---------+--------+

We can then send the requests to the Triton inference server using the HTTP client.

import os
import shutil
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import *

BATCH_SIZE = 1024

categorical_feature = np.random.randint(0,260000,size=(BATCH_SIZE,26)).astype(np.int32)
numerical_feature = np.random.random((BATCH_SIZE, 13)).astype(np.float32)

inputs = [
    httpclient.InferInput("categorical_features", 
                          categorical_feature.shape,
                          np_to_triton_dtype(np.int32)),
    httpclient.InferInput("numerical_features", 
                          numerical_feature.shape,
                          np_to_triton_dtype(np.float32)),                          
]
inputs[0].set_data_from_numpy(categorical_feature)
inputs[1].set_data_from_numpy(numerical_feature)


outputs = [
    httpclient.InferRequestedOutput("output_1")
]

model_name = "dlrm_tf_with_hps"

with httpclient.InferenceServerClient("localhost:8000") as client:
    response = client.infer(model_name,
                            inputs,
                            outputs=outputs)
    result = response.get_response()
    
    print("Prediction result is \n{}".format(response.as_numpy("output_1")))
    print("Response details:\n{}".format(result))

Prediction result is 
[[0.34091672]
 [0.34091672]
 [0.34091672]
 ...
 [0.34091672]
 [0.34091672]
 [0.34091672]]
Response details:
{'model_name': 'dlrm_tf_with_hps', 'model_version': '1', 'outputs': [{'name': 'output_1', 'datatype': 'FP32', 'shape': [1024, 1], 'parameters': {'binary_data_size': 4096}}]}