# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
Scaling Criteo: Triton Inference with TensorFlow
Overview
The last step is to deploy the ETL workflow and saved model to production. In the production setting, we want to transform the input data the same way as during training (ETL). We need to apply the same mean/std for continuous features and use the same categorical mapping to convert the categories to contiguous integers before we use the deep learning model for a prediction. Therefore, we deploy the NVTabular workflow with the TensorFlow model as an ensemble model to Triton Inference Server. The ensemble model guarantees that the same transformations are applied to the raw inputs.
Learning objectives
In this notebook, we learn how to deploy our models to production
Use NVTabular to generate config and model files for Triton Inference Server
Deploy an ensemble of NVTabular workflow and TensorFlow model
Send example request to Triton Inference Server
Inference with Triton and TensorFlow
First, we need to generate the Triton Inference Server configurations and save the models in the correct format. In the previous notebooks, 02-ETL-with-NVTabular and 03-Training-with-TF, we saved the NVTabular workflow and TensorFlow model to disk. We will load them now.
Saving Ensemble Model for Triton Inference Server
import os
import tensorflow as tf
import nvtabular as nvt
# Root data directory; can be overridden through the BASE_DIR environment variable.
BASE_DIR = os.environ.get("BASE_DIR", "/raid/data/criteo")
input_path = os.path.join(BASE_DIR, "test_dask/output")

# Locations of the NVTabular workflow and the TensorFlow model saved to disk.
workflow_dir = os.path.join(input_path, "workflow")
model_dir = os.path.join(input_path, "model.savedmodel")

workflow = nvt.Workflow.load(workflow_dir)
model = tf.keras.models.load_model(model_dir)
TensorFlow expects integer inputs to have the int32 datatype. Therefore, we need to set the NVTabular output datatypes to int32 for the categorical features.
# Set the output dtype of every column whose name starts with "C"
# (the categorical features) to int32.
categorical_columns = [name for name in workflow.output_dtypes if name.startswith("C")]
for name in categorical_columns:
    workflow.output_dtypes[name] = "int32"
NVTabular provides an easy function to deploy the ensemble model for Triton Inference Server.
from nvtabular.inference.triton import export_tensorflow_ensemble
export_tensorflow_ensemble(model, workflow, "criteo", "/models", ["label"])
INFO:tensorflow:Assets written to: /models/criteo_tf/1/model.savedmodel/assets
We can take a look at the generated files.
!tree /models
/models
├── criteo
│ ├── 1
│ └── config.pbtxt
├── criteo_nvt
│ ├── 1
│ │ ├── model.py
│ │ └── workflow
│ │ ├── categories
│ │ │ ├── unique.C1.parquet
│ │ │ ├── unique.C10.parquet
│ │ │ ├── unique.C11.parquet
│ │ │ ├── unique.C12.parquet
│ │ │ ├── unique.C13.parquet
│ │ │ ├── unique.C14.parquet
│ │ │ ├── unique.C15.parquet
│ │ │ ├── unique.C16.parquet
│ │ │ ├── unique.C17.parquet
│ │ │ ├── unique.C18.parquet
│ │ │ ├── unique.C19.parquet
│ │ │ ├── unique.C2.parquet
│ │ │ ├── unique.C20.parquet
│ │ │ ├── unique.C21.parquet
│ │ │ ├── unique.C22.parquet
│ │ │ ├── unique.C23.parquet
│ │ │ ├── unique.C24.parquet
│ │ │ ├── unique.C25.parquet
│ │ │ ├── unique.C26.parquet
│ │ │ ├── unique.C3.parquet
│ │ │ ├── unique.C4.parquet
│ │ │ ├── unique.C5.parquet
│ │ │ ├── unique.C6.parquet
│ │ │ ├── unique.C7.parquet
│ │ │ ├── unique.C8.parquet
│ │ │ └── unique.C9.parquet
│ │ ├── metadata.json
│ │ └── workflow.pkl
│ └── config.pbtxt
└── criteo_tf
├── 1
│ └── model.savedmodel
│ ├── assets
│ ├── saved_model.pb
│ └── variables
│ ├── variables.data-00000-of-00001
│ └── variables.index
└── config.pbtxt
11 directories, 35 files
Loading Ensemble Model with Triton Inference Server
So far, we have only saved the models for Triton Inference Server. We started Triton Inference Server in explicit mode, meaning that we need to send a request asking Triton to load the ensemble model.
First, we restart this notebook to free the GPU memory.
import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(True)
{'status': 'ok', 'restart': True}
We define the BASE_DIR again.
import os
BASE_DIR = os.environ.get("BASE_DIR", "/raid/data/criteo")
We connect to the Triton Inference Server.
import tritonhttpclient
# Create the HTTP client for the Triton server at triton:8000.
# A failure is reported with a message instead of being re-raised.
try:
    triton_client = tritonhttpclient.InferenceServerClient(url="triton:8000", verbose=True)
except Exception as e:
    print(f"channel creation failed: {e}")
else:
    print("client created.")
client created.
We deactivate warnings.
import warnings
warnings.filterwarnings("ignore")
We check if the server is alive.
triton_client.is_server_live()
GET /v2/health/live, headers None
<HTTPSocketPoolResponse status=200 headers={'content-length': '0', 'content-type': 'text/plain'}>
True
We check the models available in the model repository:
criteo: Ensemble
criteo_nvt: NVTabular
criteo_tf: TensorFlow model
triton_client.get_model_repository_index()
POST /v2/repository/index, headers None
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '62'}>
bytearray(b'[{"name":"criteo"},{"name":"criteo_nvt"},{"name":"criteo_tf"}]')
[{'name': 'criteo'}, {'name': 'criteo_nvt'}, {'name': 'criteo_tf'}]
We load the ensemble model.
%%time
triton_client.load_model(model_name="criteo")
POST /v2/repository/models/criteo/load, headers None
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'criteo'
CPU times: user 0 ns, sys: 3.33 ms, total: 3.33 ms
Wall time: 56.5 s
Example Request to Triton Inference Server
Now, the models are loaded and we can create a sample request. We read an example raw batch for inference.
# Pick the dataframe library (cudf or pandas) through Merlin's dispatch helper.
from merlin.core.dispatch import get_lib

df_lib = get_lib()

# Read a small raw sample (3 rows) of the converted Criteo data to use as the request.
batch_path = os.path.join(BASE_DIR, "converted/criteo")
parquet_glob = os.path.join(batch_path, "*.parquet")
batch = df_lib.read_parquet(parquet_glob, num_rows=3)

# Keep every column except "label".
feature_columns = [name for name in batch.columns if name != "label"]
batch = batch[feature_columns]
print(batch)
I1 I2 I3 I4 I5 I6 I7 I8 I9 I10 ... C17 \
0 5 110 <NA> 16 <NA> 1 0 14 7 1 ... -771205462
1 32 3 5 <NA> 1 0 0 61 5 0 ... -771205462
2 <NA> 233 1 146 1 0 0 99 7 0 ... -771205462
C18 C19 C20 C21 C22 C23 \
0 -1206449222 -1793932789 -1014091992 351689309 632402057 -675152885
1 -1578429167 -1793932789 -20981661 -1556988767 -924717482 391309800
2 1653545869 -1793932789 -1014091992 351689309 632402057 -675152885
C24 C25 C26
0 2091868316 809724924 -317696227
1 1966410890 -1726799382 -1218975401
2 883538181 -10139646 -317696227
[3 rows x 39 columns]
We prepare the batch for inference by using correct column names and data types. We use the same datatypes as defined in our dataframe.
batch.dtypes
I1 int32
I2 int32
I3 int32
I4 int32
I5 int32
I6 int32
I7 int32
I8 int32
I9 int32
I10 int32
I11 int32
I12 int32
I13 int32
C1 int32
C2 int32
C3 int32
C4 int32
C5 int32
C6 int32
C7 int32
C8 int32
C9 int32
C10 int32
C11 int32
C12 int32
C13 int32
C14 int32
C15 int32
C16 int32
C17 int32
C18 int32
C19 int32
C20 int32
C21 int32
C22 int32
C23 int32
C24 int32
C25 int32
C26 int32
dtype: object
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype
import numpy as np
# Build one Triton input tensor per dataframe column.
# Every column is cast to int32 and reshaped to (number_of_rows, 1).
col_names = list(batch.columns)
col_dtypes = [np.int32] * len(col_names)

inputs = []
for name, dtype in zip(col_names, col_dtypes):
    values = batch[name].values_host.astype(dtype)
    values = values.reshape(len(values), 1)
    tensor = httpclient.InferInput(name, values.shape, np_to_triton_dtype(dtype))
    tensor.set_data_from_numpy(values)
    inputs.append(tensor)
We send the request to the Triton server and collect the model output.
# Request only the tensor named "output" from the server.
outputs = [httpclient.InferRequestedOutput("output")]
# No new client is built here: the request goes through the existing
# triton_client object, which sends the prepared InferInput objects to the
# "criteo" model.
response = triton_client.infer("criteo", inputs, request_id="1", outputs=outputs)
# NOTE(review): the message says "softmax", but the printed array holds a single
# value per row -- confirm the model's final activation before relying on the label.
print("predicted softmax result:\n", response.as_numpy("output"))
POST /v2/models/criteo/infer, headers {'Inference-Header-Content-Length': 3382}
b'{"id":"1","inputs":[{"name":"I1","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I2","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I3","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I4","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I5","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I6","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I7","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I8","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I9","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I10","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I11","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I12","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"I13","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C1","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C2","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C3","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C4","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C5","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C6","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C7","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C8","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C9","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C10","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C11","shape":[3,1],"datatype":"I
NT32","parameters":{"binary_data_size":12}},{"name":"C12","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C13","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C14","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C15","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C16","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C17","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C18","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C19","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C20","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C21","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C22","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C23","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C24","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C25","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}},{"name":"C26","shape":[3,1],"datatype":"INT32","parameters":{"binary_data_size":12}}],"outputs":[{"name":"output","parameters":{"binary_data":true}}]}\x05\x00\x00\x00 
\x00\x00\x00\x00\x00\x00\x00n\x00\x00\x00\x03\x00\x00\x00\xe9\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x01\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x92\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00\x00\x00=\x00\x00\x00c\x00\x00\x00\x07\x00\x00\x00\x05\x00\x00\x00\x07\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x002\x01\x00\x00U\x0c\x00\x00\x1d\x0c\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x01\x00\x00\x00y\rwb\x8d\xfd\xf3\xe5y\rwbX]\x1f\xe2\xa6\xff\xaa\xa0\x03B\x98\xad/D\xea\xaf\xd5\x15\xaao\r\xc6\xbeb\xcf\x7f\\\x94!4\x8a\xda\xeeIl8H\'\xb08#\x9f\xd6<M\x06U\xe7\xcbm\xcdo\xcbm\xcdo\xcbm\xcdo!\xaa\x805\x81\xed\x16\xabb\xeb\xf5\xb5\x03\x89\x80()lBC\x8b\xcc\xf2\xd1\xa6\xdf\xdeFT\xe1\xf5\x1d\x1f\x82N.\xc1}\x02.\xa9\xc0\xe9}\xc1}\x02.1B|\x0cd\xdcRf1B|\x0c\x1f\x1d\x98\x95\'N\xeb\x99\x84aq\x12\xb7\xff\xc5\x00\xb7\xff\xc5\x00\xb7\xff\xc5\x007\xe5N\xbe7\xe5N\xbe7\xe5N\xbe\xcct\x0b\x8a\x99\xfe\xbb\xf3\x0b\r\x0f\xf7\xfa>\xdcL\xfa>\xdcL\xfa>\xdcL\xaaV\x08\xd2\xaaV\x08\xd2\xaaV\x08\xd2\xba\x0b\x17\xb8\x11\x15\xeb\xa1\x8d\x1b\x8fb\x0b\xc2\x12\x95\x0b\xc2\x12\x95\x0b\xc2\x12\x95(/\x8e\xc3c\xd8\xbf\xfe(/\x8e\xc3]Z\xf6\x14\xa1<2\xa3]Z\xf6\x14\x89\xb0\xb1%V\xee\xe1\xc8\x89\xb0\xb1%\x0b\xfc\xc1\xd7\xe8\xe9R\x17\x0b\xfc\xc1\xd7\x9c`\xaf|\x8a\x0c5u\x05\xb9\xa94\xfckC0\xea!\x13\x99\x02He\xff\x1dW\x10\xedW\xe9W\xb7\x1dW\x10\xed'
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'inference-header-content-length': '226', 'content-length': '238'}>
bytearray(b'{"id":"1","model_name":"criteo","model_version":"1","parameters":{"sequence_id":0,"sequence_start":false,"sequence_end":false},"outputs":[{"name":"output","datatype":"FP32","shape":[3,1],"parameters":{"binary_data_size":12}}]}')
predicted softmax result:
[[0.02414342]
[0.0328052 ]
[0.02708623]]
Let’s unload the models. Each model has to be unloaded separately.
# Unload the ensemble first, then the NVTabular and TensorFlow models.
for loaded_model in ("criteo", "criteo_nvt", "criteo_tf"):
    triton_client.unload_model(model_name=loaded_model)
POST /v2/repository/models/criteo/unload, headers None
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'criteo'
POST /v2/repository/models/criteo_nvt/unload, headers None
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'criteo_nvt'
POST /v2/repository/models/criteo_tf/unload, headers None
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'criteo_tf'