# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===================================
Overview
In this notebook, we show how to do inference with our trained deep learning recommender model using Triton Inference Server. We deploy the NVTabular workflow and the HugeCTR model as an ensemble: for each request, Triton Inference Server first feeds the input data through the NVTabular workflow and then feeds the transformed output through the HugeCTR model.
As we saw in the previous notebook, movielens-HugeCTR, NVTabular provides the export_hugectr_ensemble function to save the NVTabular workflow. This function saves not only the NVTabular workflow, but also the trained HugeCTR model and the ensemble model to be served by Triton Inference Server.
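For reference, the export call in that notebook looks roughly like the sketch below. The hugectr_params values (slots, max_nnz, embedding_vector_size) are assumptions for illustration and must match how the model was actually trained; workflow is the fitted NVTabular workflow from the training notebook.
from nvtabular.inference.triton import export_hugectr_ensemble

# Assumed parameters for illustration - these must match the trained model
hugectr_params = {
    "config": "/model/models/movielens/1/movielens.json",  # network file read by Triton
    "slots": 3,                    # number of sparse embedding slots (assumption)
    "max_nnz": 2,                  # max non-zero values per slot (assumption)
    "embedding_vector_size": 16,   # assumption
    "n_outputs": 1,
}

export_hugectr_ensemble(
    workflow=workflow,                                 # fitted NVTabular workflow
    hugectr_model_path="/model/movielens_hugectr/1/",  # where HugeCTR saved its weights
    hugectr_params=hugectr_params,
    name="movielens",
    output_path="/model/models/",                      # the Triton model repository
    label_columns=["rating"],
    cats=["userId", "movieId", "genres"],
    conts=[],
    max_batch_size=64,
)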
Getting Started
First, we need to write a parameter server configuration file (ps.json) that tells the HugeCTR backend where the stored model weights and network configuration live.
%%writefile '/model/models/ps.json'
{
    "supportlonglong": true,
    "models": [
        {
            "model": "movielens",
            "sparse_files": ["/model/models/movielens/1/0_sparse_1900.model"],
            "dense_file": "/model/models/movielens/1/_dense_1900.model",
            "network_file": "/model/models/movielens/1/movielens.json"
        }
    ]
}
Overwriting /model/models/ps.json
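As an optional sanity check (an assumption on our part, not a step from the original flow), we can verify that the files referenced in ps.json exist before starting the server:
import os

# Optional: confirm the weight and network files referenced in ps.json are on disk
for path in [
    "/model/models/movielens/1/0_sparse_1900.model",
    "/model/models/movielens/1/_dense_1900.model",
    "/model/models/movielens/1/movielens.json",
]:
    print(path, "exists:", os.path.exists(path))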
Let’s import required libraries.
import tritonclient.grpc as grpcclient  # gRPC client, used below for inference requests
import numpy as np

# Get dataframe library - cudf or pandas
from nvtabular.dispatch import get_lib

df_lib = get_lib()
Load Models on Triton Server
At this stage, you should launch the Triton Inference Server Docker container with the following command, which exposes ports 8000 (HTTP), 8001 (gRPC), and 8002 (metrics) and mounts the current working directory to /model:
docker run -it --gpus=all -p 8000:8000 -p 8001:8001 -p 8002:8002 -v ${PWD}:/model nvcr.io/nvidia/merlin/merlin-inference:21.11
After the container has started, you can launch Triton Server with the command below:
tritonserver --model-repository=<path_to_models> --backend-config=hugectr,ps=<path_to_models>/ps.json --model-control-mode=explicit
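For example, with the model repository used in this notebook (see the note below), the placeholders resolve to:
tritonserver --model-repository=/model/models/ --backend-config=hugectr,ps=/model/models/ps.json --model-control-mode=explicit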
Note: The model repository path is /model/models/. The models haven’t been loaded yet; because the server runs with --model-control-mode=explicit, we request Triton Server to load the saved ensemble from a client. We first initialize a Triton client. The path of the network JSON file is /model/models/movielens/1/movielens.json.
# disable warnings
import warnings

warnings.filterwarnings("ignore")

import tritonhttpclient

try:
    # HTTP client on port 8000, used here for model control and health checks
    triton_client = tritonhttpclient.InferenceServerClient(url="localhost:8000", verbose=True)
    print("client created.")
except Exception as e:
    print("channel creation failed: " + str(e))
client created.
/usr/local/lib/python3.8/dist-packages/tritonhttpclient/__init__.py:31: DeprecationWarning: The package `tritonhttpclient` is deprecated and will be removed in a future version. Please use instead `tritonclient.http`
warnings.warn(
triton_client.is_server_live()
GET /v2/health/live, headers None
<HTTPSocketPoolResponse status=200 headers={'content-length': '0', 'content-type': 'text/plain'}>
True
triton_client.get_model_repository_index()
POST /v2/repository/index, headers None
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '72'}>
bytearray(b'[{"name":"movielens"},{"name":"movielens_ens"},{"name":"movielens_nvt"}]')
[{'name': 'movielens'}, {'name': 'movielens_ens'}, {'name': 'movielens_nvt'}]
The repository contains three models: movielens_nvt (the NVTabular workflow), movielens (the HugeCTR model), and movielens_ens (the ensemble that chains them). Let’s load them on Triton Server one by one.
%%time
triton_client.load_model(model_name="movielens_nvt")
POST /v2/repository/models/movielens_nvt/load, headers None
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'movielens_nvt'
CPU times: user 3.47 ms, sys: 0 ns, total: 3.47 ms
Wall time: 2.95 s
%%time
triton_client.load_model(model_name="movielens")
POST /v2/repository/models/movielens/load, headers None
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'movielens'
CPU times: user 3.71 ms, sys: 0 ns, total: 3.71 ms
Wall time: 5.8 s
Finally, we load our ensemble model movielens_ens.
%%time
triton_client.load_model(model_name="movielens_ens")
POST /v2/repository/models/movielens_ens/load, headers None
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model 'movielens_ens'
CPU times: user 3.21 ms, sys: 0 ns, total: 3.21 ms
Wall time: 105 ms
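As an optional check (not part of the original flow), we can confirm that each model reports ready before sending inference requests:
# Optional: each call returns True once the model is loaded and ready
for model in ["movielens_nvt", "movielens", "movielens_ens"]:
    print(model, "ready:", triton_client.is_model_ready(model))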
Let’s send a request to Triton Inference Server and print the response. Since our example does not include continuous columns, the only inputs below are the categorical columns userId and movieId.
from tritonclient.utils import np_to_triton_dtype

model_name = "movielens_ens"
col_names = ["userId", "movieId"]

# read in a batch of data to get transforms for
batch = df_lib.read_parquet("/model/data/valid.parquet", num_rows=64)[col_names]
print(batch, "\n")

# convert the batch to triton inputs
columns = [(col, batch[col]) for col in col_names]
inputs = []
col_dtypes = [np.int64, np.int64]
for i, (name, col) in enumerate(columns):
    # values_host copies a cudf column to host memory (with pandas, use .values)
    d = col.values_host.astype(col_dtypes[i])
    d = d.reshape(len(d), 1)
    inputs.append(grpcclient.InferInput(name, d.shape, np_to_triton_dtype(col_dtypes[i])))
    inputs[i].set_data_from_numpy(d)

# placeholder variables for the output
outputs = []
outputs.append(grpcclient.InferRequestedOutput("OUTPUT0"))

# make the request over gRPC (port 8001)
with grpcclient.InferenceServerClient("localhost:8001") as client:
    response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs)

print("predicted sigmoid result:\n", response.as_numpy("OUTPUT0"))
userId movieId
15347762 99476 104374
16647840 107979 2634
23915192 155372 1614
10052313 65225 7153
12214125 79161 500
... ... ...
17138306 111072 1625
21326655 138575 81591
5664631 36671 8861
217658 1535 111759
11842246 76766 109487
[64 rows x 2 columns]
predicted sigmoid result:
[0.5441651 0.44610545 0.6183038 0.4781851 0.57211477 0.45879382
0.5173291 0.4749932 0.55234563 0.6497125 0.6145904 0.54569465
0.61635995 0.54713815 0.5746383 0.66888094 0.66942275 0.57108265
0.5042718 0.54487634 0.5981037 0.65488183 0.5742305 0.5930837
0.6032248 0.6174893 0.5496881 0.54655844 0.5496461 0.6790834
0.5503165 0.61907697 0.5715238 0.6069336 0.6044322 0.6263752
0.5387236 0.6224779 0.59225804 0.6021576 0.62560654 0.5602548
0.5573395 0.6082372 0.599744 0.55870736 0.6260935 0.67932445
0.6371034 0.63626426 0.61129224 0.5861754 0.55234563 0.58470285
0.66258055 0.51953226 0.56719464 0.538553 0.58615 0.42244497
0.51779014 0.5611309 0.55880654 0.5693609 ]
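Once you are done experimenting, you can optionally unload the models; since the server runs in explicit model-control mode, this is done through the same client (a cleanup sketch, not part of the original notebook):
# Optional cleanup: unload the ensemble first, then its component models
triton_client.unload_model(model_name="movielens_ens")
triton_client.unload_model(model_name="movielens")
triton_client.unload_model(model_name="movielens_nvt")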