# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Each user is responsible for checking the content of datasets and the
# applicable licenses and determining if suitable for the intended use.

Serve Recommendations from the TensorFlow Model#

This notebook was created using the latest stable merlin-tensorflow container.

The last step is to deploy the saved model to production.

Launching and Starting the Triton Server#

We need to start the Triton Inference Server first, which we can do with the following command. You need to provide the correct path to the models directory. Note that because we pass the --model-control-mode=explicit flag, no models are loaded at startup; we will load our model explicitly below.

tritonserver --model-repository=path_to_models --backend-config=tensorflow,version=2 --model-control-mode=explicit 

Note: The model-repository path is /workspace/nvt-examples/models/ensemble. The model hasn’t been loaded yet. Below, we will request the Triton server to load the saved TensorFlow model.
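
If you want to confirm from Python that the server has come up before continuing, the minimal sketch below polls Triton’s liveness endpoint. It assumes the default HTTP port 8000 and uses the requests library, which is not otherwise part of this example.

import time
import requests

# Poll Triton's /v2/health/live endpoint until the server responds.
# Assumes the server exposes HTTP on the default port 8000.
for _ in range(30):
    try:
        if requests.get("http://localhost:8000/v2/health/live").status_code == 200:
            print("Triton is live.")
            break
    except requests.exceptions.ConnectionError:
        pass
    time.sleep(1)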

# External dependencies
import os
from time import time

# Get dataframe library - cudf or pandas
from merlin.core.dispatch import get_lib
df_lib = get_lib()

import tritonclient.grpc as grpcclient
import merlin.systems.triton as merlin_triton

import merlin.dtypes as md
from merlin.schema import ColumnSchema, Schema

We define our base directory, which contains the data.

# path to preprocessed data
INPUT_DATA_DIR = os.environ.get(
    "INPUT_DATA_DIR", os.path.expanduser("/workspace/nvt-examples/movielens/data/")
)

Let’s suppress warnings before sending requests.

import warnings

warnings.filterwarnings("ignore")

Loading the TensorFlow Model with the Triton Inference Server#

At this stage, you should have launched the Triton Inference Server Docker container using the instructions above.

Let’s connect to the Triton Inference Server. We use Triton’s health endpoints to verify that the server and the models are ready for inference. If Triton is not running on the same machine, replace localhost with your host IP address.

import tritonhttpclient

try:
    triton_client = tritonhttpclient.InferenceServerClient(url="localhost:8000", verbose=True)
    print("client created.")
except Exception as e:
    print("channel creation failed: " + str(e))
client created.
/usr/local/lib/python3.8/dist-packages/tritonhttpclient/__init__.py:31: DeprecationWarning: The package `tritonhttpclient` is deprecated and will be removed in a future version. Please use instead `tritonclient.http`
  warnings.warn(
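
As the deprecation warning suggests, the same client can also be created from the tritonclient.http package; a minimal sketch, assuming the default HTTP port used above:

import tritonclient.http as httpclient

# Interchangeable with the tritonhttpclient client created above.
triton_client = httpclient.InferenceServerClient(url="localhost:8000", verbose=True)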

We check if the server is alive.

triton_client.is_server_live()
GET /v2/health/live, headers None
<HTTPSocketPoolResponse status=200 headers={'content-length': '0', 'content-type': 'text/plain'}>
True

The HTTP request to the live endpoint returns status code 200 if Triton is live and a non-200 status code if it is not.
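
Liveness and readiness are separate checks: a server can be live while it is still initializing. A minimal sketch of the corresponding readiness check with the same client:

# The server may be live but not yet ready to accept inference requests.
if triton_client.is_server_ready():
    print("Triton is ready to serve inference requests.")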

We check the models available in the model repository:

triton_client.get_model_repository_index()
POST /v2/repository/index, headers None

<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '64'}>
bytearray(b'[{"name":"0_predicttensorflowtriton"},{"name":"ensemble_model"}]')
[{'name': '0_predicttensorflowtriton'}, {'name': 'ensemble_model'}]

We load the 0_predicttensorflowtriton model.

%%time

triton_client.load_model(model_name="0_predicttensorflowtriton")
POST /v2/repository/models/0_predicttensorflowtriton/load, headers None
{}
<HTTPSocketPoolResponse status=200 headers={'content-type': 'application/json', 'content-length': '0'}>
Loaded model '0_predicttensorflowtriton'
CPU times: user 0 ns, sys: 1.86 ms, total: 1.86 ms
Wall time: 623 ms
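
To verify that the load succeeded and to inspect the inputs and outputs the model expects, a minimal sketch using the same HTTP client (which returns model metadata as a plain dictionary):

# Confirm the model is ready and inspect its input/output signature.
print(triton_client.is_model_ready("0_predicttensorflowtriton"))

metadata = triton_client.get_model_metadata("0_predicttensorflowtriton")
print(metadata["inputs"])
print(metadata["outputs"])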

Predicting#

Let’s now craft a request and obtain a response from the Triton Inference Server.

We will use the userId and movieId columns from the first three rows of the validation set as input. This data comes from the transformed dataset that we used to train our model.

valid = df_lib.read_parquet(
    os.path.join(INPUT_DATA_DIR, "valid.parquet"), columns=["userId", "movieId"]
)
batch = valid[:3]
print(batch)
          userId  movieId
9098613    59266      553
2556225    17037      608
20514728  133412    40583

We now build and send the request.

request_schema = Schema([
    ColumnSchema("userId", dtype=md.int64),
    ColumnSchema("movieId", dtype=md.int64),
])

inputs = merlin_triton.convert_df_to_triton_input(request_schema, batch, grpcclient.InferInput)

outputs = [
    grpcclient.InferRequestedOutput(col)
    for col in ["rating/binary_output"]
]

with grpcclient.InferenceServerClient("localhost:8001") as client:
    response = client.infer("0_predicttensorflowtriton", inputs, request_id="1", outputs=outputs)

Let’s decode the response and see what information we receive.

print(response.as_numpy("rating/binary_output"), response.as_numpy("rating/binary_output").shape)
[[0.60931313]
 [0.6242866 ]
 [0.6068166 ]] (3, 1)

The returned scores reflect the probability that the user identified by userId will give a high rating to the movie referenced in the movieId column.
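
To turn these probabilities into binary predictions and attach them to the input batch, a minimal sketch (the 0.5 threshold is an arbitrary choice for illustration):

scores = response.as_numpy("rating/binary_output").ravel()

# Attach the raw scores and thresholded predictions to the input dataframe.
batch = batch.copy()
batch["rating_score"] = scores
batch["rating_pred"] = scores > 0.5
print(batch)

Because the server was started with --model-control-mode=explicit, you can also unload the model once you are done with triton_client.unload_model(model_name="0_predicttensorflowtriton").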