# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Each user is responsible for checking the content of datasets and the
# applicable licenses and determining if suitable for the intended use.
![NVIDIA Logo](http://developer.download.nvidia.com/notebooks/dlsw-notebooks/merlin_hugectr_multi-modal-data-04-feature-extraction-text/nvidia_logo.png)

Movie Synopsis Feature Extraction with BART text summarization

In this notebook, we will make use of the BART model to extract features from movie synopses.

Note: this notebook should be executed from within the below container:

docker pull huggingface/transformers-pytorch-gpu
docker run --gpus=all  --rm -it --net=host -v $PWD:/workspace --ipc=host huggingface/transformers-pytorch-gpu 

Then from within the container:

cd /workspace
pip install jupyter jupyterlab
jupyter server extension disable nbclassic
jupyter-lab --allow-root --ip='0.0.0.0' --NotebookApp.token='admin'

First, we install some extra packages.

!pip install imdbpy

# Cuda 11 and A100 support
!pip3 install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
# Restart the kernel so the freshly pip-installed packages above are picked up
# by the Python process. (do_shutdown(True) asks the kernel to shut down and
# restart.)
import IPython

IPython.Application.instance().kernel.do_shutdown(True)

Download pretrained BART model

First, we download a pretrained BART model from HuggingFace library.

from transformers import BartTokenizer, BartModel
import torch

# Hugging Face model id, shared by tokenizer and model so the two can never
# drift out of sync.
BART_MODEL_NAME = 'facebook/bart-large'

# Load the pretrained BART tokenizer and model; the model is moved to the GPU
# for the batch feature-extraction pass below.
tokenizer = BartTokenizer.from_pretrained(BART_MODEL_NAME)
model = BartModel.from_pretrained(BART_MODEL_NAME).cuda()

Extracting embeddings for all movies' synopses

We will use the average hidden state of the last decoder layer as text feature, comprising 1024 float values.

import pickle

with open('movies_info.pkl', 'rb') as f:
    # movies_infos maps a movie key -> metadata dict; the code below reads its
    # 'synopsis' and 'plot' entries — TODO confirm schema against the notebook
    # that produced movies_info.pkl.
    movies_infos = pickle.load(f)['movies_infos']
import torch
import numpy as np
from tqdm import tqdm

# movie key -> per-token hidden states of the last decoder layer,
# shape (1, seq_len, 1024), as a NumPy array on the CPU.
embeddings = {}
for movie, movie_info in tqdm(movies_infos.items()):
    # Prefer the full synopsis; fall back to the first plot summary when no
    # synopsis is available.
    synopsis = movie_info.get('synopsis')
    if synopsis is None:
        plots = movie_info.get('plot')
        if plots:  # truthiness also guards against an empty plot list
            synopsis = plots[0]

    if synopsis is not None:
        # BART accepts at most 1024 tokens; truncate longer synopses.
        inputs = tokenizer(synopsis, return_tensors="pt", truncation=True, max_length=1024).to('cuda')
        with torch.no_grad():
            # Only last_hidden_state is consumed, so we do not request the
            # full stack of hidden states (saves memory per forward pass).
            outputs = model(**inputs)
        # no_grad() already detaches the graph, so .cpu().numpy() suffices.
        embeddings[movie] = outputs.last_hidden_state.cpu().numpy()
100%|██████████| 62423/62423 [43:41<00:00, 23.81it/s]  
# Average the per-token embeddings over the sequence axis, yielding one
# 1024-dim feature vector per movie (squeeze drops the leading batch dim of 1).
average_embeddings = {
    movie: np.mean(emb.squeeze(), axis=0)
    for movie, emb in embeddings.items()
}

# Persist the averaged features for downstream training notebooks.
with open('movies_synopsis_embeddings-1024.pkl', 'wb') as f:
    pickle.dump({"embeddings": average_embeddings}, f, protocol=pickle.HIGHEST_PROTOCOL)