# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Each user is responsible for checking the content of datasets and the
# applicable licenses and determining if suitable for the intended use.
http://developer.download.nvidia.com/notebooks/dlsw-notebooks/merlin_hugectr_multi-modal-data-05-create-feature-store/nvidia_logo.png

Creating Multi-Modal Movie Feature Store

Finally, with both the text and image features ready, we now put the multi-modal movie features into a unified feature store.

If you have downloaded the real data and proceeded through the feature extraction process in notebooks 03-05, then proceed to create the feature store. Otherwise, skip to the Synthetic data section below to create random features.

Real data

import pickle

# Read the pickled poster features and keep only the "feature_dict" entry.
# NOTE: unpickling can run code embedded in the file, so only load pickles
# that you produced yourself or otherwise trust.
with open('movies_poster_features.pkl', mode='rb') as poster_file:
    poster_feature = pickle.load(poster_file)["feature_dict"]
    
len(poster_feature)
61947
# Read the pickled synopsis embeddings and keep only the "embeddings" entry.
# NOTE: unpickling can run code embedded in the file, so only load pickles
# that you produced yourself or otherwise trust.
with open('movies_synopsis_embeddings-1024.pkl', mode='rb') as embeddings_file:
    text_feature = pickle.load(embeddings_file)["embeddings"]
len(text_feature)
61291
import pandas as pd
# Read imdbId as a string so that ids such as "0114709" keep their leading
# zeros; the poster features are looked up by that string form.
links = pd.read_csv(
    "./data/ml-25m/links.csv",
    dtype={"imdbId": str},
)
links.shape
(62423, 3)
links.head()
movieId imdbId tmdbId
0 1 0114709 862.0
1 2 0113497 8844.0
2 3 0113228 15602.0
3 4 0114885 31357.0
4 5 0113041 11862.0
poster_feature['0105812'].shape
(2048,)
import numpy as np
# Row layout: column 0 holds movieId, columns 1..2048 hold the poster feature
# vector and columns 2049..3072 hold the text embedding. A movie with no
# poster or text feature keeps zeros in the corresponding slice.
feature_array = np.zeros((len(links), 1+2048+1024))

# Walk the two needed columns by position instead of using iterrows():
# iterrows() builds a Series per row (slow, and it does not preserve column
# dtypes) and yields index labels, which only coincide with array positions
# when the frame has a default RangeIndex.
for i, (movie_id, imdb_id) in enumerate(zip(links['movieId'], links['imdbId'])):
    feature_array[i, 0] = movie_id
    if imdb_id in poster_feature:
        feature_array[i, 1:2049] = poster_feature[imdb_id]
    if movie_id in text_feature:
        feature_array[i, 2049:] = text_feature[movie_id]
    
# Intended column dtypes: an int64 movieId plus float32 for each of the 2048
# poster and 1024 text feature columns (same key order as the table columns).
# NOTE(review): none of the visible cells passes this mapping to pandas, so
# the feature table appears to keep its float64 values -- confirm whether an
# astype(dtype) was intended.
dtype = {'movieId': np.int64}
dtype.update({'poster_feature_%d' % k: np.float32 for k in range(2048)})
dtype.update({'text_feature_%d' % k: np.float32 for k in range(1024)})
len(dtype)
3073
# Column names follow the layout of feature_array: movieId first, then the
# 2048 poster feature columns, then the 1024 text feature columns.
feature_columns = ['movieId']
feature_columns += ['poster_feature_%d' % k for k in range(2048)]
feature_columns += ['text_feature_%d' % k for k in range(1024)]
feature_df = pd.DataFrame(feature_array, columns=feature_columns)
feature_df.head()
movieId poster_feature_0 poster_feature_1 poster_feature_2 poster_feature_3 poster_feature_4 poster_feature_5 poster_feature_6 poster_feature_7 poster_feature_8 ... text_feature_1014 text_feature_1015 text_feature_1016 text_feature_1017 text_feature_1018 text_feature_1019 text_feature_1020 text_feature_1021 text_feature_1022 text_feature_1023
0 1.0 0.000000 0.088281 0.036760 0.000000 0.006470 0.000000 0.023553 0.000163 0.238797 ... 0.291230 -0.197272 0.024294 1.307049 -0.789571 0.084938 -0.187339 0.061683 0.183281 -0.356245
1 2.0 0.000000 0.000000 0.000000 0.289105 0.134672 0.691380 0.045417 0.000000 0.051422 ... 0.203168 -0.617449 0.443821 1.501953 -0.736949 0.180542 -0.313696 0.274087 0.153105 -0.218745
2 3.0 0.000000 0.187553 0.000000 0.904370 0.069441 0.026665 0.817211 0.000000 0.125072 ... 0.173140 -0.209240 0.451933 1.491917 -0.743956 -0.069061 -0.900011 0.583347 0.192817 0.224088
3 4.0 0.182279 0.014646 0.004135 0.197796 0.077938 0.000000 0.215127 0.021160 0.023108 ... -0.394012 0.679462 1.225475 1.196255 -0.169627 -0.008575 -0.172138 0.114755 -0.127861 -0.003679
4 5.0 0.000000 0.082123 0.447287 0.002375 0.135956 0.000000 0.989514 0.808180 0.317510 ... -0.176658 -0.078992 0.726118 1.017430 -0.249834 0.183357 -0.071451 0.644567 0.090399 -1.147284

5 rows × 3073 columns

feature_df.shape
(62423, 3073)
!pip install pyarrow
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Requirement already satisfied: pyarrow in /usr/local/lib/python3.8/dist-packages (1.0.1)
Requirement already satisfied: numpy>=1.14 in /usr/local/lib/python3.8/dist-packages (from pyarrow) (1.20.3)
WARNING: You are using pip version 21.0.1; however, version 21.1.2 is available.
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.
# Write the combined feature table to a Parquet file in the working directory.
# pandas needs a Parquet engine for this, hence the pyarrow install above.
feature_df.to_parquet('feature_df.parquet')

Synthetic data

If you have not extracted image and text features from real data, proceed with this section to create synthetic features.

import pandas as pd
# Read imdbId as a string so that leading zeros in the ids are not dropped.
links = pd.read_csv(
    "./data/ml-25m/links.csv",
    dtype={"imdbId": str},
)
import numpy as np

# Column names: movieId followed by 2048 poster and 1024 text feature columns
# (3073 in total, matching the width of the random array below).
synthetic_columns = ['movieId']
synthetic_columns.extend('poster_feature_%d' % k for k in range(2048))
synthetic_columns.extend('text_feature_%d' % k for k in range(1024))

# Fill every cell with uniform random values in [0, 1), then overwrite the
# first column with the movie ids from links.csv.
num_movies = links.shape[0]
feature_array = np.random.rand(num_movies, 3073)
feature_array[:, 0] = links['movieId'].values

feature_df = pd.DataFrame(feature_array, columns=synthetic_columns)
feature_df.to_parquet('feature_df.parquet')
feature_df.head()
movieId poster_feature_0 poster_feature_1 poster_feature_2 poster_feature_3 poster_feature_4 poster_feature_5 poster_feature_6 poster_feature_7 poster_feature_8 ... text_feature_1014 text_feature_1015 text_feature_1016 text_feature_1017 text_feature_1018 text_feature_1019 text_feature_1020 text_feature_1021 text_feature_1022 text_feature_1023
0 1.0 0.026260 0.857608 0.410247 0.066654 0.382803 0.899998 0.511562 0.592291 0.565434 ... 0.636716 0.578369 0.996169 0.402107 0.412318 0.859952 0.293852 0.341114 0.727113 0.085829
1 2.0 0.141265 0.721758 0.679958 0.955634 0.391091 0.324611 0.505211 0.258331 0.048264 ... 0.161505 0.431864 0.836532 0.525013 0.654566 0.823841 0.818313 0.856280 0.638048 0.685537
2 3.0 0.119418 0.911146 0.470762 0.762258 0.626335 0.768947 0.241833 0.775992 0.236340 ... 0.865548 0.387806 0.668321 0.552122 0.750238 0.863707 0.382173 0.894487 0.565142 0.164083
3 4.0 0.538184 0.980678 0.643513 0.928519 0.794906 0.201022 0.744666 0.962188 0.915320 ... 0.777534 0.904200 0.167337 0.875194 0.180481 0.815904 0.808288 0.036711 0.902779 0.580946
4 5.0 0.772951 0.239788 0.061874 0.162997 0.388310 0.236311 0.162757 0.207134 0.111078 ... 0.250022 0.335043 0.091674 0.121507 0.418124 0.150020 0.803506 0.059504 0.002342 0.932321

5 rows × 3073 columns