# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Each user is responsible for checking the content of datasets and the
# applicable licenses and determining if suitable for the intended use.
Creating Multi-Modal Movie Feature Store
Finally, with both the text and image features ready, we now put the multi-modal movie features into a unified feature store.
If you have downloaded the real data and completed the feature extraction process in notebooks 03-05, proceed to create the feature store from the real data below. Otherwise, skip to the Synthetic data section further down to create random features instead.
Real data
import pickle
# Load the pickled poster features and keep only the "feature_dict" entry.
# The lookups later in this notebook index it by imdbId string.
# NOTE(review): presumably this pickle is the output of the earlier
# image feature-extraction notebook -- only unpickle files you trust.
with open('movies_poster_features.pkl', 'rb') as poster_file:
    poster_payload = pickle.load(poster_file)
poster_feature = poster_payload["feature_dict"]
len(poster_feature)
61947
# Load the pickled synopsis embeddings and keep only the "embeddings" entry.
# The lookups later in this notebook index it by movieId.
# NOTE(review): presumably this pickle is the output of the earlier
# text feature-extraction notebook -- only unpickle files you trust.
with open('movies_synopsis_embeddings-1024.pkl', 'rb') as synopsis_file:
    synopsis_payload = pickle.load(synopsis_file)
text_feature = synopsis_payload["embeddings"]
len(text_feature)
61291
import pandas as pd
# Read the links table. imdbId is forced to str so that leading zeros
# (e.g. "0114709") survive and the ids can be used as string lookup keys.
links_csv_path = "./data/ml-25m/links.csv"
links = pd.read_csv(links_csv_path, dtype=dict(imdbId=str))
links.shape
(62423, 3)
# Preview the first rows of the links table (movieId, imdbId, tmdbId).
links.head()
movieId | imdbId | tmdbId | |
---|---|---|---|
0 | 1 | 0114709 | 862.0 |
1 | 2 | 0113497 | 8844.0 |
2 | 3 | 0113228 | 15602.0 |
3 | 4 | 0114885 | 31357.0 |
4 | 5 | 0113041 | 11862.0 |
# Sanity-check one poster feature vector; the key is a zero-padded id string
# in the same format as the imdbId column (presumably an imdbId -- confirm).
poster_feature['0105812'].shape
(2048,)
import numpy as np
# Build the dense multi-modal feature matrix, one row per entry in `links`.
# Row layout: [movieId | 2048 poster features | 1024 text features].
POSTER_DIM = 2048
TEXT_DIM = 1024
TEXT_START = 1 + POSTER_DIM  # first column of the text-feature slice

feature_array = np.zeros((len(links), 1 + POSTER_DIM + TEXT_DIM))

# Walk the two needed columns positionally rather than with
# DataFrame.iterrows(): iterrows builds a Series per row (slow) and yields
# index *labels*, which are only valid array positions for a default
# RangeIndex. enumerate() always yields the positional row number.
for i, (movie_id, imdb_id) in enumerate(zip(links['movieId'], links['imdbId'])):
    feature_array[i, 0] = movie_id
    # Movies without an extracted poster / synopsis feature keep zeros.
    if imdb_id in poster_feature:
        feature_array[i, 1:TEXT_START] = poster_feature[imdb_id]
    if movie_id in text_feature:
        feature_array[i, TEXT_START:] = text_feature[movie_id]

# Intended per-column dtypes (int64 id, float32 features).
# NOTE(review): this mapping is only built and counted here; the visible code
# never applies it to the DataFrame -- confirm whether that is intentional.
dtype = {
    'movieId': np.int64,
    **{'poster_feature_%d' % k: np.float32 for k in range(POSTER_DIM)},
    **{'text_feature_%d' % k: np.float32 for k in range(TEXT_DIM)},
}
len(dtype)
3073
# Wrap the feature matrix in a DataFrame whose column names mirror the row
# layout: the movie id, then 2048 poster features, then 1024 text features.
# No dtype conversion is applied; values are stored as they are in the array.
poster_columns = ['poster_feature_%d' % k for k in range(2048)]
text_columns = ['text_feature_%d' % k for k in range(1024)]
feature_df = pd.DataFrame(
    feature_array,
    columns=['movieId', *poster_columns, *text_columns],
)
feature_df.head()
movieId | poster_feature_0 | poster_feature_1 | poster_feature_2 | poster_feature_3 | poster_feature_4 | poster_feature_5 | poster_feature_6 | poster_feature_7 | poster_feature_8 | ... | text_feature_1014 | text_feature_1015 | text_feature_1016 | text_feature_1017 | text_feature_1018 | text_feature_1019 | text_feature_1020 | text_feature_1021 | text_feature_1022 | text_feature_1023 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 0.000000 | 0.088281 | 0.036760 | 0.000000 | 0.006470 | 0.000000 | 0.023553 | 0.000163 | 0.238797 | ... | 0.291230 | -0.197272 | 0.024294 | 1.307049 | -0.789571 | 0.084938 | -0.187339 | 0.061683 | 0.183281 | -0.356245 |
1 | 2.0 | 0.000000 | 0.000000 | 0.000000 | 0.289105 | 0.134672 | 0.691380 | 0.045417 | 0.000000 | 0.051422 | ... | 0.203168 | -0.617449 | 0.443821 | 1.501953 | -0.736949 | 0.180542 | -0.313696 | 0.274087 | 0.153105 | -0.218745 |
2 | 3.0 | 0.000000 | 0.187553 | 0.000000 | 0.904370 | 0.069441 | 0.026665 | 0.817211 | 0.000000 | 0.125072 | ... | 0.173140 | -0.209240 | 0.451933 | 1.491917 | -0.743956 | -0.069061 | -0.900011 | 0.583347 | 0.192817 | 0.224088 |
3 | 4.0 | 0.182279 | 0.014646 | 0.004135 | 0.197796 | 0.077938 | 0.000000 | 0.215127 | 0.021160 | 0.023108 | ... | -0.394012 | 0.679462 | 1.225475 | 1.196255 | -0.169627 | -0.008575 | -0.172138 | 0.114755 | -0.127861 | -0.003679 |
4 | 5.0 | 0.000000 | 0.082123 | 0.447287 | 0.002375 | 0.135956 | 0.000000 | 0.989514 | 0.808180 | 0.317510 | ... | -0.176658 | -0.078992 | 0.726118 | 1.017430 | -0.249834 | 0.183357 | -0.071451 | 0.644567 | 0.090399 | -1.147284 |
5 rows × 3073 columns
# Check the table dimensions (rows, columns) before writing it out.
feature_df.shape
(62423, 3073)
# Install pyarrow via the notebook shell escape; presumably needed as the
# Parquet engine for the to_parquet call that follows.
!pip install pyarrow
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Requirement already satisfied: pyarrow in /usr/local/lib/python3.8/dist-packages (1.0.1)
Requirement already satisfied: numpy>=1.14 in /usr/local/lib/python3.8/dist-packages (from pyarrow) (1.20.3)
WARNING: You are using pip version 21.0.1; however, version 21.1.2 is available.
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.
# Persist the feature table as a Parquet file in the current working directory.
feature_df.to_parquet('feature_df.parquet')
Synthetic data
If you have not extracted image and text features from the real data, proceed with this section to create synthetic (random) features instead.
import pandas as pd
links = pd.read_csv("./data/ml-25m/links.csv", dtype=dict(imdbId=str))
import numpy as np

# Synthetic stand-in for the real features: uniform random values in [0, 1)
# with the same layout as the real table (movieId + 2048 poster + 1024 text
# columns = 3073 columns), one row per entry in `links`.
num_movies = links.shape[0]
feature_array = np.random.rand(num_movies, 3073)
# Overwrite the first column with the actual movie ids.
feature_array[:, 0] = links['movieId'].values

poster_columns = ['poster_feature_%d' % k for k in range(2048)]
text_columns = ['text_feature_%d' % k for k in range(1024)]
feature_df = pd.DataFrame(
    feature_array,
    columns=['movieId', *poster_columns, *text_columns],
)
# Write to the same file name that the real-data path uses.
feature_df.to_parquet('feature_df.parquet')
feature_df.head()
movieId | poster_feature_0 | poster_feature_1 | poster_feature_2 | poster_feature_3 | poster_feature_4 | poster_feature_5 | poster_feature_6 | poster_feature_7 | poster_feature_8 | ... | text_feature_1014 | text_feature_1015 | text_feature_1016 | text_feature_1017 | text_feature_1018 | text_feature_1019 | text_feature_1020 | text_feature_1021 | text_feature_1022 | text_feature_1023 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 0.026260 | 0.857608 | 0.410247 | 0.066654 | 0.382803 | 0.899998 | 0.511562 | 0.592291 | 0.565434 | ... | 0.636716 | 0.578369 | 0.996169 | 0.402107 | 0.412318 | 0.859952 | 0.293852 | 0.341114 | 0.727113 | 0.085829 |
1 | 2.0 | 0.141265 | 0.721758 | 0.679958 | 0.955634 | 0.391091 | 0.324611 | 0.505211 | 0.258331 | 0.048264 | ... | 0.161505 | 0.431864 | 0.836532 | 0.525013 | 0.654566 | 0.823841 | 0.818313 | 0.856280 | 0.638048 | 0.685537 |
2 | 3.0 | 0.119418 | 0.911146 | 0.470762 | 0.762258 | 0.626335 | 0.768947 | 0.241833 | 0.775992 | 0.236340 | ... | 0.865548 | 0.387806 | 0.668321 | 0.552122 | 0.750238 | 0.863707 | 0.382173 | 0.894487 | 0.565142 | 0.164083 |
3 | 4.0 | 0.538184 | 0.980678 | 0.643513 | 0.928519 | 0.794906 | 0.201022 | 0.744666 | 0.962188 | 0.915320 | ... | 0.777534 | 0.904200 | 0.167337 | 0.875194 | 0.180481 | 0.815904 | 0.808288 | 0.036711 | 0.902779 | 0.580946 |
4 | 5.0 | 0.772951 | 0.239788 | 0.061874 | 0.162997 | 0.388310 | 0.236311 | 0.162757 | 0.207134 | 0.111078 | ... | 0.250022 | 0.335043 | 0.091674 | 0.121507 | 0.418124 | 0.150020 | 0.803506 | 0.059504 | 0.002342 | 0.932321 |
5 rows × 3073 columns