#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Optional
import tensorflow as tf
from tensorflow.python.ops import embedding_ops
from merlin.models.tf.blocks.sampling.base import EmbeddingWithMetadata, ItemSampler
from merlin.models.tf.typing import TabularData


@tf.keras.utils.register_keras_serializable(package="merlin.models")
class PopularityBasedSampler(ItemSampler):
"""
    Provides popularity-based negative sampling for the softmax layer,
    to ensure training efficiency when the catalog of items is very large.
    For each batch, `max_num_samples` unique negatives are sampled.
    This implementation does not require the actual item frequency/probability
    distribution, but instead approximates the item probabilities using the
    log-uniform (zipfian) distribution. The only requirement is that the item
    ids are sorted in decreasing order of their frequency.
    We use the default log-uniform (zipfian) sampler provided by TensorFlow:
    [log_uniform_candidate_sampler](https://www.tensorflow.org/api_docs/python/tf/random/log_uniform_candidate_sampler).
    The `Categorify` op provided by NVTabular applies this frequency-based
    encoding by default.
    P.s. Ignoring false negatives (negative items equal to the positive ones) is
    managed by `ItemRetrievalScorer(..., sampling_downscore_false_negatives=True)`.

    Parameters
    ----------
    max_num_samples: int
        The number of unique negatives to sample for each batch.
    max_id: int
        The maximum id value to be sampled. It should be equal to the
        cardinality of the categorical item-id feature.
    min_id: int
        The minimum id value to be sampled. Useful to ignore the first categorical
        encoded ids, which are usually reserved for <nulls>, out-of-vocabulary or padding.
        Defaults to 0.
    seed: int
        Fixes the random values returned by the sampler to ensure reproducibility.
        Defaults to None.
    item_id_feature_name: str
        Name of the column containing the item ids.
        Defaults to `item_id`.
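
    Examples
    --------
    A minimal construction sketch; the cardinality (1000) and the reserved
    id 0 below are illustrative assumptions:

    >>> sampler = PopularityBasedSampler(
    ...     max_id=999,  # cardinality of the item-id feature
    ...     min_id=1,  # assumes id 0 is reserved for padding/nulls
    ...     max_num_samples=20,
    ...     seed=42,
    ... )
    >>> probs = sampler.get_distribution_probs()  # approximate sampling probabilities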
"""

    def __init__(
self,
max_id: int,
min_id: int = 0,
max_num_samples: int = 100,
seed: Optional[int] = None,
item_id_feature_name: str = "item_id",
**kwargs,
):
super().__init__(max_num_samples=max_num_samples, **kwargs)
self.max_id = max_id
self.min_id = min_id
self.seed = seed
self.item_id_feature_name = item_id_feature_name
        assert self.max_num_samples <= self.max_id, (
            f"Number of items to sample `{self.max_num_samples}` "
            f"should be less than the total number of ids `{self.max_id}`"
        )

    def _check_inputs(self, inputs):
assert (
self.item_id_feature_name in inputs["metadata"]
), "The 'item_id' metadata feature is required by PopularityBasedSampler."

    def add(self, embeddings: tf.Tensor, items_metadata: TabularData, training=True):
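        # No-op: this sampler does not cache item embeddings across batches;
        # negatives are drawn from the full catalog at sampling time.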
pass

    def call(
self, inputs: TabularData, item_weights: tf.Tensor, training=True
) -> EmbeddingWithMetadata:
if training:
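            # Input validation is only performed in training mode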
self._check_inputs(inputs)
            tf.assert_equal(
                tf.shape(item_weights)[0],
                self.max_id + 1,
                "The first dimension of the item embeddings and "
                f"the number of possible classes ({self.max_id + 1}) should match.",
            )
items_embeddings = self.sample(item_weights)
return items_embeddings

    def _required_features(self):
return [self.item_id_feature_name]

    def sample(self, item_weights) -> EmbeddingWithMetadata:  # type: ignore
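        # `true_classes` is required by the TF sampler API but is not used here,
        # so a single dummy positive id is provided.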
sampled_ids, _, _ = tf.random.log_uniform_candidate_sampler(
true_classes=tf.ones((1, 1), dtype=tf.int64),
num_true=1,
num_sampled=self.max_num_samples,
unique=True,
range_max=self.max_id - self.min_id,
seed=self.seed,
)
# Shifting the sampled ids to ignore the first ids (usually reserved for nulls, OOV)
sampled_ids += self.min_id
items_embeddings = embedding_ops.embedding_lookup(item_weights, sampled_ids)
return EmbeddingWithMetadata(
items_embeddings,
metadata={self.item_id_feature_name: tf.cast(sampled_ids, tf.int32)},
)

    def get_distribution_probs(self):
"""Tries to approximate the log uniform (zipfian) distribution
used by tf.random.log_uniform_candidate_sampler
(https://www.tensorflow.org/api_docs/python/tf/random/log_uniform_candidate_sampler)
Returns
-------
tf.Tensor
A tensor with the expected probability distribution of item ids
assuming log-uniform (zipfian) distribution
"""
range_max = self.max_id - self.min_id
ids = tf.range(0, range_max, dtype=tf.float32)
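        # From the TF docs: P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)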
estimated_probs = (tf.math.log(ids + 2.0) - tf.math.log(ids + 1.0)) / tf.math.log(
range_max + 1.0
)
        # Prepending zero(s), as padding items should never be sampled
        # (thus their probability must be zero)
estimated_probs = tf.concat(
[tf.zeros(self.min_id + 1, dtype=tf.float32), estimated_probs], axis=0
)
return estimated_probs

    def get_config(self):
config = super().get_config()
config["max_id"] = self.max_id
config["min_id"] = self.min_id
config["max_num_samples"] = self.max_num_samples
config["seed"] = self.seed
config["item_id_feature_name"] = self.item_id_feature_name
return config