Source code for merlin.models.tf.blocks.sampling.cross_batch

#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Optional

import tensorflow as tf
from tensorflow.python.ops import embedding_ops

from merlin.models.tf.blocks.sampling.base import EmbeddingWithMetadata, ItemSampler
from merlin.models.tf.typing import TabularData


@tf.keras.utils.register_keras_serializable(package="merlin.models")
class PopularityBasedSampler(ItemSampler):
    """
    Provides popularity-based negative sampling for the softmax layer
    to ensure training efficiency when the catalog of items is very large.
    The capacity of the queue is fixed and is equal to the catalog size.
    For each batch, we sample `max_num_samples` unique negatives.

    This implementation does not require the actual item frequency/probability
    distribution, but instead approximates the item probabilities using the
    log-uniform (zipfian) distribution. The only requirement is that the item
    ids are sorted in decreasing order of their count frequency.

    We use the default log-uniform (zipfian) sampler provided by TensorFlow:
    [log_uniform_candidate_sampler](https://www.tensorflow.org/api_docs/python/tf/random/log_uniform_candidate_sampler)

    The `Categorify` op provided by NVTabular uses frequency-based encoding by default.

    Note: Ignoring false negatives (negative items equal to the positive ones)
    is managed by `ItemRetrievalScorer(..., sampling_downscore_false_negatives=True)`.

    Parameters
    ----------
    max_num_samples : int
        The number of unique negatives to sample at each batch.
    max_id : int
        The maximum id value to be sampled. It should be equal to the
        categorical feature cardinality.
    min_id : int
        The minimum id value to be sampled. Useful to ignore the first
        categorical encoded ids, which are usually reserved for <nulls>,
        out-of-vocabulary values or padding. Defaults to 0.
    seed : int
        Fix the random values returned by the sampler to ensure reproducibility.
        Defaults to None.
    item_id_feature_name : str
        Name of the column containing the item ids. Defaults to `item_id`.
    """
    def __init__(
        self,
        max_id: int,
        min_id: int = 0,
        max_num_samples: int = 100,
        seed: Optional[int] = None,
        item_id_feature_name: str = "item_id",
        **kwargs,
    ):
        super().__init__(max_num_samples=max_num_samples, **kwargs)
        self.max_id = max_id
        self.min_id = min_id
        self.seed = seed
        self.item_id_feature_name = item_id_feature_name

        assert self.max_num_samples <= self.max_id, (
            f"Number of items to sample `{self.max_num_samples}` "
            f"should not be larger than the total number of ids `{self.max_id}`"
        )
    def _check_inputs(self, inputs):
        assert self.item_id_feature_name in inputs["metadata"], (
            "The 'item_id' metadata feature is required by PopularityBasedSampler."
        )
    def add(self, embeddings: tf.Tensor, items_metadata: TabularData, training=True):
        # No-op: this sampler keeps no queue of previously seen items, since
        # negatives are drawn directly from the log-uniform distribution.
        pass
    def call(
        self, inputs: TabularData, item_weights: tf.Tensor, training=True
    ) -> EmbeddingWithMetadata:
        if training:
            self._check_inputs(inputs)

        tf.assert_equal(
            int(tf.shape(item_weights)[0]),
            self.max_id + 1,
            "The first dimension of the items embeddings "
            f"({int(tf.shape(item_weights)[0])}) and "
            f"the number of possible classes ({self.max_id + 1}) should match.",
        )

        items_embeddings = self.sample(item_weights)
        return items_embeddings
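    # Illustrative note (an assumption, not in the original source): here
    # `item_weights` is expected to be the full item-embedding table, with one
    # row per id in [0, max_id], e.g. the weights of the item-id embedding layer.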
    def _required_features(self):
        return [self.item_id_feature_name]
    def sample(self, item_weights) -> EmbeddingWithMetadata:  # type: ignore
        sampled_ids, _, _ = tf.random.log_uniform_candidate_sampler(
            true_classes=tf.ones((1, 1), dtype=tf.int64),
            num_true=1,
            num_sampled=self.max_num_samples,
            unique=True,
            range_max=self.max_id - self.min_id,
            seed=self.seed,
        )

        # Shift the sampled ids to skip the first ids (usually reserved for
        # nulls and out-of-vocabulary values)
        sampled_ids += self.min_id

        items_embeddings = embedding_ops.embedding_lookup(item_weights, sampled_ids)

        return EmbeddingWithMetadata(
            items_embeddings,
            metadata={self.item_id_feature_name: tf.cast(sampled_ids, tf.int32)},
        )
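    # Illustrative note (not in the original source): since
    # `range_max = max_id - min_id`, the candidate sampler draws unique ids in
    # [0, max_id - min_id) and the shift above maps them to [min_id, max_id),
    # so reserved ids below `min_id` are never returned.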
    def get_distribution_probs(self):
        """Approximates the log-uniform (zipfian) distribution used by
        tf.random.log_uniform_candidate_sampler
        (https://www.tensorflow.org/api_docs/python/tf/random/log_uniform_candidate_sampler)

        Returns
        -------
        tf.Tensor
            A tensor with the expected probability distribution of item ids,
            assuming a log-uniform (zipfian) distribution
        """
        range_max = self.max_id - self.min_id
        ids = tf.range(0, range_max, dtype=tf.float32)
        estimated_probs = (tf.math.log(ids + 2.0) - tf.math.log(ids + 1.0)) / tf.math.log(
            range_max + 1.0
        )
        # Prepend zero(s) so that reserved/padding ids are never sampled
        # (their probability must be zero)
        estimated_probs = tf.concat(
            [tf.zeros(self.min_id + 1, dtype=tf.float32), estimated_probs], axis=0
        )
        return estimated_probs
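    # Worked example (illustrative, not in the original source): with min_id=0
    # and max_id=1000, range_max=1000, so the most frequent id (0) gets
    # probability (log 2 - log 1) / log 1001 ≈ 0.693 / 6.909 ≈ 0.100, while
    # id 999 gets (log 1001 - log 1000) / log 1001 ≈ 0.000145.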
    def get_config(self):
        config = super().get_config()
        config["max_id"] = self.max_id
        config["min_id"] = self.min_id
        config["max_num_samples"] = self.max_num_samples
        config["seed"] = self.seed
        config["item_id_feature_name"] = self.item_id_feature_name
        return config
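
# ---------------------------------------------------------------------------
# A minimal usage sketch (not part of the original module): builds a dummy
# item-embedding table and draws popularity-based negatives from it. Shapes
# and hyper-parameters are illustrative assumptions; `EmbeddingWithMetadata`
# is assumed to expose `embeddings` and `metadata` fields, as constructed in
# `sample` above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    num_items, embedding_dim = 1000, 64

    # One embedding row per item id (0..max_id), as required by the
    # assertion in `call`.
    item_weights = tf.random.uniform((num_items + 1, embedding_dim))

    sampler = PopularityBasedSampler(
        max_id=num_items,
        min_id=1,  # skip id 0, typically reserved for padding/OOV
        max_num_samples=10,
        seed=42,
    )

    negatives = sampler.sample(item_weights)
    print(negatives.embeddings.shape)  # -> (10, 64)
    print(negatives.metadata["item_id"])  # -> 10 sampled item ids (int32)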