#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from typing import Dict, Optional, Sequence, Union
import tensorflow as tf
from keras.utils import tf_inspect
from tensorflow.python.ops import embedding_ops
from merlin.models.tf.blocks.sampling.base import ItemSampler
from merlin.models.tf.core.base import Block, BlockType, EmbeddingWithMetadata, PredictionOutput
from merlin.models.tf.core.combinators import ParallelBlock
from merlin.models.tf.core.tabular import TabularAggregationType
from merlin.models.tf.models.base import ModelBlock
from merlin.models.tf.transforms.regularization import L2Norm
from merlin.models.tf.typing import TabularData
from merlin.models.tf.utils.tf_utils import (
maybe_deserialize_keras_objects,
maybe_serialize_keras_objects,
rescore_false_negatives,
)
from merlin.models.utils.constants import MIN_FLOAT
from merlin.schema import Schema
# Module-level logger, obtained under the fixed name "merlin_models".
LOG = logging.getLogger("merlin_models")
@tf.keras.utils.register_keras_serializable(package="merlin_models")
class TowerBlock(ModelBlock):
    """Thin ``ModelBlock`` subclass used to wrap an item tower or a query tower.

    It adds no behaviour of its own; it only gives wrapped towers a distinct type.
    """
class RetrievalMixin:
    """Accessor interface for the two towers of a retrieval model.

    Both accessors raise ``NotImplementedError`` on the mixin itself.
    """

    def query_block(self) -> TowerBlock:
        """Return the query tower of a RetrievalModel instance."""
        raise NotImplementedError()

    def item_block(self) -> TowerBlock:
        """Return the item tower of a RetrievalModel instance."""
        raise NotImplementedError()
@tf.keras.utils.register_keras_serializable(package="merlin.models")
class DualEncoderBlock(ParallelBlock):
    """Parallel block holding a query tower and an item tower.

    Each tower is wrapped in a ``TowerBlock`` and registered as the ``"query"``
    or ``"item"`` branch of the underlying ``ParallelBlock``.
    """

    def __init__(
        self,
        query_block: Block,
        item_block: Block,
        pre: Optional[BlockType] = None,
        post: Optional[BlockType] = None,
        aggregation: Optional[TabularAggregationType] = None,
        schema: Optional[Schema] = None,
        name: Optional[str] = None,
        strict: bool = False,
        l2_normalization: bool = False,
        **kwargs,
    ):
        """Prepare the Query and Item towers of a Retrieval block

        Parameters
        ----------
        query_block : Block
            The `Block` instance that combines user features
        item_block : Block
            The `Block` instance that combines items features.
        pre : Optional[BlockType], optional
            Optional `Block` instance to apply before the `call` method of the Two-Tower block
        post : Optional[BlockType], optional
            Optional `Block` instance to apply on both outputs of Two-tower model
        aggregation : Optional[TabularAggregationType], optional
            The Aggregation operation to apply after processing the `call` method
            to output a single Tensor.
        schema : Optional[Schema], optional
            The `Schema` object with the input features.
        name : Optional[str], optional
            Name of the layer.
        strict : bool, optional
            If enabled, check that the input of the ParallelBlock instance is a dictionary.
        l2_normalization : bool
            Apply L2 normalization to the user and item representations before
            computing dot interactions.
            Defaults to False.
        """
        if l2_normalization:
            # Append the normalization to each tower before it gets wrapped.
            query_block = query_block.connect(L2Norm())
            item_block = item_block.connect(L2Norm())

        self._query_block = TowerBlock(query_block)
        self._item_block = TowerBlock(item_block)
        branches = {"query": self._query_block, "item": self._item_block}

        super().__init__(
            branches,
            pre=pre,
            post=post,
            aggregation=aggregation,
            schema=schema,
            name=name,
            strict=strict,
            **kwargs,
        )

    def query_block(self) -> TowerBlock:
        """Return the wrapped query tower."""
        return self._query_block

    def item_block(self) -> TowerBlock:
        """Return the wrapped item tower."""
        return self._item_block

    @classmethod
    def from_config(cls, config, custom_objects=None):
        """Rebuild the block from a config without re-running ``__init__``.

        A plain ``ParallelBlock`` is created from the parsed config and then
        re-classed to ``cls``, so the towers are not wrapped a second time.
        """
        inputs, config = cls.parse_config(config, custom_objects)
        output = ParallelBlock(inputs, **config)
        output.__class__ = cls

        # ``__init__`` is bypassed above, so the tower handles read by
        # ``query_block()`` / ``item_block()`` must be restored here.
        # NOTE(review): assumes ``parse_config`` returns the branches as a dict
        # keyed by branch name ("query" / "item") -- confirm against ParallelBlock.
        if isinstance(inputs, dict):
            if "query" in inputs:
                output._query_block = inputs["query"]
            if "item" in inputs:
                output._item_block = inputs["item"]

        return output
@Block.registry.register_with_multiple_names("item_retrieval_scorer")
@tf.keras.utils.register_keras_serializable(package="merlin_models")
class ItemRetrievalScorer(Block):
    """Block for ItemRetrieval, which expects query/user and item embeddings as input and
    uses dot product to score the positive item (inputs["item"]) and also sampled negative
    items (during training).

    Parameters
    ----------
    samplers : List[ItemSampler], optional
        List of item samplers that provide negative samples when `training=True`
    sampling_downscore_false_negatives : bool, optional
        Identify false negatives (sampled item ids equal to the positive item and downscore them
        to the `sampling_downscore_false_negatives_value`), by default True
    sampling_downscore_false_negatives_value : float, optional
        Value to be used to downscore false negatives when
        `sampling_downscore_false_negatives=True`, by default `MIN_FLOAT`
    item_id_feature_name: str
        Name of the column containing the item ids
        Defaults to `item_id`
    item_domain: str
        Key passed to `self.context.get_embedding` to fetch the item embedding table
        Defaults to `item_id`
    query_name: str
        Identify query tower for query/user embeddings, by default 'query'
    item_name: str
        Identify item tower for item embeddings, by default 'item'
    cache_query: bool
        Add query embeddings to the context block, by default False
    sampled_softmax_mode: bool
        Use sampled softmax for scoring, by default False
    store_negative_ids: bool
        Returns negative items ids as part of the output, by default False
    """

    def __init__(
        self,
        samplers: Sequence[ItemSampler] = (),
        sampling_downscore_false_negatives: bool = True,
        sampling_downscore_false_negatives_value: float = MIN_FLOAT,
        item_id_feature_name: str = "item_id",
        item_domain: str = "item_id",
        query_name: str = "query",
        item_name: str = "item",
        cache_query: bool = False,
        sampled_softmax_mode: bool = False,
        store_negative_ids: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.downscore_false_negatives = sampling_downscore_false_negatives
        self.false_negatives_score = sampling_downscore_false_negatives_value
        self.item_id_feature_name = item_id_feature_name
        self.item_domain = item_domain
        self.query_name = query_name
        self.item_name = item_name
        self.cache_query = cache_query
        self.store_negative_ids = store_negative_ids

        # Accept a single sampler as well as a list/tuple of samplers.
        if not isinstance(samplers, (list, tuple)):
            samplers = (samplers,)  # type: ignore
        self.samplers = samplers
        self.sampled_softmax_mode = sampled_softmax_mode

        # Must run last: it reads the flags and samplers assigned above.
        self.set_required_features()

    def build(self, input_shapes):
        """Create the non-trainable "query" variable in the context, then build the block.

        The variable is only created when `input_shapes` is a dict; it is shaped like
        the query input and initialized with zeros.
        """
        if isinstance(input_shapes, dict):
            query_shape = input_shapes[self.query_name]
            self.context.add_weight(
                name="query",
                shape=query_shape,
                dtype=tf.float32,
                trainable=False,
                initializer=tf.keras.initializers.Zeros(),
            )
        super().build(input_shapes)

    def _check_input_from_two_tower(self, inputs):
        """Raise `ValueError` unless the keys of `inputs` are exactly the query and item names."""
        if set(inputs.keys()) != {self.query_name, self.item_name}:
            raise ValueError(
                f"Wrong input-names, expected: {[self.query_name, self.item_name]} "
                f"but got: {inputs.keys()}"
            )

    def call(
        self,
        inputs: Union[tf.Tensor, TabularData],
        training: bool = True,
        testing: bool = False,
        **kwargs,
    ) -> Union[tf.Tensor, TabularData]:
        """Based on the user/query embedding (inputs[self.query_name]), uses dot product to score
        the positive item (inputs["item"]).
        For the sampled-softmax mode, logits are computed by multiplying the query vector
        and the item embeddings matrix (self.context.get_embedding(self.item_domain))

        Parameters
        ----------
        inputs : Union[tf.Tensor, TabularData]
            Dict with the query and item embeddings (e.g. `{"query": <emb>}, "item": <emb>}`),
            where embeddings are 2D tensors (batch size, embedding size)
        training : bool, optional
            Flag that indicates whether in training mode, by default True
        testing : bool, optional
            Flag that indicates whether in testing mode, by default False

        Returns
        -------
        Union[tf.Tensor, TabularData]
            The original inputs if `training` or `testing` is True. Otherwise the
            sampled-softmax logits (when `sampled_softmax_mode=True`) or the
            dot-product scores of the positive items, reduced over the last axis
            with `keepdims=True`.
        """
        if self.cache_query:
            # enabled only during top-k evaluation
            query = inputs[self.query_name]
            context_query_size = tf.shape(self.context["query"])[0]
            # pad with zeros to match shape of initial query variable
            padding_size = context_query_size - tf.shape(query)[0]
            if padding_size > 0:
                query = tf.pad(query, [[0, padding_size], [0, 0]])
            # truncate in case the batch is larger than the cached variable
            query = query[:context_query_size]
            self.context["query"].assign(tf.cast(query, tf.float32))

        if training or testing:
            # Scoring with negatives is done later, in `call_outputs`.
            return inputs

        if self.sampled_softmax_mode:
            return self._get_logits_for_sampled_softmax(inputs)

        self._check_input_from_two_tower(inputs)
        positive_scores = tf.reduce_sum(
            tf.multiply(inputs[self.query_name], inputs[self.item_name]), keepdims=True, axis=-1
        )
        return positive_scores

    @tf.function
    def call_outputs(
        self,
        outputs: PredictionOutput,
        features: Optional[Dict[str, tf.Tensor]] = None,
        training=True,
        testing=False,
        **kwargs,
    ) -> "PredictionOutput":
        """Based on the user/query embedding, uses dot product to score the positive item
        and also sampled negative items (when `training` or `testing` is True).

        Parameters
        ----------
        outputs : PredictionOutput
            Its `predictions` and `targets` are read. When `training` or `testing` is
            True, `predictions` is indexed with the query and item names (in
            sampled-softmax mode it must be a tensor, and the item vectors are looked
            up from the item embedding table using `targets`).
        features : Dict[str, tf.Tensor], optional
            Input features. Read to obtain the item ids and the features required by
            the samplers when they cannot be taken from `targets`.
        training : bool, optional
            Flag that indicates whether in training mode, by default True.
            It is also forwarded to the samplers.
        testing : bool, optional
            Flag that indicates whether in testing mode, by default False

        Returns
        -------
        PredictionOutput
            predictions: when `training` or `testing` is True, the positive scores
            concatenated with the negative scores along the last axis and cast to
            float32; otherwise the incoming predictions.
            targets: one-hot encoded target ids when `sampled_softmax_mode=True` and
            `training=False`; otherwise ones in the first column and zeros elsewhere.
            It also carries `positive_item_ids`, `valid_negatives_mask` and
            `negative_item_ids`.

        Raises
        ------
        RuntimeError
            If no sampler returned any negative item.
        """
        targets, predictions = outputs.targets, outputs.predictions

        valid_negatives_mask = None
        if self.sampled_softmax_mode or isinstance(targets, tf.Tensor):
            positive_item_ids = targets
        else:
            positive_item_ids = tf.squeeze(features[self.item_id_feature_name])

        neg_items_ids = None
        if training or testing:
            assert (
                len(self.samplers) > 0
            ), "At least one sampler is required by ItemRetrievalScorer for negative sampling"

            if self.sampled_softmax_mode:
                predictions = self._prepare_query_item_vectors_for_sampled_softmax(
                    predictions, targets
                )

            batch_items_embeddings = predictions[self.item_name]
            if self.sampled_softmax_mode:
                batch_items_metadata = {self.item_id_feature_name: positive_item_ids}
            else:
                batch_items_metadata = {
                    feat_name: tf.squeeze(features[feat_name])
                    for feat_name in self._required_features
                }

            positive_scores = tf.reduce_sum(
                tf.multiply(predictions[self.query_name], predictions[self.item_name]),
                keepdims=True,
                axis=-1,
            )

            # Negative ids are needed both for downscoring false negatives and for
            # returning them in the output, so collect them in either case.
            collect_negative_ids = self.downscore_false_negatives or self.store_negative_ids

            neg_items_embeddings_list = []
            neg_items_ids_list = []

            # Adds items from the current batch into samplers and sample a number of negatives
            for sampler in self.samplers:
                input_data = EmbeddingWithMetadata(batch_items_embeddings, batch_items_metadata)
                sampling_kwargs = {"training": training}
                # Only samplers whose `call` declares `item_weights` get the embedding table.
                if "item_weights" in tf_inspect.getargspec(sampler.call).args:
                    sampling_kwargs["item_weights"] = self.context.get_embedding(self.item_domain)
                neg_items = sampler(input_data.__dict__, **sampling_kwargs)

                if tf.shape(neg_items.embeddings)[0] > 0:
                    # Accumulates sampled negative items from all samplers
                    neg_items_embeddings_list.append(neg_items.embeddings)
                    if collect_negative_ids:
                        neg_items_ids_list.append(neg_items.metadata[self.item_id_feature_name])
                else:
                    LOG.warning(
                        "The sampler %s returned no samples for this batch.",
                        type(sampler).__name__,
                    )

            if len(neg_items_embeddings_list) == 0:
                raise RuntimeError(
                    f"No negative items were sampled from samplers {self.samplers}"
                )
            elif len(neg_items_embeddings_list) == 1:
                neg_items_embeddings = neg_items_embeddings_list[0]
            else:
                neg_items_embeddings = tf.concat(neg_items_embeddings_list, axis=0)

            negative_scores = tf.linalg.matmul(
                predictions[self.query_name], neg_items_embeddings, transpose_b=True
            )

            if collect_negative_ids:
                if isinstance(targets, tf.Tensor):
                    positive_item_ids = targets
                else:
                    positive_item_ids = tf.squeeze(features[self.item_id_feature_name])

                if len(neg_items_ids_list) == 1:
                    neg_items_ids = neg_items_ids_list[0]
                else:
                    neg_items_ids = tf.concat(neg_items_ids_list, axis=0)

                # Scores are only altered when downscoring was explicitly requested.
                if self.downscore_false_negatives:
                    negative_scores, valid_negatives_mask = rescore_false_negatives(
                        positive_item_ids,
                        neg_items_ids,
                        negative_scores,
                        self.false_negatives_score,
                    )

            predictions = tf.concat([positive_scores, negative_scores], axis=-1)

            # To ensure that the output is always fp32, avoiding numerical
            # instabilities with mixed_float16 policy
            predictions = tf.cast(predictions, tf.float32)

        assert isinstance(predictions, tf.Tensor), "Predictions must be a tensor"

        # prepare targets for computing the loss and metrics
        if self.sampled_softmax_mode and not training:
            # Converts target ids to one-hot representation
            num_classes = tf.shape(predictions)[-1]
            targets_one_hot = tf.one_hot(tf.reshape(targets, (-1,)), num_classes)
            return PredictionOutput(
                predictions,
                targets_one_hot,
                positive_item_ids=positive_item_ids,
                valid_negatives_mask=valid_negatives_mask,
                negative_item_ids=neg_items_ids,
            )
        else:
            # Positives in the first column and negatives in the subsequent columns
            targets = tf.concat(
                [
                    tf.ones([tf.shape(predictions)[0], 1], dtype=predictions.dtype),
                    tf.zeros(
                        [tf.shape(predictions)[0], tf.shape(predictions)[1] - 1],
                        dtype=predictions.dtype,
                    ),
                ],
                axis=1,
            )
            return PredictionOutput(
                predictions,
                targets,
                positive_item_ids=positive_item_ids,
                valid_negatives_mask=valid_negatives_mask,
                negative_item_ids=neg_items_ids,
            )

    def _get_logits_for_sampled_softmax(self, inputs):
        """Return `inputs` multiplied by the transposed item embedding table.

        Raises `ValueError` if `inputs` is not a tensor.
        """
        if not isinstance(inputs, tf.Tensor):
            raise ValueError(
                f"Inputs to the Sampled Softmax block should be tensors, got {type(inputs)}"
            )
        embedding_table = self.context.get_embedding(self.item_domain)
        all_scores = tf.matmul(inputs, tf.transpose(embedding_table))
        return all_scores

    def _prepare_query_item_vectors_for_sampled_softmax(
        self, predictions: tf.Tensor, targets: tf.Tensor
    ):
        """Pair the query tensor with the embeddings looked up for `targets`.

        Returns a dict keyed by the query and item names. Raises `ValueError` if
        `predictions` is not a tensor.
        """
        # extract positive items embeddings
        if not isinstance(predictions, tf.Tensor):
            raise ValueError(
                f"Inputs to the Sampled Softmax block should be tensors, got {type(predictions)}"
            )
        embedding_table = self.context.get_embedding(self.item_domain)
        batch_items_embeddings = embedding_ops.embedding_lookup(embedding_table, targets)
        predictions = {self.query_name: predictions, self.item_name: batch_items_embeddings}
        return predictions

    def set_required_features(self):
        """Compute the feature names read from `features` when building sampler inputs.

        The item id feature is required whenever negative ids are used (downscoring
        false negatives or storing negative ids); each sampler adds its own
        `required_features`.
        """
        required_features = set()
        if self.downscore_false_negatives or self.store_negative_ids:
            required_features.add(self.item_id_feature_name)
        required_features.update(
            [feature for sampler in self.samplers for feature in sampler.required_features]
        )
        self._required_features = list(required_features)

    def get_config(self):
        """Return the layer config, including the samplers and every constructor option."""
        config = super().get_config()
        config = maybe_serialize_keras_objects(self, config, ["samplers"])
        config["sampling_downscore_false_negatives"] = self.downscore_false_negatives
        config["sampling_downscore_false_negatives_value"] = self.false_negatives_score
        config["item_id_feature_name"] = self.item_id_feature_name
        config["item_domain"] = self.item_domain
        config["query_name"] = self.query_name
        config["item_name"] = self.item_name
        config["cache_query"] = self.cache_query
        config["sampled_softmax_mode"] = self.sampled_softmax_mode
        config["store_negative_ids"] = self.store_negative_ids
        return config

    @classmethod
    def from_config(cls, config):
        """Deserialize the "samplers" entry, then defer to the parent `from_config`."""
        config = maybe_deserialize_keras_objects(config, ["samplers"])
        return super().from_config(config)