Source code for merlin.models.tf.utils.tf_utils

#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import collections
from typing import Any, List, Sequence, Tuple, Union

import numpy as np
import tensorflow as tf
from keras.utils.tf_inspect import getfullargspec
from packaging import version
from tensorflow.python import to_dlpack

from merlin.core.compat import cudf, cupy
from merlin.core.dispatch import DataFrameType
from merlin.io import Dataset
from merlin.models.tf.core.base import Block, ModelContext
from merlin.models.tf.typing import TabularData
from merlin.models.utils.misc_utils import filter_kwargs

if version.parse(tf.__version__) < version.parse("2.3.0"):
    try:
        from tfdlpack import from_dlpack
    except ModuleNotFoundError as e:
        message = "If using TensorFlow < 2.3.0, you must install tfdlpack-gpu extension library"
        raise ModuleNotFoundError(message) from e

else:
    from tensorflow.experimental.dlpack import from_dlpack


def get_output_sizes_from_schema(schema, batch_size=0, max_sequence_length=None):
    sizes = {}
    for feature in schema:
        name = feature.name
        if feature.is_list:
            sizes[name] = tf.TensorShape(
                [
                    batch_size,
                    max_sequence_length if max_sequence_length else feature.shape.dims[1].max,
                ]
            )
        elif feature.HasField("shape"):
            sizes[name] = tf.TensorShape([batch_size] + [d.size for d in feature.shape.dim])
        else:
            sizes[name] = tf.TensorShape([batch_size, 1])

    return sizes


def calculate_batch_size_from_inputs(inputs):
    input_shapes = {k: v.shape for k, v in inputs.items()}
    batch_size = calculate_batch_size_from_input_shapes(input_shapes)
    return batch_size


def calculate_batch_size_from_input_shapes(input_shapes):
    non_ragged_features = list(
        [k for k in input_shapes if not k.endswith("__values") and not k.endswith("__offsets")]
    )
    if len(non_ragged_features) > 0:
        batch_size = input_shapes[non_ragged_features[0]][0]
        return batch_size

    ragged_features_offsets = list([k for k in input_shapes if k.endswith("__offsets")])
    if len(ragged_features_offsets) > 0:
        batch_size = input_shapes[ragged_features_offsets[0]][0]
        if batch_size is not None:
            batch_size -= 1
        return batch_size

    return None


def maybe_serialize_keras_objects(
    self,
    config,
    maybe_serialize_keys,
):
    for key in maybe_serialize_keys:
        maybe_value = getattr(self, key, None)
        if maybe_value:
            if isinstance(maybe_value, dict):
                config[key] = {
                    k: tf.keras.utils.serialize_keras_object(v) for k, v in maybe_value.items()
                }
            elif isinstance(maybe_value, (list, tuple)):
                config[key] = [tf.keras.utils.serialize_keras_object(v) for v in maybe_value]
            else:
                config[key] = tf.keras.utils.serialize_keras_object(maybe_value)

    return config


def maybe_deserialize_keras_objects(
    config,
    to_deserialize,
    deserialize_fn=tf.keras.utils.deserialize_keras_object,
    custom_objects={},
):
    if isinstance(to_deserialize, list):
        to_deserialize = {k: deserialize_fn for k in to_deserialize}

    for key, fn in to_deserialize.items():
        maybe_val = config.get(key, None)
        if maybe_val:
            if isinstance(maybe_val, list):
                config[key] = [fn(v, custom_objects=custom_objects) for v in maybe_val]
            else:
                config[key] = fn(maybe_val, custom_objects=custom_objects)

    return config


def rescore_false_negatives(
    positive_item_ids: tf.Tensor,
    neg_samples_item_ids: tf.Tensor,
    negative_scores: tf.Tensor,
    false_negatives_score: float,
):
    """
    Zeroes the logits of accidental negatives.
    """
    # Removing dimensions of size 1 from the shape of the item ids, if applicable
    positive_item_ids = tf.cast(tf.squeeze(positive_item_ids), neg_samples_item_ids.dtype)
    neg_samples_item_ids = tf.squeeze(neg_samples_item_ids)

    # Reshapes positive and negative ids so that false_negatives_mask matches the scores shape
    false_negatives_mask = tf.equal(
        tf.expand_dims(positive_item_ids, -1), tf.expand_dims(neg_samples_item_ids, 0)
    )

    # Setting a very small value for false negatives (accidental hits) so that it has
    # negligicle effect on the loss functions
    negative_scores = tf.where(
        false_negatives_mask,
        tf.ones_like(negative_scores) * false_negatives_score,
        negative_scores,
    )

    valid_negatives_mask = tf.logical_not(false_negatives_mask)

    return tf.squeeze(negative_scores), valid_negatives_mask


def extract_topk(
    k: int,
    predictions: tf.Tensor,
    labels: tf.Tensor,
    shuffle_ties: bool = False,
    shuffle_ties_epsilon=1e-6,
    seed=None,
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """Extracts top-k values of predictions, sorting the corresponding
    labels accordingly.

    Parameters
    ----------
    k : int
        Cut-off to extract top-k items
    predictions : tf.Tensor
        Tensor with the predictions per example
    labels : tf.Tensor
        Tensor with the labels per example
    shuffle_ties : bool, optional
        Adds a small random value to predictions to break ties if any, by default False
    shuffle_ties_epsilon : float, optional
        The maximum random value to be added to break ties (used only if shuffle_ties=True),
        by default 1e-6
    seed : int, optional
        Random seed to use for tie breaking
    Returns
    -------
    Tuple(tf.Tensor,tf.Tensor,tf.Tensor)
        Returns a triple with the following tensors
        (topk_predictions,topk_labels,label_relevant_counts).
        The label_relevant_counts holds the total number of positive values per example,
        as some metrics (e.g. recall) need that information, which is lost when we
        extract top-k values
    """
    # Computes the number of relevant items per row (before extracting only the top-k)
    label_relevant_counts = tf.reduce_sum(labels, axis=-1)
    # Limits k to the number of prediction scores
    k = tf.minimum(k, tf.shape(predictions)[-1])

    if shuffle_ties:
        # Adds a small random value to break ties in the range [0,shuffle_ties_epsilon)
        if seed is not None:
            tf.random.set_seed(seed)
        predictions = predictions + (
            tf.random.uniform(tf.shape(predictions)) * shuffle_ties_epsilon
        )

    topk_predictions, topk_indices = tf.math.top_k(predictions, k)
    topk_labels = gather_torch_like(labels, topk_indices, k)
    return topk_predictions, topk_labels, label_relevant_counts


def transform_label_to_onehot(labels, vocab_size):
    return tf.one_hot(tf.reshape(labels, (-1,)), vocab_size)


def create_output_placeholder(scores, ks):
    return tf.Variable(tf.zeros([tf.shape(scores)[0], len(ks)], tf.float32))


def gather_torch_like(labels, indices, max_k):
    row_idxs = tf.repeat(tf.range(tf.shape(labels)[0]), max_k)
    col_idx = tf.reshape(indices, tf.shape(row_idxs))
    all_indices = tf.transpose(tf.stack([row_idxs, col_idx]))

    labels = tf.reshape(tf.gather_nd(labels, all_indices), (tf.shape(labels)[0], max_k))
    return labels


def batch_ref(inputs: Union[tf.Tensor, TabularData]):
    """Get hash-code of a tensor or a dictionary of tensors."""

    if isinstance(inputs, tf.Tensor):
        return hash(inputs.ref())

    refs = []
    keys = sorted(inputs.keys())
    for key in keys:
        refs.append(inputs[key].ref())

    return hash(tuple(refs))


def pack_df(gdf):
    if isinstance(gdf, np.ndarray):
        return gdf
    elif hasattr(gdf, "to_dlpack") and callable(getattr(gdf, "to_dlpack")):
        return gdf.to_dlpack()
    elif hasattr(gdf, "to_numpy") and callable(getattr(gdf, "to_numpy")):
        gdf = gdf.to_numpy()
        if isinstance(gdf[0], list):
            gdf = np.stack(gdf)
        return gdf
    return gdf.toDlpack()


def unpack_df(gdf):
    if hasattr(gdf, "shape"):
        return tf.convert_to_tensor(gdf)
    return from_dlpack(gdf)


def df_to_tensor(gdf, dtype=None):
    if gdf.empty:
        return

    # checks necessary because of this bug
    # https://github.com/tensorflow/tensorflow/issues/42660
    if len(gdf.shape) == 1 or gdf.shape[1] == 1:
        dlpack = pack_df(gdf)
    elif gdf.shape[0] == 1:
        dlpack = pack_df(gdf.values[0])
    else:
        dlpack = pack_df(gdf.values.T)
    # catch error caused by tf eager context
    # not being initialized

    try:
        x = unpack_df(dlpack)
    except AssertionError:
        tf.random.uniform((1,))
        x = unpack_df(dlpack)
    # if rank is already two it is  already in list format
    if gdf.shape[0] == 1 and not tf.rank(x) == 2:
        # batch size 1 so got squashed to a vector
        x = tf.expand_dims(x, 0)
    elif len(gdf.shape) == 1 or len(x.shape) == 1:
        # sort of a generic check for any other
        # len(shape)==1 case, could probably
        # be more specific
        x = tf.expand_dims(x, -1)
    elif gdf.shape[1] > 1:
        # matrix which means we had to transpose
        # for the bug above, so untranspose
        x = tf.transpose(x)

    if dtype:
        return tf.cast(x, dtype)

    return x


def tensor_to_df(tensor, index=None, gpu=None):
    if gpu is None:
        gpu = cudf

    if gpu:
        # Note: It is not possible to convert Tensorflow tensors to the cudf dataframe
        # directly using dlPack (as the example commented below) because cudf.from_dlpack()
        # expects the 2D tensor to be in Fortran order (column-major), which is not
        # supported by TF (https://github.com/rapidsai/cudf/issues/10754).
        # df = cudf.from_dlpack(to_dlpack(tf.convert_to_tensor(embeddings)))
        tensor_cupy = cupy.fromDlpack(to_dlpack(tf.convert_to_tensor(tensor)))
        df = cudf.DataFrame(tensor_cupy)
        df.columns = [str(col) for col in list(df.columns)]
        if not index:
            index = cudf.RangeIndex(0, tensor.shape[0])
        df.set_index(index)
    else:
        import pandas as pd

        df = pd.DataFrame(tensor.numpy())
        df.columns = [str(col) for col in list(df.columns)]
        if not index:
            index = pd.RangeIndex(0, tensor.shape[0])
        df.set_index(index)

    return df


def add_epsilon_to_zeros(tensor: tf.Tensor, epsilon: float = 1e-24) -> tf.Tensor:
    """Replaces zeros by adding a small epsilon value to them.
    This is useful to avoid inf and nan errors on math ops
    like log().

    Parameters
    ----------
    tensor : tf.Tensor
        Tensor to operate on
    epsilon : float, optional
        Small value to add to zeros, by default 1e-24

    Returns
    -------
    tf.Tensor
        The tensor without zeros
    """
    return tf.where(tf.equal(tensor, 0.0), tensor + epsilon, tensor)


def get_candidate_probs(
    item_freq_probs: Union[tf.Tensor, Sequence], is_prob_distribution: bool = False
):
    """Returns the candidate probs after checking if
    item_freq_probs is frequencies or probs and their
    dtype and shape according to the item feature cardinality

    Parameters:
    ----------
    item_freq_probs : Union[tf.Tensor, Sequence]
        A Tensor or list with item frequencies (if is_prob_distribution=False)
        or with item probabilities (if is_prob_distribution=True)
    is_prob_distribution: bool, optional
        If True, the item_freq_probs should be a probability distribution of the items.
        If False, the item frequencies is converted to probabilities
    Returns
    -------
        A tensor with the item probability distributon
    """
    item_freq_probs = tf.convert_to_tensor(item_freq_probs)

    if is_prob_distribution:
        tf.debugging.assert_type(
            item_freq_probs, tf.float32, message="The item_weights should have tf.float32 dtype"
        )
        tf.debugging.assert_near(
            tf.reduce_sum(item_freq_probs),
            1.0,
            message="The item_weights should be a probability distribution and sum to 1.0",
        )
        candidate_probs = item_freq_probs
    else:
        item_freq_probs = tf.cast(item_freq_probs, tf.float32)
        candidate_probs = item_freq_probs / tf.reduce_sum(item_freq_probs)

    return candidate_probs


[docs]@tf.keras.utils.register_keras_serializable(package="merlin.models") class TensorInitializer(tf.keras.initializers.Initializer): """Initializer that returns a tensor (e.g. pre-trained embeddings) set in the constructor """
[docs] def __init__(self, weights: Union[tf.Tensor, Any], **kwargs): self._weights = tf.convert_to_tensor(weights)
def __call__(self, shape: tf.TensorShape, dtype: tf.DType = None, **kwargs) -> tf.Tensor: """Returns a tensor object initialized with the tensor set in the constructor. Parameters ---------- shape : tf.TensorShape Shape of the variable to be initialized dtype : tf.DType, optional Optional dtype of the tensor. Only numeric or boolean dtypes are supported, by default None Returns ------- tf.Tensor Returns the tensor set in the constructor """ tf.assert_equal(shape, self._weights.shape) weights = self._weights if dtype: weights = tf.cast(self._weights, dtype) return weights
[docs] @classmethod def from_dataset(cls, data: Union[Dataset, DataFrameType], **kwargs) -> "TensorInitializer": if hasattr(data, "to_ddf"): data = data.to_ddf().compute() embeddings = df_to_tensor(data) return cls(weights=embeddings, **kwargs)
[docs] def get_config(self): # To support serialization return {"weights": self._weights.numpy()}
def call_layer(layer: tf.keras.layers.Layer, inputs, *args, **kwargs): """Calls a layer with the given inputs and filters kwargs. Returns the output""" has_custom_call = getattr(layer, "_has_custom__call__", False) _k = dict(cascade_kwargs_if_possible=True, argspec_fn=getfullargspec) filtered_kwargs = filter_kwargs(kwargs, layer, **_k) if not has_custom_call: if isinstance(layer, tf.keras.layers.Lambda): filtered_kwargs = filter_kwargs(kwargs, layer.function, **_k) else: # We need to check the call method on the type since when the model gets saved # we can't infer the kwargs from using `layer.call` directly call_fn = type(layer).call filtered_kwargs = filter_kwargs(filtered_kwargs, call_fn, **_k) return layer(inputs, *args, **filtered_kwargs) def get_sub_blocks(blocks: Sequence[Block]) -> List[Block]: """Get all sub-blocks of given blocks, including blocks themselves, return a list of blocks Traverse(Iterate) the model to check each block (sub_block) by BFS""" result_blocks = set() if not isinstance(blocks, (list, tuple)): blocks = [blocks] for block in blocks: # Iterate all submodule (BFS) except ModelContext deque = collections.deque() if not isinstance(block, ModelContext): deque.append(block) while deque: current_module = deque.popleft() # Add all sub-blocks include itself result_blocks.add(current_module) for sub_module in current_module._flatten_modules(include_self=False, recursive=False): # filter out modelcontext if type(sub_module) != ModelContext: deque.append(sub_module) return list(result_blocks) @tf.function def list_col_to_ragged(values: tf.Tensor, offsets: tf.Tensor): if offsets.dtype.is_floating: offsets = tf.cast(offsets, tf.int32) return tf.RaggedTensor.from_row_splits(values, offsets) def check_inputs_mask_compatible_shape( inputs: Union[tf.Tensor, tf.RaggedTensor], mask: Union[tf.Tensor, tf.RaggedTensor] ): """Check if the shape and the type of the input and mask tensors are compatible. Parameters ---------- inputs : Union[tf.Tensor, tf.RaggedTensor] The input tensor, which can be either a dense or ragged tensor. mask : Union[tf.Tensor, tf.RaggedTensor] The mask tensor, which can be either a dense or ragged tensor. Returns ------- bool: Returns True if the shape of the input and mask tensors are compatible, False otherwise. Notes ----- The function assumes that the `inputs` tensor has one more dimension than the `mask` tensor, with the extra dimension typically related to the embeddings dimension. """ result = False if type(inputs) == type(mask) and (inputs.shape.as_list()[:-1] == mask.shape.as_list()): if isinstance(inputs, tf.RaggedTensor): result = tf.reduce_all( tf.cast(inputs.row_lengths(), tf.int32) == tf.cast(mask.row_lengths(), tf.int32) ) else: result = True return result