#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC
from copy import deepcopy
from functools import reduce
from typing import Dict, List, Optional, Tuple, Union
import torch
from merlin.models.utils.doc_utils import docstring_parameter
from merlin.models.utils.registry import Registry
from merlin_standard_lib import Schema
from ..block.base import BlockBase, SequentialBlock, right_shift_block
from ..typing import TabularData, TensorOrTabularData
from ..utils.torch_utils import OutputSizeMixin, calculate_batch_size_from_input_size
tabular_transformation_registry: Registry = Registry.class_registry("torch.tabular_transformations")
tabular_aggregation_registry: Registry = Registry.class_registry("torch.tabular_aggregations")
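

class TabularTransformation(OutputSizeMixin, torch.nn.Module, ABC):
    """Transformation that takes in `TabularData` and outputs `TabularData`."""

    def forward(self, inputs: TabularData, **kwargs) -> TabularData:
        raise NotImplementedError()

    @classmethod
    def parse(cls, class_or_str):
        return tabular_transformation_registry.parse(class_or_str)
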

class TabularAggregation(OutputSizeMixin, torch.nn.Module, ABC):
    """Aggregation of `TabularData` that outputs a single `Tensor`"""

    def forward(self, inputs: TabularData) -> torch.Tensor:
        raise NotImplementedError()

    def _expand_non_sequential_features(self, inputs: TabularData) -> TabularData:
        inputs_sizes = {k: v.shape for k, v in inputs.items()}
        seq_features_shapes, sequence_length = self._get_seq_features_shapes(inputs_sizes)

        if len(seq_features_shapes) > 0:
            non_seq_features = set(inputs.keys()).difference(set(seq_features_shapes.keys()))
            for fname in non_seq_features:
                # Adds a 2nd dim and repeats the feature along the sequence length
                inputs[fname] = inputs[fname].unsqueeze(dim=1).repeat(1, sequence_length, 1)

        return inputs

    def _get_seq_features_shapes(self, inputs_sizes: Dict[str, torch.Size]):
        seq_features_shapes = dict()
        for fname, fshape in inputs_sizes.items():
            # Saves the shapes of sequential features
            if len(fshape) >= 3:
                seq_features_shapes[fname] = tuple(fshape[:2])

        self._check_first_two_dims(seq_features_shapes)

        if len(seq_features_shapes) > 0:
            sequence_length = list(seq_features_shapes.values())[0][1]
        else:
            sequence_length = 0

        return seq_features_shapes, sequence_length

    def _check_first_two_dims(self, seq_features_shapes: Dict[str, Tuple[int, ...]]):
        if (
            not torch.jit.is_tracing()
            and len(seq_features_shapes) > 0
            and len(set(seq_features_shapes.values())) > 1
        ):
            raise ValueError(
                "All sequential features must share the same shape in the first two dims "
                "(batch_size, seq_length): {}".format(seq_features_shapes)
            )

    def _check_concat_shapes(self, inputs: TabularData):
        if torch.jit.is_tracing():
            return
        input_sizes = {k: v.shape for k, v in inputs.items()}
        if len(set(v[:-1] for v in input_sizes.values())) > 1:
            raise ValueError(
                "All feature dimensions except the last one must match: {}".format(input_sizes)
            )

    def _get_agg_output_size(self, input_size, agg_dim):
        batch_size = calculate_batch_size_from_input_size(input_size)
        seq_features_shapes, sequence_length = self._get_seq_features_shapes(input_size)

        if len(seq_features_shapes) > 0:
            return (
                batch_size,
                sequence_length,
                agg_dim,
            )

        return (batch_size, agg_dim)

    @classmethod
    def parse(cls, class_or_str):
        return tabular_aggregation_registry.parse(class_or_str)
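
# Illustrative sketch (not part of this module): a concrete aggregation subclasses
# TabularAggregation and combines the feature tensors into one. The registry key
# "example-concat" and the class below are hypothetical.
#
#     @tabular_aggregation_registry.register("example-concat")
#     class ExampleConcatAggregation(TabularAggregation):
#         def forward(self, inputs: TabularData) -> torch.Tensor:
#             inputs = self._expand_non_sequential_features(inputs)
#             self._check_concat_shapes(inputs)
#             tensors = [inputs[name] for name in sorted(inputs.keys())]
#             return torch.cat(tensors, dim=-1)
#
#         def forward_output_size(self, input_size):
#             agg_dim = sum(v[-1] for v in input_size.values())
#             return self._get_agg_output_size(input_size, agg_dim)
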
TabularTransformationType = Union[str, TabularTransformation]
TabularTransformationsType = Union[TabularTransformationType, List[TabularTransformationType]]
TabularAggregationType = Union[str, TabularAggregation]
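

class SequentialTabularTransformations(SequentialBlock):
    """A sequential container: the transformations passed in will be called in order.

    Parameters
    ----------
    transformation: TabularTransformationsType
        Transformations that are passed in here will be called in order.
    """

    def __init__(self, *transformation: TabularTransformationsType):
        if len(transformation) == 1 and isinstance(transformation[0], list):
            transformation = transformation[0]
        super().__init__(*[TabularTransformation.parse(t) for t in transformation])
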

TABULAR_MODULE_PARAMS_DOCSTRING = """
    pre: Union[str, TabularTransformation, List[str], List[TabularTransformation]], optional
        Transformations to apply on the inputs when the module is called (so **before** `forward`).
    post: Union[str, TabularTransformation, List[str], List[TabularTransformation]], optional
        Transformations to apply on the outputs after the module is called (so **after** `forward`).
    aggregation: Union[str, TabularAggregation], optional
        Aggregation to apply after processing the `forward` method, to output a single Tensor.
"""

@docstring_parameter(tabular_module_parameters=TABULAR_MODULE_PARAMS_DOCSTRING)
class TabularModule(torch.nn.Module):
    """PyTorch Module specialized for tabular data, integrating many frequently used operations.

    Parameters
    ----------
    {tabular_module_parameters}
    """

    def __init__(
        self,
        pre: Optional[TabularTransformationsType] = None,
        post: Optional[TabularTransformationsType] = None,
        aggregation: Optional[TabularAggregationType] = None,
        **kwargs,
    ):
        super().__init__()
        self.input_size = None
        self.pre = pre  # type: ignore
        self.post = post  # type: ignore
        self.aggregation = aggregation  # type: ignore

    @classmethod
    def from_schema(cls, schema: Schema, tags=None, **kwargs) -> Optional["TabularModule"]:
        """Instantiate a TabularModule instance from a `Schema`.

        Parameters
        ----------
        schema: Schema
            Schema to infer the features from.
        tags: optional
            If provided, the schema is filtered down to the columns tagged with these tags.
        kwargs

        Returns
        -------
        Optional[TabularModule]
        """
        schema_copy = deepcopy(schema)
        if tags:
            schema_copy = schema_copy.select_by_tag(tags)

        if not schema_copy.column_schemas:
            return None

        return cls.from_features(schema_copy.column_names, schema=schema_copy, **kwargs)

    @classmethod
    @docstring_parameter(
        tabular_module_parameters=TABULAR_MODULE_PARAMS_DOCSTRING, extra_padding=4
    )
    def from_features(
        cls,
        features: List[str],
        pre: Optional[TabularTransformationsType] = None,
        post: Optional[TabularTransformationsType] = None,
        aggregation: Optional[TabularAggregationType] = None,
    ) -> "TabularModule":
        """Initializes a TabularModule instance that filters its inputs down to the given
        features.

        Parameters
        ----------
        features: List[str]
            A list of feature-names that will be used as the first pre-processing op to filter out
            all other features not in this list.
        {tabular_module_parameters}

        Returns
        -------
        TabularModule
        """
        pre = [FilterFeatures(features), pre] if pre else FilterFeatures(features)  # type: ignore

        return cls(pre=pre, post=post, aggregation=aggregation)

    @property
    def pre(self) -> Optional[SequentialTabularTransformations]:
        """
        Returns
        -------
        SequentialTabularTransformations, optional
        """
        return self._pre

    @pre.setter
    def pre(self, value: Optional[TabularTransformationsType]):
        if value:
            self._pre: Optional[
                SequentialTabularTransformations
            ] = SequentialTabularTransformations(value)
        else:
            self._pre = None

    @property
    def post(self) -> Optional[SequentialTabularTransformations]:
        """
        Returns
        -------
        SequentialTabularTransformations, optional
        """
        return self._post

    @post.setter
    def post(self, value: Optional[TabularTransformationsType]):
        if value:
            self._post: Optional[
                SequentialTabularTransformations
            ] = SequentialTabularTransformations(value)
        else:
            self._post = None

    @property
    def aggregation(self) -> Optional[TabularAggregation]:
        """
        Returns
        -------
        TabularAggregation, optional
        """
        return self._aggregation

    @aggregation.setter
    def aggregation(self, value: Optional[Union[str, TabularAggregation]]):
        """
        Parameters
        ----------
        value
        """
        if value:
            self._aggregation: Optional[TabularAggregation] = TabularAggregation.parse(value)
        else:
            self._aggregation = None

    def pre_forward(
        self, inputs: TabularData, transformations: Optional[TabularTransformationsType] = None
    ) -> TabularData:
        """Method that's typically called before the forward method for pre-processing.

        Parameters
        ----------
        inputs: TabularData
            Input data to pre-process.
        transformations: TabularTransformationsType, optional
            Transformations to apply instead of `self.pre`.

        Returns
        -------
        TabularData
        """
        return self._maybe_apply_transformations(
            inputs, transformations=transformations or self.pre
        )

    def forward(self, x: TabularData, *args, **kwargs) -> TabularData:
        return x

    def post_forward(
        self,
        inputs: TabularData,
        transformations: Optional[TabularTransformationsType] = None,
        merge_with: Optional[Union["TabularModule", List["TabularModule"]]] = None,
        aggregation: Optional[TabularAggregationType] = None,
    ) -> TensorOrTabularData:
        """Method that's typically called after the forward method for post-processing.

        Parameters
        ----------
        inputs: TabularData
            Input data, typically the output of the forward method.
        transformations: TabularTransformationsType, optional
            Transformations to apply on the input data.
        merge_with: Union[TabularModule, List[TabularModule]], optional
            Other TabularModules to call and merge the outputs with.
        aggregation: TabularAggregationType, optional
            Aggregation to aggregate the output to a single Tensor.

        Returns
        -------
        TensorOrTabularData (Tensor when aggregation is set, else TabularData)
        """
        _aggregation: Optional[TabularAggregation]
        if aggregation:
            _aggregation = TabularAggregation.parse(aggregation)
        else:
            _aggregation = getattr(self, "aggregation", None)

        outputs = inputs
        if merge_with:
            if not isinstance(merge_with, list):
                merge_with = [merge_with]
            for layer_or_tensor in merge_with:
                to_add = layer_or_tensor(inputs) if callable(layer_or_tensor) else layer_or_tensor
                outputs.update(to_add)

        outputs = self._maybe_apply_transformations(
            outputs, transformations=transformations or self.post
        )

        if _aggregation:
            schema = getattr(self, "schema", None)
            _aggregation.set_schema(schema)
            return _aggregation(outputs)

        return outputs

    def __call__(
        self,
        inputs: TabularData,
        *args,
        pre: Optional[TabularTransformationsType] = None,
        post: Optional[TabularTransformationsType] = None,
        merge_with: Optional[Union["TabularModule", List["TabularModule"]]] = None,
        aggregation: Optional[TabularAggregationType] = None,
        **kwargs,
    ) -> TensorOrTabularData:
        """We overwrite the call method in order to be able to do pre- and post-processing.

        Parameters
        ----------
        inputs: TabularData
            Input TabularData.
        pre: TabularTransformationsType, optional
            Transformations to apply before calling the forward method. If pre is None, this
            method will check if `self.pre` is set.
        post: TabularTransformationsType, optional
            Transformations to apply after calling the forward method. If post is None, this
            method will check if `self.post` is set.
        merge_with: Union[TabularModule, List[TabularModule]], optional
            Other TabularModules to call and merge the outputs with.
        aggregation: TabularAggregationType, optional
            Aggregation to aggregate the output to a single Tensor.

        Returns
        -------
        TensorOrTabularData (Tensor when aggregation is set, else TabularData)
        """
        inputs = self.pre_forward(inputs, transformations=pre)

        # This calls `torch.nn.Module.__call__`, which in turn runs the `forward` method.
        outputs = super().__call__(inputs, *args, **kwargs)  # noqa

        if isinstance(outputs, dict):
            outputs = self.post_forward(
                outputs, transformations=post, merge_with=merge_with, aggregation=aggregation
            )

        return outputs

    def _maybe_apply_transformations(
        self,
        inputs: TabularData,
        transformations: Optional[
            Union[TabularTransformationsType, SequentialTabularTransformations]
        ] = None,
    ) -> TabularData:
        """Apply transformations to the inputs if these are defined.

        Parameters
        ----------
        inputs
        transformations

        Returns
        -------
        TabularData
        """
        if transformations:
            _transformations = TabularTransformation.parse(transformations)
            return _transformations(inputs)

        return inputs

    def __rrshift__(self, other):
        return right_shift_block(self, other)
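
# Usage sketch (illustrative, not part of the library): `TabularModule` is
# dict-in/dict-out, and `__call__` wires pre -> forward -> post -> aggregation.
# The feature names and the "concat" aggregation key below are assumptions.
#
#     module = TabularModule(aggregation="concat")
#     inputs = {"age": torch.rand(8, 1), "item_emb": torch.rand(8, 16)}
#     output = module(inputs)  # -> Tensor of shape (8, 17) if "concat" is registered
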

class FilterFeatures(TabularTransformation):
    """Module that filters out certain features from `TabularData`.

    Parameters
    ----------
    to_include: List[str]
        List of features to include in the result of calling the module.
    pop: bool
        Boolean indicating whether to pop the included features from the input dictionary
        in-place.
    """

    def __init__(self, to_include: List[str], pop: bool = False):
        super().__init__()
        self.to_include = to_include
        self.pop = pop

    def forward(self, inputs: TabularData, **kwargs) -> TabularData:
        """
        Parameters
        ----------
        inputs: TabularData
            Input dictionary containing features to filter.

        Returns
        -------
        TabularData
            Filtered TabularData that only contains the feature-names in `self.to_include`.
        """
        assert isinstance(inputs, dict), "Inputs needs to be a dict"

        outputs = {k: v for k, v in inputs.items() if k in self.to_include}
        if self.pop:
            for key in outputs.keys():
                inputs.pop(key)

        return outputs

    def forward_output_size(self, input_shape):
        """
        Parameters
        ----------
        input_shape

        Returns
        -------
        """
        return {k: v for k, v in input_shape.items() if k in self.to_include}
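
# Usage sketch (illustrative, not part of the library): keep only the listed
# features; with `pop=True` they are also removed from the input dict in-place.
#
#     filter_block = FilterFeatures(["a"])
#     outputs = filter_block({"a": torch.rand(2, 3), "b": torch.rand(2, 3)})
#     # outputs == {"a": <tensor>}
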

@docstring_parameter(tabular_module_parameters=TABULAR_MODULE_PARAMS_DOCSTRING)
class TabularBlock(BlockBase, TabularModule, ABC):
    """TabularBlock extends TabularModule to turn it into a block with output size info.

    Parameters
    ----------
    {tabular_module_parameters}
    """

    def __init__(
        self,
        pre: Optional[TabularTransformationType] = None,
        post: Optional[TabularTransformationType] = None,
        aggregation: Optional[TabularAggregationType] = None,
        schema: Optional[Schema] = None,
        **kwargs,
    ):
        super().__init__(pre=pre, post=post, aggregation=aggregation, **kwargs)
        self.schema = schema

    def to_module(self, shape_or_module, device=None):
        shape = shape_or_module
        if isinstance(shape_or_module, torch.nn.Module):
            shape = getattr(shape_or_module, "output_size", None)
            if shape:
                shape = shape()

        return self.build(shape, device=device)

    def output_size(self, input_size=None):
        if self.pre:
            input_size = self.pre.output_size(input_size)

        output_size = self._check_post_output_size(super().output_size(input_size))

        return output_size

    def build(self, input_size, schema=None, **kwargs):
        output = super().build(input_size, schema=schema, **kwargs)
        output_size = input_size
        if self.pre:
            self.pre.build(input_size, schema=schema, **kwargs)
            output_size = self.pre.output_size(input_size)

        output_size = self.forward_output_size(output_size)

        if self.post:
            self.post.build(output_size, schema=schema, **kwargs)
            output_size = self.post.output_size(output_size)

        if self.aggregation:
            self.aggregation.build(output_size, schema=schema, **kwargs)

        return output

    def _check_post_output_size(self, input_size):
        output_size = input_size

        if isinstance(input_size, dict):
            if self.post:
                output_size = self.post.output_size(output_size)
            if self.aggregation:
                schema = getattr(self, "schema", None)
                # self.aggregation.build(output_size, schema=schema)
                self.aggregation.set_schema(schema)
                output_size = self.aggregation.forward_output_size(output_size)

        return output_size

    def __rrshift__(self, other):
        return right_shift_block(self, other)

@docstring_parameter(tabular_module_parameters=TABULAR_MODULE_PARAMS_DOCSTRING)
class MergeTabular(TabularBlock):
    """Merge multiple TabularModules into a single output of TabularData.

    Parameters
    ----------
    modules_to_merge: Union[TabularModule, Dict[str, TabularModule]]
        TabularModules to merge into one output. This can also be one or multiple dictionaries
        keyed by the name the module should have.
    {tabular_module_parameters}
    """

    def __init__(
        self,
        *modules_to_merge: Union[TabularModule, Dict[str, TabularModule]],
        pre: Optional[TabularTransformationType] = None,
        post: Optional[TabularTransformationType] = None,
        aggregation: Optional[TabularAggregationType] = None,
        schema: Optional[Schema] = None,
        **kwargs,
    ):
        super().__init__(pre=pre, post=post, aggregation=aggregation, schema=schema, **kwargs)
        self.to_merge: Union[torch.nn.ModuleDict, torch.nn.ModuleList]
        if all(isinstance(x, dict) for x in modules_to_merge):
            to_merge: Dict[str, TabularModule]
            to_merge = reduce(lambda a, b: dict(a, **b), modules_to_merge)  # type: ignore
            self.to_merge = torch.nn.ModuleDict(to_merge)
        elif all(isinstance(x, torch.nn.Module) for x in modules_to_merge):
            self.to_merge = torch.nn.ModuleList(modules_to_merge)  # type: ignore
        else:
            raise ValueError(
                "Please provide one or multiple TabularBlocks to merge, or dictionaries of "
                f"TabularBlocks. Got: {modules_to_merge}"
            )

        # Merge schemas if necessary.
        if not schema and all(getattr(m, "schema", False) for m in self.merge_values):
            self.schema = reduce(lambda a, b: a + b, [m.schema for m in self.merge_values])

    @property
    def merge_values(self):
        if isinstance(self.to_merge, torch.nn.ModuleDict):
            return list(self.to_merge.values())

        return self.to_merge

    def forward(self, inputs: TabularData, training=True, **kwargs) -> TabularData:  # type: ignore
        assert isinstance(inputs, dict), "Inputs needs to be a dict"

        outputs = {}
        for layer in self.merge_values:
            outputs.update(layer(inputs))

        return outputs

    def forward_output_size(self, input_size):
        output_shapes = {}

        for layer in self.merge_values:
            output_shapes.update(layer.forward_output_size(input_size))

        return super(MergeTabular, self).forward_output_size(output_shapes)

    def build(self, input_size, **kwargs):
        super().build(input_size, **kwargs)

        for layer in self.merge_values:
            layer.build(input_size, **kwargs)

        return self
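
# Usage sketch (illustrative, not part of the library): each merged module is
# called on the same inputs and the output dictionaries are combined. The names
# "continuous"/"categorical" and the blocks below are made up.
#
#     merged = MergeTabular({"continuous": block_a, "categorical": block_b})
#     outputs = merged(inputs)  # union of block_a(inputs) and block_b(inputs)
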

class AsTabular(TabularBlock):
    """Converts a Tensor to TabularData by converting it to a dictionary.

    Parameters
    ----------
    output_name: str
        Name that should be used as the key in the output dictionary.
    """

    def __init__(self, output_name: str):
        super().__init__()
        self.output_name = output_name

    def forward(self, inputs: torch.Tensor, **kwargs) -> TabularData:  # type: ignore
        return {self.output_name: inputs}

    def forward_output_size(self, input_size):
        return {self.output_name: input_size}
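
# Usage sketch (illustrative, not part of the library): wrap a plain Tensor
# back into TabularData under a chosen key.
#
#     as_tabular = AsTabular("deep_features")
#     out = as_tabular(torch.rand(2, 3))  # {"deep_features": tensor of shape (2, 3)}
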

def merge_tabular(self, other):
    return MergeTabular(self, other)


TabularModule.__add__ = merge_tabular  # type: ignore
TabularModule.merge = merge_tabular  # type: ignore
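
# Usage sketch (illustrative, not part of the library): with `__add__` bound to
# `merge_tabular`, two TabularModules can be merged with the `+` operator.
#
#     merged = module_a + module_b  # equivalent to module_a.merge(module_b)
#     # type(merged) is MergeTabular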