Source code for transformers4rec.config.transformer

#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import transformers
from merlin.models.utils.doc_utils import docstring_parameter
from merlin.models.utils.registry import Registry

transformer_registry: Registry = Registry("transformers")


TRANSFORMER_CONFIG_PARAMETER_DOCSTRING = """        
        d_model: int
            The hidden dimension of the transformer layer.
        n_head: int
            The number of attention heads in each transformer layer.
        n_layer: int
            The number of transformer layers to stack.
        total_seq_length: int
            The maximum sequence length.
        hidden_act: str, optional
            The activation function in the hidden layers.
            By default 'gelu'
        initializer_range: float, optional
            The standard deviation of the `truncated_normal_initializer`
            for initializing all transformer's weights parameters.
            By default 0.01
        layer_norm_eps: float, optional
            The epsilon used by the layer normalization layers.
            By default 0.03
        dropout: float, optional
            The dropout probability. By default 0.3
        pad_token: int, optional
            The padding token ID. By default 0
        log_attention_weights: bool, optional
            Whether to log attention weights. By default False
"""


class T4RecConfig:
    """A class responsible for setting the configuration of the transformers class
    from Hugging Face and returning the corresponding T4Rec model.
    """

    def to_huggingface_torch_model(self):
        """Instantiate a Hugging Face transformer model based on
        the configuration parameters of the class.

        Returns
        -------
        transformers.PreTrainedModel
            The Hugging Face transformer model.
        """
        model_cls = transformers.MODEL_MAPPING[self.transformers_config_cls]

        return model_cls(self)

    def to_torch_model(
        self,
        input_features,
        *prediction_task,
        task_blocks=None,
        task_weights=None,
        loss_reduction="mean",
        **kwargs
    ):
        """Links the Hugging Face transformer model to the given input block and prediction tasks,
        and returns a T4Rec model.

        Parameters
        ----------
        input_features: torch4rec.TabularSequenceFeatures
            The sequential block that represents the input features and
            defines the masking strategy for training and evaluation.
        prediction_task: torch4rec.PredictionTask
            One or multiple prediction tasks.
        task_blocks: list, optional
            List of task-specific blocks that we apply on top of the HF transformer's output.
        task_weights: list, optional
            List of the weights to use for combining the tasks losses.
        loss_reduction: str, optional
            The reduction to apply to the prediction losses, possible values are:
                'none': no reduction will be applied,
                'mean': the weighted mean of the output is taken,
                'sum': the output will be summed.
            By default: 'mean'.

        Returns
        -------
        torch4rec.Model
            The T4Rec torch model.

        Raises
        ------
        ValueError
            If the input block or prediction task is of the wrong type.
        """
        from .. import torch as torch4rec

        if not isinstance(input_features, torch4rec.TabularSequenceFeatures):
            raise ValueError("`input_features` must be an instance of TabularSequenceFeatures")
        if not all(isinstance(t, torch4rec.PredictionTask) for t in prediction_task):
            raise ValueError(
                "`task` is of the wrong type, please provide one or multiple "
                "instance(s) of PredictionTask"
            )

        body = torch4rec.SequentialBlock(
            input_features, torch4rec.TransformerBlock(self, masking=input_features.masking)
        )

        return torch4rec.Head(
            body,
            *prediction_task,
            task_blocks=task_blocks,
            task_weights=task_weights,
            loss_reduction=loss_reduction,
        ).to_model(**kwargs)

    @property
    def transformers_config_cls(self):
        return self.__class__.__bases__[1]

    @classmethod
    def build(cls, *args, **kwargs):
        raise NotImplementedError
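# Illustrative usage sketch (not part of this module): a concrete config is created
# with `build()` and then linked to an input block and a prediction task through
# `to_torch_model()`. The `TabularSequenceFeatures.from_schema` and
# `NextItemPredictionTask` calls below are assumptions about the surrounding
# transformers4rec.torch API, `schema` is a hypothetical schema object, and all
# numeric values are examples only.
#
#   import transformers4rec.torch as tr
#
#   config = tr.XLNetConfig.build(d_model=64, n_head=4, n_layer=2, total_seq_length=20)
#   input_module = tr.TabularSequenceFeatures.from_schema(
#       schema, max_sequence_length=20, masking="causal"
#   )
#   model = config.to_torch_model(input_module, tr.NextItemPredictionTask())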
[docs]@transformer_registry.register("reformer") class ReformerConfig(T4RecConfig, transformers.ReformerConfig): """Subclass of T4RecConfig and transformers.ReformerConfig from Hugging Face. It handles configuration for Reformer layers in the context of T4Rec models. """
[docs] @docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING) @classmethod def build( cls, d_model, n_head, n_layer, total_seq_length, hidden_act="gelu", initializer_range=0.01, layer_norm_eps=0.03, dropout=0.3, pad_token=0, log_attention_weights=False, axial_pos_shape_first_dim=4, **kwargs ): """ Creates an instance of ReformerConfig with the given parameters. Parameters ---------- {transformer_cfg_parameters} axial_pos_shape_first_dim: int, optional The first dimension of the axial position encodings. During training, the product of the position dims has to be equal to the sequence length. Returns ------- ReformerConfig An instance of ReformerConfig. """ # To account for target positions at inference mode, we extend the maximum sequence length. total_seq_length = total_seq_length + 2 return cls( hidden_size=d_model, attention_head_size=d_model, attn_layers=["local", "lsh"] * (n_layer // 2) if n_layer > 2 else ["local"], num_hidden_layers=n_layer, feed_forward_size=d_model * 4, num_attention_heads=n_head, hidden_act=hidden_act, initializer_range=initializer_range, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=dropout, lsh_attention_probs_dropout_prob=dropout, pad_token_id=pad_token, output_attentions=log_attention_weights, max_position_embeddings=total_seq_length, axial_pos_embds_dim=[ d_model // 2, d_model // 2, ], axial_pos_shape=[ axial_pos_shape_first_dim, total_seq_length // axial_pos_shape_first_dim, ], vocab_size=1, **kwargs, )
[docs]@transformer_registry.register("gtp2") @docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING) class GPT2Config(T4RecConfig, transformers.GPT2Config): """Subclass of T4RecConfig and transformers.GPT2Config from Hugging Face. It handles configuration for GPT2 layers in the context of T4Rec models. """
[docs] @classmethod def build( cls, d_model, n_head, n_layer, total_seq_length, hidden_act="gelu", initializer_range=0.01, layer_norm_eps=0.03, dropout=0.3, pad_token=0, log_attention_weights=False, **kwargs ): """ Creates an instance of GPT2Config with the given parameters. Parameters ---------- {transformer_cfg_parameters} Returns ------- GPT2Config An instance of GPT2Config. """ return cls( n_embd=d_model, n_inner=d_model * 4, n_layer=n_layer, n_head=n_head, activation_function=hidden_act, initializer_range=initializer_range, layer_norm_eps=layer_norm_eps, resid_pdrop=dropout, embd_pdrop=dropout, attn_pdrop=dropout, n_positions=total_seq_length, n_ctx=total_seq_length, output_attentions=log_attention_weights, vocab_size=1, **kwargs, )
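# Illustrative sketch (not part of this module): `GPT2Config.build()` maps the compact
# T4Rec arguments onto the native Hugging Face GPT-2 fields (d_model -> n_embd/n_inner,
# dropout -> resid/embd/attn dropouts, total_seq_length -> n_positions/n_ctx), and
# `to_huggingface_torch_model()` instantiates the underlying HF torch model through
# transformers.MODEL_MAPPING. The values below are arbitrary examples.
#
#   gpt2_config = GPT2Config.build(d_model=128, n_head=8, n_layer=2, total_seq_length=20)
#   hf_model = gpt2_config.to_huggingface_torch_model()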
[docs]@transformer_registry.register("longformer") class LongformerConfig(T4RecConfig, transformers.LongformerConfig): """Subclass of T4RecConfig and transformers.LongformerConfig from Hugging Face. It handles configuration for LongformerConfig layers in the context of T4Rec models. """
[docs] @docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING) @classmethod def build( cls, d_model, n_head, n_layer, total_seq_length, hidden_act="gelu", initializer_range=0.01, layer_norm_eps=0.03, dropout=0.3, pad_token=0, log_attention_weights=False, **kwargs ): """ Creates an instance of LongformerConfig with the given parameters. Parameters ---------- {transformer_cfg_parameters} Returns ------- LongformerConfig An instance of LongformerConfig. """ # To account for target positions at inference mode, we extend the maximum sequence length. total_seq_length = total_seq_length + 2 return cls( hidden_size=d_model, num_hidden_layers=n_layer, num_attention_heads=n_head, hidden_act=hidden_act, attention_window=total_seq_length, initializer_range=initializer_range, layer_norm_eps=layer_norm_eps, dropout=dropout, pad_token_id=pad_token, output_attentions=log_attention_weights, vocab_size=1, **kwargs, )
[docs]@transformer_registry.register("electra") class ElectraConfig(T4RecConfig, transformers.ElectraConfig): """Subclass of T4RecConfig and transformers.ElectraConfig from Hugging Face. It handles configuration for ElectraConfig layers in the context of T4Rec models. """
[docs] @docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING) @classmethod def build( cls, d_model, n_head, n_layer, total_seq_length, hidden_act="gelu", initializer_range=0.01, layer_norm_eps=0.03, dropout=0.3, pad_token=0, log_attention_weights=False, **kwargs ): """ Creates an instance of ElectraConfig with the given parameters. Parameters ---------- {transformer_cfg_parameters} Returns ------- ElectraConfig An instance of ElectraConfig. """ # To account for target positions at inference mode, we extend the maximum sequence length. total_seq_length = total_seq_length + 2 return cls( hidden_size=d_model, embedding_size=d_model, num_hidden_layers=n_layer, num_attention_heads=n_head, intermediate_size=d_model * 4, hidden_act=hidden_act, initializer_range=initializer_range, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=dropout, max_position_embeddings=total_seq_length, pad_token_id=pad_token, output_attentions=log_attention_weights, vocab_size=1, **kwargs, )
[docs]@transformer_registry.register("albert") class AlbertConfig(T4RecConfig, transformers.AlbertConfig): """Subclass of T4RecConfig and transformers.AlbertConfig from Hugging Face. It handles configuration for AlbertConfig layers in the context of T4Rec models. """
[docs] @docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING) @classmethod def build( cls, d_model, n_head, n_layer, total_seq_length, hidden_act="gelu", initializer_range=0.01, layer_norm_eps=0.03, dropout=0.3, pad_token=0, log_attention_weights=False, **kwargs ): """ Creates an instance of AlbertConfig with the given parameters. Parameters ---------- {transformer_cfg_parameters} Returns ------- AlbertConfig An instance of AlbertConfig. """ # To account for target positions at inference mode, we extend the maximum sequence length. total_seq_length = total_seq_length + 2 return cls( hidden_size=d_model, num_attention_heads=n_head, num_hidden_layers=n_layer, hidden_act=hidden_act, intermediate_size=d_model * 4, hidden_dropout_prob=dropout, attention_probs_dropout_prob=dropout, max_position_embeddings=total_seq_length, embedding_size=d_model, # should be same as dimension of the input to ALBERT initializer_range=initializer_range, layer_norm_eps=layer_norm_eps, output_attentions=log_attention_weights, vocab_size=1, **kwargs, )
[docs]@transformer_registry.register("xlnet") @docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING) class XLNetConfig(T4RecConfig, transformers.XLNetConfig): """Subclass of T4RecConfig and transformers.XLNetConfig from Hugging Face. It handles configuration for XLNetConfig layers in the context of T4Rec models. """
[docs] @docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING) @classmethod def build( cls, d_model, n_head, n_layer, total_seq_length=None, attn_type="bi", hidden_act="gelu", initializer_range=0.01, layer_norm_eps=0.03, dropout=0.3, pad_token=0, log_attention_weights=False, mem_len=1, **kwargs ): """ Creates an instance of XLNetConfig with the given parameters. Parameters ---------- {transformer_cfg_parameters} mem_len: int, The number of tokens to be cached. Pre-computed key/value pairs from a previous forward pass are stored and won't be re-computed. This parameter is especially useful for long sequence modeling where different batches may truncate the entire sequence. Tasks like user-aware recommendation could benefit from this feature. By default, this parameter is set to 1, which means no caching is used. Returns ------- XLNetConfig An instance of XLNetConfig. """ return cls( d_model=d_model, d_inner=d_model * 4, n_layer=n_layer, n_head=n_head, attn_type=attn_type, ff_activation=hidden_act, initializer_range=initializer_range, layer_norm_eps=layer_norm_eps, dropout=dropout, pad_token_id=pad_token, output_attentions=log_attention_weights, vocab_size=1, mem_len=mem_len, **kwargs, )
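# Illustrative sketch (not part of this module): `attn_type` defaults to bi-directional
# ("bi") attention, and raising `mem_len` above its default of 1 enables the key/value
# caching described in the docstring. The values below are arbitrary examples.
#
#   xlnet_config = XLNetConfig.build(
#       d_model=192, n_head=4, n_layer=3, total_seq_length=20, mem_len=50
#   )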
[docs]@transformer_registry.register("bert") class BertConfig(T4RecConfig, transformers.BertConfig): """Subclass of T4RecConfig and transformers.BertConfig from Hugging Face. It handles configuration for BertConfig layers in the context of T4Rec models. """
[docs] @docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING) @classmethod def build( cls, d_model, n_head, n_layer, total_seq_length, hidden_act="gelu", initializer_range=0.01, layer_norm_eps=0.03, dropout=0.3, pad_token=0, log_attention_weights=False, **kwargs ): """ Creates an instance of BertConfig with the given parameters. Parameters ---------- {transformer_cfg_parameters} Returns ------- BertConfig An instance of BertConfig. """ # To account for target positions at inference mode, we extend the maximum sequence length. total_seq_length = total_seq_length + 2 return cls( hidden_size=d_model, num_hidden_layers=n_layer, num_attention_heads=n_head, hidden_act=hidden_act, initializer_range=initializer_range, layer_norm_eps=layer_norm_eps, dropout=dropout, pad_token_id=pad_token, output_attentions=log_attention_weights, max_position_embeddings=total_seq_length, vocab_size=1, **kwargs, )
[docs]@transformer_registry.register("roberta") class RobertaConfig(T4RecConfig, transformers.RobertaConfig): """Subclass of T4RecConfig and transformers.RobertaConfig from Hugging Face. It handles configuration for RobertaConfig layers in the context of T4Rec models. """
[docs] @docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING) @classmethod def build( cls, d_model, n_head, n_layer, total_seq_length, hidden_act="gelu", initializer_range=0.01, layer_norm_eps=0.03, dropout=0.3, pad_token=0, log_attention_weights=False, **kwargs ): """ Creates an instance of RobertaConfig with the given parameters. Parameters ---------- {transformer_cfg_parameters} Returns ------- RobertaConfig An instance of RobertaConfig. """ # To account for target positions at inference mode, we extend the maximum sequence length. total_seq_length = total_seq_length + 2 return cls( hidden_size=d_model, num_hidden_layers=n_layer, num_attention_heads=n_head, hidden_act=hidden_act, initializer_range=initializer_range, layer_norm_eps=layer_norm_eps, dropout=dropout, pad_token_id=pad_token, output_attentions=log_attention_weights, max_position_embeddings=total_seq_length, vocab_size=1, **kwargs, )
[docs]@transformer_registry.register("transfo-xl") class TransfoXLConfig(T4RecConfig, transformers.TransfoXLConfig): """Subclass of T4RecConfig and transformers. TransfoXLConfig from Hugging Face. It handles configuration for TransfoXLConfig layers in the context of T4Rec models. """
[docs] @docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING) @classmethod def build( cls, d_model, n_head, n_layer, total_seq_length, hidden_act="gelu", initializer_range=0.01, layer_norm_eps=0.03, dropout=0.3, pad_token=0, log_attention_weights=False, **kwargs ): """ Creates an instance of TransfoXLConfig with the given parameters. Parameters ---------- {transformer_cfg_parameters} Returns ------- TransfoXLConfig An instance of TransfoXLConfig. """ return cls( d_model=d_model, d_embed=d_model, n_layer=n_layer, n_head=n_head, d_inner=d_model * 4, hidden_act=hidden_act, untie_r=True, attn_type=0, initializer_range=initializer_range, layer_norm_eps=layer_norm_eps, dropout=dropout, pad_token_id=pad_token, output_attentions=log_attention_weights, vocab_size=1, # As the input_embeds will be fed in the forward function, limits the memory reserved by the internal input embedding table, which will not be used mem_len=1, # We do not use mems, because we feed the full sequence to the Transformer models and not sliding segments (which is useful for the long sequences in NLP. As setting mem_len to 0 leads to NaN in loss, we set it to one, to minimize the computing overhead) div_val=1, # Disables adaptative input (embeddings), because the embeddings are managed by TabularFeatures **kwargs, )
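# Illustrative sketch (not part of this module): additional architectures can be exposed
# through the same pattern, by registering a subclass of T4RecConfig together with the
# corresponding Hugging Face config class. The DistilBERT example below only shows the
# registration and `build()` convention; it does not imply that the rest of the T4Rec
# stack supports that architecture.
#
#   @transformer_registry.register("distilbert")
#   class DistilBertConfig(T4RecConfig, transformers.DistilBertConfig):
#       @classmethod
#       def build(cls, d_model, n_head, n_layer, total_seq_length, dropout=0.3, **kwargs):
#           return cls(
#               dim=d_model,
#               hidden_dim=d_model * 4,
#               n_heads=n_head,
#               n_layers=n_layer,
#               dropout=dropout,
#               attention_dropout=dropout,
#               max_position_embeddings=total_seq_length,
#               vocab_size=1,
#               **kwargs,
#           )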