#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import transformers
from merlin.models.utils.doc_utils import docstring_parameter
from merlin.models.utils.registry import Registry
transformer_registry: Registry = Registry("transformers")
TRANSFORMER_CONFIG_PARAMETER_DOCSTRING = """
d_model: int
The hidden dimension of the transformer layer.
n_head: int
The number of attention heads in each transformer layer.
n_layer: int
The number of transformer layers to stack.
total_seq_length: int
The maximum sequence length.
hidden_act: str, optional
The activation function in the hidden layers.
By default 'gelu'
initializer_range: float, optional
The standard deviation of the `truncated_normal_initializer`
for initializing all of the transformer's weight parameters.
By default 0.01
layer_norm_eps: float, optional
The epsilon used by the layer normalization layers.
By default 0.03
dropout: float, optional
The dropout probability. By default 0.3
pad_token: int, optional
The padding token ID. By default 0
log_attention_weights: bool, optional
Whether to log attention weights. By default False
"""
class T4RecConfig:
"""A class responsible for setting the configuration of the transformers class
from Hugging Face and returning the corresponding T4Rec model.
"""
def to_huggingface_torch_model(self):
"""
Instantiate a Hugging Face transformer model based on
the configuration parameters of the class.
Returns
-------
transformers.PreTrainedModel
The Hugging Face transformer model.
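Examples
--------
A minimal usage sketch; the configuration values below are illustrative only:
>>> config = XLNetConfig.build(d_model=64, n_head=8, n_layer=2, total_seq_length=20)
>>> hf_model = config.to_huggingface_torch_model()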
"""
model_cls = transformers.MODEL_MAPPING[self.transformers_config_cls]
return model_cls(self)
def to_torch_model(
self,
input_features,
*prediction_task,
task_blocks=None,
task_weights=None,
loss_reduction="mean",
**kwargs
):
"""Links the Hugging Face transformer model to the given input block and prediction tasks,
and returns a T4Rec model.
Parameters
----------
input_features: torch4rec.TabularSequenceFeatures
The sequential block that represents the input features and
defines the masking strategy for training and evaluation.
prediction_task: torch4rec.PredictionTask
One or multiple prediction tasks.
task_blocks: list, optional
List of task-specific blocks that we apply on top of the HF transformer's output.
task_weights: list, optional
List of weights to use for combining the task losses.
loss_reduction: str, optional
The reduction to apply to the prediction losses. Possible values are:
'none': no reduction will be applied,
'mean': the weighted mean of the output is taken,
'sum': the output will be summed.
By default: 'mean'.
Returns
-------
torch4rec.Model
The T4Rec torch model.
Raises
------
ValueError
If input block or prediction task is of the wrong type.
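Examples
--------
A sketch of the intended flow; ``input_features`` and ``next_item_task`` are
assumed to be pre-built ``torch4rec.TabularSequenceFeatures`` and
``torch4rec.PredictionTask`` objects created elsewhere:
>>> config = XLNetConfig.build(d_model=64, n_head=8, n_layer=2, total_seq_length=20)
>>> model = config.to_torch_model(input_features, next_item_task)  # doctest: +SKIP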
"""
from .. import torch as torch4rec
if not isinstance(input_features, torch4rec.TabularSequenceFeatures):
raise ValueError("`input_features` must be an instance of TabularSequenceFeatures")
if not all(isinstance(t, torch4rec.PredictionTask) for t in prediction_task):
raise ValueError(
"`task` is of the wrong type, please provide one or multiple "
"instance(s) of PredictionTask"
)
body = torch4rec.SequentialBlock(
input_features, torch4rec.TransformerBlock(self, masking=input_features.masking)
)
return torch4rec.Head(
body,
*prediction_task,
task_blocks=task_blocks,
task_weights=task_weights,
loss_reduction=loss_reduction,
).to_model(**kwargs)
@property
def transformers_config_cls(self):
return self.__class__.__bases__[1]
@classmethod
def build(cls, *args, **kwargs):
raise NotImplementedError
[docs]@transformer_registry.register("gtp2")
@docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING)
class GPT2Config(T4RecConfig, transformers.GPT2Config):
"""Subclass of T4RecConfig and transformers.GPT2Config from Hugging Face.
It handles configuration for GPT2 layers in the context of T4Rec models.
"""
@classmethod
def build(
cls,
d_model,
n_head,
n_layer,
total_seq_length,
hidden_act="gelu",
initializer_range=0.01,
layer_norm_eps=0.03,
dropout=0.3,
pad_token=0,
log_attention_weights=False,
**kwargs
):
"""
Creates an instance of GPT2Config with the given parameters.
Parameters
----------
{transformer_cfg_parameters}
Returns
-------
GPT2Config
An instance of GPT2Config.
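Examples
--------
A usage sketch; the values below are illustrative, not defaults:
>>> config = GPT2Config.build(d_model=64, n_head=4, n_layer=2, total_seq_length=20)
>>> gpt2_model = config.to_huggingface_torch_model()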
"""
return cls(
n_embd=d_model,
n_inner=d_model * 4,
n_layer=n_layer,
n_head=n_head,
activation_function=hidden_act,
initializer_range=initializer_range,
layer_norm_eps=layer_norm_eps,
resid_pdrop=dropout,
embd_pdrop=dropout,
attn_pdrop=dropout,
n_positions=total_seq_length,
n_ctx=total_seq_length,
output_attentions=log_attention_weights,
vocab_size=1,
**kwargs,
)
[docs]@transformer_registry.register("electra")
class ElectraConfig(T4RecConfig, transformers.ElectraConfig):
"""Subclass of T4RecConfig and transformers.ElectraConfig from Hugging Face.
It handles configuration for Electra layers in the context of T4Rec models.
"""
@docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING)
@classmethod
def build(
cls,
d_model,
n_head,
n_layer,
total_seq_length,
hidden_act="gelu",
initializer_range=0.01,
layer_norm_eps=0.03,
dropout=0.3,
pad_token=0,
log_attention_weights=False,
**kwargs
):
"""
Creates an instance of ElectraConfig with the given parameters.
Parameters
----------
{transformer_cfg_parameters}
Returns
-------
ElectraConfig
An instance of ElectraConfig.
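Examples
--------
A usage sketch; the values below are illustrative, not defaults:
>>> config = ElectraConfig.build(d_model=64, n_head=4, n_layer=2, total_seq_length=20)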
"""
# To account for target positions at inference time, we extend the maximum sequence length.
total_seq_length = total_seq_length + 2
return cls(
hidden_size=d_model,
embedding_size=d_model,
num_hidden_layers=n_layer,
num_attention_heads=n_head,
intermediate_size=d_model * 4,
hidden_act=hidden_act,
initializer_range=initializer_range,
layer_norm_eps=layer_norm_eps,
hidden_dropout_prob=dropout,
max_position_embeddings=total_seq_length,
pad_token_id=pad_token,
output_attentions=log_attention_weights,
vocab_size=1,
**kwargs,
)
[docs]@transformer_registry.register("albert")
class AlbertConfig(T4RecConfig, transformers.AlbertConfig):
"""Subclass of T4RecConfig and transformers.AlbertConfig from Hugging Face.
It handles configuration for ALBERT layers in the context of T4Rec models.
"""
@docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING)
@classmethod
def build(
cls,
d_model,
n_head,
n_layer,
total_seq_length,
hidden_act="gelu",
initializer_range=0.01,
layer_norm_eps=0.03,
dropout=0.3,
pad_token=0,
log_attention_weights=False,
**kwargs
):
"""
Creates an instance of AlbertConfig with the given parameters.
Parameters
----------
{transformer_cfg_parameters}
Returns
-------
AlbertConfig
An instance of AlbertConfig.
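Examples
--------
A usage sketch; the values below are illustrative, not defaults:
>>> config = AlbertConfig.build(d_model=64, n_head=4, n_layer=2, total_seq_length=20)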
"""
# To account for target positions at inference time, we extend the maximum sequence length.
total_seq_length = total_seq_length + 2
return cls(
hidden_size=d_model,
num_attention_heads=n_head,
num_hidden_layers=n_layer,
hidden_act=hidden_act,
intermediate_size=d_model * 4,
hidden_dropout_prob=dropout,
attention_probs_dropout_prob=dropout,
max_position_embeddings=total_seq_length,
embedding_size=d_model, # should be same as dimension of the input to ALBERT
initializer_range=initializer_range,
layer_norm_eps=layer_norm_eps,
output_attentions=log_attention_weights,
vocab_size=1,
**kwargs,
)
[docs]@transformer_registry.register("xlnet")
@docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING)
class XLNetConfig(T4RecConfig, transformers.XLNetConfig):
"""Subclass of T4RecConfig and transformers.XLNetConfig from Hugging Face.
It handles configuration for XLNet layers in the context of T4Rec models.
"""
@docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING)
@classmethod
def build(
cls,
d_model,
n_head,
n_layer,
total_seq_length=None,
attn_type="bi",
hidden_act="gelu",
initializer_range=0.01,
layer_norm_eps=0.03,
dropout=0.3,
pad_token=0,
log_attention_weights=False,
mem_len=1,
**kwargs
):
"""
Creates an instance of XLNetConfig with the given parameters.
Parameters
----------
{transformer_cfg_parameters}
mem_len: int, optional
The number of tokens to be cached. Pre-computed key/value pairs
from a previous forward pass are stored and won't be re-computed.
This parameter is especially useful for long sequence modeling where
different batches may truncate the entire sequence.
Tasks like user-aware recommendation could benefit from this feature.
By default, this parameter is set to 1, which means no caching is used.
Returns
-------
XLNetConfig
An instance of XLNetConfig.
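Examples
--------
A usage sketch; the values below are illustrative, not defaults, and ``mem_len``
only needs to be set when key/value caching is desired:
>>> config = XLNetConfig.build(
...     d_model=64, n_head=8, n_layer=2, total_seq_length=20, mem_len=50
... )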
"""
return cls(
d_model=d_model,
d_inner=d_model * 4,
n_layer=n_layer,
n_head=n_head,
attn_type=attn_type,
ff_activation=hidden_act,
initializer_range=initializer_range,
layer_norm_eps=layer_norm_eps,
dropout=dropout,
pad_token_id=pad_token,
output_attentions=log_attention_weights,
vocab_size=1,
mem_len=mem_len,
**kwargs,
)
[docs]@transformer_registry.register("bert")
class BertConfig(T4RecConfig, transformers.BertConfig):
"""Subclass of T4RecConfig and transformers.BertConfig from Hugging Face.
It handles configuration for BERT layers in the context of T4Rec models.
"""
@docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING)
@classmethod
def build(
cls,
d_model,
n_head,
n_layer,
total_seq_length,
hidden_act="gelu",
initializer_range=0.01,
layer_norm_eps=0.03,
dropout=0.3,
pad_token=0,
log_attention_weights=False,
**kwargs
):
"""
Creates an instance of BertConfig with the given parameters.
Parameters
----------
{transformer_cfg_parameters}
Returns
-------
BertConfig
An instance of BertConfig.
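Examples
--------
A usage sketch; the values below are illustrative, not defaults:
>>> config = BertConfig.build(d_model=64, n_head=4, n_layer=2, total_seq_length=20)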
"""
# To account for target positions at inference time, we extend the maximum sequence length.
total_seq_length = total_seq_length + 2
return cls(
hidden_size=d_model,
num_hidden_layers=n_layer,
num_attention_heads=n_head,
hidden_act=hidden_act,
initializer_range=initializer_range,
layer_norm_eps=layer_norm_eps,
dropout=dropout,
pad_token_id=pad_token,
output_attentions=log_attention_weights,
max_position_embeddings=total_seq_length,
vocab_size=1,
**kwargs,
)
[docs]@transformer_registry.register("roberta")
class RobertaConfig(T4RecConfig, transformers.RobertaConfig):
"""Subclass of T4RecConfig and transformers.RobertaConfig from Hugging Face.
It handles configuration for RoBERTa layers in the context of T4Rec models.
"""
@docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING)
@classmethod
def build(
cls,
d_model,
n_head,
n_layer,
total_seq_length,
hidden_act="gelu",
initializer_range=0.01,
layer_norm_eps=0.03,
dropout=0.3,
pad_token=0,
log_attention_weights=False,
**kwargs
):
"""
Creates an instance of RobertaConfig with the given parameters.
Parameters
----------
{transformer_cfg_parameters}
Returns
-------
RobertaConfig
An instance of RobertaConfig.
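Examples
--------
A usage sketch; the values below are illustrative, not defaults:
>>> config = RobertaConfig.build(d_model=64, n_head=4, n_layer=2, total_seq_length=20)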
"""
# To account for target positions at inference time, we extend the maximum sequence length.
total_seq_length = total_seq_length + 2
return cls(
hidden_size=d_model,
num_hidden_layers=n_layer,
num_attention_heads=n_head,
hidden_act=hidden_act,
initializer_range=initializer_range,
layer_norm_eps=layer_norm_eps,
dropout=dropout,
pad_token_id=pad_token,
output_attentions=log_attention_weights,
max_position_embeddings=total_seq_length,
vocab_size=1,
**kwargs,
)
[docs]@transformer_registry.register("transfo-xl")
class TransfoXLConfig(T4RecConfig, transformers.TransfoXLConfig):
"""Subclass of T4RecConfig and transformers. TransfoXLConfig from Hugging Face.
It handles configuration for Transformer-XL layers in the context of T4Rec models.
"""
@docstring_parameter(transformer_cfg_parameters=TRANSFORMER_CONFIG_PARAMETER_DOCSTRING)
@classmethod
def build(
cls,
d_model,
n_head,
n_layer,
total_seq_length,
hidden_act="gelu",
initializer_range=0.01,
layer_norm_eps=0.03,
dropout=0.3,
pad_token=0,
log_attention_weights=False,
**kwargs
):
"""
Creates an instance of TransfoXLConfig with the given parameters.
Parameters
----------
{transformer_cfg_parameters}
Returns
-------
TransfoXLConfig
An instance of TransfoXLConfig.
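Examples
--------
A usage sketch; the values below are illustrative, not defaults:
>>> config = TransfoXLConfig.build(d_model=64, n_head=4, n_layer=2, total_seq_length=20)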
"""
return cls(
d_model=d_model,
d_embed=d_model,
n_layer=n_layer,
n_head=n_head,
d_inner=d_model * 4,
hidden_act=hidden_act,
untie_r=True,
attn_type=0,
initializer_range=initializer_range,
layer_norm_eps=layer_norm_eps,
dropout=dropout,
pad_token_id=pad_token,
output_attentions=log_attention_weights,
# As the input_embeds will be fed in the forward function, vocab_size=1 limits
# the memory reserved by the internal input embedding table, which is not used.
vocab_size=1,
# We do not use mems because we feed the full sequence to the transformer rather
# than sliding segments (useful for long NLP sequences); as mem_len=0 leads to
# NaN in the loss, we set it to 1 to minimize the computing overhead.
mem_len=1,
# Disables adaptive input embeddings, because the embeddings are managed by
# TabularFeatures.
div_val=1,
**kwargs,
)