Source code for merlin.models.utils.schema_utils

#
# Copyright (c) 2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import math
import os
from typing import Dict, Optional

import numpy as np

from merlin.schema import ColumnSchema, Schema, Tags, TagsType
from merlin.schema.io.tensorflow_metadata import TensorflowMetadata


[docs]def select_targets(schema: Schema, extra_tags: Optional[TagsType] = None) -> Schema: out = schema.select_by_tag(Tags.BINARY_CLASSIFICATION) out += schema.select_by_tag(Tags.TARGET) out += schema.select_by_tag(Tags.REGRESSION) if extra_tags: out += schema.select_by_tag(extra_tags) return out
[docs]def schema_to_tensorflow_metadata_json(schema, path=None): json = TensorflowMetadata.from_merlin_schema(schema).to_json() if path: with open(path, "w") as o: o.write(json) return json
[docs]def tensorflow_metadata_json_to_schema(value): if os.path.isfile(value): value = open(value).read() return TensorflowMetadata.from_json(value).to_merlin_schema()
[docs]def create_categorical_column( name, num_items, dtype=np.int32, domain_name=None, tags=None, properties=None, min_value_count=None, max_value_count=None, ): properties = properties or {} if not domain_name: domain_name = name if num_items: properties["domain"] = {"name": domain_name, "min": 0, "max": num_items} is_list, is_ragged = False, False value_count = {} if min_value_count is not None: value_count["min"] = min_value_count if max_value_count is not None: value_count["max"] = max_value_count if value_count: properties["value_count"] = value_count is_list = True is_ragged = min_value_count != max_value_count tags = tags or [] if Tags.CATEGORICAL not in tags: tags.append(Tags.CATEGORICAL) return ColumnSchema( name=name, tags=tags, dtype=dtype, properties=properties, is_list=is_list, is_ragged=is_ragged, )
[docs]def create_continuous_column( name, dtype=np.float32, tags=None, properties=None, min_value=None, max_value=None, ): properties = properties or {} domain = {} if min_value is not None: domain["min"] = min_value if max_value is not None: domain["max"] = max_value if domain: properties["domain"] = domain tags = tags or [] if Tags.CONTINUOUS not in tags: tags.append(Tags.CONTINUOUS) return ColumnSchema(name=name, tags=tags, properties=properties, dtype=dtype)
[docs]def filter_dict_by_schema(input_dict, schema): """Filters out entries from input_dict, returns a dictionary where every entry corresponds to a column in the schema""" column_names = set(schema.column_names) return {k: v for k, v in input_dict.items() if k in column_names}
[docs]def categorical_cardinalities(schema) -> Dict[str, int]: outputs = {} for col in schema: if Tags.CATEGORICAL in col.tags: domain = col.int_domain if domain: outputs[col.name] = domain.max + 1 return outputs
[docs]def categorical_domains(schema) -> Dict[str, str]: outputs = {} for col in schema: if Tags.CATEGORICAL in col.tags: domain = col.int_domain name = col.name if domain and domain.name: name = domain.name outputs[col.name] = name return outputs
[docs]def get_embedding_sizes_from_schema( schema: Schema, multiplier: float = 2.0, ensure_multiple_of_8: bool = False ) -> Dict[str, int]: """Provides a heristic (from Google) that suggests the embedding sizes as a function (forth root) of categorical features cardinalities, obtained from the schema. Parameters ---------- schema : Schema Featires schema multiplier : float, optional Multiplier to be applied on the forth root of the cardinality. Google recommends multiplier in the [2.0,10.0] range, by default 2.0 ensure_multiple_of_8 : bool, optional If enabled, adjusts the embedding dim to the smallest greater number multiple of 8, to ensure best performance with GPU ops, by default False Returns ------- Dict[str, int] A dict with the feature names and the suggested embedding sizes based on the features cardinalities obtained from the schema """ cardinalities = categorical_cardinalities(schema) return { key: get_embedding_size_from_cardinality(val, multiplier, ensure_multiple_of_8) for key, val in cardinalities.items() }
def col_is_list(col: ColumnSchema) -> bool: return Tags.SEQUENCE in col.tags or Tags.LIST in col.tags or col.is_list
[docs]def get_embedding_size_from_cardinality( cardinality: int, multiplier: float = 2.0, ensure_multiple_of_8: bool = False ) -> int: """Provides a heuristic (from Google) that suggests the embedding dimension as a function (forth root) of the feature cardinality. Parameters ---------- cardinality : int The number of unique values of a categorical feature multiplier : float, optional Multiplier to be applied on the forth root of the cardinality. Google recommends multiplier in the [2.0,10.0] range, by default 2.0 ensure_multiple_of_8 : bool, optional If enabled, adjusts the embedding dim to the smallest greater number multiple of 8, to ensure best performance with GPU ops, by default False Returns ------- int The suggested embedding dimension based on the feature cardinality """ # A rule-of-thumb from Google. embedding_size = int(math.ceil(math.pow(cardinality, 0.25) * multiplier)) if ensure_multiple_of_8: embedding_size = int(math.ceil((embedding_size / 8)) * 8) return embedding_size
def infer_embedding_dim( col_schema: ColumnSchema, multiplier: float = 2.0, ensure_multiple_of_8: bool = True ): cardinality = col_schema.int_domain.max + 1 return get_embedding_size_from_cardinality( cardinality, multiplier=multiplier, ensure_multiple_of_8=ensure_multiple_of_8 )