merlin.schema package#

class merlin.schema.Schema(column_schemas=None)[source]#

Bases: object

A collection of column schemas for a dataset.

property column_names#
select(selector) merlin.schema.schema.Schema[source]#

Select matching columns from this Schema object using a ColumnSelector

Parameters

selector (ColumnSelector) – Selector that describes which columns match

Returns

New object containing only the ColumnSchemas of selected columns

Return type

Schema

apply(selector) merlin.schema.schema.Schema[source]#
excluding(selector) merlin.schema.schema.Schema[source]#

Select non-matching columns from this Schema object using a ColumnSelector

Parameters

selector (ColumnSelector) – Selector that describes which columns match

Returns

New object containing only the ColumnSchemas of the non-matching columns

Return type

Schema

apply_inverse(selector) merlin.schema.schema.Schema[source]#
select_by_tag(tags: Union[str, merlin.schema.tags.Tags, List[Union[str, merlin.schema.tags.Tags]]]) merlin.schema.schema.Schema[source]#

Select matching columns from this Schema object using a list of tags

Parameters

tags (Union[str, Tags, List[Union[str, Tags]]]) – Tag or list of tags that describes which columns match

Returns

New object containing only the ColumnSchemas of selected columns

Return type

Schema

excluding_by_tag(tags) merlin.schema.schema.Schema[source]#
remove_by_tag(tags) merlin.schema.schema.Schema[source]#
select_by_name(names: List[str]) merlin.schema.schema.Schema[source]#

Select matching columns from this Schema object using a list of column names

Parameters

names (List[str]) – List of column names that describes which columns match

Returns

New object containing only the ColumnSchemas of selected columns

Return type

Schema

excluding_by_name(col_names: List[str])[source]#

Remove columns from this Schema object by name

Parameters

col_names (List[str]) – Names of the columns to remove

Returns

New Schema object after the columns are removed

Return type

Schema

remove_col(col_name: str) merlin.schema.schema.Schema[source]#

Remove a column from this Schema object by name

Parameters

col_name (str) – Name of the column to remove

Returns

This Schema object after the column is removed

Return type

Schema

without(col_names: List[str]) merlin.schema.schema.Schema[source]#
get(col_name: str, default: Optional[merlin.schema.schema.ColumnSchema] = None) merlin.schema.schema.ColumnSchema[source]#

Get a ColumnSchema by name

Parameters
  • col_name (str) – Name of the column to get

  • default (ColumnSchema) –

    Default value to return if column is not found.

    (Default value = None)

Returns

Retrieved column schema (or default value, if not found)

Return type

ColumnSchema

property first: merlin.schema.schema.ColumnSchema#

Returns the first ColumnSchema in the Schema. Useful for cases where you select down to a single column via select_by_name or select_by_tag and just want that column's schema.

Returns

The first column schema present in this Schema object

Return type

ColumnSchema

Raises

ValueError – If this Schema object contains no column schemas

to_pandas() pandas.core.frame.DataFrame[source]#

Convert this Schema object to a pandas DataFrame

Returns

DataFrame containing the column schemas in this Schema object

Return type

pd.DataFrame

class merlin.schema.ColumnSchema(name: str, tags: typing.Optional[merlin.schema.tags.TagSet] = <factory>, properties: typing.Optional[typing.Dict] = <factory>, dtype: typing.Optional[object] = None, is_list: typing.Optional[bool] = None, is_ragged: typing.Optional[bool] = None)[source]#

Bases: object

A schema containing metadata of a dataframe column.

name: str#
tags: Optional[merlin.schema.tags.TagSet]#
properties: Optional[Dict]#
dtype: Optional[object] = None#
is_list: Optional[bool] = None#
is_ragged: Optional[bool] = None#
property quantity#

Describes the number of elements in each row of this column

Returns

SCALAR when there is one element per row; FIXED_LIST when there is the same number of elements per row; RAGGED_LIST when there are different numbers of elements per row

Return type

ColumnQuantity

with_name(name: str) merlin.schema.schema.ColumnSchema[source]#

Create a copy of this ColumnSchema object with a different column name

Parameters

name (str) – New column name

Returns

Copied object with new column name

Return type

ColumnSchema

with_tags(tags: Union[str, merlin.schema.tags.Tags]) merlin.schema.schema.ColumnSchema[source]#

Create a copy of this ColumnSchema object with different column tags

Parameters

tags (Union[str, Tags]) – New column tags

Returns

Copied object with new column tags

Return type

ColumnSchema

with_properties(properties: dict) merlin.schema.schema.ColumnSchema[source]#

Create a copy of this ColumnSchema object with different column properties

Parameters

properties (dict) – New column properties

Returns

Copied object with new column properties

Return type

ColumnSchema

Raises

TypeError – If properties are not a dict

with_dtype(dtype, is_list: Optional[bool] = None, is_ragged: Optional[bool] = None) merlin.schema.schema.ColumnSchema[source]#

Create a copy of this ColumnSchema object with different column dtype

Parameters
  • dtype (np.dtype) – New column dtype

  • is_list (bool) –

    Whether rows in this column contain lists.

    (Default value = None)

  • is_ragged (bool) –

    Whether lists in this column have varying lengths.

    (Default value = None)

Returns

Copied object with new column dtype

Return type

ColumnSchema

property int_domain: Optional[merlin.schema.schema.Domain]#
property float_domain: Optional[merlin.schema.schema.Domain]#
property value_count: Optional[merlin.schema.schema.Domain]#
class merlin.schema.Tags(value)[source]#

Bases: enum.Enum

Standard tags used in the Merlin ecosystem

CATEGORICAL = 'categorical'#
CONTINUOUS = 'continuous'#
LIST = 'list'#
SEQUENCE = 'sequence'#
TEXT = 'text'#
TOKENIZED = 'tokenized'#
TIME = 'time'#
ID = 'id'#
USER = 'user'#
ITEM = 'item'#
SESSION = 'session'#
CONTEXT = 'context'#
TARGET = 'target'#
REGRESSION = 'regression'#
CLASSIFICATION = 'classification'#
BINARY = 'binary'#
MULTI_CLASS = 'multi_class'#
USER_ID = 'user_id'#
ITEM_ID = 'item_id'#
SESSION_ID = 'session_id'#
TEXT_TOKENIZED = 'text_tokenized'#
BINARY_CLASSIFICATION = 'binary_classification'#
MULTI_CLASS_CLASSIFICATION = 'multi_class_classification'#