merlin.schema package

merlin.schema package#

class merlin.schema.Schema(column_schemas=None)[source]#

Bases: object

A collection of column schemas for a dataset.

property column_names#

select(selector) → Schema[source]#

Select matching columns from this Schema object using a ColumnSelector

Parameters:: selector (ColumnSelector) – Selector that describes which columns match
Returns:: New object containing only the ColumnSchemas of selected columns
Return type:: Schema

apply(selector) → Schema[source]#

excluding(selector) → Schema[source]#

Select non-matching columns from this Schema object using a ColumnSelector

Parameters:: selector (ColumnSelector) – Selector that describes which columns match
Returns:: New object containing only the ColumnSchemas of columns that do not match the selector
Return type:: Schema

apply_inverse(selector) → Schema[source]#

select_by_tag(tags: str | Tags | List[str | Tags], pred_fn=None) → Schema[source]#

Select columns from this Schema that match ANY of the supplied tags.

Parameters:

tags (List[Union[str, Tags]]) – List of tags that describes which columns match
pred_fn (any or all) – Predicate function that decides whether a column is selected. It receives an iterable of bool values indicating whether each of the provided tags is present on a column schema. Returning True selects the column; returning False excludes it.

Returns:

New object containing only the ColumnSchemas of selected columns

Return type:

Schema

excluding_by_tag(tags) → Schema[source]#

remove_by_tag(tags) → Schema[source]#

select_by_name(names: List[str]) → Schema[source]#

Select matching columns from this Schema object using a list of column names

Parameters:: names (List[str]) – List of column names that describes which columns match
Returns:: New object containing only the ColumnSchemas of selected columns
Return type:: Schema

excluding_by_name(col_names: List[str])[source]#

Remove columns from this Schema object by name

Parameters:: col_names (List[str]) – Names of the columns to remove
Returns:: New Schema object after the columns are removed
Return type:: Schema

remove_col(col_name: str) → Schema[source]#

Remove a column from this Schema object by name

Parameters:: col_name (str) – Name of the column to remove
Returns:: This Schema object after the column is removed
Return type:: Schema

without(col_names: List[str]) → Schema[source]#

get(col_name: str, default: ColumnSchema | None = None) → ColumnSchema[source]#

Get a ColumnSchema by name

Parameters:

col_name (str) – Name of the column to get
default (ColumnSchema) –

Default value to return if column is not found.
(Default value = None)

Returns:

Retrieved column schema (or default value, if not found)

Return type:

ColumnSchema

property first: ColumnSchema#

Returns the first ColumnSchema in the Schema. Useful for cases where you select down to a single column via select_by_name or select_by_tag and just want that column's ColumnSchema.

Returns:: The first column schema present in this Schema object
Return type:: ColumnSchema
Raises:: ValueError – If this Schema object contains no column schemas

to_pandas() → DataFrame[source]#

Convert this Schema object to a pandas DataFrame

Returns:: DataFrame containing the column schemas in this Schema object
Return type:: pd.DataFrame

copy() → Schema[source]#: Return a copy of the schema

class merlin.schema.ColumnSchema(name: str, tags: ~merlin.schema.tags.TagSet | ~typing.List[str | ~merlin.schema.tags.Tags] | None = <factory>, properties: ~typing.Dict | None = <factory>, dtype: ~merlin.dtypes.base.DType | None = None, is_list: bool | None = None, is_ragged: bool | None = None, dims: dataclasses.InitVar[typing.Union[typing.Tuple, merlin.dtypes.shape.Shape]] = None)[source]#

Bases: object

A schema containing metadata of a dataframe column.

name: str#

tags: TagSet | List[str | Tags] | None#

properties: Dict | None#

dtype: DType | None = None#

is_list: bool | None = None#

is_ragged: bool | None = None#

dims: dataclasses.InitVar[Union[Tuple, merlin.dtypes.shape.Shape]] = None#

property shape#

with_name(name: str) → ColumnSchema[source]#

Create a copy of this ColumnSchema object with a different column name

Parameters:: name (str) – New column name
Returns:: Copied object with new column name
Return type:: ColumnSchema

with_tags(tags: str | Tags) → ColumnSchema[source]#

Create a copy of this ColumnSchema object with different column tags

Parameters:: tags (Union[str, Tags]) – New column tags
Returns:: Copied object with new column tags
Return type:: ColumnSchema

with_properties(properties: dict) → ColumnSchema[source]#

Create a copy of this ColumnSchema object with different column properties

Parameters:: properties (dict) – New column properties
Returns:: Copied object with new column properties
Return type:: ColumnSchema
Raises:: TypeError – If properties are not a dict

with_dtype(dtype, is_list: bool | None = None, is_ragged: bool | None = None) → ColumnSchema[source]#

Create a copy of this ColumnSchema object with different column dtype

Parameters:

dtype (np.dtype) – New column dtype
is_list (bool) –

Whether rows in this column contain lists.
(Default value = None)
is_ragged (bool) –

Whether lists in this column have varying lengths.
(Default value = None)

Returns:

Copied object with new column dtype

Return type:

ColumnSchema

with_shape(shape: Tuple | Shape) → ColumnSchema[source]#

Create a copy of this object with a new shape

Parameters:: shape (Union[Tuple, Shape]) – Object to set as shape, must be either a tuple or Shape.
Returns:: A copy of this object containing the provided shape value
Return type:: ColumnSchema
Raises:: TypeError – If value is not either a tuple or a Shape

property int_domain: Domain | None#

property float_domain: Domain | None#

property value_count: Domain | None#

class merlin.schema.Tags(value)[source]#

Bases: Enum

Standard tags used in the Merlin ecosystem

CATEGORICAL = 'categorical'#

CONTINUOUS = 'continuous'#

LIST = 'list'#

SEQUENCE = 'sequence'#

TEXT = 'text'#

TOKENIZED = 'tokenized'#

TIME = 'time'#

EMBEDDING = 'embedding'#

ID = 'id'#

USER = 'user'#

ITEM = 'item'#

SESSION = 'session'#

CONTEXT = 'context'#

TARGET = 'target'#

REGRESSION = 'regression'#

CLASSIFICATION = 'classification'#

BINARY = 'binary'#

MULTI_CLASS = 'multi_class'#

USER_ID = 'user_id'#

ITEM_ID = 'item_id'#

SESSION_ID = 'session_id'#

TEXT_TOKENIZED = 'text_tokenized'#

BINARY_CLASSIFICATION = 'binary_classification'#

MULTI_CLASS_CLASSIFICATION = 'multi_class_classification'#

merlin.schema package

Contents

merlin.schema package#