Source code for nvtabular.ops.fill

#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import dask.dataframe as dd

from nvtabular.columns import Schema
from nvtabular.dispatch import DataFrameType, annotate

from .operator import ColumnSelector, Operator
from .stat_operator import StatOperator


[docs]class FillMissing(Operator):
    """
    This operation replaces missing values with a constant pre-defined value

    Example usage::

        # Use FillMissing to define a workflow for continuous columns and specify the fill value
        # Default is 0
        cont_features = ['cont1', 'cont2', 'cont3'] >> ops.FillMissing() >> ...
        processor = nvtabular.Workflow(cont_features)

    Parameters
    -----------
    fill_val : float, default 0
        The constant value to replace missing values with.
    add_binary_cols : boolean, default False
        When True, adds binary columns that indicate whether cells in each column were filled
    """

    def __init__(self, fill_val=0, add_binary_cols=False):
        super().__init__()
        self.fill_val = fill_val
        self.add_binary_cols = add_binary_cols
        self._inference_transform = None

[docs]    @annotate("FillMissing_op", color="darkgreen", domain="nvt_python")
    def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType:
        if self.add_binary_cols:
            for col in col_selector.names:
                df[f"{col}_filled"] = df[col].isna()
                df[col] = df[col].fillna(self.fill_val)
        else:
            df[col_selector.names] = df[col_selector.names].fillna(self.fill_val)

        return df

[docs]    def inference_initialize(self, col_selector, inference_config):
        """load up extra configuration about this op."""
        if self.add_binary_cols:
            return None
        import nvtabular_cpp

        return nvtabular_cpp.inference.FillTransform(self)

    transform.__doc__ = Operator.transform.__doc__

[docs]    def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
        if not col_selector:
            col_selector = ColumnSelector(input_schema.column_names)
        if col_selector.tags:
            tags_col_selector = ColumnSelector(tags=col_selector.tags)
            filtered_schema = input_schema.apply(tags_col_selector)
            col_selector += ColumnSelector(filtered_schema.column_names)

            # zero tags because already filtered
            col_selector._tags = []
        output_schema = Schema()
        for column_name in col_selector.names:
            column_schema = input_schema.column_schemas[column_name]
            output_schema += Schema([self.transformed_schema(column_schema)])
            if self.add_binary_cols:
                column_schema = column_schema.with_name(f"{column_name}_filled")
                output_schema += Schema([column_schema])
        return output_schema

[docs]    def output_column_names(self, col_selector: ColumnSelector) -> ColumnSelector:
        output_cols = col_selector.names[:]
        if self.add_binary_cols:
            output_cols.extend([f"{col}_filled" for col in col_selector.names])
        return ColumnSelector(output_cols)


[docs]class FillMedian(StatOperator):
    """
    This operation replaces missing values with the median value for the column.

    Example usage::

        # Use FillMedian in a workflow for continuous columns
        cont_features = ['cont1', 'cont2', 'cont3'] >> ops.FillMedian()
        processor = nvtabular.Workflow(cont_features)

    Parameters
    -----------
    add_binary_cols : boolean, default False
        When True, adds binary columns that indicate whether cells in each column were filled
    """

    def __init__(self, add_binary_cols=False):
        super().__init__()
        self.add_binary_cols = add_binary_cols
        self.medians = {}

[docs]    @annotate("FillMedian_transform", color="darkgreen", domain="nvt_python")
    def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType:
        if not self.medians:
            raise RuntimeError("need to call 'fit' before running transform")

        for col in col_selector.names:
            if self.add_binary_cols:
                df[f"{col}_filled"] = df[col].isna()
            df[col] = df[col].fillna(self.medians[col])
        return df

[docs]    @annotate("FillMedian_fit", color="green", domain="nvt_python")
    def fit(self, col_selector: ColumnSelector, ddf: dd.DataFrame):
        # TODO: Use `method="tidigest"` when crick supports device
        dask_stats = ddf[col_selector.names].quantile(q=0.5, method="dask")
        return dask_stats

[docs]    @annotate("FillMedian_finalize", color="green", domain="nvt_python")
    def fit_finalize(self, dask_stats):
        index = dask_stats.index
        vals = index.values_host if hasattr(index, "values_host") else index.values
        for col in vals:
            self.medians[col] = float(dask_stats[col])

    transform.__doc__ = Operator.transform.__doc__
    fit.__doc__ = StatOperator.fit.__doc__
    fit_finalize.__doc__ = StatOperator.fit_finalize.__doc__

[docs]    def clear(self):
        self.medians = {}

[docs]    def compute_output_schema(self, input_schema: Schema, col_selector: ColumnSelector) -> Schema:
        if not col_selector:
            col_selector = ColumnSelector(input_schema.column_names)
        output_schema = Schema()
        for column_name in col_selector.names:
            column_schema = input_schema.column_schemas[column_name]
            output_schema += Schema([self.transformed_schema(column_schema)])
            if self.add_binary_cols:
                column_schema = column_schema.with_name(f"{column_name}_filled")
                output_schema += Schema([column_schema])
        return output_schema

[docs]    def output_column_names(self, col_selector: ColumnSelector) -> ColumnSelector:
        output_cols = col_selector.names[:]
        if self.add_binary_cols:
            output_cols.extend([f"{col}_filled" for col in col_selector.names])
        return ColumnSelector(output_cols)