Source code for nvtabular.ops.reduce_dtype_size

#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import dask.dataframe as dd
import numpy as np

from merlin.core.dispatch import DataFrameType, annotate
from merlin.schema import Schema

from .operator import ColumnSelector, Operator
from .stat_operator import StatOperator

_INT_DTYPES = [np.int8, np.int16, np.int32, np.int64]


[docs]class ReduceDtypeSize(StatOperator):
    """
    ReduceDtypeSize changes the dtypes of numeric columns. For integer columns
    this will choose a dtype such that the minimum and maximum values in the
    column will fit. For float columns this will cast to a float32.
    """

[docs]    def __init__(self, float_dtype=np.float32):
        super().__init__()
        self.float_dtype = float_dtype
        self.ranges = {}
        self.dtypes = {}

[docs]    @annotate("reduce_dtype_size_fit", color="green", domain="nvt_python")
    def fit(self, col_selector: ColumnSelector, ddf: dd.DataFrame):
        return {col: (ddf[col].min(), ddf[col].max()) for col in col_selector.names}

[docs]    def fit_finalize(self, dask_stats):
        self.ranges = dask_stats

[docs]    def clear(self):
        self.dtypes = {}
        self.ranges = {}

[docs]    @annotate("reduce_dtype_size_transform", color="darkgreen", domain="nvt_python")
    def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType:
        for col, dtype in self.dtypes.items():
            df[col] = df[col].astype(dtype)
        return df

[docs]    def compute_output_schema(self, input_schema, selector, prev_output_schema=None):
        if not self.ranges:
            return input_schema

        output_columns = []
        for column, (min_value, max_value) in self.ranges.items():
            column = input_schema[column]

            dtype = column.dtype
            if np.issubdtype(column.dtype, np.integer):
                for possible_dtype in _INT_DTYPES:
                    dtype_range = np.iinfo(possible_dtype)
                    if min_value >= dtype_range.min and max_value <= dtype_range.max:
                        dtype = possible_dtype
                        break

            elif np.issubdtype(column.dtype, np.float):
                dtype = self.float_dtype

            output_columns.append(column.with_dtype(dtype))

        self.dtypes = {column.name: column.dtype for column in output_columns}
        return Schema(output_columns)

    transform.__doc__ = Operator.transform.__doc__
    compute_output_schema.__doc__ = Operator.compute_output_schema.__doc__
    fit.__doc__ = StatOperator.fit.__doc__
    fit_finalize.__doc__ = StatOperator.fit_finalize.__doc__
    clear.__doc__ = StatOperator.clear.__doc__