#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from distutils.version import LooseVersion

import numpy as np

from nvtabular.dispatch import DataFrameType, _array, annotate

from ..tags import Tags
from .operator import ColumnSelector, Operator


class Bucketize(Operator):
    """This operation transforms continuous features into categorical features
    with bins based on the provided bin boundaries.

    Example usage::

        cont_names = ['cont1', 'cont2']
        boundaries = {
            'cont1': [-50, 0, 50],
            'cont2': [0, 25, 50, 75, 100]
        }
        bucketize_op = cont_names >> ops.Bucketize(boundaries)
        processor = nvt.Workflow(bucketize_op)

    Parameters
    ----------
    boundaries : list, tuple, dict, or callable
        Defines how to map each continuous value to a bin: a list or tuple of
        boundaries is applied to every column, a dict maps column names to
        their boundaries, and a callable receives a column name and returns
        its boundaries.
    """

    def __init__(self, boundaries):
        # Check whether cupy.digitize is available (requires cupy>=8.0.0)
        try:
            import cupy

            self.use_digitize = LooseVersion(cupy.__version__) >= "8.0.0"
        except ImportError:
            # Assume cpu-backed data (since cupy is not even installed)
            self.use_digitize = True

        # Transform `boundaries` into a lookup function on column names
        if isinstance(boundaries, (list, tuple)):
            self.boundaries = lambda col: boundaries
        elif isinstance(boundaries, dict):
            self.boundaries = lambda col: boundaries[col]
        elif callable(boundaries):
            self.boundaries = boundaries
        else:
            raise TypeError(
                "`boundaries` must be a list, tuple, dict, or callable, "
                "got type {}".format(type(boundaries))
            )
        super().__init__()

    def _get_dtypes(self):
        return np.int64
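
    # NOTE: The body of `transform` is missing from this excerpt; only the
    # `transform.__doc__` assignment below survives. The method here is a
    # hedged sketch inferred from the imports (annotate, ColumnSelector,
    # DataFrameType) and the `use_digitize` flag above -- not the verbatim
    # upstream implementation. The decorator arguments follow the convention
    # used by other NVTabular ops, and the original presumably builds the
    # boundary array with the imported `_array` helper.
    @annotate("Bucketize_op", color="darkgreen", domain="nvt_python")
    def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType:
        # Resolve the bin boundaries for each selected column.
        boundaries = {name: self.boundaries(name) for name in col_selector.names}
        new_df = type(df)()
        for col, b in boundaries.items():
            if self.use_digitize:
                # numpy-backed columns use np.digitize; cupy-backed columns
                # (cupy>=8.0.0) use cupy.digitize with the same semantics.
                values = df[col].values
                if isinstance(values, np.ndarray):
                    new_df[col] = np.digitize(values, np.asarray(b))
                else:
                    import cupy

                    new_df[col] = cupy.digitize(values, cupy.asarray(b))
            else:
                # Fallback for older cupy without digitize: a value's bin index
                # equals the number of boundaries it is greater than or equal to.
                val = 0
                for boundary in b:
                    val += (df[col] >= boundary).astype("int64")
                new_df[col] = val
            # Enforce the integer output dtype declared by _get_dtypes().
            new_df[col] = new_df[col].astype(self._get_dtypes())
        return new_df
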
    transform.__doc__ = Operator.transform.__doc__
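

# Minimal usage sketch (illustrative, not part of the upstream module): run
# Bucketize end-to-end on a small pandas DataFrame through an NVTabular
# Workflow. The column name and boundary values below are made up for the demo.
if __name__ == "__main__":
    import pandas as pd

    import nvtabular as nvt
    from nvtabular import ops

    df = pd.DataFrame({"cont1": [-75.0, -10.0, 20.0, 60.0]})
    bucketize_op = ["cont1"] >> ops.Bucketize({"cont1": [-50, 0, 50]})
    workflow = nvt.Workflow(bucketize_op)

    # Bucketize keeps no fitted statistics, so fit_transform simply applies it.
    out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    print(out)  # continuous values replaced by integer bin indices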