# Source code for nvtabular.ops.column_similarity

#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numba
import pandas as pd
import scipy.sparse

from merlin.core.compat import cuda, cupy, numpy
from merlin.core.dispatch import DataFrameType, annotate
from merlin.schema import Schema, Tags
from nvtabular.ops.operator import ColumnSelector, Operator

if cupy:
    from cupyx.scipy.sparse import coo_matrix
else:
    from scipy.sparse import coo_matrix


class ColumnSimilarity(Operator):
    """Calculates the similarity between two columns using tf-idf, cosine or
    inner product as the distance metric. For each row, this calculates the distance
    between the two columns by looking up features for those columns in a sparse matrix,
    and then computing the distance between the rows of the feature matrices.

    Example usage::

        # Read in the 'document_categories' file from the kaggle outbrains dataset and convert
        # to a sparse matrix
        df = cudf.read_csv("document_categories.csv.zip")
        categories = cupyx.scipy.sparse.coo_matrix(
            (cupy.ones(len(df)), (df.document_id.values, df.category_id.values))
        )
        # compute a new column 'document_id_document_id_promo_sim' between the document_id and
        # document_id_promo columns on tfidf distance on the categories matrix we just loaded up
        sim_features = [["document_id", "document_id_promo"]] >> ColumnSimilarity(categories,
                                                                metric='tfidf', on_device=False)
        workflow = nvt.Workflow(sim_features)

    Parameters
    -----------
    left_features : csr_matrix
        Sparse feature matrix for the left column
    right_features : csr_matrix, optional
        Sparse feature matrix for the right column in each pair. If not given will use the
        same feature matrix as for the left (for example when calculating document-document
        distances)
    metric : str
        Distance metric to use: "tfidf", "cosine" or "inner". Any other value
        raises a ``ValueError`` when the feature matrices are first converted.
    on_device : bool
        Whether to compute on the GPU or CPU. Computing on the GPU will be
        faster, but requires that the left_features/right_features sparse matrices
        fit into GPU memory.
    """

    def __init__(self, left_features, right_features=None, metric="tfidf", on_device=True):
        super().__init__()

        self.metric = metric
        self.left_features = left_features
        self.right_features = right_features
        self.on_device = on_device
        # The feature matrices are converted lazily, on the first transform call
        self._initialized = False

    def _initialize_features(self):
        """Convert the stored feature matrices to normalized CSR form, once."""
        if not self._initialized:
            self.left_features = _convert_features(self.left_features, self.metric, self.on_device)
            self.right_features = (
                _convert_features(self.right_features, self.metric, self.on_device)
                if self.right_features is not None
                # no right matrix given: compare against a copy of the left one
                else self.left_features.copy()
            )
            self._initialized = True

    @annotate("ColumnSimilarity_op", color="darkgreen", domain="nvt_python")
    def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType:
        # use_values selects `.values` over `.values_host` when reading the columns
        use_values = self.on_device
        if isinstance(df, pd.DataFrame):
            # Disallow on-device computation for cpu-backed data.
            # NOTE: this permanently switches the operator to CPU computation.
            self.on_device = False
            use_values = True

        # Check if features are initialized
        self._initialize_features()

        names = self.output_column_names(col_selector).names
        for name, (left, right) in zip(names, col_selector.grouped_names):
            a = df[left].values if use_values else df[left].values_host
            b = df[right].values if use_values else df[right].values_host

            if len(a) and len(b):
                similarities = row_wise_inner_product(
                    a, self.left_features, b, self.right_features, self.on_device
                )
            else:
                # nothing to look up for an empty partition
                similarities = []
            df[name] = similarities

        return df

    # transform has no docstring of its own; reuse the base class documentation
    transform.__doc__ = Operator.transform.__doc__

    def compute_selector(
        self,
        input_schema: Schema,
        selector: ColumnSelector,
        parents_selector: ColumnSelector,
        dependencies_selector: ColumnSelector,
    ) -> ColumnSelector:
        """Validate the parent columns against the input schema and select them."""
        self._validate_matching_cols(input_schema, parents_selector, "computing input selector")
        return parents_selector

    def column_mapping(self, col_selector):
        """Map each output column ``"{a}_{b}_sim"`` to its input pair ``[a, b]``."""
        column_mapping = {}
        for a, b in col_selector.grouped_names:
            column_mapping[f"{a}_{b}_sim"] = [a, b]
        return column_mapping

    @property
    def output_tags(self):
        """Tags attached to the similarity columns."""
        return [Tags.CONTINUOUS]

    @property
    def output_dtype(self):
        """Dtype of the similarity columns."""
        # `numpy.float` was only an alias of the builtin float and was removed
        # in NumPy 1.24; float64 is the dtype it resolved to.
        return numpy.float64


def row_wise_inner_product(a, a_features, b, b_features, on_device=True):
    """Compute a per-row similarity as the inner product of two sparse rows.

    For every position ``i`` the row ``a[i]`` of ``a_features`` is multiplied
    with the row ``b[i]`` of ``b_features``. Both feature matrices are
    required to be in canonical CSR format.

    Parameters
    -----------
    a : array of int
        Array of rowids to use in looking up a_features
    a_features: CSR matrix
        Sparse feature matrix
    b : array of int
        Array of rowids to use in looking up in b_features
    b_features: CSR matrix
        Sparse feature matrix
    on_device: bool
        Whether to compute on the GPU or CPU. Computing on the GPU will be
        faster, but requires that the a_features/b_features sparse matrices
        fit into GPU memory.

    Returns
    -------
    array
        One similarity per entry of ``a``, with the dtype of ``a_features.data``.
    """
    # The kernels are JIT compiled with numba, which doesn't handle sparse
    # matrix types: pick the gpu/cpu kernel here and pass it the underlying
    # indptr/indices/data arrays of each matrix.
    if on_device:
        threads = 32
        # round up so that every element of `a` is covered by a thread
        blocks = (a.size + threads - 1) // threads
        output = cupy.zeros(len(a), dtype=a_features.data.dtype)
        launch = _row_wise_inner_product_gpu[blocks, threads]
    else:
        output = numpy.zeros(len(a), dtype=a_features.data.dtype)
        launch = _row_wise_inner_product_cpu

    launch(
        a,
        a_features.indptr,
        a_features.indices,
        a_features.data,
        b,
        b_features.indptr,
        b_features.indices,
        b_features.data,
        output,
    )
    return output


@numba.njit(parallel=True)
def _row_wise_inner_product_cpu(
    a, a_indptr, a_indices, a_data, b, b_indptr, b_indices, b_data, output
):
    """Fill ``output[i]`` with the inner product of rows ``a[i]`` and ``b[i]``.

    The rows are looked up in the CSR arrays of the two feature matrices and
    the loop over the rows runs under ``numba.prange``.
    """
    # https://github.com/PyCQA/pylint/issues/2910
    # pylint: disable=not-an-iterable
    n_rows = len(a)
    for row in numba.prange(n_rows):
        left_id = a[row]
        right_id = b[row]
        output[row] = _inner_product_cpu(
            left_id, a_indptr, a_indices, a_data, right_id, b_indptr, b_indices, b_data
        )


if cuda:

    @numba.cuda.jit
    def _row_wise_inner_product_gpu(
        a, a_indptr, a_indices, a_data, b, b_indptr, b_indices, b_data, output
    ):
        """CUDA kernel: each thread writes the inner product for one entry of ``a``."""
        tid = numba.cuda.grid(1)
        # the launch grid is rounded up to whole blocks, so surplus threads exit here
        if tid >= a.size:
            return
        output[tid] = _inner_product_gpu(
            a[tid], a_indptr, a_indices, a_data, b[tid], b_indptr, b_indices, b_data
        )


def _inner_product(a, a_indptr, a_indices, a_data, b, b_indptr, b_indices, b_data):
    """Return the inner product of CSR row ``a`` of one matrix and row ``b`` of another.

    Each matrix is given by its raw ``indptr``/``indices``/``data`` arrays.
    The two rows are walked in lockstep, which relies on the column indices
    within each row being sorted in ascending order.
    """
    # adapted from scipy:
    # https://github.com/scipy/scipy/blob/312b706c1d98980ed140adae943d41f9f7dc08f5/scipy/sparse/sparsetools/csr.h#L780-L854
    left = a_indptr[a]
    left_stop = a_indptr[a + 1]
    right = b_indptr[b]
    right_stop = b_indptr[b + 1]
    total = 0.0

    while left < left_stop and right < right_stop:
        left_col = a_indices[left]
        right_col = b_indices[right]
        if left_col < right_col:
            # column only present on the left: skip it
            left += 1
        elif left_col > right_col:
            # column only present on the right: skip it
            right += 1
        else:
            # shared column: accumulate the product and advance both cursors
            total += a_data[left] * b_data[right]
            left += 1
            right += 1

    return total


# Compile _inner_product with numba twice, so the same source can be called
# from both the CPU and the GPU row-wise kernels.
_inner_product_cpu = numba.njit(inline="always")(_inner_product)
if cuda:
    _inner_product_gpu = numba.cuda.jit(device=True, inline=True)(_inner_product)
else:
    # no CUDA support available: there is no device function to compile
    _inner_product_gpu = None


def _convert_features(features, metric, on_device):
    """Turn a feature matrix into the normalized CSR matrix used for row lookups.

    Parameters
    ----------
    features : sparse matrix
        Feature matrix; it is converted to COO format first if needed.
    metric : str
        "tfidf", "cosine" or "inner".
    on_device : bool
        When true the matrix is built with the module-level ``coo_matrix`` and
        normalized with ``cupy``; otherwise scipy and numpy are used.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    if on_device:
        # take a shallow copy to avoid mutating the input, but keep gpu
        # memory as low as possible. (also convert to coo_matrix if passed
        # a CSR etc)
        features = coo_matrix(features)
    elif not isinstance(features, scipy.sparse.coo_matrix):
        # bring the sparse matrix to the host first if it lives on the device
        if features.__class__.__module__.startswith("cupy"):
            features = features.get()
        # then make sure we end up with a coo matrix
        if not isinstance(features, scipy.sparse.coo_matrix):
            features = scipy.sparse.coo_matrix(features)

    # Scale the matrix so that the chosen metric reduces to a plain inner product
    xp = cupy if on_device else numpy
    if metric == "tfidf":
        weighted = _tfidf_weight(features.copy(), xp)
        features = _normalize(weighted, xp)
    elif metric == "cosine":
        features = _normalize(features.copy(), xp)
    elif metric != "inner":
        raise ValueError(f"unknown distance metric {metric}")

    # the row lookup needs the features in CSR format
    csr_features = features.tocsr()
    return csr_features


def _tfidf_weight(X, np):
    """Scale the values of COO matrix ``X`` by an inverse document frequency.

    The weight of a column is ``log(N / count)``, where ``N`` is the number of
    rows of ``X`` and ``count`` is the number of stored entries in that column.
    ``np`` is the array module (numpy or cupy) matching the arrays of ``X``.
    ``X.data`` is reassigned and the same ``X`` object is returned.
    """
    n_rows = float(X.shape[0])
    column_counts = np.bincount(X.col)
    idf = np.log(n_rows / column_counts)
    # look up the weight of each stored entry through its column index
    X.data = X.data * idf[X.col]
    return X


def _normalize(X, np):
    """Scale every row of COO matrix ``X`` to unit L2 norm.

    ``np`` is the array module (numpy or cupy) matching the arrays of ``X``.
    ``X.data`` is reassigned and the same ``X`` object is returned. Rows
    whose stored values are all zero are left as zeros.
    """
    # per-row sum of squares, accumulated through the row index of each entry
    norms = np.sqrt(np.bincount(X.row, X.data**2))
    # A row can have norm 0, e.g. after tf-idf weighting when its only feature
    # occurs in every row (idf == 0). Dividing by 0 would produce NaN values,
    # so divide such rows by 1 and keep them as zeros.
    norms[norms == 0] = 1.0
    X.data = X.data / norms[X.row]
    return X