#
# Copyright (c) 2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import sys
import tensorflow
from tensorflow.python.framework import ops
from tensorflow.python.framework import load_library
# When installed with pip, the .so files should be in
# sparse_operation_kit/lib/
# When installed manually via `make install`, the .so files
# should be in /usr/local/lib/
lib_path = os.path.join(os.path.dirname(__file__), "../lib")
lib_path = os.path.abspath(lib_path)
lib_path = [lib_path, "/usr/local/lib/"]
syspath = [spath + "/sparse_operation_kit/lib" for spath in sys.path]
lib_path.extend(syspath)
raw_ops = None
for path in lib_path:
    file = os.path.join(path, "libsparse_operation_kit.so")
    if os.path.exists(file):
        # The order of loading core, embedding, sok cannot
        # be changed, because there is a dependency between them:
        # libsok.so -> libembedding.so -> libcore.so
        load_library.load_op_library(os.path.join(path, "libhugectr_core23.so"))
        load_library.load_op_library(os.path.join(path, "libembedding.so"))
        raw_ops = load_library.load_op_library(file)
        print("[SOK INFO] Import %s" % file)
        break
if raw_ops is None:
    raise Exception("[SOK INFO] libsparse_operation_kit.so is not found")
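# A minimal sketch of how to make the shared library discoverable when it is
# installed under a non-standard prefix. The "/opt/sok" path below is
# hypothetical; as the search above shows, every entry on sys.path is probed
# with the "/sparse_operation_kit/lib" suffix appended:
#
#     import sys
#     sys.path.append("/opt/sok")
#     # import now finds /opt/sok/sparse_operation_kit/lib/libsparse_operation_kit.so
#     import sparse_operation_kit as sok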
# Parse "major.minor.patch"; strip any pre-release suffix (e.g. "2.12.0-rc1")
# so that int() does not fail on RC or nightly builds.
tf_version = [int(v.split("-")[0]) for v in tensorflow.__version__.split(".")[:3]]
from sparse_operation_kit._version import __version__
import sparse_operation_kit.communication
from sparse_operation_kit.communication import set_comm_tool
from sparse_operation_kit.distributed_variable import Variable
from sparse_operation_kit.distributed_variable import DistributedVariable
from sparse_operation_kit.distributed_variable import LocalizedVariable
from sparse_operation_kit.dynamic_variable import DynamicVariable
from sparse_operation_kit.dynamic_variable import assign, export
from sparse_operation_kit.optimizer import OptimizerWrapper
from sparse_operation_kit.optimizer import SGD
from sparse_operation_kit.lookup import lookup_sparse, sparse_read_and_evict
from sparse_operation_kit.lookup import all2all_dense_embedding
from sparse_operation_kit.dump_load import dump, load, incremental_model_dump
# A specific code path for TensorFlow 2.11.0 and later.
def init(comm_tool="horovod", use_legacy_optimizer=True):
"""
Abbreviated as ``sok.init``.
This function is used to do the initialization of SparseOperationKit (SOK).
SOK will leverage all available GPUs for current CPU process. Please set
`CUDA_VISIBLE_DEVICES` or `tf.config.set_visible_devices` to specify which
GPU(s) are used in this process before launching tensorflow runtime
and calling this function.
Currently, these API only support ``horovod`` as the communication
tool, so ``horovod.init`` must be called before initializing SOK.
Parameters
----------
comm_tool: string
a string to specify which communication tool to use. Default value is "horovod".
use_legacy_optimizer: bool
From tensorflow 2.11.0 , keras default optimizer is optimizer experimental. SOK won't support it in future, so if you switch use_legacy_optimizer to True,
SOK will redefine tensorflow.keras.optimizers to tensorflow.keras.optimizers.legacy(tf.keras.optimizers.optimizer_v2).
Default value is True, if you want to use new optimizer in the other part in your code , and only use legacy optimizer in SOK, please set to False
Returns
-------
None
Example
-------
.. code-block:: python
import tensorflow as tf
import horovod.tensorflow as hvd
import sparse_operation_kit as sok
hvd.init()
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
sok.init()
"""
    if use_legacy_optimizer:
        try:
            if tensorflow.keras.optimizers.legacy.Optimizer.__name__ == "OptimizerV2":
                tensorflow.keras.optimizers = tensorflow.keras.optimizers.legacy
                tensorflow.optimizers = tensorflow.optimizers.legacy
        except AttributeError:
            # Older TensorFlow versions have no ``optimizers.legacy`` namespace.
            pass
    set_comm_tool(comm_tool)
    print("[SOK INFO] Initialize finished, communication tool: " + comm_tool)
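# Illustrative note (not executed here): after ``sok.init(use_legacy_optimizer=True)``
# on TF >= 2.11, the reassignment above makes the default Keras optimizer names
# resolve to the legacy OptimizerV2 implementations, e.g.:
#
#     sok.init()
#     opt = tf.keras.optimizers.SGD(0.1)  # now tf.keras.optimizers.legacy.SGD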
def filter_variables(vars):
"""
When using dynamic variables, it is necessary to use sok.OptimizerWrapper to update these variables. Therefore, this API can filter out SOK variables from all TensorFlow variables.
Parameters
----------
vars: A list of TensorFlow training variables.
Returns
-------
sok_vars:A list of SOK variables.
other_vars:A list of variables that don't belongs to SOK.
Example
-------
.. code-block:: python
import numpy as np
import tensorflow as tf
import horovod.tensorflow as hvd
import sparse_operation_kit as sok
if __name__ == "__main__":
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
hvd.init()
sok.init()
v1 = tf.Variable([[0, 1, 2]])
v2 = sok.Variable([[3, 4, 5]])
v3 = sok.Variable([[6, 7, 8]], mode="localized:0")
v4 = sok.DynamicVariable(dimension=3, var_type="hbm", initializer="13")
sok_vars, other_vars = sok.filter_variables([v1, v2, v3, v4])
assert len(sok_vars) == 3
assert len(other_vars) == 1
print("[SOK INFO] filter_variables test passed")
"""
    sok_vars, other_vars = [], []
    for v in vars:
        if isinstance(v, (DynamicVariable, DistributedVariable, LocalizedVariable)):
            sok_vars.append(v)
        else:
            other_vars.append(v)
    return sok_vars, other_vars
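# A minimal end-to-end sketch (illustrative only, not executed by this module)
# of how ``filter_variables`` pairs with ``OptimizerWrapper`` in a training
# step. The variable shapes and learning rate are hypothetical, and the
# ``all2all_dense_embedding`` / ``apply_gradients`` signatures shown should be
# verified against the SOK documentation for your version:
#
#     import tensorflow as tf
#     import horovod.tensorflow as hvd
#     import sparse_operation_kit as sok
#
#     hvd.init()
#     sok.init()
#
#     emb = sok.DynamicVariable(dimension=16, var_type="hbm", initializer="13")
#     dense = tf.Variable(tf.random.normal([16, 1]))
#
#     sok_vars, other_vars = sok.filter_variables([emb, dense])
#     sok_optimizer = sok.OptimizerWrapper(tf.keras.optimizers.legacy.SGD(0.05))
#     tf_optimizer = tf.keras.optimizers.legacy.SGD(0.05)
#
#     with tf.GradientTape() as tape:
#         rows = sok.all2all_dense_embedding(emb, tf.constant([0, 1, 2], tf.int64))
#         loss = tf.reduce_sum(tf.matmul(rows, dense))
#     grads = tape.gradient(loss, sok_vars + other_vars)
#     sok_optimizer.apply_gradients(zip(grads[: len(sok_vars)], sok_vars))
#     tf_optimizer.apply_gradients(zip(grads[len(sok_vars) :], other_vars))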