Source code for sparse_operation_kit

#
# Copyright (c) 2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import sys
import tensorflow
from tensorflow.python.framework import ops
from tensorflow.python.framework import load_library

#   When installed with pip, the .so files should be in
# sparse_operation_kit/lib/
#   When installed manually via `make install`, the .so files
# should be in /usr/local/lib/
lib_path = os.path.join(os.path.dirname(__file__), "../lib")
lib_path = os.path.abspath(lib_path)
lib_path = [lib_path, "/usr/local/lib/"]
syspath = [spath + "/sparse_operation_kit/lib" for spath in sys.path]
lib_path.extend(syspath)

raw_ops = None
for path in lib_path:
    file = os.path.join(path, "libsparse_operation_kit.so")
    if os.path.exists(file):
        # The loading order of the core, embedding, and SOK libraries cannot
        # be changed, because they depend on each other:
        # libsparse_operation_kit.so -> libembedding.so -> libhugectr_core23.so
        load_library.load_op_library(os.path.join(path, "libhugectr_core23.so"))
        load_library.load_op_library(os.path.join(path, "libembedding.so"))
        raw_ops = load_library.load_op_library(file)
        print("[SOK INFO] Import %s" % file)
        break
if raw_ops is None:
    raise RuntimeError("[SOK INFO] libsparse_operation_kit.so was not found in any search path")
tf_version = [int(v) for v in tensorflow.__version__.split(".")]

from sparse_operation_kit._version import __version__

import sparse_operation_kit.communication
from sparse_operation_kit.communication import set_comm_tool


from sparse_operation_kit.distributed_variable import Variable
from sparse_operation_kit.distributed_variable import DistributedVariable
from sparse_operation_kit.distributed_variable import LocalizedVariable
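# Usage sketch (illustrative; mirrors the construction shown in the
# ``filter_variables`` example below — distributed variables shard rows across
# GPUs, while ``mode="localized:N"`` pins the whole table to GPU N):
#
#     v2 = sok.Variable([[3, 4, 5]])                      # distributed by row
#     v3 = sok.Variable([[6, 7, 8]], mode="localized:0")  # placed on GPU 0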


from sparse_operation_kit.dynamic_variable import DynamicVariable
from sparse_operation_kit.dynamic_variable import assign, export
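# Usage sketch (illustrative; the ``DynamicVariable`` construction is taken
# from the ``filter_variables`` example below, while the ``export``/``assign``
# calls are an assumed round-trip of a table's key/embedding pairs):
#
#     v = sok.DynamicVariable(dimension=3, var_type="hbm", initializer="13")
#     indices, values = sok.export(v)  # assumed: read out (key, embedding) pairs
#     sok.assign(v, indices, values)   # assumed: write them back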


from sparse_operation_kit.optimizer import OptimizerWrapper
from sparse_operation_kit.optimizer import SGD
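# Usage sketch (illustrative; as the ``filter_variables`` docstring below notes,
# SOK variables must be updated through ``sok.OptimizerWrapper`` — the wrapped
# optimizer chosen here is an assumption, not the only option):
#
#     optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=1.0)
#     sok_optimizer = sok.OptimizerWrapper(optimizer)
#     # apply SOK-variable gradients with sok_optimizer and ordinary
#     # tf.Variable gradients with a plain optimizer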


from sparse_operation_kit.lookup import lookup_sparse, sparse_read_and_evict
from sparse_operation_kit.lookup import all2all_dense_embedding
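# Usage sketch (illustrative; the argument layout — a list of embedding tables,
# a matching list of ragged key batches, and per-table combiners — is an
# assumption for demonstration):
#
#     embeddings = sok.lookup_sparse(
#         [v1, v2], [keys1, keys2], combiners=["sum", "mean"]
#     )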

from sparse_operation_kit.dump_load import dump, load, incremental_model_dump
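# Usage sketch (illustrative; the argument order — target directory, SOK
# variables, optionally the wrapped optimizer whose slot variables should be
# saved — is an assumption for demonstration):
#
#     sok.dump(path, sok_vars, sok_optimizer)
#     sok.load(path, sok_vars, sok_optimizer)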


# A specific code path for the optimizer change introduced in TensorFlow 2.11.0
def init(comm_tool="horovod", use_legacy_optimizer=True):
    """
    Abbreviated as ``sok.init``.

    This function initializes SparseOperationKit (SOK).

    SOK will leverage all available GPUs for the current CPU process. Please set
    `CUDA_VISIBLE_DEVICES` or `tf.config.set_visible_devices` to specify which
    GPU(s) this process uses before launching the TensorFlow runtime and
    calling this function.

    Currently, this API only supports ``horovod`` as the communication tool,
    so ``horovod.init`` must be called before initializing SOK.

    Parameters
    ----------
    comm_tool: string
        A string specifying which communication tool to use. Default value is
        "horovod".
    use_legacy_optimizer: bool
        Starting with TensorFlow 2.11.0, the default Keras optimizer is the
        experimental optimizer, which SOK will not support in the future. When
        this flag is True (the default), SOK redefines
        ``tensorflow.keras.optimizers`` to ``tensorflow.keras.optimizers.legacy``
        (``tf.keras.optimizers.optimizer_v2``). If you want to use the new
        optimizer elsewhere in your code and only use the legacy optimizer in
        SOK, set it to False.

    Returns
    -------
    None

    Example
    -------
    .. code-block:: python

        import tensorflow as tf
        import horovod.tensorflow as hvd
        import sparse_operation_kit as sok

        hvd.init()
        gpus = tf.config.experimental.list_physical_devices("GPU")
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")

        sok.init()
    """
    if use_legacy_optimizer:
        try:
            # Only redirect when the legacy namespace actually holds OptimizerV2.
            if tensorflow.keras.optimizers.legacy.Optimizer.__name__ == "OptimizerV2":
                tensorflow.keras.optimizers = tensorflow.keras.optimizers.legacy
                tensorflow.optimizers = tensorflow.optimizers.legacy
        except AttributeError:
            # Older TensorFlow versions have no ``optimizers.legacy`` namespace.
            pass
    set_comm_tool(comm_tool)
    print("[SOK INFO] Initialize finished, communication tool: " + comm_tool)

def filter_variables(vars):
    """
    When using dynamic variables, it is necessary to use sok.OptimizerWrapper
    to update these variables. This API therefore filters the SOK variables
    out of a list of TensorFlow variables.

    Parameters
    ----------
    vars: list
        A list of TensorFlow training variables.

    Returns
    -------
    sok_vars: list
        A list of SOK variables.
    other_vars: list
        A list of variables that do not belong to SOK.

    Example
    -------
    .. code-block:: python

        import tensorflow as tf
        import horovod.tensorflow as hvd
        import sparse_operation_kit as sok

        if __name__ == "__main__":
            gpus = tf.config.experimental.list_physical_devices("GPU")
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            hvd.init()
            sok.init()

            v1 = tf.Variable([[0, 1, 2]])
            v2 = sok.Variable([[3, 4, 5]])
            v3 = sok.Variable([[6, 7, 8]], mode="localized:0")
            v4 = sok.DynamicVariable(dimension=3, var_type="hbm", initializer="13")

            sok_vars, other_vars = sok.filter_variables([v1, v2, v3, v4])
            assert len(sok_vars) == 3
            assert len(other_vars) == 1
            print("[SOK INFO] filter_variables test passed")
    """
    sok_vars, other_vars = [], []
    for v in vars:
        if isinstance(v, (DynamicVariable, DistributedVariable, LocalizedVariable)):
            sok_vars.append(v)
        else:
            other_vars.append(v)
    return sok_vars, other_vars