Source code for transformers4rec.torch.utils.examples_utils

import gc
import glob
import os

import numpy as np
import torch

def list_files(startpath):
    """
    Util function to print the nested structure of a directory

    Each directory is printed as ``<name>/`` and each file as ``<name>``,
    indented by four spaces per level below `startpath`.

    Parameters
    ----------
    startpath : str
        Root directory to walk. A trailing path separator is accepted.
    """
    for root, _dirs, files in os.walk(startpath):
        # Derive the depth from the path relative to `startpath` rather than
        # by string replacement: replacement miscounts when `startpath` ends
        # with a separator or when its text occurs again deeper in `root`.
        rel_root = os.path.relpath(root, startpath)
        level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
        indent = " " * 4 * level
        # normpath drops a trailing separator so the top directory keeps its name.
        print("{}{}/".format(indent, os.path.basename(os.path.normpath(root))))
        subindent = " " * 4 * (level + 1)
        for f in files:
            print("{}{}".format(subindent, f))
def visualize_response(batch, response, top_k, session_col="session_id"):
    """
    Print, for every session in the batch, the ids of its `top_k` highest-scored items.

    Parameters
    ----------
    batch : cudf.DataFrame
        the batch of raw data sent to triton server.
    response: tritonclient.grpc.InferResult
        the response returned by grpc client; its ``"output"`` array is read.
    top_k: int
        the `top_k` top items to retrieve from predictions.
    session_col: str
        name of the column of `batch` holding the session ids,
        by default "session_id".
    """
    session_ids = batch[session_col].drop_duplicates().values
    scores = response.as_numpy("output")

    # argpartition places the indices of the k largest scores in the last k
    # columns; within those columns the order is not sorted by score.
    partitioned = np.argpartition(scores, -top_k, axis=1)
    top_item_ids = partitioned[:, -top_k:]

    # Rows of the output are paired positionally with the de-duplicated sessions.
    for session_id, item_ids in zip(session_ids, top_item_ids):
        joined_ids = " || ".join(map(str, item_ids))
        print("- Top-%s predictions for session `%s`: %s\n" % (top_k, session_id, joined_ids))
def fit_and_evaluate(trainer, start_time_index, end_time_index, input_dir):
    """
    Util function for time-window based fine-tuning using the T4rec Trainer class.
    For each time index, train on that index's data, then evaluate on the
    validation data of the index that follows it.

    Parameters
    ----------
    trainer:
        The trainer to drive. Its `train_dataset_or_path` and
        `eval_dataset_or_path` attributes are overwritten, and its
        `reset_lr_scheduler()`, `train()` and `evaluate()` methods are called.
    start_time_index: int
        The start index for training, it should match the partitions of the data directory
    end_time_index: int
        The end index for training (inclusive), it should match the partitions
        of the data directory
    input_dir: str
        The input directory where the parquet files were saved based on partition column

    Returns
    -------
    indexed_by_time_metrics: dict
        The dictionary of ranking metrics: each item is the list of scores over time indices.
        Keys have the form ``"indexed_by_time_<metric>"`` where ``_at_`` in the
        metric name is replaced by ``@``.
    """
    metrics_over_time = {}

    for train_index in range(start_time_index, end_time_index + 1):
        # 1. Resolve the parquet files: train on this index, evaluate on the next
        eval_index = train_index + 1
        train_files = glob.glob(os.path.join(input_dir, f"{train_index}/train.parquet"))
        eval_files = glob.glob(os.path.join(input_dir, f"{eval_index}/valid.parquet"))

        # 2. Train on the current index
        print("\n***** Launch training for day %s: *****" % train_index)
        trainer.train_dataset_or_path = train_files
        trainer.reset_lr_scheduler()
        trainer.train()

        # 3. Evaluate on the following index
        trainer.eval_dataset_or_path = eval_files
        eval_metrics = trainer.evaluate(metric_key_prefix="eval")
        print("\n***** Evaluation results for day %s:*****\n" % eval_index)

        for key in sorted(eval_metrics.keys()):
            # Only metrics whose key contains "at_" are reported and recorded.
            if "at_" not in key:
                continue
            metric_name = key.replace("_at_", "@")
            score = eval_metrics[key]
            print(" %s = %s" % (metric_name, str(score)))
            metrics_over_time.setdefault("indexed_by_time_" + metric_name, []).append(score)

        # free GPU for next day training
        wipe_memory()

    return metrics_over_time
def wipe_memory():
    """
    Run Python garbage collection, then empty the PyTorch CUDA memory cache.
    """
    # Collect first, so that objects which are no longer reachable are
    # destroyed before the cache is emptied.
    gc.collect()
    # Empty the cache held by PyTorch's CUDA allocator.
    torch.cuda.empty_cache()