Source code for nvtabular.workflow.node

#
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import collections.abc
import warnings

from nvtabular.columns import ColumnSelector, Schema
from nvtabular.ops import LambdaOp, Operator, internal
from nvtabular.ops.internal.concat_columns import ConcatColumns
from nvtabular.ops.internal.subset_columns import SubsetColumns


class WorkflowNode:
    """A WorkflowNode is a group of columns that you want to apply the same transformations to.
    WorkflowNode's can be transformed by shifting operators on to them, which returns a new
    WorkflowNode with the transformations applied. This lets you define a graph of operations
    that makes up your workflow

    Parameters
    ----------
    selector: ColumnSelector
        Defines which columns to select from the input Dataset using column names and tags.
        A plain list is still accepted (with a ``FutureWarning``) and is converted to a
        ``ColumnSelector``.

    Raises
    ------
    TypeError
        If a truthy ``selector`` is neither a list nor a ``ColumnSelector``.
    """

    def __init__(self, selector=None):
        # Upstream nodes whose outputs feed this node
        self.parents = []
        # Downstream nodes that consume this node's output
        self.children = []
        # Extra operands: ColumnSelectors, WorkflowNodes, or (from `+` with a list)
        # lists of those
        self.dependencies = []

        # The operator applied by this node (None for plain selection nodes)
        self.op = None
        # Both schemas stay None until `compute_schemas` is called
        self.input_schema = None
        self.output_schema = None

        if isinstance(selector, list):
            warnings.warn(
                'The `["a", "b", "c"] >> ops.Operator` syntax for creating a `ColumnGroup` '
                "has been deprecated in NVTabular 21.09 and will be removed in a future version.",
                FutureWarning,
            )
            selector = ColumnSelector(selector)

        if selector and not isinstance(selector, ColumnSelector):
            raise TypeError("The selector argument must be a list or a ColumnSelector")

        self._selector = selector

    @property
    def selector(self):
        """The ColumnSelector attached to this node, or None if it has none."""
        return self._selector

    @selector.setter
    def selector(self, sel):
        # Lists are silently promoted to a ColumnSelector; anything else is stored as given
        if isinstance(sel, list):
            sel = ColumnSelector(sel)
        self._selector = sel

    def compute_schemas(self, root_schema):
        """Computes ``input_schema`` and ``output_schema`` for this node

        Parameters
        -----------
        root_schema: Schema
            Schema that is combined with the output schemas of the upstream nodes
            before this node's selector is applied to it.
        """
        # NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
        # listing; confirm against the upstream file that the two schema statements of
        # the ConcatColumns branch sit outside the `if upstream_selector.names:` test.

        # If parent is an addition node, we may need to propagate grouping
        # unless we're a node that already has a selector
        if not self.selector:
            if (
                len(self.parents) == 1
                and isinstance(self.parents[0].op, internal.ConcatColumns)
                and self.parents[0].selector
                and (self.parents[0].selector.names)
            ):
                self.selector = self.parents[0].selector

        # If we have a selector, apply it to upstream schemas from nodes/dataset
        if self.selector:
            upstream_schema = root_schema + _combine_schemas(self.parents_with_dep_nodes)
            self.input_schema = upstream_schema.apply(self.selector)
        else:
            # If we don't have a selector but we're an addition node,
            if isinstance(self.op, ConcatColumns):
                upstream_selector = _combine_selectors(self.parents)
                upstream_selector += _combine_selectors(self.dependencies)
                if upstream_selector.names:
                    self.selector = upstream_selector

                # For addition nodes, some of the operands are parents and
                # others are dependencies so grab schemas from both
                upstream_schema = root_schema + _combine_schemas(self.parents_with_dep_nodes)
                self.input_schema = upstream_schema.apply(self.selector)

            # If we're a subtraction node, we have to do some gymnastics to compute
            # the schema, because operands may be in the parents or the dependencies
            # or both
            elif isinstance(self.op, SubsetColumns):
                operands = self.parents + self.dependencies
                left_operand = operands.pop(0)
                left_operand_schema = _combine_schemas([left_operand])
                operands_schema = _combine_schemas(operands)
                self.input_schema = left_operand_schema - operands_schema

            # If none of the above apply, then we don't have a selector
            # and we're not an add or sub node, so our input is just the
            # parents output
            else:
                self.input_schema = _combine_schemas(self.parents)

        # Then we delegate to the op (if there is one) to compute this node's
        # output schema. If there's no op, then outputs are just the inputs
        if self.op:
            self.output_schema = self.op.compute_output_schema(self.input_schema, self.selector)
        else:
            self.output_schema = self.input_schema

    def __rshift__(self, operator):
        """Transforms this WorkflowNode by applying an Operator

        Parameters
        -----------
        operator: Operator or callable
            An Operator instance, an Operator subclass (instantiated with no
            arguments), or a callable (wrapped in a LambdaOp).

        Returns
        -------
        WorkflowNode

        Raises
        ------
        ValueError
            If ``operator`` is not an Operator after the conversions above.
        """
        if isinstance(operator, type) and issubclass(operator, Operator):
            # handle case where an operator class is passed
            operator = operator()
        elif callable(operator):
            # implicit lambdaop conversion.
            operator = LambdaOp(operator)

        if not isinstance(operator, Operator):
            raise ValueError(f"Expected operator or callable, got {operator.__class__}")

        child = WorkflowNode()
        child.parents = [self]
        self.children.append(child)
        child.op = operator

        dependencies = operator.dependencies()

        if dependencies:
            # A bare string is itself a Sequence, so it has to be wrapped explicitly;
            # otherwise a single column name would be iterated character by character.
            if isinstance(dependencies, str) or not isinstance(
                dependencies, collections.abc.Sequence
            ):
                dependencies = [dependencies]

            for dependency in dependencies:
                if isinstance(dependency, WorkflowNode):
                    # Node dependencies are linked both as parents and as dependencies
                    dependency.children.append(child)
                    child.parents.append(dependency)
                elif not isinstance(dependency, ColumnSelector):
                    dependency = ColumnSelector(dependency)
                child.dependencies.append(dependency)

        return child

    def __add__(self, other):
        """Adds columns from this WorkflowNode with another to return a new WorkflowNode

        If this node is already a ``+`` node it is extended in place and returned,
        instead of creating a new child.

        Parameters
        -----------
        other: WorkflowNode or str or list of str

        Returns
        -------
        WorkflowNode
        """
        if isinstance(self.op, internal.ConcatColumns):
            child = self
        else:
            # Create a child node
            child = WorkflowNode()
            child.op = internal.ConcatColumns(label="+")

            # Add self as a parent
            self.children.append(child)
            child.parents.append(self)

        # The right operand becomes a dependency
        if isinstance(other, list):
            other = _strs_to_selectors(other)
        elif not isinstance(other, (ColumnSelector, WorkflowNode)):
            other = ColumnSelector(other)

        # If the other node is a `+` node, we want to collapse it into this `+` node to
        # avoid creating a cascade of repeated `+`s that we'd need to optimize out by
        # re-combining them later in order to clean up the graph
        if isinstance(other, WorkflowNode) and isinstance(other.op, internal.ConcatColumns):
            child.dependencies += other.parents + other.dependencies
        else:
            child.dependencies.append(other)

        return child

    # handle the "column_name" + WorkflowNode case
    __radd__ = __add__

    def __sub__(self, other):
        """Removes columns from this WorkflowNode with another to return a new WorkflowNode

        If this node is already a column-subset node it is extended in place and
        returned, instead of creating a new child.

        Parameters
        -----------
        other: WorkflowNode or str or list of str
            Columns to remove

        Returns
        -------
        WorkflowNode
        """
        if isinstance(self.op, internal.SubsetColumns):
            child = self
        else:
            # Create a child node
            child = WorkflowNode()
            child.op = internal.SubsetColumns(label="-")

            # Add self as a parent
            self.children.append(child)
            child.parents.append(self)

        # The right operand becomes a dependency
        if not isinstance(other, (ColumnSelector, WorkflowNode)):
            other = ColumnSelector(other)

        child.dependencies.append(other)

        return child

    def __rsub__(self, other):
        """Handles ``other - WorkflowNode`` by building a new subtraction node

        The new node has no parents; ``other`` (converted to a ColumnSelector when
        needed) and then this node are recorded as its dependencies, in that order.

        Parameters
        -----------
        other: WorkflowNode or ColumnSelector or str or list of str
            The left operand of the subtraction

        Returns
        -------
        WorkflowNode
        """
        # Create a child node
        child = WorkflowNode()
        child.op = internal.SubsetColumns(label="-")

        # The left operand becomes a dependency
        if not isinstance(other, (ColumnSelector, WorkflowNode)):
            other = ColumnSelector(other)

        # Add self as a dependency
        child.dependencies.append(other)
        child.dependencies.append(self)

        return child

    def __getitem__(self, columns):
        """Selects certain columns from this WorkflowNode, and returns a new WorkflowNode
        with only those columns

        Parameters
        -----------
        columns: str or list of str
            Columns to select

        Returns
        -------
        WorkflowNode
        """
        col_selector = ColumnSelector(columns)
        child = WorkflowNode(col_selector)
        child.parents = [self]
        self.children.append(child)
        # list("abc") would split a single column name into characters, so a lone
        # string is wrapped instead of being expanded
        label_columns = [columns] if isinstance(columns, str) else list(columns)
        child.op = internal.SubsetColumns(label=str(label_columns))
        return child

    def __repr__(self):
        # Nodes without children are tagged as graph outputs
        output = " output" if not self.children else ""
        return f"<WorkflowNode {self.label}{output}>"

    @property
    def parents_with_dep_nodes(self):
        """Parents followed by every WorkflowNode found among the dependencies."""
        return self.parents + self.dependency_nodes

    @property
    def input_columns(self):
        """ColumnSelector for this node's inputs

        Raises
        ------
        RuntimeError
            If ``input_schema`` has not been computed yet.
        """
        if self.input_schema is None:
            raise RuntimeError(
                "The input columns aren't computed until the workflow "
                "is fit to a dataset or input schema."
            )

        if self.selector:
            # To maintain column groupings
            return self.selector
        else:
            return ColumnSelector(self.input_schema.column_names)

    @property
    def output_columns(self):
        """ColumnSelector built from the column names of ``output_schema``

        Raises
        ------
        RuntimeError
            If ``output_schema`` has not been computed yet.
        """
        if self.output_schema is None:
            raise RuntimeError(
                "The output columns aren't computed until the workflow "
                "is fit to a dataset or input schema."
            )

        return ColumnSelector(self.output_schema.column_names)

    @property
    def dependency_schema(self):
        """Combined schema of all dependencies."""
        return _combine_schemas(self.dependencies)

    @property
    def dependency_columns(self):
        """Combined ColumnSelector of the selector-type dependencies."""
        return _combine_selectors(self.dependency_selectors)

    @property
    def dependency_nodes(self):
        """All WorkflowNodes among the dependencies (nested lists are searched too)."""
        return _filter_by_type(self.dependencies, WorkflowNode)

    @property
    def dependency_selectors(self):
        """All ColumnSelectors among the dependencies (nested lists are searched too)."""
        return _filter_by_type(self.dependencies, ColumnSelector)

    @property
    def label(self):
        """Short text describing this node, used by ``__repr__`` and the graph rendering."""
        if self.op and hasattr(self.op, "label"):
            return self.op.label
        elif self.op:
            return str(type(self.op))
        elif not self.parents:
            return f"input cols=[{self._cols_repr}]"
        else:
            return "??"

    @property
    def _cols_repr(self):
        # Prefer the computed input schema, fall back to the selector's names
        if self.input_schema:
            columns = self.input_schema.column_names
        elif self.selector:
            columns = self.selector.names
        else:
            columns = []

        # Show at most three names, with an ellipsis when there are more
        cols_repr = ", ".join(map(str, columns[:3]))
        if len(columns) > 3:
            cols_repr += "..."

        return cols_repr

    @property
    def graph(self):
        """GraphViz rendering of the graph that leads to this node."""
        return _to_graphviz(self)
def iter_nodes(nodes): queue = nodes[:] while queue: current = queue.pop() yield current # TODO: deduplicate nodes? for parent in current.parents: queue.append(parent) for dep in current.dependency_nodes: queue.append(dep) def _filter_by_type(elements, type_): results = [] for elem in elements: if isinstance(elem, type_): results.append(elem) elif isinstance(elem, list): results += _filter_by_type(elem, type_) return results def _combine_schemas(elements): combined = Schema() for elem in elements: if isinstance(elem, WorkflowNode): combined += elem.output_schema elif isinstance(elem, ColumnSelector): combined += Schema(elem.names) elif isinstance(elem, list): combined += _combine_schemas(elem) return combined def _combine_selectors(elements): combined = ColumnSelector() for elem in elements: if isinstance(elem, WorkflowNode): combined += ColumnSelector(elem.output_schema.column_names) elif isinstance(elem, ColumnSelector): combined += elem elif isinstance(elem, list): combined += ColumnSelector(subgroups=_combine_selectors(elem)) return combined def _to_selector(value): if not isinstance(value, (ColumnSelector, WorkflowNode)): return ColumnSelector(value) else: return value def _strs_to_selectors(elements): return [_to_selector(elem) for elem in elements]
def _to_graphviz(workflow_node):
    """Converts a WorkflowNode to a GraphViz DiGraph object useful for display in notebooks

    Parameters
    -----------
    workflow_node: WorkflowNode
        The node whose upstream graph should be rendered

    Returns
    -------
    graphviz.Digraph
    """
    # Imported lazily so graphviz is only needed when a graph is actually requested
    from graphviz import Digraph

    graph = Digraph()

    # get all the nodes from parents of this columngroup
    # and add edges between each of them
    allnodes = list(set(iter_nodes([workflow_node])))
    node_ids = {v: str(k) for k, v in enumerate(allnodes)}
    for node, nodeid in node_ids.items():
        graph.node(nodeid, node.label)
        for parent in node.parents_with_dep_nodes:
            graph.edge(node_ids[parent], nodeid)

        full_selector = ColumnSelector()
        if node.selector and not node.parents:
            full_selector += node.selector
        # `sum` already starts from `full_selector`, so assign its result instead of
        # adding it on top of `full_selector` again (which counted the node's own
        # selector twice)
        full_selector = sum(node.dependency_selectors, full_selector)

        if full_selector.names:
            selector_id = f"{nodeid}_selector"
            graph.node(selector_id, str(full_selector.names))
            graph.edge(selector_id, nodeid)

    # add a single 'output' node representing the final state
    output_node_id = str(len(allnodes))
    output_string = "output cols"
    cols_repr = workflow_node._cols_repr
    if cols_repr:
        output_string += f"=[{cols_repr}]"
    graph.node(output_node_id, output_string)
    graph.edge(node_ids[workflow_node], output_node_id)
    return graph
def _convert_col(col): if isinstance(col, (str, tuple)): return col elif isinstance(col, list): return tuple(col) else: raise ValueError(f"Invalid column value for WorkflowNode: {col}")