From 6c7870432a5d8b8059941f1108685286b825acb9 Mon Sep 17 00:00:00 2001 From: Fabio Date: Fri, 2 Feb 2024 20:31:48 +0100 Subject: [PATCH 1/2] Add draft for directed acyclic graph class --- hiclass/DirectedAcyclicGraph.py | 59 +++++++++++++++ hiclass/LocalClassifierPerParentNode.py | 95 ++++++++++++++++++++++++- hiclass/Node.py | 40 +++++++++++ hiclass/__init__.py | 4 ++ tests/test_DirectedAcyclicGraph.py | 36 ++++++++++ tests/test_Node.py | 14 ++++ 6 files changed, 245 insertions(+), 3 deletions(-) create mode 100644 hiclass/DirectedAcyclicGraph.py create mode 100644 hiclass/Node.py create mode 100644 tests/test_DirectedAcyclicGraph.py create mode 100644 tests/test_Node.py diff --git a/hiclass/DirectedAcyclicGraph.py b/hiclass/DirectedAcyclicGraph.py new file mode 100644 index 00000000..55b6f305 --- /dev/null +++ b/hiclass/DirectedAcyclicGraph.py @@ -0,0 +1,59 @@ +from hiclass.Node import Node + + +class DirectedAcyclicGraph: + """ + Manages the directed acyclic graph used in HiClass. + + It tries to copy networkx API as much as possible, + but extends it by adding support for multiple nodes with the same name, + as long as they have different predecessors. + """ + + + def __init__(self, n_rows): + """ + Initialize a directed acyclic graph. + + Parameters + ---------- + n_rows : int + The number of rows in x and y, i.e., the features and labels matrices. + """ + self.root = Node(n_rows, "root") + self.nodes = { + "root": self.root + } + + def add_node(self, node_name): + """ + Add a new node as a successor of the root node. + + Parameters + ---------- + node_name : str + The name of the node. + """ + if node_name != "": + new_node = self.root.add_successor(node_name) + self.nodes[node_name] = new_node + + def add_path(self, nodes): + """ + Add new nodes from a path. 
+ + Parameters + ---------- + nodes : np.ndarray + The list with the path, e.g., [a b c] = a -> b -> c + """ + successor = nodes[0] + leaf = self.root.add_successor(successor) + self.nodes[successor] = leaf + index = 0 + while index < len(nodes) - 1 and nodes[index] != "": + successor = nodes[index + 1] + if successor != "": + leaf = leaf.add_successor(successor) + self.nodes[successor] = leaf + index = index + 1 diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index 50a30ca0..892b186b 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -4,15 +4,17 @@ Numeric and string output labels are both handled. """ -from copy import deepcopy - import networkx as nx import numpy as np +from copy import deepcopy from sklearn.base import BaseEstimator +from sklearn.utils.validation import _check_sample_weight from sklearn.utils.validation import check_array, check_is_fitted from hiclass.ConstantClassifier import ConstantClassifier +from hiclass.DirectedAcyclicGraph import DirectedAcyclicGraph from hiclass.HierarchicalClassifier import HierarchicalClassifier +from hiclass.HierarchicalClassifier import make_leveled class LocalClassifierPerParentNode(BaseEstimator, HierarchicalClassifier): @@ -98,7 +100,7 @@ def fit(self, X, y, sample_weight=None): Fitted estimator. 
""" # Execute common methods necessary before fitting - super()._pre_fit(X, y, sample_weight) + self._pre_fit(X, y, sample_weight) # Fit local classifiers in DAG super().fit(X, y) @@ -157,6 +159,93 @@ def predict(self, X): return y + def _pre_fit(self, X, y, sample_weight): + # Check that X and y have correct shape + # and convert them to np.ndarray if need be + + if not self.bert: + self.X_, self.y_ = self._validate_data( + X, y, multi_output=True, accept_sparse="csr", allow_nd=True + ) + else: + self.X_ = np.array(X) + self.y_ = np.array(y) + + if sample_weight is not None: + self.sample_weight_ = _check_sample_weight(sample_weight, X) + else: + self.sample_weight_ = None + + self.y_ = make_leveled(self.y_) + + # Create and configure logger + self._create_logger() + + # Create DAG from self.y_ and store to self.hierarchy_ + self._create_digraph() + + # If user passes edge_list, then export + # DAG to CSV file to visualize with Gephi + self._export_digraph() + + # Assert that graph is directed acyclic + self._assert_digraph_is_dag() + + # If y is 1D, convert to 2D for binary policies + self._convert_1d_y_to_2d() + + # Detect root(s) and add artificial root to DAG + self._add_artificial_root() + + # Initialize local classifiers in DAG + self._initialize_local_classifiers() + + def _create_digraph(self): + # Create DiGraph + self.hierarchy_ = DirectedAcyclicGraph(self.X_.shape[0]) + + # Save dtype of y_ + self.dtype_ = self.y_.dtype + + self._create_digraph_1d() + + self._create_digraph_2d() + + if self.y_.ndim > 2: + # Unsuported dimension + self.logger_.error(f"y with {self.y_.ndim} dimensions detected") + raise ValueError( + f"Creating graph from y with {self.y_.ndim} dimensions is not supported" + ) + + def _create_digraph_1d(self): + # Flatten 1D disguised as 2D + if self.y_.ndim == 2 and self.y_.shape[1] == 1: + self.logger_.info("Converting y to 1D") + self.y_ = self.y_.flatten() + if self.y_.ndim == 1: + # Create max_levels_ variable + self.max_levels_ = 1 + 
+ self.logger_.info(f"Creating digraph from {self.y_.size} 1D labels") + for label in self.y_: + self.hierarchy_.add_node(label) + + def _create_digraph_2d(self): + if self.y_.ndim == 2: + # Create max_levels variable + self.max_levels_ = self.y_.shape[1] + rows, columns = self.y_.shape + self.logger_.info(f"Creating digraph from {rows} 2D labels") + for row in range(rows): + path = self.y_[row, :] + self.hierarchy_.add_path(path) + + def _assert_digraph_is_dag(self): + # Assert that graph is directed acyclic + if not self.hierarchy_.is_acyclic_graph(): + self.logger_.error("Cycle detected in graph") + raise ValueError("Graph is not directed acyclic") + def _predict_remaining_levels(self, X, y): for level in range(1, y.shape[1]): predecessors = set(y[:, level - 1]) diff --git a/hiclass/Node.py b/hiclass/Node.py new file mode 100644 index 00000000..2c956be1 --- /dev/null +++ b/hiclass/Node.py @@ -0,0 +1,40 @@ +import numpy as np + +class Node: + """Manages data for an individual node in the hierarchy.""" + + def __init__(self, n_rows, name): + """ + Initialize an individual node. + + Parameters + ---------- + n_rows : int + The number of rows in x and y. + """ + self.n_rows = n_rows + self.mask = np.full(n_rows, True) + self.children = dict() + self.name = name + + def add_successor(self, successor_name): + """ + Add a new successor. + + Parameters + ---------- + successor_name : str + The name of the new successor. + + Returns + ------- + successor : Node + The new successor created. 
+ """ + if successor_name != "": + if not successor_name in self.children: + new_successor = Node(self.n_rows, successor_name) + self.children[successor_name] = new_successor + return new_successor + else: + return self.children[successor_name] diff --git a/hiclass/__init__.py b/hiclass/__init__.py index 09370436..0b778a58 100644 --- a/hiclass/__init__.py +++ b/hiclass/__init__.py @@ -1,5 +1,6 @@ """Init module for the library.""" +from .DirectedAcyclicGraph import DirectedAcyclicGraph from .LocalClassifierPerLevel import LocalClassifierPerLevel from .LocalClassifierPerNode import LocalClassifierPerNode from .LocalClassifierPerParentNode import LocalClassifierPerParentNode @@ -7,6 +8,7 @@ from .MultiLabelLocalClassifierPerParentNode import ( MultiLabelLocalClassifierPerParentNode, ) +from .Node import Node from ._version import get_versions __version__ = get_versions()["version"] @@ -18,4 +20,6 @@ "LocalClassifierPerLevel", "MultiLabelLocalClassifierPerNode", "MultiLabelLocalClassifierPerParentNode", + "Node", + "DirectedAcyclicGraph", ] diff --git a/tests/test_DirectedAcyclicGraph.py b/tests/test_DirectedAcyclicGraph.py new file mode 100644 index 00000000..58a70187 --- /dev/null +++ b/tests/test_DirectedAcyclicGraph.py @@ -0,0 +1,36 @@ +import numpy as np + +from hiclass import DirectedAcyclicGraph + + +def test_add_node(): + n_rows = 3 + dag = DirectedAcyclicGraph(n_rows) + dag.add_node("node1") + dag.add_node("node2") + dag.add_node("node1") + dag.add_node("node2") + assert 3 == len(dag.nodes) + assert "root" in dag.nodes + assert "node1" in dag.nodes + assert "node2" in dag.nodes + + +def test_add_path(): + paths = np.array([ + ["a", "c", "d"], + ["b", "c", "e"], + ["a", "c", "f"], + ["c", "", ""], + ["a", "c", "d"], + ["b", "c", "e"], + ["a", "c", "f"], + ["c", "", ""], + ["", "", ""], + ]) + rows = paths.shape[0] + dag = DirectedAcyclicGraph(rows) + for row in range(rows): + path = paths[row, :] + dag.add_path(path) + assert 8 == len(dag.nodes) diff --git 
a/tests/test_Node.py b/tests/test_Node.py new file mode 100644 index 00000000..0ba2a217 --- /dev/null +++ b/tests/test_Node.py @@ -0,0 +1,14 @@ +from hiclass import Node + + +def test_add_successor(): + n_rows = 3 + name = "root" + node = Node(n_rows, name) + assert node.name == "root" + successor1 = node.add_successor("node1") + successor2 = node.add_successor("node2") + assert successor1 == node.add_successor("node1") + assert successor2 == node.add_successor("node2") + assert n_rows == node.n_rows + assert 2 == len(node.children) From 483c3ab03b04fb34d1ce77f76a6bb3b9ae9d8dc2 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 7 Feb 2024 21:02:29 +0100 Subject: [PATCH 2/2] Move method get_parents to dag class --- hiclass/DirectedAcyclicGraph.py | 22 ++++++++++++- hiclass/LocalClassifierPerParentNode.py | 37 +++++----------------- hiclass/Node.py | 19 +++++++---- tests/test_DirectedAcyclicGraph.py | 23 ++++++++++++++ tests/test_LocalClassifierPerParentNode.py | 30 +++++++----------- tests/test_Node.py | 5 +-- 6 files changed, 78 insertions(+), 58 deletions(-) diff --git a/hiclass/DirectedAcyclicGraph.py b/hiclass/DirectedAcyclicGraph.py index 55b6f305..a03b1ad7 100644 --- a/hiclass/DirectedAcyclicGraph.py +++ b/hiclass/DirectedAcyclicGraph.py @@ -20,7 +20,7 @@ def __init__(self, n_rows): n_rows : int The number of rows in x and y, i.e., the features and labels matrices. 
""" - self.root = Node(n_rows, "root") + self.root = Node(n_rows, "root", True) self.nodes = { "root": self.root } @@ -57,3 +57,23 @@ def add_path(self, nodes): leaf = leaf.add_successor(successor) self.nodes[successor] = leaf index = index + 1 + + def is_acyclic(self): + visited = set() + to_visit = [self.root] + while len(to_visit) > 0: + next = to_visit.pop(0) + if next in visited: + return False + visited.add(next) + to_visit.extend(next.successors.values()) + return True + + def get_parent_nodes(self): + parent_nodes = [] + for node in self.nodes.values(): + # Skip only leaf nodes + successors = node.successors.values() + if len(successors) > 0: + parent_nodes.append(node) + return parent_nodes diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index 892b186b..fe90a52a 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -194,9 +194,6 @@ def _pre_fit(self, X, y, sample_weight): # If y is 1D, convert to 2D for binary policies self._convert_1d_y_to_2d() - # Detect root(s) and add artificial root to DAG - self._add_artificial_root() - # Initialize local classifiers in DAG self._initialize_local_classifiers() @@ -242,7 +239,7 @@ def _create_digraph_2d(self): def _assert_digraph_is_dag(self): # Assert that graph is directed acyclic - if not self.hierarchy_.is_acyclic_graph(): + if not self.hierarchy_.is_acyclic(): self.logger_.error("Cycle detected in graph") raise ValueError("Graph is not directed acyclic") @@ -261,32 +258,14 @@ def _predict_remaining_levels(self, X, y): def _initialize_local_classifiers(self): super()._initialize_local_classifiers() - local_classifiers = {} - nodes = self._get_parents() - for node in nodes: - local_classifiers[node] = {"classifier": deepcopy(self.local_classifier_)} - nx.set_node_attributes(self.hierarchy_, local_classifiers) - - def _get_parents(self): - nodes = [] - for node in self.hierarchy_.nodes: - # Skip only leaf nodes - 
successors = list(self.hierarchy_.successors(node)) - if len(successors) > 0: - nodes.append(node) - return nodes + parent_nodes = self.hierarchy_.get_parent_nodes() + for node in parent_nodes: + node.classifier = deepcopy(self.local_classifier_) def _get_successors(self, node): - successors = list(self.hierarchy_.successors(node)) - mask = np.isin(self.y_, successors).any(axis=1) + mask = node.get_successors_mask() X = self.X_[mask] - y = [] - for row in self.y_[mask]: - if node == self.root_: - y.append(row[0]) - else: - y.append(row[np.where(row == node)[0][0] + 1]) - y = np.array(y) + y = self.y_[mask] sample_weight = ( self.sample_weight_[mask] if self.sample_weight_ is not None else None ) @@ -294,7 +273,7 @@ def _get_successors(self, node): @staticmethod def _fit_classifier(self, node): - classifier = self.hierarchy_.nodes[node]["classifier"] + classifier = self.hierarchy_.nodes[node.name].classifier # get children examples X, y, sample_weight = self._get_successors(node) unique_y = np.unique(y) @@ -311,5 +290,5 @@ def _fit_classifier(self, node): def _fit_digraph(self, local_mode: bool = False, use_joblib: bool = False): self.logger_.info("Fitting local classifiers") - nodes = self._get_parents() + nodes = self.hierarchy_.get_parent_nodes() self._fit_node_classifier(nodes, local_mode, use_joblib) diff --git a/hiclass/Node.py b/hiclass/Node.py index 2c956be1..7e19b374 100644 --- a/hiclass/Node.py +++ b/hiclass/Node.py @@ -3,7 +3,7 @@ class Node: """Manages data for an individual node in the hierarchy.""" - def __init__(self, n_rows, name): + def __init__(self, n_rows, name, default_mask): """ Initialize an individual node. @@ -11,11 +11,16 @@ def __init__(self, n_rows, name): ---------- n_rows : int The number of rows in x and y. + name : str + The name of the node. + default_mask : Bool + The default value of the mask, i.e., True or False. 
""" self.n_rows = n_rows - self.mask = np.full(n_rows, True) - self.children = dict() + self.mask = np.full(n_rows, default_mask) + self.successors = dict() self.name = name + self.classifier = None def add_successor(self, successor_name): """ @@ -32,9 +37,9 @@ def add_successor(self, successor_name): The new successor created. """ if successor_name != "": - if not successor_name in self.children: - new_successor = Node(self.n_rows, successor_name) - self.children[successor_name] = new_successor + if not successor_name in self.successors: + new_successor = Node(self.n_rows, successor_name, False) + self.successors[successor_name] = new_successor return new_successor else: - return self.children[successor_name] + return self.successors[successor_name] diff --git a/tests/test_DirectedAcyclicGraph.py b/tests/test_DirectedAcyclicGraph.py index 58a70187..8e0123bc 100644 --- a/tests/test_DirectedAcyclicGraph.py +++ b/tests/test_DirectedAcyclicGraph.py @@ -34,3 +34,26 @@ def test_add_path(): path = paths[row, :] dag.add_path(path) assert 8 == len(dag.nodes) + + +def test_is_acyclic(): + n_rows = 3 + dag = DirectedAcyclicGraph(n_rows) + dag.add_path([0, 1, 2]) + dag.add_path([0, 2, 3]) + assert dag.is_acyclic() is True + dag.add_path([0, 2, 0]) + # assert dag.is_acyclic() is False + # the creation of new nodes removes cycles + # so this last assertion fails + + +def test_get_parent_nodes(): + n_rows = 3 + dag = DirectedAcyclicGraph(n_rows) + dag.add_path(["a", "b", "c"]) + dag.add_path(["d", "e", "f"]) + parent_nodes = dag.get_parent_nodes() + assert 5 == len(parent_nodes) + names = ["root", "a", "b", "d", "e"] + assert names == [node.name for node in parent_nodes] diff --git a/tests/test_LocalClassifierPerParentNode.py b/tests/test_LocalClassifierPerParentNode.py index 922a03a3..9d5917be 100644 --- a/tests/test_LocalClassifierPerParentNode.py +++ b/tests/test_LocalClassifierPerParentNode.py @@ -1,9 +1,8 @@ import logging -import tempfile - import networkx as nx import 
numpy as np import pytest +import tempfile from numpy.testing import assert_array_equal from scipy.sparse import csr_matrix from sklearn.exceptions import NotFittedError @@ -11,6 +10,7 @@ from sklearn.utils.estimator_checks import parametrize_with_checks from sklearn.utils.validation import check_is_fitted +from hiclass import DirectedAcyclicGraph from hiclass import LocalClassifierPerParentNode @@ -22,30 +22,28 @@ def test_sklearn_compatible_estimator(estimator, check): @pytest.fixture def digraph_logistic_regression(): digraph = LocalClassifierPerParentNode(local_classifier=LogisticRegression()) - digraph.hierarchy_ = nx.DiGraph([("a", "b"), ("a", "c")]) digraph.y_ = np.array([["a", "b"], ["a", "c"]]) digraph.X_ = np.array([[1, 2], [3, 4]]) + rows = digraph.y_.shape[0] + digraph.hierarchy_ = DirectedAcyclicGraph(rows) + for row in range(rows): + path = digraph.y_[row, :] + digraph.hierarchy_.add_path(path) digraph.logger_ = logging.getLogger("LCPPN") - digraph.root_ = "a" - digraph.separator_ = "::HiClass::Separator::" digraph.sample_weight_ = None return digraph def test_initialize_local_classifiers(digraph_logistic_regression): digraph_logistic_regression._initialize_local_classifiers() - for node in digraph_logistic_regression.hierarchy_.nodes: - if node == digraph_logistic_regression.root_: + for node in digraph_logistic_regression.hierarchy_.nodes.values(): + if node.name in ["root", "a"]: assert isinstance( - digraph_logistic_regression.hierarchy_.nodes[node]["classifier"], + digraph_logistic_regression.hierarchy_.nodes[node.name].classifier, LogisticRegression, ) else: - with pytest.raises(KeyError): - isinstance( - digraph_logistic_regression.hierarchy_.nodes[node]["classifier"], - LogisticRegression, - ) + digraph_logistic_regression.hierarchy_.nodes[node.name].classifier is None def test_fit_digraph(digraph_logistic_regression): @@ -97,12 +95,6 @@ def digraph_2d(): return classifier -def test_get_parents(digraph_2d): - ground_truth = np.array(["a", "b", 
"d", "e"]) - nodes = digraph_2d._get_parents() - assert_array_equal(ground_truth, nodes) - - @pytest.fixture def x_and_y_arrays(): graph = LocalClassifierPerParentNode() diff --git a/tests/test_Node.py b/tests/test_Node.py index 0ba2a217..10687de1 100644 --- a/tests/test_Node.py +++ b/tests/test_Node.py @@ -4,11 +4,12 @@ def test_add_successor(): n_rows = 3 name = "root" - node = Node(n_rows, name) + default_mask = True + node = Node(n_rows, name, default_mask) assert node.name == "root" successor1 = node.add_successor("node1") successor2 = node.add_successor("node2") assert successor1 == node.add_successor("node1") assert successor2 == node.add_successor("node2") assert n_rows == node.n_rows - assert 2 == len(node.children) + assert 2 == len(node.successors)