Commit d180edc

fix #29 fix #13 fix #15 improve guide with cache and diff hid dim

kayzliu committed Apr 24, 2022
1 parent a94563f commit d180edc
Showing 1 changed file with 125 additions and 95 deletions.
220 changes: 125 additions & 95 deletions pygod/models/guide.py
@@ -4,7 +4,9 @@
# Author: Kay Liu <zliu234@uic.edu>
# License: BSD 2 clause

import os
import torch
import hashlib
import numpy as np
import networkx as nx
import torch.nn as nn
@@ -42,8 +44,10 @@ class GUIDE(BaseDetector):
Parameters
----------
hid_dim : int, optional
Hidden dimension of model. Default: ``0``.
a_hid : int, optional
Hidden dimension for attribute autoencoder. Default: ``32``.
s_hid : int, optional
Hidden dimension for structure autoencoder. Default: ``4``.
num_layers : int, optional
Total number of layers in autoencoders. Default: ``4``.
dropout : float, optional
@@ -77,6 +81,9 @@ class GUIDE(BaseDetector):
selected_motif : bool
Use selected motifs which are defined in the original paper.
Default: ``True``.
cache_dir : str
The directory for the node motif degree caching.
Default: ``None``.
verbose : bool
Verbosity mode. Turn on to print out log information.
Default: ``False``.
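For orientation, a minimal constructor sketch under the updated API (the values below are illustrative, not prescribed by this commit; leaving cache_dir as None falls back to ~/.pygod, as set up in __init__ further down):

from pygod.models import GUIDE

# Illustrative values only: a_hid/s_hid replace the former shared hid_dim,
# and cache_dir controls where node motif degrees are cached.
detector = GUIDE(a_hid=32,          # attribute autoencoder hidden size
                 s_hid=4,           # structure autoencoder hidden size
                 num_layers=4,
                 graphlet_size=4,
                 selected_motif=True,
                 cache_dir=None)    # None -> ~/.pygod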
@@ -90,7 +97,8 @@
"""

def __init__(self,
hid_dim=64,
a_hid=32,
s_hid=4,
num_layers=4,
dropout=0.3,
weight_decay=0.,
@@ -104,11 +112,13 @@ def __init__(self,
num_neigh=-1,
graphlet_size=4,
selected_motif=True,
cache_dir=None,
verbose=False):
super(GUIDE, self).__init__(contamination=contamination)

# model param
self.hid_dim = hid_dim
self.a_hid = a_hid
self.s_hid = s_hid
self.num_layers = num_layers
self.dropout = dropout
self.weight_decay = weight_decay
@@ -131,6 +141,10 @@ def __init__(self,
self.verbose = verbose
self.model = None

if cache_dir is None:
cache_dir = os.path.join(os.path.expanduser('~'), '.pygod')
self.cache_dir = cache_dir

def fit(self, G, y_true=None):
"""
Description
@@ -152,17 +166,17 @@
Fitted estimator.
"""
G.node_idx = torch.arange(G.x.shape[0])
G.s = self.calc_nmf(G)
print(G.s)
G.s = self.get_nmf(G, self.cache_dir)
if self.batch_size == 0:
self.batch_size = G.x.shape[0]
loader = NeighborLoader(G,
[self.num_neigh] * self.num_layers,
batch_size=self.batch_size)

self.model = GUIDE_Base(x_dim=G.x.shape[1],
self.model = GUIDE_Base(a_dim=G.x.shape[1],
s_dim=G.s.shape[1],
hid_dim=self.hid_dim,
a_hid=self.a_hid,
s_hid=self.s_hid,
num_layers=self.num_layers,
dropout=self.dropout,
act=self.act).to(self.device)
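A short end-to-end sketch of fit and decision_function (not shown in the commit itself), assuming a torch_geometric.data.Data object with node features x and edge_index; the toy graph is illustrative. The first fit computes and caches the node motif degrees, and later runs reload them from cache_dir:

import torch
from torch_geometric.data import Data
from pygod.models import GUIDE

# Toy 6-node ring with random 8-dimensional node attributes.
data = Data(x=torch.randn(6, 8),
            edge_index=torch.tensor([[0, 1, 2, 3, 4, 5],
                                     [1, 2, 3, 4, 5, 0]]))

detector = GUIDE()                  # defaults: a_hid=32, s_hid=4, cache_dir -> ~/.pygod
detector.fit(data)                  # builds data.s via get_nmf, then trains both autoencoders
scores = detector.decision_function(data)   # per-node outlier scores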
@@ -225,7 +239,7 @@ def decision_function(self, G):
"""
check_is_fitted(self, ['model'])
G.node_idx = torch.arange(G.x.shape[0])
G.s = self.calc_nmf(G)
G.s = self.get_nmf(G, self.cache_dir)
loader = NeighborLoader(G,
[self.num_neigh] * self.num_layers,
batch_size=self.batch_size)
@@ -289,7 +303,7 @@ def loss_func(self, x, x_, s, s_):
structure_errors
return score

def calc_nmf(self, G):
def get_nmf(self, G, cache_dir):
"""
Description
-----------
@@ -301,116 +315,132 @@
----------
G : PyTorch Geometric Data instance (torch_geometric.data.Data)
The input data.
cache_dir : str
The directory for the node motif degree caching.
Returns
-------
s : torch.Tensor
Structure matrix (node motif degree/graphlet degree)
"""
edge_index = G.edge_index
g = nx.from_edgelist(edge_index.T.tolist())

# create edge subsets
edge_subsets = dict()
subsets = [[edge[0], edge[1]] for edge in g.edges()]
edge_subsets[2] = subsets
unique_subsets = dict()
for i in range(3, self.graphlet_size + 1):
for subset in subsets:
for node in subset:
for neb in g.neighbors(node):
new_subset = subset + [neb]
if len(set(new_subset)) == i:
new_subset.sort()
unique_subsets[tuple(new_subset)] = 1
subsets = [list(k) for k, v in unique_subsets.items()]
edge_subsets[i] = subsets
unique_subsets = dict()
hash_func = hashlib.sha1()
hash_func.update(str(G).encode('utf-8'))
file_name = 'nmd_' + str(hash_func.hexdigest()[:8]) + \
str(self.graphlet_size) + \
str(self.selected_motif)[0] + '.pt'
file_path = os.path.join(cache_dir, file_name)
if os.path.exists(file_path):
s = torch.load(file_path)
else:
edge_index = G.edge_index
g = nx.from_edgelist(edge_index.T.tolist())

# enumerate graphs
graphs = graph_atlas_g()
interesting_graphs = {i: [] for i in
range(2, self.graphlet_size + 1)}
for graph in graphs:
if 1 < graph.number_of_nodes() < self.graphlet_size + 1:
if nx.is_connected(graph):
interesting_graphs[graph.number_of_nodes()].append(
graph)

# enumerate categories
main_index = 0
categories = dict()
for size, graphs in interesting_graphs.items():
categories[size] = dict()
for index, graph in enumerate(graphs):
categories[size][index] = dict()
degrees = list(
set([graph.degree(node) for node in graph.nodes()]))
for degree in degrees:
categories[size][index][degree] = main_index
main_index += 1
unique_motif_count = main_index

# setup feature
features = {node: {i: 0 for i in range(unique_motif_count)}
for node in g.nodes()}
for size, node_lists in edge_subsets.items():
graphs = interesting_graphs[size]
for nodes in node_lists:
sub_gr = g.subgraph(nodes)
# create edge subsets
edge_subsets = dict()
subsets = [[edge[0], edge[1]] for edge in g.edges()]
edge_subsets[2] = subsets
unique_subsets = dict()
for i in range(3, self.graphlet_size + 1):
for subset in subsets:
for node in subset:
for neb in g.neighbors(node):
new_subset = subset + [neb]
if len(set(new_subset)) == i:
new_subset.sort()
unique_subsets[tuple(new_subset)] = 1
subsets = [list(k) for k, v in unique_subsets.items()]
edge_subsets[i] = subsets
unique_subsets = dict()

# enumerate graphs
graphs = graph_atlas_g()
interesting_graphs = {i: [] for i in
range(2, self.graphlet_size + 1)}
for graph in graphs:
if 1 < graph.number_of_nodes() < self.graphlet_size + 1:
if nx.is_connected(graph):
interesting_graphs[graph.number_of_nodes()].append(
graph)

# enumerate categories
main_index = 0
categories = dict()
for size, graphs in interesting_graphs.items():
categories[size] = dict()
for index, graph in enumerate(graphs):
if nx.is_isomorphic(sub_gr, graph):
for node in sub_gr.nodes():
features[node][categories[size][index][
sub_gr.degree(node)]] += 1
break

motifs = [[n] + [features[n][i] for i in range(
unique_motif_count)] for n in g.nodes()]
motifs = torch.Tensor(motifs)
motifs = motifs[torch.sort(motifs[:, 0]).indices, 1:]

if self.selected_motif:
# use motif selected in the original paper only
s = torch.zeros((G.x.shape[0], 6))
# m31
s[:, 0] = motifs[:, 3]
# m32
s[:, 1] = motifs[:, 1] + motifs[:, 2]
# m41
s[:, 2] = motifs[:, 14]
# m42
s[:, 3] = motifs[:, 12] + motifs[:, 13]
# m43
s[:, 4] = motifs[:, 11]
# node degree
s[:, 5] = motifs[:, 0]
else:
# use graphlet degree
s = motifs
categories[size][index] = dict()
degrees = list(
set([graph.degree(node) for node in graph.nodes()]))
for degree in degrees:
categories[size][index][degree] = main_index
main_index += 1
unique_motif_count = main_index

# setup feature
features = {node: {i: 0 for i in range(unique_motif_count)}
for node in g.nodes()}
for size, node_lists in edge_subsets.items():
graphs = interesting_graphs[size]
for nodes in node_lists:
sub_gr = g.subgraph(nodes)
for index, graph in enumerate(graphs):
if nx.is_isomorphic(sub_gr, graph):
for node in sub_gr.nodes():
features[node][categories[size][index][
sub_gr.degree(node)]] += 1
break

motifs = [[n] + [features[n][i] for i in range(
unique_motif_count)] for n in g.nodes()]
motifs = torch.Tensor(motifs)
motifs = motifs[torch.sort(motifs[:, 0]).indices, 1:]

if self.selected_motif:
# use motif selected in the original paper only
s = torch.zeros((G.x.shape[0], 6))
# m31
s[:, 0] = motifs[:, 3]
# m32
s[:, 1] = motifs[:, 1] + motifs[:, 2]
# m41
s[:, 2] = motifs[:, 14]
# m42
s[:, 3] = motifs[:, 12] + motifs[:, 13]
# m43
s[:, 4] = motifs[:, 11]
# node degree
s[:, 5] = motifs[:, 0]
else:
# use graphlet degree
s = motifs

if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
torch.save(s, file_path)

return s


class GUIDE_Base(nn.Module):
def __init__(self,
x_dim,
a_dim,
s_dim,
hid_dim,
a_hid,
s_hid,
num_layers,
dropout,
act):
super(GUIDE_Base, self).__init__()

self.attr_ae = GCN(in_channels=x_dim,
hidden_channels=hid_dim,
self.attr_ae = GCN(in_channels=a_dim,
hidden_channels=a_hid,
num_layers=num_layers,
out_channels=x_dim,
out_channels=a_dim,
dropout=dropout,
act=act)

self.struct_ae = GNA(in_channels=s_dim,
hidden_channels=hid_dim,
hidden_channels=s_hid,
num_layers=num_layers,
out_channels=s_dim,
dropout=dropout,
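For reference, the cache location used by get_nmf above can be reproduced standalone. The sketch below only mirrors the file-name scheme from the diff (a short SHA-1 of the Data object's repr, the graphlet size, and the first letter of selected_motif); nmd_cache_path is a hypothetical helper, not a pygod API:

import os
import hashlib

def nmd_cache_path(G, cache_dir, graphlet_size=4, selected_motif=True):
    # Same scheme as GUIDE.get_nmf: 'nmd_' + 8 hex chars of sha1(str(G))
    # + graphlet size + 'T'/'F' for the motif flag + '.pt'
    hash_func = hashlib.sha1()
    hash_func.update(str(G).encode('utf-8'))
    file_name = ('nmd_' + hash_func.hexdigest()[:8]
                 + str(graphlet_size)
                 + str(selected_motif)[0] + '.pt')
    return os.path.join(cache_dir, file_name)

# e.g. nmd_cache_path(data, os.path.join(os.path.expanduser('~'), '.pygod'))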

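A small worked example of the selected_motif branch in get_nmf: the per-node motif-count tensor (node degree in column 0, then graphlet counts in graph-atlas order, 15 columns for graphlet_size=4) is condensed into six structure features taken from fixed columns. The toy tensor is illustrative; the column indices are those in the diff:

import torch

# Toy counts for 2 nodes, 15 columns (column 0 is the node degree).
motifs = torch.arange(30, dtype=torch.float).reshape(2, 15)

s = torch.zeros((motifs.shape[0], 6))
s[:, 0] = motifs[:, 3]                   # m31
s[:, 1] = motifs[:, 1] + motifs[:, 2]    # m32
s[:, 2] = motifs[:, 14]                  # m41
s[:, 3] = motifs[:, 12] + motifs[:, 13]  # m42
s[:, 4] = motifs[:, 11]                  # m43
s[:, 5] = motifs[:, 0]                   # node degree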
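Finally, the attribute branch of GUIDE_Base is now sized by a_hid instead of the shared hid_dim; here is a standalone sketch of that branch with an assumed attribute dimension (the structure branch uses pygod's GNA layer analogously, sized by s_hid):

from torch_geometric.nn import GCN

a_dim, a_hid, num_layers = 8, 32, 4   # a_dim is assumed; a_hid matches the new default

# Multilayer GCN that reconstructs the node attributes: a_dim -> a_hid -> ... -> a_dim
attr_ae = GCN(in_channels=a_dim,
              hidden_channels=a_hid,
              num_layers=num_layers,
              out_channels=a_dim,
              dropout=0.3,
              act='relu')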