From 49916ae3296f830687aebe1eda55c6b8f1ceb07a Mon Sep 17 00:00:00 2001
From: Akhil Pandey <akhilpandey95@icloud.com>
Date: Wed, 27 Jul 2022 12:20:55 -0500
Subject: [PATCH] Issue #4: Added boilerplate code for gnnNAS (#5)

- added src/models.py to include the MPNN model class that will be
utilized for all benchmarking experiments
- added src/util.py to include helper functions for training, eval, and
record metrics
- added src/data.py to include helper functions for loading Physical
chemistry datasets from MoleculeNet
- Updated the README.md

Signed-off-by: Akhil Akella <aakella@swing.lcrc.anl.gov>
---
 README.md     |  79 ++++++++++++++++++++++-
 src/data.py   |  56 +++++++++++++++++
 src/models.py | 115 ++++++++++++++++++++++++++++++++++
 src/util.py   | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 418 insertions(+), 2 deletions(-)
 create mode 100644 src/data.py
 create mode 100644 src/models.py
 create mode 100644 src/util.py

diff --git a/README.md b/README.md
index dbd1d59..4a00ca7 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,77 @@
-# gnn_uncertainty_ensembles
-The project will focus on the design and development of methods to automate the development of graph neural network ensembles and use them for uncertainty quantification
+# MetalgPy + gnnNAS
+
+## About
+The project focusses on leveraging the general purpose library [MetalgPy](https://github.com/deephyper/metalgpy) to write symbolized ML programs capable of leveraging graph hyperparameters for better surrogate model fitting. Our goal was to use `MetalgPy` to search for a representation learning algorithm for graph structures.
+
+## Packages
+
+- `PyTorch`
+- `PyTorch-Geometric`
+- `MetalgPy`
+
+```shell
+# Install Pytorch
+pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
+
+# Install Pytorch Geometric
+pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
+pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
+pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
+
+# Install DeepHyper/MetalgPy
+pip install -q metalgpy
+
+# Install rdkit for the datasets
+pip install -q rdkit-pypi
+```
+
+## Datasets
+
+We use three benchmark datasets
+
+- `GNN Benchmark Dataset` 
+- `Planetoid-1`
+- `MoleculeNet`
+
+#### GNN Benchmark Dataset
+
+A variety of artificially and semi-artificially generated graph datasets. It is composed of datasets such as `PATTERN`, `CLUSTER`, `MNIST`, `CIFAR-10`, `TSP`, `CSL`. 
+
+`Reference`: https://arxiv.org/abs/2003.00982
+`Resource`: https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html
+
+#### Planetoid-1:
+
+The `Planetoid` dataset comprises the citation network datasets `Cora`, `Citeseer`, and `Pubmed`. These are three benchmark datasets used for semi-supervised node classification tasks. Each of the mentioned graph datasets contains a bag-of-words representation of documents and citation links between the documents.
+
+`Reference`: https://arxiv.org/pdf/1603.08861.pdf
+`Resource`: https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html
+
+#### MoleculeNet:
+
+`MoleculeNet`: MoleculeNet is a benchmark specially designed for testing machine learning methods of molecular properties. As we aim to facilitate the development of molecular machine learning methods, this work curates a number of dataset collections, creates a suite of software that implements many known featurizations and previously proposed algorithms. All methods and datasets are integrated as parts of the open source DeepChem package (MIT license).
+
+Within `MoleculeNet`, we are interested in benchmarking the Quantum Mechanics and Physical chemistry datasets.
+
+`Quantum Mechanics`:
+- QM7/QM7b (structure): Electronic properties(atomization energy, HOMO/LUMO, etc.) determined using ab-initio density functional theory(DFT).
+- QM8 (structure): Electronic spectra and excited state energy of small molecules calculated by multiple quantum mechanic methods.
+- QM9 (structure): Geometric,  energetic, electronic and thermodynamic properties of DFT-modelled small molecules.
+
+`Physical chemistry`:
+- ESOL: Water solubility data(log solubility in mols per litre) for common organic small molecules.
+- FreeSolv: Experimental and calculated hydration free energy of small molecules in water.
+- Lipophilicity: Experimental results of octanol/water distribution coefficient(logD at pH 7.4).
+
+`Reference`: https://moleculenet.org/datasets-1
+`Resource`: https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html
+
+## Results
+TBA
+
+## Author
+[Akhil Pandey](https://github.com/akhilpandey95)
+
+## Supervisor
+[Prasanna Balaprakash](https://github.com/pbalapra)
+
diff --git a/src/data.py b/src/data.py
new file mode 100644
index 0000000..76d11e8
--- /dev/null
+++ b/src/data.py
@@ -0,0 +1,56 @@
+# This Source Code Form is subject to the terms of the
+# BSD 2-Clause "Simplified" License. If a copy of the same 
+# was not distributed with this file, You can obtain one at
+# https://github.com/akhilpandey95/gnnNAS/blob/master/LICENSE.
+
+import torch
+import numpy as np
+import torch_geometric as pyg
+
+# define the helper method to load dataset
+def load_molnet_phys_chem_data(name, batch_size, training_split, seed=2022):
+    """
+    Load a MoleculeNet graph dataset and split it into train/test loaders
+    Parameters
+    ----------
+    arg1 | name: str
+        Name of the dataset to import from Pytorch Geometric MoleculeNet dataloader.
+    arg2 | batch_size: int
+        Batch size for creating the train/test dataloaders.
+    arg3 | training_split: float
+        Fraction (between 0 and 1) of samples to be kept in training set.
+    arg4 | seed: int
+        Torch Random seed to ensure reproducibility. Default value is 2022
+    Returns
+    -------
+    Tuple
+        (shuffled dataset, train DataLoader, test DataLoader)
+    Raises
+    ------
+    ValueError
+        If training_split is not within [0, 1].
+    """
+    # a percentage such as 80 would silently yield an empty test set
+    if not 0.0 <= training_split <= 1.0:
+        raise ValueError("training_split must be a fraction within [0, 1]")
+
+    # load the dataset; its root directory is hard-coded to /tmp/Molnet
+    dataset = pyg.datasets.MoleculeNet(root='/tmp/Molnet', name=name)
+
+    # seed the global torch RNG right before shuffling the samples
+    torch.manual_seed(seed)
+    dataset = dataset.shuffle()
+
+    # first `stop_index` shuffled samples train the model, the rest test it
+    stop_index = int(training_split * len(dataset))
+    train_dataset, test_dataset = dataset[:stop_index], dataset[stop_index:]
+
+    # only the training loader is asked to shuffle its batches
+    train_loader = pyg.loader.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+    test_loader = pyg.loader.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+    return dataset, train_loader, test_loader
+
+
+
+
+
diff --git a/src/models.py b/src/models.py
new file mode 100644
index 0000000..8e0bd7a
--- /dev/null
+++ b/src/models.py
@@ -0,0 +1,115 @@
+# This Source Code Form is subject to the terms of the
+# BSD 2-Clause "Simplified" License. If a copy of the same 
+# was not distributed with this file, You can obtain one at
+# https://github.com/akhilpandey95/gnnNAS/blob/master/LICENSE.
+
+import torch
+import numpy as np
+import torch_geometric as pyg
+
+class MPNN(torch.nn.Module):
+    """
+    Creates an MPNN model in pytorch geometric
+    """
+    def __init__(
+            self,
+            n_node_features: int,
+            n_edge_features: int,
+            n_hidden: int,
+            n_output: int,
+            MPNN_inp: torch.nn.Module,
+            MPNN_hidden: torch.nn.Module,
+            n_conv_blocks: int,
+            skip_connection: str="plain") -> None:
+        """
+        Build the MPNN model
+        Parameters
+        ----------
+        arg1 | n_node_features: int
+            Number of features at node level
+        arg2 | n_edge_features: int
+            Number of features at edge level (accepted but not used yet)
+        arg3 | n_hidden: int
+            Number of hidden activations
+        arg4 | n_output: int
+            Number of output activations
+        arg5 | MPNN_inp: torch.nn.Module
+            Layer class, called as MPNN_inp(n_node_features, n_hidden)
+        arg6 | MPNN_hidden: torch.nn.Module
+            Layer class, called as MPNN_hidden(in_features, n_hidden)
+        arg7 | n_conv_blocks: int
+            Number of convolutional blocks
+        arg8 | skip_connection: str
+            Block type passed to pyg.nn.DeepGCNLayer. Default is "plain"
+        """
+        super().__init__()
+
+        # encode the node information
+        self.node_encoder = MPNN_inp(n_node_features, n_hidden)
+
+        # width of the node features entering the next conv block
+        self.growth_dimension = n_hidden
+
+        # ability to add one or more conv blocks
+        conv_blocks = []
+        for block in range(n_conv_blocks):
+            conv = MPNN_hidden(self.growth_dimension, n_hidden)
+            norm = torch.nn.LayerNorm(n_hidden, elementwise_affine=True)
+            act = torch.nn.ReLU(inplace=True)
+            layer = pyg.nn.DeepGCNLayer(conv, norm, act, block=skip_connection)
+            conv_blocks.append(layer)
+
+            # forward() only uses the conv of block 0, so the width first
+            # grows after block 1: every fully applied "dense" block
+            # concatenates its input with its n_hidden new features
+            if skip_connection == "dense" and block > 0:
+                self.growth_dimension += n_hidden
+
+        # group all the conv layers
+        self.conv_layers = torch.nn.ModuleList(conv_blocks)
+
+        # add the linear layers for flattening the output from MPNN
+        self.flatten = torch.nn.Sequential(
+            torch.nn.Linear(self.growth_dimension, n_hidden),
+            torch.nn.ReLU(),
+            torch.nn.Linear(n_hidden, n_output))
+
+    def forward(self,
+                x: torch.Tensor,
+                edge_index: torch.Tensor,
+                batch_idx: torch.Tensor) -> torch.Tensor:
+        """
+        Process the MPNN model
+        Parameters
+        ----------
+        arg1 | x: torch.Tensor
+            Input features at node level
+        arg2 | edge_index: torch.Tensor
+            Index pairs of vertices
+        arg3 | batch_idx: torch.Tensor
+            Batch index of every node, used to pool nodes per graph
+        Returns
+        -------
+        Tensor
+            Output of the final linear layers for the pooled node features
+        """
+        # encode the input; message passing encoders also need the edges
+        if isinstance(self.node_encoder, pyg.nn.MessagePassing):
+            x = self.node_encoder(x, edge_index)
+        else:
+            x = self.node_encoder(x)
+
+        # only the convolution of the first block is applied here
+        x = self.conv_layers[0].conv(x, edge_index)
+
+        # every remaining block is applied in full, each one exactly once
+        for layer in self.conv_layers[1:]:
+            x = layer(x, edge_index)
+
+        # sum the node features of every graph in the batch
+        y = pyg.nn.global_add_pool(x, batch=batch_idx)
+
+        # pass the pooled output to the linear output layers
+        return self.flatten(y)
+
+
diff --git a/src/util.py b/src/util.py
new file mode 100644
index 0000000..36a6614
--- /dev/null
+++ b/src/util.py
@@ -0,0 +1,170 @@
+# This Source Code Form is subject to the terms of the
+# BSD 2-Clause "Simplified" License. If a copy of the same 
+# was not distributed with this file, You can obtain one at
+# https://github.com/akhilpandey95/gnnNAS/blob/master/LICENSE.
+
+import torch
+import numpy as np
+import torch_geometric as pyg
+from typing import Callable
+from collections import defaultdict
+from sklearn.metrics import *
+
+# select the device once at import time: CUDA when available, else CPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using {device} device")
+
+# define the helper method to train
+def model_train(
+        model: torch.nn.Module,
+        dataloader: pyg.loader.DataLoader,
+        learning_rate: float,
+        ITERATIONS: int,
+        optimizer: torch.optim.Optimizer,
+        compute_loss: Callable,
+        logging=False) -> torch.nn.Module:
+    """
+    Train the Pytorch Geometric model and return
+    the model
+    Parameters
+    ----------
+    arg1 | model: torch.nn.Module
+        Neural network model to train
+    arg2 | dataloader: DataLoader
+        Training dataset as a DataLoader object
+    arg3 | learning_rate: float
+        Not used by this function; kept for interface compatibility
+    arg4 | ITERATIONS: int
+        Number of passes (epochs) over the dataloader
+    arg5 | optimizer: torch.optim.Optimizer
+        Optimizer that is zeroed and stepped once per batch
+    arg6 | compute_loss: Callable
+        Loss function called as compute_loss(predictions, targets)
+    arg7 | logging: bool
+        Print the loss of the last batch after every epoch
+    Returns
+    -------
+    Pytorch model
+        torch.nn.Module
+    """
+    for iteration in range(ITERATIONS):
+        # set the model for training
+        model.train()
+
+        # stays None when the dataloader yields no batch at all
+        cost = None
+
+        # iterate in batches over the training dataset
+        for data in dataloader:
+            # set the gradients to zero
+            optimizer.zero_grad()
+
+            # forward pass and compute the y hat values
+            y_hat = model(data.x.float().to(device),
+                          data.edge_index.long().to(device),
+                          data.batch.long().to(device))
+
+            # compute the loss, backpropagate and update the weights
+            cost = compute_loss(y_hat, data.y.to(device))
+            cost.backward()
+            optimizer.step()
+
+        # display the stats (skipped when no batch produced a loss)
+        if logging and cost is not None:
+            print(f'Epoch: {iteration:03d}, Loss: {cost:.4f}')
+
+    return model
+
+# define the helper method to evaluate
+def model_evaluate(
+        model: torch.nn.Module,
+        dataloader: pyg.loader.DataLoader,
+) -> (torch.Tensor, torch.Tensor):
+    """
+    Evaluate the Pytorch model and return
+    ground truth along with predictions
+    Parameters
+    ----------
+    arg1 | model: torch.nn.Module
+        Trained Neural network model
+    arg2 | dataloader: DataLoader
+        Dataset as a DataLoader object
+    Returns
+    -------
+    Tuple
+        (ground truth, predictions) concatenated over all batches
+    """
+    # init empty lists to capture y hats and ground truth
+    y_preds = []
+    y_true = []
+
+    # set the model to evaluate
+    model.eval()
+
+    # gradients are not needed while evaluating
+    with torch.no_grad():
+        # iterate in batches over the dataloader passed by the caller
+        for data in dataloader:
+            # store the ground truth
+            y_true.append(data.y)
+
+            # gather the model prediction
+            out = model(data.x.float().to(device),
+                        data.edge_index.long().to(device),
+                        data.batch.long().to(device))
+
+            # store the model predictions
+            y_preds.append(torch.flatten(out, start_dim=1))
+
+    # concat the ground truth and predictions obtained in batches
+    y_true, y_preds = torch.cat(y_true), torch.cat(y_preds)
+
+    return (y_true, y_preds)
+
+# define the helper method to obtain evaluation metrics
+def regression_evaluation_metrics(
+        y_true: torch.Tensor,
+        y_preds: torch.Tensor,
+        metric: str
+) -> "dict | float":
+    """
+    Compute regression metrics from the ground truth vs predictions
+    Parameters
+    ----------
+    arg1 | y_true: torch.Tensor
+        Ground truth values of the data
+    arg2 | y_preds: torch.Tensor
+        Model Predictions for the input data
+    arg3 | metric: str
+        One of "mse", "mae", "rmse", "r2" or "all"
+    Returns
+    -------
+    Dict or float
+        All metrics for "all", one value for a known metric, else an empty dict
+    """
+    results = defaultdict(dict)
+
+    # store both tensors as numpy arrays, moving them to the CPU first
+    y_true = y_true.detach().cpu().numpy()
+    y_preds = y_preds.detach().cpu().numpy()
+
+    # MSE and MAE
+    results['mse'] = mean_squared_error(y_true, y_preds)
+    results['mae'] = mean_absolute_error(y_true, y_preds)
+
+    # RMSE per output, then averaged; `squared=False` is avoided because
+    # recent scikit-learn releases no longer accept that keyword
+    results['rmse'] = np.sqrt(mean_squared_error(y_true, y_preds, multioutput='raw_values')).mean()
+
+    # R2
+    results['r2'] = r2_score(y_true, y_preds)
+
+    # return appropriate metric(s); unknown names yield an empty dict
+    if metric == "all":
+        return results
+    return results[metric] if metric in results else defaultdict(dict)
+
+
+
+
+