From 49916ae3296f830687aebe1eda55c6b8f1ceb07a Mon Sep 17 00:00:00 2001 From: Akhil Pandey Date: Wed, 27 Jul 2022 12:20:55 -0500 Subject: [PATCH] Issue #4: Added boilerplate code for gnnNAS (#5) - added src/models.py to include the MPNN model class that which will be utilized for all benchmarking experiments - added src/util.py to include helper functions for training, eval, and record metrics - added src/dataset.py to include helper functions for loading Physical chemistry datasets from MoleculeNet - Updated the README.md Signed-off-by: Akhil Akella --- README.md | 79 ++++++++++++++++++++++- src/data.py | 56 +++++++++++++++++ src/models.py | 115 ++++++++++++++++++++++++++++++++++ src/util.py | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 418 insertions(+), 2 deletions(-) create mode 100644 src/data.py create mode 100644 src/models.py create mode 100644 src/util.py diff --git a/README.md b/README.md index dbd1d59..4a00ca7 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,77 @@ -# gnn_uncertainty_ensembles -The project will focus on the design and development of methods to automate the development of graph neural network ensembles and use them for uncertainty quantification +# MetalgPy + gnnNAS + +## About +The project focusses on leveraging the general purpose library [MetalgPy](https://github.com/deephyper/metalgpy) to write symbolized ML programs capable of leveraging graph hyperparameters for better surrogate model fitting. Our goal was to use `MetalgPy` to search for a representation learning algorithm for graph structures. 
+ +## Packages + +- `PyTorch` +- `PyTorch-Geometric` +- `MetalgPy` + +```shell +# Install Pytorch +pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 + +# Install Pytorch Geometric +pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html +pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html +pip install -q git+https://github.com/pyg-team/pytorch_geometric.git + +# Install DeepHyper/MetalgPy +pip install -q metalgpy + +# Install rdkit for the datasets +pip install -q rdkit-pypi +``` + +## Datasets + +We use three benchmark datasets: + +- `GNN Benchmark Dataset` +- `Planetoid-1` +- `MoleculeNet` + +#### GNN Benchmark Dataset + +A variety of artificially and semi-artificially generated graph datasets. It is composed of datasets such as `PATTERN`, `CLUSTER`, `MNIST`, `CIFAR-10`, `TSP`, `CSL`. + +`Reference`: https://arxiv.org/abs/2003.00982 +`Resource`: https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html + +#### Planetoid-1: + +The `Planetoid` dataset comprises the citation network datasets `Cora`, `Citeseer`, and `Pubmed`. These are three benchmark datasets used for semi-supervised node classification tasks. Each of the mentioned graph datasets contains a bag-of-words representation of documents and the citation links between the documents. + +`Reference`: https://arxiv.org/pdf/1603.08861.pdf +`Resource`: https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html + +#### MoleculeNet: + +`MoleculeNet`: MoleculeNet is a benchmark specially designed for testing machine learning methods of molecular properties. As we aim to facilitate the development of molecular machine learning methods, this work curates a number of dataset collections, creates a suite of software that implements many known featurizations and previously proposed algorithms. All methods and datasets are integrated as parts of the open source DeepChem package (MIT license). 
# define the helper method to load dataset
def load_molnet_phys_chem_data(name, batch_size, training_split, seed=2022,
                               root='/tmp/Molnet'):
    """
    Load a Graph dataset from MoleculeNet and split it into
    train/test dataloaders
    Parameters
    ----------
    arg1 | name: str
        Name of the dataset to import from Pytorch Geometric MoleculeNet dataloader.
    arg2 | batch_size: int
        Batch size for creating the train/test dataloaders.
    arg3 | training_split: float
        Fraction of samples (between 0.0 and 1.0) to be kept in training set.
    arg4 | seed: int
        Torch Random seed to ensure reproducibility. Default value is 2022
    arg5 | root: str
        Directory handed to the MoleculeNet loader as its root.
        Default value is '/tmp/Molnet'
    Returns
    -------
    Tuple
        (dataset, train_loader, test_loader) where dataset is the shuffled
        torch_geometric.datasets.molecule_net.MoleculeNet object and the
        loaders are torch_geometric.loader.DataLoader objects
    Raises
    ------
    ValueError
        If training_split is not within [0.0, 1.0]
    """
    # reject an impossible split before touching the dataset; a value
    # outside [0, 1] would silently yield an empty or overlapping split
    if not 0.0 <= training_split <= 1.0:
        raise ValueError(
            f"training_split must be within [0.0, 1.0], got {training_split!r}")

    # load the dataset
    dataset = pyg.datasets.MoleculeNet(root=root, name=name)

    # set the seed; NOTE: this reseeds torch's *global* RNG, so it also
    # affects any random torch operation the caller performs afterwards
    torch.manual_seed(seed)

    # shuffle the data
    dataset = dataset.shuffle()

    # index of the first test sample (floor of the requested fraction)
    stop_index = int(training_split * len(dataset))

    # separate training and test data
    train_dataset = dataset[:stop_index]
    test_dataset = dataset[stop_index:]

    # create dataloaders for train and test samples; only the training
    # loader reshuffles between epochs
    train_loader = pyg.loader.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = pyg.loader.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return dataset, train_loader, test_loader
class MPNN(torch.nn.Module):
    """
    Creates an MPNN model in pytorch geometric

    The model is a node encoder, followed by a stack of
    pyg.nn.DeepGCNLayer blocks, a global add pooling over each graph
    and a two layer linear head.
    """
    def __init__(
            self,
            n_node_features: int,
            n_edge_features: int,
            n_hidden: int,
            n_output: int,
            MPNN_inp: torch.nn.Module,
            MPNN_hidden: torch.nn.Module,
            n_conv_blocks: int,
            skip_connection: str = "plain") -> None:
        """
        Build the MPNN model
        Parameters
        ----------
        arg1 | n_node_features: int
            Number of features at node level
        arg2 | n_edge_features: int
            Number of features at edge level (accepted but not read
            by this class)
        arg3 | n_hidden: int
            Number of hidden activations
        arg4 | n_output: int
            Number of output activations
        arg5 | MPNN_inp: torch.nn.Module
            Layer class used for the node encoder; it is called as
            MPNN_inp(n_node_features, n_hidden)
        arg6 | MPNN_hidden: torch.nn.Module
            Layer class used inside every conv block; it is called as
            MPNN_hidden(in_dimension, n_hidden)
        arg7 | n_conv_blocks: int
            Number of convolutional blocks, at least 1
        arg8 | skip_connection: str
            Block type handed to pyg.nn.DeepGCNLayer. Default is "plain"
        Returns
        -------
        Nothing
            None
        Raises
        ------
        ValueError
            If n_conv_blocks is smaller than 1
        """
        # forward() always indexes conv_layers[0], so a model without
        # any conv block could be built but never run
        if n_conv_blocks < 1:
            raise ValueError(
                f"n_conv_blocks must be at least 1, got {n_conv_blocks!r}")

        # super class the class structure
        super().__init__()

        # input width of the most recently built block; it is also the
        # input width of the linear head below
        self.growth_dimension = n_hidden

        # encode the node information
        self.node_encoder = MPNN_inp(n_node_features, n_hidden)

        # ability to add one or more conv blocks
        conv_blocks = []
        for block in range(n_conv_blocks):
            # with dense skip connections block k takes n_hidden * (k + 1)
            # inputs; every other block type keeps the width at n_hidden
            if skip_connection == "dense":
                self.growth_dimension = n_hidden + (n_hidden * block)
            conv = MPNN_hidden(self.growth_dimension, n_hidden)
            norm = torch.nn.LayerNorm(n_hidden, elementwise_affine=True)
            act = torch.nn.ReLU(inplace=True)
            conv_blocks.append(
                pyg.nn.DeepGCNLayer(conv, norm, act, block=skip_connection))

        # group all the conv layers
        self.conv_layers = torch.nn.ModuleList(conv_blocks)

        # add the linear layers for flattening the output from MPNN
        self.flatten = torch.nn.Sequential(
            torch.nn.Linear(self.growth_dimension, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_output))

    def forward(self,
                x: torch.Tensor,
                edge_index: torch.Tensor,
                batch_idx: torch.Tensor) -> torch.Tensor:
        """
        Process the MPNN model
        Parameters
        ----------
        arg1 | x: torch.Tensor
            Input features at node level
        arg2 | edge_index: torch.Tensor
            Index pairs of vertices
        arg3 | batch_idx: torch.Tensor
            Batch index
        Returns
        -------
        Tensor
            torch.Tensor
        """
        # obtain the input; a message passing encoder also needs the edges
        if isinstance(self.node_encoder, pyg.nn.MessagePassing):
            x = self.node_encoder(x, edge_index)
        else:
            x = self.node_encoder(x)

        # pass the node information to the bare conv of the first block
        x = self.conv_layers[0].conv(x, edge_index)

        # NOTE(review): every block except the LAST one is applied here, so
        # the first block's conv runs twice (bare above, then again inside
        # the full block) and the last block is never called. Assuming
        # DeepGCNLayer concatenates input and output in "dense" mode, this
        # is what makes the final width equal self.growth_dimension, so the
        # original behaviour is kept. Confirm the intent before changing it.
        for block in self.conv_layers[:-1]:
            x = block(x, edge_index)

        # obtain the output from the MPNN final layer
        y = pyg.nn.global_add_pool(x, batch=batch_idx)

        # pass the pooled output to the linear output layer and return it
        return self.flatten(y)
# check if CUDA exists
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")


# define the helper method to train
def model_train(
        model: torch.nn.Module,
        dataloader: "pyg.loader.DataLoader",
        learning_rate: float,
        ITERATIONS: int,
        optimizer: torch.optim.Optimizer,
        compute_loss: Callable,
        logging=False) -> torch.nn.Module:
    """
    Train the Pytorch Geometric model and return
    the model
    Parameters
    ----------
    arg1 | model: torch.nn.Module
        Neural network model to train. It is called as
        model(x, edge_index, batch) and is NOT moved to the device here
    arg2 | dataloader: DataLoader
        Training dataset as a DataLoader object
    arg3 | learning_rate: float
        Kept for interface compatibility; it is not read by this function,
        the optimizer passed in is used exactly as configured
    arg4 | ITERATIONS: int
        Number of passes (epochs) over the dataloader
    arg5 | optimizer: torch.optim.Optimizer
        Optimizer that updates the model parameters
    arg6 | compute_loss: Callable
        Loss function called as compute_loss(y_hat, y)
    arg7 | logging: bool
        Print the loss of the last batch after every epoch
    Returns
    -------
    Pytorch model
        torch.nn.Module
    """
    for iteration in range(ITERATIONS):
        # set the model for training
        model.train()

        # loss of the most recent batch; stays None for an empty dataloader
        last_loss = None

        # iterate in batches over the training dataset
        for data in dataloader:
            # set the gradients to zero
            optimizer.zero_grad()

            # forward pass and compute the y hat values
            y_hat = model(data.x.float().to(device),
                          data.edge_index.long().to(device),
                          data.batch.long().to(device))

            # compute the loss against the ground truth
            cost = compute_loss(y_hat, data.y.to(device))

            # backward pass
            cost.backward()

            # update the weights
            optimizer.step()

            last_loss = cost.item()

        # display the stats (skipped when no batch was seen, where the
        # original would have failed on an undefined loss)
        if logging and last_loss is not None:
            print(f'Epoch: {iteration:03d}, Loss: {last_loss:.4f}')

    # return the trained model
    return model


# define the helper method to evaluate
def model_evaluate(
        model: torch.nn.Module,
        dataloader: "pyg.loader.DataLoader",
) -> "tuple[torch.Tensor, torch.Tensor]":
    """
    Evaluate the Pytorch model and return
    ground truth along with predictions
    Parameters
    ----------
    arg1 | model: torch.nn.Module
        Trained Neural network model
    arg2 | dataloader: DataLoader
        Dataset as a DataLoader object
    Returns
    -------
    Tuple
        (y_true, y_preds), both concatenated over all batches
    Raises
    ------
    ValueError
        If the dataloader yields no batches
    """
    # lists to capture y hats and ground truth per batch
    y_preds = []
    y_true = []

    # set the model to evaluate
    model.eval()

    # no gradients are needed for evaluation
    with torch.no_grad():
        # iterate over the dataloader that was passed in; the original
        # read an undefined name `test_loader` here
        for data in dataloader:
            # store the ground truth
            y_true.append(data.y)

            # gather the model prediction
            out = model(data.x.float().to(device),
                        data.edge_index.long().to(device),
                        data.batch.long().to(device))

            # store the model predictions, one row per sample
            y_preds.append(torch.flatten(out, start_dim=1))

    if not y_true:
        raise ValueError("dataloader yielded no batches to evaluate")

    # return the tuple [Ground truth, Predictions]
    return (torch.cat(y_true), torch.cat(y_preds))


# define the helper method to obtain evaluation metrics
def regression_evaluation_metrics(
        y_true: torch.Tensor,
        y_preds: torch.Tensor,
        metric: str
) -> "dict | float":
    """
    Compute the Pytorch model metrics based
    on the ground truth vs predictions
    Parameters
    ----------
    arg1 | y_true: torch.Tensor
        Ground truth values of the data
    arg2 | y_preds: torch.Tensor
        Model Predictions for the input data
    arg3 | metric: str
        One of "mse", "mae", "rmse", "r2", or "all"
    Returns
    -------
    Metric(s)
        The mapping of all four metrics for "all", the single value for
        a known metric name, and an empty mapping for an unknown name
    """
    # explicit names instead of relying on `from sklearn.metrics import *`
    from sklearn.metrics import (mean_absolute_error, mean_squared_error,
                                 r2_score)

    # store y_preds and y_true as numpy arrays (moved off the GPU first)
    y_true = y_true.detach().cpu().numpy()
    y_preds = y_preds.detach().cpu().numpy()

    # kept as a defaultdict so the returned mapping behaves as before
    results = defaultdict(dict)

    # MSE
    results['mse'] = mean_squared_error(y_true, y_preds)

    # MAE
    results['mae'] = mean_absolute_error(y_true, y_preds)

    # RMSE: root of the per-output MSE, then averaged. This avoids the
    # `squared=False` argument, which newer scikit-learn no longer accepts
    results['rmse'] = np.sqrt(
        mean_squared_error(y_true, y_preds, multioutput="raw_values")).mean()

    # R2
    results['r2'] = r2_score(y_true, y_preds)

    # return appropriate metric(s)
    if metric == "all":
        return results
    if metric in results:
        return results[metric]
    return defaultdict(dict)