
Merge pull request #44 from nbrg-ppcu/release
Finetuning update
obalasz authored Feb 14, 2024
2 parents a3b37f0 + eeb8892 commit cca0379
Showing 5 changed files with 245 additions and 45 deletions.
28 changes: 11 additions & 17 deletions examples/Finetuning.ipynb
@@ -36,7 +36,7 @@
"outputs": [],
"source": [
"# ProkBERT\n",
"!pip install git+https://github.com/nbrg-ppcu/prokbert\n",
"!pip install prokbert\n",
"\n",
"# Imports\n",
"import torch\n",
@@ -108,7 +108,7 @@
"outputs": [],
"source": [
"# Loading the predefined dataset\n",
"dataset = load_dataset(\"nerualbioinfo/bacterial_promoters\")\n",
"dataset = load_dataset(\"neuralbioinfo/bacterial_promoters\")\n",
"\n",
"train_set = dataset[\"train\"]\n",
"test_sigma70_set = dataset[\"test_sigma70\"]\n",
@@ -143,7 +143,7 @@
"metadata": {},
"outputs": [],
"source": [
"model_name_path = 'nerualbioinfo/prokbert-mini-k6s1'\n",
"model_name_path = 'neuralbioinfo/prokbert-mini'\n",
"\n",
"\n",
"pretrained_model, tokenizer = get_default_pretrained_model_parameters(\n",
@@ -234,19 +234,6 @@
"\n",
"| Section | Parameter | Description | Type | Default |\n",
"|---------|-----------|-------------|------|---------|\n",
"| **data_collator** | | | | |\n",
"| | `mask_to_left` | The number of tokens to be masked to the left of the original mask tokens to avoid data leakage. | integer | 3 |\n",
"| | `mask_to_right` | The number of tokens to be masked to the RIGHT of the original mask tokens to avoid data leakage. | integer | 2 |\n",
"| | `mlm_probability` | The probability of defining a task on a given token. | float | 0.05 |\n",
"| | `replace_prob` | 1- The probability of restoring a masked token. Others will be changed or restored. | float | 0.8 |\n",
"| | `random_prob` | The probability of replacing a token with a random token. It introduces some random errors to avoid overfitting. | float | 0.01 \n",
"| **dataset** | | | | |\n",
"| | `dataset_path` | Path to the dataset. It triggers an error if empty. | string | '' |\n",
"| | `pretraining_dataset_data` | The raw dataset data. | list | [[]] |\n",
"| | `dataset_class` | The class of the dataset to be used. | string | 'IterableProkBERTPretrainingDataset' |\n",
"| | `input_batch_size` | Batch size to be loaded into memory from the disk for HDF datasets. | int | 10000 |\n",
"| | `dataset_iteration_batch_offset` | The offset value for dataset iteration start. | int | 0 |\n",
"| | `max_iteration_over_dataset` | Maximum times to iterate over a dataset. | int | 10 |\n",
"| **training** | | | | |\n",
"| | `output_dir` | Output directory for training artifacts. | string | './train_output' |\n",
"| | `num_train_epochs` | Total number of training epochs. | float | 1 |\n",
@@ -264,7 +251,14 @@
"| | `adam_beta2` | Beta2 hyperparameter for the Adam optimizer. | float | 0.98 |\n",
"| | `gradient_accumulation_steps` | Number of steps to accumulate gradients before updating weights. | integer | 1 |\n",
"| | `optim` | Optimizer to use for training. | string | \"adamw_torch\" |\n",
"| | `ignore_data_skip` | Whether to ignore data skip or not. | boolean | True |\n"
"| | `ignore_data_skip` | Whether to ignore data skip or not. | boolean | True |\n",
"| **dataset** | | | | |\n",
"| | `dataset_path` | Path to the dataset. It triggers an error if empty. | string | '' |\n",
"| | `pretraining_dataset_data` | The raw dataset data. | list | [[]] |\n",
"| | `dataset_class` | The class of the dataset to be used. | string | 'IterableProkBERTPretrainingDataset' |\n",
"| | `input_batch_size` | Batch size to be loaded into memory from the disk for HDF datasets. | int | 10000 |\n",
"| | `dataset_iteration_batch_offset` | The offset value for dataset iteration start. | int | 0 |\n",
"| | `max_iteration_over_dataset` | Maximum times to iterate over a dataset. | int | 10 |"
]
},
{
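The table above documents the training and dataset parameter groups that `ProkBERTConfig` accepts. Below is a minimal sketch of how a few of the training entries can be wired through the config and on to the Hugging Face `TrainingArguments`, mirroring what `examples/finetuning.py` (next file in this diff) does; the values are illustrative, not recommended settings, and unspecified fields fall back to the packaged YAML defaults.

```python
import torch
from transformers import TrainingArguments
from prokbert.config_utils import ProkBERTConfig

# Group a few of the documented training parameters; anything not listed here
# keeps its YAML default (e.g. output_dir='./train_output', num_train_epochs=1).
prokbert_config = ProkBERTConfig()
_ = prokbert_config.get_and_set_pretraining_parameters({
    'output_dir': './finetune_output',     # where checkpoints/artifacts go
    'num_train_epochs': 1,                 # total number of training epochs
    'gradient_accumulation_steps': 1,      # steps accumulated before a weight update
    'optim': 'adamw_torch',                # optimizer handed to the Trainer
})
prokbert_config.default_torchtype = torch.long

# The resolved parameter group can be passed straight to TrainingArguments,
# exactly as finetuning.py does further down in this diff.
training_args = TrainingArguments(**prokbert_config.pretraining_params)
print(training_args.num_train_epochs)
```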
158 changes: 158 additions & 0 deletions examples/finetuning.py
@@ -0,0 +1,158 @@
import os
import sys
import argparse
import pathlib
import random
import re
from os.path import join

import numpy as np
import pkg_resources
import torch
import yaml
from datasets import load_dataset
from transformers import MegatronBertModel, MegatronBertConfig, MegatronBertForMaskedLM
from transformers import TrainingArguments, Trainer

from prokbert.prokbert_tokenizer import ProkBERTTokenizer
from prokbert.sequtils import *
from prokbert.config_utils import SeqConfig, ProkBERTConfig, get_user_provided_args
from prokbert.training_utils import *
from prokbert.training_utils import (
    get_training_tokenizer,
    get_data_collator_for_overlapping_sequences,
    get_default_pretrained_model_parameters,
    get_torch_data_from_segmentdb_classification,
    compute_metrics_eval_prediction,
)
from prokbert.prok_datasets import ProkBERTPretrainingHDFDataset, ProkBERTTrainingDatasetPT
from prokbert.models import BertForBinaryClassificationWithPooling


# Fixed seed for reproducible fine-tuning runs
seed = 851115

torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)

def prepare_input_arguments():
"""
Prepare and validate input arguments for ProkBERT pretraining.
Parses command-line arguments and sets the configuration for the pretraining process.
Returns:
ProkBERTConfig: Configuration object for ProkBERT pretraining.
"""
prokbert_config = ProkBERTConfig()
keyset = ['finetuning', 'model', 'dataset', 'pretraining']
parser, cmd_argument2group_param, group2param2cmdarg = prokbert_config.get_cmd_arg_parser(keyset)
args = parser.parse_args()
user_provided_args = get_user_provided_args(args, parser)
input_args2check = list(set(user_provided_args.keys()) - {'help'})
parameter_group_names = list(prokbert_config.parameters.keys())
# Initialization of the input parameterset
parameters = {k: {} for k in parameter_group_names}
for provided_input_argument in input_args2check:
#print(f'Setting: {provided_input_argument}')
param_group, param_name = cmd_argument2group_param[provided_input_argument]
#print(f'It belongs to group: {param_group}. Maps to the parameter: {param_name}')
act_value = getattr(args, provided_input_argument)
parameters[param_group][param_name]=act_value
prokbert_config = ProkBERTConfig()

print(parameters.keys())


_ = prokbert_config.get_and_set_model_parameters(parameters['model'])
_ = prokbert_config.get_and_set_dataset_parameters(parameters['dataset'])
_ = prokbert_config.get_and_set_pretraining_parameters(parameters['pretraining'])
_ = prokbert_config.get_and_set_tokenization_parameters(parameters['tokenization'])
_ = prokbert_config.get_and_set_segmentation_parameters(parameters['segmentation'])
_ = prokbert_config.get_and_set_computation_params(parameters['computation'])
_ = prokbert_config.get_and_set_datacollator_parameters(parameters['data_collator'])
_ = prokbert_config.get_and_set_finetuning_parameters(parameters['finetuning'])

prokbert_config.default_torchtype = torch.long
#print(user_provided_args)

return prokbert_config

def main(prokbert_config):
"""
Main function to run the ProkBERT pretraining pipeline.
Initializes tokenizer, data collator, dataset, and model, and then starts the pretraining process.
Args:
prokbert_config (ProkBERTConfig): Configuration object containing all necessary parameters for pretraining.
"""
check_nvidia_gpu()
print(prokbert_config.finetuning_params)


model_name_path = prokbert_config.model_params['model_name']
print(model_name_path)
pretrained_model, tokenizer = get_default_pretrained_model_parameters(
model_name=model_name_path,
model_class='MegatronBertModel',
output_hidden_states=False,
output_attentions=False,
move_to_gpu=False
)
fine_tuned_model = BertForBinaryClassificationWithPooling(pretrained_model)

# Loading the predefined dataset
dataset = load_dataset("neuralbioinfo/bacterial_promoters")

train_set = dataset["train"]
test_sigma70_set = dataset["test_sigma70"]
multispecies_set = dataset["test_multispecies"]

train_db = train_set.to_pandas()
test_sigma70_db = test_sigma70_set.to_pandas()
test_ms_db = multispecies_set.to_pandas()


## Creating datasets!
print(f'Processing train database!')
[X_train, y_train, torchdb_train] = get_torch_data_from_segmentdb_classification(tokenizer, train_db)
print(f'Processing test database!')
[X_test, y_test, torchdb_test] = get_torch_data_from_segmentdb_classification(tokenizer, test_ms_db)
print(f'Processing validation database!')
[X_val, y_val, torchdb_val] = get_torch_data_from_segmentdb_classification(tokenizer, test_sigma70_db)
train_ds = ProkBERTTrainingDatasetPT(X_train, y_train, AddAttentionMask=True)
test_ds = ProkBERTTrainingDatasetPT(X_test, y_test, AddAttentionMask=True)
val_ds = ProkBERTTrainingDatasetPT(X_val, y_val, AddAttentionMask=True)

final_model_output = join(prokbert_config.model_params['model_outputpath'], prokbert_config.model_params['model_name'])

training_args = TrainingArguments(**prokbert_config.pretraining_params)
trainer = Trainer(
model=fine_tuned_model,
args=training_args,
train_dataset=train_ds,
eval_dataset = val_ds,
compute_metrics=compute_metrics_eval_prediction,
)
trainer.train()
# Saving the final model
print(f'Saving the model to: {final_model_output}')
fine_tuned_model.save_pretrained(final_model_output)



#print(input_args)

if __name__ == "__main__":
print(f'Parsing')

prokbert_config = prepare_input_arguments()
main(prokbert_config)
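The script trains with `val_ds` as the in-training evaluation set and then saves the classifier, but it never scores the held-out test sets explicitly. Below is a small sketch of how that could be done afterwards with the standard `Trainer` API the script already uses; the helper name `evaluate_finetuned_model` is ours and not part of the repository.

```python
def evaluate_finetuned_model(trainer, eval_datasets):
    """Run trainer.evaluate on each named dataset and collect the metric dicts."""
    results = {}
    for name, ds in eval_datasets.items():
        # compute_metrics_eval_prediction is already attached to the trainer,
        # so each dict contains the same metrics reported during training.
        results[name] = trainer.evaluate(eval_dataset=ds)
    return results

# Example call, assuming the objects created inside main() are in scope:
# metrics = evaluate_finetuned_model(trainer, {'sigma70': val_ds, 'multispecies': test_ds})
# print(metrics)
```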

21 changes: 17 additions & 4 deletions src/prokbert/config_utils.py
@@ -518,6 +518,7 @@ def __init__(self):
self.model_params = self.get_set_parameters('model')
self.dataset_params = self.get_set_parameters('dataset')
self.pretraining_params = self.get_set_parameters('pretraining')
self.finetuning_params = self.get_set_parameters('finetuning')
# Getting the sequtils params as well

self.def_seq_config = SeqConfig()
@@ -572,10 +573,10 @@ def get_set_parameters(self, parameter_class: str, parameters: dict = {}) -> dict:


for param, param_value in parameters.items():
if param not in class_params and parameter_class!='pretraining':
if param not in class_params and (parameter_class!='pretraining'):
raise ValueError(f"The provided {param} is an INVALID {parameter_class} parameter! The valid parameters are: {list(class_params.keys())}")
else:
if parameter_class == 'pretraining':
if parameter_class == 'pretraining' or parameter_class == 'finetuning' :
if param in self.hf_training_args_dict or param in class_params:
if param in class_params:
self.validate(parameter_class, param, param_value)
@@ -628,8 +629,17 @@ def get_and_set_computation_params(self, parameters: dict = {}) -> dict:
self.computation_params = self.def_seq_config.get_and_set_computational_parameters(parameters)
return self.computation_params

def get_and_set_finetuning_parameters(self, parameters: dict = {}) -> dict:
""" Setting the finetuning parameters """

def get_cmd_arg_parser(self) -> tuple[argparse.ArgumentParser, dict, dict]:
# Here we include the additional training arguments available for the trainer

self.finetuning_params = self.get_set_parameters('finetuning', parameters)

return self.finetuning_params


def get_cmd_arg_parser(self, keyset=[]) -> tuple[argparse.ArgumentParser, dict, dict]:
"""
Create and return a command-line argument parser for ProkBERT configurations, along with mappings
between command-line arguments and configuration parameters.
@@ -648,8 +658,11 @@ def get_cmd_arg_parser(self) -> tuple[argparse.ArgumentParser, dict, dict]:
Note: The method assumes that the configuration parameters for training and sequence configuration
are available within the class.
"""
if len(keyset) ==0:
trainin_conf_keysets = ['data_collator', 'model', 'dataset', 'pretraining', 'finetuning']
else:
trainin_conf_keysets = keyset

trainin_conf_keysets = ['data_collator', 'model', 'dataset', 'pretraining']
seq_config = deepcopy(self.def_seq_config.parameters)
default_other_config = deepcopy(self.parameters)
combined_params = {}
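The config_utils.py changes above add a `keyset` argument to `get_cmd_arg_parser` and a `get_and_set_finetuning_parameters` helper that validates the finetuning group the same way as the pretraining group, so Hugging Face `TrainingArguments` names are accepted there as well. A short sketch of both, with purely illustrative values; the `ftmodel` and `modelclass` keys come from the updated `pretraining.yaml` in the next file.

```python
from prokbert.config_utils import ProkBERTConfig

config = ProkBERTConfig()

# Only expose the listed parameter groups as command-line arguments (new keyset argument).
parser, arg2group_param, group2param2arg = config.get_cmd_arg_parser(
    ['finetuning', 'model', 'dataset', 'pretraining']
)

# Register fine-tuning parameters defined in configs/pretraining.yaml;
# the values below are illustrative placeholders, not defaults.
finetuning_params = config.get_and_set_finetuning_parameters({
    'ftmodel': 'neuralbioinfo/prokbert-mini',   # illustrative model name
    'modelclass': 'MegatronBertModel',          # illustrative model class
})
print(finetuning_params)
```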
13 changes: 10 additions & 3 deletions src/prokbert/configs/pretraining.yaml
@@ -41,7 +41,7 @@ model:
model_name:
default: 'mini'
type: "string"
description: "Name of the ProkBERT model."
description: "Name of the pretrained ProkBERT model."
model_outputpath:
default: '/scratch/fastscratch/NBL/trained_models/test'
type: "string"
@@ -193,7 +193,7 @@ tokenization:
# For the full definition, please see the documentation of the sequtils parameters
computation:
numpy_token_integer_prec_byte: 2


finetuning:
ftmodel:
default: ""
type: "string"
description: "Model name for the finetuning"
modelclass:
default: ""
type: "string"
description: "Modell class to perform the analysis weights."


70 changes: 49 additions & 21 deletions src/prokbert/training_utils.py
@@ -392,32 +392,60 @@ def get_torch_data_from_segmentdb_classification(tokenizer, segmentdb, L=None):

return X, y, torchdb

def get_default_pretrained_model_parameters(model_name, model_class, output_hidden_states=False,
output_attentions=False,
move_to_gpu=True):
def get_default_pretrained_model_parameters(model_name: str, model_class: str, output_hidden_states: bool = False,
output_attentions: bool = False, move_to_gpu: bool = True):
"""
Loading a default pretrained model with the corresponding tokenier and segmenation data.
Model name should be a valid model stored locally and should be registered in our database.
model_class: should be a valid transformer class in which the parameters will be loaded.
return: the loaded model to GPU or cpu and a valid tokenizer and it's default parameters, requeired for tokenization and prosseing input data
Load a default pretrained model along with the corresponding tokenizer based on the model name.
:param model_name: The name of the model to load. Should be a valid model stored locally or registered in the database.
Can be provided with or without the 'neuralbioinfo/' prefix.
:type model_name: str
:param model_class: The class of the transformer model into which the parameters will be loaded.
:type model_class: str
:param output_hidden_states: Whether to output hidden states.
:type output_hidden_states: bool
:param output_attentions: Whether to output attentions.
:type output_attentions: bool
:param move_to_gpu: Whether to move the model to GPU if available.
:type move_to_gpu: bool
:return: The loaded model (moved to GPU or CPU as specified) and the tokenizer with its default parameters.
:rtype: tuple
Raises:
ValueError: If the model name does not match the expected pattern and is not found in predefined exceptions.
"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
match = re.search(r'k(\d+)s(\d+)', model_name)
if not match:
raise ValueError("Model name does not match the expected pattern.")

kmer, shift = map(int, match.groups())
tokenization_params= {
'kmer': kmer,
'shift': shift
}
tokenizer = ProkBERTTokenizer(tokenization_params=tokenization_params,
operation_space='sequence')
# Normalize the model name by removing the 'neuralbioinfo/' prefix if present
normalized_model_name = model_name.replace('neuralbioinfo/', '')

print(f'normalized_model_name: {normalized_model_name}, model_name: {model_name}')
# Predefined exceptions for model names and their tokenization parameters
model_tokenization_params = {
'prokbert-mini': {'kmer': 6, 'shift': 1},
'prokbert-mini-long': {'kmer': 6, 'shift': 2},
'prokbert-mini-c': {'kmer': 1, 'shift': 1},
}

# Check for predefined exceptions first
if normalized_model_name in model_tokenization_params:
tokenization_params = model_tokenization_params[normalized_model_name]
else:
# If not found, try to parse using regex
match = re.search(r'k(\d+)s(\d+)', normalized_model_name)
if match:
kmer, shift = map(int, match.groups())
tokenization_params = {'kmer': kmer, 'shift': shift}
else:
raise ValueError(f"Model name '{model_name}' does not match the expected pattern and is not a predefined exception.")

tokenizer = ProkBERTTokenizer(tokenization_params=tokenization_params, operation_space='sequence')
model = load_pretrained_model(
model_path=model_name,
model_class=model_class, # Example model class
device=device, # Use 'cpu' if you are not using a GPU
output_hidden_states=output_hidden_states,
model_path=model_name, # Use original model_name here to preserve 'neuralbioinfo/' if it was included
model_class=model_class,
device=device,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
move_to_gpu=move_to_gpu
)
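The updated loader above first consults a small table of predefined model names and only falls back to parsing a `k<K>s<S>` suffix when the name is not listed. A standalone sketch of that resolution logic, extracted here for illustration; `prokbert-large-k6s2` is a made-up name that merely demonstrates the suffix pattern, not a published checkpoint.

```python
import re

# Mirrors the lookup-then-regex logic in get_default_pretrained_model_parameters.
MODEL_TOKENIZATION_PARAMS = {
    'prokbert-mini': {'kmer': 6, 'shift': 1},
    'prokbert-mini-long': {'kmer': 6, 'shift': 2},
    'prokbert-mini-c': {'kmer': 1, 'shift': 1},
}

def resolve_tokenization_params(model_name: str) -> dict:
    """Return the k-mer/shift settings implied by a ProkBERT model name."""
    name = model_name.replace('neuralbioinfo/', '')   # drop the Hub namespace
    if name in MODEL_TOKENIZATION_PARAMS:
        return MODEL_TOKENIZATION_PARAMS[name]
    match = re.search(r'k(\d+)s(\d+)', name)          # e.g. '...-k6s2'
    if match:
        kmer, shift = map(int, match.groups())
        return {'kmer': kmer, 'shift': shift}
    raise ValueError(f"Model name '{model_name}' does not match the expected pattern.")

print(resolve_tokenization_params('neuralbioinfo/prokbert-mini'))  # {'kmer': 6, 'shift': 1}
print(resolve_tokenization_params('prokbert-large-k6s2'))          # {'kmer': 6, 'shift': 2}
```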
