diff --git a/examples/Finetuning.ipynb b/examples/Finetuning.ipynb
index 9dcf6d6..bd67915 100644
--- a/examples/Finetuning.ipynb
+++ b/examples/Finetuning.ipynb
@@ -36,7 +36,7 @@
    "outputs": [],
    "source": [
     "# ProkBERT\n",
-    "!pip install git+https://github.com/nbrg-ppcu/prokbert\n",
+    "!pip install prokbert\n",
     "\n",
     "# Imports\n",
     "import torch\n",
@@ -108,7 +108,7 @@
    "outputs": [],
    "source": [
     "# Loading the predefined dataset\n",
-    "dataset = load_dataset(\"nerualbioinfo/bacterial_promoters\")\n",
+    "dataset = load_dataset(\"neuralbioinfo/bacterial_promoters\")\n",
     "\n",
     "train_set = dataset[\"train\"]\n",
     "test_sigma70_set = dataset[\"test_sigma70\"]\n",
@@ -143,7 +143,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model_name_path = 'nerualbioinfo/prokbert-mini-k6s1'\n",
+    "model_name_path = 'neuralbioinfo/prokbert-mini'\n",
     "\n",
     "\n",
     "pretrained_model, tokenizer = get_default_pretrained_model_parameters(\n",
@@ -234,19 +234,6 @@
     "\n",
     "| Section | Parameter | Description | Type | Default |\n",
     "|---------|-----------|-------------|------|---------|\n",
-    "| **data_collator** | | | | |\n",
-    "| | `mask_to_left` | The number of tokens to be masked to the left of the original mask tokens to avoid data leakage. | integer | 3 |\n",
-    "| | `mask_to_right` | The number of tokens to be masked to the RIGHT of the original mask tokens to avoid data leakage. | integer | 2 |\n",
-    "| | `mlm_probability` | The probability of defining a task on a given token. | float | 0.05 |\n",
-    "| | `replace_prob` | 1- The probability of restoring a masked token. Others will be changed or restored. | float | 0.8 |\n",
-    "| | `random_prob` | The probability of replacing a token with a random token. It introduces some random errors to avoid overfitting. | float | 0.01 \n",
-    "| **dataset** | | | | |\n",
-    "| | `dataset_path` | Path to the dataset. It triggers an error if empty. | string | '' |\n",
-    "| | `pretraining_dataset_data` | The raw dataset data. | list | [[]] |\n",
-    "| | `dataset_class` | The class of the dataset to be used. | string | 'IterableProkBERTPretrainingDataset' |\n",
-    "| | `input_batch_size` | Batch size to be loaded into memory from the disk for HDF datasets. | int | 10000 |\n",
-    "| | `dataset_iteration_batch_offset` | The offset value for dataset iteration start. | int | 0 |\n",
-    "| | `max_iteration_over_dataset` | Maximum times to iterate over a dataset. | int | 10 |\n",
     "| **training** | | | | |\n",
     "| | `output_dir` | Output directory for training artifacts. | string | './train_output' |\n",
     "| | `num_train_epochs` | Total number of training epochs. | float | 1 |\n",
@@ -264,7 +251,14 @@
     "| | `adam_beta2` | Beta2 hyperparameter for the Adam optimizer. | float | 0.98 |\n",
     "| | `gradient_accumulation_steps` | Number of steps to accumulate gradients before updating weights. | integer | 1 |\n",
     "| | `optim` | Optimizer to use for training. | string | \"adamw_torch\" |\n",
-    "| | `ignore_data_skip` | Whether to ignore data skip or not. | boolean | True |\n"
+    "| | `ignore_data_skip` | Whether to ignore data skip or not. | boolean | True |\n",
+    "| **dataset** | | | | |\n",
+    "| | `dataset_path` | Path to the dataset. It triggers an error if empty. | string | '' |\n",
+    "| | `pretraining_dataset_data` | The raw dataset data. | list | [[]] |\n",
+    "| | `dataset_class` | The class of the dataset to be used. | string | 'IterableProkBERTPretrainingDataset' |\n",
+    "| | `input_batch_size` | Batch size to be loaded into memory from the disk for HDF datasets. | int | 10000 |\n",
+    "| | `dataset_iteration_batch_offset` | The offset value for dataset iteration start. | int | 0 |\n",
+    "| | `max_iteration_over_dataset` | Maximum times to iterate over a dataset. | int | 10 |"
    ]
   },
   {
diff --git a/examples/finetuning.py b/examples/finetuning.py
new file mode 100644
index 0000000..622d28f
--- /dev/null
+++ b/examples/finetuning.py
@@ -0,0 +1,158 @@
+import yaml
+import pathlib
+from os.path import join
+import os
+import sys
+import argparse
+import re
+
+from prokbert.prokbert_tokenizer import ProkBERTTokenizer
+from transformers import MegatronBertModel, MegatronBertConfig, MegatronBertForMaskedLM
+
+import pkg_resources
+import random
+import numpy as np
+import torch
+
+from prokbert.sequtils import *
+from prokbert.config_utils import SeqConfig
+from prokbert.training_utils import get_training_tokenizer, get_data_collator_for_overlapping_sequences
+from prokbert.prok_datasets import ProkBERTPretrainingHDFDataset
+# Creating the model from scratch
+from prokbert.config_utils import ProkBERTConfig, get_user_provided_args
+from prokbert.training_utils import *
+from transformers import TrainingArguments, Trainer
+from datasets import load_dataset
+from prokbert.training_utils import get_default_pretrained_model_parameters, get_torch_data_from_segmentdb_classification
+from prokbert.models import BertForBinaryClassificationWithPooling
+from prokbert.prok_datasets import ProkBERTTrainingDatasetPT
+from prokbert.training_utils import compute_metrics_eval_prediction
+
+
+seed=851115
+
+torch.manual_seed(seed)
+torch.cuda.manual_seed_all(seed)
+random.seed(seed)
+np.random.seed(seed)
+
+
+def prepare_input_arguments():
+    """
+    Prepare and validate input arguments for ProkBERT finetuning.
+
+    Parses command-line arguments and sets the configuration for the finetuning process.
+
+    Returns:
+        ProkBERTConfig: Configuration object for ProkBERT finetuning.
+    """
+    prokbert_config = ProkBERTConfig()
+    keyset = ['finetuning', 'model', 'dataset', 'pretraining']
+    parser, cmd_argument2group_param, group2param2cmdarg = prokbert_config.get_cmd_arg_parser(keyset)
+    args = parser.parse_args()
+    user_provided_args = get_user_provided_args(args, parser)
+    input_args2check = list(set(user_provided_args.keys()) - {'help'})
+    parameter_group_names = list(prokbert_config.parameters.keys())
+    # Initialization of the input parameter set
+    parameters = {k: {} for k in parameter_group_names}
+    for provided_input_argument in input_args2check:
+        #print(f'Setting: {provided_input_argument}')
+        param_group, param_name = cmd_argument2group_param[provided_input_argument]
+        #print(f'It belongs to group: {param_group}. Maps to the parameter: {param_name}')
+        act_value = getattr(args, provided_input_argument)
+        parameters[param_group][param_name] = act_value
+    prokbert_config = ProkBERTConfig()
+
+    print(parameters.keys())
+
+    _ = prokbert_config.get_and_set_model_parameters(parameters['model'])
+    _ = prokbert_config.get_and_set_dataset_parameters(parameters['dataset'])
+    _ = prokbert_config.get_and_set_pretraining_parameters(parameters['pretraining'])
+    _ = prokbert_config.get_and_set_tokenization_parameters(parameters['tokenization'])
+    _ = prokbert_config.get_and_set_segmentation_parameters(parameters['segmentation'])
+    _ = prokbert_config.get_and_set_computation_params(parameters['computation'])
+    _ = prokbert_config.get_and_set_datacollator_parameters(parameters['data_collator'])
+    _ = prokbert_config.get_and_set_finetuning_parameters(parameters['finetuning'])
+
+    prokbert_config.default_torchtype = torch.long
+    #print(user_provided_args)
+
+    return prokbert_config
+
+def main(prokbert_config):
+    """
+    Main function to run the ProkBERT finetuning pipeline.
+
+    Initializes the tokenizer, datasets, and model, and then starts the finetuning process.
+
+    Args:
+        prokbert_config (ProkBERTConfig): Configuration object containing all necessary parameters for finetuning.
+    """
+    check_nvidia_gpu()
+    print(prokbert_config.finetuning_params)
+
+    model_name_path = prokbert_config.model_params['model_name']
+    print(model_name_path)
+    pretrained_model, tokenizer = get_default_pretrained_model_parameters(
+        model_name=model_name_path,
+        model_class='MegatronBertModel',
+        output_hidden_states=False,
+        output_attentions=False,
+        move_to_gpu=False
+    )
+    fine_tuned_model = BertForBinaryClassificationWithPooling(pretrained_model)
+
+    # Loading the predefined dataset
+    dataset = load_dataset("neuralbioinfo/bacterial_promoters")
+
+    train_set = dataset["train"]
+    test_sigma70_set = dataset["test_sigma70"]
+    multispecies_set = dataset["test_multispecies"]
+
+    train_db = train_set.to_pandas()
+    test_sigma70_db = test_sigma70_set.to_pandas()
+    test_ms_db = multispecies_set.to_pandas()
+
+
+    ## Creating datasets!
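+    # Each dataframe is tokenized into model inputs (X), labels (y) and a bookkeeping table (torchdb),
+    # then wrapped in a ProkBERTTrainingDatasetPT. The multispecies split serves as the held-out test
+    # set and the sigma70 split as the validation set.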
+    print(f'Processing train database!')
+    [X_train, y_train, torchdb_train] = get_torch_data_from_segmentdb_classification(tokenizer, train_db)
+    print(f'Processing test database!')
+    [X_test, y_test, torchdb_test] = get_torch_data_from_segmentdb_classification(tokenizer, test_ms_db)
+    print(f'Processing validation database!')
+    [X_val, y_val, torchdb_val] = get_torch_data_from_segmentdb_classification(tokenizer, test_sigma70_db)
+    train_ds = ProkBERTTrainingDatasetPT(X_train, y_train, AddAttentionMask=True)
+    test_ds = ProkBERTTrainingDatasetPT(X_test, y_test, AddAttentionMask=True)
+    val_ds = ProkBERTTrainingDatasetPT(X_val, y_val, AddAttentionMask=True)
+
+    final_model_output = join(prokbert_config.model_params['model_outputpath'], prokbert_config.model_params['model_name'])
+
+    training_args = TrainingArguments(**prokbert_config.pretraining_params)
+    trainer = Trainer(
+        model=fine_tuned_model,
+        args=training_args,
+        train_dataset=train_ds,
+        eval_dataset=val_ds,
+        compute_metrics=compute_metrics_eval_prediction,
+    )
+    trainer.train()
+    # Saving the final model
+    print(f'Saving the model to: {final_model_output}')
+    fine_tuned_model.save_pretrained(final_model_output)
+
+
+if __name__ == "__main__":
+    print('Parsing command-line arguments')
+
+    prokbert_config = prepare_input_arguments()
+    main(prokbert_config)
+
diff --git a/src/prokbert/config_utils.py b/src/prokbert/config_utils.py
index 4210810..f20b5f8 100644
--- a/src/prokbert/config_utils.py
+++ b/src/prokbert/config_utils.py
@@ -518,6 +518,7 @@ def __init__(self):
         self.model_params = self.get_set_parameters('model')
         self.dataset_params = self.get_set_parameters('dataset')
         self.pretraining_params = self.get_set_parameters('pretraining')
+        self.finetuning_params = self.get_set_parameters('finetuning')
 
         # Getting the sequtils params as well
         self.def_seq_config = SeqConfig()
@@ -572,10 +573,10 @@ def get_set_parameters(self, parameter_class: str, parameters: dict = {}) -> dic
 
 
         for param, param_value in parameters.items():
-            if param not in class_params and parameter_class!='pretraining':
+            if param not in class_params and parameter_class not in ('pretraining', 'finetuning'):
                 raise ValueError(f"The provided {param} is an INVALID {parameter_class} parameter! The valid parameters are: {list(class_params.keys())}")
             else:
-                if parameter_class == 'pretraining':
+                if parameter_class == 'pretraining' or parameter_class == 'finetuning':
                     if param in self.hf_training_args_dict or param in class_params:
                         if param in class_params:
                             self.validate(parameter_class, param, param_value)
@@ -628,8 +629,17 @@ def get_and_set_computation_params(self, parameters: dict = {}) -> dict:
         self.computation_params = self.def_seq_config.get_and_set_computational_parameters(parameters)
         return self.computation_params
 
+    def get_and_set_finetuning_parameters(self, parameters: dict = {}) -> dict:
+        """ Setting the finetuning parameters """
 
-    def get_cmd_arg_parser(self) -> tuple[argparse.ArgumentParser, dict, dict]:
+        # Here we include the additional training arguments available for the trainer
+
+        self.finetuning_params = self.get_set_parameters('finetuning', parameters)
+
+        return self.finetuning_params
+
+
+    def get_cmd_arg_parser(self, keyset=[]) -> tuple[argparse.ArgumentParser, dict, dict]:
         """
         Create and return a command-line argument parser for ProkBERT configurations, along with mappings
         between command-line arguments and configuration parameters.
@@ -648,8 +658,11 @@ def get_cmd_arg_parser(self) -> tuple[argparse.ArgumentParser, dict, dict]:
         Note:
             The method assumes that the configuration parameters for training and sequence configuration are available within the class.
         """
+        if len(keyset) == 0:
+            trainin_conf_keysets = ['data_collator', 'model', 'dataset', 'pretraining', 'finetuning']
+        else:
+            trainin_conf_keysets = keyset
 
-        trainin_conf_keysets = ['data_collator', 'model', 'dataset', 'pretraining']
         seq_config = deepcopy(self.def_seq_config.parameters)
         default_other_config = deepcopy(self.parameters)
         combined_params = {}
diff --git a/src/prokbert/configs/pretraining.yaml b/src/prokbert/configs/pretraining.yaml
index 0184143..6fb48e1 100644
--- a/src/prokbert/configs/pretraining.yaml
+++ b/src/prokbert/configs/pretraining.yaml
@@ -41,7 +41,7 @@ model:
   model_name:
     default: 'mini'
     type: "string"
-    description: "Name of the ProkBERT model."
+    description: "Name of the pretrained ProkBERT model."
   model_outputpath:
     default: '/scratch/fastscratch/NBL/trained_models/test'
     type: "string"
@@ -193,7 +193,14 @@ tokenization:
 
 
 # For full definiation, please see the documentation of the sequitls parameters
 computation:
   numpy_token_integer_prec_byte: 2
-
-
+finetuning:
+  ftmodel:
+    default: ""
+    type: "string"
+    description: "Name of the model to be finetuned."
+  modelclass:
+    default: ""
+    type: "string"
+    description: "Transformer model class into which the pretrained weights are loaded."
\ No newline at end of file
diff --git a/src/prokbert/training_utils.py b/src/prokbert/training_utils.py
index 7b5b687..9fc8470 100644
--- a/src/prokbert/training_utils.py
+++ b/src/prokbert/training_utils.py
@@ -392,32 +392,60 @@ def get_torch_data_from_segmentdb_classification(tokenizer, segmentdb, L=None):
 
     return X, y, torchdb
 
-def get_default_pretrained_model_parameters(model_name, model_class, output_hidden_states=False,
-                                            output_attentions=False,
-                                            move_to_gpu=True):
+def get_default_pretrained_model_parameters(model_name: str, model_class: str, output_hidden_states: bool = False,
+                                            output_attentions: bool = False, move_to_gpu: bool = True):
     """
-    Loading a default pretrained model with the corresponding tokenier and segmenation data.
-    Model name should be a valid model stored locally and should be registered in our database.
-    model_class: should be a valid transformer class in which the parameters will be loaded.
-    return: the loaded model to GPU or cpu and a valid tokenizer and it's default parameters, requeired for tokenization and prosseing input data
+    Load a default pretrained model along with the corresponding tokenizer based on the model name.
+
+    :param model_name: The name of the model to load. Should be a valid model stored locally or registered in the database.
+                       Can be provided with or without the 'neuralbioinfo/' prefix.
+    :type model_name: str
+    :param model_class: The class of the transformer model into which the parameters will be loaded.
+    :type model_class: str
+    :param output_hidden_states: Whether to output hidden states.
+    :type output_hidden_states: bool
+    :param output_attentions: Whether to output attentions.
+    :type output_attentions: bool
+    :param move_to_gpu: Whether to move the model to GPU if available.
+    :type move_to_gpu: bool
+    :return: The loaded model (moved to GPU or CPU as specified) and the tokenizer with its default parameters.
+    :rtype: tuple
+
+    :raises ValueError: If the model name does not match the expected pattern and is not found in the predefined exceptions.
""" device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - match = re.search(r'k(\d+)s(\d+)', model_name) - if not match: - raise ValueError("Model name does not match the expected pattern.") - kmer, shift = map(int, match.groups()) - tokenization_params= { - 'kmer': kmer, - 'shift': shift - } - tokenizer = ProkBERTTokenizer(tokenization_params=tokenization_params, - operation_space='sequence') + # Normalize the model name by removing the 'neuralbioinfo/' prefix if present + normalized_model_name = model_name.replace('neuralbioinfo/', '') + + print(f'normalized_model_name: {normalized_model_name}, model name_ {model_name}') + # Predefined exceptions for model names and their tokenization parameters + model_tokenization_params = { + 'prokbert-mini': {'kmer': 6, 'shift': 1}, + 'prokbert-mini-long': {'kmer': 6, 'shift': 2}, + 'prokbert-mini-c': {'kmer': 1, 'shift': 1}, + } + + # Check for predefined exceptions first + if normalized_model_name in model_tokenization_params: + tokenization_params = model_tokenization_params[normalized_model_name] + else: + # If not found, try to parse using regex + match = re.search(r'k(\d+)s(\d+)', normalized_model_name) + if match: + kmer, shift = map(int, match.groups()) + tokenization_params = {'kmer': kmer, 'shift': shift} + else: + print('fdsgfdgfgfggfgfgf') + raise ValueError(f"Model name '{model_name}' does not match the expected pattern and is not a predefined exception.") + + tokenizer = ProkBERTTokenizer(tokenization_params=tokenization_params, operation_space='sequence') model = load_pretrained_model( - model_path=model_name, - model_class=model_class, # Example model class - device=device, # Use 'cpu' if you are not using a GPU - output_hidden_states=output_hidden_states, + model_path=model_name, # Use original model_name here to preserve 'neuralbioinfo/' if it was included + model_class=model_class, + device=device, + output_hidden_states=output_hidden_states, output_attentions=output_attentions, move_to_gpu=move_to_gpu )