Skip to content

Commit

Permalink
[Enhancements]: Re-arch inputs and tsv generations and few more updat…
Browse files Browse the repository at this point in the history
…es (#3876)

* More component updates

* Updating training component for MI2

---------

Co-authored-by: Matthias Blondeel <mablonde@microsoft.com>
  • Loading branch information
2 people authored and yeshsurya committed Feb 27, 2025
1 parent 0a15d6f commit ddc431e
Show file tree
Hide file tree
Showing 8 changed files with 473 additions and 168 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: medimgage_adapter_finetune
version: 0.0.4
version: 0.0.7
type: command

is_deterministic: True
Expand Down Expand Up @@ -29,6 +29,18 @@ inputs:
description: Path to the validation data file.
mode: ro_mount

validation_text_tsv:
type: uri_file
optional: false
description: Path to the evaluation text TSV file.
mode: ro_mount

train_text_tsv:
type: uri_file
optional: false
description: Path to the training text TSV file.
mode: ro_mount

train_dataloader_batch_size:
type: integer
min: 1
Expand Down Expand Up @@ -101,6 +113,8 @@ command: >-
--task_name "AdapterTrain"
--train_data_path "${{inputs.train_data_path}}"
--validation_data_path "${{inputs.validation_data_path}}"
--validation_text_tsv "${{inputs.validation_text_tsv}}"
--train_text_tsv "${{inputs.train_text_tsv}}"
--label_file "${{inputs.label_file}}"
$[[--train_dataloader_batch_size "${{inputs.train_dataloader_batch_size}}"]]
$[[--validation_dataloader_batch_size "${{inputs.validation_dataloader_batch_size}}"]]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: medimgage_embedding_finetune
version: 0.0.23
version: 0.0.27

type: command

Expand All @@ -9,7 +9,7 @@ is_deterministic: True
display_name: Medical Image Insight Embedding Finetune
description: Component to finetune the model using the medical image data

environment : azureml://registries/mablonde-registry-101/environments/acpt-medimage-embedding/versions/15
environment : azureml://registries/mablonde-registry-101/environments/acpt-medimage-embedding/versions/16
code: ../../../src/medimage_insight_embedding_finetune

distribution:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineComponent.schema.json
name: medimage_insight_ft_pipeline
version: 0.0.16
version: 0.0.24
type: pipeline
display_name: Medical Image Insight Embedding Generator and Classification Adapter Pipeline
description: Pipeline Component to finetune Hugging Face pretrained models for chat completion task. The component supports optimizations such as LoRA, Deepspeed and ONNXRuntime for performance enhancement. See [docs](https://aka.ms/azureml/components/chat_completion_pipeline) to learn more.
Expand Down Expand Up @@ -341,7 +341,7 @@ outputs:
jobs:
medical_image_embedding_model_finetune:
type: command
component: azureml://registries/mablonde-registry-101/components/medimgage_embedding_finetune/versions/0.0.23
component: azureml://registries/mablonde-registry-101/components/medimgage_embedding_finetune/versions/0.0.27
compute: '${{parent.inputs.compute_finetune}}'
resources:
instance_type: '${{parent.inputs.instance_type_finetune}}'
Expand Down Expand Up @@ -374,27 +374,35 @@ jobs:
outputs:
save_dir: '${{parent.outputs.save_dir}}'
mlflow_model_folder: '${{parent.outputs.mlflow_model_folder}}'
medical_image_embedding_datapreprocessing:
medical_image_embedding_datapreprocessing_train:
type: command
component: azureml://registries/mablonde-registry-101/components/medical_image_embedding_datapreprocessing/versions/0.0.9
component: azureml://registries/mablonde-registry-101/components/medical_image_embedding_datapreprocessing/versions/0.0.11
compute: '${{parent.inputs.compute_preprocess}}'
resources:
instance_type: '${{parent.inputs.instance_type_preprocess}}'
inputs:
mlflow_model_path: '${{parent.jobs.medical_image_embedding_model_finetune.outputs.mlflow_model_folder}}'
eval_image_tsv: '${{parent.inputs.eval_image_tsv}}'
eval_text_tsv: '${{parent.inputs.eval_text_tsv}}'
image_tsv: '${{parent.inputs.image_tsv}}'
text_tsv: '${{parent.inputs.text_tsv}}'
medical_image_embedding_datapreprocessing_validation:
type: command
component: azureml://registries/mablonde-registry-101/components/medical_image_embedding_datapreprocessing/versions/0.0.11
compute: '${{parent.inputs.compute_preprocess}}'
resources:
instance_type: '${{parent.inputs.instance_type_preprocess}}'
inputs:
mlflow_model_path: '${{parent.jobs.medical_image_embedding_model_finetune.outputs.mlflow_model_folder}}'
image_tsv: '${{parent.inputs.eval_image_tsv}}'
medimgage_adapter_finetune:
type: command
component: azureml://registries/mablonde-registry-101/components/medimgage_adapter_finetune/versions/0.0.4
component: azureml://registries/mablonde-registry-101/components/medimgage_adapter_finetune/versions/0.0.7
compute: '${{parent.inputs.compute_finetune}}'
resources:
instance_type: '${{parent.inputs.instance_type_finetune}}'
inputs:
train_data_path: '${{parent.jobs.medical_image_embedding_datapreprocessing.outputs.output_train_pkl}}'
validation_data_path: '${{parent.jobs.medical_image_embedding_datapreprocessing.outputs.output_validation_pkl}}'
train_data_path: '${{parent.jobs.medical_image_embedding_datapreprocessing_train.outputs.output_pkl}}'
validation_data_path: '${{parent.jobs.medical_image_embedding_datapreprocessing_validation.outputs.output_pkl}}'
train_text_tsv: '${{parent.inputs.text_tsv}}'
validation_text_tsv: '${{parent.inputs.eval_text_tsv}}'
train_dataloader_batch_size: '${{parent.inputs.train_dataloader_batch_size}}'
validation_dataloader_batch_size: '${{parent.inputs.validation_dataloader_batch_size}}'
train_dataloader_workers: '${{parent.inputs.train_dataloader_workers}}'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: medical_image_embedding_datapreprocessing
version: 0.0.9
version: 0.0.11
type: command

is_deterministic: True
Expand All @@ -9,58 +9,31 @@ display_name: Embedding Generation for Medical Images
description: To generate embeddings for medical images. See [docs](https://aka.ms/azureml/components/medical_image_embedding_datapreprocessing) to learn more.

#environment: azureml:/subscriptions/dbd697c3-ef40-488f-83e6-5ad4dfb78f9b/resourceGroups/rdondera/providers/Microsoft.MachineLearningServices/workspaces/validatr/environments/medimage-embedding-generation/versions/5
environment: azureml://registries/models-staging/environments/medimage-embedding-generation/versions/5
environment: azureml://registries/mablonde-registry-101/environments/medimage-embedding-generation/versions/7
code: ../../../src/medimage_insight_adapter_preprocess

inputs:
eval_image_tsv:
type: uri_file
optional: false
description: Path to the evaluation image TSV file.
mode: ro_mount

eval_text_tsv:
type: uri_file
optional: false
description: Path to the evaluation text TSV file.
mode: ro_mount

image_tsv:
type: uri_file
optional: false
description: Path to the image TSV file.
mode: ro_mount

text_tsv:
type: uri_file
optional: false
description: Path to the text TSV file.
mode: ro_mount

mlflow_model_path:
type: uri_folder
optional: false
description: Path to the MLflow model to be imported.
mode: ro_mount

outputs:
output_train_pkl:
output_pkl:
type: uri_folder
description: Path to the output folder containing the generated embeddings PKL file.
mode: rw_mount

output_validation_pkl:
type: uri_folder
description: Path to the output validation PKL file.
mode: rw_mount

command: >-
python medimage_datapreprocess.py
--task_name "MedEmbedding"
--eval_image_tsv "${{inputs.eval_image_tsv}}"
--eval_text_tsv "${{inputs.eval_text_tsv}}"
--task_name "MedEmbedding"
--image_tsv "${{inputs.image_tsv}}"
--text_tsv "${{inputs.text_tsv}}"
--output_train_pkl "${{outputs.output_train_pkl}}"
--output_validation_pkl "${{outputs.output_validation_pkl}}"
--output_pkl "${{outputs.output_pkl}}"
--mlflow_model_path "${{inputs.mlflow_model_path}}"
Original file line number Diff line number Diff line change
@@ -1,31 +1,24 @@
import argparse
import json
from azureml.acft.common_components import get_logger_app, set_logging_parameters, LoggingLiterals
from azureml.acft.common_components.utils.error_handling.exceptions import ACFTValidationException
from azureml.acft.common_components.utils.error_handling.error_definitions import ACFTUserError
from azureml.acft.common_components.utils.error_handling.swallow_all_exceptions_decorator import (
swallow_all_exceptions,
)
from azureml._common._error_definition.azureml_error import AzureMLError

from azureml.acft.contrib.hf import VERSION, PROJECT_NAME
from azureml.acft.contrib.hf.nlp.constants.constants import LOGS_TO_BE_FILTERED_IN_APPINSIGHTS
import pandas as pd
import torch
import os
from classification_demo.MedImageInsight import medimageinsight_package
from classification_demo.adaptor_training import training
import training
import matplotlib.pyplot as plt
import SimpleITK as sitk
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score

# Suppress SimpleITK warnings
sitk.ProcessObject_SetGlobalWarningDisplay(False)


COMPONENT_NAME = "ACFT-MedImage-Classification-Training"
logger = get_logger_app("azureml.acft.contrib.hf.scripts.src.train.classification_adaptor_train")
TRAIN_EMBEDDING_FILE_NAME = "train_embeddings.pkl"
VALIDATION_EMBEDDING_FILE_NAME = "validation_embeddings.pkl"
EMBEDDING_FILE_NAME = "embeddings.pkl"


def get_parser():
Expand Down Expand Up @@ -54,6 +47,18 @@ def get_parser():
required=True,
help='The path to the validation data.'
)
# Text TSV inputs: each flag's help text describes the split named by the flag
# (the two descriptions were previously swapped).
parser.add_argument(
    "--train_text_tsv",
    type=str,
    help="Path to training text TSV file.",
    required=True
)
parser.add_argument(
    "--validation_text_tsv",
    type=str,
    help="Path to validation text TSV file.",
    required=True
)
parser.add_argument(
'--train_dataloader_batch_size',
type=int,
Expand Down Expand Up @@ -117,8 +122,7 @@ def get_parser():
return parser


def load_data(train_data_path: str, validation_data_path: str, train_file_name: str,
validation_file_name: str) -> tuple[pd.DataFrame, pd.DataFrame]:
def load_data(train_data_path: str, validation_data_path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
"""
Load the training and validation data from the provided folder paths.
Expand All @@ -132,12 +136,50 @@ def load_data(train_data_path: str, validation_data_path: str, train_file_name:
tuple[pd.DataFrame, pd.DataFrame]: DataFrames containing the training and validation data.
"""

train_data_file = os.path.join(train_data_path, train_file_name)
validation_data_file = os.path.join(validation_data_path, validation_file_name)
train_data_file = os.path.join(train_data_path, EMBEDDING_FILE_NAME)
validation_data_file = os.path.join(validation_data_path, EMBEDDING_FILE_NAME)
train_data = pd.read_pickle(train_data_file)
validation_data = pd.read_pickle(validation_data_file)
return train_data, validation_data

def merge_data_with_text(
    train_data: pd.DataFrame,
    validation_data: pd.DataFrame,
    train_text_tsv: str,
    validation_text_tsv: str
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Merge the training and validation data with the corresponding text data.

    Each text TSV is read with a tab separator and its columns are renamed to
    ``Name`` and ``classification_json``. A ``Label`` column is derived from the
    ``class_id`` field of the JSON payload, and the result is joined to the
    embedding frame on ``Name``. ``pd.merge`` is called with its default join
    type (inner), so rows whose ``Name`` is absent from either side are dropped.

    NOTE(review): ``pd.read_csv`` is called with its default header handling, so
    the first line of each TSV is consumed as the header row -- confirm that the
    input files actually carry a header line.

    Args:
        train_data (pd.DataFrame): DataFrame containing the training data.
        validation_data (pd.DataFrame): DataFrame containing the validation data.
        train_text_tsv (str): Path to the TSV file containing training text data.
        validation_text_tsv (str): Path to the TSV file containing validation text data.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: Merged DataFrames for training and
        validation data, restricted to the ``Name``, ``features`` and ``Label``
        columns.
    """

    def extract_label_from_json(json_str):
        """Return the ``class_id`` stored in a JSON string, or -1 when unavailable."""
        try:
            json_obj = json.loads(json_str)
        except (TypeError, ValueError):
            # TypeError: the cell is not text at all (an empty TSV cell is read as NaN).
            # ValueError: malformed JSON (json.JSONDecodeError is a ValueError subclass).
            logger.error("Failed to decode JSON from text column")
            return -1
        if not isinstance(json_obj, dict):
            # Valid JSON that is not an object (list, number, ...) has no "class_id".
            logger.error("JSON in text column is not an object")
            return -1
        return json_obj.get("class_id", -1)

    def load_text_labels(tsv_path):
        """Read one text TSV and add the derived ``Label`` column."""
        text_df = pd.read_csv(tsv_path, sep="\t")
        text_df.columns = ["Name", "classification_json"]
        text_df["Label"] = text_df["classification_json"].apply(extract_label_from_json)
        return text_df

    train_text_df = load_text_labels(train_text_tsv)
    validation_text_df = load_text_labels(validation_text_tsv)

    output_columns = ["Name", "features", "Label"]
    train_data = pd.merge(train_data, train_text_df, on="Name")[output_columns]
    validation_data = pd.merge(validation_data, validation_text_df, on="Name")[output_columns]

    return train_data, validation_data

def initialize_model(args: argparse.Namespace) -> torch.nn.Module:
"""
Expand Down Expand Up @@ -255,10 +297,11 @@ def main():
},
azureml_pkg_denylist_logging_patterns=LOGS_TO_BE_FILTERED_IN_APPINSIGHTS,
)
train_data, validation_data = load_data(args.train_data_path, args.validation_data_path,
TRAIN_EMBEDDING_FILE_NAME, VALIDATION_EMBEDDING_FILE_NAME)
train_data, validation_data = load_data(args.train_data_path, args.validation_data_path)
train_data, validation_data = merge_data_with_text(train_data, validation_data, args.train_text_tsv, args.validation_text_tsv)
model = initialize_model(args)
train_dataloader, validation_dataloader = prepare_dataloaders(train_data, validation_data, args)

best_accuracy, best_auc = train_model(train_dataloader, validation_dataloader, model, args)
print(f"Best Accuracy of the Adaptor: {best_accuracy:.4f}")
print(f"Best AUC of the Adaptor: {best_auc:.4f}")
Expand All @@ -268,4 +311,4 @@ def main():
main()

# Example command to run this script:
# python medimage_train.py --task_name "AdapterTrain" --train_data_path "/home/healthcare-ai/train_merged.pkl" --validation_data_path "/home/healthcare-ai/val_merged.pkl" --train_dataloader_batch_size 8 --validation_dataloader_batch_size 1 --train_dataloader_workers 2 --validation_dataloader_workers 2 --output_classes 5 --hidden_dimensions 512 --input_channels 1024 --learning_rate 0.0003 --max_epochs 10 --output_model_path "/home/healthcare-ai/"
# python medimage_train.py --task_name "AdapterTrain" --train_data_path "/home/healthcare-ai/train_data" --validation_data_path "/home/healthcare-ai/val_data" --train_text_tsv "/home/healthcare-ai/train_text.tsv" --validation_text_tsv "/home/healthcare-ai/val_text.tsv" --train_dataloader_batch_size 8 --validation_dataloader_batch_size 1 --train_dataloader_workers 2 --validation_dataloader_workers 2 --label_file "/home/healthcare-ai/labels.txt" --hidden_dimensions 512 --input_channels 1024 --learning_rate 0.0003 --max_epochs 10 --output_model_path "/home/healthcare-ai/"
Loading

0 comments on commit ddc431e

Please sign in to comment.