more changes to components (#3862)

Co-authored-by: Matthias Blondeel <mablonde@microsoft.com>
Azure · Feb 25, 2025 · d74da53
1 parent df4a6cc
commit d74da53
Show file tree

Hide file tree

Showing 9 changed files with 226 additions and 222 deletions.
diff --git a/assets/training/finetune_acft_image/components/finetune/medimage_adapter/spec.yaml b/assets/training/finetune_acft_image/components/finetune/medimage_adapter/spec.yaml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
 name: medimgage_adapter_finetune
-version: 0.0.1.yesh1
+version: 0.0.4
 type: command
 
 is_deterministic: True
@@ -57,12 +57,11 @@ inputs:
     optional: true
     description: Number of workers for the validation dataloader.
 
-  output_classes:
-    type: integer
-    min: 1
-    default: 5
-    optional: true
-    description: Number of output classes.
+  label_file:
+    type: uri_file
+    optional: false
+    description: Path to the label file.
+    mode: ro_mount
 
   hidden_dimensions:
     type: integer
@@ -102,11 +101,11 @@ command: >-
   --task_name "AdapterTrain" 
   --train_data_path "${{inputs.train_data_path}}" 
   --validation_data_path "${{inputs.validation_data_path}}" 
+  --label_file "${{inputs.label_file}}" 
   $[[--train_dataloader_batch_size "${{inputs.train_dataloader_batch_size}}"]]
   $[[--validation_dataloader_batch_size "${{inputs.validation_dataloader_batch_size}}"]]
   $[[--train_dataloader_workers "${{inputs.train_dataloader_workers}}"]]
   $[[--validation_dataloader_workers "${{inputs.validation_dataloader_workers}}"]]
-  $[[--output_classes "${{inputs.output_classes}}"]]
   $[[--hidden_dimensions "${{inputs.hidden_dimensions}}"]]
   $[[--input_channels "${{inputs.input_channels}}"]]
   $[[--learning_rate "${{inputs.learning_rate}}"]]

diff --git a/...ng/finetune_acft_image/components/model_converters/medimage_embed_adapter_merge/spec.yaml b/...ng/finetune_acft_image/components/model_converters/medimage_embed_adapter_merge/spec.yaml
@@ -1,7 +1,7 @@
 $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
 type: command
 
-version: 0.0.10
+version: 0.0.12
 name: medimage_embedding_adapter_merge
 display_name: Merge Adapter Model with Embedding Generation Model
 description: Import and integrate Adapter and Embedding Generation Model
@@ -25,11 +25,25 @@ inputs:
     optional: true
     description: Integrated Mlflow Model Asset.
 
-  configuration:
-    type: uri_folder
+  label_file:
+    type: uri_file
+    optional: false
+    description: Path to the label file.
+    mode: ro_mount
+
+  hidden_dimensions:
+    type: integer
+    min: 1
+    default: 512
     optional: true
-    description: Configuration file for the model merging process.
+    description: Number of hidden dimensions.
 
+  input_channels:
+    type: integer
+    min: 1
+    default: 1024
+    optional: true
+    description: Number of input channels.
 
 outputs:
   output_dir:
@@ -40,5 +54,7 @@ command: >-
   python medimage_model_merge.py 
   $[[--adapter_model ${{inputs.adapter_model}}]]
   $[[--mlflow_model ${{inputs.mlflow_model}}]]
-  $[[--configuration ${{inputs.configuration}}]]
+  $[[--hidden_dimensions "${{inputs.hidden_dimensions}}"]]
+  $[[--input_channels "${{inputs.input_channels}}"]]
+  --label_file "${{inputs.label_file}}" 
   --output_dir ${{outputs.output_dir}}
diff --git a/...training/finetune_acft_image/components/pipeline_components/medimage_insight_ft/spec.yaml b/...training/finetune_acft_image/components/pipeline_components/medimage_insight_ft/spec.yaml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/pipelineComponent.schema.json
 name: medimage_insight_ft_pipeline
-version: 0.0.7
+version: 0.0.14
 type: pipeline
 display_name: Medical Image Insight Embedding Generator and Classification Adapter Pipeline
 description: Pipeline Component to finetune Hugging Face pretrained models for chat completion task. The component supports optimizations such as LoRA, Deepspeed and ONNXRuntime for performance enhancement. See [docs](https://aka.ms/azureml/components/chat_completion_pipeline) to learn more.
@@ -142,7 +142,6 @@ inputs:
     optional: true
     description: Pin memory.
 
-
   knn:
     type: integer
     min: 0
@@ -267,18 +266,6 @@ inputs:
       compute is named 'FT-Cluster'. Special characters like \ and ' are invalid in the parameter value.
       If compute cluster name is provided, instance_type field will be ignored and the respective cluster will be used
 
-  zeroshot_path:
-    type: uri_file
-    optional: false
-    description: Path to the zeroshot data file.
-    mode: rw_mount
-
-  test_train_split_csv_path:
-    type: uri_file
-    optional: false
-    description: Path to the CSV file containing test-train split information.
-    mode: rw_mount
-
   train_dataloader_batch_size:
     type: integer
     min: 1
@@ -307,13 +294,6 @@ inputs:
     optional: true
     description: Number of workers for the validation dataloader.
 
-  output_classes:
-    type: integer
-    min: 1
-    default: 5
-    optional: true
-    description: Number of output classes.
-
   hidden_dimensions:
     type: integer
     min: 1
@@ -341,11 +321,6 @@ inputs:
     optional: true
     description: Maximum number of epochs for training.
 
-  merge_configuration_folder:
-    type: uri_folder
-    description: Files reqiured for merging the models.
-    mode: rw_mount
-
 outputs:
   save_dir:
     type: uri_folder
@@ -401,17 +376,19 @@ jobs:
       mlflow_model_folder: '${{parent.outputs.mlflow_model_folder}}'
   medical_image_embedding_datapreprocessing:
     type: command
-    component: azureml://registries/models-staging/components/medical_image_embedding_datapreprocessing/versions/0.0.1.yesh5
+    component: azureml://registries/mablonde-registry-101/components/medical_image_embedding_datapreprocessing/versions/0.0.9
     compute: '${{parent.inputs.compute_preprocess}}'
     resources:
       instance_type: '${{parent.inputs.instance_type_preprocess}}'
     inputs:
       mlflow_model_path: '${{parent.jobs.medical_image_embedding_model_finetune.outputs.mlflow_model_folder}}'
-      zeroshot_path: '${{parent.inputs.zeroshot_path}}'
-      test_train_split_csv_path: '${{parent.inputs.test_train_split_csv_path}}'
+      eval_image_tsv: '${{parent.inputs.eval_image_tsv}}'
+      eval_text_tsv: '${{parent.inputs.eval_text_tsv}}'
+      image_tsv: '${{parent.inputs.image_tsv}}'
+      text_tsv: '${{parent.inputs.text_tsv}}'
   medimgage_adapter_finetune:
     type: command
-    component: azureml://registries/models-staging/components/medimgage_adapter_finetune/versions/0.0.1.yesh1
+    component: azureml://registries/mablonde-registry-101/components/medimgage_adapter_finetune/versions/0.0.4
     compute: '${{parent.inputs.compute_finetune}}'
     resources:
       instance_type: '${{parent.inputs.instance_type_finetune}}'
@@ -422,20 +399,22 @@ jobs:
       validation_dataloader_batch_size: '${{parent.inputs.validation_dataloader_batch_size}}'
       train_dataloader_workers: '${{parent.inputs.train_dataloader_workers}}'
       validation_dataloader_workers: '${{parent.inputs.validation_dataloader_workers}}'
-      output_classes: '${{parent.inputs.output_classes}}'
+      label_file: '${{parent.inputs.label_file}}'
       hidden_dimensions: '${{parent.inputs.hidden_dimensions}}'
       input_channels: '${{parent.inputs.input_channels}}'
       learning_rate: '${{parent.inputs.learning_rate}}'
       max_epochs: '${{parent.inputs.max_epochs}}'
   medimage_embedding_adapter_merge:
     type: command
-    component: azureml://registries/mablonde-registry-101/components/medimage_embedding_adapter_merge/versions/0.0.10
+    component: azureml://registries/mablonde-registry-101/components/medimage_embedding_adapter_merge/versions/0.0.12
     compute: '${{parent.inputs.compute_finetune}}'
     resources:
       instance_type: '${{parent.inputs.instance_type_finetune}}'
     inputs:
       adapter_model: '${{parent.jobs.medimgage_adapter_finetune.outputs.output_model_path}}'
       mlflow_model: '${{parent.jobs.medical_image_embedding_model_finetune.outputs.mlflow_model_folder}}'
-      configuration: '${{parent.inputs.merge_configuration_folder}}'
+      label_file: '${{parent.inputs.label_file}}'
+      hidden_dimensions: '${{parent.inputs.hidden_dimensions}}'
+      input_channels: '${{parent.inputs.input_channels}}'
     outputs:
       output_dir: '${{parent.outputs.merged_mlfow_model}}'
diff --git a/assets/training/finetune_acft_image/components/preprocess/image_embedding/spec.yaml b/assets/training/finetune_acft_image/components/preprocess/image_embedding/spec.yaml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
 name: medical_image_embedding_datapreprocessing
-version: 0.0.1.yesh5
+version: 0.0.9
 type: command
 
 is_deterministic: True
@@ -13,17 +13,29 @@ environment: azureml://registries/models-staging/environments/medimage-embedding
 code: ../../../src/medimage_insight_adapter_preprocess
 
 inputs:
-  zeroshot_path:
+  eval_image_tsv:
     type: uri_file
     optional: false
-    description: Path to the zeroshot data file.
-    mode: rw_mount
+    description: Path to the evaluation image TSV file.
+    mode: ro_mount
 
-  test_train_split_csv_path:
+  eval_text_tsv:
     type: uri_file
     optional: false
-    description: Path to the CSV file containing test-train split information.
-    mode: rw_mount
+    description: Path to the evaluation text TSV file.
+    mode: ro_mount
+
+  image_tsv:
+    type: uri_file
+    optional: false
+    description: Path to the image TSV file.
+    mode: ro_mount
+
+  text_tsv:
+    type: uri_file
+    optional: false
+    description: Path to the text TSV file.
+    mode: ro_mount
 
   mlflow_model_path:
     type: uri_folder
@@ -43,4 +55,12 @@ outputs:
     mode: rw_mount
 
 command: >-
-  python medimage_datapreprocess.py --task_name "MedEmbedding" --zeroshot_path "${{inputs.zeroshot_path}}" --test_train_split_csv_path "${{inputs.test_train_split_csv_path}}" --output_train_pkl "${{outputs.output_train_pkl}}" --output_validation_pkl "${{outputs.output_validation_pkl}}" --mlflow_model_path "${{inputs.mlflow_model_path}}"
+  python medimage_datapreprocess.py 
+  --task_name "MedEmbedding" 
+  --eval_image_tsv "${{inputs.eval_image_tsv}}" 
+  --eval_text_tsv "${{inputs.eval_text_tsv}}" 
+  --image_tsv "${{inputs.image_tsv}}" 
+  --text_tsv "${{inputs.text_tsv}}" 
+  --output_train_pkl "${{outputs.output_train_pkl}}" 
+  --output_validation_pkl "${{outputs.output_validation_pkl}}" 
+  --mlflow_model_path "${{inputs.mlflow_model_path}}"
diff --git a/assets/training/finetune_acft_image/src/medimage_insight_adapter_finetune/medimage_train.py b/assets/training/finetune_acft_image/src/medimage_insight_adapter_finetune/medimage_train.py
@@ -79,10 +79,9 @@ def get_parser():
         help='Number of workers for the validation dataloader.'
     )
     parser.add_argument(
-        '--output_classes',
-        type=int,
-        required=True,
-        help='Number of output classes.'
+        '--label_file',
+        type=str,
+        help='Path to label file.'
     )
     parser.add_argument(
         '--hidden_dimensions',
@@ -150,10 +149,13 @@ def initialize_model(args: argparse.Namespace) -> torch.nn.Module:
     Returns:
         torch.nn.Module: Initialized model.
     """
+    with open(args.label_file, "r") as f:
+        labels = [l.strip() for l in f.read().splitlines() if l.strip()]
+
     return training.create_model(
         in_channels=args.input_channels,
         hidden_dim=args.hidden_dimensions,
-        num_class=args.output_classes
+        num_class=len(labels)
     )