From b3b9e0440dd5d7d8fe9c49007c1b98629f1878eb Mon Sep 17 00:00:00 2001
From: acquayefrank <acquayefrank@gmail.com>
Date: Fri, 31 Jan 2025 16:12:10 +0100
Subject: [PATCH] finished refactoring

---
 tools/ipapy2/ipapy2_MS1_annotation.xml        |   1 +
 tools/ipapy2/ipapy2_MS2_annotation.xml        |   8 +-
 tools/ipapy2/ipapy2_clustering.xml            |   1 +
 tools/ipapy2/ipapy2_compute_all_adducts.xml   |   1 +
 tools/ipapy2/ipapy2_compute_bio.py            |  81 +++++---
 tools/ipapy2/ipapy2_compute_bio.xml           |  10 +-
 tools/ipapy2/ipapy2_gibbs_sampler.py          | 185 +++++++++++-------
 tools/ipapy2/ipapy2_gibbs_sampler.xml         |  52 +++--
 tools/ipapy2/ipapy2_gibbs_sampler_add.py      | 115 ++++++-----
 tools/ipapy2/ipapy2_gibbs_sampler_add.xml     |  32 +--
 tools/ipapy2/ipapy2_map_isotope_patterns.xml  |   1 +
 tools/ipapy2/macros.xml                       |  13 ++
 .../test-data/mapped_isotope_patterns.parquet | Bin 0 -> 2710 bytes
 tools/ipapy2/utils.py                         |  58 +++++-
 14 files changed, 356 insertions(+), 202 deletions(-)
 create mode 100644 tools/ipapy2/test-data/mapped_isotope_patterns.parquet
diff --git a/tools/ipapy2/ipapy2_MS1_annotation.xml b/tools/ipapy2/ipapy2_MS1_annotation.xml
index 7c87ce0d..f20e4c1a 100644
--- a/tools/ipapy2/ipapy2_MS1_annotation.xml
+++ b/tools/ipapy2/ipapy2_MS1_annotation.xml
@@ -5,6 +5,7 @@
     
     <requirements>
         <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+        <expand macro="extra_requirements"/>
     </requirements>
 
     <command detect_errors="exit_code"><![CDATA[
diff --git a/tools/ipapy2/ipapy2_MS2_annotation.xml b/tools/ipapy2/ipapy2_MS2_annotation.xml
index b76f8918..fd85b51a 100644
--- a/tools/ipapy2/ipapy2_MS2_annotation.xml
+++ b/tools/ipapy2/ipapy2_MS2_annotation.xml
@@ -5,6 +5,7 @@
     
     <requirements>
         <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+        <expand macro="extra_requirements"/>
     </requirements>
 
     <command detect_errors="exit_code"><![CDATA[
@@ -80,7 +81,12 @@
             <param name="all_adducts" value="all_adducts.csv"/>
             <param name="MS2_DB" value="MS2_DB.csv"/>
             <param name="ppm" value="3"/>
-            <output name="MS2_annotations" file="MS2_annotations.csv"/>
+            <output name="MS2_annotations">
+                <assert_contents>
+                    <has_n_columns n="13"  sep=","/>
+                    <has_n_lines n="158" delta="5" />
+                </assert_contents>
+            </output>
         </test>
     </tests>
 
diff --git a/tools/ipapy2/ipapy2_clustering.xml b/tools/ipapy2/ipapy2_clustering.xml
index 356f9d55..d918a3e6 100644
--- a/tools/ipapy2/ipapy2_clustering.xml
+++ b/tools/ipapy2/ipapy2_clustering.xml
@@ -4,6 +4,7 @@
     </macros>
     <requirements>
         <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+        <expand macro="extra_requirements"/>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
         python3 '${__tool_directory__}/ipapy2_clustering.py'
diff --git a/tools/ipapy2/ipapy2_compute_all_adducts.xml b/tools/ipapy2/ipapy2_compute_all_adducts.xml
index 0adabfd6..f010ff2f 100644
--- a/tools/ipapy2/ipapy2_compute_all_adducts.xml
+++ b/tools/ipapy2/ipapy2_compute_all_adducts.xml
@@ -5,6 +5,7 @@
     
     <requirements>
         <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+        <expand macro="extra_requirements"/>
     </requirements>
 
     <command detect_errors="exit_code"><![CDATA[
diff --git a/tools/ipapy2/ipapy2_compute_bio.py b/tools/ipapy2/ipapy2_compute_bio.py
index 2f921de3..bd8b5976 100644
--- a/tools/ipapy2/ipapy2_compute_bio.py
+++ b/tools/ipapy2/ipapy2_compute_bio.py
@@ -1,27 +1,29 @@
 import argparse
-import os
-import pandas as pd
 from ipaPy2 import ipa
+from utils import LoadDataAction, StoreOutputAction, group_by_peak_id
 
 
-def main(args):
-    MS1_DB = pd.read_csv(args.MS1_DB)
-    MS1_DB = MS1_DB.replace("", None)
+def main(
+    input_dataset_database,
+    input_dataset_annotations,
+    biochemical_mode,
+    connection_list,
+    output_dataset,
+    ncores,
+):
+    """
+    Compute matrix of biochemical connections. Either based on a list of
+    possible connections in the form of a list of formulas or based on the
+    reactions present in the database.
+    """
 
-    if args.annotations:
-        annotations_df = pd.read_csv(args.annotations, keep_default_na=False)
-        annotations_df = annotations_df.replace("", None)
-        annotations = {}
-        keys = set(annotations_df["peak_id"])
-        for i in keys:
-            annotations[i] = annotations_df[annotations_df["peak_id"] == i].drop(
-                "peak_id", axis=1
-            )
+    if input_dataset_annotations is not None:
+        annotations = group_by_peak_id(input_dataset_annotations)
     else:
         annotations = None
 
-    if args.biochemical_mode == "connections" and args.connection_list:
-        connections = args.connection_list
+    if biochemical_mode == "connections" and connection_list:
+        connections = connection_list
     else:
         connections = [
             "C3H5NO",
@@ -110,29 +112,34 @@ def main(args):
         ]
 
     Bio = ipa.Compute_Bio(
-        MS1_DB,
+        input_dataset_database,
         annotations=annotations,
-        mode=args.biochemical_mode,
+        mode=biochemical_mode,
         connections=connections,
-        ncores=int(os.environ.get("GALAXY_SLOTS")),
+        ncores=ncores,
     )
-    Bio.to_csv(args.compute_bio_output, index=False)
+    write_func, file_path = output_dataset
+    write_func(Bio, file_path)
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="cluster features before IPA pipeline."
+        description=""" Compute matrix of biochemical connections. Either based on a list of
+    possible connections in the form of a list of formulas or based on the
+    reactions present in the database."""
     )
     parser.add_argument(
-        "--MS1_DB",
-        type=str,
+        "--input_dataset_database",
+        nargs=2,
+        action=LoadDataAction,
         required=True,
-        help="a dataframe containing the measured intensities across several samples.",
+        help="a dataset containing the database against which the annotation is performed.",
     )
     parser.add_argument(
-        "--annotations",
-        type=str,
-        help="a dataframe containing the annotations of the features.",
+        "--input_dataset_annotations",
+        nargs=2,
+        action=LoadDataAction,
+        help="a dataset containing the annotations of the features.",
     )
     parser.add_argument(
         "--biochemical_mode",
@@ -144,11 +151,25 @@ def main(args):
         "--connection_list", type=str, help="intensity mode. Default 'max' or 'ave'."
     )
     parser.add_argument(
-        "--compute_bio_output",
-        type=str,
+        "--output_dataset",
+        nargs=2,
+        action=StoreOutputAction,
         required=True,
         help="Output file path for the dataframe.",
     )
+    parser.add_argument(
+        "--ncores",
+        type=int,
+        default=None,
+        help="number of cores to use for the computation.",
+    )
     args = parser.parse_args()
 
-    main(args)
+    main(
+        args.input_dataset_database,
+        args.input_dataset_annotations,
+        args.biochemical_mode,
+        args.connection_list,
+        args.output_dataset,
+        args.ncores,
+    )
diff --git a/tools/ipapy2/ipapy2_compute_bio.xml b/tools/ipapy2/ipapy2_compute_bio.xml
index a01d74c8..845be4b4 100644
--- a/tools/ipapy2/ipapy2_compute_bio.xml
+++ b/tools/ipapy2/ipapy2_compute_bio.xml
@@ -5,17 +5,19 @@
     
     <requirements>
         <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+        <expand macro="extra_requirements"/>
     </requirements>
 
     <command detect_errors="exit_code"><![CDATA[
         python3 '${__tool_directory__}/ipapy2_compute_bio.py'
-        --MS1_DB '${MS1_DB}'
-        --annotations '${annotations}'
+        --input_dataset_database '${MS1_DB}' '${MS1_DB.ext}'
+        --input_dataset_annotations '${annotations}' '${annotations.ext}'
         --biochemical_mode '${biochemical_mode.biochemical_mode}'
         #if $biochemical_mode.biochemical_mode == "connections"
             --connection_list '${biochemical_mode.connection_list}'
         #end if
-        --compute_bio_output "${compute_bio_output}"
+        --output_dataset "${compute_bio_output}" "${compute_bio_output.ext}"
+        --ncores \${GALAXY_SLOTS:-1}
     ]]></command>
 
     <inputs>
@@ -37,7 +39,7 @@
     </inputs>
 
     <outputs>
-        <data label="${tool.name} on ${on_string}" name="compute_bio_output" format="csv,tsv,tabular,parquet"/>
+        <data label="${tool.name} on ${on_string}" name="compute_bio_output" format_source="MS1_DB"/>
     </outputs>
 
     <tests>
diff --git a/tools/ipapy2/ipapy2_gibbs_sampler.py b/tools/ipapy2/ipapy2_gibbs_sampler.py
index 953b7c17..e7a2f8d5 100644
--- a/tools/ipapy2/ipapy2_gibbs_sampler.py
+++ b/tools/ipapy2/ipapy2_gibbs_sampler.py
@@ -1,80 +1,78 @@
 import argparse
-import pandas as pd
 from ipaPy2 import ipa
+from utils import (
+    LoadDataAction,
+    StoreOutputAction,
+    LoadTextAction,
+    group_by_peak_id,
+    flattern_annotations,
+)
 
 
-def main(args):
-    df = pd.read_csv(args.mapped_isotope_patterns, keep_default_na=False)
-    df = df.replace("", None)
-
-    annotations_df = pd.read_csv(args.annotations, keep_default_na=False)
+def main(
+        input_dataset_mapped_isotope_patterns,
+        input_dataset_annotations,
+        integrating_mode,
+        input_dataset_bio,
+        noits,
+        burn,
+        delta_bio,
+        delta_add,
+        all_out,
+        zs,
+        zs_out,
+        output_dataset,
+):
+    annotations_df = input_dataset_annotations
     annotations_df["post"] = annotations_df["post"].replace("", 0)
     annotations_df = annotations_df.replace("", None)
-    annotations = {}
-
-    grouped = annotations_df.groupby("peak_id")
-    for peak_id, group in grouped:
-        annotations[peak_id] = group.drop("peak_id", axis=1)
+    annotations = group_by_peak_id(annotations_df)
 
-    if args.zs:
-        zs = []
-        with open(args.zs, "r") as f:
-            for line in f:
-                zs.append(int(line.strip()))
-
-    else:
+    if not zs:
         zs = None
 
-    if args.integrating_mode == "adducts":
+    if integrating_mode == "adducts":
         zs = ipa.Gibbs_sampler_add(
-            df,
+            input_dataset_mapped_isotope_patterns,
             annotations,
-            noits=args.noits,
-            burn=args.burn,
-            delta_add=args.delta_add,
-            all_out=args.all_out,
+            noits=noits,
+            burn=burn,
+            delta_add=delta_add,
+            all_out=all_out,
             zs=zs,
         )
     else:
-
-        Bio = pd.read_csv(args.Bio, keep_default_na=False)
-
-        if args.integrating_mode == "biochemical":
+        if integrating_mode == "biochemical":
             zs = ipa.Gibbs_sampler_bio(
-                df,
+                input_dataset_mapped_isotope_patterns,
                 annotations,
-                Bio=Bio,
-                noits=args.noits,
-                burn=args.burn,
-                delta_bio=args.delta_bio,
-                all_out=args.all_out,
+                Bio=input_dataset_bio,
+                noits=noits,
+                burn=burn,
+                delta_bio=delta_bio,
+                all_out=all_out,
                 zs=zs,
             )
         else:
             zs = ipa.Gibbs_sampler_bio_add(
-                df,
+                input_dataset_mapped_isotope_patterns,
                 annotations,
-                Bio=Bio,
-                noits=args.noits,
-                burn=args.burn,
-                delta_bio=args.delta_bio,
-                delta_add=args.delta_add,
-                all_out=args.all_out,
+                Bio=input_dataset_bio,
+                noits=noits,
+                burn=burn,
+                delta_bio=delta_bio,
+                delta_add=delta_add,
+                all_out=all_out,
                 zs=zs,
             )
 
-    annotations_flat = pd.DataFrame()
-    for peak_id in annotations:
-        annotation = annotations[peak_id]
-        annotation["peak_id"] = peak_id
-        annotations_flat = pd.concat([annotations_flat, annotation])
+    annotations_flat = flattern_annotations(annotations)
+    write_func, file_path = output_dataset
+    write_func(annotations_flat, file_path)
 
-    annotations_flat.to_csv(args.annotations_out, index=False)
-
-    if args.gibbs_out:
-        with open(args.zs_out, "w") as f:
-            for s in zs:
-                f.write(str(s) + "\n")
+    if all_out:
+        write_func, file_path = zs_out
+        write_func(zs, file_path)
 
 
 if __name__ == "__main__":
@@ -82,55 +80,100 @@ def main(args):
         description="cluster features before IPA pipeline."
     )
     parser.add_argument(
-        "--mapped_isotope_patterns",
-        type=str,
+        "--input_dataset_mapped_isotope_patterns",
+        nargs=2,
+        action=LoadDataAction,
         required=True,
         help="a dataframe containing the measured intensities across several samples.",
     )
     parser.add_argument(
-        "--annotations",
-        type=str,
+        "--input_dataset_annotations",
+        nargs=2,
+        action=LoadDataAction,
         required=True,
-        help="Default value 0.8. Minimum correlation allowed in each cluster.",
+        help="a dataset containing the annotations of the features.",
     )
     parser.add_argument(
         "--integrating_mode",
         type=str,
         required=True,
-        help="Default value 0.8. Minimum correlation allowed in each cluster.",
+        choices=["adducts", "biochemical", "biochemical_adducts"],
+        help="The mode of integration. Options are 'adducts', 'biochemical', or 'biochemical_adducts'.",
     )
     parser.add_argument(
-        "--Bio", type=str, help="intensity mode. Default 'max' or 'ave'."
+        "--input_dataset_bio",
+        nargs=2,
+        action=LoadDataAction,
+        type=str,
+        help="""dataframe (2 columns), reporting all the possible connections between
+         compounds. It uses the unique ids from the database. It could be the
+         output of Compute_Bio() or Compute_Bio_Parallel()""",
     )
     parser.add_argument(
         "--noits",
         type=int,
-        help="Default value 1. Maximum difference in RT time between features in the same cluster.",
+        help="number of iterations if the Gibbs sampler to be run",
     )
     parser.add_argument(
-        "--burn", type=int, help="intensity mode. Default 'max' or 'ave'."
+        "--burn",
+        type=int,
+        help="""number of iterations to be ignored when computing posterior
+          probabilities. If None, is set to 10% of total iterations""",
     )
     parser.add_argument(
-        "--delta_bio", type=float, help="intensity mode. Default 'max' or 'ave'."
+        "--delta_bio",
+        type=float,
+        help="""parameter used when computing the conditional priors. The
+               parameter must be positive. The smaller the parameter the more
+               weight the adducts connections have on the posterior
+               probabilities. Default 1.""",
     )
     parser.add_argument(
-        "--delta_add", type=float, help="intensity mode. Default 'max' or 'ave'."
+        "--delta_add",
+        type=float,
+        help=""" parameter used when computing the conditional priors. The
+               parameter must be positive. The smaller the parameter the more
+               weight the adducts connections have on the posterior
+               probabilities. Default 1.""",
     )
     parser.add_argument(
-        "--all_out", type=str, help="intensity mode. Default 'max' or 'ave'."
+        "--all_out",
+        type=lambda value: value.strip().lower() == "true",
+        help="""logical value. If true the list of assignments found in each
+            iteration is returned by the function. Default False.""",
     )
     parser.add_argument(
-        "--zs", type=str, help="intensity mode. Default 'max' or 'ave'."
+        "--zs",
+        nargs=2,
+        action=LoadTextAction,
+        help="""a txt file containing the list of assignments computed in a previous run of the Gibbs sampler. 
+        Optional, default None.""",
     )
     parser.add_argument(
-        "--gibbs_out", type=str, help="intensity mode. Default 'max' or 'ave'."
+        "--zs_out",
+        nargs=2,
+        action=StoreOutputAction,
+        help="file to save the list of assignments computed in the current run of the Gibbs sampler.",
     )
     parser.add_argument(
-        "--annotations_out",
-        type=str,
-        default="gibbs_sample_annotations.csv",
-        help="a dataframe of clustered features.",
+        "--output_dataset",
+        nargs=2,
+        action=StoreOutputAction,
+        required=True,
+        help="A file path for the output results from Gibbs Add.",
     )
-    parser.add_argument("--zs_out", type=str, help="a dataframe of clustered features.")
     args = parser.parse_args()
-    main(args)
+    main(
+        args.input_dataset_mapped_isotope_patterns,
+        args.input_dataset_annotations,
+        args.integrating_mode,
+        args.input_dataset_bio,
+        args.noits,
+        args.burn,
+        args.delta_bio,
+        args.delta_add,
+        args.all_out,
+        args.zs,
+        args.zs_out,
+        args.output_dataset,
+    )
diff --git a/tools/ipapy2/ipapy2_gibbs_sampler.xml b/tools/ipapy2/ipapy2_gibbs_sampler.xml
index 9012e8a9..5dfd1dda 100644
--- a/tools/ipapy2/ipapy2_gibbs_sampler.xml
+++ b/tools/ipapy2/ipapy2_gibbs_sampler.xml
@@ -5,45 +5,43 @@
     
     <requirements>
         <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+        <expand macro="extra_requirements"/>
     </requirements>
 
     <command detect_errors="exit_code"><![CDATA[
         python3 '${__tool_directory__}/ipapy2_gibbs_sampler.py'
-        --mapped_isotope_patterns '${mapped_isotope_patterns}'
-        --annotations '${annotations}'
+        --input_dataset_mapped_isotope_patterns '${mapped_isotope_patterns}' '${mapped_isotope_patterns.ext}'
+        --input_dataset_annotations '${annotations}' '${annotations.ext}'
         --integrating_mode '${integrating_mode.integrating_mode}'
         #if $integrating_mode.integrating_mode == "adducts"
             --delta_add '${integrating_mode.delta_add}'
         #elif $integrating_mode.integrating_mode == "biochemical"
-            --Bio '${integrating_mode.Bio}'
+            --input_dataset_bio '${integrating_mode.Bio}' '${integrating_mode.Bio.ext}'
             --delta_bio '${integrating_mode.delta_bio}'
         #else
             --delta_add '${integrating_mode.delta_add}'
-            --Bio '${integrating_mode.Bio}'
+            --input_dataset_bio '${integrating_mode.Bio}' '${integrating_mode.Bio.ext}'
             --delta_bio '${integrating_mode.delta_bio}'
         #end if
-        --gibbs_out "${gibbs_out}"
-        --annotations_out "${annotations_out}" 
         --noits '${noits}'
-        #if $burn is not None
-            --burn '${burn}'
+        --burn '${burn}'
+        --all_out '${all_out}'
+        #if $zs:
+            --zs '${zs}' '${zs.ext}'
+        #else:
+            --zs '' ''
         #end if
-        #if $all_out is not None
-            --all_out '${all_out}'
-        #end if
-        #if $zs_file is not None
-            --zs_out '${zs_file}'
+        #if $zs_out:
+            --zs_out '${zs_out}' '${zs_out.ext}'
+        #else:
+            --zs_out '' ''
         #end if
+        --output_dataset '${annotations_out}' '${annotations_out.ext}'
+
     ]]></command>
 
     <inputs>
-        <param label="Mapped isotope patterns" name="mapped_isotope_patterns" type="data" format="csv,tsv,tabular,parquet" help="pandas dataframe containing the MS1 data."/>
-        <param label="annotations" name="annotations" type="data" format="csv,tsv,tabular,parquet" help="pandas dataframe containing all the possible annotations for the measured features."/>
-        <param label="gibbs sampler iterations" name="noits" type="integer" value="1000" help="number of iterations if the Gibbs sampler to be run."/>
-        <param label="ignored iterations" name="burn" type="integer" optional="true" value="10" help="number of iterations to be ignored when computing posterior probabilities. If None, is set to 10% of total iterations."/>
-        <param label="return each iteration" name="all_out" type="boolean" value="false" optional="true" help="logical value. If true the list of assignments found in each iteration is returned by the function. Default False."/>
-        <param label="previous Gibbs sampler" name="zs_file" type="data" format="txt" optional="true" help="list of assignments computed in a previous run of the Gibbs sampler. Optional, default None."/>
-        <param label="output Gibbs parameter" name="gibbs_out" type="boolean" truevalue="true" falsevalue="false" help="list of assignments computed in a previous run of the Gibbs sampler. Optional, default None."/>
+        <expand macro="gibbs"/>
         
         <conditional name="integrating_mode">
             <param name="integrating_mode" type="select" label="integrating mode" help="select the integrating mode">
@@ -79,22 +77,20 @@
     </inputs>
 
     <outputs>
-        <data label="${tool.name} annotations on ${on_string}" name="annotations_out" format="csv,tsv,tabular,parquet"/>
-        <data label="${tool.name} zs on ${on_string}" name="zs_out" format="csv,tsv,tabular,parquet">
-            <filter>options['gibbs_out']</filter>
+        <data label="${tool.name} annotations on ${on_string}" name="annotations_out" format_source="mapped_isotope_patterns"/>
+        <data label="${tool.name} zs on ${on_string}" name="zs_out" format="txt">
+            <filter>options['all_out']</filter>
         </data>
     </outputs>
 
     <tests>
-        <test  expect_num_outputs="1">
-            <param name="mapped_isotope_patterns" value="mapped_isotope_patterns.csv"/>
+        <test  expect_num_outputs="2">
+            <param name="mapped_isotope_patterns" value="mapped_isotope_patterns.parquet"/>
             <param name="annotations" value="clean_annotations.csv"/>
             <!-- Not the best way to test, but the results are stochastic hence difficult to test-->
             <output name="annotations_out">
                 <assert_contents>
-                    <has_n_columns n="15" sep="," />
-                    <has_n_lines n="15" delta="5" />
-                    <has_line line="id,name,formula,adduct,m/z,charge,RT range,ppm,isotope pattern score,fragmentation pattern score,prior,post,post Gibbs,chi-square pval,peak_id" />
+                    <has_size value="9185" delta="100" />
                 </assert_contents>
             </output>
         </test>
diff --git a/tools/ipapy2/ipapy2_gibbs_sampler_add.py b/tools/ipapy2/ipapy2_gibbs_sampler_add.py
index 7c233785..d184ccf6 100644
--- a/tools/ipapy2/ipapy2_gibbs_sampler_add.py
+++ b/tools/ipapy2/ipapy2_gibbs_sampler_add.py
@@ -1,53 +1,52 @@
-import os
-
 import argparse
-import pandas as pd
 from ipaPy2 import ipa
+from utils import (
+    LoadDataAction,
+    StoreOutputAction,
+    LoadTextAction,
+    group_by_peak_id,
+    flattern_annotations,
+)
 
 
-def main(args):
-    df = pd.read_csv(args.mapped_isotope_patterns, keep_default_na=False)
-    df = df.replace("", None)
+def main(
+    mapped_isotope_patterns,
+    annotations_df,
+    noits,
+    burn,
+    delta_add,
+    all_out,
+    zs,
+    zs_out,
+    output_dataset,
+):
+    df = mapped_isotope_patterns
 
-    annotations_df = pd.read_csv(args.annotations, keep_default_na=False)
+    annotations_df = annotations_df
     annotations_df["post"] = annotations_df["post"].replace("", 0)
     annotations_df = annotations_df.replace("", None)
-    annotations = {}
-
-    grouped = annotations_df.groupby("peak_id")
-    for peak_id, group in grouped:
-        annotations[peak_id] = group.drop("peak_id", axis=1)
+    annotations = group_by_peak_id(annotations_df)
 
-    if args.zs and args.zs.lower() != "none" and os.path.isfile(args.zs):
-        zs = []
-        with open(args.zs, "r") as f:
-            for line in f:
-                zs.append(int(line.strip()))
-
-    else:
+    if not zs:
         zs = None
+
     zs = ipa.Gibbs_sampler_add(
         df,
         annotations,
-        noits=args.noits,
-        burn=args.burn,
-        delta_add=args.delta_add,
-        all_out=args.all_out,
+        noits=noits,
+        burn=burn,
+        delta_add=delta_add,
+        all_out=all_out,
         zs=zs,
     )
 
-    annotations_flat = pd.DataFrame()
-    for peak_id in annotations:
-        annotation = annotations[peak_id]
-        annotation["peak_id"] = peak_id
-        annotations_flat = pd.concat([annotations_flat, annotation])
+    annotations_flat = flattern_annotations(annotations)
+    write_func, file_path = output_dataset
+    write_func(annotations_flat, file_path)
 
-    annotations_flat.to_csv(args.annotations_out, index=False)
-
-    if args.all_out:
-        with open(args.zs_out, "w") as f:
-            for s in zs:
-                f.write(str(s) + "\n")
+    if all_out:
+        write_func, file_path = zs_out
+        write_func(zs, file_path)
 
 
 if __name__ == "__main__":
@@ -55,21 +54,18 @@ def main(args):
         description="cluster features before IPA pipeline."
     )
     parser.add_argument(
-        "--mapped_isotope_patterns",
-        type=str,
+        "--input_dataset_mapped_isotope_patterns",
+        nargs=2,
+        action=LoadDataAction,
         required=True,
-        help="A csv file containing the MS1 data. Ideally obtained from map_isotope_patterns",
+        help="A dataset containing the MS1 data. Ideally obtained from map_isotope_patterns",
     )
     parser.add_argument(
-        "--annotations",
-        type=str,
+        "--input_dataset_annotations",
+        nargs=2,
+        action=LoadDataAction,
         required=True,
-        help=""" a dictionary containing all the possible annotations for the
-                measured features. The keys of the dictionary are the unique
-                ids for the features present in df. For each feature, the
-                annotations are summarized in a A csv file. Output of
-                functions MS1annotation(), MS1annotation_Parallel(),
-                MSMSannotation() or MSMSannotation_Parallel""",
+        help="a dataset containing the annotations of the features.",
     )
     parser.add_argument(
         "--noits",
@@ -101,20 +97,33 @@ def main(args):
     )
     parser.add_argument(
         "--zs",
-        type=str,
+        nargs=2,
+        action=LoadTextAction,
         help="""a txt file containing the list of assignments computed in a previous run of the Gibbs sampler. 
         Optional, default None.""",
     )
     parser.add_argument(
         "--zs_out",
-        type=str,
-        default="gibbs_sample_add_zs.txt",
-        help="file name to save the list of assignments computed in the current run of the Gibbs sampler.",
+        nargs=2,
+        action=StoreOutputAction,
+        help="file to save the list of assignments computed in the current run of the Gibbs sampler.",
     )
     parser.add_argument(
-        "--annotations_out",
-        type=str,
-        default="gibbs_sample_add_annotations.csv",
+        "--output_dataset",
+        nargs=2,
+        action=StoreOutputAction,
+        required=True,
+        help="A file path for the output results from Gibbs Add.",
     )
     args = parser.parse_args()
-    main(args)
+    main(
+        args.input_dataset_mapped_isotope_patterns,
+        args.input_dataset_annotations,
+        args.noits,
+        args.burn,
+        args.delta_add,
+        args.all_out,
+        args.zs,
+        args.zs_out,
+        args.output_dataset,
+    )
diff --git a/tools/ipapy2/ipapy2_gibbs_sampler_add.xml b/tools/ipapy2/ipapy2_gibbs_sampler_add.xml
index 3008379c..2f64b371 100644
--- a/tools/ipapy2/ipapy2_gibbs_sampler_add.xml
+++ b/tools/ipapy2/ipapy2_gibbs_sampler_add.xml
@@ -5,40 +5,44 @@
     
     <requirements>
         <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+        <expand macro="extra_requirements"/>
     </requirements>
 
     <command detect_errors="exit_code"><![CDATA[
         python3 '${__tool_directory__}/ipapy2_gibbs_sampler_add.py'
-        --mapped_isotope_patterns '${mapped_isotope_patterns}'
-        --annotations '${annotations}'
+        --input_dataset_mapped_isotope_patterns '${mapped_isotope_patterns}' '${mapped_isotope_patterns.ext}'
+        --input_dataset_annotations '${annotations}' '${annotations.ext}'
         --noits '${noits}'
         --burn '${burn}'
         --delta_add '${delta_add}'
         --all_out '${all_out}'
-        --zs '${zs}'
-        --zs_out '${zs_out}'
-        --annotations_out '${annotations_out}'
+        #if $zs:
+            --zs '${zs}' '${zs.ext}'
+        #else:
+            --zs '' ''
+        #end if
+        #if $zs_out:
+            --zs_out '${zs_out}' '${zs_out.ext}'
+        #else:
+            --zs_out '' ''
+        #end if
+        --output_dataset '${annotations_out}' '${annotations_out.ext}'
     ]]></command>
 
     <inputs>
-        <param label="Mapped isotope patterns" name="mapped_isotope_patterns" type="data" format="csv,tsv,tabular,parquet" help="A csv file containing the MS1 data. Ideally obtained from map_isotope_patterns."/>
-        <param label="annotations" name="annotations" type="data" format="csv,tsv,tabular,parquet" help="pandas dataframe containing all the possible annotations for the measured features."/>
-        <param label="gibbs sampler iterations" name="noits" type="integer" value="1000" help="number of iterations if the Gibbs sampler to be run."/>
-        <param label="ignored iterations" name="burn" type="integer" optional="true" value="10" help="number of iterations to be ignored when computing posterior probabilities. If None, is set to 10% of total iterations."/>
+        <expand macro="gibbs"/>
         <param name="delta_add" type="float" value="1" min="0" label="adducts weight" help="parameter used when computing the conditional priors. The parameter must be positive. The smaller the parameter the more weight the adducts connections have on the posterior probabilities. Default 1." />
-        <param label="return each iteration" name="all_out" type="boolean" value="false" optional="true" help="logical value. If true the list of assignments found in each iteration is returned by the function. Default False."/>
-        <param label="previous Gibbs sampler" name="zs" type="data" format="txt" optional="true" help="list of assignments computed in a previous run of the Gibbs sampler. Optional, default None."/>
     </inputs>
 
     <outputs>
-        <data label="${tool.name} annotations on ${on_string}" name="annotations_out" format="csv,tsv,tabular,parquet"/>
-        <data label="${tool.name} zs on ${on_string}" name="zs_out" format="csv,tsv,tabular,parquet">
+        <data label="${tool.name} annotations on ${on_string}" name="annotations_out" format_source="mapped_isotope_patterns"/>
+        <data label="${tool.name} zs on ${on_string}" name="zs_out" format="txt">
             <filter>options['all_out']</filter>
         </data>
     </outputs>
 
     <tests>
-        <test  expect_num_outputs="1">
+        <test  expect_num_outputs="2">
             <param name="mapped_isotope_patterns" value="mapped_isotope_patterns.csv"/>
             <param name="annotations" value="clean_annotations.csv"/>
             <param name="noits" value="1000"/>
diff --git a/tools/ipapy2/ipapy2_map_isotope_patterns.xml b/tools/ipapy2/ipapy2_map_isotope_patterns.xml
index cf272073..2cb09328 100644
--- a/tools/ipapy2/ipapy2_map_isotope_patterns.xml
+++ b/tools/ipapy2/ipapy2_map_isotope_patterns.xml
@@ -5,6 +5,7 @@
     
     <requirements>
         <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+        <expand macro="extra_requirements"/>
     </requirements>
 
     <command detect_errors="exit_code"><![CDATA[
diff --git a/tools/ipapy2/macros.xml b/tools/ipapy2/macros.xml
index ab5242a6..63cac97a 100644
--- a/tools/ipapy2/macros.xml
+++ b/tools/ipapy2/macros.xml
@@ -67,4 +67,17 @@
             <help>multiplicative factor for the RT if measured RT is outside the RTrange present in the database.</help>
         </param>
     </xml>
+    <xml name="gibbs">
+        <param label="Mapped isotope patterns" name="mapped_isotope_patterns" type="data" format="csv,tsv,tabular,parquet" help="A csv, tsv, tabular or parquet file containing the MS1 data. Ideally obtained from map_isotope_patterns."/>
+        <param label="annotations" name="annotations" type="data" format="csv,tsv,tabular,parquet" help="pandas dataframe containing all the possible annotations for the measured features."/>
+        <param label="gibbs sampler iterations" name="noits" type="integer" value="1000" help="number of iterations of the Gibbs sampler to be run."/>
+        <param label="ignored iterations" name="burn" type="integer" optional="true" value="10" help="number of iterations to be ignored when computing posterior probabilities. If None, is set to 10% of total iterations."/>
+        <param label="output Gibbs parameter" name="all_out" type="boolean" value="false" optional="true" help="logical value. If true the list of assignments found in each iteration is returned by the function. Default False."/>
+        <param label="previous Gibbs sampler" name="zs" type="data" format="txt" optional="true" help="list of assignments computed in a previous run of the Gibbs sampler. Optional, default None."/>
+    </xml>
+
+    <xml name="extra_requirements">
+        <requirement type="package" version="19.0.0">pyarrow</requirement>
+        <requirement type="package" version="2024.11.0">fastparquet</requirement>
+    </xml>
 </macros>
\ No newline at end of file
diff --git a/tools/ipapy2/test-data/mapped_isotope_patterns.parquet b/tools/ipapy2/test-data/mapped_isotope_patterns.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..3ffae05a11889f7c7b36c2f6e824e3a3f62afbf9
GIT binary patch
literal 2710
zcmaJ@3s6+o89wLkz02Kouk0#!y&KUW16vYb;3HUy<X^Igp{Oe=@rB+6R(Z+-i}-41
z1r#YH9j(zcF-kBYR;?&w3O+~>MHAG<#+WdUtt(=q#u<yj$*3vQ|L&uen&i&h{r=ba
z&i|it&-vG8NsAGQSf>`xs>X>jifU0!$Yxb2LkN*6B^pW=YJynuP%yBw3>pJ9qcM+i
zdi1Z<P&H48O7ZU#c=mdbMnhOdn=SC<8{q}ksMQ$Q-wo05#3aaM0$MrE2$pQa;I%vk
ze}xvcCYU8-Q1~9-Y~e6DEVxmZg;<sdLKx{4B;ga5j1_>ypgKv|NG=OwgeI085~A23
z9nuU}=FAhOk;Q_ZO=r;J5&lKa2^O|W&`6|J;NvyHm@q!S^rNrpW@b>;lCNsAjuOL|
z1fI|YN)d>PiYy@=D6w=ZN|`8ebQDSrD5>dqloC-2^2Sx5#M4N>P2;tRJke4?hY1X!
zoB}!@2tF9<gJ8<dM4z&@%46^W@#VThJt)`yP1KjIXMIFhC7S`IKO6{ev1Y*1k)P|5
zdkf&@p4SfA*+MA0AD{Y_dNDj~SpA7Mc`>v`4Sgb2FNXfGw+7ssGGN@<rlEi7%z)0k
z>92Qk%b@blg}2%}vSHWiZ(hFIlMQ>8RaNb6c7Y+LtNS72fGfXF?7oiZ*f2Y~xx)o}
z7OY4aTd@)fV|ssMsg&Vn-$2chy)s<8{`;+&r)9|6v*eYCQYZB6xSO%lB13*#T==>e
zCm5!#o^Zr2!{u&OTD8Rqa)#61x>1IpVO>XR=Q<&->r_Dn<`)}W+}48mE#I^K{j-=~
zT-J9NHYXOtu?yCM+LU6rHAnlWt!c$DcKuREdE7>jPJQ&*&c(%WyXul<XU7U?Z?V=K
z=~)4v%^EGYFe~Bip?eido=+MdUsv+<@ZT3H>lgmQ<on8eZ6{N<DEi-?=pDy!Jx>fz
zUV94HBOzy;O~D&?;vTJSUJVob$4iR-^Skr!fuPuTduG7V)XDFB-a7+^xs&$Pj!1@!
z6Qv!yjmh5Z`tq&t_9fNcUlm%Ryzx^$sniM&4(8=u=}H1qVMgVci%IZ@4-YoxBi^7*
zo|$*;aI*AbP39dtoQ^E$eg3u`F2zUIoxf!VscH1CqkVQ*6cPL7BeW0sqAuz3KkX2E
zVe_}=7N*0zGfis^h%N5a#O;~s5HV0vzH(_gBx-jjt}3>{lH-3jx8&L2z0q4g=q$0p
znN1J+0d;9h-?i_YHhA1_Ey>BZ!R|wID}V7u3M{IJRPs&=j7yj^YxvC+=zZ}7H^NW`
zbs>=(?&`~6^RRNYqMy}&kjW$VM`~}jq5Zkun)TIR+aR>$QFKeU4GafP)TH*<;96P7
z>!}~k2kVsMmEDc=A?djZSJob%1B<@BVwih-4$KePAsQ{OLEq~9ml{+ycys?inzObH
zUI9}=+YjZ?aO=SSX?-?05>|Qs*j1N*clL3E&W_E0&Ixfpe$ivWV-^<k@LkI5+$HKY
z2<`8Wv?9Q6^EJhIXT~29NaK@v2=&iPR}kc}$F?DOf~PD*fE&lYL~viMs7I*(=;J(u
zuxeKt0=e2cm-4*qDBxtizAu@>@HO|p;3WR-Q`<S7rxVkap)I2pC}R%YyNDpChaW}o
zoSRvTK<=O4hz#{#f6BRa@`9Z<s@FkBT$d8z*3}?DY2$39np}Sl759|oO$hQ!t(y=?
zLsynE_`ySNUsO9OJgb!J@l5ADylh;=fll5sv6D(V_xRdsC0sYF5&=3Nxez?3Zz`3z
z8EYbfte%&G5N7FqmGaT`Q>G_-xhmOYi^z&=#LQHY7m+>Qdyi&1wVYbskXu^hDsnoM
zJI`6<a;$RY+6%%mogs005*l#g5jsxAH>k2ndS465fYCTYhj$!{ubQ%i;dDGp^qf9~
zA)!>o;JwN)8lLJXYtri(hB7LSrBXBo`vx&ouUwt2tXSn`_BXd(421m*wGKxmP(r9+
zFz_E^Bba93mr;@sMt-X&11L~}#fFGdDgyFoBQTi6C_j^MwsbM=sVc_)^i<IdR;MS-
zIP4W#A~L33Q;8&Gmu2N=XD%<z%5fChMa-X#WF%E?c4=gUPYx<^6io2zTE$nmu9S#g
zDQ$j0dUchHh&uG@Fxu|QEm~EYlk1EmK8j%fmV`O&E|;TZl|roqtFv<KB`X~mW|U2#
zraChT6<L!=Ox^@U#wZ(`)y8`&GaoPUv;56T;+4vk$4H`n#EOpp@dT0&aCI~0eif1h
z8l8ll)AuN3&szOtT3=FFd4TAo+I&HcME~spe4y(oM9*3W6XC)=MYKggqQG?`@k;wp
zE@*}5S?gdTT=xLcUpe!c;Yfr5-aE!8;W@G>M9*6NL^@L>(VGVEo0swdV?2zjWioFw
zg-Rh$pQOMcl9)`VahQ?+X|I0{-=6;eSKyuHAFOddHFu1}!~@&<)SiS#{fKpN;{I>Q
zOV&t@jnrH<%6u<0RHBn48e<GMvOWn%8SCE*G0JA1M5(R#)msw@ry|6L|49D>Bu%o#

literal 0
HcmV?d00001

diff --git a/tools/ipapy2/utils.py b/tools/ipapy2/utils.py
index 1eef4bec..13f32742 100644
--- a/tools/ipapy2/utils.py
+++ b/tools/ipapy2/utils.py
@@ -18,7 +18,7 @@ def __call__(self, parser, namespace, values, option_string=None):
         :param option_string: Option string
         :return: None
         """
-        
+
         file_path, file_extension = values
         file_extension = file_extension.lower()
         if file_extension == "csv":
@@ -34,6 +34,28 @@ def __call__(self, parser, namespace, values, option_string=None):
         setattr(namespace, self.dest, df)
 
 
+class LoadTextAction(argparse.Action):
+    """
+    Custom argparse action to load integer data from a text file.
+    """
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        """
+        Read one integer per non-blank line of a text file into the namespace.
+        :param parser: Argument parser (unused)
+        :param namespace: Namespace object receiving the list of integers
+        :param values: Tuple containing the file path and file extension
+        :param option_string: Option string (unused)
+        :return: None
+        """
+        file_path, _ = values
+        data = []
+        if file_path:
+            with open(file_path, "r") as f:
+                data = [int(line) for line in f if line.strip()]
+        setattr(namespace, self.dest, data)
+
+
 def write_csv(df: pd.DataFrame, file_path: str) -> None:
     """
     Write the dataframe to a CSV file.
@@ -67,6 +89,20 @@ def write_parquet(df: pd.DataFrame, file_path: str) -> None:
     df.to_parquet(file_path, index=False)
 
 
+def write_text(data: list, file_path: str) -> None:
+    """
+    Save each item of ``data`` on its own line of a text file.
+
+    Parameters:
+    data (list): Items to write; each one is converted with ``str``.
+    file_path (str): Destination path; nothing is written if it is empty.
+    """
+    if not file_path:
+        return
+    with open(file_path, "w") as out:
+        out.writelines(str(item) + "\n" for item in data)
+
+
 class StoreOutputAction(argparse.Action):
     def __call__(
         self,
@@ -92,6 +128,8 @@ def __call__(
             write_func = write_tsv
         elif file_extension == "parquet":
             write_func = write_parquet
+        elif file_extension == "txt":
+            write_func = write_text
         else:
             raise ValueError(f"Unsupported file format: {file_extension}")
         setattr(namespace, self.dest, (write_func, file_path))
@@ -113,3 +151,21 @@ def flattern_annotations(annotations: dict) -> pd.DataFrame:
         annotation["peak_id"] = peak_id
         annotations_flat = pd.concat([annotations_flat, annotation])
     return annotations_flat
+
+
+def group_by_peak_id(df: pd.DataFrame) -> dict:
+    """
+    Split a dataframe into one sub-dataframe per distinct 'peak_id' value.
+    The 'peak_id' column itself is dropped from every sub-dataframe.
+
+    Parameters:
+    df (pd.DataFrame): The input dataframe; must have a 'peak_id' column.
+
+    Returns:
+    dict: Mapping of each 'peak_id' to the rows that carry it.
+    """
+    peak_ids = df["peak_id"]
+    return {
+        peak: df.loc[peak_ids == peak].drop(columns="peak_id")
+        for peak in set(peak_ids)
+    }