From b3b9e0440dd5d7d8fe9c49007c1b98629f1878eb Mon Sep 17 00:00:00 2001 From: acquayefrank Date: Fri, 31 Jan 2025 16:12:10 +0100 Subject: [PATCH] finished refactoring --- tools/ipapy2/ipapy2_MS1_annotation.xml | 1 + tools/ipapy2/ipapy2_MS2_annotation.xml | 8 +- tools/ipapy2/ipapy2_clustering.xml | 1 + tools/ipapy2/ipapy2_compute_all_adducts.xml | 1 + tools/ipapy2/ipapy2_compute_bio.py | 81 +++++--- tools/ipapy2/ipapy2_compute_bio.xml | 10 +- tools/ipapy2/ipapy2_gibbs_sampler.py | 185 +++++++++++------- tools/ipapy2/ipapy2_gibbs_sampler.xml | 52 +++-- tools/ipapy2/ipapy2_gibbs_sampler_add.py | 115 ++++++----- tools/ipapy2/ipapy2_gibbs_sampler_add.xml | 32 +-- tools/ipapy2/ipapy2_map_isotope_patterns.xml | 1 + tools/ipapy2/macros.xml | 13 ++ .../test-data/mapped_isotope_patterns.parquet | Bin 0 -> 2710 bytes tools/ipapy2/utils.py | 58 +++++- 14 files changed, 356 insertions(+), 202 deletions(-) create mode 100644 tools/ipapy2/test-data/mapped_isotope_patterns.parquet diff --git a/tools/ipapy2/ipapy2_MS1_annotation.xml b/tools/ipapy2/ipapy2_MS1_annotation.xml index 7c87ce0d..f20e4c1a 100644 --- a/tools/ipapy2/ipapy2_MS1_annotation.xml +++ b/tools/ipapy2/ipapy2_MS1_annotation.xml @@ -5,6 +5,7 @@ ipapy2 + ipapy2 + - + + + + + + diff --git a/tools/ipapy2/ipapy2_clustering.xml b/tools/ipapy2/ipapy2_clustering.xml index 356f9d55..d918a3e6 100644 --- a/tools/ipapy2/ipapy2_clustering.xml +++ b/tools/ipapy2/ipapy2_clustering.xml @@ -4,6 +4,7 @@ ipapy2 + ipapy2 + ipapy2 + @@ -37,7 +39,7 @@ - + diff --git a/tools/ipapy2/ipapy2_gibbs_sampler.py b/tools/ipapy2/ipapy2_gibbs_sampler.py index 953b7c17..e7a2f8d5 100644 --- a/tools/ipapy2/ipapy2_gibbs_sampler.py +++ b/tools/ipapy2/ipapy2_gibbs_sampler.py @@ -1,80 +1,78 @@ import argparse -import pandas as pd from ipaPy2 import ipa +from utils import ( + LoadDataAction, + StoreOutputAction, + LoadTextAction, + group_by_peak_id, + flattern_annotations, +) -def main(args): - df = pd.read_csv(args.mapped_isotope_patterns, keep_default_na=False) - df = df.replace("", None) - - annotations_df = pd.read_csv(args.annotations, keep_default_na=False) +def main( + input_dataset_mapped_isotope_patterns, + input_dataset_annotations, + integrating_mode, + input_dataset_bio, + noits, + burn, + delta_bio, + delta_add, + all_out, + zs, + zs_out, + output_dataset, +): + annotations_df = input_dataset_annotations annotations_df["post"] = annotations_df["post"].replace("", 0) annotations_df = annotations_df.replace("", None) - annotations = {} - - grouped = annotations_df.groupby("peak_id") - for peak_id, group in grouped: - annotations[peak_id] = group.drop("peak_id", axis=1) + annotations = group_by_peak_id(annotations_df) - if args.zs: - zs = [] - with open(args.zs, "r") as f: - for line in f: - zs.append(int(line.strip())) - - else: + if not zs: zs = None - if args.integrating_mode == "adducts": + if integrating_mode == "adducts": zs = ipa.Gibbs_sampler_add( - df, + input_dataset_mapped_isotope_patterns, annotations, - noits=args.noits, - burn=args.burn, - delta_add=args.delta_add, - all_out=args.all_out, + noits=noits, + burn=burn, + delta_add=delta_add, + all_out=all_out, zs=zs, ) else: - - Bio = pd.read_csv(args.Bio, keep_default_na=False) - if args.integrating_mode == "biochemical": zs = ipa.Gibbs_sampler_bio( - df, + input_dataset_mapped_isotope_patterns, annotations, - Bio=Bio, - noits=args.noits, - burn=args.burn, - delta_bio=args.delta_bio, - all_out=args.all_out, + Bio=input_dataset_bio, + noits=noits, + burn=burn, + delta_bio=delta_bio, + all_out=all_out, zs=zs, ) else: zs = ipa.Gibbs_sampler_bio_add( - df, + input_dataset_mapped_isotope_patterns, annotations, - Bio=Bio, - noits=args.noits, - burn=args.burn, - delta_bio=args.delta_bio, - delta_add=args.delta_add, - all_out=args.all_out, + Bio=input_dataset_bio, + noits=noits, + burn=burn, + delta_bio=delta_bio, + delta_add=delta_add, + all_out=all_out, zs=zs, ) - annotations_flat = pd.DataFrame() - for peak_id in annotations: - annotation = annotations[peak_id] - annotation["peak_id"] = peak_id - annotations_flat = pd.concat([annotations_flat, annotation]) + annotations_flat = flattern_annotations(annotations) + write_func, file_path = output_dataset + write_func(annotations_flat, file_path) - annotations_flat.to_csv(args.annotations_out, index=False) - - if args.gibbs_out: - with open(args.zs_out, "w") as f: - for s in zs: - f.write(str(s) + "\n") + if args.all_out: + write_func, file_path = zs_out + write_func(zs, file_path) if __name__ == "__main__": @@ -82,55 +80,100 @@ def main(args): description="cluster features before IPA pipeline." ) parser.add_argument( - "--mapped_isotope_patterns", - type=str, + "--input_dataset_mapped_isotope_patterns", + nargs=2, + action=LoadDataAction, required=True, help="a dataframe containing the measured intensities across several samples.", ) parser.add_argument( - "--annotations", - type=str, + "--input_dataset_annotations", + nargs=2, + action=LoadDataAction, required=True, - help="Default value 0.8. Minimum correlation allowed in each cluster.", + help="a datset containing the annotations of the features.", ) parser.add_argument( "--integrating_mode", type=str, required=True, - help="Default value 0.8. Minimum correlation allowed in each cluster.", + choices=["adducts", "biochemical", "biochemical_adducts"], + help="The mode of integration. Options are 'adducts', 'biochemical', or 'biochemical_adducts'.", ) parser.add_argument( - "--Bio", type=str, help="intensity mode. Default 'max' or 'ave'." + "--input_dataset_bio", + nargs=2, + action=LoadDataAction, + type=str, + help="""dataframe (2 columns), reporting all the possible connections between + compounds. It uses the unique ids from the database. It could be the + output of Compute_Bio() or Compute_Bio_Parallel()""", ) parser.add_argument( "--noits", type=int, - help="Default value 1. Maximum difference in RT time between features in the same cluster.", + help="number of iterations if the Gibbs sampler to be run", ) parser.add_argument( - "--burn", type=int, help="intensity mode. Default 'max' or 'ave'." + "--burn", + type=int, + help="""number of iterations to be ignored when computing posterior + probabilities. If None, is set to 10% of total iterations""", ) parser.add_argument( - "--delta_bio", type=float, help="intensity mode. Default 'max' or 'ave'." + "--delta_bio", + type=float, + help="""parameter used when computing the conditional priors. The + parameter must be positive. The smaller the parameter the more + weight the adducts connections have on the posterior + probabilities. Default 1.""", ) parser.add_argument( - "--delta_add", type=float, help="intensity mode. Default 'max' or 'ave'." + "--delta_add", + type=float, + help=""" parameter used when computing the conditional priors. The + parameter must be positive. The smaller the parameter the more + weight the adducts connections have on the posterior + probabilities. Default 1.""", ) parser.add_argument( - "--all_out", type=str, help="intensity mode. Default 'max' or 'ave'." + "--all_out", + type=str, + help="""logical value. If true the list of assignments found in each + iteration is returned by the function. Default False.""", ) parser.add_argument( - "--zs", type=str, help="intensity mode. Default 'max' or 'ave'." + "--zs", + nargs=2, + action=LoadTextAction, + help="""a txt file containing the list of assignments computed in a previous run of the Gibbs sampler. + Optional, default None.""", ) parser.add_argument( - "--gibbs_out", type=str, help="intensity mode. Default 'max' or 'ave'." + "--zs_out", + nargs=2, + action=StoreOutputAction, + help="file to save the list of assignments computed in the current run of the Gibbs sampler.", ) parser.add_argument( - "--annotations_out", - type=str, - default="gibbs_sample_annotations.csv", - help="a dataframe of clustered features.", + "--output_dataset", + nargs=2, + action=StoreOutputAction, + required=True, + help="A file path for the output results from Gibbs Add.", ) - parser.add_argument("--zs_out", type=str, help="a dataframe of clustered features.") args = parser.parse_args() - main(args) + main( + args.input_dataset_mapped_isotope_patterns, + args.input_dataset_annotations, + args.integrating_mode, + args.input_dataset_bio, + args.noits, + args.burn, + args.delta_bio, + args.delta_add, + args.all_out, + args.zs, + args.zs_out, + args.output_dataset, + ) diff --git a/tools/ipapy2/ipapy2_gibbs_sampler.xml b/tools/ipapy2/ipapy2_gibbs_sampler.xml index 9012e8a9..5dfd1dda 100644 --- a/tools/ipapy2/ipapy2_gibbs_sampler.xml +++ b/tools/ipapy2/ipapy2_gibbs_sampler.xml @@ -5,45 +5,43 @@ ipapy2 + - - - - - - - + @@ -79,22 +77,20 @@ - - - options['gibbs_out'] + + + options['all_out'] - - + + - - - + diff --git a/tools/ipapy2/ipapy2_gibbs_sampler_add.py b/tools/ipapy2/ipapy2_gibbs_sampler_add.py index 7c233785..d184ccf6 100644 --- a/tools/ipapy2/ipapy2_gibbs_sampler_add.py +++ b/tools/ipapy2/ipapy2_gibbs_sampler_add.py @@ -1,53 +1,52 @@ -import os - import argparse -import pandas as pd from ipaPy2 import ipa +from utils import ( + LoadDataAction, + StoreOutputAction, + LoadTextAction, + group_by_peak_id, + flattern_annotations, +) -def main(args): - df = pd.read_csv(args.mapped_isotope_patterns, keep_default_na=False) - df = df.replace("", None) +def main( + mapped_isotope_patterns, + annotations_df, + noits, + burn, + delta_add, + all_out, + zs, + zs_out, + output_dataset, +): + df = mapped_isotope_patterns - annotations_df = pd.read_csv(args.annotations, keep_default_na=False) + annotations_df = annotations_df annotations_df["post"] = annotations_df["post"].replace("", 0) annotations_df = annotations_df.replace("", None) - annotations = {} - - grouped = annotations_df.groupby("peak_id") - for peak_id, group in grouped: - annotations[peak_id] = group.drop("peak_id", axis=1) + annotations = group_by_peak_id(annotations_df) - if args.zs and args.zs.lower() != "none" and os.path.isfile(args.zs): - zs = [] - with open(args.zs, "r") as f: - for line in f: - zs.append(int(line.strip())) - - else: + if not zs: zs = None + zs = ipa.Gibbs_sampler_add( df, annotations, - noits=args.noits, - burn=args.burn, - delta_add=args.delta_add, - all_out=args.all_out, + noits=noits, + burn=burn, + delta_add=delta_add, + all_out=all_out, zs=zs, ) - annotations_flat = pd.DataFrame() - for peak_id in annotations: - annotation = annotations[peak_id] - annotation["peak_id"] = peak_id - annotations_flat = pd.concat([annotations_flat, annotation]) + annotations_flat = flattern_annotations(annotations) + write_func, file_path = output_dataset + write_func(annotations_flat, file_path) - annotations_flat.to_csv(args.annotations_out, index=False) - - if args.all_out: - with open(args.zs_out, "w") as f: - for s in zs: - f.write(str(s) + "\n") + if all_out: + write_func, file_path = zs_out + write_func(zs, file_path) if __name__ == "__main__": @@ -55,21 +54,18 @@ def main(args): description="cluster features before IPA pipeline." ) parser.add_argument( - "--mapped_isotope_patterns", - type=str, + "--input_dataset_mapped_isotope_patterns", + nargs=2, + action=LoadDataAction, required=True, - help="A csv file containing the MS1 data. Ideally obtained from map_isotope_patterns", + help="A dataset containing the MS1 data. Ideally obtained from map_isotope_patterns", ) parser.add_argument( - "--annotations", - type=str, + "--input_dataset_annotations", + nargs=2, + action=LoadDataAction, required=True, - help=""" a dictionary containing all the possible annotations for the - measured features. The keys of the dictionary are the unique - ids for the features present in df. For each feature, the - annotations are summarized in a A csv file. Output of - functions MS1annotation(), MS1annotation_Parallel(), - MSMSannotation() or MSMSannotation_Parallel""", + help="a datset containing the annotations of the features.", ) parser.add_argument( "--noits", @@ -101,20 +97,33 @@ def main(args): ) parser.add_argument( "--zs", - type=str, + nargs=2, + action=LoadTextAction, help="""a txt file containing the list of assignments computed in a previous run of the Gibbs sampler. Optional, default None.""", ) parser.add_argument( "--zs_out", - type=str, - default="gibbs_sample_add_zs.txt", - help="file name to save the list of assignments computed in the current run of the Gibbs sampler.", + nargs=2, + action=StoreOutputAction, + help="file to save the list of assignments computed in the current run of the Gibbs sampler.", ) parser.add_argument( - "--annotations_out", - type=str, - default="gibbs_sample_add_annotations.csv", + "--output_dataset", + nargs=2, + action=StoreOutputAction, + required=True, + help="A file path for the output results from Gibbs Add.", ) args = parser.parse_args() - main(args) + main( + args.input_dataset_mapped_isotope_patterns, + args.input_dataset_annotations, + args.noits, + args.burn, + args.delta_add, + args.all_out, + args.zs, + args.zs_out, + args.output_dataset, + ) diff --git a/tools/ipapy2/ipapy2_gibbs_sampler_add.xml b/tools/ipapy2/ipapy2_gibbs_sampler_add.xml index 3008379c..2f64b371 100644 --- a/tools/ipapy2/ipapy2_gibbs_sampler_add.xml +++ b/tools/ipapy2/ipapy2_gibbs_sampler_add.xml @@ -5,40 +5,44 @@ ipapy2 + - - - - + - - - - + + options['all_out'] - + diff --git a/tools/ipapy2/ipapy2_map_isotope_patterns.xml b/tools/ipapy2/ipapy2_map_isotope_patterns.xml index cf272073..2cb09328 100644 --- a/tools/ipapy2/ipapy2_map_isotope_patterns.xml +++ b/tools/ipapy2/ipapy2_map_isotope_patterns.xml @@ -5,6 +5,7 @@ ipapy2 + multiplicative factor for the RT if measured RT is outside the RTrange present in the database. + + + + + + + + + + + pyarrow + fastparquet + \ No newline at end of file diff --git a/tools/ipapy2/test-data/mapped_isotope_patterns.parquet b/tools/ipapy2/test-data/mapped_isotope_patterns.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3ffae05a11889f7c7b36c2f6e824e3a3f62afbf9 GIT binary patch literal 2710 zcmaJ@3s6+o89wLkz02Kouk0#!y&KUW16vYb;3HUyMHAG<#+WdUtt(=q#u`xs>X>jifU0!$Yxb2LkN*6B^pW=YJynuP%yBw3>pJ9qcM+i zdi1ZypgKv|NG=OwgeI085~A23 z9nuU}=FAhOk;Q_ZO=r;J5&lKa2^O|W&`6|J;NvyHm@q!S^rNrpW@b>;lCNsAjuOL| z1fI|YN)d>PiYy@=D6w=ZN|`8ebQDSrD5>dqloC-2^2Sx5#M4N>P2;tRJke4?hY1X! zoB}!@2tF9v`4Sgb2FNXfGw+7ssGGN@92Qk%b@blg}2%}vSHWiZ(hFIlMQ>8RaNb6c7Y+LtNS72fGfXF?7oiZ*f2Y~xx)o} z7OY4aTd@)fV|ssMsg&Vn-$2chy)s<8{`;+&r)9|6v*eYCQYZB6xSO%lB13*#T==>e zCm5!#o^Zr2!{u&OTD8Rqa)#61x>1IpVO>XR=Q<&->r_Dn<`)}W+}48mE#I^K{j-=~ zT-J9NHYXOtu?yCM+LU6rHAnlWt!c$DcKuREdE7>jPJQ&*&c(%WyXullgmQfz zUV94HBOzy;O~D&?;vTJSUJVob$4iR-^Skr!fuPuTduG7V)XDFB-a7+^xs&$Pj!1@! z6Qv!yjmh5Z`tq&t_9fNcUlm%Ryzx^$sniM&4(8=u=}H1qVMgVci%IZ@4-YoxBi^7* zo|$*;aI*AbP39dtoQ^E$eg3u`F2zUIoxf!VscH1CqkVQ*6cPL7BeW0sqAuz3KkX2E zVe_}=7N*0zGfis^h%N5a#O;~s5HV0vzH(_gBx-jjt}3>{lH-3jx8&L2z0q4g=q$0p znN1J+0d;9h-?i_YHhA1_Ey>BZ!R|wID}V7u3M{IJRPs&=j7yj^YxvC+=zZ}7H^NW` zbs>=(?&`~6^RRNYqMy}&kjW$VM`~}jq5Zkun)TIR+aR>$QFKeU4GafP)TH*<;96P7 z>!}~k2kVsMmEDc=A?djZSJob%1B<@BVwih-4$KePAsQ{OLEq~9ml{+ycys?inzObH zUI9}=+YjZ?aO=SSX?-?05>|Qs*j1N*clL3E&W_E0&Ixfpe$ivWV-^@HO|p;3WR-Q`G_-xhmOYi^z&=#LQHY7m+>Qdyi&1wVYbskXu^hDsnoM zJI`6k2ndS465fYCTYhj$!{ubQ%i;dDGp^qf9~ zA)!>o;JwN)8lLJXYtri(hB7LSrBXBo`vx&ouUwt2tXSn`_BXd(421m*wGKxmP(r9+ zFz_E^Bba93mr;@sMt-X&11L~}#fFGdDgyFoBQTi6C_j^MwsbM=sVc_)^iM9*6NL^@L>(VGVEo0swdV?2zjWioFw zg-Rh$pQOMcl9)`VahQ?+X|I0{-=6;eSKyuHAFOddHFu1}!~@&<)SiS#{fKpN;{I>Q zOV&t@jnrH<%6u<0RHBn48eBu%o# literal 0 HcmV?d00001 diff --git a/tools/ipapy2/utils.py b/tools/ipapy2/utils.py index 1eef4bec..13f32742 100644 --- a/tools/ipapy2/utils.py +++ b/tools/ipapy2/utils.py @@ -18,7 +18,7 @@ def __call__(self, parser, namespace, values, option_string=None): :param option_string: Option string :return: None """ - + file_path, file_extension = values file_extension = file_extension.lower() if file_extension == "csv": @@ -34,6 +34,28 @@ def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, df) +class LoadTextAction(argparse.Action): + """ + Custom argparse action to load data from a text file. + """ + + def __call__(self, parser, namespace, values, option_string=None): + """ + Load data from a text file and store it in the namespace. + :param namespace: Namespace object + :param values: Tuple containing the file path and file extension + :param option_string: Option string + :return: None + """ + file_path, _ = values + data = [] + if file_path: + with open(file_path, "r") as f: + for line in f: + data.append(int(line.strip())) + setattr(namespace, self.dest, data) + + def write_csv(df: pd.DataFrame, file_path: str) -> None: """ Write the dataframe to a CSV file. @@ -67,6 +89,20 @@ def write_parquet(df: pd.DataFrame, file_path: str) -> None: df.to_parquet(file_path, index=False) +def write_text(data: list, file_path: str) -> None: + """ + Write the data to a text file. + + Parameters: + data (list): The data to write. + file_path (str): The path to the output text file. + """ + if file_path: + with open(file_path, "w") as f: + for s in data: + f.write(str(s) + "\n") + + class StoreOutputAction(argparse.Action): def __call__( self, @@ -92,6 +128,8 @@ def __call__( write_func = write_tsv elif file_extension == "parquet": write_func = write_parquet + elif file_extension == "txt": + write_func = write_text else: raise ValueError(f"Unsupported file format: {file_extension}") setattr(namespace, self.dest, (write_func, file_path)) @@ -113,3 +151,21 @@ def flattern_annotations(annotations: dict) -> pd.DataFrame: annotation["peak_id"] = peak_id annotations_flat = pd.concat([annotations_flat, annotation]) return annotations_flat + + +def group_by_peak_id(df: pd.DataFrame) -> dict: + """ + Convert a pandas dataframe to a dictionary where each key is a unique 'peak_id' + and each value is a dataframe subset corresponding to that 'peak_id'. + + Parameters: + df (pd.DataFrame): The input dataframe. + + Returns: + dict: The dictionary representation of the dataframe. + """ + annotations = {} + keys = set(df["peak_id"]) + for i in keys: + annotations[i] = df[df["peak_id"] == i].drop("peak_id", axis=1) + return annotations