biocore · fedarko · May 13, 2019 · May 11, 2019 · May 11, 2019 · May 12, 2019
diff --git a/README.md b/README.md
@@ -48,6 +48,15 @@ of these characters. (Eventually rankratioviz should be able to handle this
 accordingly, but in the meantime this is a necessary fix.) See
 [this issue](https://github.com/fedarko/rankratioviz/issues/66) for context.
 
+### Integration with metabolomics feature metadata
+
+If you have a GNPS feature metadata file (i.e. a file containing `parent mass`
+and `RTConsensus` columns), you can pass the `-gnps`
+(`--assume-gnps-feature-metadata`) flag to rankratioviz' standalone script so
+that rankratioviz interprets the metadata file accordingly. **Please note that
+this functionality is experimental**; furthermore, it is not yet available in
+the QIIME 2 plugin version of rankratioviz.
+
 ### Tutorials
 
 Examples of using rankratioviz (both inside and outside of QIIME 2) are

diff --git a/example_notebooks/DEICODE_sleep_apnea/deicode_example.ipynb b/example_notebooks/DEICODE_sleep_apnea/deicode_example.ipynb
@@ -185,7 +185,11 @@
       "                                  tables exceeding 1 million entries), for\r\n",
       "                                  which running rankratioviz normally might\r\n",
       "                                  take a long amount of time or crash due to\r\n",
-      "                                  memory limits.  [optional]\r\n",
+      "                                  memory limits. Additionally, following this\r\n",
+      "                                  feature-filtering step, all \"empty\" samples\r\n",
+      "                                  (i.e. those containing zeroes for every\r\n",
+      "                                  remaining feature) will be removed from the\r\n",
+      "                                  visualization.  [optional]\r\n",
       "  --o-visualization VISUALIZATION PATH\r\n",
       "                                  [required if not passing --output-dir]\r\n",
       "  --output-dir DIRECTORY          Output unspecified results to a directory\r\n",
@@ -311,6 +315,20 @@
       "                                  (i.e. those containing zeroes for every\r\n",
       "                                  remaining feature) will be removed from the\r\n",
       "                                  visualization.\r\n",
+      "  -gnps, --assume-gnps-feature-metadata\r\n",
+      "                                  If specified, rankratioviz will assume that\r\n",
+      "                                  the input feature metadata was obtained from\r\n",
+      "                                  GNPS. This means that rankratioviz will read\r\n",
+      "                                  each feature's ID as \"A;B\", where A is the\r\n",
+      "                                  mass-to-charge ratio of the feature\r\n",
+      "                                  (corresponding to the \"parent mass\" column\r\n",
+      "                                  in the feature metadata) and B is the\r\n",
+      "                                  discharge time of the feature (corresponding\r\n",
+      "                                  to the \"RTConsensus\" column in the feature\r\n",
+      "                                  metadata). rankratioviz will then only\r\n",
+      "                                  annotate feature IDs with their\r\n",
+      "                                  corresponding \"LibraryID\" column in the\r\n",
+      "                                  feature metadata file.\r\n",
       "  -v, --verbose                   If passed, this will output debug messages.\r\n",
       "  --help                          Show this message and exit.\r\n"
      ]

diff --git a/example_notebooks/songbird_red_sea/songbird_example.ipynb b/example_notebooks/songbird_red_sea/songbird_example.ipynb
@@ -153,7 +153,11 @@
       "                                  tables exceeding 1 million entries), for\r\n",
       "                                  which running rankratioviz normally might\r\n",
       "                                  take a long amount of time or crash due to\r\n",
-      "                                  memory limits.  [optional]\r\n",
+      "                                  memory limits. Additionally, following this\r\n",
+      "                                  feature-filtering step, all \"empty\" samples\r\n",
+      "                                  (i.e. those containing zeroes for every\r\n",
+      "                                  remaining feature) will be removed from the\r\n",
+      "                                  visualization.  [optional]\r\n",
       "  --o-visualization VISUALIZATION PATH\r\n",
       "                                  [required if not passing --output-dir]\r\n",
       "  --output-dir DIRECTORY          Output unspecified results to a directory\r\n",
@@ -222,8 +226,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2019-05-06 20:25:14.969262: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX\r\n",
-      "2019-05-06 20:25:14.969641: I tensorflow/core/common_runtime/process_util.cc:71] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.\r\n",
+      "2019-05-13 01:10:05.637409: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX\r\n",
+      "2019-05-13 01:10:05.637786: I tensorflow/core/common_runtime/process_util.cc:71] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.\r\n",
       "WARNING:tensorflow:From /Users/mfedarko/Software/forks/songbird/songbird/multinomial.py:75: multinomial (from tensorflow.python.ops.random_ops) is deprecated and will be removed in a future version.\r\n",
       "Instructions for updating:\r\n",
       "Use tf.random.categorical instead.\r\n",
@@ -242,7 +246,7 @@
       "WARNING:tensorflow:From /anaconda3/envs/q2-2019.1/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\r\n",
       "Instructions for updating:\r\n",
       "Use tf.cast instead.\r\n",
-      "100%|██████████████████████████████████████| 8000/8000 [00:12<00:00, 616.12it/s]\r\n"
+      "100%|██████████████████████████████████████| 8000/8000 [00:13<00:00, 590.29it/s]\r\n"
      ]
     }
    ],
@@ -306,6 +310,20 @@
       "                                  (i.e. those containing zeroes for every\r\n",
       "                                  remaining feature) will be removed from the\r\n",
       "                                  visualization.\r\n",
+      "  -gnps, --assume-gnps-feature-metadata\r\n",
+      "                                  If specified, rankratioviz will assume that\r\n",
+      "                                  the input feature metadata was obtained from\r\n",
+      "                                  GNPS. This means that rankratioviz will read\r\n",
+      "                                  each feature's ID as \"A;B\", where A is the\r\n",
+      "                                  mass-to-charge ratio of the feature\r\n",
+      "                                  (corresponding to the \"parent mass\" column\r\n",
+      "                                  in the feature metadata) and B is the\r\n",
+      "                                  discharge time of the feature (corresponding\r\n",
+      "                                  to the \"RTConsensus\" column in the feature\r\n",
+      "                                  metadata). rankratioviz will then only\r\n",
+      "                                  annotate feature IDs with their\r\n",
+      "                                  corresponding \"LibraryID\" column in the\r\n",
+      "                                  feature metadata file.\r\n",
       "  -v, --verbose                   If passed, this will output debug messages.\r\n",
       "  --help                          Show this message and exit.\r\n"
      ]

diff --git a/rankratioviz/_metadata_utils.py b/rankratioviz/_metadata_utils.py
@@ -7,6 +7,7 @@
 # The full license is in the file LICENSE.txt, distributed with this software.
 # ----------------------------------------------------------------------------
 
+import logging
 import pandas as pd
 
 
@@ -31,3 +32,96 @@ def read_metadata_file(md_file_loc):
         metadata_df = metadata_df.astype(type_conv_dict)
 
     return metadata_df
+
+
+def get_truncated_feature_id(full_feature_id):
+    """Computes a truncated GNPS feature ID from a full GNPS feature ID.
+
+       This function was originally contained in a Jupyter Notebook for
+       processing this sort of data written by Jamie Morton and
+       Julia Gauglitz.
+    """
+    mz, rt = list(map(float, full_feature_id.split(";")))
+    return "{:.4f};{:.4f}".format(mz, rt)
+
+
+def read_gnps_feature_metadata_file(md_file_loc, feature_ranks_df):
+    """Reads in a GNPS feature metadata file, producing a sane DataFrame.
+
+       Also requires a DataFrame describing feature ranks as input. This is so
+       that we can match up the feature rank IDs in the ranks and BIOM table
+       with rows in the GNPS metadata file -- the precision of the numbers from
+       which GNPS feature IDs are computed varies between the ranks/BIOM table
+       and the actual numbers contained in the GNPS metadata file.
+    """
+    # Note that we don't set index_col = 0 -- the columns we care about
+    # ("parent mass", "RTConsensus", and "LibraryID"), as far as I know, don't
+    # have a set position. So we'll just use the basic RangeIndex that pandas
+    # defaults to.
+    metadata_df = pd.read_csv(md_file_loc, sep="\t")
+
+    # Create a feature ID column from the parent mass and RTConsensus cols.
+    # Use of .map() here is derived from
+    # https://stackoverflow.com/a/22276757/10730311.
+    metadata_df["rankratioviz_trunc_feature_id"] = (
+        metadata_df["parent mass"].map("{:.4f}".format)
+        + ";"
+        + metadata_df["RTConsensus"].map("{:.4f}".format)
+    )
+
+    # Go through feature rank index, and for each create a mapping of
+    # (truncated feature ID) -> (full feature ID). Then use that mapping to
+    # create a new column of "rankratioviz_full_feature_id" in metadata_df.
+    # NOTE that if there are indistinguishable truncated IDs, a warning is
+    # logged and the features involved are left unannotated.
+    truncated_id_to_full_id = {}
+    for fid in feature_ranks_df.index:
+        tfid = get_truncated_feature_id(fid)
+        if tfid not in truncated_id_to_full_id:
+            truncated_id_to_full_id[tfid] = fid
+        else:
+            logging.warning(
+                "Indistinguishable rows in GNPS feature "
+                "metadata file with truncated ID {}.".format(tfid)
+            )
+            # Replace the full feature ID with a bogus ID. This prevents any
+            # of the >= 2 full feature IDs that share this truncated ID from
+            # getting annotated with anything -- better to annotate less than
+            # to annotate incorrectly.
+            truncated_id_to_full_id[tfid] = "rankratioviz_matching_conflict"
+
+    metadata_df["rankratioviz_full_feature_id"] = metadata_df[
+        "rankratioviz_trunc_feature_id"
+    ].apply(lambda tfid: truncated_id_to_full_id[tfid])
+
+    # Remove all rows in the metadata df with bogus full feature IDs,
+    # which will let us use verify_integrity=True when setting index below.
+    # There's definitely a faster way to do this than using iterrows(), but
+    # this at least works.
+    indices_to_remove = []
+    for idx, row in metadata_df.iterrows():
+        if (
+            row["rankratioviz_full_feature_id"]
+            == "rankratioviz_matching_conflict"
+        ):
+            indices_to_remove.append(idx)
+
+    metadata_df.drop(index=indices_to_remove, inplace=True)
+
+    # Set the full feature ID column as the actual index of the DataFrame. If
+    # there are any duplicates (due to two features having the same
+    # mass-to-charge ratio and discharge time), our use of verify_integrity
+    # here will raise an error accordingly. (That almost certainly won't
+    # happen, since we already look for indistinguishable truncated feature
+    # IDs above, but best to be safe until this function is more rigorously
+    # tested.)
+    metadata_df.set_index(
+        "rankratioviz_full_feature_id", verify_integrity=True, inplace=True
+    )
+
+    # Remove all the feature metadata that we don't care about (now, at least).
+    # After this, metadata_df only contains the full feature ID index we
+    # constructed and the LibraryID column, so it's ready to be used to
+    # annotate feature IDs from the ranks DataFrame.
+    metadata_df = metadata_df.filter(items=["LibraryID"])
+    return metadata_df
diff --git a/rankratioviz/_parameter_descriptions.py b/rankratioviz/_parameter_descriptions.py
@@ -22,3 +22,14 @@
     "(i.e. those containing zeroes for every remaining feature) will be "
     "removed from the visualization."
 )
+
+ASSUME_GNPS_FEATURE_METADATA = (
+    "If specified, rankratioviz will assume that the input feature metadata "
+    "was obtained from GNPS. This means that rankratioviz will read each "
+    'feature\'s ID as "A;B", where A is the mass-to-charge ratio of the '
+    'feature (corresponding to the "parent mass" column in the feature '
+    "metadata) and B is the discharge time of the feature (corresponding to "
+    'the "RTConsensus" column in the feature metadata). rankratioviz will '
+    'then only annotate feature IDs with their corresponding "LibraryID" '
+    "column in the feature metadata file."
+)
diff --git a/rankratioviz/q2/plugin_setup.py b/rankratioviz/q2/plugin_setup.py
@@ -46,9 +46,13 @@
     "sample_metadata": Metadata,
     "feature_metadata": Metadata,
     "extreme_feature_count": Int,
+    # "assume_gnps_feature_metadata": Bool,
 }
 
-param_descs = {"extreme_feature_count": EXTREME_FEATURE_COUNT}
+param_descs = {
+    "extreme_feature_count": EXTREME_FEATURE_COUNT,
+    # "assume_gnps_feature_metadata": ASSUME_GNPS_FEATURE_METADATA,
+}
 
 ranks_desc = "A{} file describing feature rankings produced by {}."
 

diff --git a/rankratioviz/scripts/_plot.py b/rankratioviz/scripts/_plot.py
@@ -8,10 +8,17 @@
 import logging
 from biom import load_table
 import click
-from rankratioviz._parameter_descriptions import EXTREME_FEATURE_COUNT, TABLE
+from rankratioviz._parameter_descriptions import (
+    EXTREME_FEATURE_COUNT,
+    TABLE,
+    ASSUME_GNPS_FEATURE_METADATA,
+)
 from rankratioviz.generate import process_input, gen_visualization
 from rankratioviz._rank_utils import read_rank_file
-from rankratioviz._metadata_utils import read_metadata_file
+from rankratioviz._metadata_utils import (
+    read_metadata_file,
+    read_gnps_feature_metadata_file,
+)
 
 
 @click.command()
@@ -39,6 +46,12 @@
     type=int,
     help=EXTREME_FEATURE_COUNT,
 )
+@click.option(
+    "-gnps",
+    "--assume-gnps-feature-metadata",
+    is_flag=True,
+    help=ASSUME_GNPS_FEATURE_METADATA,
+)
 @click.option(
     "-v",
     "--verbose",
@@ -52,6 +65,7 @@ def plot(
     feature_metadata: str,
     output_dir: str,
     extreme_feature_count: int,
+    assume_gnps_feature_metadata: bool,
     verbose: bool,
 ) -> None:
     """Generates a visualization of feature rankings and log ratios.
@@ -77,7 +91,12 @@ def plot(
 
     df_feature_metadata = None
     if feature_metadata is not None:
-        df_feature_metadata = read_metadata_file(feature_metadata)
+        if assume_gnps_feature_metadata:
+            df_feature_metadata = read_gnps_feature_metadata_file(
+                feature_metadata, feature_ranks
+            )
+        else:
+            df_feature_metadata = read_metadata_file(feature_metadata)
     logging.debug("Read in metadata.")
 
     U, V, processed_table = process_input(