ENH: add basic func for reading GNPS feat metadata

Progress towards #49. Might make this standalone-rrv-exclusive for now?
biocore · May 11, 2019 · 2acfa66 · 2acfa66
1 parent 8149bc6
commit 2acfa66
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 2 deletions.
diff --git a/rankratioviz/_metadata_utils.py b/rankratioviz/_metadata_utils.py
@@ -31,3 +31,31 @@ def read_metadata_file(md_file_loc):
         metadata_df = metadata_df.astype(type_conv_dict)
 
     return metadata_df
+
+
+def read_gnps_feature_metadata_file(md_file_loc):
+    """Reads in a GNPS feature metadata file, producing a sane DataFrame."""
+    # md_file_loc is passed to pd.read_csv() and parsed as tab-separated text.
+    # index_col is left unset on purpose: the columns used below ("parent
+    # mass", "RTConsensus", "LibraryID") are looked up by name because, as far
+    # as the author knows, their position is not fixed.
+    metadata_df = pd.read_csv(md_file_loc, sep="\t")
+    # Build each feature ID as "<parent mass>;<RTConsensus>" from the string
+    # forms of the two columns; no rounding or truncation happens here.
+    metadata_df["rankratioviz_feature_id"] = (
+        metadata_df["parent mass"].astype(str)
+        + ";"
+        + metadata_df["RTConsensus"].astype(str)
+    )
+    # Make the feature ID the index. verify_integrity=True makes set_index()
+    # raise if two rows share an ID, i.e. two features have identical "parent
+    # mass" and "RTConsensus" values (unlikely, but better to fail loudly).
+    metadata_df = metadata_df.set_index(
+        "rankratioviz_feature_id", verify_integrity=True
+    )
+    # Keep only LibraryID, so the result maps feature ID (index) -> LibraryID
+    # for annotating features once their ID numbers have been truncated.
+    # NOTE(review): filter(items=...) seems to skip absent labels rather than
+    # raise, so a file lacking "LibraryID" gives no columns -- confirm.
+    metadata_df = metadata_df.filter(items=["LibraryID"])
+    return metadata_df
diff --git a/rankratioviz/scripts/_plot.py b/rankratioviz/scripts/_plot.py
@@ -15,7 +15,10 @@
 )
 from rankratioviz.generate import process_input, gen_visualization
 from rankratioviz._rank_utils import read_rank_file
-from rankratioviz._metadata_utils import read_metadata_file
+from rankratioviz._metadata_utils import (
+    read_metadata_file,
+    read_gnps_feature_metadata_file,
+)
 
 
 @click.command()
@@ -88,7 +91,21 @@ def plot(
 
     df_feature_metadata = None
     if feature_metadata is not None:
-        df_feature_metadata = read_metadata_file(feature_metadata)
+        if assume_gnps_feature_metadata:
+            # TODO easiest thing here is to pass the feature ranks to
+            # read_gnps_feature_metadata_file(). Detect the precision of the
+            # numbers there (or assume it -- that's fine for now), and create a
+            # dict mapping each long ID (in the ranks) to a truncated ID,
+            # then replace the GNPS metadata DF index with the long IDs. THEN
+            # we can proceed as normal.
+            # TODO (Also, do the same tiny conditional here in the Q2 code
+            # as well? Or just temporarily remove Q2 support for GNPS feature
+            # metadata, since it flouts Q2 metadata standards.)
+            df_feature_metadata = read_gnps_feature_metadata_file(
+                feature_metadata
+            )
+        else:
+            df_feature_metadata = read_metadata_file(feature_metadata)
     logging.debug("Read in metadata.")
 
     U, V, processed_table = process_input(