Skip to content

Commit

Permalink
ENH: add basic func for reading GNPS feat metadata
Browse files Browse the repository at this point in the history
Progress towards #49.

Might make this standalone-rrv-exclusive for now?
  • Loading branch information
fedarko committed May 11, 2019
1 parent 8149bc6 commit 2acfa66
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 2 deletions.
28 changes: 28 additions & 0 deletions rankratioviz/_metadata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,31 @@ def read_metadata_file(md_file_loc):
metadata_df = metadata_df.astype(type_conv_dict)

return metadata_df


def read_gnps_feature_metadata_file(md_file_loc):
    """Reads in a GNPS feature metadata file, producing a sane DataFrame.

    The file is parsed as tab-separated text. A feature ID of the form
    "<parent mass>;<RTConsensus>" is constructed for every row and used as
    the index of the returned DataFrame.

    Parameters
    ----------
    md_file_loc
        Anything pandas.read_csv() accepts as its first argument (e.g. a
        file path or an open file-like object).

    Returns
    -------
    pandas.DataFrame
        Indexed by "rankratioviz_feature_id", with "LibraryID" as its only
        column.

    Raises
    ------
    KeyError
        If any of the "parent mass", "RTConsensus", or "LibraryID" columns
        is missing from the file.
    ValueError
        If two rows produce the same feature ID (raised by set_index() due
        to verify_integrity=True).
    """
    # Note that we don't set index_col = 0 -- the columns we care about
    # ("parent mass", "RTConsensus", and "LibraryID"), as far as I know, don't
    # have a set position. So we'll just use the basic RangeIndex that pandas
    # defaults to.
    metadata_df = pd.read_csv(md_file_loc, sep="\t")

    # Fail early, and name every missing column at once. Without this check
    # a missing "LibraryID" column would go unnoticed: DataFrame.filter()
    # silently ignores items that aren't present, so we'd hand back a
    # DataFrame with no columns at all.
    required_cols = ("parent mass", "RTConsensus", "LibraryID")
    missing_cols = [
        col for col in required_cols if col not in metadata_df.columns
    ]
    if missing_cols:
        raise KeyError(
            "GNPS feature metadata file is missing required column(s): "
            + ", ".join(missing_cols)
        )

    # Create a feature ID column from the parent mass and RTConsensus cols.
    metadata_df["rankratioviz_feature_id"] = (
        metadata_df["parent mass"].astype(str)
        + ";"
        + metadata_df["RTConsensus"].astype(str)
    )
    # Set the feature ID column as the actual index of the DataFrame. If there
    # are any duplicates (due to two features having the same parent mass and
    # RTConsensus values), our use of verify_integrity here will raise an
    # error accordingly. (That probably won't happen, but best to be safe.)
    metadata_df = metadata_df.set_index(
        "rankratioviz_feature_id", verify_integrity=True
    )
    # Remove all the feature metadata that we don't care about (now, at least).
    # metadata_df now only contains the feature ID we constructed (as its
    # index) and the Library ID, so it's ready to be used to annotate feature
    # IDs (after those IDs' numbers have been truncated).
    metadata_df = metadata_df.filter(items=["LibraryID"])
    return metadata_df
21 changes: 19 additions & 2 deletions rankratioviz/scripts/_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
)
from rankratioviz.generate import process_input, gen_visualization
from rankratioviz._rank_utils import read_rank_file
from rankratioviz._metadata_utils import read_metadata_file
from rankratioviz._metadata_utils import (
read_metadata_file,
read_gnps_feature_metadata_file,
)


@click.command()
Expand Down Expand Up @@ -88,7 +91,21 @@ def plot(

df_feature_metadata = None
if feature_metadata is not None:
df_feature_metadata = read_metadata_file(feature_metadata)
if assume_gnps_feature_metadata:
# TODO easiest thing here is to pass the feature ranks to
# read_gnps_feature_metadata_file(). Detect the precision of the
# numbers there (or assume it -- that's fine for now), and create a
# dict mapping each long ID (in the ranks) to a truncated ID,
# then replace the GNPS metadata DF index with the long IDs. THEN
# we can proceed as normal.
# TODO (Also, do the same tiny conditional here in the Q2 code
# as well? Or just temporarily remove Q2 support for GNPS feature
# metadata, since it flouts Q2 metadata standards.)
df_feature_metadata = read_gnps_feature_metadata_file(
feature_metadata
)
else:
df_feature_metadata = read_metadata_file(feature_metadata)
logging.debug("Read in metadata.")

U, V, processed_table = process_input(
Expand Down

0 comments on commit 2acfa66

Please sign in to comment.