Skip to content

Commit

Permalink
Merge pull request #5 from ArthurVM/anna_error_fix
Browse files Browse the repository at this point in the history
Non-species hit error fix
  • Loading branch information
ArthurVM authored Feb 2, 2023
2 parents a4a38ba + 269a159 commit 5bf66b4
Show file tree
Hide file tree
Showing 10 changed files with 196 additions and 60 deletions.
7 changes: 0 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,6 @@ def _install_autodatabase(self):
"Intended Audience :: Developers",
"Operating System :: Unix",
"Operating System :: POSIX",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: Implementation :: CPython",
Expand Down
2 changes: 1 addition & 1 deletion src/Afanc/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__="0.9a-alpha"
__version__="0.9.2a"
4 changes: 4 additions & 0 deletions src/Afanc/autodatabase/runFuncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def runAutoDB(args):
""" Run autodatabase pipeline
"""
from Afanc.utilities.makeWD import initAutoDBDirStructure
from Afanc.utilities.get_versions import get_versions_autodatabase

subprocessID = "MAIN"
vprint(
Expand All @@ -27,6 +28,9 @@ def runAutoDB(args):

fasta_db_path = args.fastaDir

## capture python package and software versions for the autodatabase module in a JSON file
get_versions_autodatabase(args)

## download ncbi taxonomy and preprocess fastas
fasta_dict = preprocessing(args, fasta_db_path)

Expand Down
15 changes: 14 additions & 1 deletion src/Afanc/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,25 @@ def initLogFiles(args):
parser_screen.add_argument('-f', '--fetch_assemblies',
default=False,
action='store_true',
help='Fetch genome assemblies for species hits using from GenBank using the ENSEMBL software suite. Default=False; get assemblies from the autodb working directory.')
help='Fetch genome assemblies for species hits using from GenBank using the ENSEMBL software suite. Default=False; get assemblies from the autodb working directory where possible.')

parser_screen.add_argument('-t', '--threads',
type=int,
default=4,
action='store',
help='Number of threads to used for this run. Default=4.')

clean_group = parser_screen.add_mutually_exclusive_group()

clean_group.add_argument('-c', '--clean',
default=False,
action='store_true',
help='Remove the bowtie2 working directory from the output directory. Default=False.')

clean_group.add_argument('-s', '--superclean',
default=False,
action='store_true',
help='Delete the entire output directory, leaving only log files and the results .json. Default=False.')


parser_screen.set_defaults(func=run_subtool)
25 changes: 20 additions & 5 deletions src/Afanc/screen/getGenomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def download_genome(assembly, stdout, stderr, taxID=False):
## check anything was found
if len(ftp_dirs) == 0:
print(f"{assembly} NOT FOUND ON GENBANK! SKIPPING...")
return 2

return None

## get first ftp directory
ftp_dir = ftp_dirs[0]
Expand All @@ -96,11 +97,15 @@ def download_genome(assembly, stdout, stderr, taxID=False):

if os.path.exists(outfile):
print(f"FILE ALREADY EXISTS! SKIPPING...")
return 3

return outfile

else:
grepline = f"curl {ftp_dir}/{base}_genomic.fna.gz --output {outfile}"
stdout, stderr = command(grepline, "DOWNLOAD_HITS").run_comm(1, stdout, stderr)

return outfile


def getLocalGenomes(out_json, args):
""" if args.fetch_assemblies is False, get genomes from the autodatabase results directory:
Expand All @@ -124,8 +129,9 @@ def getLocalGenomes(out_json, args):
## check to see if there is variant information
## if so, make this the target
if "closest_variant" in subdict:
taxID = str(subdict["closest_variant"]["taxon_id"])
assemblyID = subdict["closest_variant"]["name"]
subdict = subdict["closest_variant"]
taxID = str(subdict["taxon_id"])
assemblyID = subdict["name"]

else:
taxID = str(subdict["taxon_id"])
Expand All @@ -142,10 +148,19 @@ def getLocalGenomes(out_json, args):
## collect missing assemblies for unknown purpose
missing_assemblies.append(assemblyID)

download_genome(assemblyID, args.stdout, args.stderr, taxID)
assembly_path = download_genome(assemblyID, args.stdout, args.stderr, taxID)

## if the assembly can be found, copy to the bt2 working directory for bt2 db construction
else:
assembly_path = path.join(args.cleanFasta, db_pathdict[taxID])

print(f"Copying {assembly_path}...")
shutil.copy(assembly_path, "./")

if not assembly_path == None:
subdict["assembly"] = assembly_path.split("/")[-1]
else:
subdict["assembly"] = assembly_path

with open(out_json, "w") as fout:
json.dump(json_data, fout, indent=4)
4 changes: 2 additions & 2 deletions src/Afanc/screen/maths/mappingMetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def genomeSize(fasta_file):


def gini(depth_stdout, w=1):
"""Calculates the Gini coefficient of a coverage array isolated from a coverage .bed file
Maps pretty well to A/(A+B) in its functionality, but effectively calculates Gini as the mean absolute difference.
"""Calculates the Gini coefficient of a coverage array isolated from a coverage .bed file.
Equivalent to A/(A+B) in its functionality, but effectively calculates Gini as the mean absolute difference.
"""
lines = [int(line[2]) for line in depth_stdout if len(line) == 3]

Expand Down
3 changes: 1 addition & 2 deletions src/Afanc/screen/report/parseK2report.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,9 @@ def find_best_hit(root_node, pct_threshold, num_threshold, local_threshold):

## find lowest level scoring nodes
for node in root_node.traverse():
# print(node.name, node.level_int, node.clade_perc, node.clade_reads)
## check if node scores above the pct_threshold
if node.clade_perc >= pct_threshold and node.clade_reads >= num_threshold and node.level_int >= 9:

# print(node.name, node.level_int, node.clade_perc, node.clade_reads)
## construct scorebox then strip off the head node
scorebox = [ n.clade_perc for n in node.traverse() ][:-1]

Expand Down
132 changes: 93 additions & 39 deletions src/Afanc/screen/runFuncs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import pysam
import sys
from os import chdir, path, listdir, mkdir
from importlib.metadata import version
from shutil import move, rmtree
from os import chdir, path, listdir, mkdir, remove
from collections import defaultdict

from Afanc.utilities.runCommands import command
Expand Down Expand Up @@ -30,6 +32,9 @@ def runScreen(args):
## generate the final report
final_report = makeFinalReport(args)

if args.clean or args.superclean:
final_report = clean_outdir(args, final_report)

vprint("FINISHED", f"Final report can be found at {final_report}\n", "prGreen")


Expand Down Expand Up @@ -234,6 +239,8 @@ def makeFinalReport(args):
import json
from os import listdir

from Afanc.utilities.get_versions import get_versions_screen

subprocessID = "REPORT"
vprint(
subprocessID,
Expand All @@ -244,14 +251,26 @@ def makeFinalReport(args):
jsondict = defaultdict(dict)

datadict = jsondict["results"] = {}
jsondict["arguments"] = { "database" : args.k2_database,

## initialise dictionary key
if "Detection_events" not in datadict:
datadict["Detection_events"] = []

## collect arguments
jsondict["arguments"] = {
"database" : args.k2_database,
"fastqs" : args.fastq,
"num_threshold" : args.num_threshold,
"pct_threshold" : args.pct_threshold,
"local_threshold" : args.local_threshold,
"output_prefix" : args.output_prefix,
}

## collect versions
jsondict["versions"] = {
"screen" : get_versions_screen()
}

## collect k2 json reports
bt2_reportsbox = [g for g in listdir(args.reportsDir) if g.endswith("mapstats.json")]

Expand All @@ -261,56 +280,91 @@ def makeFinalReport(args):

for event in k2data["Detection_events"]:

## horrible block to deal with most likely variants
## initialise warnings
event["warnings"] = []

## block to deal with most likely variants
if "closest_variant" in event:
variant_flag = True
assembly = event["closest_variant"]["assembly"]
taxon_id = str(event["closest_variant"]["taxon_id"])
else:
variant_flag = False
assembly = event["assembly"]
taxon_id = str(event["taxon_id"])

## block dealing with hits which have an accompanying assembly which reads were mapped to
if not assembly == None:

## find Bowtie2 report
## NOTE: there should only ever be 1 element in this list, since taxon_id should be unique to each taxon
report_json = [report for report in bt2_reportsbox if report.startswith(taxon_id)][0]

## read Bowtie2 report
with open(f"{args.reportsDir}/{report_json}", 'r') as bt2fin:
bt2data = json.load(bt2fin)

## check the that the no_unique_map warning doesnt exist
if "no_unique_map" not in bt2data["warnings"]:

## add fields to json dict
## if there is a likely variant, add to that subdict
if variant_flag:
event["closest_variant"]["mean_DOC"] = bt2data["map_data"]["mean_DOC"]
event["closest_variant"]["median_DOC"] = bt2data["map_data"]["median_DOC"]
event["closest_variant"]["reference_cov"] = bt2data["map_data"]["proportion_cov"]
event["closest_variant"]["gini"] = bt2data["map_data"]["gini"]

## otherwise add to the base dict
else:
event["mean_DOC"] = bt2data["map_data"]["mean_DOC"]
event["median_DOC"] = bt2data["map_data"]["median_DOC"]
event["reference_cov"] = bt2data["map_data"]["proportion_cov"]
event["gini"] = bt2data["map_data"]["gini"]

## if it does, append it to the final report warnings
else:
event["warnings"].append(bt2data["warnings"])

for report in bt2_reportsbox:
## block dealing with hits which do not have an accompanying assembly
else:
event['warnings'].append(f"No assembly could be found for hit on {event['name']}.")

if variant_flag:
if str(event["closest_variant"]['taxon_id']) in report:
report_json = report
datadict["Detection_events"].append(event)

else:
if str(event['taxon_id']) in report:
report_json = report
final_report = path.abspath(f"./{args.output_prefix}.json")

## read Bowtie2 report
with open(f"{args.reportsDir}/{report_json}", 'r') as bt2fin: ## TODO: sort out this line to make it more robust, i.e. remove _genomic
bt2data = json.load(bt2fin)
with open(final_report, "w") as fout:
json.dump(jsondict, fout, indent = 4)

## check the that the no_unique_map warning doesnt exist
if "no_unique_map" not in bt2data["warnings"]:
return final_report

## add fields to json dict
## if there is a likely variant, add to that subdict
if variant_flag:
event["closest_variant"]["mean_DOC"] = bt2data["map_data"]["mean_DOC"]
event["closest_variant"]["median_DOC"] = bt2data["map_data"]["median_DOC"]
event["closest_variant"]["reference_cov"] = bt2data["map_data"]["proportion_cov"]
event["closest_variant"]["gini"] = bt2data["map_data"]["gini"]
## otherwise add to the base dict
else:
event["mean_DOC"] = bt2data["map_data"]["mean_DOC"]
event["median_DOC"] = bt2data["map_data"]["median_DOC"]
event["reference_cov"] = bt2data["map_data"]["proportion_cov"]
event["gini"] = bt2data["map_data"]["gini"]

## if it does, append it to the final report warnings
else:
event["warnings"] = bt2data["warnings"]
def clean_outdir(args, final_report):
""" Cleans the output directory according to provided arguments.
## initialise dictionary key
if "Detection_events" not in datadict:
datadict["Detection_events"] = []
clean : remove the bt2 working directory.
superclean : move the results JSON to ./ and remove the entire output directory
"""

datadict["Detection_events"].append(event)
if args.clean:
## check the bt2WDir exists
if path.isdir(args.bt2WDir):
rmtree(args.bt2WDir, ignore_errors=True)
else:
## If it fails, inform the user.
print(f"Error: Could not remove{args.bt2WDir}, directory not found.")

final_report = path.abspath(f"./{args.output_prefix}.json")
return final_report

with open(final_report, "w") as fout:
json.dump(jsondict, fout, indent = 4)
elif args.superclean:
## check the root run directory exists
if path.isdir(args.runWDir):
new_final_report_path = path.join(args.baseRunDir, final_report.split("/")[-1])
move(final_report, new_final_report_path)
rmtree(args.runWDir, ignore_errors=True)
else:
## If it fails, inform the user.
print(f"Error: Could not remove {args.runWDir}, directory not found.")

return final_report
return new_final_report_path
56 changes: 56 additions & 0 deletions src/Afanc/utilities/get_versions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import json
from collections import defaultdict
from importlib.metadata import version

from Afanc.utilities.runCommands import command


def get_versions_autodatabase(args):
    """ Get python package and software versions for afanc-autodatabase.

    Collects the installed versions of the key python dependencies
    (via importlib.metadata), the mash and Kraken2 executables (by
    parsing their CLI output), and the NCBI taxonomy date, then writes
    them to <args.autoDB_WDir>/versions.json.

    args : argparse.Namespace; must provide `ncbi_date` and `autoDB_WDir`.
    """

    ## python package versions resolved from installed package metadata
    version_dict = {
        "afanc" : version("afanc"),
        "Biopython" : version("Biopython"),
        "numpy" : version("numpy"),
        "pysam" : version("pysam"),
        "pandas" : version("pandas"),
        "scipy" : version("scipy"),
        "ncbi_taxonomy_date" : args.ncbi_date
    }

    ## capture external software versions by shelling out and parsing stdout
    mash_stdout, mash_stderr = command("mash --version", "GET-VERSIONS").run_comm_quiet(1)
    version_dict["mash"] = mash_stdout.decode().strip("\n")

    ## NOTE(review): this parse assumes output of the form "Kraken version X.Y.Z";
    ## it will raise IndexError if kraken2 is missing or its banner changes
    k2_stdout, k2_stderr = command("kraken2 -v", "GET-VERSIONS").run_comm_quiet(1)
    version_dict["Kraken2"] = k2_stdout.decode().split("\n")[0].split("version ")[1]

    versions_json = f"{args.autoDB_WDir}/versions.json"

    ## persist the collected versions as a JSON report
    with open(versions_json, "w") as fout:
        json.dump({ "afanc-autodatabase_versions" : version_dict }, fout, indent = 4)


def get_versions_screen():
    """ Get python package and software versions for afanc-screen.

    Collects the installed versions of the key python dependencies
    (via importlib.metadata) and the mash, Kraken2 and Bowtie2
    executables (by parsing their CLI output).

    Returns
    -------
    dict mapping tool/package name to version string.
    """

    ## python package versions resolved from installed package metadata
    version_dict = {
        "afanc" : version("afanc"),
        "Biopython" : version("Biopython"),
        "numpy" : version("numpy"),
        "pysam" : version("pysam"),
        "pandas" : version("pandas"),
        "scipy" : version("scipy")
    }

    ## capture external software versions by shelling out and parsing stdout
    mash_stdout, mash_stderr = command("mash --version", "GET-VERSIONS").run_comm_quiet(1)
    version_dict["mash"] = mash_stdout.decode().strip("\n")

    ## NOTE(review): these parses assume a "... version X.Y.Z" banner on the
    ## first stdout line; they will raise IndexError if the tool is missing
    k2_stdout, k2_stderr = command("kraken2 -v", "GET-VERSIONS").run_comm_quiet(1)
    version_dict["Kraken2"] = k2_stdout.decode().split("\n")[0].split("version ")[1]

    bt2_stdout, bt2_stderr = command("bowtie2 --version", "GET-VERSIONS").run_comm_quiet(1)
    version_dict["Bowtie2"] = bt2_stdout.decode().split("\n")[0].split("version ")[1]

    return version_dict
Loading

0 comments on commit 5bf66b4

Please sign in to comment.