Skip to content

Commit

Permalink
Merge pull request #5 from ArthurVM/anna_error_fix
Browse files Browse the repository at this point in the history
Non-species hit error fix
  • Loading branch information
ArthurVM authored Feb 2, 2023
2 parents a4a38ba + 269a159 commit 5bf66b4
Show file tree
Hide file tree
Showing 10 changed files with 196 additions and 60 deletions.
7 changes: 0 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,6 @@ def _install_autodatabase(self):
"Intended Audience :: Developers",
"Operating System :: Unix",
"Operating System :: POSIX",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: Implementation :: CPython",
Expand Down
2 changes: 1 addition & 1 deletion src/Afanc/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__="0.9a-alpha"
__version__="0.9.2a"
4 changes: 4 additions & 0 deletions src/Afanc/autodatabase/runFuncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def runAutoDB(args):
""" Run autodatabase pipeline
"""
from Afanc.utilities.makeWD import initAutoDBDirStructure
from Afanc.utilities.get_versions import get_versions_autodatabase

subprocessID = "MAIN"
vprint(
Expand All @@ -27,6 +28,9 @@ def runAutoDB(args):

fasta_db_path = args.fastaDir

## capture python package and software versions for the autodatabase module in a JSON file
get_versions_autodatabase(args)

## download ncbi taxonomy and preprocess fastas
fasta_dict = preprocessing(args, fasta_db_path)

Expand Down
15 changes: 14 additions & 1 deletion src/Afanc/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,25 @@ def initLogFiles(args):
parser_screen.add_argument('-f', '--fetch_assemblies',
default=False,
action='store_true',
help='Fetch genome assemblies for species hits using from GenBank using the ENSEMBL software suite. Default=False; get assemblies from the autodb working directory.')
help='Fetch genome assemblies for species hits using from GenBank using the ENSEMBL software suite. Default=False; get assemblies from the autodb working directory where possible.')

parser_screen.add_argument('-t', '--threads',
type=int,
default=4,
action='store',
help='Number of threads to used for this run. Default=4.')

clean_group = parser_screen.add_mutually_exclusive_group()

clean_group.add_argument('-c', '--clean',
default=False,
action='store_true',
help='Remove the bowtie2 working directory from the output directory. Default=False.')

clean_group.add_argument('-s', '--superclean',
default=False,
action='store_true',
help='Delete the entire output directory, leaving only log files and the results .json. Default=False.')


parser_screen.set_defaults(func=run_subtool)
25 changes: 20 additions & 5 deletions src/Afanc/screen/getGenomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def download_genome(assembly, stdout, stderr, taxID=False):
## check anything was found
if len(ftp_dirs) == 0:
print(f"{assembly} NOT FOUND ON GENBANK! SKIPPING...")
return 2

return None

## get first ftp directory
ftp_dir = ftp_dirs[0]
Expand All @@ -96,11 +97,15 @@ def download_genome(assembly, stdout, stderr, taxID=False):

if os.path.exists(outfile):
print(f"FILE ALREADY EXISTS! SKIPPING...")
return 3

return outfile

else:
grepline = f"curl {ftp_dir}/{base}_genomic.fna.gz --output {outfile}"
stdout, stderr = command(grepline, "DOWNLOAD_HITS").run_comm(1, stdout, stderr)

return outfile


def getLocalGenomes(out_json, args):
""" if args.fetch_assemblies is False, get genomes from the autodatabase results directory:
Expand All @@ -124,8 +129,9 @@ def getLocalGenomes(out_json, args):
## check to see if there is variant information
## if so, make this the target
if "closest_variant" in subdict:
taxID = str(subdict["closest_variant"]["taxon_id"])
assemblyID = subdict["closest_variant"]["name"]
subdict = subdict["closest_variant"]
taxID = str(subdict["taxon_id"])
assemblyID = subdict["name"]

else:
taxID = str(subdict["taxon_id"])
Expand All @@ -142,10 +148,19 @@ def getLocalGenomes(out_json, args):
## collect missing assemblies for unknown purpose
missing_assemblies.append(assemblyID)

download_genome(assemblyID, args.stdout, args.stderr, taxID)
assembly_path = download_genome(assemblyID, args.stdout, args.stderr, taxID)

## if the assembly can be found, copy to the bt2 working directory for bt2 db construction
else:
assembly_path = path.join(args.cleanFasta, db_pathdict[taxID])

print(f"Copying {assembly_path}...")
shutil.copy(assembly_path, "./")

if not assembly_path == None:
subdict["assembly"] = assembly_path.split("/")[-1]
else:
subdict["assembly"] = assembly_path

with open(out_json, "w") as fout:
json.dump(json_data, fout, indent=4)
4 changes: 2 additions & 2 deletions src/Afanc/screen/maths/mappingMetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def genomeSize(fasta_file):


def gini(depth_stdout, w=1):
"""Calculates the Gini coefficient of a coverage array isolated from a coverage .bed file
Maps pretty well to A/(A+B) in its functionality, but effectively calculates Gini as the mean absolute difference.
"""Calculates the Gini coefficient of a coverage array isolated from a coverage .bed file.
Equivalent to A/(A+B) in its functionality, but effectively calculates Gini as the mean absolute difference.
"""
lines = [int(line[2]) for line in depth_stdout if len(line) == 3]

Expand Down
3 changes: 1 addition & 2 deletions src/Afanc/screen/report/parseK2report.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,9 @@ def find_best_hit(root_node, pct_threshold, num_threshold, local_threshold):

## find lowest level scoring nodes
for node in root_node.traverse():
# print(node.name, node.level_int, node.clade_perc, node.clade_reads)
## check if node scores above the pct_threshold
if node.clade_perc >= pct_threshold and node.clade_reads >= num_threshold and node.level_int >= 9:

# print(node.name, node.level_int, node.clade_perc, node.clade_reads)
## construct scorebox then strip off the head node
scorebox = [ n.clade_perc for n in node.traverse() ][:-1]

Expand Down
132 changes: 93 additions & 39 deletions src/Afanc/screen/runFuncs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import pysam
import sys
from os import chdir, path, listdir, mkdir
from importlib.metadata import version
from shutil import move, rmtree
from os import chdir, path, listdir, mkdir, remove
from collections import defaultdict

from Afanc.utilities.runCommands import command
Expand Down Expand Up @@ -30,6 +32,9 @@ def runScreen(args):
## generate the final report
final_report = makeFinalReport(args)

if args.clean or args.superclean:
final_report = clean_outdir(args, final_report)

vprint("FINISHED", f"Final report can be found at {final_report}\n", "prGreen")


Expand Down Expand Up @@ -234,6 +239,8 @@ def makeFinalReport(args):
import json
from os import listdir

from Afanc.utilities.get_versions import get_versions_screen

subprocessID = "REPORT"
vprint(
subprocessID,
Expand All @@ -244,14 +251,26 @@ def makeFinalReport(args):
jsondict = defaultdict(dict)

datadict = jsondict["results"] = {}
jsondict["arguments"] = { "database" : args.k2_database,

## initialise dictionary key
if "Detection_events" not in datadict:
datadict["Detection_events"] = []

## collect arguments
jsondict["arguments"] = {
"database" : args.k2_database,
"fastqs" : args.fastq,
"num_threshold" : args.num_threshold,
"pct_threshold" : args.pct_threshold,
"local_threshold" : args.local_threshold,
"output_prefix" : args.output_prefix,
}

## collect versions
jsondict["versions"] = {
"screen" : get_versions_screen()
}

## collect k2 json reports
bt2_reportsbox = [g for g in listdir(args.reportsDir) if g.endswith("mapstats.json")]

Expand All @@ -261,56 +280,91 @@ def makeFinalReport(args):

for event in k2data["Detection_events"]:

## horrible block to deal with most likely variants
## initialise warnings
event["warnings"] = []

## block to deal with most likely variants
if "closest_variant" in event:
variant_flag = True
assembly = event["closest_variant"]["assembly"]
taxon_id = str(event["closest_variant"]["taxon_id"])
else:
variant_flag = False
assembly = event["assembly"]
taxon_id = str(event["taxon_id"])

## block dealing with hits which have an accompanying assembly which reads were mapped to
if not assembly == None:

## find Bowtie2 report
## NOTE: there should only ever be 1 element in this list, since taxon_id should be unique to each taxon
report_json = [report for report in bt2_reportsbox if report.startswith(taxon_id)][0]

## read Bowtie2 report
with open(f"{args.reportsDir}/{report_json}", 'r') as bt2fin:
bt2data = json.load(bt2fin)

## check the that the no_unique_map warning doesnt exist
if "no_unique_map" not in bt2data["warnings"]:

## add fields to json dict
## if there is a likely variant, add to that subdict
if variant_flag:
event["closest_variant"]["mean_DOC"] = bt2data["map_data"]["mean_DOC"]
event["closest_variant"]["median_DOC"] = bt2data["map_data"]["median_DOC"]
event["closest_variant"]["reference_cov"] = bt2data["map_data"]["proportion_cov"]
event["closest_variant"]["gini"] = bt2data["map_data"]["gini"]

## otherwise add to the base dict
else:
event["mean_DOC"] = bt2data["map_data"]["mean_DOC"]
event["median_DOC"] = bt2data["map_data"]["median_DOC"]
event["reference_cov"] = bt2data["map_data"]["proportion_cov"]
event["gini"] = bt2data["map_data"]["gini"]

## if it does, append it to the final report warnings
else:
event["warnings"].append(bt2data["warnings"])

for report in bt2_reportsbox:
## block dealing with hits which do not have an accompanying assembly
else:
event['warnings'].append(f"No assembly could be found for hit on {event['name']}.")

if variant_flag:
if str(event["closest_variant"]['taxon_id']) in report:
report_json = report
datadict["Detection_events"].append(event)

else:
if str(event['taxon_id']) in report:
report_json = report
final_report = path.abspath(f"./{args.output_prefix}.json")

## read Bowtie2 report
with open(f"{args.reportsDir}/{report_json}", 'r') as bt2fin: ## TODO: sort out this line to make it more robust, i.e. remove _genomic
bt2data = json.load(bt2fin)
with open(final_report, "w") as fout:
json.dump(jsondict, fout, indent = 4)

## check the that the no_unique_map warning doesnt exist
if "no_unique_map" not in bt2data["warnings"]:
return final_report

## add fields to json dict
## if there is a likely variant, add to that subdict
if variant_flag:
event["closest_variant"]["mean_DOC"] = bt2data["map_data"]["mean_DOC"]
event["closest_variant"]["median_DOC"] = bt2data["map_data"]["median_DOC"]
event["closest_variant"]["reference_cov"] = bt2data["map_data"]["proportion_cov"]
event["closest_variant"]["gini"] = bt2data["map_data"]["gini"]
## otherwise add to the base dict
else:
event["mean_DOC"] = bt2data["map_data"]["mean_DOC"]
event["median_DOC"] = bt2data["map_data"]["median_DOC"]
event["reference_cov"] = bt2data["map_data"]["proportion_cov"]
event["gini"] = bt2data["map_data"]["gini"]

## if it does, append it to the final report warnings
else:
event["warnings"] = bt2data["warnings"]
def clean_outdir(args, final_report):
""" Cleans the output directory according to provided arguments.
## initialise dictionary key
if "Detection_events" not in datadict:
datadict["Detection_events"] = []
clean : remove the bt2 working directory.
superclean : move the results JSON to ./ and remove the entire output directory
"""

datadict["Detection_events"].append(event)
if args.clean:
## check the bt2WDir exists
if path.isdir(args.bt2WDir):
rmtree(args.bt2WDir, ignore_errors=True)
else:
## If it fails, inform the user.
print(f"Error: Could not remove{args.bt2WDir}, directory not found.")

final_report = path.abspath(f"./{args.output_prefix}.json")
return final_report

with open(final_report, "w") as fout:
json.dump(jsondict, fout, indent = 4)
elif args.superclean:
## check the root run directory exists
if path.isdir(args.runWDir):
new_final_report_path = path.join(args.baseRunDir, final_report.split("/")[-1])
move(final_report, new_final_report_path)
rmtree(args.runWDir, ignore_errors=True)
else:
## If it fails, inform the user.
print(f"Error: Could not remove {args.runWDir}, directory not found.")

return final_report
return new_final_report_path
56 changes: 56 additions & 0 deletions src/Afanc/utilities/get_versions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import json
from collections import defaultdict
from importlib.metadata import version

from Afanc.utilities.runCommands import command


def get_versions_autodatabase(args):
    """ Get python package and software versions for afanc-autodatabase.

    Collects the installed versions of the key python dependencies
    (via importlib.metadata), the mash and Kraken2 executables (by
    parsing their CLI output), and the NCBI taxonomy date, then writes
    them to <args.autoDB_WDir>/versions.json.

    args : argparse.Namespace; must provide `ncbi_date` and `autoDB_WDir`.
    """

    ## python package versions resolved from installed package metadata
    version_dict = {
        "afanc" : version("afanc"),
        "Biopython" : version("Biopython"),
        "numpy" : version("numpy"),
        "pysam" : version("pysam"),
        "pandas" : version("pandas"),
        "scipy" : version("scipy"),
        "ncbi_taxonomy_date" : args.ncbi_date
    }

    ## capture external software versions by shelling out and parsing stdout
    mash_stdout, mash_stderr = command("mash --version", "GET-VERSIONS").run_comm_quiet(1)
    version_dict["mash"] = mash_stdout.decode().strip("\n")

    ## NOTE(review): this parse assumes output of the form "Kraken version X.Y.Z";
    ## it will raise IndexError if kraken2 is missing or its banner changes
    k2_stdout, k2_stderr = command("kraken2 -v", "GET-VERSIONS").run_comm_quiet(1)
    version_dict["Kraken2"] = k2_stdout.decode().split("\n")[0].split("version ")[1]

    versions_json = f"{args.autoDB_WDir}/versions.json"

    ## persist the collected versions as a JSON report
    with open(versions_json, "w") as fout:
        json.dump({ "afanc-autodatabase_versions" : version_dict }, fout, indent = 4)


def get_versions_screen():
    """ Get python package and software versions for afanc-screen.

    Collects the installed versions of the key python dependencies
    (via importlib.metadata) and the mash, Kraken2 and Bowtie2
    executables (by parsing their CLI output).

    Returns
    -------
    dict mapping tool/package name to version string.
    """

    ## python package versions resolved from installed package metadata
    version_dict = {
        "afanc" : version("afanc"),
        "Biopython" : version("Biopython"),
        "numpy" : version("numpy"),
        "pysam" : version("pysam"),
        "pandas" : version("pandas"),
        "scipy" : version("scipy")
    }

    ## capture external software versions by shelling out and parsing stdout
    mash_stdout, mash_stderr = command("mash --version", "GET-VERSIONS").run_comm_quiet(1)
    version_dict["mash"] = mash_stdout.decode().strip("\n")

    ## NOTE(review): these parses assume a "... version X.Y.Z" banner on the
    ## first stdout line; they will raise IndexError if the tool is missing
    k2_stdout, k2_stderr = command("kraken2 -v", "GET-VERSIONS").run_comm_quiet(1)
    version_dict["Kraken2"] = k2_stdout.decode().split("\n")[0].split("version ")[1]

    bt2_stdout, bt2_stderr = command("bowtie2 --version", "GET-VERSIONS").run_comm_quiet(1)
    version_dict["Bowtie2"] = bt2_stdout.decode().split("\n")[0].split("version ")[1]

    return version_dict
Loading

0 comments on commit 5bf66b4

Please sign in to comment.