Skip to content

Commit

Permalink
Tweak report to use latest onecodex library, add cov >10x depth
Browse files Browse the repository at this point in the history
  • Loading branch information
boydgreenfield committed Mar 31, 2020
1 parent e8386ad commit 9585ea4
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 78 deletions.
1 change: 1 addition & 0 deletions covid19_call_variants.sh
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ sed \
mv "${prefix}.consensus.fa" "consensus.fa"
mv "${prefix}.ivar.tsv" "variants.tsv"
mv "${prefix}.sorted.bam" "covid19.bam"
mv "${prefix}.sorted.bam.bai" "covid19.bam.bai"
mv "${prefix}.pileup" "covid19.pileup"
rm "${prefix}"*

Expand Down
122 changes: 44 additions & 78 deletions report.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,19 @@
"metadata": {},
"outputs": [],
"source": [
"import binascii\n",
"import json\n",
"import os\n",
"import sys\n",
"\n",
"from io import BytesIO\n",
"\n",
"import pandas as pd\n",
"import altair as alt\n",
"from Bio import SeqIO\n",
"\n",
"from altair_saver import save\n",
"from Bio import SeqIO\n",
"from IPython.display import HTML\n",
"\n",
"from onecodex import Api\n",
"from onecodex.notebooks.report import set_style, title"
]
Expand Down Expand Up @@ -55,10 +60,12 @@
"# note our DB actually has v1 of this assembly as of Feb 2020\n",
"VARIANTS_TSV_PATH = \"variants.tsv\"\n",
"BAM_PATH = \"covid19.bam\"\n",
"PILEUP_PATH = \"covid19.pileup\"\n",
"REFERENCE_PATH = os.environ.get(\n",
" \"FASTA_REFERENCE\", \"reference/nCoV-2019.reference.fasta\"\n",
")\n",
"BED_FILE_PATH = os.environ.get(\"BED_FILE_PATH\", \"reference/artic-v1/ARTIC-V1.bed\")"
"BED_FILE_PATH = os.environ.get(\"BED_FILE_PATH\", \"reference/artic-v1/ARTIC-V1.bed\")\n",
"REFERENCE_NAME = os.path.basename(REFERENCE_PATH).rstrip('.fasta')"
]
},
{
Expand Down Expand Up @@ -86,8 +93,8 @@
"metadata": {},
"outputs": [],
"source": [
"# calculate mapping depth\n",
"!samtools depth $BAM_PATH > snps.depth"
"# calculate mapping depth, trim another 20bp to avoid mapping to polyA tail in depth calcs\n",
"!samtools depth -r MN908947.3:1-29850 -Q 20 $BAM_PATH > snps.depth 2> /dev/null"
]
},
{
Expand All @@ -105,9 +112,8 @@
"metadata": {},
"outputs": [],
"source": [
"# see https://www.biostars.org/p/138116/\n",
"n_aligned_reads = !samtools view -F 0x4 $BAM_PATH | cut -f 1 | sort | uniq | wc -l\n",
"n_aligned_reads = int(n_aligned_reads[0])"
"pileup_lines = !wc -l $PILEUP_PATH\n",
"n_aligned_reads = int(pileup_lines[0].split(\" \")[0])"
]
},
{
Expand All @@ -124,7 +130,7 @@
" depth_table.append(\n",
" {\"reference\": row[0], \"position\": int(row[1]), \"depth\": int(row[2])}\n",
" )\n",
"depth_table = pd.DataFrame(depth_table)"
"depth_table = pd.DataFrame(depth_table, columns=[\"reference\", \"position\", \"depth\"])"
]
},
{
Expand All @@ -133,19 +139,22 @@
"metadata": {},
"outputs": [],
"source": [
"# calculate genome coverage\n",
"# (what percent of bases are coveraged at X coverage)\n",
"min_depth = 1\n",
"reference_length = len(reference[0])\n",
"# calculate genome coverage (what percent of bases are coveraged at X coverage)\n",
"# Use a fixed reference length that we use for `samtools depth` above\n",
"reference_length = 29850\n",
"\n",
"covered_sites = set()\n",
"covered_sites_10x = set()\n",
"\n",
"for _, row in depth_table.iterrows():\n",
" row = row.to_dict()\n",
" if row[\"depth\"] >= min_depth:\n",
" if row[\"depth\"] >= 1:\n",
" covered_sites.add(row[\"position\"])\n",
" if row[\"depth\"] >= 10:\n",
" covered_sites_10x.add(row[\"position\"]) \n",
"\n",
"cov = len(covered_sites) / reference_length"
"cov = len(covered_sites) / reference_length\n",
"cov_10x = len(covered_sites_10x) / reference_length"
]
},
{
Expand All @@ -167,16 +176,8 @@
" {\"position\": i, \"depth\": window[\"depth\"].mean(),}\n",
" )\n",
"\n",
"binned_depths = pd.DataFrame(binned_depths)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"depth = depth_table[\"depth\"].mean()"
"binned_depths = pd.DataFrame(binned_depths)\n",
"depth = depth_table[\"depth\"].mean() if not depth_table.empty else 0"
]
},
{
Expand Down Expand Up @@ -218,59 +219,13 @@
"<strong>{sample_filename}</strong>. \n",
"This sample contained <strong>{n_reads:,}</strong> reads, with\n",
"<strong>{n_aligned_reads:,}</strong> mapping to the \n",
"<a href='https://www.ncbi.nlm.nih.gov/nuccore/NC_045512' target='_blank'>reference</a>. \n",
"Reads cover <strong>{cov:.1%}</strong> of the SARS-CoV-2 genome, with a mean depth of <strong>{depth:.1f}x</strong>.\n",
"<a href='https://www.ncbi.nlm.nih.gov/nuccore/MN908947.3/' target='_blank'>reference</a>. \n",
"Reads cover <strong>{cov:.0%}</strong> of the genome ({cov_10x:.0%} over 10×), with a mean depth of <strong>{depth:.1f}×</strong>.\n",
"A total of <strong>{n_snps}</strong> variant{'s were' if n_snps != 1 else 'was'} detected.\"\"\"\n",
"\n",
"HTML(text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# An updated theme not yet in onecodex v0.7.2\n",
"def onecodex_theme_alt():\n",
" onecodex_palette = [\n",
" \"#ffffcc\",\n",
" \"#c7e9b4\",\n",
" \"#7fcdbb\",\n",
" \"#41b6c4\",\n",
" \"#2c7fb8\",\n",
" \"#264153\",\n",
" ]\n",
"\n",
" return {\n",
" \"config\": {\n",
" \"range\": {\"heatmap\": list(reversed(onecodex_palette))},\n",
" \"axis\": {\n",
" \"labelFont\": \"Palatino\",\n",
" \"labelFontSize\": 12,\n",
" \"titleFont\": \"Palatino\",\n",
" \"titleFontSize\": 12,\n",
" \"grid\": False,\n",
" },\n",
" \"area\": {\"fill\": \"#128887\",},\n",
" \"legend\": {\n",
" \"labelFont\": \"Palatino\",\n",
" \"labelFontSize\": 12,\n",
" \"titleFont\": \"Palatino\",\n",
" \"titleFontSize\": 12,\n",
" },\n",
" \"title\": {\"font\": \"Palatino\"},\n",
" \"view\": {\"width\": 400, \"height\": 400, \"strokeWidth\": 0},\n",
" \"background\": \"transparent\",\n",
" }\n",
" }\n",
"\n",
"\n",
"alt.themes.register(\"onecodex2\", onecodex_theme_alt)\n",
"alt.themes.enable(\"onecodex2\")\n",
"None"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -291,7 +246,7 @@
" y=alt.Y(\"rolling_mean:Q\", scale=alt.Scale(type=\"linear\"), title=\"Depth\"),\n",
" )\n",
" .properties(\n",
" title=f\"SARS-CoV-2 ({os.path.basename(REFERENCE_PATH).rstrip('.gbk')})\",\n",
" title=f\"SARS-CoV-2 ({REFERENCE_NAME})\",\n",
" width=550,\n",
" height=150,\n",
" )\n",
Expand All @@ -305,6 +260,10 @@
"metadata": {},
"outputs": [],
"source": [
"DEPTH_FILTER = 10\n",
"if not snp_table.empty and snp_table.loc[snp_table[\"ALT_DP\"] >= DEPTH_FILTER, :].shape[0] == 0:\n",
" DEPTH_FILTER = 1\n",
"\n",
"if snp_table.empty:\n",
" table = pd.DataFrame(columns=[\"POS\", \"Variant\", \"Gene\"])\n",
" table = HTML(\n",
Expand All @@ -314,10 +273,11 @@
" snp_table[\"Position\"] = snp_table[\"POS\"]\n",
" snp_table[\"Variant\"] = [f\"{r['REF']} → {r['ALT']}\" for _, r in snp_table.iterrows()]\n",
" snp_table[\"Depth\"] = snp_table[\"ALT_DP\"].apply(lambda x: f\"{x}×\")\n",
" snp_table[\"Frequency\"] = snp_table[\"ALT_FREQ\"].apply(lambda x: f\"{x * 100:.0f}%\")\n",
"\n",
" # snp_table[\"Gene\"] = \"TODO\"\n",
" table = snp_table.loc[\n",
" snp_table[\"ALT_DP\"] >= 10, [\"Position\", \"Variant\", \"Depth\"]\n",
" snp_table[\"ALT_DP\"] >= DEPTH_FILTER, [\"Position\", \"Variant\", \"Depth\", \"Frequency\"]\n",
" ] # , \"Gene\"]]\n",
" table = HTML(table.to_html(index=False))\n",
"table"
Expand All @@ -331,8 +291,10 @@
"source": [
"legend_text = \"SARS-CoV-2 variants.\"\n",
"n_extra_variants = (\n",
" snp_table[snp_table[\"ALT_DP\"] < 10].shape[0] if not snp_table.empty else 0\n",
" snp_table[snp_table[\"ALT_DP\"] < DEPTH_FILTER].shape[0] if not snp_table.empty else 0\n",
")\n",
"if DEPTH_FILTER == 1:\n",
" legend_text += \" Use caution in interpreting low depth variants!\"\n",
"if n_extra_variants:\n",
" legend_text += f\" An additional {n_extra_variants} variant{'s' if n_extra_variants > 1 else ''} <10× depth {'are' if n_extra_variants > 1 else 'is'} not shown.\"\n",
"if os.environ.get(\"ONE_CODEX_REPORT_UUID\"):\n",
Expand Down Expand Up @@ -384,7 +346,11 @@
"outputs": [],
"source": [
"# Save a JSON too, including filtered variants <10x\n",
"results = [r.to_dict() for _, r in snp_table.iterrows()]\n",
"results = {\n",
" \"report_id\": os.environ.get(\"ONE_CODEX_REPORT_UUID\"), \n",
" \"sample_id\": os.environ.get(\"ONE_CODEX_SAMPLE_UUID\"),\n",
" \"variants\": [r.to_dict() for _, r in snp_table.iterrows()]\n",
"}\n",
"\n",
"with open(\"results.json\", \"w\") as f:\n",
" json.dump(results, f)"
Expand All @@ -397,7 +363,7 @@
"outputs": [],
"source": [
"# Clean up files\n",
"!rm -f {sample.filename} snps.depth variants.log covid19.bam"
"!rm -f {sample.filename} snps.depth variants.log covid19.pileup "
]
},
{
Expand Down

0 comments on commit 9585ea4

Please sign in to comment.