diff --git a/covid19_call_variants.sh b/covid19_call_variants.sh
index c595632..5792ccf 100755
--- a/covid19_call_variants.sh
+++ b/covid19_call_variants.sh
@@ -117,6 +117,7 @@ sed \
mv "${prefix}.consensus.fa" "consensus.fa"
mv "${prefix}.ivar.tsv" "variants.tsv"
mv "${prefix}.sorted.bam" "covid19.bam"
+mv "${prefix}.sorted.bam.bai" "covid19.bam.bai"
mv "${prefix}.pileup" "covid19.pileup"
rm "${prefix}"*
diff --git a/report.ipynb b/report.ipynb
index 1f33c86..ff941b9 100644
--- a/report.ipynb
+++ b/report.ipynb
@@ -6,14 +6,19 @@
"metadata": {},
"outputs": [],
"source": [
+ "import binascii\n",
"import json\n",
"import os\n",
+ "import sys\n",
+ "\n",
+ "from io import BytesIO\n",
+ "\n",
"import pandas as pd\n",
"import altair as alt\n",
- "from Bio import SeqIO\n",
"\n",
+ "from altair_saver import save\n",
+ "from Bio import SeqIO\n",
"from IPython.display import HTML\n",
- "\n",
"from onecodex import Api\n",
"from onecodex.notebooks.report import set_style, title"
]
@@ -55,10 +60,12 @@
"# note our DB actually has v1 of this assembly as of Feb 2020\n",
"VARIANTS_TSV_PATH = \"variants.tsv\"\n",
"BAM_PATH = \"covid19.bam\"\n",
+ "PILEUP_PATH = \"covid19.pileup\"\n",
"REFERENCE_PATH = os.environ.get(\n",
" \"FASTA_REFERENCE\", \"reference/nCoV-2019.reference.fasta\"\n",
")\n",
- "BED_FILE_PATH = os.environ.get(\"BED_FILE_PATH\", \"reference/artic-v1/ARTIC-V1.bed\")"
+ "BED_FILE_PATH = os.environ.get(\"BED_FILE_PATH\", \"reference/artic-v1/ARTIC-V1.bed\")\n",
+ "REFERENCE_NAME = os.path.basename(REFERENCE_PATH).rstrip('.fasta')"
]
},
{
@@ -86,8 +93,8 @@
"metadata": {},
"outputs": [],
"source": [
- "# calculate mapping depth\n",
- "!samtools depth $BAM_PATH > snps.depth"
+ "# calculate mapping depth, trim another 20bp to avoid mapping to polyA tail in depth calcs\n",
+ "!samtools depth -r MN908947.3:1-29850 -Q 20 $BAM_PATH > snps.depth 2> /dev/null"
]
},
{
@@ -105,9 +112,8 @@
"metadata": {},
"outputs": [],
"source": [
- "# see https://www.biostars.org/p/138116/\n",
- "n_aligned_reads = !samtools view -F 0x4 $BAM_PATH | cut -f 1 | sort | uniq | wc -l\n",
- "n_aligned_reads = int(n_aligned_reads[0])"
+ "pileup_lines = !wc -l $PILEUP_PATH\n",
+ "n_aligned_reads = int(pileup_lines[0].split(\" \")[0])"
]
},
{
@@ -124,7 +130,7 @@
" depth_table.append(\n",
" {\"reference\": row[0], \"position\": int(row[1]), \"depth\": int(row[2])}\n",
" )\n",
- "depth_table = pd.DataFrame(depth_table)"
+ "depth_table = pd.DataFrame(depth_table, columns=[\"reference\", \"position\", \"depth\"])"
]
},
{
@@ -133,19 +139,22 @@
"metadata": {},
"outputs": [],
"source": [
- "# calculate genome coverage\n",
- "# (what percent of bases are coveraged at X coverage)\n",
- "min_depth = 1\n",
- "reference_length = len(reference[0])\n",
+ "# calculate genome coverage (what percent of bases are coveraged at X coverage)\n",
+ "# Use a fixed reference length that we use for `samtools depth` above\n",
+ "reference_length = 29850\n",
"\n",
"covered_sites = set()\n",
+ "covered_sites_10x = set()\n",
"\n",
"for _, row in depth_table.iterrows():\n",
" row = row.to_dict()\n",
- " if row[\"depth\"] >= min_depth:\n",
+ " if row[\"depth\"] >= 1:\n",
" covered_sites.add(row[\"position\"])\n",
+ " if row[\"depth\"] >= 10:\n",
+ " covered_sites_10x.add(row[\"position\"]) \n",
"\n",
- "cov = len(covered_sites) / reference_length"
+ "cov = len(covered_sites) / reference_length\n",
+ "cov_10x = len(covered_sites_10x) / reference_length"
]
},
{
@@ -167,16 +176,8 @@
" {\"position\": i, \"depth\": window[\"depth\"].mean(),}\n",
" )\n",
"\n",
- "binned_depths = pd.DataFrame(binned_depths)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "depth = depth_table[\"depth\"].mean()"
+ "binned_depths = pd.DataFrame(binned_depths)\n",
+ "depth = depth_table[\"depth\"].mean() if not depth_table.empty else 0"
]
},
{
@@ -218,59 +219,13 @@
"{sample_filename}. \n",
"This sample contained {n_reads:,} reads, with\n",
"{n_aligned_reads:,} mapping to the \n",
- "reference. \n",
- "Reads cover {cov:.1%} of the SARS-CoV-2 genome, with a mean depth of {depth:.1f}x.\n",
+ "reference. \n",
+ "Reads cover {cov:.0%} of the genome ({cov_10x:.0%} over 10×), with a mean depth of {depth:.1f}×.\n",
"A total of {n_snps} variant{'s were' if n_snps != 1 else 'was'} detected.\"\"\"\n",
"\n",
"HTML(text)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# An updated theme not yet in onecodex v0.7.2\n",
- "def onecodex_theme_alt():\n",
- " onecodex_palette = [\n",
- " \"#ffffcc\",\n",
- " \"#c7e9b4\",\n",
- " \"#7fcdbb\",\n",
- " \"#41b6c4\",\n",
- " \"#2c7fb8\",\n",
- " \"#264153\",\n",
- " ]\n",
- "\n",
- " return {\n",
- " \"config\": {\n",
- " \"range\": {\"heatmap\": list(reversed(onecodex_palette))},\n",
- " \"axis\": {\n",
- " \"labelFont\": \"Palatino\",\n",
- " \"labelFontSize\": 12,\n",
- " \"titleFont\": \"Palatino\",\n",
- " \"titleFontSize\": 12,\n",
- " \"grid\": False,\n",
- " },\n",
- " \"area\": {\"fill\": \"#128887\",},\n",
- " \"legend\": {\n",
- " \"labelFont\": \"Palatino\",\n",
- " \"labelFontSize\": 12,\n",
- " \"titleFont\": \"Palatino\",\n",
- " \"titleFontSize\": 12,\n",
- " },\n",
- " \"title\": {\"font\": \"Palatino\"},\n",
- " \"view\": {\"width\": 400, \"height\": 400, \"strokeWidth\": 0},\n",
- " \"background\": \"transparent\",\n",
- " }\n",
- " }\n",
- "\n",
- "\n",
- "alt.themes.register(\"onecodex2\", onecodex_theme_alt)\n",
- "alt.themes.enable(\"onecodex2\")\n",
- "None"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -291,7 +246,7 @@
" y=alt.Y(\"rolling_mean:Q\", scale=alt.Scale(type=\"linear\"), title=\"Depth\"),\n",
" )\n",
" .properties(\n",
- " title=f\"SARS-CoV-2 ({os.path.basename(REFERENCE_PATH).rstrip('.gbk')})\",\n",
+ " title=f\"SARS-CoV-2 ({REFERENCE_NAME})\",\n",
" width=550,\n",
" height=150,\n",
" )\n",
@@ -305,6 +260,10 @@
"metadata": {},
"outputs": [],
"source": [
+ "DEPTH_FILTER = 10\n",
+ "if not snp_table.empty and snp_table.loc[snp_table[\"ALT_DP\"] >= DEPTH_FILTER, :].shape[0] == 0:\n",
+ " DEPTH_FILTER = 1\n",
+ "\n",
"if snp_table.empty:\n",
" table = pd.DataFrame(columns=[\"POS\", \"Variant\", \"Gene\"])\n",
" table = HTML(\n",
@@ -314,10 +273,11 @@
" snp_table[\"Position\"] = snp_table[\"POS\"]\n",
" snp_table[\"Variant\"] = [f\"{r['REF']} → {r['ALT']}\" for _, r in snp_table.iterrows()]\n",
" snp_table[\"Depth\"] = snp_table[\"ALT_DP\"].apply(lambda x: f\"{x}×\")\n",
+ " snp_table[\"Frequency\"] = snp_table[\"ALT_FREQ\"].apply(lambda x: f\"{x * 100:.0f}%\")\n",
"\n",
" # snp_table[\"Gene\"] = \"TODO\"\n",
" table = snp_table.loc[\n",
- " snp_table[\"ALT_DP\"] >= 10, [\"Position\", \"Variant\", \"Depth\"]\n",
+ " snp_table[\"ALT_DP\"] >= DEPTH_FILTER, [\"Position\", \"Variant\", \"Depth\", \"Frequency\"]\n",
" ] # , \"Gene\"]]\n",
" table = HTML(table.to_html(index=False))\n",
"table"
@@ -331,8 +291,10 @@
"source": [
"legend_text = \"SARS-CoV-2 variants.\"\n",
"n_extra_variants = (\n",
- " snp_table[snp_table[\"ALT_DP\"] < 10].shape[0] if not snp_table.empty else 0\n",
+ " snp_table[snp_table[\"ALT_DP\"] < DEPTH_FILTER].shape[0] if not snp_table.empty else 0\n",
")\n",
+ "if DEPTH_FILTER == 1:\n",
+ " legend_text += \" Use caution in interpreting low depth variants!\"\n",
"if n_extra_variants:\n",
" legend_text += f\" An additional {n_extra_variants} variant{'s' if n_extra_variants > 1 else ''} <10× depth {'are' if n_extra_variants > 1 else 'is'} not shown.\"\n",
"if os.environ.get(\"ONE_CODEX_REPORT_UUID\"):\n",
@@ -384,7 +346,11 @@
"outputs": [],
"source": [
"# Save a JSON too, including filtered variants <10x\n",
- "results = [r.to_dict() for _, r in snp_table.iterrows()]\n",
+ "results = {\n",
+ " \"report_id\": os.environ.get(\"ONE_CODEX_REPORT_UUID\"), \n",
+ " \"sample_id\": os.environ.get(\"ONE_CODEX_SAMPLE_UUID\"),\n",
+ " \"variants\": [r.to_dict() for _, r in snp_table.iterrows()]\n",
+ "}\n",
"\n",
"with open(\"results.json\", \"w\") as f:\n",
" json.dump(results, f)"
@@ -397,7 +363,7 @@
"outputs": [],
"source": [
"# Clean up files\n",
- "!rm -f {sample.filename} snps.depth variants.log covid19.bam"
+ "!rm -f {sample.filename} snps.depth variants.log covid19.pileup "
]
},
{