From 9585ea4741b5d483853bc0be1e5f98834d9fcf3a Mon Sep 17 00:00:00 2001 From: Nick Greenfield Date: Tue, 31 Mar 2020 00:29:33 -0700 Subject: [PATCH] Tweak report to use latest onecodex library, add cov >10x depth --- covid19_call_variants.sh | 1 + report.ipynb | 122 ++++++++++++++------------------------- 2 files changed, 45 insertions(+), 78 deletions(-) diff --git a/covid19_call_variants.sh b/covid19_call_variants.sh index c595632..5792ccf 100755 --- a/covid19_call_variants.sh +++ b/covid19_call_variants.sh @@ -117,6 +117,7 @@ sed \ mv "${prefix}.consensus.fa" "consensus.fa" mv "${prefix}.ivar.tsv" "variants.tsv" mv "${prefix}.sorted.bam" "covid19.bam" +mv "${prefix}.sorted.bam.bai" "covid19.bam.bai" mv "${prefix}.pileup" "covid19.pileup" rm "${prefix}"* diff --git a/report.ipynb b/report.ipynb index 1f33c86..ff941b9 100644 --- a/report.ipynb +++ b/report.ipynb @@ -6,14 +6,19 @@ "metadata": {}, "outputs": [], "source": [ + "import binascii\n", "import json\n", "import os\n", + "import sys\n", + "\n", + "from io import BytesIO\n", + "\n", "import pandas as pd\n", "import altair as alt\n", - "from Bio import SeqIO\n", "\n", + "from altair_saver import save\n", + "from Bio import SeqIO\n", "from IPython.display import HTML\n", - "\n", "from onecodex import Api\n", "from onecodex.notebooks.report import set_style, title" ] @@ -55,10 +60,12 @@ "# note our DB actually has v1 of this assembly as of Feb 2020\n", "VARIANTS_TSV_PATH = \"variants.tsv\"\n", "BAM_PATH = \"covid19.bam\"\n", + "PILEUP_PATH = \"covid19.pileup\"\n", "REFERENCE_PATH = os.environ.get(\n", " \"FASTA_REFERENCE\", \"reference/nCoV-2019.reference.fasta\"\n", ")\n", - "BED_FILE_PATH = os.environ.get(\"BED_FILE_PATH\", \"reference/artic-v1/ARTIC-V1.bed\")" + "BED_FILE_PATH = os.environ.get(\"BED_FILE_PATH\", \"reference/artic-v1/ARTIC-V1.bed\")\n", + "REFERENCE_NAME = os.path.basename(REFERENCE_PATH).rstrip('.fasta')" ] }, { @@ -86,8 +93,8 @@ "metadata": {}, "outputs": [], "source": [ - "# calculate mapping depth\n", - "!samtools depth $BAM_PATH > snps.depth" + "# calculate mapping depth, trim another 20bp to avoid mapping to polyA tail in depth calcs\n", + "!samtools depth -r MN908947.3:1-29850 -Q 20 $BAM_PATH > snps.depth 2> /dev/null" ] }, { @@ -105,9 +112,8 @@ "metadata": {}, "outputs": [], "source": [ - "# see https://www.biostars.org/p/138116/\n", - "n_aligned_reads = !samtools view -F 0x4 $BAM_PATH | cut -f 1 | sort | uniq | wc -l\n", - "n_aligned_reads = int(n_aligned_reads[0])" + "pileup_lines = !wc -l $PILEUP_PATH\n", + "n_aligned_reads = int(pileup_lines[0].split(\" \")[0])" ] }, { @@ -124,7 +130,7 @@ " depth_table.append(\n", " {\"reference\": row[0], \"position\": int(row[1]), \"depth\": int(row[2])}\n", " )\n", - "depth_table = pd.DataFrame(depth_table)" + "depth_table = pd.DataFrame(depth_table, columns=[\"reference\", \"position\", \"depth\"])" ] }, { @@ -133,19 +139,22 @@ "metadata": {}, "outputs": [], "source": [ - "# calculate genome coverage\n", - "# (what percent of bases are coveraged at X coverage)\n", - "min_depth = 1\n", - "reference_length = len(reference[0])\n", + "# calculate genome coverage (what percent of bases are coveraged at X coverage)\n", + "# Use a fixed reference length that we use for `samtools depth` above\n", + "reference_length = 29850\n", "\n", "covered_sites = set()\n", + "covered_sites_10x = set()\n", "\n", "for _, row in depth_table.iterrows():\n", " row = row.to_dict()\n", - " if row[\"depth\"] >= min_depth:\n", + " if row[\"depth\"] >= 1:\n", " covered_sites.add(row[\"position\"])\n", + " if row[\"depth\"] >= 10:\n", + " covered_sites_10x.add(row[\"position\"]) \n", "\n", - "cov = len(covered_sites) / reference_length" + "cov = len(covered_sites) / reference_length\n", + "cov_10x = len(covered_sites_10x) / reference_length" ] }, { @@ -167,16 +176,8 @@ " {\"position\": i, \"depth\": window[\"depth\"].mean(),}\n", " )\n", "\n", - "binned_depths = pd.DataFrame(binned_depths)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "depth = depth_table[\"depth\"].mean()" + "binned_depths = pd.DataFrame(binned_depths)\n", + "depth = depth_table[\"depth\"].mean() if not depth_table.empty else 0" ] }, { @@ -218,59 +219,13 @@ "{sample_filename}. \n", "This sample contained {n_reads:,} reads, with\n", "{n_aligned_reads:,} mapping to the \n", - "reference. \n", - "Reads cover {cov:.1%} of the SARS-CoV-2 genome, with a mean depth of {depth:.1f}x.\n", + "reference. \n", + "Reads cover {cov:.0%} of the genome ({cov_10x:.0%} over 10×), with a mean depth of {depth:.1f}×.\n", "A total of {n_snps} variant{'s were' if n_snps != 1 else 'was'} detected.\"\"\"\n", "\n", "HTML(text)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# An updated theme not yet in onecodex v0.7.2\n", - "def onecodex_theme_alt():\n", - " onecodex_palette = [\n", - " \"#ffffcc\",\n", - " \"#c7e9b4\",\n", - " \"#7fcdbb\",\n", - " \"#41b6c4\",\n", - " \"#2c7fb8\",\n", - " \"#264153\",\n", - " ]\n", - "\n", - " return {\n", - " \"config\": {\n", - " \"range\": {\"heatmap\": list(reversed(onecodex_palette))},\n", - " \"axis\": {\n", - " \"labelFont\": \"Palatino\",\n", - " \"labelFontSize\": 12,\n", - " \"titleFont\": \"Palatino\",\n", - " \"titleFontSize\": 12,\n", - " \"grid\": False,\n", - " },\n", - " \"area\": {\"fill\": \"#128887\",},\n", - " \"legend\": {\n", - " \"labelFont\": \"Palatino\",\n", - " \"labelFontSize\": 12,\n", - " \"titleFont\": \"Palatino\",\n", - " \"titleFontSize\": 12,\n", - " },\n", - " \"title\": {\"font\": \"Palatino\"},\n", - " \"view\": {\"width\": 400, \"height\": 400, \"strokeWidth\": 0},\n", - " \"background\": \"transparent\",\n", - " }\n", - " }\n", - "\n", - "\n", - "alt.themes.register(\"onecodex2\", onecodex_theme_alt)\n", - "alt.themes.enable(\"onecodex2\")\n", - "None" - ] - }, { "cell_type": "code", "execution_count": null, @@ -291,7 +246,7 @@ " y=alt.Y(\"rolling_mean:Q\", scale=alt.Scale(type=\"linear\"), title=\"Depth\"),\n", " )\n", " .properties(\n", - " title=f\"SARS-CoV-2 ({os.path.basename(REFERENCE_PATH).rstrip('.gbk')})\",\n", + " title=f\"SARS-CoV-2 ({REFERENCE_NAME})\",\n", " width=550,\n", " height=150,\n", " )\n", @@ -305,6 +260,10 @@ "metadata": {}, "outputs": [], "source": [ + "DEPTH_FILTER = 10\n", + "if not snp_table.empty and snp_table.loc[snp_table[\"ALT_DP\"] >= DEPTH_FILTER, :].shape[0] == 0:\n", + " DEPTH_FILTER = 1\n", + "\n", "if snp_table.empty:\n", " table = pd.DataFrame(columns=[\"POS\", \"Variant\", \"Gene\"])\n", " table = HTML(\n", @@ -314,10 +273,11 @@ " snp_table[\"Position\"] = snp_table[\"POS\"]\n", " snp_table[\"Variant\"] = [f\"{r['REF']} → {r['ALT']}\" for _, r in snp_table.iterrows()]\n", " snp_table[\"Depth\"] = snp_table[\"ALT_DP\"].apply(lambda x: f\"{x}×\")\n", + " snp_table[\"Frequency\"] = snp_table[\"ALT_FREQ\"].apply(lambda x: f\"{x * 100:.0f}%\")\n", "\n", " # snp_table[\"Gene\"] = \"TODO\"\n", " table = snp_table.loc[\n", - " snp_table[\"ALT_DP\"] >= 10, [\"Position\", \"Variant\", \"Depth\"]\n", + " snp_table[\"ALT_DP\"] >= DEPTH_FILTER, [\"Position\", \"Variant\", \"Depth\", \"Frequency\"]\n", " ] # , \"Gene\"]]\n", " table = HTML(table.to_html(index=False))\n", "table" @@ -331,8 +291,10 @@ "source": [ "legend_text = \"SARS-CoV-2 variants.\"\n", "n_extra_variants = (\n", - " snp_table[snp_table[\"ALT_DP\"] < 10].shape[0] if not snp_table.empty else 0\n", + " snp_table[snp_table[\"ALT_DP\"] < DEPTH_FILTER].shape[0] if not snp_table.empty else 0\n", ")\n", + "if DEPTH_FILTER == 1:\n", + " legend_text += \" Use caution in interpreting low depth variants!\"\n", "if n_extra_variants:\n", " legend_text += f\" An additional {n_extra_variants} variant{'s' if n_extra_variants > 1 else ''} <10× depth {'are' if n_extra_variants > 1 else 'is'} not shown.\"\n", "if os.environ.get(\"ONE_CODEX_REPORT_UUID\"):\n", @@ -384,7 +346,11 @@ "outputs": [], "source": [ "# Save a JSON too, including filtered variants <10x\n", - "results = [r.to_dict() for _, r in snp_table.iterrows()]\n", + "results = {\n", + " \"report_id\": os.environ.get(\"ONE_CODEX_REPORT_UUID\"), \n", + " \"sample_id\": os.environ.get(\"ONE_CODEX_SAMPLE_UUID\"),\n", + " \"variants\": [r.to_dict() for _, r in snp_table.iterrows()]\n", + "}\n", "\n", "with open(\"results.json\", \"w\") as f:\n", " json.dump(results, f)" @@ -397,7 +363,7 @@ "outputs": [], "source": [ "# Clean up files\n", - "!rm -f {sample.filename} snps.depth variants.log covid19.bam" + "!rm -f {sample.filename} snps.depth variants.log covid19.pileup " ] }, {