Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev 0.7.6 - Notebook updates #285

Merged
merged 23 commits into from
Oct 3, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
8e7ec05
notebook: use heatmap to depict COG distribution
matinnuhamunada Sep 26, 2023
b068d5a
notebook: enrich deeptf with faa annotation
matinnuhamunada Sep 26, 2023
6f8117a
notebook: generate graphml file for cytoscape
matinnuhamunada Sep 27, 2023
3684896
fix: correct notebook links and display
matinnuhamunada Sep 27, 2023
403a349
feat: colorise bigscape class and add knownclusterblast
matinnuhamunada Sep 27, 2023
975ea78
fix: cleanup unused cell
matinnuhamunada Sep 27, 2023
e99f4d1
feat: extract ARTS 4 tables
matinnuhamunada Sep 29, 2023
07f7522
fix: correct new arts output format
matinnuhamunada Sep 29, 2023
0936781
fix: update rule for arts output and notebook
matinnuhamunada Sep 29, 2023
67a8dd9
test: update GTDB API result
matinnuhamunada Sep 29, 2023
75cdd38
test: update expected output for arts extract
matinnuhamunada Sep 29, 2023
d0c4cef
test: merge arts results
matinnuhamunada Sep 29, 2023
b261a0c
test: add missing expected duptable
matinnuhamunada Sep 29, 2023
cc24be6
test: add missing config and symlink
matinnuhamunada Sep 29, 2023
2989d76
test: add final step of arts
matinnuhamunada Sep 29, 2023
7bf99b8
test: add config
matinnuhamunada Sep 30, 2023
34ade74
feat: annotate bigfam models
matinnuhamunada Sep 30, 2023
c3e0474
fix: refrain using directory in params
matinnuhamunada Sep 30, 2023
aef96dc
fix: correct shell script
matinnuhamunada Sep 30, 2023
9e4de6d
chore: update java requirement for metabase
matinnuhamunada Oct 3, 2023
0c11c77
notebook: add instruction for cblaster-bgc
matinnuhamunada Oct 3, 2023
e057e48
chore: remove unused notebooks
matinnuhamunada Oct 3, 2023
d32336d
chore: bump version 0.7.6
matinnuhamunada Oct 3, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix: update rule for arts output and notebook
  • Loading branch information
matinnuhamunada committed Sep 29, 2023
commit 0936781a1a9c6ae2ec607f1cd3b9d1ca6c2da3fa
268 changes: 207 additions & 61 deletions workflow/notebook/arts.py.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,10 @@
},
"outputs": [],
"source": [
"report_dir = Path(\"../\")\n",
"\n",
"dependency_version = report_dir / \"metadata/dependency_versions.json\"\n",
"with open(dependency_version, \"r\") as file:\n",
" dependency_version = json.load(file)\n",
"antismash_version = dependency_version[\"antismash\"]\n",
"\n",
"df_arts = pd.read_csv(report_dir / f\"tables/df_arts_bgctable_as-{antismash_version}.csv\")"
"with open(\"../metadata/project_metadata.json\", \"r\") as f:\n",
" project_configuration = json.load(f)\n",
"with open(\"../metadata/dependency_versions.json\", \"r\") as f:\n",
" dependency_version = json.load(f)"
]
},
{
Expand All @@ -72,6 +68,33 @@
"tags": []
},
"outputs": [],
"source": [
"project_name = [i for i in project_configuration.keys()][0]\n",
"antismash_version = dependency_version[\"antismash\"]\n",
"report_dir = Path(\"../\")\n",
"\n",
"df_arts = pd.read_csv(report_dir / f\"tables/df_arts_bgctable_as-{antismash_version}.csv\")\n",
"df_bgcs = pd.read_csv(report_dir / f\"tables/df_regions_antismash_{antismash_version}.csv\", index_col=0)\n",
"df_tax = pd.read_csv(report_dir / \"tables/df_gtdb_meta.csv\", index_col=0)\n",
"df_arts_core = pd.read_csv(report_dir / f\"tables/df_arts_coretable_as-{antismash_version}.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fec0845a-c584-4e67-84bc-39ba9b7401ba",
"metadata": {},
"outputs": [],
"source": [
"df_arts_hits = pd.read_csv(report_dir / f\"tables/df_arts_allhits_as-{antismash_version}.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f869f1a3-9696-438f-8858-36f39824d192",
"metadata": {},
"outputs": [],
"source": [
"result = {}\n",
"ctr = 1\n",
Expand All @@ -91,11 +114,49 @@
" 'description' : hits[5],\n",
" 'function' : hits[6]\n",
" }\n",
" if arts_hits['function'] == \"N/A\":\n",
" arts_hits['function'] = \"ResModel\"\n",
" result[ctr] = arts_hits\n",
" ctr = ctr + 1\n",
"df_hits = pd.DataFrame.from_dict(result).T\n",
"\n",
"display(HTML(DT(df_hits, columnDefs=[{\"className\": \"dt-center\", \"targets\": \"_all\"}],)))"
"df_hits = pd.DataFrame.from_dict(result).T"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc7e3412-5340-4a21-a238-0d261471d479",
"metadata": {},
"outputs": [],
"source": [
"function_map = df_hits.set_index(\"profile\").loc[:, \"function\"].to_dict()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a617a31-bee0-4fee-8dba-0665036e73b2",
"metadata": {},
"outputs": [],
"source": [
"for i in df_arts_hits.index:\n",
" profile = df_arts_hits.loc[i, \"core_gene_or_model\"]\n",
" if profile in function_map:\n",
" df_arts_hits.loc[i, \"function\"] = function_map[profile]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fc3762d-97fa-4e80-9e4e-f31644f93d22",
"metadata": {},
"outputs": [],
"source": [
"# select boolean columns\n",
"bool_cols = df_arts_hits.select_dtypes(include=['bool'])\n",
"# Summing True values row-wise and creating a new column 'true_count'\n",
"df_arts_hits['hits'] = bool_cols.sum(axis=1)\n",
"display(HTML(DT(df_arts_hits.rename(columns={\"core_gene_or_model\" : \"profile\"}).loc[:, ['profile', 'name', 'product', 'function', 'hits', 'duplication', 'phylogeny', 'known_target', 'bgc_proximity', 'bgc_id',\n",
" 'genome_id', 'scaffold', 'start', 'stop']], columnDefs=[{\"className\": \"dt-center\", \"targets\": \"_all\"}],)))"
]
},
{
Expand All @@ -106,19 +167,36 @@
"tags": []
},
"outputs": [],
"source": [
"summary_report = f\"A total of {len(df_hits.bgc_id.unique())} BGCs from {len(df_hits.genome_id.unique())} genomes have hits with {len(df_hits.profile.unique())} ARTS2 profile.\"\n",
"summary_report"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df3a2b07-ad54-478f-b0ea-06e157a8f563",
"metadata": {},
"outputs": [],
"source": [
"color = [\"#264653\", \"#287271\", \"#2a9d8f\", \"#8ab17d\", \"#e9c46a\", \"#f4a261\", \"#ee8959\", \"#e76f51\"]\n",
"function_map = df_hits.function.value_counts().to_dict()\n",
"arts_function_color_map = {}\n",
"for num, item in enumerate(function_map.keys()):\n",
" if num+1 > len(color):\n",
"ctr = 0\n",
"for item in function_map.keys():\n",
" if item == \"Unclassified\":\n",
" arts_function_color_map[item] = 'grey'\n",
" ctr = ctr - 1\n",
" elif ctr+1 > len(color):\n",
" arts_function_color_map[item] = 'grey'\n",
" elif item == \"ResModel\":\n",
" arts_function_color_map[item] = 'red'\n",
" else:\n",
" arts_function_color_map[item] = color[num]\n",
"arts_function_color_map['unclassified'] = \"grey\"\n",
" arts_function_color_map[item] = color[ctr]\n",
" ctr = ctr + 1\n",
"\n",
"arts_node_mapping = df_hits.loc[:, [\"profile\", \"description\", \"function\"]].drop_duplicates().set_index(\"profile\", drop=False)\n",
"bgc_id_mapping = df_hits.loc[:, [\"bgc_id\", \"bgc_type\", \"genome_id\"]].drop_duplicates().set_index(\"bgc_id\", drop=False)\n",
"bgc_id_mapping = df_hits.loc[:, [\"bgc_id\", \"bgc_type\", \"genome_id\"]].drop_duplicates().set_index(\"bgc_id\", drop=False)#.T.to_dict()\n",
"\n",
"for c in [\"bgc_id\", \"bgc_type\", \"genome_id\"]:\n",
" arts_node_mapping[c] = None\n",
Expand All @@ -133,38 +211,47 @@
{
"cell_type": "code",
"execution_count": null,
"id": "df3a2b07-ad54-478f-b0ea-06e157a8f563",
"id": "bca39649-4537-4324-9746-7609c3633894",
"metadata": {},
"outputs": [],
"source": [
"color_map = []\n",
"G = nx.from_pandas_edgelist(df_hits, source='bgc_id', target='profile')\n",
"\n",
"pos = nx.nx_agraph.graphviz_layout(G, prog=\"sfdp\")\n",
"pos = nx.nx_agraph.graphviz_layout(G)\n",
"\n",
"region_score = df_arts_hits.loc[:, [\"bgc_id\", \"hits\"]].dropna().set_index(\"bgc_id\").to_dict()['hits']\n",
"for g in G.nodes:\n",
" # annotate ARTS model\n",
" if g in arts_node_mapping.keys():\n",
" for column in arts_node_mapping[g].keys():\n",
" attrib = str(column)\n",
" G.nodes[g][attrib] = arts_node_mapping[g][attrib]\n",
" color = arts_function_color_map[G.nodes[g]['function']]\n",
" color_map.append(color)\n",
" G.nodes[g][\"color\"] = color\n",
" G.nodes[g][\"node_type\"] = \"arts_model\"\n",
" G.nodes[g][\"node_type\"] = G.nodes[g][\"function\"]\n",
" G.nodes[g][\"text\"] = f\"{G.nodes[g]['profile']}<br>{G.nodes[g]['function']}<br>{G.nodes[g]['description']}\"\n",
" # annotate BGCs\n",
" elif g in bgc_id_mapping.keys():\n",
" for column in bgc_id_mapping[g].keys():\n",
" attrib = str(column)\n",
" G.nodes[g][attrib] = bgc_id_mapping[g][attrib]\n",
" color = \"blue\"\n",
" color_map.append(color)\n",
" G.nodes[g][\"color\"] = color\n",
" G.nodes[g][\"node_type\"] = \"bgc_region\""
" G.nodes[g][\"node_type\"] = \"BGC\"\n",
" taxonomy = df_tax.loc[G.nodes[g]['genome_id'], \"Organism\"]\n",
" G.nodes[g][\"text\"] = f\"{G.nodes[g]['bgc_id']}<br>{G.nodes[g]['bgc_type']}<br>{G.nodes[g]['genome_id']}<br>{taxonomy}\"\n",
" if region_score[g] > 1:\n",
" G.nodes[g][\"node_type\"] = f\"BGC_with_{region_score[g]}_ARTS_hits\"\n",
" G.nodes[g][\"text\"] = G.nodes[g][\"text\"] + \"<br>\" + f\"ARTS hits: {region_score[g]}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bca39649-4537-4324-9746-7609c3633894",
"id": "21003596-1937-4837-9bd4-f8cc52df7a00",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -175,13 +262,14 @@
{
"cell_type": "code",
"execution_count": null,
"id": "21003596-1937-4837-9bd4-f8cc52df7a00",
"id": "881256ce-f0dd-4f5b-88b9-b17aa360b7f0",
"metadata": {},
"outputs": [],
"source": [
"edge_trace = go.Scatter(\n",
" x=[],\n",
" y=[],\n",
" name=\"ARTS2 hit\",\n",
" line=dict(width=0.5,color='#888'),\n",
" hoverinfo='none',\n",
" mode='lines')\n",
Expand All @@ -196,46 +284,113 @@
{
"cell_type": "code",
"execution_count": null,
"id": "881256ce-f0dd-4f5b-88b9-b17aa360b7f0",
"id": "1a0d9c3d-62b6-4c39-a7da-1ae08af383ff",
"metadata": {},
"outputs": [],
"source": [
"node_trace = go.Scatter(\n",
" x=[],\n",
" y=[],\n",
" text=[],\n",
" mode='markers',\n",
" hoverinfo='text',\n",
" marker=dict(\n",
" showscale=False,\n",
" color=color_map,\n",
" size=10,\n",
" line=dict(width=0)))\n",
"def create_node_trace(G, node_type, shape=\"circle\", opacity=0.8, linewidth=0, linecolor=\"red\"):\n",
" node_color = []\n",
" for node in G.nodes():\n",
" if G.nodes[node]['node_type'] == node_type:\n",
" node_color.append(G.nodes[node]['color']) \n",
"\n",
"for node in G.nodes():\n",
" x, y = G.nodes[node]['pos']\n",
" node_trace['x'] += tuple([x])\n",
" node_trace['y'] += tuple([y])"
" node_trace = go.Scatter(\n",
" ids=[],\n",
" x=[],\n",
" y=[],\n",
" name=node_type,\n",
" text=[],\n",
" mode='markers',\n",
" hoverinfo='text',\n",
" marker_symbol=shape,\n",
" opacity=opacity,\n",
" marker=dict(\n",
" showscale=False,\n",
" color=node_color,\n",
" size=10,\n",
" line=dict(width=linewidth,\n",
" color=linecolor)))\n",
"\n",
" for node in G.nodes():\n",
" if G.nodes[node]['node_type'] == node_type:\n",
" x, y = G.nodes[node]['pos']\n",
" node_trace['ids'] += tuple([node])\n",
" node_trace['x'] += tuple([x])\n",
" node_trace['y'] += tuple([y])\n",
"\n",
" for node, adjacencies in enumerate(G.adjacency()):\n",
" if G.nodes[adjacencies[0]]['node_type'] == node_type:\n",
" node_trace['marker']['color']+=tuple([len(adjacencies[1])])\n",
" node_type = G.nodes[adjacencies[0]]['node_type']\n",
" description = G.nodes[adjacencies[0]]['bgc_type']\n",
" node_info = G.nodes[adjacencies[0]]['text']\n",
" node_trace['text']+=tuple([node_info])\n",
" \n",
" return node_trace"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a0d9c3d-62b6-4c39-a7da-1ae08af383ff",
"id": "c00ce940-e8b1-4fb5-b2d0-f3f61a80b669",
"metadata": {},
"outputs": [],
"source": [
"node_traces = list(function_map.keys())\n",
"node_traces.append(\"BGC\")\n",
"for i in range(4):\n",
" node_traces.append(f\"BGC_with_{i}_ARTS_hits\")\n",
"\n",
"traces = [edge_trace]\n",
"for trace in node_traces:\n",
" shape = \"square\"\n",
" linewidth = 0\n",
" linecolor = \"black\"\n",
" if trace == \"BGC\":\n",
" shape = \"circle\"\n",
" elif trace == \"BGC_with_2_ARTS_hits\":\n",
" shape = \"circle\"\n",
" linewidth = 1\n",
" linecolor = \"orange\"\n",
" elif trace == \"BGC_with_3_ARTS_hits\":\n",
" shape = \"circle\"\n",
" linewidth = 3\n",
" linecolor = \"red\"\n",
" elif trace == \"BGC_with_4_ARTS_hits\":\n",
" shape = \"circle\"\n",
" linewidth = 5\n",
" linecolor = \"red\"\n",
" elif trace == \"ResModel\":\n",
" shape = \"star\"\n",
" new_trace = create_node_trace(G, trace, shape=shape, linewidth=linewidth, linecolor=linecolor)\n",
" traces.append(new_trace)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1841d93-007f-4130-a22f-407a40059ca7",
"metadata": {},
"outputs": [],
"source": [
"for node, adjacencies in enumerate(G.adjacency()):\n",
" node_trace['marker']['color']+=tuple([len(adjacencies[1])])\n",
" node_type = G.nodes[adjacencies[0]]['node_type']\n",
" if node_type == \"bgc_region\":\n",
" description = G.nodes[adjacencies[0]]['bgc_type']\n",
" function = \"\"\n",
" elif node_type == \"arts_model\":\n",
" description = G.nodes[adjacencies[0]]['description']\n",
" function = G.nodes[adjacencies[0]]['function']\n",
" node_info = f\"{adjacencies[0]}<br>{G.nodes[adjacencies[0]]['node_type']}<br>{description}<br>{function}\"\n",
" node_trace['text']+=tuple([node_info])"
"fig = go.Figure(data=traces,\n",
" layout=go.Layout(\n",
" paper_bgcolor='rgba(0,0,0,0)',\n",
" plot_bgcolor='rgba(0,0,0,0)',\n",
" showlegend=True,\n",
" hovermode='closest',\n",
" margin=dict(b=20,l=5,r=5,t=40),\n",
" xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n",
" yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n",
" width=750, height=900))\n",
"\n",
"fig = fig.update_layout(legend=dict(\n",
" orientation=\"h\",\n",
" yanchor=\"top\",\n",
" y=0,\n",
" xanchor=\"left\",\n",
" x=0\n",
"))"
]
},
{
Expand All @@ -245,16 +400,7 @@
"metadata": {},
"outputs": [],
"source": [
"fig = go.Figure(data=[edge_trace, node_trace],\n",
" layout=go.Layout(\n",
" showlegend=False,\n",
" hovermode='closest',\n",
" margin=dict(b=20,l=5,r=5,t=40),\n",
" xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n",
" yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n",
" width=600, height=600))\n",
"\n",
"outfile = Path(\"assets/figures/arts.html\")\n",
"outfile = Path(f\"assets/figures/arts_as{antismash_version}.html\")\n",
"outfile.parent.mkdir(parents=True, exist_ok=True)\n",
"fig.write_html(outfile)\n",
"\n",
Expand Down Expand Up @@ -320,7 +466,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.18"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ antismash:
Simon Shaw, Alexander M Kloosterman, Zach Charlop-Powers, Gilles P van Weezel,
Marnix H Medema, & Tilmann Weber. Nucleic Acids Research (2021) [doi: 10.1093/nar/gkab335.](https://academic.oup.com/nar/article/49/W1/W29/6274535?login=false)'
arts:
final_output: "data/processed/{name}/tables/df_arts_as-{version}.csv"
final_output: "data/processed/{name}/tables/df_arts_allhits_as-{version}.csv"
description: Run Antibiotic Resistant Target Seeker (ARTS) on samples.
category: Genome Mining
link:
Expand Down
Loading