Merge pull request #1684 from nextstrain/entropy-updates
Entropy panel mk2
Showing 34 changed files with 2,879 additions and 1,094 deletions.
@@ -0,0 +1,156 @@
NEW_ANNOTATION = {
    "nuc": {
        "start": 1,
        "end": 29903,
        "strand": "+"
    },
    "ORF1ab": {
        "gene": "ORF1ab",
        "strand": "+",
        "segments": [
            {"start": 266, "end": 13468, "name": "ORF1a"},
            {"start": 13468, "end": 21555, "name": "ORF1b"}
        ],
        "display_name": "AKA polyprotein PP1ab. -1 ribosomal frameshift. Cleaved to yield 15 nonstructural proteins (NSP1-10, 12-16)"
    },
    "PP1a": {
        "gene": "ORF1ab",
        "start": 266,
        "end": 13483,
        "display_name": "Polyprotein PP1a. Cleaved to yield 11 nonstructural proteins (NSP1-11)"
    },
    "NSP3": {
        "gene": "ORF1ab",
        "color": "#2c7fb8",
        "start": 266 + (819-1)*3,
        "end": 266 + (2763-1)*3 - 1,
        "display_name": "Cleaved from short + long polyproteins",
        "strand": "+",
    },
    "RdRp": {
        "gene": "ORF1ab",
        "color": "#41b6c4",
        # Length is 2796nt (932aa)
        "segments": [
            {  # first segment is before the slip
                "start": 266 + (4393-1)*3,  # 13442
                "end": 13468,
            },
            {
                "start": 13468,
                "end": 13468 + 2796 - 1
            }
        ],
        "display_name": "NSP12; Cleaved from long polyprotein only; I'm not sure if the coordinates are correct, BTW!!!",
        "strand": "+",
    },
    "S": {
        "gene": "Spike",
        "end": 25384,
        "display_name": "structural protein; spike protein; surface glycoprotein",
        "start": 21563,
        "strand": "+",
    },
    "E": {
        "end": 26472,
        "display_name": "ORF4; structural protein; E protein",
        "start": 26245,
        "strand": "+",
        "type": "CDS"
    },
    "M": {
        "end": 27191,
        "start": 26523,
        "strand": "+",
        "gene": "M",
        "display_name": "ORF5; structural protein (membrane glycoprotein)"
    },
    "N": {
        "end": 29533,
        "display_name": "nucleocapsid phosphoprotein (ORF9)",
        "start": 28274,
        "strand": "+",
    },
    "ORF3a": {
        "end": 26220,
        "start": 25393,
        "strand": "+",
    },
    "ORF6": {
        "end": 27387,
        "start": 27202,
        "strand": "+",
    },
    "ORF7a": {
        "end": 27759,
        "start": 27394,
        "strand": "+",
    },
    "ORF7b": {
        "end": 27887,
        "start": 27756,
        "strand": "+",
    },
    "ORF8": {
        "end": 28259,
        "start": 27894,
        "strand": "+",
    },
    "ORF9b": {
        "end": 28577,
        "start": 28284,
        "strand": "+",
    },
}

def a_pos_b(m):
    # Split a mutation string like "D614G" into (ref, position, alt).
    return (m[0], int(m[1:-1]), m[-1])


def recurse(node):
    # Rewrite this node's amino-acid mutations to match NEW_ANNOTATION,
    # then descend into its children.
    mutations = node.get('branch_attrs', {}).get('mutations', {})
    if 'ORF1a' in mutations:
        # ORF1a positions map unchanged onto ORF1ab (and PP1a)
        mutations['ORF1ab'] = [*mutations['ORF1a']]
        mutations['PP1a'] = [*mutations['ORF1a']]
        del mutations['ORF1a']
    if 'ORF1b' in mutations:
        if 'ORF1ab' not in mutations:
            mutations['ORF1ab'] = []
        for m in mutations['ORF1b']:
            # ORF1b is in phase with ORF1a
            a, pos, b = a_pos_b(m)
            mutations['ORF1ab'].append(f"{a}{pos+4401}{b}")
        del mutations['ORF1b']

    # Extract mutations which fall in NSP3
    if 'ORF1ab' in mutations:
        mutations['NSP3'] = []
        for m in mutations['ORF1ab']:
            a, pos, b = a_pos_b(m)
            # relative to PP1ab the coords are 819..2763 (in aa space)
            if pos >= 819 and pos <= 2763:
                mutations['NSP3'].append(f"{a}{pos-819+1}{b}")

    # Extract mutations which fall in RdRp
    if 'ORF1ab' in mutations:
        mutations['RdRp'] = []
        for m in mutations['ORF1ab']:
            a, pos, b = a_pos_b(m)
            # relative to PP1ab the coords are 4393..5324 (in aa space, so don't need to worry about -1 slippage)
            if pos >= 4393 and pos <= 5324:
                mutations['RdRp'].append(f"{a}{pos-4393+1}{b}")

    if "children" in node:
        for child in node["children"]:
            recurse(child)


import json

with open("./data/nextclade_sars-cov-2.json", 'r') as fh:
    dataset = json.load(fh)
recurse(dataset['tree'])
dataset['meta']['genome_annotations'] = NEW_ANNOTATION
dataset['meta']['title'] = 'nCoV with adjusted annotations (use with caution!)'
with open("./datasets/entropy2023/entropy-test-data_ncov.json", 'w') as fh:
    json.dump(dataset, fh, indent=2)
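
For orientation on the hard-coded offsets above: ORF1a spans nucleotides 266..13468, i.e. (13468 - 266 + 1) / 3 = 4401 codons, so an ORF1b amino-acid position p lands at p + 4401 in the concatenated ORF1ab frame, and RdRp/NSP12 positions follow by subtracting its PP1ab start (4393) and adding 1. A minimal sketch of that arithmetic, using the widely reported ORF1b:P314L substitution as an illustrative input (a reviewer's check, not part of this diff):

# Illustrative check of the offsets used in recurse(); not part of the commit.
def a_pos_b(m):
    return (m[0], int(m[1:-1]), m[-1])

orf1a_codons = (13468 - 266 + 1) // 3            # 4401 codons in ORF1a
a, pos, b = a_pos_b("P314L")                     # given in ORF1b coordinates
print(f"{a}{pos + orf1a_codons}{b}")             # P4715L in ORF1ab coordinates
print(f"{a}{pos + orf1a_codons - 4393 + 1}{b}")  # P323L in RdRp/NSP12 coordinates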
@@ -1,84 +1,20 @@
#!/bin/bash

data_files=(
    "dengue_all.json" "dengue_denv1.json" "dengue_denv2.json" "dengue_denv3.json" "dengue_denv4.json" \
    "ebola.json" "ebola_root-sequence.json" \
    "ebola_2019-09-14-no-epi-id_meta.json" "ebola_2019-09-14-no-epi-id_tree.json" \
    "lassa_s_tree.json" "lassa_s_meta.json" \
    "lassa_l_tree.json" "lassa_l_meta.json" \
    "measles.json" \
    "mers_tree.json" "mers_meta.json" \
    "mumps_global.json" "mumps_na.json" \
    "WNV_NA_tree.json" "WNV_NA_meta.json" \
    "entropy-test-data_hepB.json" \
    "entropy-test-data_ncov.json" \
    "zika.json" \
    "tb_global_meta.json" "tb_global_tree.json" \
    "enterovirus_d68_genome_meta.json" "enterovirus_d68_genome_tree.json" \
    "enterovirus_d68_vp1_meta.json" "enterovirus_d68_vp1_tree.json" \
    ############## AVIAN FLU ##############
    "flu_avian_h7n9_ha.json" \
    "flu_avian_h7n9_mp.json" \
    "flu_avian_h7n9_na.json" \
    "flu_avian_h7n9_np.json" \
    "flu_avian_h7n9_ns.json" \
    "flu_avian_h7n9_pa.json" \
    "flu_avian_h7n9_pb1.json" \
    "flu_avian_h7n9_pb2.json" \
    ############## SEASONAL FLU ##############
    "flu_seasonal_h3n2_ha_2y.json" "flu_seasonal_h3n2_ha_2y_tip-frequencies.json" \
    "flu_seasonal_h3n2_ha_3y.json" "flu_seasonal_h3n2_ha_3y_tip-frequencies.json" \
    "flu_seasonal_h3n2_ha_6y.json" "flu_seasonal_h3n2_ha_6y_tip-frequencies.json" \
    "flu_seasonal_h3n2_ha_12y.json" "flu_seasonal_h3n2_ha_12y_tip-frequencies.json" \
    "flu_seasonal_h3n2_na_2y.json" "flu_seasonal_h3n2_na_2y_tip-frequencies.json" \
    "flu_seasonal_h3n2_na_3y.json" "flu_seasonal_h3n2_na_3y_tip-frequencies.json" \
    "flu_seasonal_h3n2_na_6y.json" "flu_seasonal_h3n2_na_6y_tip-frequencies.json" \
    "flu_seasonal_h3n2_na_12y.json" "flu_seasonal_h3n2_na_12y_tip-frequencies.json" \
    "flu_seasonal_h1n1pdm_ha_2y.json" "flu_seasonal_h1n1pdm_ha_2y_tip-frequencies.json" \
    "flu_seasonal_h1n1pdm_ha_3y.json" "flu_seasonal_h1n1pdm_ha_3y_tip-frequencies.json" \
    "flu_seasonal_h1n1pdm_ha_6y.json" "flu_seasonal_h1n1pdm_ha_6y_tip-frequencies.json" \
    "flu_seasonal_h1n1pdm_ha_12y.json" "flu_seasonal_h1n1pdm_ha_12y_tip-frequencies.json" \
    "flu_seasonal_h1n1pdm_ha_pandemic_meta.json" "flu_seasonal_h1n1pdm_ha_pandemic_tree.json" "flu_seasonal_h1n1pdm_ha_pandemic_tip-frequencies.json" \
    "flu_seasonal_h1n1pdm_na_2y.json" "flu_seasonal_h1n1pdm_na_2y_tip-frequencies.json" \
    "flu_seasonal_h1n1pdm_na_3y.json" "flu_seasonal_h1n1pdm_na_3y_tip-frequencies.json" \
    "flu_seasonal_h1n1pdm_na_6y.json" "flu_seasonal_h1n1pdm_na_6y_tip-frequencies.json" \
    "flu_seasonal_h1n1pdm_na_12y.json" "flu_seasonal_h1n1pdm_na_12y_tip-frequencies.json" \
    "flu_seasonal_h1n1pdm_na_pandemic_tree.json" "flu_seasonal_h1n1pdm_na_pandemic_meta.json" "flu_seasonal_h1n1pdm_na_pandemic_tip-frequencies.json" \
    "flu_seasonal_vic_ha_2y.json" "flu_seasonal_vic_ha_2y_tip-frequencies.json" "flu_seasonal_vic_ha_2y_root-sequence.json" \
    "flu_seasonal_vic_ha_3y.json" "flu_seasonal_vic_ha_3y_tip-frequencies.json" "flu_seasonal_vic_ha_3y_root-sequence.json" \
    "flu_seasonal_vic_ha_6y.json" "flu_seasonal_vic_ha_6y_tip-frequencies.json" "flu_seasonal_vic_ha_6y_root-sequence.json" \
    "flu_seasonal_vic_ha_12y.json" "flu_seasonal_vic_ha_12y_tip-frequencies.json" "flu_seasonal_vic_ha_12y_root-sequence.json" \
    "flu_seasonal_vic_na_2y.json" "flu_seasonal_vic_na_2y_tip-frequencies.json" "flu_seasonal_vic_na_2y_root-sequence.json" \
    "flu_seasonal_vic_na_3y.json" "flu_seasonal_vic_na_3y_tip-frequencies.json" "flu_seasonal_vic_na_3y_root-sequence.json" \
    "flu_seasonal_vic_na_6y.json" "flu_seasonal_vic_na_6y_tip-frequencies.json" "flu_seasonal_vic_na_6y_root-sequence.json" \
    "flu_seasonal_vic_na_12y.json" "flu_seasonal_vic_na_12y_tip-frequencies.json" "flu_seasonal_vic_na_12y_root-sequence.json" \
    "flu_seasonal_yam_ha_2y.json" "flu_seasonal_yam_ha_2y_tip-frequencies.json" "flu_seasonal_yam_ha_2y_root-sequence.json" \
    "flu_seasonal_yam_ha_3y.json" "flu_seasonal_yam_ha_3y_tip-frequencies.json" "flu_seasonal_yam_ha_3y_root-sequence.json" \
    "flu_seasonal_yam_ha_6y.json" "flu_seasonal_yam_ha_6y_tip-frequencies.json" "flu_seasonal_yam_ha_6y_root-sequence.json" \
    "flu_seasonal_yam_ha_12y.json" "flu_seasonal_yam_ha_12y_tip-frequencies.json" "flu_seasonal_yam_ha_12y_root-sequence.json" \
    "flu_seasonal_yam_na_2y.json" "flu_seasonal_yam_na_2y_tip-frequencies.json" "flu_seasonal_yam_na_2y_root-sequence.json" \
    "flu_seasonal_yam_na_3y.json" "flu_seasonal_yam_na_3y_tip-frequencies.json" "flu_seasonal_yam_na_3y_root-sequence.json" \
    "flu_seasonal_yam_na_6y.json" "flu_seasonal_yam_na_6y_tip-frequencies.json" "flu_seasonal_yam_na_6y_root-sequence.json" \
    "flu_seasonal_yam_na_12y.json" "flu_seasonal_yam_na_12y_tip-frequencies.json" "flu_seasonal_yam_na_12y_root-sequence.json" \
    ############## LATEST CORE SARS-CoV-2 (COVID-19) BUILDS ##############
    "ncov_gisaid_global.json" "ncov_gisaid_global_tip-frequencies.json" \
    "ncov_gisaid_africa.json" "ncov_gisaid_africa_tip-frequencies.json" \
    "ncov_gisaid_asia.json" "ncov_gisaid_asia_tip-frequencies.json" \
    "ncov_gisaid_europe.json" "ncov_gisaid_europe_tip-frequencies.json" \
    "ncov_gisaid_north-america.json" "ncov_gisaid_north-america_tip-frequencies.json" \
    "ncov_gisaid_oceania.json" "ncov_gisaid_oceania_tip-frequencies.json" \
    "ncov_gisaid_south-america.json" "ncov_gisaid_south-america_tip-frequencies.json" \
    ############## TIMESTAMPED SARS-CoV-2 BUILDS USED IN NARRATIVES ##############
    "ncov_2020-01-23.json" "ncov_2020-01-25.json" "ncov_2020-01-26.json" "ncov_2020-01-30.json" \
    "ncov_2020-03-04.json" "ncov_2020-03-05.json" "ncov_2020-03-11.json" "ncov_2020-03-13.json" \
    "ncov_2020-03-20.json" "ncov_2020-03-27.json" "ncov_2020-04-03.json" \
    "ncov_global_2020-04-09.json" "ncov_north-america_2020-04-17.json" \
    "monkeypox_mpxv.json" \
)

rm -rf data/
mkdir -p data/
for i in "${data_files[@]}"
do
    curl http://data.nextstrain.org/"${i}" --compressed -o data/"${i}"
    curl http://staging.nextstrain.org/"${i}" --compressed -o data/"${i}"
done

echo "Copying the test datasets from test/data to data"
cp -r test/data/*.json data/

echo "The local data directory ./data now contains a selection of up-to-date datasets from http://data.nextstrain.org"