Skip to content

Commit

Permalink
Removes work around from matplotlib/matplotlib#14751 and data visuali…
Browse files Browse the repository at this point in the history
…sation for #83
  • Loading branch information
JamesABaker committed Jan 31, 2020
1 parent 82226cd commit 3130348
Show file tree
Hide file tree
Showing 4 changed files with 289 additions and 55 deletions.
173 changes: 153 additions & 20 deletions scripts/graphs.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from requests import get
from datetime import date
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import scipy.stats as stats
import collections
from datetime import datetime
from scripts.populate_general_functions import impossible_subs
date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')


Expand All @@ -15,7 +17,8 @@ def barchart(objects, performance, source, state, x_label, y_label):
"a": "grey"
}
y_pos = np.arange(len(objects))
plt.bar(y_pos, performance, color=color_dic[state], align='center', alpha=0.5, edgecolor="grey", width=0.3)
plt.bar(y_pos, performance,
color=color_dic[state], align='center', alpha=0.5, edgecolor="grey", width=0.3)
plt.xticks(y_pos, objects)
#plt.xlabel('Residue type')
plt.xlabel(x_label)
Expand All @@ -27,16 +30,108 @@ def barchart(objects, performance, source, state, x_label, y_label):
plt.savefig(filename, bbox_inches='tight')
plt.clf()

def histogram(performance, source, state,x_label, y_label):

def histogram(performance, source, state, x_label, y_label):

plt.yscale('log', nonposy='clip')
plt.hist(performance, bins=len(performance))
plt.gca().set(title=source, ylabel=y_label, xlabel=x_label);
plt.gca().set(title=source, ylabel=y_label, xlabel=x_label)
plt.title(source)
filename = f"images/{source}_{date}.png"
plt.savefig(filename, bbox_inches='tight')
plt.clf()


def impossible_cordinates(aa_order):

impossible_substitutions = impossible_subs()
impossible_aa = []
impossible_x_coordinates = []
impossible_y_coorindates = []
for x_coordinate, x_amino_acid in enumerate(aa_order):
for y_coordinate, y_amino_acid in enumerate(aa_order):
if y_amino_acid in impossible_substitutions[x_amino_acid]:
impossible_x_coordinates.append(x_coordinate)
impossible_y_coorindates.append(y_coordinate)
return(impossible_x_coordinates, impossible_y_coorindates)


def x_histogram(var_freqs_list):
x_values = []
for column_number, frequencies in enumerate(var_freqs_list):
x_coord_total = []
for row in var_freqs_list:
x_coord_total.append(row[column_number])
x_values.append(sum(x_coord_total))
return(x_values)


def y_histogram(var_freqs_list):
y_values = []
for row_number, frequencies in enumerate(var_freqs_list):
y_values.append(sum(var_freqs_list[row_number]))
return(y_values)


def annotate_heatmap(im, data=None, valfmt="{x:.2f}",
textcolors=["black", "white"],
threshold=None, **textkw):
"""
A function to annotate a heatmap.
Parameters
----------
im
The AxesImage to be labeled.
data
Data used to annotate. If None, the image's data is used. Optional.
valfmt
The format of the annotations inside the heatmap. This should either
use the string format method, e.g. "$ {x:.2f}", or be a
`matplotlib.ticker.Formatter`. Optional.
textcolors
A list or array of two color specifications. The first is used for
values below a threshold, the second for those above. Optional.
threshold
Value in data units according to which the colors from textcolors are
applied. If None (the default) uses the middle of the colormap as
separation. Optional.
**kwargs
All other arguments are forwarded to each call to `text` used to create
the text labels.
"""

if not isinstance(data, (list, np.ndarray)):
data = im.get_array()

# Normalize the threshold to the images color range.
if threshold is not None:
threshold = im.norm(threshold)
else:
threshold = im.norm(data.max()) / 2.

# Set default alignment to center, but allow it to be
# overwritten by textkw.
kw = dict(horizontalalignment="center",
verticalalignment="center")
kw.update(textkw)

# Get the formatter in case a string is supplied
if isinstance(valfmt, str):
valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)

# Loop over the data and create a `Text` for each "pixel".
# Change the text's color depending on the data.
texts = []
for i in range(data.shape[0]):
for j in range(data.shape[1]):
kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])
text = im.axes.text(j, i, valfmt(data[i, j], None), **kw)
texts.append(text)

return texts


def heatmap(var_freqs_list, title, aa_order, color, is_it_log):
'''
var_freqs_list
Expand All @@ -49,32 +144,70 @@ def heatmap(var_freqs_list, title, aa_order, color, is_it_log):
is the scale logarithmic? Accepts None.
'''

fig, ax = plt.subplots()
im = ax.imshow(var_freqs_list, cmap=color, interpolation='nearest', norm=is_it_log)
left, width = 0.1, 0.65
bottom, height = 0.1, 0.65
spacing = 0.01

rect_heatmap = [left, bottom, width, height]
rect_histx = [left, bottom + height + spacing, width, 0.2]
rect_histy = [left + width + spacing, bottom, 0.2, height]
rect_scale = [left - width, bottom, width, height]

#fig, ax = plt.subplots()
plt.figure(figsize=(10, 10), dpi=80)

ax_heatmap = plt.axes(rect_heatmap)
ax_heatmap.tick_params(direction='in', top=True, right=True)
ax_histy = plt.axes(rect_histy, xticklabels=[], yticklabels=[])
ax_histy.tick_params(direction='in', labelleft=False)
ax_histx = plt.axes(rect_histx, xticklabels=[], yticklabels=[])
ax_histx.tick_params(direction='in', labelbottom=False)
ax_scale = plt.axes(rect_scale)

im = ax_heatmap.imshow(var_freqs_list, cmap=color,
interpolation='nearest', norm=is_it_log)
texts = annotate_heatmap(im, valfmt="{x:.1f}")

# We want to show all ticks...
ax.set_xticks(np.arange(len(aa_order)))
ax.set_yticks(np.arange(len(aa_order)))
ax_heatmap.set_xticks(np.arange(len(aa_order)))
ax_heatmap.set_yticks(np.arange(len(aa_order)))
# ... and label them with the respective list entries
ax.set_xticklabels(aa_order)
ax.set_yticklabels(aa_order)
ax_heatmap.set_xticklabels(aa_order)
ax_heatmap.set_yticklabels(aa_order)
# Add boxes
ax.grid(which='minor', color='b', linestyle='-', linewidth=2)
ax.set_xlabel("Wildtype (from)")
ax.set_ylabel("Variant (to)")
ax.set_ylim(len(aa_order)-0.5, -0.5) # temp bug workaround: https://github.com/matplotlib/matplotlib/issues/14751
ax.set_xlim(len(aa_order)-0, -0.5) # temp bug workaround: https://github.com/matplotlib/matplotlib/issues/14751
ax.set_title(title)

#fig.tight_layout()
filename = f"images/{title}_{date}.png"
ax_heatmap.grid(which='minor', color='b', linestyle='-', linewidth=2)
ax_heatmap.set_xlabel("Wildtype (from)")
ax_heatmap.set_ylabel("Variant (to)")
# ax.set_ylim(len(aa_order)-0.5, -0.5) # temp bug workaround: https://github.com/matplotlib/matplotlib/issues/14751
# ax.set_xlim(len(aa_order)-0, -0.5) # temp bug workaround: https://github.com/matplotlib/matplotlib/issues/14751
ax_heatmap.set_title(title)

flagged_cells = impossible_cordinates(aa_order)
ax_heatmap.scatter(
flagged_cells[0], flagged_cells[1], marker="x", color="gainsboro", s=75)

# histogram on the right
ax_histx.bar(list(range(0, 20)), x_histogram(var_freqs_list),
orientation='vertical', color='grey')
#ax_histx.hist(x_histogram(var_freqs_list), 21, histtype='stepfilled', orientation='vertical', color='grey')

# histogram in the bottom
ax_histy.bar(list(range(0, 20)), y_histogram(var_freqs_list),
orientation='horizontal', color='grey')

# ax_histx.set_xlim(ax_heatmap.get_xlim())
# ax_histy.set_ylim(ax_heatmap.get_ylim())
# And now the colorbar
# --------------------------------------------------------
fig.colorbar(im)
ax_scale = plt.colorbar(im)

# fig.tight_layout()
filename = f"images/{title}_{date}.png"

for x_coordinate, x_amino_acid in enumerate(aa_order):
for y_coordinate, y_amino_acid in enumerate(aa_order):
plt.Circle((x_coordinate, y_coordinate), 0.5, color='black', fill=False)
plt.Circle((x_coordinate, y_coordinate),
0.5, color='black', fill=False)

plt.savefig(filename)
plt.clf()
Expand Down
83 changes: 67 additions & 16 deletions scripts/heatmap_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from django.contrib.admin.models import LogEntry
from django.contrib.auth.models import Group, Permission, User
from django.contrib.contenttypes.models import ContentType
from tmh_db.models import Binding_residue, Database_Metadata, Funfam, Funfam_residue, Funfamstatus, Go, Keyword, Pfam, Pfam_residue, Protein, Residue, Structural_residue, Structure, Subcellular_location, Tmh, Tmh_deltag, Tmh_hydrophobicity, Tmh_residue, Tmh_tmsoc, Uniref, Variant
from tmh_db.models import Database_Metadata, Funfam, Funfam_residue, Funfamstatus, Go, Keyword, Protein, Residue, Structural_residue, Structure, Subcellular_location, Tmh, Tmh_deltag, Tmh_hydrophobicity, Tmh_residue, Tmh_tmsoc, Uniref, Variant
# Shell Plus Django Imports
from django.db import transaction
from django.db.models import Avg, Case, Count, F, Max, Min, Prefetch, Q, Sum, When, Exists, OuterRef, Subquery
Expand All @@ -13,22 +13,29 @@
import numpy as np
import collections
from scripts.graphs import *
from scripts.populate_general_functions import *
from scripts.populate_general_functions import impossible_subs, aa_baezo_order, heatmap_array
from matplotlib.colors import LogNorm

aa_list_baezo_order=['K', 'R', 'E', 'D', 'Q', 'H', 'N', 'P', 'Y', 'W', 'C', 'M', 'T', 'S', 'G', 'V', 'F', 'A', 'I', 'L']

aa_list_baezo_order=aa_baezo_order()
impossible_subs_dict=impossible_subs()

def heatmap_normalised_by_heatmap(title, heatmap_one, heatmap_two):
new_heatmap = []

for row_number, row in enumerate(heatmap_one):
new_heatmap.append([])
for column_number, column in enumerate(heatmap_one):
new_heatmap[row_number].append('')
try:
value=heatmap_one[row_number][column_number]/heatmap_two[row_number][column_number]
except:
#print(aa_list_baezo_order[row_number], impossible_subs_dict[aa_list_baezo_order[column_number]])
if heatmap_two[row_number][column_number] == 0 or heatmap_one[row_number][column_number] == 0:
value=0
elif aa_list_baezo_order[row_number] in impossible_subs_dict[aa_list_baezo_order[column_number]]:
value=0
else:
value=int(heatmap_one[row_number][column_number])/int(heatmap_two[row_number][column_number])
new_heatmap[row_number][column_number]=value
heatmap(np.array(new_heatmap), title, aa_list_baezo_order, "d", None)
heatmap(np.array(new_heatmap), title, aa_list_baezo_order, "PuRd", None)

def remove_duplicate_variants(list_of_variants):
remove_duplicate_list_of_variants = set(list_of_variants)
Expand All @@ -38,15 +45,59 @@ def remove_duplicate_variants(list_of_variants):
truncate_list.append((variant[0], variant[1]))
return(truncate_list)

outside_flank_disease_variants = heatmap_array(remove_duplicate_variants(list(Variant.objects.filter(residue__tmh_residue__feature_location="Outside flank", residue__tmh_residue__evidence="TOPDB", disease_status='d', variant_source="ClinVar").values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id"))), aa_list_baezo_order)
outside_flank_gnomad_variants = heatmap_array(remove_duplicate_variants(list(Variant.objects.filter(residue__tmh_residue__feature_location="Outside flank", residue__tmh_residue__evidence="TOPDB", variant_source="gnomAD").exclude(aa_mut=F("aa_wt")).values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id"))), aa_list_baezo_order)
outside_disease_query=Variant.objects.filter(residue__flank_residue__feature_location="Outside flank", residue__flank_residue__flank__tmh__meta_tmh=True, disease_status='d', variant_source="ClinVar").values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id")
print(len(outside_disease_query), "disease variants in the outside flank")
outside_flank_disease_variants = heatmap_array(remove_duplicate_variants(list(outside_disease_query)), aa_list_baezo_order)

outside_gnomad_query=Variant.objects.filter(residue__flank_residue__feature_location="Outside flank", residue__flank_residue__flank__tmh__meta_tmh=True, variant_source="gnomAD3").exclude(aa_mut=F("aa_wt")).values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id")
print(len(outside_gnomad_query), "gnomad variants in the outside flank")
outside_flank_gnomad_variants = heatmap_array(remove_duplicate_variants(list(outside_gnomad_query)), aa_list_baezo_order)



inside_disease_query=Variant.objects.filter(residue__flank_residue__feature_location="Inside flank", residue__flank_residue__flank__tmh__meta_tmh=True, disease_status='d', variant_source="ClinVar").values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id")
print(len(inside_disease_query), "disease variants in the inside flank")
inside_flank_disease_variants = heatmap_array(remove_duplicate_variants(list(inside_disease_query)), aa_list_baezo_order)

inside_gnomad_query=Variant.objects.filter(residue__flank_residue__feature_location="Inside flank", residue__flank_residue__flank__tmh__meta_tmh=True, variant_source="gnomAD3").exclude(aa_mut=F("aa_wt")).values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id")
print(len(inside_gnomad_query), "gnomad variants in the inside flank")
inside_flank_gnomad_variants = heatmap_array(remove_duplicate_variants(list(inside_gnomad_query)), aa_list_baezo_order)



tmh_disease_query=Variant.objects.filter(residue__tmh_residue__feature_location="TMH", residue__tmh_residue__tmh_id__meta_tmh=True, disease_status='d', variant_source="ClinVar").values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id")
print(len(tmh_disease_query), "disease variants in the tmh")
tmh_disease_variants = heatmap_array(remove_duplicate_variants(list(tmh_disease_query)), aa_list_baezo_order)
heatmap(np.array(tmh_disease_variants), "test", aa_list_baezo_order, "PuRd", None)

tmh_gnomad_query=Variant.objects.filter(residue__tmh_residue__feature_location="TMH", residue__tmh_residue__tmh_id__meta_tmh=True, variant_source="gnomAD3").exclude(aa_mut=F("aa_wt")).values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id")
print(len(tmh_gnomad_query), "gnomad variants in the tmh")
tmh_gnomad_variants = heatmap_array(remove_duplicate_variants(list(tmh_gnomad_query)), aa_list_baezo_order)

# I came across some TMHs labelled incorrectly as helices. They mayhave snuck into the database, so to ensure they are not counted as variants, meta-tmh exclude is needed.
helix_disease_query=Variant.objects.filter(residue__non_tmh_helix_residue__nont_tmh_helix_id__helix_start__gte=0, disease_status='d', variant_source="ClinVar").exclude(residue__tmh_residue__tmh_id__meta_tmh=True).values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id")
print(len(helix_disease_query), "disease variants in the helix")
helix_disease_variants = heatmap_array(remove_duplicate_variants(list(helix_disease_query)), aa_list_baezo_order)

helix_gnomad_query=Variant.objects.filter(residue__non_tmh_helix_residue__nont_tmh_helix_id__helix_start__gte=0, variant_source="gnomAD3").exclude(aa_mut=F("aa_wt")).exclude(residue__tmh_residue__tmh_id__meta_tmh=True).values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id")
print(len(helix_gnomad_query), "gnomad variants in the helix")
helix_gnomad_variants = heatmap_array(remove_duplicate_variants(list(helix_gnomad_query)), aa_list_baezo_order)



sp_disease_query=Variant.objects.filter(residue__signal_residue__the_signal_peptide__signal_start__gte=0, disease_status='d', variant_source="ClinVar").values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id")
print(len(sp_disease_query), "disease variants in the signal peptides")
sp_disease_variants = heatmap_array(remove_duplicate_variants(list(sp_disease_query)), aa_list_baezo_order)

sp_gnomad_query=Variant.objects.filter(residue__signal_residue__the_signal_peptide__signal_start__gte=0, variant_source="gnomAD3").exclude(aa_mut=F("aa_wt")).values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id")
print(len(sp_gnomad_query), "gnomad variants in the signal peptides")
sp_gnomad_variants = heatmap_array(remove_duplicate_variants(list(sp_gnomad_query)), aa_list_baezo_order)


inside_flank_disease_variants = heatmap_array(remove_duplicate_variants(list(Variant.objects.filter(residue__tmh_residue__feature_location="Inside flank", residue__tmh_residue__evidence="TOPDB", disease_status='d', variant_source="ClinVar").values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id"))), aa_list_baezo_order)
inside_flank_gnomad_variants = heatmap_array(remove_duplicate_variants(list(Variant.objects.filter(residue__tmh_residue__feature_location="Inside flank", residue__tmh_residue__evidence="TOPDB", variant_source="gnomAD").exclude(aa_mut=F("aa_wt")).values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id"))), aa_list_baezo_order)

tmh_disease_variants = heatmap_array(remove_duplicate_variants(list(Variant.objects.filter(residue__tmh_residue__feature_location="TMH", residue__tmh_residue__evidence="TOPDB", disease_status='d', variant_source="ClinVar").values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id"))), aa_list_baezo_order)
tmh_gnomad_variants = heatmap_array(remove_duplicate_variants(list(Variant.objects.filter(residue__tmh_residue__feature_location="TMH", residue__tmh_residue__evidence="TOPDB", variant_source="gnomAD").exclude(aa_mut=F("aa_wt")).values_list("aa_wt", "aa_mut", "residue__sequence_position", "residue__protein__uniprot_id"))), aa_list_baezo_order)

heatmap_normalised_by_heatmap("ClinVar normalised by gnomad TOPDB Outside flank", outside_flank_disease_variants, outside_flank_gnomad_variants)
heatmap_normalised_by_heatmap("ClinVar normalised by gnomad TOPDB Inside flank", inside_flank_disease_variants, inside_flank_gnomad_variants)
heatmap_normalised_by_heatmap("ClinVar normalised by gnomad TOPDB TMHnormalised by gnomad", tmh_disease_variants, tmh_gnomad_variants)
heatmap_normalised_by_heatmap("ClinVar normalised by gnomad v3 meta-tmh Outside flank", outside_flank_disease_variants, outside_flank_gnomad_variants)
heatmap_normalised_by_heatmap("ClinVar normalised by gnomad v3 meta-tmh Inside flank", inside_flank_disease_variants, inside_flank_gnomad_variants)
heatmap_normalised_by_heatmap("ClinVar normalised by gnomad v3 meta-tmh TMH", tmh_disease_variants, tmh_gnomad_variants)
heatmap_normalised_by_heatmap("ClinVar normalised by gnomad v3 non-TMH Helix", helix_disease_variants, helix_gnomad_variants)
heatmap_normalised_by_heatmap("ClinVar normalised by gnomad v3 Signal Peptides", sp_disease_variants, sp_gnomad_variants)
4 changes: 2 additions & 2 deletions scripts/heatmaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,9 +442,9 @@
("V", "V"): 98485,
}

aa_list_baezo_order = ['K', 'R', 'E', 'D', 'Q', 'H', 'N', 'P', 'Y', 'W', 'C', 'M', 'T', 'S', 'G', 'V', 'F', 'A', 'I', 'L']
aa_list_baezo_order = ['K', 'R', 'E', 'D', 'Q', 'H', 'N', 'P', 'Y', 'W', 'C', 'M', 'T', 'S', 'G', 'V', 'F', 'A', 'I', 'L']

aa_list_alpha = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
aa_list_alpha = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# prefetch the residues with variants
Residue.objects.all().prefetch_related("variant")
Expand Down
Loading

0 comments on commit 3130348

Please sign in to comment.