-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathASE.py
117 lines (108 loc) · 4.73 KB
/
ASE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""Update 02/18/2020
Modified for filtering genes with bad SNP-position,
and count reads based on individual SNP sites. """
"""This script is to filter bads genes for each genotype
according to the read count results. I filter out bad genes
based on 2 conditions:
1). mis-mapped reads which greater than 5 will be list as candidate bad genes;
2). mis-mapped reads account for some proportion (cutoff) will at last list as bad genes. """
import pandas as pd
import argparse
import os
import pysam
def args_parser():
'''parser the arguments from terminal command'''
parser=argparse.ArgumentParser(prog = "PROG", formatter_class = argparse.RawDescriptionHelpFormatter, description="Perform allele-specific read count on gene-basis. \n\
Usage: python ASE.py ")
parser.add_argument("-v", "--vcf", help = "VCF file with only variants information on exon regions. ")
parser.add_argument("-gtf", "--gtf", help = "gene annotation file in gtf format. ")
parser.add_argument("-bam", "--bam", help = "the bam file for read counts. ")
parser.add_argument("-O", "--output", help = "output name ")
args = parser.parse_args()
return args
def gene_info(args):
'''For each gene id, get the gene oritation from gtf file. Return as a dictionary. '''
ginfo = {}
gtf_handle = pysam.TabixFile(args.gtf, parser = pysam.asGTF())
### only consider genes with variants
for gene in gtf_handle.fetch():
if (gene.feature == "gene") and (gene.gene_biotype in ["pseudogene", "protein_coding"]):
gtig = gene.contig
gid = gene.gene_id
gstrand = gene.strand
gstart = gene.start
gend = gene.end
ginfo[gid] = [gstrand, gtig, gstart, gend]
return ginfo
def read_strand(read, gene_id, strand):
if strand == "+":
if ((read.is_read1 and read.is_reverse) or (read.is_read2 and read.is_reverse == False)):
return True
else:
return False
elif strand == "-":
if ((read.is_read1 and read.is_reverse == False) or (read.is_read2 and read.is_reverse)):
return True
else:
return False
else:
return False
def allele_gene_count(args, gid_inf):
'''Read counts based on SNP sites. '''
### open vcf, bam and gtf
vcf_handle = pysam.TabixFile(args.vcf, parser = pysam.asVCF())
bam_handle = pysam.AlignmentFile(args.bam, "rb")
gtf_handle = pysam.TabixFile(args.gtf, parser = pysam.asGTF())
### output file
filew = open(args.output + ".txt", "w")
filew.write("gene\ttotal_count\tref_count\talt_count\tnon_count\n")
### get gene strand dictionary given gtf file
ginfo = gene_info(args)
###
for gg in ginfo: # gid:[gstrand,gtig, gstart,gend]
gstrand = ginfo[gg][0]
gtig = ginfo[gg][1]
gstart = ginfo[gg][2]
gend = ginfo[gg][3]
total_read = set()
ref_read = set()
alt_read = set()
non_read = set()
for var in vcf_handle.fetch(gtig, gstart, gend):
vpos = var.pos
ref = var.ref
alt = var.alt
### based on SNP positions
for read_column in bam_handle.pileup(gtig, vpos-1, vpos):
if read_column.reference_pos == vpos-1: # important, cannot delete
for bam_read in read_column.pileups:
read_name = bam_read.alignment.query_name
# uniquely mapped, mapped to exon, read strand
if (("NH", 1) in bam_read.alignment.tags and bam_read.query_position != None and read_strand(bam_read.alignment, gg, gstrand)):
###
total_read.add(read_name)
###
allele = bam_read.alignment.query_sequence[bam_read.query_position]
# print("chromosome " + chr + " position " + str(p) + " ref " + ref + " alt " + alt + " allele " + allele)
if allele == ref:
ref_read.add(read_name)
elif allele == alt:
alt_read.add(read_name)
else:
non_read.add(read_name)
###
total_read_count = len(total_read)
ref_read_count = len(ref_read)
alt_read_count = len(alt_read)
non_read_count = len(non_read)
filew.write(f"{gg}\t{total_read_count}\t{ref_read_count}\t {alt_read_count}\t{non_read_count}\n")
filew.close()
def main():
args = args_parser()
gid_inf = gene_info(args)
allele_gene_count(args, gid_inf)
################
#### Run it ####
################
if __name__ == "__main__":
main()