-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathlibis
executable file
·222 lines (194 loc) · 9.47 KB
/
libis
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/env python3
import time
import argparse
import os
import sys
import xml.dom.minidom
from LiBis.computeProcess import computeProcess
from LiBis.utils import *
libis_version = '0.1.6'
def bam_file_name_format(s):
return s.strip().split(',')
def fastq_file_name_format(s):
#split samples using blank. split double end files using comma(,);
ans=[]
for f in s:
if ',' in f:
ff=f.strip().split(',')
else:
ff = [f]
ans.append(ff)
return ans
def getxmlcontent(dom,x):
temp = dom.getElementsByTagName(x)
if len(temp)==0:
return None
else:
return temp[0].firstChild.data
def text_process(filename):
'''
All process for reading config text in XML format.
This only works for the mode which use file to input parameters
'''
dom = xml.dom.minidom.parse(filename)
#root = dom.documentElement
dic={}
tagnames = ['fastq','label','clip','window','step','ref','process','genome','qc','trim','binsize','filter','bam']
tagcontent = list(map(lambda x:getxmlcontent(dom,x),tagnames))
for i in range(len(tagnames)):
name = tagnames[i]
content = tagcontent[i]
if name=='fastq':
dic[name]=fastq_file_name_format(content.strip().split())
'''
Split fastq file name using comma(,)
Format will be like S1_1,S1_2 S2 S3 S4_1,S4_2
feed fastq_file_name_format() a name list like above(use split to eliminate the blank)
it will return [['S1_1','S1_2'],['S2'],['S3'],['S4_1','S4_2']]
'''
continue
if name=='bam':
dic[name]=bam_file_name_format(content)
if name=='label':
dic[name]=content.strip().split()
continue
if name=='ref':
dic[name]=content
continue
if name=='genome':
dic[name]=content
dic[name]=int(content)
return dic
def inputorN(v):
if v:
return v
else:
return None
def valid(param):
'''
check whether parameters are valid.
1. All fastq file exists.
2. Length of labels equals the number of samples.
3. Reference genome exists.
4. Given bam file length equals the number of samples. Even though some of the samples doesn't have given bam files, the number of commas still should be enough.
5. All given bam file exists.
6. RESULT folder should be totally new.
'''
name = param['name']
#for n in name:
# for nn in n:
# if not os.path.exists(nn):
unexist_filename, exist_status = exist(name)
if not exist_status:
raise Exception('Fastq file '+ unexist_filename +' not exist!')
if param['label']==None:
param['label'] = [str(i) for i in range(len(param['name']))]
#raise Exception('Label is needed! Please redefine -l parameter.')
if len(param['label'])!=len(param['name']):
raise Exception('Number of samples and number of labels should be the same!')
if len(set(param['label']))!=len(param['label']):
raise Exception('Repeat labels!')
if param['ref']==None:
raise Exception('Reference is required by -r!')
if not os.path.exists(param['ref']):
raise Exception('Reference '+param['ref']+' not exist!')
bamfile_list = param['bamfiles']
if len(bamfile_list)!=len(param['label']):
raise Exception("Number of samples and number of given bam files should be the sam! Please leave the area blank if you don't need it. For example: a.bam,b.bam,,,,f.bam")
for name in bamfile_list:
if name!='':
if not name.endswith('.bam'):
raise Exception("Given file "+name+' is not a bam file! Please make sure bam files end with ".bam"!')
if not os.path.exists(name):
raise Exception("Given bam file "+ unexist_filename +' not exist!')
# if param['clip'] and param['trim']:
# print("Don't need to trim for clipping mode! Set -t to 0")
#if param['plot']:
if os.path.exists('RESULT'):
if param['nocheck']:
print('"RESULT" exists! --nocheck enabled so continue running.')
else:
raise Exception("RESULT file exists! Please delete RESULT or change name")
else:
os.mkdir("RESULT")
def input_args():
parser = argparse.ArgumentParser()
parser.add_argument('-f','--file',type=int, help=r'Enter a number, 0 means using parameter to set up, 1 means using text file to set up',default=0)
parser.add_argument('-sf','--settingfile',help='Setting txt file name. Ignore if -f is 0.')
parser.add_argument('-n','--name',nargs="*",help=r'Required. Fastq file name. ')
parser.add_argument('-c','--clip',help=r'Clip mode. 0 means close. 1 means open. default: 1',type=int,default=1)
parser.add_argument('-l','--label',nargs="*",help=r'Labels for samples')
parser.add_argument('-g','--genome',help='Genome the reference belong to.(Use for plotting) hg18/hg19/mm10/mm9 and so on. Plotting script will not avaliable if leave it blank. default=hg38.',default='hg38')
parser.add_argument('-w','--window',type=int,help=r'Window length for clipping mode, default=40',default=40)
parser.add_argument('-s','--step',type=int,help=r'Step size for clipping mode, default=5.',default=5)
parser.add_argument('-p','--process',type=int,help=r'Process using for one pipeline. Normally bsmap will cost 8 cpu number. So total will be 8p.',default=8)
parser.add_argument('-r','--ref',help=r'Required. Reference genome file name.')
parser.add_argument('-qc','--QualityControl',help=r'Do quality control',action="store_true")
parser.add_argument('-t','--trim',help=r"Do trimming.",action="store_true")
parser.add_argument('-b','--binsize',type=int,help="Plot setting. Set the bin size for averaging methylation ratio among samples, default=1000k",default=1000000,required=False)
parser.add_argument('-ft','--filter',type=int,help="Minimal length for recombined reads, default=46.",default=46)
parser.add_argument('-bam','--bam',help="Processed bam file for the first step. If bam files are offered here, the first step of bsmap will be skipped. BAM files can only be generated by BSMAP. Different files should be seperated by ','. If there's no bam file for part of the samples, leave the space blank. For example: a.bam,b.bam,,,,f.bam", default='', required=False)
parser.add_argument('-mcall','--mcall',help="Run mcall for mapped bams or not", action="store_true")
parser.add_argument('-plot','--plot',help="Generate the final report or not", action="store_true")
parser.add_argument('-nc','--nocheck',help="Skip the checking step for result folders. Using this parameter may rewrite the previous results.", action="store_true")
parser.add_argument('-module','--MOABSmodule',help='Run LiBis as MOABS module. Please only apply this when integrating LiBis to MOABS.',action='store_true')
parser.add_argument('-mm','--multiplemapping',help="Apply multiple mapping after first around bsmap", action="store_true")
parser.add_argument('-pnf','--pairendfilter',help="Apply pairend filter for all LiBis rescued reads. Only keep reads have uniquely mapped pair by BSMAP or LiBis.", action="store_true")
# parser.add_argument('-rc','--reportclippedregion',help='generate two additional fastq to store clipped region, one from the begining of the reads and the other one from the end.',action='store_true')
parser.add_argument('-bu','--bamu',help='Given processed bam file contains unmapped reads. Only use with -bam is open.',action='store_true')
parser.add_argument('-gz','--gzip',help='Temporary files are in gz format.',action='store_true')
parser.add_argument('-fullmode','--fullmode',help="Keep all temp files.",action='store_true')
parser.add_argument('-v','--version',help="Version information",action='store_true')
args = parser.parse_args()
#print(args.name)
if args.version:
print(libis_version)
sys.exit()
if args.file==1:
param=text_process(args.settingfile)
else:
default_labels = []
param={'name':(fastq_file_name_format(args.name) or None),
'clip':args.clip,
'label':(args.label or None),
'window':int(inputorN(args.window)),
'step':int(inputorN(args.step)),
'threads':str(args.process),
'ref':(args.ref or None),
'qc':int(args.QualityControl),
'trim':int(args.trim),
'genome':args.genome,
'bin':args.binsize,
'filter_len':int(args.filter),
'cleanmode':not args.fullmode,
'mcall': args.mcall,
'nocheck': args.nocheck,
'plot': args.plot,
'moabs': args.MOABSmodule,
'multiple':args.multiplemapping,
'pairend':args.pairendfilter,
'report_clip':0,
'gz':args.gzip,
'bamu': args.bamu
}
if args.bam:
param['bamfiles'] = bam_file_name_format(args.bam)
else:
param['bamfiles'] = ['']*len(param['name'])
if param['moabs']:
param['qc']=0
param['trim']=0
param['mcall']=0
param['plot']=0
#param['multiple'] = True
#param['pairend'] = True
valid(param)
return param
def main():
computeProcess(input_args())
if __name__=="__main__":
main()
#print(time.time())
#param=input_args()
#computeProcess(param)
#print(time.time())