-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmetautils.py
executable file
·452 lines (401 loc) · 23.8 KB
/
metautils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
import pandas
import os
import sys
import json
import cloudstorage as gcs
#different readlengths for trimmomatic outputs
READLENGTHS = [36,75]
#from utils import metautils
#metautils.srpMeta('SRP012098')
#study_accession experiment_accession sample_accession run_accession experiment_title experiment_attribute taxon_id library_selection library_layout library_strategy library_source library_name bases spots adapter_spec avg_read_length
# SRP141397 SRX3980107 SRS3205258 SRR7049033 S78_shotgun 0 RANDOM PAIRED - WGS METAGENOMIC S78_shotgun 229716627 1009555 227.54245880610765
#study_accession experiment_accession experiment_title experiment_desc organism_taxid organism_name library_name library_strategy library_source library_selection library_layout sample_accession sample_title instrument instrument_model instrument_model_desc total_spots total_size run_accession run_total_spots run_total_bases
#SRP012098 SRX136624 GSM908336: HepG2_IFN_Input; Homo sapiens; RNA-Seq GSM908336: HepG2_IFN_Input; Homo sapiens; RNA-Seq 9606 Homo sapiens GSM908336: HepG2_IFN_Input RNA-Seq TRANSCRIPTOMIC cDNA SINGLE SRS308332 Illumina Genome Analyzer IIx Illumina Genome Analyzer IIx ILLUMINA 30351576 610197365 SRR456549 30351576 1092656736
#read in an SPR metadata
#python -c "from utils import metautils;srp=metautils.srpMeta('SRP012098');print(srp.getFilesByTreatment('IFN'));"
class srpMeta():
def __init__(self,SRP,gstype="gs"):
self.srp=SRP
self.st = pandas.read_csv("metadata/"+SRP+".metadata",sep="\t")
#GSM908336: HepG2_IFN_Input; Homo sapiens; RNA-Seq
#self.st[['GSM','celltype','treatment','casecontrol']] = pandas.read_csv("metadata/"+SRP+".metadata",sep="\t")["library_name"].str.replace(' ','',regex=False).str.split('[_:]',expand=True)
if SRP=='SRP012098':
self.st[['GSM','celltype','treatment','casecontrol']] = pandas.read_csv("metadata/"+SRP+".metadata",sep="\t")["experiment_title"].str.replace(' ','',regex=False).str.replace(';.*','',regex=True).str.split('[_:]',expand=True)
elif SRP=='SRP012096':
self.st[['GSM','treatment','casecontrol']] = pandas.read_csv("metadata/"+SRP+".metadata",sep="\t")["experiment_title"].str.replace(' ','',regex=False).str.replace(';.*','',regex=True).str.replace('[0-9]$','',regex=True).str.split('[_:]',expand=True)
self.st['celltype']='HepG2'
elif SRP=='SRP012099':
self.st[['GSM','casecontrol']] = pandas.read_csv("metadata/"+SRP+".metadata",sep="\t")["experiment_title"].str.replace(' ','',regex=False).str.replace(';.*','',regex=True).str.replace('[0-9]$','',regex=True).str.split('[_:]',expand=True)
self.st['celltype']='HepG2'
self.st['treatment']='None'
self.st['run']=self.st['celltype']+'_'+self.st['treatment']+'_'+self.st['casecontrol']
self.st.columns = self.st.columns.str.replace('.', '',regex=False)
self.st.columns = self.st.columns.str.replace(' ', '',regex=False)
self.process(gstype)
#['NA06984.1.M_111124_4_1.fastq.gz', 'NA06984.1.M_111124_4_2.fastq.gz',
def process(self,gstype='gs',compressed=True):
#create the gs specific paths
#gs://truwl-quertermous/SRR7058289/102901.1.fastq.gz
#gs://truwl-quertermous/SRR7058289/102901.2.fastq.gz
#from utils.metautils import srpMeta
#srp=srpMeta('SRP142360').process()
if gstype=='https':
prefix="https://storage.googleapis.com/truwl-dominissini"
elif gstype=='gs':
prefix="gs://truwl-dominissini"
elif gstype=='local':
prefix=self.srp
else:
raise ValueError("need a gstype of https or gs or local")
if compressed:
suffix='fastq.gz'
else:
suffix='fastq'
self.st['sequence_file'] = self.st.apply(lambda x: "{0}/{1}/{2}_{3}_{4}.{5}".format(prefix,x['run_accession'],x['celltype'],x['treatment'],x['casecontrol'],suffix), axis=1)
def getSRRs(self):
return(self.st['run_accession'].tolist())
def getAllFiles(self,flatten=True,compress=True,gstype='gs'):
"""
flatten - all files go in top directory vs SRP/EXP/SRR
compress - .gz suffix
"""
files=[]
filesofinterest=self.st.copy()
files=list(filesofinterest['sequence_file'].astype(str))
return(files)
def getTreatments(self):
return(list(set(self.st['treatment'].astype(str))))
def getFilesByTreatment(self,treatment,gstype='local',flatten=True,compress=True):
"""
flatten - all files go in top directory vs SRP/EXP/SRR
compress - .gz suffix
"""
self.process(gstype=gstype)
files=[]
filesofinterest=self.st[self.st['treatment']==treatment].copy()
types=list(filesofinterest['casecontrol'].astype(str))
files=list(filesofinterest['sequence_file'].astype(str))
res = dict(zip(types, files))
return(res)
def getRunsByTreatment(self,treatment,gstype='local',flatten=True,compress=True):
"""
flatten - all files go in top directory vs SRP/EXP/SRR
compress - .gz suffix
"""
self.process(gstype=gstype)
files=[]
filesofinterest=self.st[self.st['treatment']==treatment].copy()
types=list(filesofinterest['casecontrol'].astype(str))
files=list(filesofinterest['run'].astype(str))
res = dict(zip(types, files))
return(res)
def getControlRun(self,treatment):
run=self.st.loc[(self.st['treatment']==treatment) & (self.st['casecontrol']=='Input'),'run']
return(run.to_string(index=False))
def getReplicatesofMate(self,strategy,cell,mate):
pair = "truwl_pair{}".format(mate)
foi=self.st.loc[(self.st['library_strategy']==strategy) & (self.st['library_source']==cell), pair]
files = list(foi.astype(str))
return(files)
def printmeripseqManifest(self,gstype='local',compress=False):
"""
Manifest for https://github.com/kingzhuky/meripseqpipe
"""
#Sample_ID,input_FileName,ip_FileName,Group
#HS,HepG2_HS_Input,HepG2_HS_IP,HS
#IFN,HepG2_IFN_Input,HepG2_IFN_IP,IFN
#HGF,HepG2_HGF_Input,HepG2_HGF_IP,HGF
#UV,HepG2_UV_Input,HepG2_UV_IP,UV
print("Sample_ID,input_FileName,ip_FileName,Group")
for treatment in self.getTreatments():
tdict=self.getRunsByTreatment(treatment,gstype=gstype,compress=compress)
print("{},{},{},{}".format(treatment,tdict['Input'],tdict['IP'],treatment))
def printmeripseqConfig(self,gstype='local',compress=False):
for treatment in self.getTreatments():
tdict=self.getRunsByTreatment(treatment,gstype=gstype,compress=compress)
print("['{}.input',['{}']],".format(treatment,tdict['Input']))
print("['{}.ip',['{}']],".format(treatment,tdict['IP']))
def printchipnfManifest(self):
"""
Mainfest for https://github.com/guigolab/chip-nf
"""
# sample1 sample1_run1 /path/to/sample1_run1.fastq.gz - H3
# sample1 sample1_run2 /path/to/sample1_run2.fastq.gz - H3
# sample1 sample1_run3 /path/to/sample1_run3.fastq.gz - H3
# sample1 sample1_run4 /path/to/sample1_run4.fastq.gz - H3
# sample2 sample2_run1 /path/to/sample2_run1.fastq.gz control1 H3K4me2
# control1 control1_run1 /path/to/control1_run1.fastq.gz control1 input
self.process(gstype='local')
self.st['controlrun']=self.st.apply(lambda x: self.getControlRun(x['treatment']), axis=1)
#self.st.apply(lambda x: "{0}\t{1}\t{2}\t{3}\t{4}".format(x['study_accession'],x['run'],x['sequence_file'],self.getControlRun(x['treatment']),x['casecontrol']), axis=1)
print(self.st[['run_accession','run','sequence_file','controlrun','casecontrol']].to_csv(header=False,index=False,sep="\t"))
def printnfcoreManifest(self):
"""
Manifest for https://github.com/nf-core/chipseq
"""
#group,replicate,fastq_1,fastq_2,antibody,control
#WT_BCATENIN_IP,1,BLA203A1_S27_L006_R1_001.fastq.gz,,BCATENIN,WT_INPUT
#WT_BCATENIN_IP,2,BLA203A25_S16_L002_R1_001.fastq.gz,,BCATENIN,WT_INPUT
#WT_BCATENIN_IP,3,BLA203A49_S40_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT
#WT_INPUT,1,BLA203A6_S32_L006_R1_001.fastq.gz,,,
#WT_INPUT,2,BLA203A30_S21_L002_R1_001.fastq.gz,,,
#WT_INPUT,3,BLA203A31_S21_L003_R1_001.fastq.gz,,,
self.process(gstype='local')
self.st['controlrun']=self.st.apply(lambda x: self.getControlRun(x['treatment']) if x['casecontrol']!='Input' else '', axis=1)
self.st['replicate']=1
self.st['empty']=''
print("group,replicate,fastq_1,fastq_2,antibody,control")
print(self.st[['run','replicate','sequence_file','empty','casecontrol','controlrun']].to_csv(header=False,index=False))
def getATACManifest(self):
cells = list(set(self.st.loc[self.st['library_strategy']=='ATAC-seq', 'library_source'].astype(int)))
manifest = {}
manifest["atac.pipeline_type"] = "atac"
manifest["atac.genome_tsv"] = "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v3/hg38.tsv"
manifest["atac.paired_end"] = "true"
manifest["atac.auto_detect_adapter"] = "true"
manifest["atac.enable_xcor"] = "true"
manifest["atac.title"] = "SRP142360ATAC"
manifest["atac.description"] = "ATAC-seq on human coronary artery smooth muscle cells (HCASMC)"
for i in range(len(cells)):
for pair in [1,2]:
reps=self.getReplicatesofMate('ATAC-seq',cells[i],pair)
manifest["atac.fastqs_rep{}_R{}".format(i+1,pair)] = reps
print(json.dumps(manifest, indent=4))
# {
# "atac.pipeline_type" : "atac",
# "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v3/hg38.tsv",
# "atac.fastqs_rep1_R1" : [
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz"
# ],
# "atac.fastqs_rep1_R2" : [
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz"
# ],
# "atac.fastqs_rep2_R1" : [
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz"
# ],
# "atac.fastqs_rep2_R2" : [
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz",
# "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz"
# ],
# "atac.paired_end" : true,
# "atac.auto_detect_adapter" : true,
# "atac.enable_xcor" : true,
# "atac.title" : "SRP142360ATAC",
# "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation"
# }
def getRNAManifest(self):
cells = list(set(self.st.loc[self.st['library_strategy']=='RNA-Seq', 'library_source'].astype(int)))
manifest = {}
manifest["rna.pipeline_type"] = "atac"
manifest["rna.genome_tsv"] = "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v3/hg38.tsv"
manifest["rna.paired_end"] = "true"
manifest["rna.auto_detect_adapter"] = "true"
manifest["rna.enable_xcor"] = "true"
manifest["rna.title"] = "SRP142360RNA"
manifest["rna.description"] = "RNA-seq on human coronary artery smooth muscle cells (HCASMC)"
for i in range(len(cells)):
for pair in [1,2]:
reps=self.getReplicatesofMate('RNA-Seq',cells[i],pair)
manifest["rna.fastqs_rep{}_R{}".format(i+1,pair)] = reps
print(json.dumps(manifest, indent=4))
def getAllFiles(self):
getFilesFromRunList(self.populationRuns('ALL'))
#>>> metautils.calculateExpected('ALL')
#Expecting 413 fastq and 253 bams
def calculateExpected(self,population):
fastq = self.getFilesFromRunList(self.populationRuns(population))
bams = self.populationRuns(population)
print("Expecting {0} fastq and {1} bams".format(len(fastq),len(bams)))
#GBR FIN ALL YRI TSI
#>>> metautils.populationRuns('ALL')
#['NA12286.2.MI_120126_5', 'NA10851.4.M_120208_1',
def populationRuns(self,population):
return(self.getSRRs())
#Check to see if paired-end sequencing was used in each sample so appropriate stAR parameters are called
def isPaired(self,run):
if self.st.loc[self.st['Assay Name']==run]['Comment[LIBRARY_LAYOUT]'].all()=='PAIRED':
return(True)
#Either run star with paired or unpaired options
def starProgram(self,run):
if self.isPaired(run):
return("star_paired.sh")
else:
return("star_unpaired.sh")
#Run trimmomatic bash script with paired or unpaired options
def trimProgram(self,run):
if self.isPaired(run):
return("trimmomatic_paired.sh")
else:
return("trimmomatic_unpaired.sh")
#pairedOrSinglefastqInput('NA06984.1.M_111124_4')
#['NA06984.1.M_111124_4_1.fastq.gz', 'NA06984.1.M_111124_4_2.fastq.gz']
#pairedOrSinglefastqInput('NA06985.1.MI_120104_3')
#['NA06985.1.MI_120104_3_1.fastq.gz']
#set file names for trimmomatic based on whether a trimmed input or output is expected
def pairedOrSinglefastqInput(self,run,trimmed=False,readlength=None):
if trimmed:
extName=".trimmed_{0}.fastq.gz".format(readlength)
else:
extName=".fastq.gz"
if self.isPaired(run):
return([run+"_1"+extName,run+"_2"+extName])
else:
return([run+"_1"+extName])
def getPathFromName(self,name):
return(ALL_LOOKUP[name])
#ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR188/ERR188404/ERR188404_1.fastq.gz
def getType(self,runName):
return(self.getProperty(runName, 'Platform'))
def getProperty(self, runName, column):
stseries = self.st.loc[self.st['Run'] == runName, column]
if len(self.stseries) == 1:
return(self.stseries.to_self.string(index=False)) # pandas is ridiculous
elif len(self.stseries) > 1:
raise ValueError("Not expecting to match multiple run names, just 1")
else:
raise ValueError("Can't find that run {0}".format(runName))
def getMemory(self, runName):
return(int(getProperty(runName, 'size_MB')))
#Up to 128 letters (uppercase and lowercase), numbers, hyphens, and underscores are allowed
def cleanJobname(self, name):
return(name.replace('.','_'))
def getECS(self, runName, units, program):
if program == 'minimap':
mb = 32000
elif program == 'star':
superIntensive = ['NA12716.7.M_120219_6', 'NA12775.7.M_120219_5',
'NA12814.7.M_120219_3', 'NA11831.7.M_120219_2',
'NA11994.2.M_111216_7', 'NA11893.7.M_120219_3']
if runName in superIntensive:
mb = 64000
else:
mb = 48000
elif program == 'IsoModule':
mb = 192000
elif program == 'samtoolsindex':
mb = 16000
elif program == 'samtoolsmerge':
mb = 16000
elif program == 'samtoolssubsample':
mb = 16000
elif program == 'fastx':
mb = 8000
elif program == 'trimmomatic':
mb = 8000
elif program == 'rmats':
mb = 16000
else:
raise ValueError
if units == 'bytes':
return(1048576*mb)
elif units == 'mb':
return(mb)
# 2 # number of sample
# 3 # number of replicates in sample #1
# sam1_rep1.bam
# sam1_rep2.bam
# sam1_rep3.bam
# 3 # number of replicates in sample #2
# sam2_rep1.bam
# sam2_rep2.bam
# sam2_rep3.bam
#Used for a two way comparison between various samples
def twoSampleComparisonManifest(self, biosamp1, biosamp2, filename):
text_file = open(filename, "w")
text_file.write("2\n") # two-way comparison
for biosamp in [biosamp1, biosamp2]:
run = getRunBamsFromBioSample(biosamp, include_bai=False)
text_file.write("{0}\n".format(len(run)))
for replicate in run:
text_file.write("{0}\n".format(replicate))
#Used for a two way comparison between various samples
def twoTreatmentComparisonManifest(self, treatment1, treatment2, filename):
text_file = open(filename, "w")
text_file.write("2\n") # two-way comparison
for treatment in [treatment1, treatment2]:
run = getRunBamsFromTreatment(treatment, include_bai=False)
text_file.write("{0}\n".format(len(run)))
for replicate in run:
text_file.write("{0}\n".format(replicate))
# "panorama-clk-repro/SRP091981/
#>>> metautils.getRunBamsFromBioSample('NA12778')
#['NA12778.1.M_111124_4.Aligned.sortedByCoord.out.bam', 'NA12778.1.M_111124_4.Aligned.sortedByCoord.out.bam.bai', 'NA12778.1.M_111124_4.Aligned.sortedByCoord.out.bam', 'NA12778.1.M_111124_4.Aligned.sortedByCoord.out.bam.bai', 'NA12778.1.MI_120104_3.Aligned.sortedByCoord.out.bam', 'NA12778.1.MI_120104_3.Aligned.sortedByCoord.out.bam.bai']
#Get a List of bams originating from the same biological source with muleiples removed (useful for the merge step, where multiple runs must be collapsed down to one)
def getRunBamsFromBioSample(self, biosamp, readlength, include_s3=None, include_bai=True):
# if include_bai:
# exts = ['bam', 'bam.bai']
# else:
exts = ['bam']
runs = self.getRunsFromBioSample(biosamp)
if include_s3:
bams = ["{0}/{1}.{3}_trimmed.Aligned.sortedByCoord.out.{2}".format(
include_s3, replicate, ext, readlength) for replicate in runs for ext in exts]
else:
bams = ["{0}.trimmed_{2}.Aligned.sortedByCoord.out.{1}".format(
replicate, ext, readlength) for replicate in runs for ext in exts]
return(bams)
def getRunBamsFromTreatment(self, treatment, readlength, include_s3=None, include_bai=True):
# if include_bai:
# exts = ['bam', 'bam.bai']
# else:
exts = ['bam']
runs = self.getRunsFromTreatment(treatment)
if include_s3:
bams = ["{0}/{1}.{3}_trimmed.Aligned.sortedByCoord.out.{2}".format(
include_s3, replicate, ext, readlength) for replicate in runs for ext in exts]
else:
bams = ["{0}.trimmed_{2}.Aligned.sortedByCoord.out.{1}".format(
replicate, ext, readlength) for replicate in runs for ext in exts]
return(bams)
#>>> metautils.getRunsFromBioSample('NA12778')
#['NA12778.1.M_111124_4', 'NA12778.1.M_111124_4', 'NA12778.1.MI_120104_3']
#Get a List of runs from the same source with muleiples removed (useful for the merge step, where multiple runs must be collapsed down to one)
def getRunsFromBioSample(self, biosample,include_bai = True,allowSingle=True):
return(List(set(self.st.loc[(self.st['Source Name'] == biosample) & (self.st['Comment[Quality Control passed]'] == 1)]['Assay Name'].toList())))
def getRunsFromTreatment(self, treatment,include_bai = True,allowSingle=True):
return(List(set(self.st.loc[(self.st['treatment'] == treatment) & (self.st['Comment[Quality Control passed]'] == 1)]['Assay Name'].toList())))
#metautils.populationBiosamples('ALL')
#['NA12778', 'NA12045', 'NA12144', ...
#Returns either all runs that pass the QC check for a specific population or all populations depending on arguments
def populationBiosamples(self,population):
if(population=='ALL'):
return(list(set(self.st.loc[(self.st['Comment[Quality Control passed]'] == 1)]['Source Name'])))
else:
return(list(set(self.st.loc[(self.st['Characteristics[population]'] == population) & (self.st['Comment[Quality Control passed]'] == 1)]['Source Name'])))
#This returns all sequencing runs over 70 nt (longruns) looks through the population being considered appends them to the sample List using the mergedbam file naming convention, and then appends all runs from that population with a 36nt trimmed suffix
def getListOfLengthOutputs(self,population):
x = 0
biosamps = self.populationBiosamples(population)
biosamp_lengthList = []
longruns = list(set(self.st.loc[self.st['Comment[SEQUENCE_LENGTH]'].astype(int)<70]['Source Name'].toList()))
for item in longruns:
if (item in biosamps) and (item != 'NA07000'):
biosamp_lengthList.append("trimmed_75nt/"+item +"/"+item + ".rmats")
for item in biosamps:
if item != 'NA07000':
biosamp_lengthList.append("trimmed_75nt/"+item +"/"+item + ".rmats")
return(biosamp_lengthList)
def GCSExists(gcs_file):
'''
True if file exists; pass complete /bucket/file
'''
try:
file = gcs.open(gcs_file,'r')
file.close()
status = True
except:
status = False
return status
if __name__== "__main__":
srpMeta(sys.argv[1]).get16SFiles()