annotate mergeJobs.py @ 10:7d10b55965c9 draft default tip

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Wed, 16 May 2018 17:02:51 -0400
parents 6e75a84e9338
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
1 #!/usr/bin/env python
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
2 import os
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
3 import argparse
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
4
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
5 def getListOfFiles(inDir,pattern):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
6 return [inDir+n for n in os.listdir(inDir) if (pattern in n and os.path.getsize(inDir+n))]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
7
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
8 TEMP_IND = 0
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
9 def stripVCF_header(fn):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
10 global TEMP_IND
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
11 f = open(fn,'r')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
12 ftn = fn+'_temp'+str(TEMP_IND)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
13 f_t = open(ftn,'w')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
14 hasHeader = False
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
15 for line in f:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
16 if line[0] == '#':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
17 if not hasHeader:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
18 TEMP_IND += 1
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
19 hasHeader = True
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
20 elif hasHeader:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
21 f_t.write(line)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
22 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
23 break
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
24 f_t.close()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
25 f.close()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
26 if hasHeader:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
27 return ftn
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
28 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
29 os.system('rm '+ftn)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
30 return fn
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
31
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
32 def catListOfFiles(l,outName,gzipped=False):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
33 for n in l:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
34 if n[-3:] == '.gz' or n[-5:] == '.gzip':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
35 gzipped = True
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
36 if gzipped:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
37 for n in l:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
38 if not n[-3:] == '.gz' and not n[-5:] == '.gzip':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
39 print '\nError: Found a mixture of compressed and decompressed files with the specified prefix. Abandoning ship...\n'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
40 for m in l:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
41 print m
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
42 print ''
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
43 exit(1)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
44 cmd = 'cat '+' '.join(sorted(l))+' > '+outName+'.gz'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
45 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
46 cmd = 'cat '+' '.join(sorted(l))+' > '+outName
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
47 print cmd
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
48 os.system(cmd)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
49
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
50 def catBams(l,outName,samtools_exe):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
51 l_sort = sorted(l)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
52 tmp = outName+'.tempHeader.sam'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
53 os.system(samtools_exe+' view -H '+l_sort[0]+' > '+tmp)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
54 cmd = samtools_exe+' cat -h '+tmp+' '+' '.join(l_sort)+' > '+outName
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
55 print cmd
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
56 os.system(cmd)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
57 os.system('rm '+tmp)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
58
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
59
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
60 #####################################
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
61 # main() #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
62 #####################################
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
63
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
64 def main():
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
65
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
66 parser = argparse.ArgumentParser(description='mergeJobs.py')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
67 parser.add_argument('-i', type=str, required=True, metavar='<str>', nargs='+', help="* input prefix: [prefix_1] [prefix_2] ...")
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
68 parser.add_argument('-o', type=str, required=True, metavar='<str>', help="* output prefix")
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
69 parser.add_argument('-s', type=str, required=True, metavar='<str>', help="* /path/to/samtools")
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
70
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
71 args = parser.parse_args()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
72 (INP,OUP,SAMTOOLS) = (args.i,args.o,args.s)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
73
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
74 inDir = '/'.join(INP[0].split('/')[:-1])+'/'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
75 if inDir == '/':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
76 inDir = './'
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
77 #print inDir
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
78
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
79 INP_LIST = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
80 for n in INP:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
81 if n[-1] == '/':
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
82 n = n[:-1]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
83 INP_LIST.append(n.split('/')[-1])
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
84 listing_r1 = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
85 listing_r2 = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
86 listing_b = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
87 listing_v = []
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
88 for n in INP_LIST:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
89 listing_r1 += getListOfFiles(inDir,n+'_read1.fq.job')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
90 listing_r2 += getListOfFiles(inDir,n+'_read2.fq.job')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
91 listing_b += getListOfFiles(inDir,n+'_golden.bam.job')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
92 if len(listing_v): # remove headers from vcf files that aren't the first being processed
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
93 initList = getListOfFiles(inDir,n+'_golden.vcf.job')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
94 listing_v += [stripVCF_header(n) for n in initList]
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
95 else:
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
96 listing_v += getListOfFiles(inDir,n+'_golden.vcf.job')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
97
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
98 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
99 # merge fq files
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
100 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
101 if len(listing_r1):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
102 catListOfFiles(listing_r1,OUP+'_read1.fq')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
103 if len(listing_r2):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
104 catListOfFiles(listing_r2,OUP+'_read2.fq')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
105
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
106 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
107 # merge golden alignments, if present
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
108 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
109 if len(listing_b):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
110 catBams(listing_b,OUP+'_golden.bam',SAMTOOLS)
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
111
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
112 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
113 # merge golden vcfs, if present
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
114 #
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
115 if len(listing_v):
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
116 catListOfFiles(listing_v,OUP+'_golden.vcf')
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
117
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
118
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
119 if __name__ == "__main__":
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
120 main()
6e75a84e9338 planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff changeset
121