comparison mergeJobs.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e75a84e9338
1 #!/usr/bin/env python
import argparse
import os
import shlex
import shutil
import subprocess
4
def getListOfFiles(inDir,pattern):
    """Return paths of the non-empty entries of ``inDir`` whose name contains ``pattern``.

    Args:
        inDir: directory to scan; a trailing path separator is optional.
        pattern: substring that must occur anywhere in the entry name.

    Returns:
        A list of ``inDir``-joined paths, in the order ``os.listdir`` yields
        them (i.e. unsorted). Entries whose size is 0 are skipped.

    Raises:
        OSError: if ``inDir`` cannot be listed or a matching entry cannot be
            stat'ed.
    """
    matches = []
    for name in os.listdir(inDir):
        if pattern not in name:
            continue
        # os.path.join yields the same string as plain concatenation when
        # inDir already ends in a separator, and inserts one when it doesn't.
        path = os.path.join(inDir, name)
        if os.path.getsize(path):
            matches.append(path)
    return matches
7
# Counter appended to temp-file names; bumped each time a header is stripped.
TEMP_IND = 0


def stripVCF_header(fn):
    """Write a copy of ``fn`` without its '#' header lines and return its path.

    The copy is named ``fn + '_temp' + str(TEMP_IND)``. Reading stops at the
    first non-'#' line if no header line has been seen yet, in which case the
    file is considered header-less.

    Args:
        fn: path of the text file to process.

    Returns:
        The path of the header-less copy if ``fn`` contained at least one
        line starting with '#'; otherwise ``fn`` itself (the empty temp file
        is deleted again).

    Side effects:
        Increments the module-level ``TEMP_IND`` once per file that has a
        header. The returned temp file is left on disk for the caller.
    """
    global TEMP_IND
    ftn = fn + '_temp' + str(TEMP_IND)
    hasHeader = False
    # Context managers guarantee both handles are closed even on I/O errors.
    with open(fn, 'r') as src, open(ftn, 'w') as dst:
        for line in src:
            if line.startswith('#'):
                if not hasHeader:
                    TEMP_IND += 1
                    hasHeader = True
            elif hasHeader:
                dst.write(line)
            else:
                # First line is not a header line: nothing to strip.
                break
    if hasHeader:
        return ftn
    # os.remove instead of a shell 'rm' so paths containing spaces or shell
    # metacharacters are handled correctly.
    os.remove(ftn)
    return fn
31
def catListOfFiles(l,outName,gzipped=False):
    """Concatenate the files in ``l`` (in sorted order) into one output file.

    If any input name ends in '.gz' or '.gzip' (or ``gzipped`` is True), all
    inputs must carry one of those suffixes and the output is written to
    ``outName + '.gz'``; otherwise the output is written to ``outName``.

    Args:
        l: list of input file paths.
        outName: output path (without the '.gz' suffix).
        gzipped: force compressed-mode handling even if no suffix is detected.

    Raises:
        SystemExit: with code 1 when compressed and uncompressed inputs are
            mixed (an error listing is printed first).
        IOError/OSError: if an input cannot be read or the output cannot be
            written.
    """
    gz_suffixes = ('.gz', '.gzip')
    ordered = sorted(l)
    if any(name.endswith(gz_suffixes) for name in ordered):
        gzipped = True
    if gzipped:
        if not all(name.endswith(gz_suffixes) for name in ordered):
            print('\nError: Found a mixture of compressed and decompressed files with the specified prefix. Abandoning ship...\n')
            for name in l:
                print(name)
            print('')
            raise SystemExit(1)
        outName = outName + '.gz'
    # Logged for traceability only: the merge itself is done in-process so
    # that file names with spaces or shell metacharacters cannot break it.
    print('cat ' + ' '.join(ordered) + ' > ' + outName)
    # Binary mode: byte-wise concatenation is valid for plain text and for
    # gzip members alike.
    with open(outName, 'wb') as merged:
        for name in ordered:
            with open(name, 'rb') as part:
                shutil.copyfileobj(part, merged)
49
def catBams(l,outName,samtools_exe):
    """Concatenate BAM files (in sorted order) into ``outName`` using samtools.

    The header of the first sorted input is extracted with
    ``samtools view -H`` into ``outName + '.tempHeader.sam'`` and passed to
    ``samtools cat -h``; the temporary header file is removed afterwards.

    Args:
        l: non-empty list of input BAM paths.
        outName: path of the merged BAM file to write.
        samtools_exe: samtools command; it is split into words the way a
            shell would, so a bare path works as before.

    Raises:
        IndexError: if ``l`` is empty.
        OSError: if the samtools executable cannot be started.

    The exit status of samtools is not checked.
    """
    bams = sorted(l)
    first = bams[0]
    samtools = shlex.split(samtools_exe)
    tmp = outName + '.tempHeader.sam'
    try:
        # Argument lists (no shell) keep file names with spaces or shell
        # metacharacters intact; stdout is redirected by handing over a file.
        with open(tmp, 'wb') as header:
            subprocess.call(samtools + ['view', '-H', first], stdout=header)
        print(samtools_exe + ' cat -h ' + tmp + ' ' + ' '.join(bams) + ' > ' + outName)
        with open(outName, 'wb') as merged:
            subprocess.call(samtools + ['cat', '-h', tmp] + bams, stdout=merged)
    finally:
        # Always clean up the temporary header, even if samtools failed to start.
        if os.path.exists(tmp):
            os.remove(tmp)
58
59
60 #####################################
61 # main() #
62 #####################################
63
def main():
    """Merge per-job output files of one or more input prefixes.

    Command-line options:
        -i  one or more input prefixes; the directory part of the FIRST
            prefix is the directory searched for all prefixes.
        -o  output prefix.
        -s  path to the samtools executable.

    For every prefix, files whose names contain ``<prefix>_read1.fq.job``,
    ``<prefix>_read2.fq.job``, ``<prefix>_golden.bam.job`` and
    ``<prefix>_golden.vcf.job`` are collected and merged into
    ``<out>_read1.fq``, ``<out>_read2.fq``, ``<out>_golden.bam`` and
    ``<out>_golden.vcf`` respectively (only for kinds that were found).
    """
    parser = argparse.ArgumentParser(description='mergeJobs.py')
    parser.add_argument('-i', type=str, required=True, metavar='<str>', nargs='+', help="* input prefix: [prefix_1] [prefix_2] ...")
    parser.add_argument('-o', type=str, required=True, metavar='<str>', help="* output prefix")
    parser.add_argument('-s', type=str, required=True, metavar='<str>', help="* /path/to/samtools")

    args = parser.parse_args()
    (INP,OUP,SAMTOOLS) = (args.i,args.o,args.s)

    # Directory of the first prefix, always with a trailing '/'. A bare
    # prefix maps to './'; a prefix directly under '/' keeps '/' as its
    # directory instead of being mistaken for the current directory.
    inDir = os.path.dirname(INP[0])
    if not inDir:
        inDir = './'
    elif not inDir.endswith('/'):
        inDir += '/'

    # Base name of every prefix, ignoring a single trailing '/'.
    INP_LIST = []
    for prefix in INP:
        if prefix.endswith('/'):
            prefix = prefix[:-1]
        INP_LIST.append(prefix.split('/')[-1])

    listing_r1 = []
    listing_r2 = []
    listing_b = []
    listing_v = []
    temp_vcfs = []  # header-less copies created below; removed after merging
    for prefix in INP_LIST:
        listing_r1 += getListOfFiles(inDir, prefix + '_read1.fq.job')
        listing_r2 += getListOfFiles(inDir, prefix + '_read2.fq.job')
        listing_b += getListOfFiles(inDir, prefix + '_golden.bam.job')
        vcf_jobs = getListOfFiles(inDir, prefix + '_golden.vcf.job')
        if listing_v:
            # remove headers from vcf files that aren't the first being processed
            for vcf in vcf_jobs:
                stripped = stripVCF_header(vcf)
                if stripped != vcf:
                    temp_vcfs.append(stripped)
                listing_v.append(stripped)
        else:
            listing_v += vcf_jobs

    try:
        #
        # merge fq files
        #
        if listing_r1:
            catListOfFiles(listing_r1, OUP + '_read1.fq')
        if listing_r2:
            catListOfFiles(listing_r2, OUP + '_read2.fq')

        #
        # merge golden alignments, if present
        #
        if listing_b:
            catBams(listing_b, OUP + '_golden.bam', SAMTOOLS)

        #
        # merge golden vcfs, if present
        #
        if listing_v:
            catListOfFiles(listing_v, OUP + '_golden.vcf')
    finally:
        # The temp copies still contain '_golden.vcf.job' in their names, so
        # leaving them behind would make a later run merge them a second time.
        for tmp in temp_vcfs:
            if os.path.exists(tmp):
                os.remove(tmp)


if __name__ == "__main__":
    main()
121