Mercurial > repos > thondeboer > neat_genreads
comparison mergeJobs.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author | thondeboer |
---|---|
date | Tue, 15 May 2018 02:39:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6e75a84e9338 |
---|---|
1 #!/usr/bin/env python | |
2 import os | |
3 import argparse | |
4 | |
def getListOfFiles(inDir,pattern):
    """Return paths of the non-empty entries of inDir whose name contains pattern.

    inDir   -- directory to scan; a trailing path separator is optional
    pattern -- plain substring (not a glob or regex) tested against each
               entry name

    Each result is inDir joined with the entry name. Results come back in
    whatever order os.listdir() yields them. Entries whose size is zero are
    skipped.
    """
    matches = []
    for name in os.listdir(inDir):
        if pattern not in name:
            continue
        # os.path.join only inserts a separator when inDir lacks one, so
        # callers that already pass 'dir/' get exactly the same paths as with
        # plain string concatenation.
        path = os.path.join(inDir,name)
        if os.path.getsize(path):
            matches.append(path)
    return matches
7 | |
# Running counter appended to the names of header-stripped temp files; it is
# advanced once for every input file in which a header was found.
TEMP_IND = 0
def stripVCF_header(fn):
    """Write a copy of fn without its leading '#' header lines.

    fn -- path of the VCF file to read

    Returns the path of the stripped copy (fn + '_temp' + counter) when fn
    starts with at least one '#' line. If the first line is not a '#' line, or
    fn is empty, no copy is kept and fn itself is returned.

    Once a header has been seen, every later line starting with '#' is also
    dropped from the copy.
    """
    global TEMP_IND
    ftn = fn+'_temp'+str(TEMP_IND)
    hasHeader = False
    # 'with' guarantees both handles are closed even if reading/writing fails
    with open(fn,'r') as f:
        with open(ftn,'w') as f_t:
            for line in f:
                if line.startswith('#'):
                    if not hasHeader:
                        TEMP_IND += 1
                        hasHeader = True
                elif hasHeader:
                    f_t.write(line)
                else:
                    # first line is data: there is no header, nothing to strip
                    break
    if hasHeader:
        return ftn
    # os.remove instead of shelling out to 'rm': works for paths containing
    # spaces or shell metacharacters and does not depend on an external tool
    os.remove(ftn)
    return fn
31 | |
def catListOfFiles(l,outName,gzipped=False):
    """Concatenate the files in l, in sorted name order, into one output file.

    l        -- list of input file paths
    outName  -- output path; '.gz' is appended when the inputs are compressed
    gzipped  -- treat the inputs as compressed even before looking at their
                names; it is switched on automatically when any input name
                ends in '.gz' or '.gzip'

    When compressed handling is on and at least one input lacks a '.gz' /
    '.gzip' suffix, an error and the full input list are printed and the
    process exits with status 1.

    The bytes are copied directly in Python rather than through a shell
    'cat' command, so file names containing spaces or shell metacharacters
    are handled correctly.
    """
    import shutil

    gzSuffixes = ('.gz','.gzip')
    ordered = sorted(l)
    compressed = [name.endswith(gzSuffixes) for name in ordered]
    if any(compressed):
        gzipped = True
    if gzipped:
        if not all(compressed):
            print('\nError: Found a mixture of compressed and decompressed files with the specified prefix. Abandoning ship...\n')
            for m in l:
                print(m)
            print('')
            # same effect as exit(1), without relying on the site module
            raise SystemExit(1)
        outName = outName+'.gz'
    # keep the familiar log line describing the merge being performed
    print('cat '+' '.join(ordered)+' > '+outName)
    # binary mode: the data is copied verbatim, compressed or not
    with open(outName,'wb') as merged:
        for name in ordered:
            with open(name,'rb') as part:
                shutil.copyfileobj(part,merged)
49 | |
def catBams(l,outName,samtools_exe):
    """Merge BAM files with 'samtools cat', reusing the header of the first one.

    l            -- list of BAM paths; must not be empty
    outName      -- path of the merged BAM to write
    samtools_exe -- samtools command; split into words the way a shell would,
                    so it may carry leading words or quoted parts

    The header of the first file (in sorted order) is dumped to a temporary
    '<outName>.tempHeader.sam', passed to 'samtools cat -h', and removed
    afterwards even if one of the steps fails. The exit status of samtools is
    not checked.
    """
    import shlex
    import subprocess

    l_sort = sorted(l)
    first = l_sort[0]
    tmp = outName+'.tempHeader.sam'
    # Argument lists (no shell) keep file names with spaces or shell
    # metacharacters intact.
    exe = shlex.split(samtools_exe)
    try:
        with open(tmp,'wb') as header:
            subprocess.call(exe+['view','-H',first],stdout=header)
        # log line describing the merge being performed
        print(samtools_exe+' cat -h '+tmp+' '+' '.join(l_sort)+' > '+outName)
        with open(outName,'wb') as merged:
            subprocess.call(exe+['cat','-h',tmp]+l_sort,stdout=merged)
    finally:
        if os.path.exists(tmp):
            os.remove(tmp)
58 | |
59 | |
60 ##################################### | |
61 # main() # | |
62 ##################################### | |
63 | |
def main():
    """Command-line entry point: merge per-job fastq, BAM and VCF outputs.

    Arguments (all required):
      -i  one or more input prefixes; the directory part of the FIRST prefix
          is the directory searched for every prefix
      -o  output prefix
      -s  path to samtools

    For each prefix the directory is searched for non-empty files whose name
    contains '<prefix>_read1.fq.job', '<prefix>_read2.fq.job',
    '<prefix>_golden.bam.job' or '<prefix>_golden.vcf.job'. Each kind that is
    found is merged into '<output prefix>_read1.fq', '_read2.fq',
    '_golden.bam' and '_golden.vcf' respectively.
    """
    parser = argparse.ArgumentParser(description='mergeJobs.py')
    parser.add_argument('-i', type=str, required=True, metavar='<str>', nargs='+', help="* input prefix: [prefix_1] [prefix_2] ...")
    parser.add_argument('-o', type=str, required=True, metavar='<str>', help="* output prefix")
    parser.add_argument('-s', type=str, required=True, metavar='<str>', help="* /path/to/samtools")

    args = parser.parse_args()
    (INP,OUP,SAMTOOLS) = (args.i,args.o,args.s)

    # Directory to search = everything up to and including the last '/' of the
    # first prefix. A prefix without any '/' means the current directory; a
    # prefix directly under the root ('/name') correctly yields '/'.
    cut = INP[0].rfind('/')
    if cut >= 0:
        inDir = INP[0][:cut+1]
    else:
        inDir = './'

    # Bare prefix names: drop one trailing '/' and any leading directories.
    INP_LIST = []
    for raw in INP:
        if raw.endswith('/'):
            raw = raw[:-1]
        INP_LIST.append(raw.split('/')[-1])

    listing_r1 = []
    listing_r2 = []
    listing_b = []
    listing_v = []
    tempVcfs = []    # header-stripped copies created here; removed at the end
    for prefix in INP_LIST:
        listing_r1 += getListOfFiles(inDir,prefix+'_read1.fq.job')
        listing_r2 += getListOfFiles(inDir,prefix+'_read2.fq.job')
        listing_b += getListOfFiles(inDir,prefix+'_golden.bam.job')
        vcfJobs = getListOfFiles(inDir,prefix+'_golden.vcf.job')
        if listing_v:
            # remove headers from vcf files that aren't the first being processed
            for vcf in vcfJobs:
                stripped = stripVCF_header(vcf)
                if stripped != vcf:
                    tempVcfs.append(stripped)
                listing_v.append(stripped)
        else:
            listing_v += vcfJobs
    # NOTE(review): catListOfFiles() re-sorts its inputs by path, so the files
    # that kept their header are only written first if they also sort first --
    # confirm this holds for the prefixes used in practice.

    try:
        #
        # merge fq files
        #
        if listing_r1:
            catListOfFiles(listing_r1,OUP+'_read1.fq')
        if listing_r2:
            catListOfFiles(listing_r2,OUP+'_read2.fq')

        #
        # merge golden alignments, if present
        #
        if listing_b:
            catBams(listing_b,OUP+'_golden.bam',SAMTOOLS)

        #
        # merge golden vcfs, if present
        #
        if listing_v:
            catListOfFiles(listing_v,OUP+'_golden.vcf')
    finally:
        # The stripped copies keep '_golden.vcf.job' in their names, so if
        # they were left on disk a later run would pick them up as inputs.
        for tempVcf in tempVcfs:
            try:
                os.remove(tempVcf)
            except OSError:
                pass


if __name__ == "__main__":
    main()
121 |