comparison py/OutputFileWriter.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e75a84e9338
1 import sys
2 import os
3 import re
4 import gzip
5 from struct import pack
6
7 from biopython_modified_bgzf import BgzfWriter
8
9 BAM_COMPRESSION_LEVEL = 6
10
11 # return the reverse complement of a string
12 RC_DICT = {'A':'T','C':'G','G':'C','T':'A','N':'N'}
13 def RC(s):
14 return ''.join(RC_DICT[n] for n in s[::-1])
15
16 # SAMtools reg2bin function
17 def reg2bin(a,b):
18 b -= 1
19 if (a>>14 == b>>14): return ((1<<15)-1)/7 + (a>>14)
20 if (a>>17 == b>>17): return ((1<<12)-1)/7 + (a>>17)
21 if (a>>20 == b>>20): return ((1<<9)-1)/7 + (a>>20)
22 if (a>>23 == b>>23): return ((1<<6)-1)/7 + (a>>23)
23 if (a>>26 == b>>26): return ((1<<3)-1)/7 + (a>>26)
24 return 0
25
26 CIGAR_PACKED = {'M':0, 'I':1, 'D':2, 'N':3, 'S':4, 'H':5, 'P':6, '=':7, 'X':8}
27 SEQ_PACKED = {'=':0, 'A':1, 'C':2, 'M':3, 'G':4, 'R':5, 'S':6, 'V':7,
28 'T':8, 'W':9, 'Y':10,'H':11,'K':12,'D':13,'B':14,'N':15}
29
30 BUFFER_BATCH_SIZE = 1000 # write out to file after this many reads
31
32 #
33 # outFQ = path to output FASTQ prefix
34 # paired = True for PE reads, False for SE
35 # BAM_header = [refIndex]
36 # VCF_header = [path_to_ref]
37 # gzipped = True for compressed FASTQ/VCF, False for uncompressed
38 #
39 class OutputFileWriter:
40 def __init__(self, outPrefix, paired=False, BAM_header=None, VCF_header=None, gzipped=False, jobTuple=(1,1), noFASTQ=False):
41
42 jobSuffix = ''
43 if jobTuple[1] > 1:
44 jsl = len(str(jobTuple[1]))
45 jsb = '0'*(jsl-len(str(jobTuple[0])))
46 jobSuffix = '.job'+jsb+str(jobTuple[0])+'of'+str(jobTuple[1])
47
48 fq1 = outPrefix+'_read1.fq'+jobSuffix
49 fq2 = outPrefix+'_read2.fq'+jobSuffix
50 bam = outPrefix+'_golden.bam'+jobSuffix
51 vcf = outPrefix+'_golden.vcf'+jobSuffix
52
53 self.noFASTQ = noFASTQ
54 if not self.noFASTQ:
55 if gzipped:
56 self.fq1_file = gzip.open(fq1+'.gz', 'wb')
57 else:
58 self.fq1_file = open(fq1,'w')
59
60 self.fq2_file = None
61 if paired:
62 if gzipped:
63 self.fq2_file = gzip.open(fq2+'.gz', 'wb')
64 else:
65 self.fq2_file = open(fq2,'w')
66
67 #
68 # VCF OUTPUT
69 #
70 self.vcf_file = None
71 if VCF_header != None:
72 if gzipped:
73 self.vcf_file = gzip.open(vcf+'.gz', 'wb')
74 else:
75 self.vcf_file = open(vcf, 'wb')
76
77 # WRITE VCF HEADER (if parallel: only for first job)
78 if jobTuple[0] == 1:
79 self.vcf_file.write('##fileformat=VCFv4.1\n')
80 self.vcf_file.write('##reference='+VCF_header[0]+'\n')
81 self.vcf_file.write('##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n')
82 self.vcf_file.write('##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n')
83 #self.vcf_file.write('##INFO=<ID=READS,Number=1,Type=String,Description="Names of Reads Covering this Variant">\n')
84 self.vcf_file.write('##INFO=<ID=VMX,Number=1,Type=String,Description="SNP is Missense in these Read Frames">\n')
85 self.vcf_file.write('##INFO=<ID=VNX,Number=1,Type=String,Description="SNP is Nonsense in these Read Frames">\n')
86 self.vcf_file.write('##INFO=<ID=VFX,Number=1,Type=String,Description="Indel Causes Frameshift">\n')
87 self.vcf_file.write('##INFO=<ID=WP,Number=A,Type=Integer,Description="NEAT-GenReads ploidy indicator">\n')
88 self.vcf_file.write('##ALT=<ID=DEL,Description="Deletion">\n')
89 self.vcf_file.write('##ALT=<ID=DUP,Description="Duplication">\n')
90 self.vcf_file.write('##ALT=<ID=INS,Description="Insertion of novel sequence">\n')
91 self.vcf_file.write('##ALT=<ID=INV,Description="Inversion">\n')
92 self.vcf_file.write('##ALT=<ID=CNV,Description="Copy number variable region">\n')
93 self.vcf_file.write('##ALT=<ID=TRANS,Description="Translocation">\n')
94 self.vcf_file.write('##ALT=<ID=INV-TRANS,Description="Inverted translocation">\n')
95 self.vcf_file.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n')
96
97 #
98 # BAM OUTPUT
99 #
100 self.bam_file = None
101 if BAM_header != None:
102 self.bam_file = BgzfWriter(bam, 'w', compresslevel=BAM_COMPRESSION_LEVEL)
103
104 # WRITE BAM HEADER (if parallel: only for first job)
105 if True or jobTuple[0] == 1:
106 self.bam_file.write("BAM\1")
107 header = '@HD\tVN:1.5\tSO:coordinate\n'
108 for n in BAM_header[0]:
109 header += '@SQ\tSN:'+n[0]+'\tLN:'+str(n[3])+'\n'
110 header += '@RG\tID:NEAT\tSM:NEAT\tLB:NEAT\tPL:NEAT\n'
111 headerBytes = len(header)
112 numRefs = len(BAM_header[0])
113 self.bam_file.write(pack('<i',headerBytes))
114 self.bam_file.write(header)
115 self.bam_file.write(pack('<i',numRefs))
116
117 for n in BAM_header[0]:
118 l_name = len(n[0])+1
119 self.bam_file.write(pack('<i',l_name))
120 self.bam_file.write(n[0]+'\0')
121 self.bam_file.write(pack('<i',n[3]))
122
123 # buffers for more efficient writing
124 self.fq1_buffer = []
125 self.fq2_buffer = []
126 self.bam_buffer = []
127
128 def writeFASTQRecord(self,readName,read1,qual1,read2=None,qual2=None):
129 ###self.fq1_file.write('@'+readName+'/1\n'+read1+'\n+\n'+qual1+'\n')
130 self.fq1_buffer.append('@'+readName+'/1\n'+read1+'\n+\n'+qual1+'\n')
131 if read2 != None:
132 ####self.fq2_file.write('@'+readName+'/2\n'+RC(read2)+'\n+\n'+qual2[::-1]+'\n')
133 self.fq2_buffer.append('@'+readName+'/2\n'+RC(read2)+'\n+\n'+qual2[::-1]+'\n')
134
135 def writeVCFRecord(self, chrom, pos, idStr, ref, alt, qual, filt, info):
136 self.vcf_file.write(str(chrom)+'\t'+str(pos)+'\t'+str(idStr)+'\t'+str(ref)+'\t'+str(alt)+'\t'+str(qual)+'\t'+str(filt)+'\t'+str(info)+'\n')
137
138 def writeBAMRecord(self, refID, readName, pos_0, cigar, seq, qual, samFlag, matePos=None, alnMapQual=70):
139
140 myBin = reg2bin(pos_0,pos_0+len(seq))
141 #myBin = 0 # or just use a dummy value, does this actually matter?
142
143 myMapQual = alnMapQual
144 cig_letters = re.split(r"\d+",cigar)[1:]
145 cig_numbers = [int(n) for n in re.findall(r"\d+",cigar)]
146 cig_ops = len(cig_letters)
147 next_refID = refID
148 if matePos == None:
149 next_pos = 0
150 my_tlen = 0
151 else:
152 next_pos = matePos
153 if pos_0 < next_pos:
154 my_tlen = next_pos + len(seq) - pos_0
155 else:
156 my_tlen = -pos_0 - len(seq) + next_pos
157
158 encodedCig = ''
159 for i in xrange(cig_ops):
160 encodedCig += pack('<I',(cig_numbers[i]<<4) + CIGAR_PACKED[cig_letters[i]])
161 encodedSeq = ''
162 encodedLen = (len(seq)+1)/2
163 seqLen = len(seq)
164 if seqLen&1:
165 seq += '='
166 for i in xrange(encodedLen):
167 encodedSeq += pack('<B',(SEQ_PACKED[seq[2*i]]<<4) + SEQ_PACKED[seq[2*i+1]])
168
169 # apparently samtools automatically adds 33 to the quality score string...
170 encodedQual = ''.join([chr(ord(n)-33) for n in qual])
171
172 #blockSize = 4 + # refID int32
173 # 4 + # pos int32
174 # 4 + # bin_mq_nl uint32
175 # 4 + # flag_nc uint32
176 # 4 + # l_seq int32
177 # 4 + # next_refID int32
178 # 4 + # next_pos int32
179 # 4 + # tlen int32
180 # len(readName)+1 +
181 # 4*cig_ops +
182 # encodedLen +
183 # len(seq)
184
185 #blockSize = 32 + len(readName)+1 + 4*cig_ops + encodedLen + len(seq)
186 blockSize = 32 + len(readName)+1 + len(encodedCig) + len(encodedSeq) + len(encodedQual)
187
188 ####self.bam_file.write(pack('<i',blockSize))
189 ####self.bam_file.write(pack('<i',refID))
190 ####self.bam_file.write(pack('<i',pos_0))
191 ####self.bam_file.write(pack('<I',(myBin<<16) + (myMapQual<<8) + len(readName)+1))
192 ####self.bam_file.write(pack('<I',(samFlag<<16) + cig_ops))
193 ####self.bam_file.write(pack('<i',seqLen))
194 ####self.bam_file.write(pack('<i',next_refID))
195 ####self.bam_file.write(pack('<i',next_pos))
196 ####self.bam_file.write(pack('<i',my_tlen))
197 ####self.bam_file.write(readName+'\0')
198 ####self.bam_file.write(encodedCig)
199 ####self.bam_file.write(encodedSeq)
200 ####self.bam_file.write(encodedQual)
201
202 # a horribly compressed line, I'm sorry.
203 # (ref_index, position, data)
204 self.bam_buffer.append((refID, pos_0, pack('<i',blockSize) + pack('<i',refID) + pack('<i',pos_0) + pack('<I',(myBin<<16) + (myMapQual<<8) + len(readName)+1) + pack('<I',(samFlag<<16) + cig_ops) + pack('<i',seqLen) + pack('<i',next_refID) + pack('<i',next_pos) + pack('<i',my_tlen) + readName+'\0' + encodedCig + encodedSeq + encodedQual))
205
206
207 def flushBuffers(self,bamMax=None,lastTime=False):
208 if (len(self.fq1_buffer) >= BUFFER_BATCH_SIZE or len(self.bam_buffer) >= BUFFER_BATCH_SIZE) or (len(self.fq1_buffer) and lastTime) or (len(self.bam_buffer) and lastTime):
209 # fq
210 if not self.noFASTQ:
211 self.fq1_file.write(''.join(self.fq1_buffer))
212 if len(self.fq2_buffer):
213 self.fq2_file.write(''.join(self.fq2_buffer))
214 # bam
215 if len(self.bam_buffer):
216 bam_data = sorted(self.bam_buffer)
217 if lastTime:
218 self.bam_file.write(''.join([n[2] for n in bam_data]))
219 self.bam_buffer = []
220 else:
221 ind_to_stop_at = 0
222 for i in xrange(0,len(bam_data)):
223 # if we are from previous reference, or have coordinates lower than next window position, it's safe to write out to file
224 if bam_data[i][0] != bam_data[-1][0] or bam_data[i][1] < bamMax:
225 ind_to_stop_at = i+1
226 else:
227 break
228 self.bam_file.write(''.join([n[2] for n in bam_data[:ind_to_stop_at]]))
229 ####print 'BAM WRITING:',ind_to_stop_at,'/',len(bam_data)
230 if ind_to_stop_at >= len(bam_data):
231 self.bam_buffer = []
232 else:
233 self.bam_buffer = bam_data[ind_to_stop_at:]
234 self.fq1_buffer = []
235 self.fq2_buffer = []
236
237
238 def closeFiles(self):
239 self.flushBuffers(lastTime=True)
240 if not self.noFASTQ:
241 self.fq1_file.close()
242 if self.fq2_file != None:
243 self.fq2_file.close()
244 if self.vcf_file != None:
245 self.vcf_file.close()
246 if self.bam_file != None:
247 self.bam_file.close()
248
249
250
251