comparison jemultiplexer.py @ 2:1b79b43626ef draft

Uploaded
author gbcs-embl-heidelberg
date Wed, 03 Sep 2014 04:12:06 -0400
parents
children 861cbe4eca25
comparison
equal deleted inserted replaced
1:9764802ffae8 2:1b79b43626ef
1 #!/usr/bin/env python
2
3 import os, sys, string, shutil, subprocess, tempfile, re
4
5 def cleanup(tmpdir):
6 # cleanup
7 try:
8 if os.path.exists( tmpdir ):
9 shutil.rmtree( tmpdir )
10 except Exception, e:
11 stop_err( 'Error cleaning. ' + str( e ) )
12
13 # In the unlikely event of a fire, please use the nearest emergency exit
14 def stop_err( msg ):
15 sys.stderr.write( '%s\n' % msg )
16 sys.exit()
17
18 def __main__():
19 tooldir = os.path.dirname(sys.argv[0])
20 executable = tooldir + "/jemultiplexer_embase_1.0.4_bundle.jar"
21 if not os.path.exists(executable):
22 stop_err( "The file \"%s\" was not found. " % ( executable ) )
23 mpxdata = sys.argv[1]
24 output1 = sys.argv[2]
25 output1id = sys.argv[3]
26 barcodes = sys.argv[4]
27 barcode_list = sys.argv[5]
28 newfilepath = sys.argv[6]
29 extension = sys.argv[7]
30 barlen = sys.argv[8]
31 qualityFormat = sys.argv[9]
32 maxMismatches = sys.argv[10]
33 minBaseQuality = sys.argv[11]
34 minMismatchingDelta = sys.argv[12]
35 xTrimLen = sys.argv[13]
36 zTrimLen = sys.argv[14]
37 clipBarcode = sys.argv[15]
38 addBarcodeToHeader = sys.argv[16]
39 gzipOutput = sys.argv[17]
40 barcodeDiagFile = sys.argv[18]
41 rChar = sys.argv[19]
42 barcodeReadPos = sys.argv[20]
43 barcodeForSampleMatching = sys.argv[21]
44 redundantBarcode = sys.argv[22]
45 strict = sys.argv[23]
46 MpxData2 = sys.argv[24]
47
48 ## create the tmpdir & co
49 tmpdir = newfilepath + "/demultiplex_" + output1id
50 oldnames=[]
51 stat = tmpdir+"/jemultiplexer_out_stats.txt" #the default output stat file name
52 try:
53 if not os.path.isdir(tmpdir):
54 os.mkdir(tmpdir)
55 except Exception, e:
56 stop_err( "Error creating directory \"%s\". %s" % ( tmpdir, str( e ) ) )
57
58 #output file extension
59 if gzipOutput == "true":
60 outputExtension=".gz"
61 else:
62 outputExtension=""
63
64 # Reconstructing the output file names as jemultiplexer writes them
65 if MpxData2 != "single":
66 oldnames.append('unassigned_1.txt' + outputExtension)
67 oldnames.append('unassigned_2.txt' + outputExtension)
68 else:
69 oldnames.append('unassigned_1.txt' + outputExtension)
70 if barcodeDiagFile=="true":
71 oldnames.append('barcode_match_report.txt')
72 if barcode_list == "none": # If a .bs file was given
73 bc = open(barcodes, "r")
74 bcw = open( tmpdir + "/barcodes.txt", "w" )
75 for line in bc.readlines():
76 l = line.split()
77 if l[0] != "":
78 if MpxData2 != "single":
79 oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
80 oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension)
81 bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n")
82 else:
83 oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
84 bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n")
85 bc.close()
86 bcw.close()
87 elif barcodes == "none": # If the text area was used
88 lines = barcode_list.split("__cr____cn__")
89 #bc = csv.writer( open( tmpdir + "/barcodes.txt", "w" ), delimiter = "\t" )
90 bc = open( tmpdir + "/barcodes.txt", "w" )
91 for l in lines:
92 l = l.replace("__tc__", " ").split()
93 if len(l) < 2:
94 continue
95 if l[0] != "":
96 if MpxData2 != "single":
97 oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
98 oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension)
99 bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n")
100 else:
101 oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
102 bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n")
103 bc.close()
104
105 ## Building the command line
106 cmd = "java -Xmx8g -jar " + executable
107 cmd+= " F1="
108 cmd+= mpxdata
109 ## if we have PE data
110 if MpxData2 != "single":
111 cmd+= " F2="
112 cmd+= MpxData2
113 cmd+= " BPOS="
114 cmd+= barcodeReadPos
115 cmd+= " BRED="
116 cmd+= redundantBarcode
117 cmd+= " BM="
118 cmd+= barcodeForSampleMatching
119 cmd+= " S="
120 cmd+= strict
121 cmd+= " BF="
122 cmd+= tmpdir + "/barcodes.txt"
123 cmd+= " OUTPUT_DIR=\""
124 cmd+= tmpdir
125 cmd+= "\""
126 cmd+= " BCLEN="
127 cmd+= barlen
128 cmd+= " QUALITY_FORMAT="
129 cmd+= qualityFormat
130 cmd+= " MAX_MISMATCHES="
131 cmd+= maxMismatches
132 cmd+= " MIN_BASE_QUALITY="
133 cmd+= minBaseQuality
134 cmd+= " MMD="
135 cmd+= minMismatchingDelta
136 cmd+= " XT="
137 cmd+= xTrimLen
138 cmd+= " ZT="
139 cmd+= zTrimLen
140 cmd+= " C="
141 cmd+= clipBarcode
142 cmd+= " ADD="
143 cmd+= addBarcodeToHeader
144 cmd+= " GZ="
145 cmd+= gzipOutput
146 if rChar=="2":
147 cmd+= " RCHAR="
148 cmd+= ":"
149 elif rChar=="3":
150 cmd+= " RCHAR="
151 cmd+= "_"
152 elif rChar=="4":
153 cmd+= " RCHAR="
154 cmd+= "-"
155 if barcodeDiagFile=="true":
156 cmd+= " BARCODE_DIAG_FILE="
157 cmd+= 'barcode_match_report.txt'
158
159
160 # Executing jemultiplexer
161 # status = os.system(cmd)
162 try:
163 tmperr = tempfile.NamedTemporaryFile( dir=tmpdir ).name
164 tmpout = tempfile.NamedTemporaryFile( dir=tmpdir ).name
165 tmp_stderr = open( tmperr, 'wb' )
166 tmp_stdout = open( tmpout, 'wb' )
167 proc = subprocess.Popen( args=cmd,
168 shell=True,
169 cwd=tmpdir,
170 stdout=tmp_stdout.fileno(),
171 stderr=tmp_stderr.fileno() )
172 returncode = proc.wait()
173 tmp_stderr.close()
174 tmp_stdout.close()
175 # get stderr, allowing for case where it's very large
176 tmp_stderr = open( tmperr, 'rb' )
177 stderr = ''
178 buffsize = 1048576
179 try:
180 while True:
181 stderr += tmp_stderr.read( buffsize )
182 if not stderr or len( stderr ) % buffsize != 0:
183 break
184 except OverflowError:
185 pass
186 tmp_stderr.close()
187 if returncode != 0:
188 raise Exception, stderr
189 except Exception, e:
190 #cleanup(tmpdir)
191 stop_err( 'Error demultiplexing sequence. ' + str( e ) )
192
193 # Creating the required paths for multiple outputs
194 newnames=[]
195 if os.path.isdir(tmpdir):
196 for f in oldnames:
197 tmpf = tmpdir+"/"+f
198 if os.path.isfile(tmpf):
199 # check the size
200 if os.path.getsize(tmpf) == 0:
201 stop_err( 'The output file: ' + f + ' is empty, there may be an error with your barcode file or settings.')
202 name = f
203 s = "primary_"
204 s+= output1id
205 s+= "_"
206 s+= string.replace(name, "_", "-")
207 s+= "_visible_"
208 if extension == "fastqillumina":
209 if gzipOutput == "true":
210 s+="gz"
211 else:
212 s+= extension
213 else:
214 if f=="barcode_match_report.txt":
215 s+="text"
216 else:
217 if gzipOutput == "true":
218 s+="gz"
219 else:
220 s+="fastqsanger"
221 newnames.append(newfilepath+"/"+s)
222 # Adding the appropriate prefixes to the old filenames
223 for i in range(len(oldnames)):
224 oldnames[i] = tmpdir+"/"+oldnames[i]
225
226 ### NUMSTAT rewriting ###
227 htmlout = open(output1, "w")
228 numstat = open(stat, "r")
229 htmlout.write("<html><head><title>numStat</title></head><body><!--\n")
230 for l in numstat.readlines():
231 htmlout.write(l)
232 numstat.seek(0)
233 htmlout.write("-->\n<h2>Please refresh your history to display the new datasets</h2>\n<h3>numStat</h3><table border=\"1\">\n")
234 htmlout.write("<tr><td>%s</td></tr>\n" % (cmd ))
235 for l in numstat.readlines():
236 l = l.split()
237 htmlout.write("<tr><td>%s</td><td>%s</td></tr>\n" % (l[0], l[1]) )
238 htmlout.write("</table></body></html>")
239 numstat.close()
240 htmlout.close()
241
242 #~ # Moving the first file (the mandatory output file defined in the xml (the txt stats))
243 #~ shutil.move(stat,output1)
244 #~
245 #~ # add a warning to the numStat
246 #~ statfh = open(stat,'a')
247 #~ statfh.write("\nRefresh you library to see the new DataSets.\n")
248 #~ statfh.close()
249
250 # Moving everything where it will be seen properly by Galaxy
251 for i in range(len(oldnames)):
252 shutil.move(oldnames[i],newnames[i])
253
254 # cleanup
255 cleanup(tmpdir)
256
257 # Ta-da!
258 if __name__=="__main__": __main__()