# HG changeset patch # User gbcs-embl-heidelberg # Date 1409731926 14400 # Node ID 1b79b43626efb2dcfc1ca489792d17d34262d901 # Parent 9764802ffae841b7ff14292c3ca3b5a980093e7b Uploaded diff -r 9764802ffae8 -r 1b79b43626ef jemultiplexer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jemultiplexer.py Wed Sep 03 04:12:06 2014 -0400 @@ -0,0 +1,258 @@ +#!/usr/bin/env python + +import os, sys, string, shutil, subprocess, tempfile, re + +def cleanup(tmpdir): + # cleanup + try: + if os.path.exists( tmpdir ): + shutil.rmtree( tmpdir ) + except Exception, e: + stop_err( 'Error cleaning. ' + str( e ) ) + +# In the unlikely event of a fire, please use the nearest emergency exit +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def __main__(): + tooldir = os.path.dirname(sys.argv[0]) + executable = tooldir + "/jemultiplexer_embase_1.0.4_bundle.jar" + if not os.path.exists(executable): + stop_err( "The file \"%s\" was not found. " % ( executable ) ) + mpxdata = sys.argv[1] + output1 = sys.argv[2] + output1id = sys.argv[3] + barcodes = sys.argv[4] + barcode_list = sys.argv[5] + newfilepath = sys.argv[6] + extension = sys.argv[7] + barlen = sys.argv[8] + qualityFormat = sys.argv[9] + maxMismatches = sys.argv[10] + minBaseQuality = sys.argv[11] + minMismatchingDelta = sys.argv[12] + xTrimLen = sys.argv[13] + zTrimLen = sys.argv[14] + clipBarcode = sys.argv[15] + addBarcodeToHeader = sys.argv[16] + gzipOutput = sys.argv[17] + barcodeDiagFile = sys.argv[18] + rChar = sys.argv[19] + barcodeReadPos = sys.argv[20] + barcodeForSampleMatching = sys.argv[21] + redundantBarcode = sys.argv[22] + strict = sys.argv[23] + MpxData2 = sys.argv[24] + +## create the tmpdir & co + tmpdir = newfilepath + "/demultiplex_" + output1id + oldnames=[] + stat = tmpdir+"/jemultiplexer_out_stats.txt" #the default output stat file name + try: + if not os.path.isdir(tmpdir): + os.mkdir(tmpdir) + except Exception, e: + stop_err( "Error creating directory \"%s\". %s" % ( tmpdir, str( e ) ) ) + + #output file extension + if gzipOutput == "true": + outputExtension=".gz" + else: + outputExtension="" + + # Reconstructing the output file names as jemultiplexer writes them + if MpxData2 != "single": + oldnames.append('unassigned_1.txt' + outputExtension) + oldnames.append('unassigned_2.txt' + outputExtension) + else: + oldnames.append('unassigned_1.txt' + outputExtension) + if barcodeDiagFile=="true": + oldnames.append('barcode_match_report.txt') + if barcode_list == "none": # If a .bs file was given + bc = open(barcodes, "r") + bcw = open( tmpdir + "/barcodes.txt", "w" ) + for line in bc.readlines(): + l = line.split() + if l[0] != "": + if MpxData2 != "single": + oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) + oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension) + bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n") + else: + oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) + bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n") + bc.close() + bcw.close() + elif barcodes == "none": # If the text area was used + lines = barcode_list.split("__cr____cn__") + #bc = csv.writer( open( tmpdir + "/barcodes.txt", "w" ), delimiter = "\t" ) + bc = open( tmpdir + "/barcodes.txt", "w" ) + for l in lines: + l = l.replace("__tc__", " ").split() + if len(l) < 2: + continue + if l[0] != "": + if MpxData2 != "single": + oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) + oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension) + bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n") + else: + oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) + bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n") + bc.close() + +## Building the command line + cmd = "java -Xmx8g -jar " + executable + cmd+= " F1=" + cmd+= mpxdata +## if we have PE data + if MpxData2 != "single": + cmd+= " F2=" + cmd+= MpxData2 + cmd+= " BPOS=" + cmd+= barcodeReadPos + cmd+= " BRED=" + cmd+= redundantBarcode + cmd+= " BM=" + cmd+= barcodeForSampleMatching + cmd+= " S=" + cmd+= strict + cmd+= " BF=" + cmd+= tmpdir + "/barcodes.txt" + cmd+= " OUTPUT_DIR=\"" + cmd+= tmpdir + cmd+= "\"" + cmd+= " BCLEN=" + cmd+= barlen + cmd+= " QUALITY_FORMAT=" + cmd+= qualityFormat + cmd+= " MAX_MISMATCHES=" + cmd+= maxMismatches + cmd+= " MIN_BASE_QUALITY=" + cmd+= minBaseQuality + cmd+= " MMD=" + cmd+= minMismatchingDelta + cmd+= " XT=" + cmd+= xTrimLen + cmd+= " ZT=" + cmd+= zTrimLen + cmd+= " C=" + cmd+= clipBarcode + cmd+= " ADD=" + cmd+= addBarcodeToHeader + cmd+= " GZ=" + cmd+= gzipOutput + if rChar=="2": + cmd+= " RCHAR=" + cmd+= ":" + elif rChar=="3": + cmd+= " RCHAR=" + cmd+= "_" + elif rChar=="4": + cmd+= " RCHAR=" + cmd+= "-" + if barcodeDiagFile=="true": + cmd+= " BARCODE_DIAG_FILE=" + cmd+= 'barcode_match_report.txt' + + + # Executing jemultiplexer + # status = os.system(cmd) + try: + tmperr = tempfile.NamedTemporaryFile( dir=tmpdir ).name + tmpout = tempfile.NamedTemporaryFile( dir=tmpdir ).name + tmp_stderr = open( tmperr, 'wb' ) + tmp_stdout = open( tmpout, 'wb' ) + proc = subprocess.Popen( args=cmd, + shell=True, + cwd=tmpdir, + stdout=tmp_stdout.fileno(), + stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + tmp_stdout.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmperr, 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + #cleanup(tmpdir) + stop_err( 'Error demultiplexing sequence. ' + str( e ) ) + + # Creating the required paths for multiple outputs + newnames=[] + if os.path.isdir(tmpdir): + for f in oldnames: + tmpf = tmpdir+"/"+f + if os.path.isfile(tmpf): + # check the size + if os.path.getsize(tmpf) == 0: + stop_err( 'The output file: ' + f + ' is empty, there may be an error with your barcode file or settings.') + name = f + s = "primary_" + s+= output1id + s+= "_" + s+= string.replace(name, "_", "-") + s+= "_visible_" + if extension == "fastqillumina": + if gzipOutput == "true": + s+="gz" + else: + s+= extension + else: + if f=="barcode_match_report.txt": + s+="text" + else: + if gzipOutput == "true": + s+="gz" + else: + s+="fastqsanger" + newnames.append(newfilepath+"/"+s) + # Adding the appropriate prefixes to the old filenames + for i in range(len(oldnames)): + oldnames[i] = tmpdir+"/"+oldnames[i] + + ### NUMSTAT rewriting ### + htmlout = open(output1, "w") + numstat = open(stat, "r") + htmlout.write("numStat\n

Please refresh your history to display the new datasets

\n

numStat

\n") + htmlout.write("\n" % (cmd )) + for l in numstat.readlines(): + l = l.split() + htmlout.write("\n" % (l[0], l[1]) ) + htmlout.write("
%s
%s%s
") + numstat.close() + htmlout.close() + + #~ # Moving the first file (the mandatory output file defined in the xml (the txt stats)) + #~ shutil.move(stat,output1) + #~ + #~ # add a warning to the numStat + #~ statfh = open(stat,'a') + #~ statfh.write("\nRefresh you library to see the new DataSets.\n") + #~ statfh.close() + + # Moving everything where it will be seen properly by Galaxy + for i in range(len(oldnames)): + shutil.move(oldnames[i],newnames[i]) + + # cleanup + cleanup(tmpdir) + +# Ta-da! +if __name__=="__main__": __main__()