changeset 2:1b79b43626ef draft

Uploaded
author gbcs-embl-heidelberg
date Wed, 03 Sep 2014 04:12:06 -0400
parents 9764802ffae8
children 321b695b1a33
files jemultiplexer.py
diffstat 1 files changed, 258 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jemultiplexer.py	Wed Sep 03 04:12:06 2014 -0400
@@ -0,0 +1,258 @@
+#!/usr/bin/env python
+
+import os, sys, string, shutil, subprocess, tempfile, re
+
+def cleanup(tmpdir):
+	# cleanup
+	try:
+		if os.path.exists( tmpdir ):
+			shutil.rmtree( tmpdir )
+	except Exception, e:
+		stop_err( 'Error cleaning. ' + str( e ) )
+
+# In the unlikely event of a fire, please use the nearest emergency exit
+def stop_err( msg ):
+	sys.stderr.write( '%s\n' % msg )
+	sys.exit()
+	
+def __main__():
+	tooldir = os.path.dirname(sys.argv[0])
+	executable = tooldir + "/jemultiplexer_embase_1.0.4_bundle.jar"
+	if not os.path.exists(executable):
+		stop_err( "The file \"%s\" was not found. " % ( executable ) )
+	mpxdata         = sys.argv[1]
+	output1         = sys.argv[2]
+	output1id       = sys.argv[3]
+	barcodes        = sys.argv[4]
+	barcode_list    = sys.argv[5]
+	newfilepath     = sys.argv[6]
+	extension       = sys.argv[7]
+	barlen          = sys.argv[8]
+	qualityFormat        = sys.argv[9]
+	maxMismatches        = sys.argv[10]
+	minBaseQuality        = sys.argv[11]
+	minMismatchingDelta        = sys.argv[12]
+	xTrimLen        = sys.argv[13]
+	zTrimLen        = sys.argv[14]
+	clipBarcode        = sys.argv[15]
+	addBarcodeToHeader        = sys.argv[16]
+	gzipOutput        = sys.argv[17]
+	barcodeDiagFile        = sys.argv[18]
+	rChar        = sys.argv[19]
+	barcodeReadPos        = sys.argv[20]
+	barcodeForSampleMatching        = sys.argv[21]
+	redundantBarcode        = sys.argv[22]
+	strict        = sys.argv[23]
+	MpxData2        = sys.argv[24]
+
+## create the tmpdir & co
+	tmpdir = newfilepath + "/demultiplex_" + output1id
+	oldnames=[]
+	stat = tmpdir+"/jemultiplexer_out_stats.txt" #the default output stat file name
+	try:
+		if not os.path.isdir(tmpdir):
+			os.mkdir(tmpdir)
+	except Exception, e:
+		stop_err( "Error creating directory \"%s\". %s" % ( tmpdir, str( e ) ) )
+		
+	#output file extension
+	if gzipOutput == "true":
+			outputExtension=".gz"
+	else:
+			outputExtension=""
+
+	# Reconstructing the output file names as jemultiplexer writes them
+	if MpxData2 != "single":
+		oldnames.append('unassigned_1.txt' + outputExtension)
+		oldnames.append('unassigned_2.txt' + outputExtension)
+	else:
+		oldnames.append('unassigned_1.txt' + outputExtension)
+	if barcodeDiagFile=="true":
+		oldnames.append('barcode_match_report.txt')
+	if barcode_list == "none": # If a .bs file was given
+		bc = open(barcodes, "r")
+		bcw = open( tmpdir + "/barcodes.txt", "w" )
+		for line in bc.readlines():
+			l = line.split()
+			if l[0] != "":
+				if MpxData2 != "single":
+					oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
+					oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension)
+					bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n")
+				else:
+					oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
+					bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n")
+		bc.close()
+		bcw.close()
+	elif barcodes == "none": # If the text area was used
+		lines =  barcode_list.split("__cr____cn__")
+		#bc = csv.writer( open( tmpdir + "/barcodes.txt", "w" ), delimiter = "\t" )
+		bc = open( tmpdir + "/barcodes.txt", "w" )
+		for l in lines:
+			l = l.replace("__tc__", " ").split()
+			if len(l) < 2:
+				continue
+			if l[0] != "":
+				if MpxData2 != "single":
+					oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
+					oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension)
+					bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n")
+				else:
+					oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
+					bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n")
+		bc.close()
+
+## Building the command line
+	cmd = "java -Xmx8g -jar " + executable
+	cmd+= " F1="
+	cmd+= mpxdata
+## if we have PE data
+	if MpxData2 != "single":
+		cmd+= " F2="
+		cmd+= MpxData2
+		cmd+= " BPOS="
+		cmd+= barcodeReadPos
+		cmd+= " BRED="
+		cmd+= redundantBarcode
+		cmd+= " BM="
+		cmd+= barcodeForSampleMatching
+		cmd+= " S="
+		cmd+= strict
+	cmd+= " BF="
+	cmd+= tmpdir + "/barcodes.txt"
+	cmd+= " OUTPUT_DIR=\""
+	cmd+= tmpdir
+	cmd+= "\""
+	cmd+= " BCLEN="
+	cmd+= barlen
+	cmd+= " QUALITY_FORMAT="
+	cmd+= qualityFormat
+	cmd+= " MAX_MISMATCHES="
+	cmd+= maxMismatches
+	cmd+= " MIN_BASE_QUALITY="
+	cmd+= minBaseQuality
+	cmd+= " MMD="
+	cmd+= minMismatchingDelta
+	cmd+= " XT="
+	cmd+= xTrimLen
+	cmd+= " ZT="
+	cmd+= zTrimLen
+	cmd+= " C="
+	cmd+= clipBarcode
+	cmd+= " ADD="
+	cmd+= addBarcodeToHeader
+	cmd+= " GZ="
+	cmd+= gzipOutput
+	if rChar=="2":
+		cmd+= " RCHAR="
+		cmd+= ":"
+	elif rChar=="3":
+		cmd+= " RCHAR="
+		cmd+= "_"
+	elif rChar=="4":
+		cmd+= " RCHAR="
+		cmd+= "-"
+	if barcodeDiagFile=="true":
+		cmd+= " BARCODE_DIAG_FILE="
+		cmd+= 'barcode_match_report.txt'
+
+			
+	# Executing jemultiplexer
+	# status = os.system(cmd)
+	try:
+		tmperr = tempfile.NamedTemporaryFile( dir=tmpdir ).name
+		tmpout = tempfile.NamedTemporaryFile( dir=tmpdir ).name
+		tmp_stderr = open( tmperr, 'wb' )
+		tmp_stdout = open( tmpout, 'wb' )
+		proc = subprocess.Popen( args=cmd,
+					 shell=True,
+					 cwd=tmpdir,
+					 stdout=tmp_stdout.fileno(),
+					 stderr=tmp_stderr.fileno() )
+		returncode = proc.wait()
+		tmp_stderr.close()
+		tmp_stdout.close()
+		# get stderr, allowing for case where it's very large
+		tmp_stderr = open( tmperr, 'rb' )
+		stderr = ''
+		buffsize = 1048576
+		try:
+			while True:
+				stderr += tmp_stderr.read( buffsize )
+				if not stderr or len( stderr ) % buffsize != 0:
+					break
+       		except OverflowError:
+				pass
+		tmp_stderr.close()
+		if returncode != 0:
+			raise Exception, stderr
+	except Exception, e:
+		#cleanup(tmpdir)
+		stop_err( 'Error demultiplexing sequence. ' + str( e ) )
+
+	# Creating the required paths for multiple outputs
+	newnames=[]
+	if os.path.isdir(tmpdir):
+		for f in oldnames:
+			tmpf = tmpdir+"/"+f
+			if os.path.isfile(tmpf):
+				# check the size
+				if os.path.getsize(tmpf) == 0:
+					stop_err( 'The output file: ' + f + ' is empty, there may be an error with your barcode file or settings.')
+				name = f
+				s = "primary_"
+				s+= output1id
+				s+= "_"
+				s+= string.replace(name, "_", "-")
+				s+= "_visible_"
+				if extension == "fastqillumina":
+					if gzipOutput == "true":
+						s+="gz"
+					else:
+						s+= extension
+				else:
+					if f=="barcode_match_report.txt":
+						s+="text"
+					else:
+						if gzipOutput == "true":
+							s+="gz"
+						else:
+							s+="fastqsanger"
+				newnames.append(newfilepath+"/"+s)				
+	# Adding the appropriate prefixes to the old filenames		
+	for i in range(len(oldnames)):
+		oldnames[i] = tmpdir+"/"+oldnames[i]
+		
+	### NUMSTAT rewriting ###	
+	htmlout = open(output1, "w")
+	numstat = open(stat, "r")
+	htmlout.write("<html><head><title>numStat</title></head><body><!--\n")
+	for l in numstat.readlines():
+		htmlout.write(l)
+	numstat.seek(0)
+	htmlout.write("-->\n<h2>Please refresh your history to display the new datasets</h2>\n<h3>numStat</h3><table border=\"1\">\n")
+	htmlout.write("<tr><td>%s</td></tr>\n" % (cmd ))
+	for l in numstat.readlines():
+		l = l.split()
+		htmlout.write("<tr><td>%s</td><td>%s</td></tr>\n" % (l[0], l[1]) )
+	htmlout.write("</table></body></html>")
+	numstat.close()
+	htmlout.close()
+
+	#~ # Moving the first file (the mandatory output file defined in the xml (the txt stats))
+	#~ shutil.move(stat,output1)
+		#~ 
+	#~ # add a warning to the numStat
+	#~ statfh = open(stat,'a')
+	#~ statfh.write("\nRefresh you library to see the new DataSets.\n")
+	#~ statfh.close()
+
+	# Moving everything where it will be seen properly by Galaxy
+	for i in range(len(oldnames)):
+		shutil.move(oldnames[i],newnames[i])
+
+	# cleanup
+	cleanup(tmpdir)
+
+# Ta-da!
+if __name__=="__main__": __main__()