view jemultiplexer.py @ 2:1b79b43626ef draft

Uploaded
author gbcs-embl-heidelberg
date Wed, 03 Sep 2014 04:12:06 -0400
parents
children 861cbe4eca25
line wrap: on
line source

#!/usr/bin/env python

import os, sys, string, shutil, subprocess, tempfile, re

def cleanup(tmpdir):
	# cleanup
	try:
		if os.path.exists( tmpdir ):
			shutil.rmtree( tmpdir )
	except Exception, e:
		stop_err( 'Error cleaning. ' + str( e ) )

# In the unlikely event of a fire, please use the nearest emergency exit
def stop_err( msg ):
	sys.stderr.write( '%s\n' % msg )
	sys.exit()
	
def __main__():
	tooldir = os.path.dirname(sys.argv[0])
	executable = tooldir + "/jemultiplexer_embase_1.0.4_bundle.jar"
	if not os.path.exists(executable):
		stop_err( "The file \"%s\" was not found. " % ( executable ) )
	mpxdata         = sys.argv[1]
	output1         = sys.argv[2]
	output1id       = sys.argv[3]
	barcodes        = sys.argv[4]
	barcode_list    = sys.argv[5]
	newfilepath     = sys.argv[6]
	extension       = sys.argv[7]
	barlen          = sys.argv[8]
	qualityFormat        = sys.argv[9]
	maxMismatches        = sys.argv[10]
	minBaseQuality        = sys.argv[11]
	minMismatchingDelta        = sys.argv[12]
	xTrimLen        = sys.argv[13]
	zTrimLen        = sys.argv[14]
	clipBarcode        = sys.argv[15]
	addBarcodeToHeader        = sys.argv[16]
	gzipOutput        = sys.argv[17]
	barcodeDiagFile        = sys.argv[18]
	rChar        = sys.argv[19]
	barcodeReadPos        = sys.argv[20]
	barcodeForSampleMatching        = sys.argv[21]
	redundantBarcode        = sys.argv[22]
	strict        = sys.argv[23]
	MpxData2        = sys.argv[24]

## create the tmpdir & co
	tmpdir = newfilepath + "/demultiplex_" + output1id
	oldnames=[]
	stat = tmpdir+"/jemultiplexer_out_stats.txt" #the default output stat file name
	try:
		if not os.path.isdir(tmpdir):
			os.mkdir(tmpdir)
	except Exception, e:
		stop_err( "Error creating directory \"%s\". %s" % ( tmpdir, str( e ) ) )
		
	#output file extension
	if gzipOutput == "true":
			outputExtension=".gz"
	else:
			outputExtension=""

	# Reconstructing the output file names as jemultiplexer writes them
	if MpxData2 != "single":
		oldnames.append('unassigned_1.txt' + outputExtension)
		oldnames.append('unassigned_2.txt' + outputExtension)
	else:
		oldnames.append('unassigned_1.txt' + outputExtension)
	if barcodeDiagFile=="true":
		oldnames.append('barcode_match_report.txt')
	if barcode_list == "none": # If a .bs file was given
		bc = open(barcodes, "r")
		bcw = open( tmpdir + "/barcodes.txt", "w" )
		for line in bc.readlines():
			l = line.split()
			if l[0] != "":
				if MpxData2 != "single":
					oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
					oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension)
					bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n")
				else:
					oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
					bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n")
		bc.close()
		bcw.close()
	elif barcodes == "none": # If the text area was used
		lines =  barcode_list.split("__cr____cn__")
		#bc = csv.writer( open( tmpdir + "/barcodes.txt", "w" ), delimiter = "\t" )
		bc = open( tmpdir + "/barcodes.txt", "w" )
		for l in lines:
			l = l.replace("__tc__", " ").split()
			if len(l) < 2:
				continue
			if l[0] != "":
				if MpxData2 != "single":
					oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
					oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension)
					bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n")
				else:
					oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension)
					bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n")
		bc.close()

## Building the command line
	cmd = "java -Xmx8g -jar " + executable
	cmd+= " F1="
	cmd+= mpxdata
## if we have PE data
	if MpxData2 != "single":
		cmd+= " F2="
		cmd+= MpxData2
		cmd+= " BPOS="
		cmd+= barcodeReadPos
		cmd+= " BRED="
		cmd+= redundantBarcode
		cmd+= " BM="
		cmd+= barcodeForSampleMatching
		cmd+= " S="
		cmd+= strict
	cmd+= " BF="
	cmd+= tmpdir + "/barcodes.txt"
	cmd+= " OUTPUT_DIR=\""
	cmd+= tmpdir
	cmd+= "\""
	cmd+= " BCLEN="
	cmd+= barlen
	cmd+= " QUALITY_FORMAT="
	cmd+= qualityFormat
	cmd+= " MAX_MISMATCHES="
	cmd+= maxMismatches
	cmd+= " MIN_BASE_QUALITY="
	cmd+= minBaseQuality
	cmd+= " MMD="
	cmd+= minMismatchingDelta
	cmd+= " XT="
	cmd+= xTrimLen
	cmd+= " ZT="
	cmd+= zTrimLen
	cmd+= " C="
	cmd+= clipBarcode
	cmd+= " ADD="
	cmd+= addBarcodeToHeader
	cmd+= " GZ="
	cmd+= gzipOutput
	if rChar=="2":
		cmd+= " RCHAR="
		cmd+= ":"
	elif rChar=="3":
		cmd+= " RCHAR="
		cmd+= "_"
	elif rChar=="4":
		cmd+= " RCHAR="
		cmd+= "-"
	if barcodeDiagFile=="true":
		cmd+= " BARCODE_DIAG_FILE="
		cmd+= 'barcode_match_report.txt'

			
	# Executing jemultiplexer
	# status = os.system(cmd)
	try:
		tmperr = tempfile.NamedTemporaryFile( dir=tmpdir ).name
		tmpout = tempfile.NamedTemporaryFile( dir=tmpdir ).name
		tmp_stderr = open( tmperr, 'wb' )
		tmp_stdout = open( tmpout, 'wb' )
		proc = subprocess.Popen( args=cmd,
					 shell=True,
					 cwd=tmpdir,
					 stdout=tmp_stdout.fileno(),
					 stderr=tmp_stderr.fileno() )
		returncode = proc.wait()
		tmp_stderr.close()
		tmp_stdout.close()
		# get stderr, allowing for case where it's very large
		tmp_stderr = open( tmperr, 'rb' )
		stderr = ''
		buffsize = 1048576
		try:
			while True:
				stderr += tmp_stderr.read( buffsize )
				if not stderr or len( stderr ) % buffsize != 0:
					break
       		except OverflowError:
				pass
		tmp_stderr.close()
		if returncode != 0:
			raise Exception, stderr
	except Exception, e:
		#cleanup(tmpdir)
		stop_err( 'Error demultiplexing sequence. ' + str( e ) )

	# Creating the required paths for multiple outputs
	newnames=[]
	if os.path.isdir(tmpdir):
		for f in oldnames:
			tmpf = tmpdir+"/"+f
			if os.path.isfile(tmpf):
				# check the size
				if os.path.getsize(tmpf) == 0:
					stop_err( 'The output file: ' + f + ' is empty, there may be an error with your barcode file or settings.')
				name = f
				s = "primary_"
				s+= output1id
				s+= "_"
				s+= string.replace(name, "_", "-")
				s+= "_visible_"
				if extension == "fastqillumina":
					if gzipOutput == "true":
						s+="gz"
					else:
						s+= extension
				else:
					if f=="barcode_match_report.txt":
						s+="text"
					else:
						if gzipOutput == "true":
							s+="gz"
						else:
							s+="fastqsanger"
				newnames.append(newfilepath+"/"+s)				
	# Adding the appropriate prefixes to the old filenames		
	for i in range(len(oldnames)):
		oldnames[i] = tmpdir+"/"+oldnames[i]
		
	### NUMSTAT rewriting ###	
	htmlout = open(output1, "w")
	numstat = open(stat, "r")
	htmlout.write("<html><head><title>numStat</title></head><body><!--\n")
	for l in numstat.readlines():
		htmlout.write(l)
	numstat.seek(0)
	htmlout.write("-->\n<h2>Please refresh your history to display the new datasets</h2>\n<h3>numStat</h3><table border=\"1\">\n")
	htmlout.write("<tr><td>%s</td></tr>\n" % (cmd ))
	for l in numstat.readlines():
		l = l.split()
		htmlout.write("<tr><td>%s</td><td>%s</td></tr>\n" % (l[0], l[1]) )
	htmlout.write("</table></body></html>")
	numstat.close()
	htmlout.close()

	#~ # Moving the first file (the mandatory output file defined in the xml (the txt stats))
	#~ shutil.move(stat,output1)
		#~ 
	#~ # add a warning to the numStat
	#~ statfh = open(stat,'a')
	#~ statfh.write("\nRefresh you library to see the new DataSets.\n")
	#~ statfh.close()

	# Moving everything where it will be seen properly by Galaxy
	for i in range(len(oldnames)):
		shutil.move(oldnames[i],newnames[i])

	# cleanup
	cleanup(tmpdir)

# Ta-da!
if __name__=="__main__": __main__()