view SMART/DiffExpAnal/compareOverlapping_parallel.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line source

#! /usr/bin/env python
#This program is a wrapp for CompareOverlapping.py.
import optparse, os, sys, subprocess, tempfile, shutil, tarfile, glob
import os, struct, time, random
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.CompareOverlapping import CompareOverlapping
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.NCListCursor import NCListCursor
from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
from SMART.Java.Python.ncList.FileSorter import FileSorter
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
from SMART.Java.Python.misc import Utils



def stop_err( msg ):
	sys.stderr.write( "%s\n" % msg )
	sys.exit()

def toTar(tarFileName, overlapOutputNames):
	dir = os.path.dirname(tarFileName)	
	tfile = tarfile.open(tarFileName + ".tmp.tar", "w")
	currentPath = os.getcwd()
	os.chdir(dir)
	for file in overlapOutputNames:
		relativeFileName = os.path.basename(file)
		tfile.add(relativeFileName)
	os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName))
	tfile.close()
	os.chdir(currentPath)

def __main__():
	description = "Compare Overlapping wrapp script: Get the a list of data which overlap with a reference set. [Category: Data Comparison]"
	parser = OptionParser(description = description)
	parser.add_option("-i", "--input1",		   dest="inputFileName1", action="store",					 type="string", help="input file 1 (for annotation) [compulsory] [format: file in transcript format given by -f]")
	parser.add_option("-f", "--format1",		  dest="format1",		action="store",					 type="string", help="format of file 1 [compulsory] [format: transcript file format]")
	parser.add_option("", "--inputTxt", 		dest="inputTxt", 		action="store", 				type="string", 	help="input, a txt file for a list of input reads files. Should identify all reads files format, given by -g [compulsory]")
	#parser.add_option("-j", "--input2",		   dest="inputFileName2", action="store",	default="inputRead",	 type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
	parser.add_option("-g", "--format2",		  dest="format2",		action="store",				 type="string", help="format of file 2 [compulsory] [format: transcript file format]")
	#parser.add_option("-o", "--output",		   dest="output",		 action="store",	  default=None,  type="string", help="output file [compulsory] [format: output file in GFF3 format]")
	parser.add_option("-S", "--start1",		   dest="start1",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")
	parser.add_option("-s", "--start2",		   dest="start2",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")
	parser.add_option("-U", "--end1",			 dest="end1",		   action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]")
	parser.add_option("-u", "--end2",			 dest="end2",		   action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")
	parser.add_option("-t", "--intron",		   dest="introns",		action="store_true", default=False,				help="also report introns [format: bool] [default: false]")
	parser.add_option("-E", "--5primeExtension1", dest="fivePrime1",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 1 [format: int]")
	parser.add_option("-e", "--5primeExtension2", dest="fivePrime2",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 2 [format: int]")
	parser.add_option("-N", "--3primeExtension1", dest="threePrime1",	action="store",	  default=None,  type="int",	help="extension towards 3' in file 1 [format: int]")
	parser.add_option("-n", "--3primeExtension2", dest="threePrime2",	action="store",	  default=None,  type="int",	help="extension towards 3' in file 2 [format: int]")
	parser.add_option("-c", "--colinear",		 dest="colinear",	   action="store_true", default=False,				help="colinear only [format: bool] [default: false]")
	parser.add_option("-a", "--antisense",		dest="antisense",	  action="store_true", default=False,				help="antisense only [format: bool] [default: false]")
	parser.add_option("-d", "--distance",		 dest="distance",	   action="store",	  default=None,	 type="int",	help="accept some distance between query and reference [format: int]")
	parser.add_option("-k", "--included",		 dest="included",	   action="store_true", default=False,				help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")
	parser.add_option("-K", "--including",		dest="including",	  action="store_true", default=False,				help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")
	parser.add_option("-m", "--minOverlap",	   dest="minOverlap",	 action="store",	  default=None,	 type="int",	help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")
	parser.add_option("-p", "--pcOverlap",		dest="pcOverlap",	  action="store",	  default=None,  type="int",	help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")
	parser.add_option("-O", "--notOverlapping",   dest="notOverlapping", action="store_true", default=False,				help="also output not overlapping data [format: bool] [default: false]")
	parser.add_option("-x", "--exclude",		  dest="exclude",		action="store_true", default=False,				help="invert the match [format: bool] [default: false]")
	parser.add_option("-v", "--verbosity",		dest="verbosity",	  action="store",	  default=1,	 type="int",	help="trace level [format: int]")
	parser.add_option('', '--tar', dest='outputTar', default=None, help='output all SAM results in a tar file.' )
	parser.add_option( '', '--outTxt', dest='outTxtFile', help='The output list of results files on txt format.[compulsory]' )
	(options, args) = parser.parse_args()
	
	
	#Parse the input txt file and read a list of BAM files.
	file = open(options.inputTxt, "r")
	lines = file.readlines()
	inputFileNames = []
	overlapOutputNames = []
	outputName = options.outTxtFile
	resDirName = os.path.dirname(outputName) + "/"
	#Write output txt file and define all output sam file names.
	out = open(outputName, "w")
	for line in lines:
		tab = line.split()
		inputFileNames.append(tab[1])
		overlapOutName = resDirName + tab[0] + '_overlapOut_%s.gff3' % random.randrange(0, 10000)
		overlapOutputNames.append(overlapOutName)
		out.write(tab[0] + '\t' + overlapOutName  + '\n')
	file.close()
	out.close()
	
	#construction the commandes for each input file
	cmds = []
	for i in range(len(inputFileNames)):
		absFile = sys.argv[0]
		absDir = os.path.dirname(absFile)
		parentDir = os.path.abspath(os.path.join(absDir, os.path.pardir))
		cmd = "python %s/Java/Python/CompareOverlappingSmallQuery.py " % parentDir
		opts = "-i %s -f %s -j %s -g %s -o %s " % (options.inputFileName1, options.format1, inputFileNames[i], options.format2, overlapOutputNames[i])
		#if options.start1 != None:
		#	opts += "-S %s " % options.start1
		#if options.start2 != None:
		#	opts += "-s %s " % options.start2
		#if options.end1 != None:
		#	opts += "-U %s " % options.end1
		#if options.end2 != None:
		#	opts += "-u %s " % options.end2
		#if options.fivePrime1 != None:
		#	opts += "-E %s " % options.fivePrime1
		#if options.fivePrime2 != None:
		#	opts += "-e %s " % options.fivePrime2
		#if options.threePrime1 != None:
		#	opts += "-N %s " % options.threePrime1
		#if options.threePrime2 != None:
		#	opts += "-n %s " % options.threePrime2
		#if options.colinear:
		#	opts += "-c "
		#if options.antisense:
		#	opts +="-a "
		#if options.included:
		#	opts += "-k "
		#if options.including:
		#	opts += "-K "
		#if options.pcOverlap != None:
		#	opts += "-p %s " % options.pcOverlap
		if options.notOverlapping:
			opts += "-O "
		if options.exclude:
			opts += "-x "
		if options.distance != None:
			opts += "-d %s " % options.distance
		#if options.minOverlap != None:
		#	opts += "-m %s " % options.minOverlap
		cmd += opts
		cmds.append(cmd)


	print "les commandes sont %s \n" % cmds

	tmp_files = []	
	for i in range(len(cmds)):
		try:
			tmp_out = tempfile.NamedTemporaryFile().name
			tmp_files.append(tmp_out)
			tmp_stdout = open( tmp_out, 'wb' )
			tmp_err = tempfile.NamedTemporaryFile().name
			tmp_files.append(tmp_err)
			tmp_stderr = open( tmp_err, 'wb' )
			proc = subprocess.Popen( args=cmds[i], shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
			returncode = proc.wait()
			tmp_stderr.close()
			# get stderr, allowing for case where it's very large
			tmp_stderr = open( tmp_err, 'rb' )
			stderr = ''
			buffsize = 1048576
			try:
				while True:
					stderr += tmp_stderr.read( buffsize )
					if not stderr or len( stderr ) % buffsize != 0:
						break
			except OverflowError:
				pass
			tmp_stdout.close()
			tmp_stderr.close()
			if returncode != 0:
				raise Exception, stderr
		except Exception, e:
			stop_err( 'Error in :\n' + str( e ) )

	if options.outputTar != None:
		toTar(options.outputTar, overlapOutputNames)	
	
	for tmp_file in tmp_files:
		os.remove(tmp_file)


if __name__=="__main__": __main__()