# HG changeset patch
# User m-zytnicki
# Date 1392989556 18000
# Node ID 90f4b29d884f868799ee5e944796d1013311f52d
# Parent 2a4884ba3e5cda00462a038499f77aff61aa1b5d
Uploaded
diff -r 2a4884ba3e5c -r 90f4b29d884f SMART/Java/Python/CompareOverlappingSmallRef.py
--- a/SMART/Java/Python/CompareOverlappingSmallRef.py Mon Feb 10 03:39:09 2014 -0500
+++ b/SMART/Java/Python/CompareOverlappingSmallRef.py Fri Feb 21 08:32:36 2014 -0500
@@ -235,6 +235,7 @@
cosr.includeNotOverlapping(options.notOverlapping)
cosr.setDistance(options.distance)
cosr.setAntisense(options.antisense)
+ cosr.setCollinear(options.collinear)
cosr.setInvert(options.exclude)
cosr.setMinPercentOverlap(options.pcOverlapQuery, options.pcOverlapRef)
cosr.setMinOverlap(options.minOverlap)
diff -r 2a4884ba3e5c -r 90f4b29d884f SMART/Java/Python/clusterize.py
--- a/SMART/Java/Python/clusterize.py Mon Feb 10 03:39:09 2014 -0500
+++ b/SMART/Java/Python/clusterize.py Fri Feb 21 08:32:36 2014 -0500
@@ -33,6 +33,7 @@
import os, os.path, random
from optparse import OptionParser
+from heapq import heappush, heappop
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.structure.Transcript import Transcript
@@ -44,26 +45,31 @@
class Clusterize(object):
def __init__(self, verbosity):
- self.normalize = False
- self.presorted = False
- self.distance = 1
- self.colinear = False
- self.nbWritten = 0
- self.nbMerges = 0
- self.verbosity = verbosity
+ self.parsers = {}
+ self.sortedFileNames = {}
+ self.normalize = False
+ self.presorted = False
+ self.distance = 1
+ self.collinear = False
+ self.nbWritten = 0
+ self.nbMerges = 0
+ self.verbosity = verbosity
self.splittedFileNames = {}
+ self.chromosomes = set()
def __del__(self):
- for fileName in self.splittedFileNames.values():
- os.remove(fileName)
+ for fileName1 in self.splittedFileNames:
+ for fileName2 in self.splittedFileNames[fileName1].values():
+ os.remove(fileName2)
- def setInputFile(self, fileName, format):
+ def setInputFiles(self, fileNames, format):
parserChooser = ParserChooser(self.verbosity)
parserChooser.findFormat(format)
- self.parser = parserChooser.getParser(fileName)
- self.sortedFileName = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000))
- if "SMARTTMPPATH" in os.environ:
- self.sortedFileName = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileName))
+ for fileName in fileNames:
+ self.parsers[fileName] = parserChooser.getParser(fileName)
+ self.sortedFileNames[fileName] = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000))
+ if "SMARTTMPPATH" in os.environ:
+ self.sortedFileNames[fileName] = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileNames[fileName]))
def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"):
writerChooser = WriterChooser()
@@ -76,8 +82,8 @@
def setDistance(self, distance):
self.distance = distance
- def setColinear(self, colinear):
- self.colinear = colinear
+ def setColinear(self, collinear):
+ self.collinear = collinear
def setNormalize(self, normalize):
self.normalize = normalize
@@ -85,42 +91,60 @@
def setPresorted(self, presorted):
self.presorted = presorted
- def _sortFile(self):
+ def _sortFiles(self):
if self.presorted:
return
- fs = FileSorter(self.parser, self.verbosity-4)
- fs.perChromosome(True)
- fs.setPresorted(self.presorted)
- fs.setOutputFileName(self.sortedFileName)
- fs.sort()
- self.splittedFileNames = fs.getOutputFileNames()
- self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
- self.nbElements = fs.getNbElements()
+ for fileName, parser in self.parsers.iteritems():
+ fs = FileSorter(parser, self.verbosity-4)
+ fs.perChromosome(True)
+ fs.setPresorted(self.presorted)
+ fs.setOutputFileName(self.sortedFileNames[fileName])
+ fs.sort()
+ self.splittedFileNames[fileName] = fs.getOutputFileNames()
+ self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
+ self.nbElements = fs.getNbElements()
+ self.chromosomes.update(self.splittedFileNames[fileName].keys())
- def _iterate(self, chromosome):
- if chromosome == None:
- progress = UnlimitedProgress(10000, "Reading input file", self.verbosity)
- parser = self.parser
- else:
- progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
- parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
+ def _iterate(self):
+ progress = UnlimitedProgress(10000, "Reading input file", self.verbosity)
transcripts = []
- for newTranscript in parser.getIterator():
- newTranscripts = []
- if newTranscript.__class__.__name__ == "Mapping":
- newTranscript = newTranscript.getTranscript()
- for oldTranscript in transcripts:
- if self._checkOverlap(newTranscript, oldTranscript):
- self._merge(newTranscript, oldTranscript)
- elif self._checkPassed(newTranscript, oldTranscript):
- self._write(oldTranscript)
- else:
- newTranscripts.append(oldTranscript)
- newTranscripts.append(newTranscript)
- transcripts = newTranscripts
- progress.inc()
- for transcript in transcripts:
- self._write(transcript)
+ heap = []
+ parsersSets = []
+ if self.chromosomes:
+ parsersSets.append(self.parsers.values())
+ else:
+ for chromosome in self.chromosomes:
+ parsersSets.append([self.splittedFileNames[fileName][chromosome] for fileName in self.splittedFileNames if chromosome in self.splittedFileNames[fileName]])
+ for parsers in parsersSets:
+ for parser in parsers:
+ iterator = parser.getIterator()
+ for transcript in iterator:
+ if transcript.__class__.__name__ == "Mapping":
+ transcript = transcript.getTranscript()
+ heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator))
+ break
+ while heap:
+ chromosome, start, end, newTranscript, iterator = heappop(heap)
+ for transcript in iterator:
+ if transcript.__class__.__name__ == "Mapping":
+ transcript = transcript.getTranscript()
+ heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator))
+ break
+ newTranscripts = []
+ if newTranscript.__class__.__name__ == "Mapping":
+ newTranscript = newTranscript.getTranscript()
+ for oldTranscript in transcripts:
+ if self._checkOverlap(newTranscript, oldTranscript):
+ self._merge(newTranscript, oldTranscript)
+ elif self._checkPassed(newTranscript, oldTranscript):
+ self._write(oldTranscript)
+ else:
+ newTranscripts.append(oldTranscript)
+ newTranscripts.append(newTranscript)
+ transcripts = newTranscripts
+ progress.inc()
+ for transcript in transcripts:
+ self._write(transcript)
progress.done()
def _merge(self, transcript1, transcript2):
@@ -135,7 +159,7 @@
def _checkOverlap(self, transcript1, transcript2):
if transcript1.getChromosome() != transcript2.getChromosome():
return False
- if self.colinear and transcript1.getDirection() != transcript2.getDirection():
+ if self.collinear and transcript1.getDirection() != transcript2.getDirection():
return False
if transcript1.getDistance(transcript2) > self.distance:
return False
@@ -145,12 +169,8 @@
return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance))
def run(self):
- self._sortFile()
- if self.presorted:
- self._iterate(None)
- else:
- for chromosome in sorted(self.splittedFileNames.keys()):
- self._iterate(chromosome)
+ self._sortFiles()
+ self._iterate()
self.writer.close()
if self.verbosity > 0:
print "# input: %d" % (self.nbElements)
@@ -162,21 +182,21 @@
description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]"
parser = OptionParser(description = description)
- parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
- parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]")
- parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
- parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]")
- parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]")
- parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]")
- parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
- parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]")
- parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
+ parser.add_option("-i", "--inputs", dest="inputFileNames", action="store", type="string", help="input files (separated by commas) [compulsory] [format: string]")
+ parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]")
+ parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
+ parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]")
+ parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="merge collinear transcripts only [format: bool] [default: false]")
+ parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]")
+ parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
+ parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]")
+ parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
(options, args) = parser.parse_args()
c = Clusterize(options.verbosity)
- c.setInputFile(options.inputFileName, options.format)
+ c.setInputFiles(options.inputFileNames.split(","), options.format)
c.setOutputFileName(options.outputFileName, options.outputFormat)
- c.setColinear(options.colinear)
+ c.setColinear(options.collinear)
c.setDistance(options.distance)
c.setNormalize(options.normalize)
c.setPresorted(options.sorted)
diff -r 2a4884ba3e5c -r 90f4b29d884f SMART/Java/Python/misc/RPlotter.pyc
Binary file SMART/Java/Python/misc/RPlotter.pyc has changed
diff -r 2a4884ba3e5c -r 90f4b29d884f SMART/Java/Python/structure/TranscriptContainer.pyc
Binary file SMART/Java/Python/structure/TranscriptContainer.pyc has changed
diff -r 2a4884ba3e5c -r 90f4b29d884f SMART/galaxy/.CompareOverlappingSmallRef.xml.swp
Binary file SMART/galaxy/.CompareOverlappingSmallRef.xml.swp has changed
diff -r 2a4884ba3e5c -r 90f4b29d884f SMART/galaxy/Clusterize.xml
--- a/SMART/galaxy/Clusterize.xml Mon Feb 10 03:39:09 2014 -0500
+++ b/SMART/galaxy/Clusterize.xml Fri Feb 21 08:32:36 2014 -0500
@@ -4,26 +4,8 @@
PYTHONPATH
- ../Java/Python/clusterize.py -i $formatType.inputFileName
- #if $formatType.FormatInputFileName == 'bed':
- -f bed
- #elif $formatType.FormatInputFileName == 'gff':
- -f gff
- #elif $formatType.FormatInputFileName == 'gff2':
- -f gff2
- #elif $formatType.FormatInputFileName == 'gff3':
- -f gff3
- #elif $formatType.FormatInputFileName == 'sam':
- -f sam
- #elif $formatType.FormatInputFileName == 'bam':
- -f bam
- #elif $formatType.FormatInputFileName == 'gtf':
- -f gtf
- #end if
- -o $outputFileGff
- $colinear
- $normalize
- -d $distance
+ #set $inputFiles = ",".join(["%s" % (s) for s in $formatType.inputFileName])
+ ../Java/Python/clusterize.py -i $inputFiles -f $formatType.FormatInputFileName -o $outputFileGff $colinear $normalize -d $distance
@@ -34,29 +16,29 @@
+
-
-
+
-
+
-
+
-
+
-
+
+
+
+
-
-
-
-
+
diff -r 2a4884ba3e5c -r 90f4b29d884f SMART/galaxy/CompareOverlappingAdapt.xml
--- a/SMART/galaxy/CompareOverlappingAdapt.xml Mon Feb 10 03:39:09 2014 -0500
+++ b/SMART/galaxy/CompareOverlappingAdapt.xml Fri Feb 21 08:32:36 2014 -0500
@@ -4,45 +4,15 @@
PYTHONPATH
- ../Java/Python/CompareOverlappingAdapt.py -i $formatType.inputFileName1
- #if $formatType.FormatInputFileName1 == 'bed':
- -f bed
- #elif $formatType.FormatInputFileName1 == 'gff':
- -f gff
- #elif $formatType.FormatInputFileName1 == 'gff2':
- -f gff2
- #elif $formatType.FormatInputFileName1 == 'gff3':
- -f gff3
- #elif $formatType.FormatInputFileName1 == 'sam':
- -f sam
- #elif $formatType.FormatInputFileName1 == 'gtf':
- -f gtf
- #end if
- -j $formatType2.inputFileName2
- #if $formatType2.FormatInputFileName2 == 'bed':
- -g bed
- #elif $formatType2.FormatInputFileName2 == 'gff':
- -g gff
- #elif $formatType2.FormatInputFileName2 == 'gff2':
- -g gff2
- #elif $formatType2.FormatInputFileName2 == 'gff3':
- -g gff3
- #elif $formatType2.FormatInputFileName2 == 'sam':
- -g sam
- #elif $formatType2.FormatInputFileName2 == 'gtf':
- -g gtf
- #end if
- -o $outputFileGff
+ ../Java/Python/CompareOverlappingAdapt.py -i $formatType.inputFileName1 -f $formatType.FormatInputFileName1 -j $formatType2.inputFileName2 -g $formatType2.FormatInputFileName2 -o $outputFileGff $InvertMatch $NotOverlapping -m $NbOverlap
#if $OptionDistance.Dist == 'Yes':
-d $OptionDistance.distance
#end if
- #if $OptionCollinearOrAntiSens.OptionCA == 'Collinear':
+ #if $OptionCA == 'Collinear':
-c
- #elif $OptionCollinearOrAntiSens.OptionCA == 'AntiSens':
+ #elif $OptionCA == 'AntiSense':
-a
#end if
- $InvertMatch
- $NotOverlapping
@@ -114,21 +84,14 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
diff -r 2a4884ba3e5c -r 90f4b29d884f SMART/galaxy/CompareOverlappingSmallQuery.xml
--- a/SMART/galaxy/CompareOverlappingSmallQuery.xml Mon Feb 10 03:39:09 2014 -0500
+++ b/SMART/galaxy/CompareOverlappingSmallQuery.xml Fri Feb 21 08:32:36 2014 -0500
@@ -4,60 +4,21 @@
PYTHONPATH
- ../Java/Python/CompareOverlappingSmallQuery.py -i $formatType.inputFileName1
- #if $formatType.FormatInputFileName1 == 'bed':
- -f bed
- #elif $formatType.FormatInputFileName1 == 'gff':
- -f gff
- #elif $formatType.FormatInputFileName1 == 'gff2':
- -f gff2
- #elif $formatType.FormatInputFileName1 == 'gff3':
- -f gff3
- #elif $formatType.FormatInputFileName1 == 'sam':
- -f sam
- #elif $formatType.FormatInputFileName1 == 'bam':
- -f bam
- #elif $formatType.FormatInputFileName1 == 'gtf':
- -f gtf
- #end if
- -j $formatType2.inputFileName2
- #if $formatType2.FormatInputFileName2 == 'bed':
- -g bed
- #elif $formatType2.FormatInputFileName2 == 'gff':
- -g gff
- #elif $formatType2.FormatInputFileName2 == 'gff2':
- -g gff2
- #elif $formatType2.FormatInputFileName2 == 'gff3':
- -g gff3
- #elif $formatType2.FormatInputFileName2 == 'sam':
- -g sam
- #elif $formatType2.FormatInputFileName2 == 'bam':
- -g bam
- #elif $formatType2.FormatInputFileName2 == 'gtf':
- -g gtf
- #end if
- -o $outputFileGff
+ ../Java/Python/CompareOverlappingSmallQuery.py -i $formatType.inputFileName1 -f $formatType.FormatInputFileName1 -j $formatType2.inputFileName2 -g $formatType2.FormatInputFileName2 -o $outputFileGff $InvertMatch $NotOverlapping $OptionInclusionQuery $OptionInclusionRef -m $OptionMinOverlap
#if $OptionDistance.Dist == 'Yes':
-d $OptionDistance.distance
#end if
- #if $OptionMinOverlap.present == 'Yes':
- -m $OptionMinOverlap.minOverlap
- #end if
#if $OptionPcOverlapQuery.present == 'Yes':
-p $OptionPcOverlapQuery.minOverlap
#end if
#if $OptionPcOverlapRef.present == 'Yes':
-P $OptionPcOverlapRef.minOverlap
#end if
- #if $OptionCollinearOrAntiSens.OptionCA == 'Collinear':
+ #if $OptionCA == 'Collinear':
-c
- #elif $OptionCollinearOrAntiSens.OptionCA == 'AntiSens':
+ #elif $OptionCA == 'AntiSense':
-a
#end if
- $InvertMatch
- $NotOverlapping
- $OptionInclusionQuery
- $OptionInclusionRef
@@ -91,7 +52,7 @@
-
+
@@ -137,24 +98,14 @@
-
-
-
-
-
-
-
-
-
-
-
+
-
+
@@ -165,26 +116,18 @@
-
+
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
diff -r 2a4884ba3e5c -r 90f4b29d884f SMART/galaxy/CompareOverlappingSmallRef.xml
--- a/SMART/galaxy/CompareOverlappingSmallRef.xml Mon Feb 10 03:39:09 2014 -0500
+++ b/SMART/galaxy/CompareOverlappingSmallRef.xml Fri Feb 21 08:32:36 2014 -0500
@@ -1,63 +1,24 @@
- Provide the queries that overlap with a reference, when the reference dataset is small.
+ Provide the queries that overlap with a reference, when the query data set is small.
PYTHONPATH
- ../Java/Python/CompareOverlappingSmallQuery.py -i $formatType.inputFileName1
- #if $formatType.FormatInputFileName1 == 'bed':
- -f bed
- #elif $formatType.FormatInputFileName1 == 'gff':
- -f gff
- #elif $formatType.FormatInputFileName1 == 'gff2':
- -f gff2
- #elif $formatType.FormatInputFileName1 == 'gff3':
- -f gff3
- #elif $formatType.FormatInputFileName1 == 'sam':
- -f sam
- #elif $formatType.FormatInputFileName1 == 'bam':
- -f bam
- #elif $formatType.FormatInputFileName1 == 'gtf':
- -f gtf
- #end if
- -j $formatType2.inputFileName2
- #if $formatType2.FormatInputFileName2 == 'bed':
- -g bed
- #elif $formatType2.FormatInputFileName2 == 'gff':
- -g gff
- #elif $formatType2.FormatInputFileName2 == 'gff2':
- -g gff2
- #elif $formatType2.FormatInputFileName2 == 'gff3':
- -g gff3
- #elif $formatType2.FormatInputFileName2 == 'sam':
- -g sam
- #elif $formatType2.FormatInputFileName2 == 'bam':
- -g bam
- #elif $formatType2.FormatInputFileName2 == 'gtf':
- -g gtf
- #end if
- -o $outputFileGff
+ ../Java/Python/CompareOverlappingSmallRef.py -i $formatType.inputFileName1 -f $formatType.FormatInputFileName1 -j $formatType2.inputFileName2 -g $formatType2.FormatInputFileName2 -o $outputFileGff $InvertMatch $NotOverlapping $OptionInclusionQuery $OptionInclusionRef -m $OptionMinOverlap
#if $OptionDistance.Dist == 'Yes':
-d $OptionDistance.distance
#end if
- #if $OptionMinOverlap.present == 'Yes':
- -m $OptionMinOverlap.minOverlap
- #end if
#if $OptionPcOverlapQuery.present == 'Yes':
-p $OptionPcOverlapQuery.minOverlap
#end if
#if $OptionPcOverlapRef.present == 'Yes':
-P $OptionPcOverlapRef.minOverlap
#end if
- #if $OptionCollinearOrAntiSens.OptionCA == 'Collinear':
+ #if $OptionCA == 'Collinear':
-c
- #elif $OptionCollinearOrAntiSens.OptionCA == 'AntiSens':
+ #elif $OptionCA == 'AntiSense':
-a
#end if
- $InvertMatch
- $NotOverlapping
- $OptionInclusionQuery
- $OptionInclusionRef
@@ -91,7 +52,7 @@
-
+
@@ -137,24 +98,14 @@
-
-
-
-
-
-
-
-
-
-
-
+
-
+
@@ -165,26 +116,18 @@
-
+
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+