diff smart_toolShed/SMART/Java/Python/adaptorStripper.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/adaptorStripper.py	Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,115 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""Remove adaptors"""
+
+import os
+from optparse import OptionParser
+from SMART.Java.Python.structure.Sequence import Sequence
+from SMART.Java.Python.structure.SequenceList import SequenceList
+from commons.core.parsing.FastaParser import FastaParser
+from commons.core.writer.FastaWriter import FastaWriter
+from SMART.Java.Python.misc.Progress import Progress
+
+
+def distance (string1, string2):
+    if len(string1) != len(string2):
+        return None
+    distance = 0
+    for i in range(0, len(string1)):
+        if string1[i] != string2[i]:
+            distance += 1
+    return distance
+
+
+
+if __name__ == "__main__":
+    nbRemaining = 0
+    
+    # parse command line
+    description = "Adaptor Stripper v1.0.1: Remove the adaptor of a list of reads. [Category: Personnal]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input",         dest="inputFileName",      action="store",                     type="string", help="input file [compulsory] [format: file in FASTA format]")
+    parser.add_option("-o", "--output",        dest="outputFileName",     action="store",                     type="string", help="output file [compulsory] [format: output file in FASTA format]")
+    parser.add_option("-5", "--5primeAdaptor", dest="fivePrimeAdaptor",   action="store",                     type="string", help="five prime adaptor [format: string]")
+    parser.add_option("-3", "--3primeAdaptor", dest="threePrimeAdaptor",  action="store",                     type="string", help="three prime adaptor [format: string]")
+    parser.add_option("-d", "--5primeDist",    dest="fivePrimeDistance",  action="store",      default=3,     type="int",    help="five prime distance [format: int] [default: 3]")
+    parser.add_option("-e", "--3primeDist",    dest="threePrimeDistance", action="store",      default=3,     type="int",    help="three prime distance [format: int [default: 3]]")
+    parser.add_option("-m", "--3primeSize",    dest="threePrimeSize",     action="store",      default=10,    type="int",    help="three prime size [format: int] [default: 10]")
+    parser.add_option("-v", "--verbosity",     dest="verbosity",          action="store",      default=1,     type="int",    help="trace level [format: int] [default: 1]")
+    parser.add_option("-l", "--log",           dest="log",                action="store_true", default=False,                help="write a log file [format: bool] [default: false]")
+    (options, args) = parser.parse_args()
+
+    if options.log:
+        logHandle = open(options.outputFileName + ".log", "w")
+
+
+    writer         = FastaWriter(options.outputFileName + ".fas", options.verbosity)
+    sequenceParser = FastaParser(options.inputFileName, options.verbosity)
+    nbSequences    = sequenceParser.getNbSequences()
+
+    # treat sequences
+    progress = Progress(sequenceParser.getNbSequences(), "Analyzing " + options.inputFileName, options.verbosity)
+    for sequence in sequenceParser.getIterator():
+        fivePrimeAdaptor  = sequence.getSequence()[0:len(options.fivePrimeAdaptor)]
+        threePrimeAdaptor = sequence.getSequence()[len(sequence.sequence)-len(options.threePrimeAdaptor):]
+
+        # check 5' adaptor
+        fivePrimeDistance = distance(fivePrimeAdaptor, options.fivePrimeAdaptor)
+        # check 3' adaptor
+        threePrimeDistance = len(threePrimeAdaptor)
+        for i in range(options.threePrimeSize, len(threePrimeAdaptor)+1):
+            threePrimeDistance = min(threePrimeDistance, distance(threePrimeAdaptor[-i:], options.threePrimeAdaptor[:i]))
+
+        # sort candidates
+        if fivePrimeDistance > options.fivePrimeDistance:
+            if options.log:
+                logHandle.write("Sequence %s does not start with the right adaptor (%s != %s)\n" % (sequence.getSequence(), fivePrimeAdaptor, options.fivePrimeAdaptor))
+        elif threePrimeDistance > options.threePrimeDistance:
+            if options.log:
+                logHandle.write("Sequence %s does not end with the right adaptor (%s != %s)\n" % (sequence.getSequence(), threePrimeAdaptor, options.threePrimeAdaptor))
+        else:
+            nbRemaining += 1
+            sequence.setSequence(sequence.getSequence()[len(options.fivePrimeAdaptor):len(sequence.getSequence())-len(options.threePrimeAdaptor)])
+            writer.addSequence(sequence)
+
+        progress.inc()
+
+    progress.done()
+
+    if options.log:
+        logHandle.close()
+
+    writer.write()
+
+    print "kept %i over %i (%.f%%)" % (nbRemaining, nbSequences, float(nbRemaining) / nbSequences * 100)
+
+