comparison smart_toolShed/SMART/Java/Python/trimSequences.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox for managing RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 from optparse import OptionParser
32 from commons.core.parsing.FastaParser import FastaParser
33 from commons.core.parsing.FastqParser import FastqParser
34 from commons.core.writer.FastaWriter import FastaWriter
35 from commons.core.writer.FastqWriter import FastqWriter
36 from SMART.Java.Python.misc.Progress import Progress
37 from SMART.Java.Python.misc import Utils
38
39
def findThreePrimeAdaptor(nucleotides, adaptor, errors, distance, minSize=3):
    """Locate the best 3' adaptor match in a read.

    Every start position from 0 to len(nucleotides) - minSize - 1 is tried.
    The adaptor is compared with the read from that position on; when the
    read ends before the adaptor does, only the leading part of the adaptor
    that still overlaps the read is compared.

    nucleotides -- the read, as a string
    adaptor     -- the 3' adaptor, as a string
    errors      -- accepted distance, in percent of the compared length
    distance    -- function taking (adaptorPart, readPart), returning a number
    minSize     -- number of trailing start positions that are not tried

    Returns the start position with the lowest accepted distance (the
    leftmost one on ties), or None when no position is accepted.
    """
    bestScore = None
    bestStart = None
    for start in range(len(nucleotides) - minSize):
        overlap = min(len(nucleotides) - start, len(adaptor))
        score = distance(adaptor[:overlap], nucleotides[start:start + overlap])
        # Integer arithmetic on purpose: int(errors / 100.0 * overlap) can
        # truncate an exact threshold down (29% of 100 would become 28).
        if score > errors * overlap // 100:
            continue
        if bestScore is None or score < bestScore:
            bestScore = score
            bestStart = start
    return bestStart


def findFivePrimeAdaptor(nucleotides, adaptor, errors, distance, minSize=3):
    """Locate the best 5' adaptor match in a read.

    Every prefix of the read of length len(nucleotides) - 1 down to minSize
    is tried.  The end of the prefix is compared with the adaptor; when the
    prefix is shorter than the adaptor, only the trailing part of the
    adaptor is compared.

    nucleotides -- the read, as a string
    adaptor     -- the 5' adaptor, as a string
    errors      -- accepted distance, in percent of the compared length
    distance    -- function taking (adaptorPart, readPart), returning a number
    minSize     -- length of the shortest prefix that is tried

    Returns the prefix length with the lowest accepted distance (the longest
    one on ties), or None when no prefix is accepted.
    """
    bestScore = None
    bestEnd = None
    for end in reversed(range(minSize, len(nucleotides))):
        prefix = nucleotides[:end]
        overlap = min(end, len(adaptor))
        # Negative slices keep the right-hand ends aligned.  (With an empty
        # adaptor, [-0:] yields the whole string, as the original code did.)
        adaptorPart = adaptor[-overlap:]
        score = distance(adaptorPart, prefix[-overlap:])
        if score > errors * len(adaptorPart) // 100:
            continue
        if bestScore is None or score < bestScore:
            bestScore = score
            bestEnd = end
    return bestEnd


def main():
    """Trim 5' and/or 3' adaptors from the reads of a FASTA/FASTQ file.

    Reads the command line, writes every (possibly trimmed) read to the
    output file, optionally writes the reads where an adaptor was not found
    to separate files, and prints a short summary.

    Raises Exception when the requested format is neither "fasta" nor
    "fastq".
    """
    # parse command line
    description = "Trim Sequences v1.0.3: Remove the 5' and/or 3' adaptors of a list of reads. [Category: Data Modification]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in sequence format given by -f]")
    optionParser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: sequence file format]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in sequence format given by -f]")
    optionParser.add_option("-3", "--threePAdaptor", dest="threePAdaptor", action="store", default=None, type="string", help="3' adaptor [format: string] [default: None]")
    optionParser.add_option("-5", "--fivePAdaptor", dest="fivePAdaptor", action="store", default=None, type="string", help="5' adaptor [format: string] [default: None]")
    optionParser.add_option("-e", "--errors", dest="errors", action="store", default=0, type="int", help="number of errors in percent [format: int] [default: 0]")
    optionParser.add_option("-d", "--indels", dest="indels", action="store_true", default=False, help="also accept indels [format: bool] [default: False]")
    optionParser.add_option("-n", "--noAdaptor5p", dest="noAdaptor5p", action="store", default=None, type="string", help="print sequences with no 5' adaptor [format: output file in sequence format given by -f]")
    optionParser.add_option("-m", "--noAdaptor3p", dest="noAdaptor3p", action="store", default=None, type="string", help="print sequences with no 3' adaptor [format: output file in sequence format given by -f]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = optionParser.parse_args()

    minSize = 3

    # One lookup replaces the four identical if/elif/else format chains.
    handlers = {
        "fasta": (FastaParser, FastaWriter),
        "fastq": (FastqParser, FastqWriter),
    }
    if options.format not in handlers:
        raise Exception("Cannot handle files with '%s' format." % (options.format))
    parserClass, writerClass = handlers[options.format]

    # The distance function does not change between reads: choose it once.
    if options.indels:
        distance = Utils.getLevenshteinDistance
    else:
        distance = Utils.getHammingDistance

    sequenceParser = parserClass(options.inputFileName, options.verbosity)
    writer = writerClass(options.outputFileName, options.verbosity)
    writer5pNoAdaptor = None
    writer3pNoAdaptor = None
    # The counters are always defined, whichever optional outputs are used.
    nbFound5p = 0
    nbFound3p = 0

    try:
        if options.noAdaptor5p is not None:
            writer5pNoAdaptor = writerClass(options.noAdaptor5p, options.verbosity)
        if options.noAdaptor3p is not None:
            writer3pNoAdaptor = writerClass(options.noAdaptor3p, options.verbosity)

        progress = Progress(sequenceParser.getNbSequences(), "Reading %s" % (options.inputFileName), options.verbosity)
        for sequence in sequenceParser.getIterator():
            progress.inc()
            if options.threePAdaptor is not None:
                start = findThreePrimeAdaptor(sequence.sequence, options.threePAdaptor, options.errors, distance, minSize)
                if start is not None:
                    nbFound3p += 1
                    sequence.shrinkToFirstNucleotides(start)
                elif options.noAdaptor3p:
                    writer3pNoAdaptor.addSequence(sequence)
            if options.fivePAdaptor is not None:
                # Read the sequence again: the 3' step may have shortened it.
                nucleotides = sequence.sequence
                end = findFivePrimeAdaptor(nucleotides, options.fivePAdaptor, options.errors, distance, minSize)
                if end is not None:
                    nbFound5p += 1
                    sequence.shrinkToLastNucleotides(len(nucleotides) - end)
                elif options.noAdaptor5p:
                    writer5pNoAdaptor.addSequence(sequence)
            writer.addSequence(sequence)
        progress.done()
    finally:
        # Close every writer that was opened, including the optional ones,
        # even when an error interrupts the run.
        for openedWriter in (writer, writer5pNoAdaptor, writer3pNoAdaptor):
            if openedWriter is not None:
                openedWriter.close()

    nbSequences = sequenceParser.getNbSequences()
    print("%d sequences" % (nbSequences))
    if options.fivePAdaptor is not None:
        # An empty input file must not raise ZeroDivisionError.
        percent5p = float(nbFound5p) / nbSequences * 100 if nbSequences else 0.0
        print("%d sequences with 5' adaptors (%.2f%%)" % (nbFound5p, percent5p))
    if options.threePAdaptor is not None:
        percent3p = float(nbFound3p) / nbSequences * 100 if nbSequences else 0.0
        print("%d sequences with 3' adaptors (%.2f%%)" % (nbFound3p, percent3p))


if __name__ == "__main__":
    main()
149