annotate tools/filters/trimmer.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 import sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 import optparse
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def stop_err( msg ):
    """Report a fatal error and terminate the script.

    Writes *msg* verbatim to standard error (no newline is appended) and
    raises SystemExit with status 1, so callers and wrapping shells see a
    failure rather than the success status a bare ``sys.exit()`` reports.
    """
    sys.stderr.write( msg )
    sys.exit( 1 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def _clip( text, first, last ):
    """Return text[first:last], or text[first:] when *last* is not positive."""
    if last > 0:
        return text[ first : last ]
    return text[ first : ]


def main():
    """Trim every input line (or one tab-separated column of it) to a range.

    Reads the command-line options, then copies the input (the file given
    with -f, otherwise standard input) to standard output line by line:

    * empty lines are dropped;
    * with --fastq, every line at an even 0-based index is echoed unchanged;
    * lines whose first character is one of the --ignore beginnings are
      echoed unchanged;
    * every other line is cut to the 1-based, inclusive range
      [--start, --end]; an --end of 0 means "through the end of the line".
      With --column N (N > 0) only the N-th tab-separated field is cut.

    A column that does not exist in a line is reported through stop_err().
    Non-numeric --start/--end/--column values raise ValueError.
    """
    usage = """%prog [options]

options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-a','--ascii',
        dest='ascii',
        action='store_true',
        default = False,
        help='Use ascii codes to defined ignored beginnings instead of raw characters')

    parser.add_option(
        '-q','--fastq',
        dest='fastq',
        action='store_true',
        default = False,
        help='The input data in fastq format. It selected the script skips every even line since they contain sequence ids')

    parser.add_option(
        '-i','--ignore',
        dest='ignore',
        help='A comma separated list on ignored beginnings (e.g., ">,@"), or its ascii codes (e.g., "60,42") if option -a is enabled')

    parser.add_option(
        '-s','--start',
        dest='start',
        default = '0',
        help='Trim from beginning to here (1-based)')

    parser.add_option(
        '-e','--end',
        dest='end',
        default = '0',
        help='Trim from here to the ned (1-based)')

    parser.add_option(
        '-f','--file',
        dest='input_txt',
        default = False,
        help='Name of file to be chopped. STDIN is default')

    parser.add_option(
        '-c','--column',
        dest='col',
        default = '0',
        help='Column to chop. If 0 = chop the whole line')

    options, args = parser.parse_args()

    # The literal string "None" is treated the same as an omitted option.
    invalid_starts = []
    if options.ignore and options.ignore != "None":
        invalid_starts = options.ignore.split(',')
        if options.ascii:
            invalid_starts = [ chr( int( code ) ) for code in invalid_starts ]

    # Convert the numeric options once, before any file is opened.
    col = int( options.col )
    end = int( options.end )
    # --start is 1-based.  Clamp at 0 so the default of 0 means "from the
    # beginning" instead of becoming slice index -1 (last character only).
    first = max( int( options.start ) - 1, 0 )

    if options.input_txt:
        infile = open( options.input_txt, 'r' )
    else:
        infile = sys.stdin

    try:
        for i, line in enumerate( infile ):
            line = line.rstrip( '\r\n' )
            if not line:
                continue

            if options.fastq and i % 2 == 0:
                sys.stdout.write( line + '\n' )
                continue

            # NOTE(review): lines with an ignored beginning are assumed to be
            # passed through untrimmed rather than dropped -- confirm.
            if line[0] not in invalid_starts:
                if col == 0:
                    line = _clip( line, first, end )
                else:
                    fields = line.split( '\t' )
                    # Valid columns are 1..len(fields); anything else would
                    # index the wrong field or raise IndexError.
                    if not 1 <= col <= len( fields ):
                        stop_err('Column %d does not exist. Check input parameters\n' % col)
                    fields[col - 1] = _clip( fields[col - 1], first, end )
                    line = '\t'.join(fields)
            sys.stdout.write( line + '\n' )
    finally:
        # Only close what this function opened; leave stdin to the caller.
        if infile is not sys.stdin:
            infile.close()


if __name__ == "__main__": main()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106