comparison BSseeker2/FilterReads.py @ 0:e6df770c0e58 draft

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
parents
children 8b26adf64adc
comparison
equal deleted inserted replaced
-1:000000000000 0:e6df770c0e58
1 #!/usr/bin/python
2
3 import sys
4 import os
5 import os.path
6 import re
7
8 # Define the Dictionary & the Reverse Dictionary for barcode
9
10 # ===========================================
11 #
12 def FilterFastq (infile, outfile) :
13 try:
14 INFILE = open(infile, 'r')
15 except IOError:
16 print "\n[Error]:\n\t File cannot be open: ", infile;
17 exit(-1)
18 try:
19 OUTFILE = open(outfile, 'w')
20 except IOError:
21 print "\n[Error]:\n\t File cannot be open: ", outfile;
22 exit(-1)
23 Dict = {}
24 count = 0
25 total = 0
26 while 1 :
27 line1 = INFILE.readline()
28 if not line1 :
29 break
30 line1 = line1
31 line2 = INFILE.readline()
32 line3 = INFILE.readline()
33 line4 = INFILE.readline()
34 total += 1
35 # print line2
36 if ( line2 in Dict ) :
37 Dict[line2] += 1
38 else :
39 Dict[line2] = 1
40 OUTFILE.write(line1)
41 OUTFILE.write(line2)
42 OUTFILE.write(line3)
43 OUTFILE.write(line4)
44 count += 1
45 INFILE.close()
46 OUTFILE.close()
47 print "Count of total raw reads: ", total
48 print "Count of reads left: ", count
49 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
50
51
52 # ===============================
53 #
54
55 def FilterSequence (infile, outfile) :
56 try:
57 INFILE = open(infile, 'r')
58 except IOError:
59 print "\n[Error]:\n\t File cannot be open: ", infile;
60 exit(-1)
61 try:
62 OUTFILE = open(outfile, 'w')
63 except IOError:
64 print "\n[Error]:\n\t File cannot be open: ", outfile;
65 exit(-1)
66 Dict = {}
67 count = 0
68 total = 0
69 for line in INFILE:
70 line = line.strip()
71 total += 1
72 if ( line in Dict ) :
73 Dict[line] += 1
74 else :
75 Dict[line] = 1
76 count += 1
77 OUTFILE.write(line + "\n")
78 INFILE.close()
79 OUTFILE.close()
80 print "Count of total raw reads: ", total
81 print "Count of reads left: ", count
82 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
83
84 # ===============================
85 # SN603 WA047 6 1101 41.40 99.10 0 1 .GGGA.......TTAG.............. @SZ__@@@@@@@RR__@@@@@@@@@@@@@@ 0
86 #
87 def FilterQseq (infile, outfile, keepquality) :
88 if keepquality :
89 print "User specified '-k' and read quality will not be considered"
90 else :
91 print "Reads with PF=0 will be filtered"
92
93 try:
94 INFILE = open(infile, 'r')
95 except IOError:
96 print "\n[Error]:\n\t File cannot be open: ", infile;
97 exit(-1)
98 try:
99 OUTFILE = open(outfile, 'w')
100 except IOError:
101 print "\n[Error]:\n\t File cannot be open: ", outfile;
102 exit(-1)
103 Dict = {}
104 count = 0
105 total = 0
106 for line in INFILE:
107 tokens = line.strip().split()
108 total += 1
109 if ( (not keepquality and tokens[10] == "1") or keepquality ) :
110 if ( tokens[8] in Dict ) :
111 Dict[tokens[8]] += 1
112 else :
113 Dict[tokens[8]] = 1
114 count += 1
115 OUTFILE.write(line)
116
117 INFILE.close()
118 OUTFILE.close()
119 print "Count of total raw reads: ", total
120 print "Count of reads left: ", count
121 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
122
123
124 # ===============================
125 #
126
127 def FilterFasta (infile, outfile) :
128 try:
129 INFILE = open(infile, 'r')
130 except IOError:
131 print "\n[Error]:\n\t File cannot be open: ", infile;
132 exit(-1)
133 try:
134 OUTFILE = open(outfile, 'w')
135 except IOError:
136 print "\n[Error]:\n\t File cannot be open: ", outfile;
137 exit(-1)
138 Dict = {}
139 count = 0
140 total = 0
141 name = ""
142 read = ""
143 for line in INFILE:
144 line = line.strip()
145 if (line[0] == '>') :
146 total += 1
147 if ( name != "" ) :
148 if ( read in Dict ) :
149 Dict[read] += 1
150 else :
151 Dict[read] = 1
152 count += 1
153 OUTFILE.write(">" + name + "\n")
154 OUTFILE.write(read + "\n")
155 name = line[1:]
156 read = ""
157 else :
158 read = read + line
159 if (name != "") :
160 if ( read in Dict ) :
161 Dict[read] += 1
162 else :
163 Dict[read] = 1
164 count += 1
165 OUTFILE.write(">" + name + "\n")
166 OUTFILE.write(read + "\n")
167 print "Count of total raw reads: ", total
168 print "Count of reads left: ", count
169 print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%"
170
171
172
173 # ==============================
174 # Decide branch functions for different format
175
176 def FilterReads (infile, outfile, keepquality):
177 try:
178 INFILE = open(infile, 'r')
179 except IOError:
180 print "\n[Error]:\n\t File cannot be open: ", infile;
181 exit(-1)
182 i = 0
183 # Count the numbers of barcodes in first 10000 lines
184 line = INFILE.readline()
185 tokens = line.strip().split()
186 input_format="";
187 if line[0]=="@" :
188 input_format = "FastQ"
189 n_fastq = 0
190 elif len(tokens) == 1 and line[0] != ">":
191 input_format = "sequence"
192 elif len(tokens) == 11:
193 input_format = "qseq"
194 elif line[0]==">" :
195 input_format = "fasta"
196 INFILE.close()
197
198 print "Input file format detected: ", input_format
199
200 if input_format == "FastQ" :
201 FilterFastq(infile, outfile)
202 elif input_format == "sequence" :
203 FilterSequence(infile, outfile)
204 elif input_format == "qseq" :
205 FilterQseq(infile, outfile, keepquality)
206 elif input_format == "fasta" :
207 FilterFasta(infile, outfile)
208
209
210 from optparse import OptionParser
211
212 # ===========================================
213 def main():
214 usage = "Usage: %prog -i <input> -o <output> [-k]\n" \
215 "Author : Guo, Weilong; guoweilong@gmail.com; 2012-11-10\n" \
216 "Last Update: 2013-04-01" \
217 "Description: Unique reads for qseq/fastq/fasta/sequencce,\n" \
218 " and filter low quality reads in qseq file."
219 parser = OptionParser(usage)
220 parser.add_option("-i", dest="infile",
221 help="Name of the input qseq/fastq/fasta/sequence file", metavar="FILE")
222 parser.add_option("-o", dest="outfile",
223 help="Name of the output file", metavar="FILE")
224 parser.add_option("-k", dest="keepquality", default = False, action = "store_true",
225 help="Would not filter low quality reads if specified")
226 (options, args) = parser.parse_args()
227
228 if (options.infile is None) or (options.outfile is None) :
229 print parser.print_help()
230 exit(-1)
231 FilterReads(options.infile, options.outfile, options.keepquality)
232
233
234 # ===========================================
235 if __name__ == "__main__":
236 main()
237