Mercurial > repos > nick > sequence_content_trimmer
annotate getreads.py @ 0:7f170cb06e2e draft
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
| author | nick | 
|---|---|
| date | Tue, 01 Dec 2015 21:33:27 -0500 | 
| parents | |
| children | 464aee13e2df | 
| rev | line source | 
|---|---|
| 
0
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
1 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
2 sequence. | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
3 All format parsers follow this API: | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
4 with open('sequence.fasta') as fasta: | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
5 for read in getreads.getparser(fasta, filetype='fasta'): | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
6 print "There is a sequence with this FASTA identifier: "+read.id | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
7 print "Its sequence is "+read.seq | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
8 The properties of Read are: | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
9 name: The entire FASTA header line, SAM column 1, etc. | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
10 id: The first whitespace-delimited part of the name. | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
11 seq: The sequence. | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
12 qual: The quality scores (unless the format is FASTA). | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
13 """ | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
14 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
15 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
def getparser(filehandle, filetype='fasta'):
    """Return a reader object that iterates over the reads in an open file.

    Args:
      filehandle: The open file-like object to parse. It is passed unchanged to the reader.
      filetype: The name of the format: 'fasta', 'fastq', 'sam', or 'tsv'.

    Returns:
      A FastaReader, FastqReader, SamReader, or TsvReader wrapping filehandle.

    Raises:
      ValueError: If filetype is not one of the supported format names.
    """
    if filetype == 'fasta':
        return FastaReader(filehandle)
    elif filetype == 'fastq':
        return FastqReader(filehandle)
    elif filetype == 'sam':
        return SamReader(filehandle)
    elif filetype == 'tsv':
        return TsvReader(filehandle)
    # Format the value instead of concatenating it, so that a non-string filetype (e.g. None)
    # still produces the intended ValueError rather than a TypeError from str + non-str.
    raise ValueError("Illegal argument: filetype='%s'" % (filetype,))
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
27 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
28 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
class FormatError(Exception):
    """Exception type the parsers raise when the input violates the expected format."""

    def __init__(self, message=None):
        # A falsy message (None or '') is not forwarded to the base initializer.
        if not message:
            return
        super(FormatError, self).__init__(message)
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
33 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
34 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
class Read(object):
    """Container for one parsed read: its name, sequence, id, and quality string."""

    def __init__(self, name='', seq='', id_='', qual=''):
        # Every field defaults to the empty string, so a parser can create a blank Read and
        # fill in the attributes as it goes.
        self.name, self.seq, self.id, self.qual = name, seq, id_, qual
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
41 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
42 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
class Reader(object):
    """Base class for all other parsers.

    A Reader stores the file handle it is given. Iterating over a Reader returns whatever
    its parser() method returns, so each subclass supplies the format-specific parser().
    """

    def __init__(self, filehandle):
        # The handle is only stored here; nothing is read until iteration starts.
        self.filehandle = filehandle

    def __iter__(self):
        return self.parser()

    def parser(self):
        """Return an iterator over the reads in self.filehandle.

        Subclasses must override this. The base implementation exists so that iterating
        over a bare Reader fails with an explicit message instead of an AttributeError.

        Raises:
          NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('Reader subclasses must implement parser()')
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
49 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
50 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
class TsvReader(Reader):
    """A parser for a simple tab-delimited format.

    Column 1: name
    Column 2: sequence
    Column 3: quality scores (optional)

    Lines with fewer than two tab-separated columns are skipped.
    """

    def parser(self):
        """Yield one Read for each line of self.filehandle that has at least two columns."""
        for line in self.filehandle:
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                continue
            read = Read()
            read.name = fields[0]
            # str.split() returns [] for an empty or whitespace-only name. Leave the id at its
            # default in that case instead of raising IndexError on [0].
            name_parts = read.name.split()
            if name_parts:
                read.id = name_parts[0]
            read.seq = fields[1]
            if len(fields) >= 3:
                read.qual = fields[2]
            yield read
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
69 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
70 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
class SamReader(Reader):
    """A simple SAM parser.

    Assumptions:
    Lines with fewer than 11 tab-separated fields are skipped.
    Of the remaining lines, those whose first field starts with "@" and is exactly 3
    characters long are treated as headers and skipped. All others are alignments.
    """

    def parser(self):
        """Yield one Read per alignment line, taking name, sequence, and quality from
        fields 1, 10, and 11 respectively."""
        for line in self.filehandle:
            fields = line.split('\t')
            if len(fields) < 11:
                continue
            # Skip headers.
            if fields[0].startswith('@') and len(fields[0]) == 3:
                continue
            read = Read()
            read.name = fields[0]
            # str.split() returns [] for an empty or whitespace-only name. Leave the id at its
            # default in that case instead of raising IndexError on [0].
            name_parts = read.name.split()
            if name_parts:
                read.id = name_parts[0]
            read.seq = fields[9]
            # The line was not stripped before splitting, so when field 11 is the last one it
            # still carries the line terminator.
            read.qual = fields[10].rstrip('\r\n')
            yield read
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
92 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
93 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
class FastaReader(Reader):
    """A simple FASTA parser that reads one sequence at a time into memory."""

    def parser(self):
        """Yield a Read for each FASTA record that has a non-empty sequence.

        Sequence lines are stripped and concatenated, and blank lines are ignored. A record
        whose sequence is empty when the next ">" line (or the end of the file) is reached
        is not yielded.
        """
        read = Read()
        while True:
            line_raw = self.filehandle.readline()
            if not line_raw:
                # End of file: emit the record in progress, if it has any sequence.
                if read.seq:
                    yield read
                # End the generator with a plain return. Raising StopIteration inside a
                # generator is converted to RuntimeError under PEP 479 (Python 3.7+).
                return
            line = line_raw.strip()
            # Allow empty lines.
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the previous record.
                if read.seq:
                    yield read
                read = Read()
                read.name = line[1:]  # remove ">"
                if read.name:
                    read.id = read.name.split()[0]
            else:
                read.seq += line
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
118 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
119 | 
| 
 
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
 
nick 
parents:  
diff
changeset
 | 
class FastqReader(Reader):
    """A simple FASTQ parser. Can handle multi-line sequences, though."""

    def parser(self):
        """Yield a Read for each FASTQ record that has a non-empty sequence.

        The parser is a small state machine over the non-blank, stripped lines:
          'header':   expects a line starting with "@", which begins a new record.
          'sequence': accumulates sequence lines until a line starting with "+".
          'plus' / 'quality': accumulates quality characters until the quality string is
                      as long as the sequence, then expects a header again.

        Raises:
          FormatError: If a line is expected to be a header but does not start with "@".
        """
        read = Read()
        state = 'header'
        while True:
            line_raw = self.filehandle.readline()
            if not line_raw:
                # End of file: emit the record in progress, if it has any sequence.
                if read.seq:
                    yield read
                # End the generator with a plain return. Raising StopIteration inside a
                # generator is converted to RuntimeError under PEP 479 (Python 3.7+).
                return
            line = line_raw.strip()
            # Allow empty lines.
            if not line:
                continue
            if state == 'header':
                if not line.startswith('@'):
                    raise FormatError('line state = "header" but line does not start with "@"')
                # The previous record is only emitted once the next header is seen.
                if read.seq:
                    yield read
                read = Read()
                read.name = line[1:]  # remove '@'
                if read.name:
                    read.id = read.name.split()[0]
                state = 'sequence'
            elif state == 'sequence':
                if line.startswith('+'):
                    state = 'plus'
                else:
                    read.seq += line
            elif state == 'plus' or state == 'quality':
                state = 'quality'
                # Take only as many quality characters as are still missing.
                togo = len(read.seq) - len(read.qual)
                read.qual += line[:togo]
                # The end of the quality lines is when we have a quality string as long as the
                # sequence.
                if len(read.qual) >= len(read.seq):
                    state = 'header'
