Mercurial > repos > petr-novak > re_utils
comparison fasta_interlacer.py @ 0:a4cd8608ef6b draft
Uploaded
author | petr-novak |
---|---|
date | Mon, 01 Apr 2019 07:56:36 -0400 |
parents | |
children | d397f5a85464 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a4cd8608ef6b |
---|---|
1 #!/usr/bin/env python | |
2 ''' interlacing two fastq sequences''' | |
3 import sys | |
4 | |
5 def readSingleSeq(file): | |
6 ''' read single seq from fasta file''' | |
7 line = file.readline() | |
8 if not line: | |
9 return False # end of file | |
10 if line[0] != ">": | |
11 raise Exception("no header on the first line") | |
12 seqname = line[1:].strip() | |
13 seq = "" | |
14 # read sequences | |
15 while True: | |
16 last_pos = file.tell() | |
17 line = file.readline() | |
18 if not line: | |
19 break | |
20 if line[0] == ">": | |
21 file.seek(last_pos) | |
22 break | |
23 seq = seq + line.strip() | |
24 return {'name': seqname, 'sequence': seq} | |
25 | |
26 | |
27 def writeSingleSeq(fileobject, seq): | |
28 ''' write single sequence to fasta file''' | |
29 fileobject.write(">") | |
30 fileobject.write(seq['name'] + "\n") | |
31 fileobject.write(seq['sequence'] + "\n") | |
32 | |
33 | |
34 def main(): | |
35 from optparse import OptionParser | |
36 parser = OptionParser() | |
37 parser.add_option("-a", | |
38 "--fasta_file_A", | |
39 dest="seqfileA", | |
40 help="input sequences in fasta format") | |
41 parser.add_option("-b", | |
42 "--fasta_file_B", | |
43 dest="seqfileB", | |
44 help="input sequences in fasta format") | |
45 parser.add_option("-p", | |
46 "--fasta_file_pairs", | |
47 dest="seqfile_pairs", | |
48 help="output file with paired sequences") | |
49 parser.add_option("-x", | |
50 "--fasta_file_singles", | |
51 dest="seqfile_singles", | |
52 help="output file with single sequences") | |
53 options, args = parser.parse_args() | |
54 | |
55 # Input files | |
56 fA = open(options.seqfileA, 'r') | |
57 fB = open(options.seqfileB, 'r') | |
58 # Output files | |
59 if options.seqfile_pairs: | |
60 fPairs = open(options.seqfile_pairs, 'w') | |
61 else: | |
62 fPairs = open(options.seqfileA + ".pairs", 'w') | |
63 | |
64 if options.seqfile_singles: | |
65 single = open(options.seqfile_singles, "w") | |
66 else: | |
67 single = open(options.seqfileA + ".single", "w") | |
68 | |
69 | |
70 sA1 = readSingleSeq(fA) | |
71 sB1 = readSingleSeq(fB) | |
72 if not sA1 or not sB1: | |
73 raise Exception("\nEmpty sequence on input, nothing to interlace!\n") | |
74 charA = sA1['name'][-1] | |
75 charB = sB1['name'][-1] | |
76 # validate sequence names | |
77 if charA == charB: | |
78 sys.stderr.write( | |
79 "last character of sequence id must be used for distinguishing pairs!") | |
80 exit(1) | |
81 # check first thousand! | |
82 for i in range(1000): | |
83 seqA = readSingleSeq(fA) | |
84 seqB = readSingleSeq(fB) | |
85 if (not seqA) or (not seqB): | |
86 # end of file: | |
87 if i == 0: | |
88 sys.stderr.write("input file is empty") | |
89 exit(1) | |
90 else: | |
91 break | |
92 if seqA['name'][-1] == charA and seqB['name'][-1] == charB: | |
93 continue | |
94 else: | |
95 sys.stderr.write( | |
96 "last character of sequence id must be used for distinguishing pairs!") | |
97 exit(1) | |
98 | |
99 fA.seek(0) | |
100 fB.seek(0) | |
101 | |
102 buffA = {} | |
103 buffB = {} | |
104 buffA_names = [] | |
105 buffB_names = [] | |
106 | |
107 while True: | |
108 | |
109 seqA = readSingleSeq(fA) | |
110 seqB = readSingleSeq(fB) | |
111 | |
112 if not seqA and not seqB: | |
113 break # end of file | |
114 | |
115 ## validation and direct checking only if not end of files | |
116 if seqA and seqB: | |
117 #validate: | |
118 if not (seqA['name'][-1] == charA and seqB['name'][-1] == charB): | |
119 sys.stderr.write( | |
120 "last character of sequence id must be used for distinguishing pairs!") | |
121 exit(1) | |
122 | |
123 # check if current seqs are pairs | |
124 if seqA['name'][:-1] == seqB['name'][:-1]: | |
125 writeSingleSeq(fPairs, seqA) | |
126 writeSingleSeq(fPairs, seqB) | |
127 continue | |
128 | |
129 ### compare whith buffers | |
130 ### seqA vs buffB | |
131 if seqA: | |
132 if seqA["name"][:-1] in buffB: | |
133 writeSingleSeq(fPairs, seqA) | |
134 seqtmp = {"name": seqA["name"][:-1] + charB, | |
135 "sequence": buffB[seqA["name"][:-1]]} | |
136 writeSingleSeq(fPairs, seqtmp) | |
137 # can I empty buffA ??? | |
138 for i in buffA_names: | |
139 seqtmp = {"name": i + charA, "sequence": buffA[i]} | |
140 writeSingleSeq(single, seqtmp) | |
141 buffA = {} | |
142 buffA_names = [] | |
143 | |
144 j = 0 | |
145 for i in buffB_names: | |
146 seqtmp = {"name": i + charB, "sequence": buffB[i]} | |
147 del buffB[i] | |
148 j += 1 | |
149 if i == seqA["name"][:-1]: | |
150 del buffB_names[0:j] | |
151 break | |
152 else: | |
153 writeSingleSeq(single, seqtmp) | |
154 else: | |
155 buffA[seqA["name"][:-1]] = seqA['sequence'] | |
156 buffA_names.append(seqA["name"][:-1]) | |
157 | |
158 ### seqA vs buffB | |
159 if seqB: | |
160 if seqB["name"][:-1] in buffA: | |
161 seqtmp = {"name": seqB["name"][:-1] + charA, | |
162 "sequence": buffA[seqB["name"][:-1]]} | |
163 writeSingleSeq(fPairs, seqtmp) | |
164 writeSingleSeq(fPairs, seqB) | |
165 # can I empty buffB ??? | |
166 for i in buffB_names: | |
167 seqtmp = {"name": i + charB, "sequence": buffB[i]} | |
168 writeSingleSeq(single, seqtmp) | |
169 buffB = {} | |
170 buffB_names = [] | |
171 | |
172 j = 0 | |
173 for i in buffA_names: | |
174 seqtmp = {"name": i + charA, "sequence": buffA[i]} | |
175 del buffA[i] | |
176 j += 1 | |
177 if i == seqB["name"][:-1]: | |
178 del buffA_names[0:j] | |
179 break | |
180 else: | |
181 writeSingleSeq(single, seqtmp) | |
182 | |
183 else: | |
184 buffB[seqB["name"][:-1]] = seqB['sequence'] | |
185 buffB_names.append(seqB["name"][:-1]) | |
186 fA.close() | |
187 fB.close() | |
188 fPairs.close() | |
189 # write rest of singles: | |
190 for i in buffA: | |
191 seqtmp = {"name": i + charA, "sequence": buffA[i]} | |
192 writeSingleSeq(single, seqtmp) | |
193 for i in buffB: | |
194 seqtmp = {"name": i + charB, "sequence": buffB[i]} | |
195 writeSingleSeq(single, seqtmp) | |
196 single.close() | |
197 | |
198 | |
199 if __name__ == "__main__": | |
200 main() |