comparison fasta_interlacer.py @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children d397f5a85464
comparison
equal deleted inserted replaced
-1:000000000000 0:a4cd8608ef6b
1 #!/usr/bin/env python
2 ''' interlacing two fastq sequences'''
3 import sys
4
5 def readSingleSeq(file):
6 ''' read single seq from fasta file'''
7 line = file.readline()
8 if not line:
9 return False # end of file
10 if line[0] != ">":
11 raise Exception("no header on the first line")
12 seqname = line[1:].strip()
13 seq = ""
14 # read sequences
15 while True:
16 last_pos = file.tell()
17 line = file.readline()
18 if not line:
19 break
20 if line[0] == ">":
21 file.seek(last_pos)
22 break
23 seq = seq + line.strip()
24 return {'name': seqname, 'sequence': seq}
25
26
27 def writeSingleSeq(fileobject, seq):
28 ''' write single sequence to fasta file'''
29 fileobject.write(">")
30 fileobject.write(seq['name'] + "\n")
31 fileobject.write(seq['sequence'] + "\n")
32
33
34 def main():
35 from optparse import OptionParser
36 parser = OptionParser()
37 parser.add_option("-a",
38 "--fasta_file_A",
39 dest="seqfileA",
40 help="input sequences in fasta format")
41 parser.add_option("-b",
42 "--fasta_file_B",
43 dest="seqfileB",
44 help="input sequences in fasta format")
45 parser.add_option("-p",
46 "--fasta_file_pairs",
47 dest="seqfile_pairs",
48 help="output file with paired sequences")
49 parser.add_option("-x",
50 "--fasta_file_singles",
51 dest="seqfile_singles",
52 help="output file with single sequences")
53 options, args = parser.parse_args()
54
55 # Input files
56 fA = open(options.seqfileA, 'r')
57 fB = open(options.seqfileB, 'r')
58 # Output files
59 if options.seqfile_pairs:
60 fPairs = open(options.seqfile_pairs, 'w')
61 else:
62 fPairs = open(options.seqfileA + ".pairs", 'w')
63
64 if options.seqfile_singles:
65 single = open(options.seqfile_singles, "w")
66 else:
67 single = open(options.seqfileA + ".single", "w")
68
69
70 sA1 = readSingleSeq(fA)
71 sB1 = readSingleSeq(fB)
72 if not sA1 or not sB1:
73 raise Exception("\nEmpty sequence on input, nothing to interlace!\n")
74 charA = sA1['name'][-1]
75 charB = sB1['name'][-1]
76 # validate sequence names
77 if charA == charB:
78 sys.stderr.write(
79 "last character of sequence id must be used for distinguishing pairs!")
80 exit(1)
81 # check first thousand!
82 for i in range(1000):
83 seqA = readSingleSeq(fA)
84 seqB = readSingleSeq(fB)
85 if (not seqA) or (not seqB):
86 # end of file:
87 if i == 0:
88 sys.stderr.write("input file is empty")
89 exit(1)
90 else:
91 break
92 if seqA['name'][-1] == charA and seqB['name'][-1] == charB:
93 continue
94 else:
95 sys.stderr.write(
96 "last character of sequence id must be used for distinguishing pairs!")
97 exit(1)
98
99 fA.seek(0)
100 fB.seek(0)
101
102 buffA = {}
103 buffB = {}
104 buffA_names = []
105 buffB_names = []
106
107 while True:
108
109 seqA = readSingleSeq(fA)
110 seqB = readSingleSeq(fB)
111
112 if not seqA and not seqB:
113 break # end of file
114
115 ## validation and direct checking only if not end of files
116 if seqA and seqB:
117 #validate:
118 if not (seqA['name'][-1] == charA and seqB['name'][-1] == charB):
119 sys.stderr.write(
120 "last character of sequence id must be used for distinguishing pairs!")
121 exit(1)
122
123 # check if current seqs are pairs
124 if seqA['name'][:-1] == seqB['name'][:-1]:
125 writeSingleSeq(fPairs, seqA)
126 writeSingleSeq(fPairs, seqB)
127 continue
128
129 ### compare whith buffers
130 ### seqA vs buffB
131 if seqA:
132 if seqA["name"][:-1] in buffB:
133 writeSingleSeq(fPairs, seqA)
134 seqtmp = {"name": seqA["name"][:-1] + charB,
135 "sequence": buffB[seqA["name"][:-1]]}
136 writeSingleSeq(fPairs, seqtmp)
137 # can I empty buffA ???
138 for i in buffA_names:
139 seqtmp = {"name": i + charA, "sequence": buffA[i]}
140 writeSingleSeq(single, seqtmp)
141 buffA = {}
142 buffA_names = []
143
144 j = 0
145 for i in buffB_names:
146 seqtmp = {"name": i + charB, "sequence": buffB[i]}
147 del buffB[i]
148 j += 1
149 if i == seqA["name"][:-1]:
150 del buffB_names[0:j]
151 break
152 else:
153 writeSingleSeq(single, seqtmp)
154 else:
155 buffA[seqA["name"][:-1]] = seqA['sequence']
156 buffA_names.append(seqA["name"][:-1])
157
158 ### seqA vs buffB
159 if seqB:
160 if seqB["name"][:-1] in buffA:
161 seqtmp = {"name": seqB["name"][:-1] + charA,
162 "sequence": buffA[seqB["name"][:-1]]}
163 writeSingleSeq(fPairs, seqtmp)
164 writeSingleSeq(fPairs, seqB)
165 # can I empty buffB ???
166 for i in buffB_names:
167 seqtmp = {"name": i + charB, "sequence": buffB[i]}
168 writeSingleSeq(single, seqtmp)
169 buffB = {}
170 buffB_names = []
171
172 j = 0
173 for i in buffA_names:
174 seqtmp = {"name": i + charA, "sequence": buffA[i]}
175 del buffA[i]
176 j += 1
177 if i == seqB["name"][:-1]:
178 del buffA_names[0:j]
179 break
180 else:
181 writeSingleSeq(single, seqtmp)
182
183 else:
184 buffB[seqB["name"][:-1]] = seqB['sequence']
185 buffB_names.append(seqB["name"][:-1])
186 fA.close()
187 fB.close()
188 fPairs.close()
189 # write rest of singles:
190 for i in buffA:
191 seqtmp = {"name": i + charA, "sequence": buffA[i]}
192 writeSingleSeq(single, seqtmp)
193 for i in buffB:
194 seqtmp = {"name": i + charB, "sequence": buffB[i]}
195 writeSingleSeq(single, seqtmp)
196 single.close()
197
198
199 if __name__ == "__main__":
200 main()