0
|
1 #!/usr/bin/env python
|
|
2 ''' interlacing two fastq sequences'''
|
|
3 import sys
|
|
4
|
|
5 def readSingleSeq(file):
|
|
6 ''' read single seq from fasta file'''
|
|
7 line = file.readline()
|
|
8 if not line:
|
|
9 return False # end of file
|
|
10 if line[0] != ">":
|
|
11 raise Exception("no header on the first line")
|
|
12 seqname = line[1:].strip()
|
|
13 seq = ""
|
|
14 # read sequences
|
|
15 while True:
|
|
16 last_pos = file.tell()
|
|
17 line = file.readline()
|
|
18 if not line:
|
|
19 break
|
|
20 if line[0] == ">":
|
|
21 file.seek(last_pos)
|
|
22 break
|
|
23 seq = seq + line.strip()
|
|
24 return {'name': seqname, 'sequence': seq}
|
|
25
|
|
26
|
|
27 def writeSingleSeq(fileobject, seq):
|
|
28 ''' write single sequence to fasta file'''
|
|
29 fileobject.write(">")
|
|
30 fileobject.write(seq['name'] + "\n")
|
|
31 fileobject.write(seq['sequence'] + "\n")
|
|
32
|
|
33
|
|
34 def main():
|
|
35 from optparse import OptionParser
|
|
36 parser = OptionParser()
|
|
37 parser.add_option("-a",
|
|
38 "--fasta_file_A",
|
|
39 dest="seqfileA",
|
|
40 help="input sequences in fasta format")
|
|
41 parser.add_option("-b",
|
|
42 "--fasta_file_B",
|
|
43 dest="seqfileB",
|
|
44 help="input sequences in fasta format")
|
|
45 parser.add_option("-p",
|
|
46 "--fasta_file_pairs",
|
|
47 dest="seqfile_pairs",
|
|
48 help="output file with paired sequences")
|
|
49 parser.add_option("-x",
|
|
50 "--fasta_file_singles",
|
|
51 dest="seqfile_singles",
|
|
52 help="output file with single sequences")
|
|
53 options, args = parser.parse_args()
|
|
54
|
|
55 # Input files
|
|
56 fA = open(options.seqfileA, 'r')
|
|
57 fB = open(options.seqfileB, 'r')
|
|
58 # Output files
|
|
59 if options.seqfile_pairs:
|
|
60 fPairs = open(options.seqfile_pairs, 'w')
|
|
61 else:
|
|
62 fPairs = open(options.seqfileA + ".pairs", 'w')
|
|
63
|
|
64 if options.seqfile_singles:
|
|
65 single = open(options.seqfile_singles, "w")
|
|
66 else:
|
|
67 single = open(options.seqfileA + ".single", "w")
|
|
68
|
|
69
|
|
70 sA1 = readSingleSeq(fA)
|
|
71 sB1 = readSingleSeq(fB)
|
|
72 if not sA1 or not sB1:
|
|
73 raise Exception("\nEmpty sequence on input, nothing to interlace!\n")
|
|
74 charA = sA1['name'][-1]
|
|
75 charB = sB1['name'][-1]
|
|
76 # validate sequence names
|
|
77 if charA == charB:
|
|
78 sys.stderr.write(
|
|
79 "last character of sequence id must be used for distinguishing pairs!")
|
|
80 exit(1)
|
|
81 # check first thousand!
|
4
|
82 for i in range(3):
|
0
|
83 seqA = readSingleSeq(fA)
|
|
84 seqB = readSingleSeq(fB)
|
|
85 if (not seqA) or (not seqB):
|
|
86 # end of file:
|
|
87 if i == 0:
|
|
88 sys.stderr.write("input file is empty")
|
|
89 exit(1)
|
|
90 else:
|
|
91 break
|
|
92 if seqA['name'][-1] == charA and seqB['name'][-1] == charB:
|
|
93 continue
|
|
94 else:
|
|
95 sys.stderr.write(
|
|
96 "last character of sequence id must be used for distinguishing pairs!")
|
|
97 exit(1)
|
|
98
|
|
99 fA.seek(0)
|
|
100 fB.seek(0)
|
|
101
|
|
102 buffA = {}
|
|
103 buffB = {}
|
|
104 buffA_names = []
|
|
105 buffB_names = []
|
|
106
|
|
107 while True:
|
|
108 seqA = readSingleSeq(fA)
|
|
109 seqB = readSingleSeq(fB)
|
|
110 if not seqA and not seqB:
|
|
111 break # end of file
|
|
112
|
|
113 ## validation and direct checking only if not end of files
|
|
114 if seqA and seqB:
|
|
115 #validate:
|
|
116 if not (seqA['name'][-1] == charA and seqB['name'][-1] == charB):
|
|
117 sys.stderr.write(
|
|
118 "last character of sequence id must be used for distinguishing pairs!")
|
|
119 exit(1)
|
|
120
|
|
121 # check if current seqs are pairs
|
|
122 if seqA['name'][:-1] == seqB['name'][:-1]:
|
|
123 writeSingleSeq(fPairs, seqA)
|
|
124 writeSingleSeq(fPairs, seqB)
|
|
125 continue
|
|
126
|
|
127 ### compare whith buffers
|
|
128 ### seqA vs buffB
|
|
129 if seqA:
|
|
130 if seqA["name"][:-1] in buffB:
|
|
131 writeSingleSeq(fPairs, seqA)
|
|
132 seqtmp = {"name": seqA["name"][:-1] + charB,
|
|
133 "sequence": buffB[seqA["name"][:-1]]}
|
|
134 writeSingleSeq(fPairs, seqtmp)
|
|
135 # can I empty buffA ???
|
|
136 for i in buffA_names:
|
|
137 seqtmp = {"name": i + charA, "sequence": buffA[i]}
|
|
138 writeSingleSeq(single, seqtmp)
|
4
|
139 buffA = {}
|
|
140 buffA_names = []
|
0
|
141
|
|
142 j = 0
|
|
143 for i in buffB_names:
|
|
144 seqtmp = {"name": i + charB, "sequence": buffB[i]}
|
|
145 del buffB[i]
|
|
146 j += 1
|
|
147 if i == seqA["name"][:-1]:
|
|
148 del buffB_names[0:j]
|
|
149 break
|
|
150 else:
|
|
151 writeSingleSeq(single, seqtmp)
|
|
152 else:
|
|
153 buffA[seqA["name"][:-1]] = seqA['sequence']
|
|
154 buffA_names.append(seqA["name"][:-1])
|
|
155
|
|
156 ### seqA vs buffB
|
|
157 if seqB:
|
|
158 if seqB["name"][:-1] in buffA:
|
|
159 seqtmp = {"name": seqB["name"][:-1] + charA,
|
|
160 "sequence": buffA[seqB["name"][:-1]]}
|
|
161 writeSingleSeq(fPairs, seqtmp)
|
|
162 writeSingleSeq(fPairs, seqB)
|
|
163 # can I empty buffB ???
|
|
164 for i in buffB_names:
|
|
165 seqtmp = {"name": i + charB, "sequence": buffB[i]}
|
|
166 writeSingleSeq(single, seqtmp)
|
4
|
167 buffB = {}
|
|
168 buffB_names = []
|
0
|
169
|
|
170 j = 0
|
|
171 for i in buffA_names:
|
|
172 seqtmp = {"name": i + charA, "sequence": buffA[i]}
|
|
173 del buffA[i]
|
|
174 j += 1
|
|
175 if i == seqB["name"][:-1]:
|
|
176 del buffA_names[0:j]
|
|
177 break
|
|
178 else:
|
|
179 writeSingleSeq(single, seqtmp)
|
|
180
|
|
181 else:
|
|
182 buffB[seqB["name"][:-1]] = seqB['sequence']
|
|
183 buffB_names.append(seqB["name"][:-1])
|
4
|
184
|
|
185 fA.close()
|
|
186 fB.close()
|
|
187 fPairs.close()
|
|
188
|
|
189 # write rest of singles:
|
0
|
190 for i in buffA:
|
|
191 seqtmp = {"name": i + charA, "sequence": buffA[i]}
|
|
192 writeSingleSeq(single, seqtmp)
|
|
193 for i in buffB:
|
|
194 seqtmp = {"name": i + charB, "sequence": buffB[i]}
|
|
195 writeSingleSeq(single, seqtmp)
|
4
|
196 single.close()
|
0
|
197
|
|
198
|
|
199 if __name__ == "__main__":
|
|
200 main()
|