0
|
1 #!/usr/bin/env python
|
|
2 import sys,os
|
|
3
|
|
4 #====================================================================== Classes
|
|
class Sequence:
    """Holds protein sequence information.

    Attributes:
        header: The record's header text (empty string by default).
        sequence: The residue string of the record (empty string by default).
    """

    def __init__(self, header="", sequence=""):
        """Create a record.

        Both arguments are optional so the original ``Sequence()`` call,
        followed by attribute assignment, keeps working unchanged.
        """
        self.header = header
        self.sequence = sequence

    def __repr__(self):
        # %-formatting keeps the class usable on both Python 2 and 3.
        return "Sequence(header=%r, sequence=%r)" % (self.header, self.sequence)
|
|
10
|
|
class FASTAReader:
    """
    FASTA db iterator. Returns a single FASTA sequence object per step.

    Each returned ``Sequence`` has ``header`` set to the complete ``>`` line
    (marker included, trailing whitespace removed) and ``sequence`` set to the
    concatenation of all following lines up to the next header line, each
    with its trailing whitespace removed. Lines that precede the first header
    are skipped.

    The underlying file is closed automatically once the end of input is
    reached; ``close()`` or a ``with`` block can be used to release it
    earlier.
    """

    def __init__(self, fasta_name):
        """Open the file at path *fasta_name* for reading."""
        self.fasta_file = open(fasta_name)
        # Header line that was read while scanning the previous record's
        # sequence lines; it starts the next record.
        self._pending_header = None

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the underlying file; safe to call more than once."""
        if not self.fasta_file.closed:
            self.fasta_file.close()

    @staticmethod
    def _clean(line):
        """Strip trailing whitespace and any carriage returns from *line*."""
        return line.rstrip().replace('\r', '')

    def __next__(self):
        """Return the next record as a ``Sequence``.

        Raises:
            StopIteration: when no further header line can be found.
        """
        if self.fasta_file.closed:
            raise StopIteration

        # Start from the header buffered by the previous call, if any;
        # otherwise scan forward to the next line that begins a record.
        header = self._pending_header
        self._pending_header = None
        while header is None:
            line = self.fasta_file.readline()
            if not line:
                self.close()
                raise StopIteration
            if line.startswith('>'):
                header = line

        seq = Sequence()
        seq.header = self._clean(header)

        # Collect the pieces and join once instead of growing a string.
        chunks = []
        while True:
            line = self.fasta_file.readline()
            if not line:
                break
            if line.startswith('>'):
                # Keep the look-ahead line for the next call rather than
                # rewinding the file with tell()/seek().
                self._pending_header = line
                break
            chunks.append(self._clean(line))
        seq.sequence = "".join(chunks)
        return seq

    # Python 2/3 compat
    next = __next__
|
|
46
|
|
47
|
|
def main():
    """Merge FASTA files, writing each distinct sequence only once.

    Command-line arguments:
        sys.argv[1]: path of the output file (created or truncated).
        sys.argv[2:]: paths of the input FASTA files, processed in order.

    For every record whose sequence string has not been seen before, the
    header and the sequence are written to the output, each on its own line.
    Records whose sequence was already written are skipped, so the first
    header encountered for a sequence is the one that is kept.
    """
    seen_sequences = set()

    # The context manager closes the output even if reading an input fails.
    with open(sys.argv[1], 'w') as out_file:
        for fasta_file in sys.argv[2:]:
            for protein in FASTAReader(fasta_file):
                if protein.sequence in seen_sequences:
                    continue
                seen_sequences.add(protein.sequence)

                # Write "\n", not os.linesep: a text-mode file already
                # translates "\n" to the platform line ending, so os.linesep
                # would produce "\r\r\n" on Windows.
                out_file.write(protein.header)
                out_file.write("\n")
                out_file.write(protein.sequence)
                out_file.write("\n")
|
|
65
|
|
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|