annotate fasta_merge_files_and_filter_unique_sequences.py @ 0:2904d46167da draft

Uploaded
author galaxyp
date Fri, 26 Sep 2014 14:02:14 -0400
parents
children 74144834b0bd
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2904d46167da Uploaded
galaxyp
parents:
diff changeset
1 #!/usr/bin/env python
2904d46167da Uploaded
galaxyp
parents:
diff changeset
2 import sys,os
2904d46167da Uploaded
galaxyp
parents:
diff changeset
3
2904d46167da Uploaded
galaxyp
parents:
diff changeset
4 #====================================================================== Classes
2904d46167da Uploaded
galaxyp
parents:
diff changeset
class Sequence(object):
    """Hold a single FASTA record.

    Attributes:
        header: The full header line of the record, including the leading
            '>' and without the trailing line terminator.
        sequence: The residues of the record joined into a single string
            with no line breaks.
    """

    def __init__(self, header="", sequence=""):
        """Create a record; both fields default to the empty string."""
        self.header = header
        self.sequence = sequence

    def __repr__(self):
        return "%s(header=%r, sequence=%r)" % (
            type(self).__name__, self.header, self.sequence)
2904d46167da Uploaded
galaxyp
parents:
diff changeset
10
2904d46167da Uploaded
galaxyp
parents:
diff changeset
class FASTAReader(object):
    """
    FASTA db iterator. Returns a single FASTA sequence object.

    Iterating yields one ``Sequence`` per '>' header line found in the
    file. Any lines before the first header are skipped. The underlying
    file is closed automatically once the last record has been consumed;
    the reader can also be used as a context manager or closed explicitly
    with ``close()``.
    """

    def __init__(self, fasta_name):
        """Open *fasta_name* for reading (text mode)."""
        self.fasta_file = open(fasta_name)
        # Header line of the next record, read ahead while collecting the
        # sequence lines of the previous record.
        self._pending_header = None

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the underlying file; safe to call more than once."""
        if not self.fasta_file.closed:
            self.fasta_file.close()

    @staticmethod
    def _clean(line):
        """Strip trailing whitespace and drop any embedded CR/LF characters."""
        return line.rstrip().replace('\n', '').replace('\r', '')

    def __next__(self):
        """Return the next record, or raise StopIteration at end of file."""
        if self.fasta_file.closed:
            raise StopIteration

        # Use the header read ahead by the previous call, if any; otherwise
        # scan forward to the next header line.
        header = self._pending_header
        self._pending_header = None
        while header is None:
            line = self.fasta_file.readline()
            if not line:
                # Exhausted: release the file handle instead of leaking it.
                self.close()
                raise StopIteration
            if line[0] == '>':
                header = line

        seq = Sequence()
        seq.header = self._clean(header)

        # Collect sequence lines up to the next header (or end of file).
        # The next header is remembered rather than "un-read" with
        # tell()/seek(), which is slow on text-mode files.
        chunks = []
        while True:
            line = self.fasta_file.readline()
            if not line:
                break
            if line[0] == '>':
                self._pending_header = line
                break
            chunks.append(self._clean(line))
        seq.sequence = ''.join(chunks)
        return seq

    # Python 2/3 compat
    next = __next__
2904d46167da Uploaded
galaxyp
parents:
diff changeset
46
2904d46167da Uploaded
galaxyp
parents:
diff changeset
47
2904d46167da Uploaded
galaxyp
parents:
diff changeset
def main():
    """Merge FASTA files, keeping one record per unique sequence.

    Command line: ``OUTPUT_FASTA [INPUT_FASTA ...]``.

    ``sys.argv[1]`` is the output path; every following argument is an
    input FASTA file. Inputs are read in the order given and a record is
    written only if its sequence string has not been seen before, so the
    first header encountered for a given sequence is the one kept. Each
    record is written as a header line followed by the sequence on a
    single line.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: fasta_merge_files_and_filter_unique_sequences.py "
                 "OUTPUT_FASTA [INPUT_FASTA ...]")

    seen_sequences = set()

    # 'with' guarantees the output is flushed and closed even if reading
    # one of the inputs fails part-way through.
    with open(sys.argv[1], 'w') as out_file:
        for fasta_file in sys.argv[2:]:
            fa_reader = FASTAReader(fasta_file)
            try:
                for protein in fa_reader:
                    if protein.sequence in seen_sequences:
                        continue
                    seen_sequences.add(protein.sequence)

                    # The file is in text mode, which already translates
                    # '\n' to the platform line ending; writing os.linesep
                    # here would produce '\r\r\n' on Windows.
                    out_file.write(protein.header + '\n')
                    out_file.write(protein.sequence + '\n')
            finally:
                # Release the input handle whether or not iteration finished.
                fa_reader.fasta_file.close()
2904d46167da Uploaded
galaxyp
parents:
diff changeset
65
2904d46167da Uploaded
galaxyp
parents:
diff changeset
# Run the merge only when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == "__main__":
    main()