annotate fasta_merge_files_and_filter_unique_sequences.py @ 1:74144834b0bd draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
author galaxyp
date Fri, 16 Dec 2016 05:19:27 -0500
parents 2904d46167da
children 379c41d859aa
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2904d46167da Uploaded
galaxyp
parents:
diff changeset
1 #!/usr/bin/env python
1
74144834b0bd planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents: 0
diff changeset
2 import os
74144834b0bd planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents: 0
diff changeset
3 import sys
0
2904d46167da Uploaded
galaxyp
parents:
diff changeset
4
2904d46167da Uploaded
galaxyp
parents:
diff changeset
class Sequence:
    """
    Holds protein sequence information.

    Attributes:
        header:   the FASTA header line (by convention including the
                  leading '>'), without a trailing line terminator.
        sequence: the residues of the record as one string, with no
                  line breaks.
    """
    def __init__(self, header="", sequence=""):
        # Both fields default to the empty string, so the historical
        # "create empty, then fill in the attributes" usage keeps working.
        self.header = header
        self.sequence = sequence

    def __repr__(self):
        return "Sequence(header=%r, sequence=%r)" % (self.header, self.sequence)
2904d46167da Uploaded
galaxyp
parents:
diff changeset
10
2904d46167da Uploaded
galaxyp
parents:
diff changeset
class FASTAReader:
    """
    FASTA db iterator. Returns a single FASTA sequence object.

    Each iteration step yields one Sequence whose ``header`` is the '>'
    line and whose ``sequence`` is the concatenation of all following
    lines up to the next header (or end of file), with line terminators
    removed. Lines that appear before the first header are skipped.

    The underlying file is closed automatically once the end of the file
    is reached; it can also be closed early with close() or by using the
    reader as a context manager.
    """
    def __init__(self, fasta_name):
        self.fasta_file = open(fasta_name)
        # Header line of the *next* record, read while scanning the body of
        # the current one. Buffering it here replaces a tell()/seek() pair,
        # which is slow and fragile on text-mode files.
        self._pending_header = None

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        ''' Close the underlying file; safe to call more than once '''
        self._pending_header = None
        self.fasta_file.close()

    @staticmethod
    def _strip_eol(line):
        ''' Drop trailing whitespace and any stray CR/LF characters '''
        return line.rstrip().replace('\n', '').replace('\r', '')

    def __next__(self):
        ''' Iteration '''
        # Once exhausted (or closed by the caller) keep raising
        # StopIteration instead of failing on a closed file.
        if self.fasta_file.closed:
            raise StopIteration

        header = self._pending_header
        self._pending_header = None

        # No buffered header: scan forward to the next '>' line.
        while header is None:
            line = self.fasta_file.readline()
            if not line:
                self.close()
                raise StopIteration
            if line[0] == '>':
                header = line

        seq = Sequence()
        seq.header = self._strip_eol(header)

        # Collect the pieces and join once; repeated string concatenation
        # is quadratic for long multi-line records.
        chunks = []
        while True:
            line = self.fasta_file.readline()
            if not line:
                # End of file: release the handle right away.
                self.close()
                break
            if line[0] == '>':
                # Start of the next record; keep it for the next call.
                self._pending_header = line
                break
            chunks.append(self._strip_eol(line))
        seq.sequence = "".join(chunks)
        return seq

    # Python 2/3 compat
    next = __next__
2904d46167da Uploaded
galaxyp
parents:
diff changeset
46
2904d46167da Uploaded
galaxyp
parents:
diff changeset
47
2904d46167da Uploaded
galaxyp
parents:
diff changeset
def main():
    """
    Merge FASTA files, keeping only the first record of each sequence.

    Command line: ``OUTPUT_FASTA [INPUT_FASTA ...]``. The inputs are read
    in the order given. A record is written to the output (header line,
    then the whole sequence on a single line) only if its sequence string
    has not been seen before in any input; later duplicates are dropped
    together with their headers. With no inputs an empty output file is
    created.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s OUTPUT_FASTA [INPUT_FASTA ...]" % sys.argv[0])

    seen_sequences = set()

    # 'with' guarantees the output is flushed and closed even if reading
    # one of the inputs fails.
    with open(sys.argv[1], 'w') as out_file:
        for fasta_file in sys.argv[2:]:
            fa_reader = FASTAReader(fasta_file)
            try:
                for protein in fa_reader:
                    if protein.sequence in seen_sequences:
                        continue
                    seen_sequences.add(protein.sequence)
                    # Write '\n', not os.linesep: the file is opened in
                    # text mode, which already translates '\n' to the
                    # platform terminator (os.linesep would give '\r\r\n'
                    # on Windows).
                    out_file.write(protein.header)
                    out_file.write('\n')
                    out_file.write(protein.sequence)
                    out_file.write('\n')
            finally:
                # Release the input handle before moving on to the next file.
                fa_reader.fasta_file.close()


if __name__ == "__main__":
    main()