Mercurial > repos > galaxyp > fasta_merge_files_and_filter_unique_sequences
annotate fasta_merge_files_and_filter_unique_sequences.py @ 1:74144834b0bd draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
author | galaxyp |
---|---|
date | Fri, 16 Dec 2016 05:19:27 -0500 |
parents | 2904d46167da |
children | 379c41d859aa |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
1
74144834b0bd
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents:
0
diff
changeset
|
2 import os |
74144834b0bd
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents:
0
diff
changeset
|
3 import sys |
0 | 4 |
class Sequence(object):
    """Hold one protein record: a header line and its residue string.

    Both attributes default to the empty string, so ``Sequence()`` still
    yields a blank record that can be filled in afterwards.

    Args:
        header: Header text of the record (default ``""``).
        sequence: Sequence text of the record (default ``""``).
    """

    def __init__(self, header="", sequence=""):
        self.header = header
        self.sequence = sequence

    def __repr__(self):
        # %-formatting keeps the class usable on both Python 2 and 3.
        return "Sequence(header=%r, sequence=%r)" % (self.header, self.sequence)
10 | |
class FASTAReader(object):
    """Iterate over the records of a FASTA file, one ``Sequence`` at a time.

    Lines that appear before the first ``>`` header are skipped. For each
    record the header line and all following non-header lines are stripped
    of trailing whitespace; the sequence lines are concatenated into one
    string.

    The underlying file is closed automatically once the iterator is
    exhausted. It can also be closed early with ``close()`` or by using the
    reader as a context manager.

    Args:
        fasta_name: Path of the FASTA file to read.
    """

    def __init__(self, fasta_name):
        self.fasta_file = open(fasta_name)
        # Header line that was read while finishing the previous record and
        # still has to be returned as the start of the next one.
        self._pending = None

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying file; safe to call more than once."""
        self._pending = None
        if not self.fasta_file.closed:
            self.fasta_file.close()

    @staticmethod
    def _clean(line):
        """Return *line* without trailing whitespace or carriage returns."""
        # rstrip() already removes the line terminator; the extra replace
        # drops any carriage return left inside the line.
        return line.rstrip().replace('\r', '')

    def __next__(self):
        """Return the next record, or raise ``StopIteration`` at end of file."""
        if self.fasta_file.closed:
            raise StopIteration

        # Start from the header buffered by the previous call, if any;
        # otherwise skip forward to the next line starting with '>'.
        header = self._pending
        self._pending = None
        while header is None:
            line = self.fasta_file.readline()
            if not line:
                self.close()
                raise StopIteration
            if line.startswith('>'):
                header = line

        seq = Sequence()
        seq.header = self._clean(header)

        # Collect sequence lines until EOF or the next header. The next
        # header is kept in a one-line buffer instead of seeking backwards,
        # so no tell()/seek() round trip is needed per line.
        chunks = []
        while True:
            line = self.fasta_file.readline()
            if not line:
                break
            if line.startswith('>'):
                self._pending = line
                break
            chunks.append(self._clean(line))
        seq.sequence = "".join(chunks)
        return seq

    # Python 2/3 compat
    next = __next__
46 | |
47 | |
def main():
    """Merge FASTA files, keeping only the first record of each sequence.

    Command line: ``sys.argv[1]`` is the output path; every remaining
    argument is an input FASTA file. Inputs are processed in the order
    given, and a record is written only if its sequence string has not
    been seen before in this run. Each kept record is written as two
    lines: the header, then the whole sequence on a single line.

    Exits with a usage message when no output path is given.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s OUTPUT_FASTA [INPUT_FASTA ...]" % sys.argv[0])

    seen_sequences = set()

    with open(sys.argv[1], 'w') as out_file:
        for fasta_file in sys.argv[2:]:
            fa_reader = FASTAReader(fasta_file)
            try:
                for protein in fa_reader:
                    if protein.sequence in seen_sequences:
                        continue
                    seen_sequences.add(protein.sequence)
                    # The file is opened in text mode, so '\n' is already
                    # translated to the platform line ending; writing
                    # os.linesep here would produce '\r\r\n' on Windows.
                    out_file.write(protein.header + '\n')
                    out_file.write(protein.sequence + '\n')
            finally:
                # Release the input handle even if reading or writing fails.
                fa_reader.fasta_file.close()
64 | |
# Run the merge only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()