Mercurial > repos > galaxyp > fasta_merge_files_and_filter_unique_sequences
annotate fasta_merge_files_and_filter_unique_sequences.py @ 3:9ad0d336e5ed draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 96128b1b32e31c88f08201fd59a07fb1057aafbe
author | galaxyp |
---|---|
date | Fri, 03 Feb 2017 14:27:56 -0500 |
parents | 379c41d859aa |
children | f546e7278f04 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
1
74144834b0bd
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents:
0
diff
changeset
|
2 import os |
74144834b0bd
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents:
0
diff
changeset
|
3 import sys |
3
9ad0d336e5ed
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 96128b1b32e31c88f08201fd59a07fb1057aafbe
galaxyp
parents:
2
diff
changeset
|
4 import re |
0 | 5 |
class Sequence(object):
    """Holds protein sequence information for one FASTA record.

    Attributes:
        header: The record's header line (the line starting with '>'),
            without its trailing line terminator.
        accession: The accession string extracted from the header.
        sequence: The residue letters of the record joined into a single
            string with no line breaks.

    All three attributes default to the empty string, so ``Sequence()``
    still yields a blank record that the caller fills in afterwards.
    """

    def __init__(self, header="", accession="", sequence=""):
        self.header = header
        self.accession = accession
        self.sequence = sequence

    def __repr__(self):
        # Debug-friendly representation; not used for any file output.
        return "%s(header=%r, accession=%r, sequence=%r)" % (
            type(self).__name__, self.header, self.accession, self.sequence)
12 | |
class FASTAReader(object):
    """
    FASTA db iterator. Returns a single FASTA sequence object.

    Each iteration step yields one ``Sequence`` whose ``header`` is the
    stripped '>' line, whose ``accession`` is capture group 1 of
    ``accession_parser`` applied to that header, and whose ``sequence`` is
    the concatenation of all following lines up to the next header.

    Parameters:
        fasta_name: Path of the FASTA file to read.
        accession_parser: Regular expression (as accepted by ``re.search``)
            with at least one capture group; group 1 is the accession.

    The process is terminated via ``sys.exit`` when a header does not yield
    a non-empty accession.  The underlying file is closed automatically
    once the end of the file is reached; ``close()`` or a ``with`` block
    can be used to release it earlier.
    """

    def __init__(self, fasta_name, accession_parser):
        self.fasta_file = open(fasta_name)
        self.accession_parser = accession_parser
        # Header line that was read while collecting the previous record's
        # sequence lines.  Buffering it avoids tell()/seek() on every line,
        # which is slow in text mode and fails on non-seekable inputs.
        self._pending_header = None

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        ''' Release the underlying file handle (safe to call repeatedly) '''
        self.fasta_file.close()

    @staticmethod
    def _strip_line(line):
        ''' Remove trailing whitespace and any stray CR/LF characters '''
        return line.rstrip().replace('\n', '').replace('\r', '')

    def __next__(self):
        ''' Iteration '''
        header_line = self._pending_header
        self._pending_header = None

        # Skip anything preceding the next header line (e.g. blank lines or
        # stray text before the first record).
        while header_line is None:
            # Keep raising StopIteration after exhaustion instead of failing
            # with ValueError on the already-closed file.
            if self.fasta_file.closed:
                raise StopIteration
            line = self.fasta_file.readline()
            if not line:
                self.close()
                raise StopIteration
            if line[0] == '>':
                header_line = line

        seq = Sequence()
        seq.header = self._strip_line(header_line)

        m = re.search(self.accession_parser, seq.header)
        # group(1) is None when the group is optional and did not take part
        # in the match, so test truthiness rather than calling len() on it.
        if not m or len(m.groups()) < 1 or not m.group(1):
            sys.exit("Could not parse accession from '%s'" % seq.header)
        seq.accession = m.group(1)

        chunks = []
        while True:
            line = self.fasta_file.readline()
            if not line:
                break
            if line[0] == '>':
                # Start of the next record: keep it for the next call.
                self._pending_header = line
                break
            chunks.append(self._strip_line(line))
        seq.sequence = seq.sequence + ''.join(chunks)
        return seq

    # Python 2/3 compat
    next = __next__
54 | |
55 | |
def main():
    """Merge FASTA files into one output file, dropping duplicate entries.

    Command line (read from ``sys.argv``):
        argv[1]  path of the output FASTA file (overwritten)
        argv[2]  uniqueness criterion: 'sequence' or 'accession'
        argv[3]  regular expression whose first capture group extracts the
                 accession from a header line
        argv[4:] input FASTA files, processed in the given order

    In both modes an entry is skipped when its accession was already
    written.  In 'sequence' mode an entry is additionally skipped when an
    identical sequence was already written.  Progress and skip messages are
    printed to standard output.

    Exits via ``sys.exit`` with a message on missing arguments, an unknown
    criterion, or an accession parser that is not a valid regular
    expression.
    """
    if len(sys.argv) < 4:
        sys.exit("Usage: %s <output.fasta> <sequence|accession> "
                 "<accession_parser_regex> [input.fasta ...]" % sys.argv[0])

    if sys.argv[2] == "sequence":
        unique_sequences = True
    elif sys.argv[2] == "accession":
        unique_sequences = False
    else:
        sys.exit("2nd argument must be 'sequence' or 'accession'")

    # Undo the placeholder tokens that stand in for quote and backslash
    # characters (presumably introduced by Galaxy's parameter sanitising --
    # confirm against the tool wrapper).
    accession_parser = sys.argv[3]
    for key, value in (('\'', '__sq__'), ('\\', '__backslash__')):
        accession_parser = accession_parser.replace(value, key)

    # Fail early with a clear message instead of on the first record.
    try:
        re.compile(accession_parser)
    except re.error as err:
        sys.exit("Invalid accession parser regular expression '%s': %s"
                 % (accession_parser, err))

    # Maps each written sequence to the header it was first seen under.
    # The sequence string itself is the key: keying on hash(sequence) would
    # treat two different sequences with colliding hashes as duplicates.
    seen_sequences = {}
    seen_accessions = set()

    with open(sys.argv[1], 'w') as out_file:
        for fasta_file in sys.argv[4:]:
            print("Reading entries from '%s'" % fasta_file)
            fa_reader = FASTAReader(fasta_file, accession_parser)
            try:
                for protein in fa_reader:
                    if protein.accession in seen_accessions:
                        print("Skipping protein '%s' with duplicate accession" % protein.header)
                        continue
                    if unique_sequences:
                        if protein.sequence in seen_sequences:
                            print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[protein.sequence]))
                            continue
                        seen_sequences[protein.sequence] = protein.header
                    seen_accessions.add(protein.accession)

                    # The file is opened in text mode, which already maps
                    # '\n' to the platform line ending; writing os.linesep
                    # would produce '\r\r\n' on Windows.
                    out_file.write(protein.header)
                    out_file.write("\n")
                    out_file.write(protein.sequence)
                    out_file.write("\n")
            finally:
                # Release the input handle even if reading was interrupted.
                fa_reader.fasta_file.close()
97 | |
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()