Mercurial > repos > galaxyp > fasta_merge_files_and_filter_unique_sequences
annotate fasta_merge_files_and_filter_unique_sequences.py @ 6:f546e7278f04 draft default tip
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
author | galaxyp |
---|---|
date | Mon, 23 Nov 2020 19:35:09 +0000 |
parents | 9ad0d336e5ed |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
1
74144834b0bd
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents:
0
diff
changeset
|
2 import os |
6
f546e7278f04
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
galaxyp
parents:
3
diff
changeset
|
3 import re |
1
74144834b0bd
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents:
0
diff
changeset
|
4 import sys |
6
f546e7278f04
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
galaxyp
parents:
3
diff
changeset
|
5 |
0 | 6 |
class Sequence(object):
    """Holds protein sequence information.

    Attributes:
        header: the FASTA header line as read from the file (FASTAReader
            stores it with the leading '>' and without the line ending).
        accession: the accession extracted from the header.
        sequence: the residues of the record joined into a single string.

    All three fields default to the empty string, so ``Sequence()`` still
    produces a blank record that can be filled in attribute by attribute.
    """

    def __init__(self, header="", accession="", sequence=""):
        self.header = header
        self.accession = accession
        self.sequence = sequence

    def __repr__(self):
        return "%s(header=%r, accession=%r, sequence=%r)" % (
            type(self).__name__,
            self.header,
            self.accession,
            self.sequence,
        )
14 | |
6
f546e7278f04
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
galaxyp
parents:
3
diff
changeset
|
15 |
class FASTAReader:
    """
    FASTA db iterator. Returns a single FASTA sequence object.

    Each iteration step yields one ``Sequence`` whose ``header`` is the
    stripped '>' line, whose ``accession`` is group 1 of ``accession_parser``
    searched against that header, and whose ``sequence`` is every following
    line up to the next header, stripped and joined.

    The underlying file is closed automatically once the end of the file is
    reached.  The reader can also be used as a context manager, or closed
    explicitly with ``close()``, when iteration may stop early.
    """

    def __init__(self, fasta_name, accession_parser):
        """
        fasta_name: path of the FASTA file to read.
        accession_parser: regular expression with at least one capturing
            group; group 1 is taken as the accession.
        """
        self.fasta_file = open(fasta_name)
        self.accession_parser = accession_parser
        # One-line look-ahead buffer: holds the header line of the *next*
        # record after it was read while collecting the current sequence.
        self._pending = None

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """ Close the underlying file; safe to call more than once. """
        self._pending = None
        self.fasta_file.close()

    def _readline(self):
        """ Return the pushed-back line if any, else the next file line ('' at EOF). """
        if self._pending is not None:
            line, self._pending = self._pending, None
            return line
        if self.fasta_file.closed:
            # Already exhausted/closed: keep signalling EOF instead of raising.
            return ''
        return self.fasta_file.readline()

    @staticmethod
    def _strip_eol(line):
        """ Drop trailing whitespace and any stray line-ending characters. """
        return line.rstrip().replace('\n', '').replace('\r', '')

    def __next__(self):
        ''' Iteration '''
        # Skip anything that precedes the next header line.
        while True:
            line = self._readline()
            if not line:
                self.close()
                raise StopIteration
            if line.startswith('>'):
                break

        seq = Sequence()
        seq.header = self._strip_eol(line)

        m = re.search(self.accession_parser, seq.header)
        # `not m.group(1)` covers both an empty capture and an optional
        # group that did not participate in the match (group(1) is None).
        if not m or len(m.groups()) < 1 or not m.group(1):
            sys.exit("Could not parse accession from '%s'" % seq.header)
        seq.accession = m.group(1)

        # Collect sequence lines until the next header or EOF.  The next
        # header is pushed back into the look-ahead buffer rather than
        # rewinding the file with tell()/seek() for every line.
        chunks = []
        while True:
            line = self._readline()
            if not line:
                break
            if line.startswith('>'):
                self._pending = line
                break
            chunks.append(self._strip_eol(line))
        seq.sequence = ''.join(chunks)
        return seq

    # Python 2/3 compat
    next = __next__
58 | |
59 | |
def main():
    """Merge FASTA files into one, dropping duplicate entries.

    Command line arguments (read from ``sys.argv``):
        1: path of the FASTA file to write.
        2: 'sequence' to drop entries whose accession OR sequence was
           already seen, 'accession' to drop only repeated accessions.
        3: regular expression whose group 1 is the accession; the tokens
           ``__sq__`` and ``__backslash__`` are turned back into a single
           quote and a backslash before use.
        4+: input FASTA files, processed in the given order.

    Progress and skipped entries are reported on stdout.  Exits with an
    error message when the arguments are missing or the mode is invalid.
    """
    if len(sys.argv) < 4:
        sys.exit("Usage: %s <output_fasta> <sequence|accession> <accession_parser> [<input_fasta> ...]" % sys.argv[0])

    # Validate the mode before the output file is created/truncated.
    mode = sys.argv[2]
    if mode not in ("sequence", "accession"):
        sys.exit("2nd argument must be 'sequence' or 'accession'")
    unique_sequences = mode == "sequence"

    # Undo the escaping of quote/backslash in the regex argument
    # (presumably applied by the calling tool wrapper -- verify against caller).
    # A fixed order keeps the result deterministic on every Python version.
    accession_parser = sys.argv[3]
    for token, char in (('__sq__', '\''), ('__backslash__', '\\')):
        accession_parser = accession_parser.replace(token, char)

    # NOTE(review): sequences are remembered by hash() only, which keeps
    # memory bounded but means a hash collision would drop a distinct
    # sequence as a duplicate.
    seen_sequences = {}
    seen_accessions = set()

    with open(sys.argv[1], 'w') as out_file:
        for fasta_file in sys.argv[4:]:
            print("Reading entries from '%s'" % fasta_file)
            fa_reader = FASTAReader(fasta_file, accession_parser)
            try:
                for protein in fa_reader:
                    # Accessions must be unique in both modes.
                    if protein.accession in seen_accessions:
                        print("Skipping protein '%s' with duplicate accession" % protein.header)
                        continue
                    if unique_sequences:
                        seq_key = hash(protein.sequence)
                        if seq_key in seen_sequences:
                            print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[seq_key]))
                            continue
                        seen_sequences[seq_key] = protein.header
                    seen_accessions.add(protein.accession)
                    # The file is opened in text mode, so "\n" is already
                    # translated to the platform line ending; writing
                    # os.linesep here would produce "\r\r\n" on Windows.
                    out_file.write(protein.header)
                    out_file.write("\n")
                    out_file.write(protein.sequence)
                    out_file.write("\n")
            finally:
                # Release the input handle even if parsing aborts early.
                fa_reader.fasta_file.close()
101 | |
6
f546e7278f04
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
galaxyp
parents:
3
diff
changeset
|
102 |
if __name__ == "__main__":
    # Run the merge only when executed as a script, not when imported.
    main()