annotate fasta_merge_files_and_filter_unique_sequences.py @ 6:f546e7278f04 draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
author galaxyp
date Mon, 23 Nov 2020 19:35:09 +0000
parents 9ad0d336e5ed
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2904d46167da Uploaded
galaxyp
parents:
diff changeset
1 #!/usr/bin/env python
1
74144834b0bd planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents: 0
diff changeset
2 import os
6
f546e7278f04 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
galaxyp
parents: 3
diff changeset
3 import re
1
74144834b0bd planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents: 0
diff changeset
4 import sys
6
f546e7278f04 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
galaxyp
parents: 3
diff changeset
5
0
2904d46167da Uploaded
galaxyp
parents:
diff changeset
6
2904d46167da Uploaded
galaxyp
parents:
diff changeset
class Sequence:
    """Hold one FASTA protein record.

    Attributes:
        header: the full header line, including the leading '>'.
        accession: the accession extracted from the header.
        sequence: the residue string with line breaks removed.

    All fields default to the empty string, so ``Sequence()`` still yields a
    blank record that callers fill in attribute by attribute.
    """

    def __init__(self, header="", accession="", sequence=""):
        self.header = header
        self.accession = accession
        self.sequence = sequence

    def __repr__(self):
        # Report the length rather than the residues: sequences can be long.
        return "Sequence(header=%r, accession=%r, length=%d)" % (
            self.header, self.accession, len(self.sequence))
2904d46167da Uploaded
galaxyp
parents:
diff changeset
14
6
f546e7278f04 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
galaxyp
parents: 3
diff changeset
15
0
2904d46167da Uploaded
galaxyp
parents:
diff changeset
class FASTAReader:
    """
    FASTA db iterator. Returns a single FASTA sequence object.

    Each iteration step yields one ``Sequence`` whose ``header`` is the
    stripped '>' line, whose ``accession`` is capture group 1 of
    ``accession_parser`` applied to that header, and whose ``sequence`` is
    the concatenation of the stripped lines up to the next header or EOF.
    Lines that appear before the first header are ignored.

    The underlying file is closed automatically once the last record has
    been returned; ``close()`` or a ``with`` block releases it earlier.
    """

    def __init__(self, fasta_name, accession_parser):
        """Open ``fasta_name`` for reading.

        Args:
            fasta_name: path of the FASTA file to read.
            accession_parser: regular expression (pattern string) whose
                first capture group matches the accession in a header line.
        """
        self.fasta_file = open(fasta_name)
        self.accession_parser = accession_parser
        # The header of the *next* record is discovered while reading the
        # body of the current one; it is parked here instead of rewinding
        # the file with tell()/seek().
        self._pending_header = None

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the underlying file; safe to call more than once."""
        self.fasta_file.close()

    def _read_until_header(self):
        """Return the next line starting with '>', or None at end of file."""
        for line in iter(self.fasta_file.readline, ''):
            if line.startswith('>'):
                return line
        return None

    def __next__(self):
        """Return the next record.

        Raises:
            StopIteration: when no further header line exists.
            SystemExit: when the accession cannot be parsed from a header.
        """
        # Once exhausted (or closed by the caller) keep signalling the end
        # rather than reading from a closed file.
        if self.fasta_file.closed:
            raise StopIteration

        header_line = self._pending_header
        self._pending_header = None
        if header_line is None:
            header_line = self._read_until_header()
        if header_line is None:
            self.close()
            raise StopIteration

        seq = Sequence()
        seq.header = header_line.rstrip()

        match = re.search(self.accession_parser, seq.header)
        # group(1) is None when an optional group did not participate in the
        # match, so test its truthiness instead of calling len() on it.
        if not match or len(match.groups()) < 1 or not match.group(1):
            self.close()
            sys.exit("Could not parse accession from '%s'" % seq.header)
        seq.accession = match.group(1)

        # Collect the body lines and join once at the end.
        chunks = []
        for line in iter(self.fasta_file.readline, ''):
            if line.startswith('>'):
                self._pending_header = line
                break
            chunks.append(line.rstrip())
        seq.sequence = ''.join(chunks)
        return seq

    # Python 2/3 compat
    next = __next__
2904d46167da Uploaded
galaxyp
parents:
diff changeset
58
2904d46167da Uploaded
galaxyp
parents:
diff changeset
59
2904d46167da Uploaded
galaxyp
parents:
diff changeset
def main():
    """Merge FASTA files, writing each unique protein once.

    Command line (read from ``sys.argv``):
        1: output FASTA path.
        2: ``sequence`` to drop entries whose accession OR sequence was
           already seen, ``accession`` to drop only duplicate accessions.
        3: regular expression whose first capture group is the accession.
        4+: input FASTA paths, processed in order; the first occurrence of
            a duplicate wins.

    Progress and skipped entries are reported on stdout. Exits with an
    error message when the arguments are missing or the mode is invalid.
    """
    if len(sys.argv) < 4:
        sys.exit("Usage: %s <output.fasta> <sequence|accession> "
                 "<accession_regex> [input.fasta ...]" % sys.argv[0])
    out_path, mode, accession_parser = sys.argv[1:4]

    if mode == "sequence":
        unique_sequences = True
    elif mode == "accession":
        unique_sequences = False
    else:
        sys.exit("2nd argument must be 'sequence' or 'accession'")

    # Undo the placeholder tokens that stand in for quote and backslash
    # characters in the regex argument (presumably inserted by the calling
    # framework's parameter sanitizer -- confirm against the tool wrapper).
    for key, value in (("'", '__sq__'), ('\\', '__backslash__')):
        accession_parser = accession_parser.replace(value, key)

    # Maps each written sequence to the header it was first seen under.
    # Keyed on the sequence itself rather than hash(sequence): two distinct
    # sequences can share a hash value, which would silently drop a protein.
    seen_sequences = {}
    seen_accessions = set()

    with open(out_path, 'w') as out_file:
        for fasta_file in sys.argv[4:]:
            print("Reading entries from '%s'" % fasta_file)
            fa_reader = FASTAReader(fasta_file, accession_parser)
            try:
                for protein in fa_reader:
                    if protein.accession in seen_accessions:
                        print("Skipping protein '%s' with duplicate accession" % protein.header)
                        continue
                    if unique_sequences:
                        if protein.sequence in seen_sequences:
                            print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[protein.sequence]))
                            continue
                        seen_sequences[protein.sequence] = protein.header
                    seen_accessions.add(protein.accession)
                    # The file is in text mode, which already translates
                    # '\n' to the platform line ending; writing os.linesep
                    # here would produce '\r\r\n' on Windows.
                    out_file.write(protein.header + '\n')
                    out_file.write(protein.sequence + '\n')
            finally:
                # Release the input handle even when parsing aborts.
                fa_reader.fasta_file.close()
2904d46167da Uploaded
galaxyp
parents:
diff changeset
101
6
f546e7278f04 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
galaxyp
parents: 3
diff changeset
102
0
2904d46167da Uploaded
galaxyp
parents:
diff changeset
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()