annotate fasta_merge_files_and_filter_unique_sequences.py @ 3:9ad0d336e5ed draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 96128b1b32e31c88f08201fd59a07fb1057aafbe
author galaxyp
date Fri, 03 Feb 2017 14:27:56 -0500
parents 379c41d859aa
children f546e7278f04
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2904d46167da Uploaded
galaxyp
parents:
diff changeset
1 #!/usr/bin/env python
1
74144834b0bd planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents: 0
diff changeset
2 import os
74144834b0bd planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
galaxyp
parents: 0
diff changeset
3 import sys
3
9ad0d336e5ed planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 96128b1b32e31c88f08201fd59a07fb1057aafbe
galaxyp
parents: 2
diff changeset
4 import re
0
2904d46167da Uploaded
galaxyp
parents:
diff changeset
5
2904d46167da Uploaded
galaxyp
parents:
diff changeset
class Sequence:
    """Hold the information read for one protein entry of a FASTA file."""

    def __init__(self):
        # Header line of the entry.
        self.header = ""
        # Accession extracted from the header.
        self.accession = ""
        # Sequence of the entry as a single string.
        self.sequence = ""
2904d46167da Uploaded
galaxyp
parents:
diff changeset
12
2904d46167da Uploaded
galaxyp
parents:
diff changeset
class FASTAReader:
    """
    FASTA db iterator. Returns a single FASTA sequence object per step.

    fasta_name       -- path of the FASTA file to read.
    accession_parser -- regular expression whose first capture group
                        yields the accession when searched in a header.

    Each step returns a Sequence whose ``header`` is the '>' line with
    trailing whitespace removed, whose ``accession`` is capture group 1 of
    ``accession_parser`` and whose ``sequence`` is the concatenation of the
    following non-header lines. Lines before the first header are skipped.

    The process is terminated through ``sys.exit`` when no non-empty
    accession can be extracted from a header. The underlying file is closed
    as soon as its end is reached; ``close()`` or a ``with`` block can be
    used to release it earlier.
    """

    def __init__(self, fasta_name, accession_parser):
        self.fasta_file = open(fasta_name)
        self.accession_parser = accession_parser
        # Compiled once here rather than looked up again for every entry.
        self._accession_re = re.compile(accession_parser)
        # Header line of the next entry, read ahead while collecting the
        # sequence lines of the current one. Buffering it avoids a
        # tell()/seek() round trip on every line.
        self._pending_header = None

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        ''' Release the underlying file (safe to call more than once) '''
        self.fasta_file.close()

    @staticmethod
    def _clean(line):
        ''' Drop trailing whitespace and any stray carriage return '''
        # The extra replace only matters when the file was opened without
        # newline translation; it is kept so such input is handled as before.
        return line.rstrip().replace('\r', '')

    def _next_header(self):
        ''' Return the next header line, or None once the file is exhausted '''
        if self._pending_header is not None:
            line, self._pending_header = self._pending_header, None
            return line
        if self.fasta_file.closed:
            return None
        while True:
            line = self.fasta_file.readline()
            if not line:
                self.close()
                return None
            if line[0] == '>':
                return line

    def __next__(self):
        ''' Iteration '''
        header_line = self._next_header()
        if header_line is None:
            raise StopIteration

        seq = Sequence()
        seq.header = self._clean(header_line)

        m = self._accession_re.search(seq.header)
        # group(1) is None when the group is optional and did not take part
        # in the match, so test its truthiness instead of calling len() on it.
        if not m or not m.groups() or not m.group(1):
            self.close()
            sys.exit("Could not parse accession from '%s'" % seq.header)
        seq.accession = m.group(1)

        chunks = []
        while True:
            line = self.fasta_file.readline()
            if not line:
                self.close()
                break
            if line[0] == '>':
                # Start of the next entry: keep it for the following step.
                self._pending_header = line
                break
            chunks.append(self._clean(line))
        seq.sequence = "".join(chunks)
        return seq

    # Python 2/3 compat
    next = __next__
2904d46167da Uploaded
galaxyp
parents:
diff changeset
54
2904d46167da Uploaded
galaxyp
parents:
diff changeset
55
2904d46167da Uploaded
galaxyp
parents:
diff changeset
def main():
    """
    Merge FASTA files into one file, dropping duplicate entries.

    Command line arguments:
      1   path of the merged output file
      2   'sequence'  -- skip entries whose accession or sequence was
                         already written
          'accession' -- skip entries whose accession was already written
      3   regular expression whose first capture group is the accession;
          the placeholders '__sq__' and '__backslash__' are turned back
          into a single quote and a backslash
      4+  input FASTA files, processed in the given order

    Exits through ``sys.exit`` with a message when arguments are missing or
    the second argument is not one of the two accepted values.
    """
    if len(sys.argv) < 4:
        sys.exit("Usage: %s <output> <sequence|accession> <accession regex> "
                 "[fasta ...]" % sys.argv[0])

    out_name, uniqueness, accession_parser = sys.argv[1:4]

    # Validate before touching the output file so a bad call does not
    # create or truncate it.
    if uniqueness not in ("sequence", "accession"):
        sys.exit("2nd argument must be 'sequence' or 'accession'")
    unique_sequences = uniqueness == "sequence"

    for placeholder, literal in (('__sq__', '\''), ('__backslash__', '\\')):
        accession_parser = accession_parser.replace(placeholder, literal)

    # Sequence -> header of the first entry carrying it. Keyed on the
    # sequence itself rather than on hash(sequence), so that two different
    # sequences can never be mistaken for duplicates by a hash collision
    # (at the cost of keeping the unique sequences in memory).
    seen_sequences = {}
    seen_accessions = set()

    with open(out_name, 'w') as out_file:
        for fasta_file in sys.argv[4:]:
            print("Reading entries from '%s'" % fasta_file)
            fa_reader = FASTAReader(fasta_file, accession_parser)
            try:
                for protein in fa_reader:
                    if protein.accession in seen_accessions:
                        print("Skipping protein '%s' with duplicate accession" % protein.header)
                        continue
                    if unique_sequences:
                        if protein.sequence in seen_sequences:
                            print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[protein.sequence]))
                            continue
                        seen_sequences[protein.sequence] = protein.header
                    seen_accessions.add(protein.accession)
                    # The handle is in text mode, so '\n' is already turned
                    # into the platform line ending; writing os.linesep here
                    # would produce '\r\r\n' on Windows.
                    out_file.write("%s\n%s\n" % (protein.header, protein.sequence))
            finally:
                # The reader keeps its input handle in fasta_file.
                fa_reader.fasta_file.close()
2904d46167da Uploaded
galaxyp
parents:
diff changeset
97
2904d46167da Uploaded
galaxyp
parents:
diff changeset
# Run the merge only when the file is executed as a script.
if __name__ == "__main__":
    main()