Mercurial > repos > galaxyp > fasta_merge_files_and_filter_unique_sequences
comparison fasta_merge_files_and_filter_unique_sequences.py @ 6:f546e7278f04 draft default tip
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
author | galaxyp |
---|---|
date | Mon, 23 Nov 2020 19:35:09 +0000 |
parents | 9ad0d336e5ed |
children |
comparison
equal
deleted
inserted
replaced
5:650d553c1fda | 6:f546e7278f04 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 import os | 2 import os |
3 import re | |
3 import sys | 4 import sys |
4 import re | 5 |
5 | 6 |
6 class Sequence: | 7 class Sequence: |
7 ''' Holds protein sequence information ''' | 8 ''' Holds protein sequence information ''' |
9 | |
8 def __init__(self): | 10 def __init__(self): |
9 self.header = "" | 11 self.header = "" |
10 self.accession = "" | 12 self.accession = "" |
11 self.sequence = "" | 13 self.sequence = "" |
12 | 14 |
15 | |
13 class FASTAReader: | 16 class FASTAReader: |
14 """ | 17 """ |
15 FASTA db iterator. Returns a single FASTA sequence object. | 18 FASTA db iterator. Returns a single FASTA sequence object. |
16 """ | 19 """ |
20 | |
17 def __init__(self, fasta_name, accession_parser): | 21 def __init__(self, fasta_name, accession_parser): |
18 self.fasta_file = open(fasta_name) | 22 self.fasta_file = open(fasta_name) |
19 self.accession_parser = accession_parser | 23 self.accession_parser = accession_parser |
20 | 24 |
21 def __iter__(self): | 25 def __iter__(self): |
29 raise StopIteration | 33 raise StopIteration |
30 if line[0] == '>': | 34 if line[0] == '>': |
31 break | 35 break |
32 | 36 |
33 seq = Sequence() | 37 seq = Sequence() |
34 seq.header = line.rstrip().replace('\n','').replace('\r','') | 38 seq.header = line.rstrip().replace('\n', '').replace('\r', '') |
35 | 39 |
36 m = re.search(self.accession_parser, seq.header) | 40 m = re.search(self.accession_parser, seq.header) |
37 if not m or len(m.groups()) < 1 or len(m.group(1)) == 0: | 41 if not m or len(m.groups()) < 1 or len(m.group(1)) == 0: |
38 sys.exit("Could not parse accession from '%s'" % seq.header) | 42 sys.exit("Could not parse accession from '%s'" % seq.header) |
39 seq.accession = m.group(1) | 43 seq.accession = m.group(1) |
40 | 44 |
41 while True: | 45 while True: |
42 tail = self.fasta_file.tell() | 46 tail = self.fasta_file.tell() |
43 line = self.fasta_file.readline() | 47 line = self.fasta_file.readline() |
44 if not line: | 48 if not line: |
45 break | 49 break |
46 if line[0] == '>': | 50 if line[0] == '>': |
47 self.fasta_file.seek(tail) | 51 self.fasta_file.seek(tail) |
48 break | 52 break |
49 seq.sequence = seq.sequence + line.rstrip().replace('\n','').replace('\r','') | 53 seq.sequence = seq.sequence + line.rstrip().replace('\n', '').replace('\r', '') |
50 return seq | 54 return seq |
51 | 55 |
52 # Python 2/3 compat | 56 # Python 2/3 compat |
53 next = __next__ | 57 next = __next__ |
54 | 58 |
64 unique_sequences = False | 68 unique_sequences = False |
65 else: | 69 else: |
66 sys.exit("2nd argument must be 'sequence' or 'accession'") | 70 sys.exit("2nd argument must be 'sequence' or 'accession'") |
67 | 71 |
68 accession_parser = sys.argv[3] | 72 accession_parser = sys.argv[3] |
69 for key, value in { '\'' :'__sq__', '\\' : '__backslash__' }.items(): | 73 for key, value in {'\'': '__sq__', '\\': '__backslash__'}.items(): |
70 accession_parser = accession_parser.replace(value, key) | 74 accession_parser = accession_parser.replace(value, key) |
71 | 75 |
72 for fasta_file in sys.argv[4:]: | 76 for fasta_file in sys.argv[4:]: |
73 print("Reading entries from '%s'" % fasta_file) | 77 print("Reading entries from '%s'" % fasta_file) |
74 fa_reader = FASTAReader(fasta_file, accession_parser) | 78 fa_reader = FASTAReader(fasta_file, accession_parser) |
75 for protein in fa_reader: | 79 for protein in fa_reader: |
93 out_file.write(os.linesep) | 97 out_file.write(os.linesep) |
94 out_file.write(protein.sequence) | 98 out_file.write(protein.sequence) |
95 out_file.write(os.linesep) | 99 out_file.write(os.linesep) |
96 out_file.close() | 100 out_file.close() |
97 | 101 |
102 | |
98 if __name__ == "__main__": | 103 if __name__ == "__main__": |
99 main() | 104 main() |