comparison fasta_merge_files_and_filter_unique_sequences.py @ 6:f546e7278f04 draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 0ce1979ec9cf851f85ad74c78a3cc88826a2f070"
author galaxyp
date Mon, 23 Nov 2020 19:35:09 +0000
parents 9ad0d336e5ed
children
comparison
equal deleted inserted replaced
5:650d553c1fda 6:f546e7278f04
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 import os 2 import os
3 import re
3 import sys 4 import sys
4 import re 5
5 6
6 class Sequence: 7 class Sequence:
7 ''' Holds protein sequence information ''' 8 ''' Holds protein sequence information '''
9
8 def __init__(self): 10 def __init__(self):
9 self.header = "" 11 self.header = ""
10 self.accession = "" 12 self.accession = ""
11 self.sequence = "" 13 self.sequence = ""
12 14
15
13 class FASTAReader: 16 class FASTAReader:
14 """ 17 """
15 FASTA db iterator. Returns a single FASTA sequence object. 18 FASTA db iterator. Returns a single FASTA sequence object.
16 """ 19 """
20
17 def __init__(self, fasta_name, accession_parser): 21 def __init__(self, fasta_name, accession_parser):
18 self.fasta_file = open(fasta_name) 22 self.fasta_file = open(fasta_name)
19 self.accession_parser = accession_parser 23 self.accession_parser = accession_parser
20 24
21 def __iter__(self): 25 def __iter__(self):
29 raise StopIteration 33 raise StopIteration
30 if line[0] == '>': 34 if line[0] == '>':
31 break 35 break
32 36
33 seq = Sequence() 37 seq = Sequence()
34 seq.header = line.rstrip().replace('\n','').replace('\r','') 38 seq.header = line.rstrip().replace('\n', '').replace('\r', '')
35 39
36 m = re.search(self.accession_parser, seq.header) 40 m = re.search(self.accession_parser, seq.header)
37 if not m or len(m.groups()) < 1 or len(m.group(1)) == 0: 41 if not m or len(m.groups()) < 1 or len(m.group(1)) == 0:
38 sys.exit("Could not parse accession from '%s'" % seq.header) 42 sys.exit("Could not parse accession from '%s'" % seq.header)
39 seq.accession = m.group(1) 43 seq.accession = m.group(1)
40 44
41 while True: 45 while True:
42 tail = self.fasta_file.tell() 46 tail = self.fasta_file.tell()
43 line = self.fasta_file.readline() 47 line = self.fasta_file.readline()
44 if not line: 48 if not line:
45 break 49 break
46 if line[0] == '>': 50 if line[0] == '>':
47 self.fasta_file.seek(tail) 51 self.fasta_file.seek(tail)
48 break 52 break
49 seq.sequence = seq.sequence + line.rstrip().replace('\n','').replace('\r','') 53 seq.sequence = seq.sequence + line.rstrip().replace('\n', '').replace('\r', '')
50 return seq 54 return seq
51 55
52 # Python 2/3 compat 56 # Python 2/3 compat
53 next = __next__ 57 next = __next__
54 58
64 unique_sequences = False 68 unique_sequences = False
65 else: 69 else:
66 sys.exit("2nd argument must be 'sequence' or 'accession'") 70 sys.exit("2nd argument must be 'sequence' or 'accession'")
67 71
68 accession_parser = sys.argv[3] 72 accession_parser = sys.argv[3]
69 for key, value in { '\'' :'__sq__', '\\' : '__backslash__' }.items(): 73 for key, value in {'\'': '__sq__', '\\': '__backslash__'}.items():
70 accession_parser = accession_parser.replace(value, key) 74 accession_parser = accession_parser.replace(value, key)
71 75
72 for fasta_file in sys.argv[4:]: 76 for fasta_file in sys.argv[4:]:
73 print("Reading entries from '%s'" % fasta_file) 77 print("Reading entries from '%s'" % fasta_file)
74 fa_reader = FASTAReader(fasta_file, accession_parser) 78 fa_reader = FASTAReader(fasta_file, accession_parser)
75 for protein in fa_reader: 79 for protein in fa_reader:
93 out_file.write(os.linesep) 97 out_file.write(os.linesep)
94 out_file.write(protein.sequence) 98 out_file.write(protein.sequence)
95 out_file.write(os.linesep) 99 out_file.write(os.linesep)
96 out_file.close() 100 out_file.close()
97 101
102
98 if __name__ == "__main__": 103 if __name__ == "__main__":
99 main() 104 main()