Mercurial > repos > galaxyp > filter_by_fasta_ids
comparison filter_by_fasta_ids.py @ 4:cd22452edec2 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 5e7097242e584763d3a6d86a824ee933500667af
author | galaxyp |
---|---|
date | Thu, 18 Apr 2019 02:45:18 -0400 |
parents | 3c623e81be77 |
children | dff7df6fcab5 |
comparison
equal
deleted
inserted
replaced
3:3c623e81be77 | 4:cd22452edec2 |
---|---|
39 sequence_parts.append(line.rstrip()) | 39 sequence_parts.append(line.rstrip()) |
40 line = fasta_file.readline() | 40 line = fasta_file.readline() |
41 yield Sequence(header, sequence_parts) | 41 yield Sequence(header, sequence_parts) |
42 | 42 |
43 | 43 |
44 def target_match(targets, search_entry, pattern='>([^| ]+)'): | 44 def target_match(targets, search_entry, pattern): |
45 ''' Matches ''' | 45 ''' Matches ''' |
46 search_entry = search_entry.upper() | 46 search_entry = search_entry.upper() |
47 m = re.search(pattern,search_entry) | 47 m = pattern.search(search_entry) |
48 if m: | 48 if m: |
49 target = m.group(len(m.groups())) | 49 target = m.group(len(m.groups())) |
50 if target in targets: | 50 if target in targets: |
51 return target | 51 return target |
52 else: | 52 else: |
53 print( 'No ID match: %s' % search_entry, file=sys.stdout) | 53 print('No ID match: %s' % search_entry, file=sys.stdout) |
54 return None | 54 return None |
55 | 55 |
56 | 56 |
57 def main(): | 57 def main(): |
58 ''' the main function''' | |
59 | |
60 parser = argparse.ArgumentParser() | 58 parser = argparse.ArgumentParser() |
61 parser.add_argument('-i', required=True, help='Path to input FASTA file') | 59 parser.add_argument('-i', required=True, help='Path to input FASTA file') |
62 parser.add_argument('-o', required=True, help='Path to output FASTA file') | 60 parser.add_argument('-o', required=True, help='Path to output FASTA file') |
63 parser.add_argument('-d', help='Path to discarded entries file') | 61 parser.add_argument('-d', help='Path to discarded entries file') |
64 header_criteria = parser.add_mutually_exclusive_group() | 62 header_criteria = parser.add_mutually_exclusive_group() |
65 header_criteria.add_argument('--id_list', help='Path to the ID list file') | 63 header_criteria.add_argument('--id_list', help='Path to the ID list file') |
66 parser.add_argument('--pattern', help='regex search pattern for ID in Fasta entry') | 64 parser.add_argument('--pattern', help='regex search pattern for ID in FASTA entry') |
67 header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match') | 65 header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match') |
68 sequence_criteria = parser.add_mutually_exclusive_group() | 66 sequence_criteria = parser.add_mutually_exclusive_group() |
69 sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length') | 67 sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length') |
70 sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the sequence should match') | 68 sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the sequence should match') |
71 parser.add_argument('--max_length', type=int, help='Maximum sequence length') | 69 parser.add_argument('--max_length', type=int, help='Maximum sequence length') |
72 parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') | 70 parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') |
73 options = parser.parse_args() | 71 options = parser.parse_args() |
74 | 72 |
75 | |
76 if options.pattern: | 73 if options.pattern: |
77 pattern = options.pattern | 74 if not re.match('^.*[(](?![?]:).*[)].*$', options.pattern): |
78 if not re.match('^.*[(](?![?]:).*[)].*$',pattern): | 75 sys.exit('pattern: "%s" did not include capture group "()" in regex ' % options.pattern) |
79 print('pattern: "%s" did not include capture group "()" in regex ' % pattern) | 76 pattern = re.compile(options.pattern) |
80 exit(1) | 77 |
81 | |
82 if options.min_length is not None and options.max_length is None: | 78 if options.min_length is not None and options.max_length is None: |
83 options.max_length = sys.maxsize | 79 options.max_length = sys.maxsize |
84 if options.header_regexp: | 80 if options.header_regexp: |
85 regexp = re.compile(options.header_regexp) | 81 header_regexp = re.compile(options.header_regexp) |
86 if options.sequence_regexp: | 82 if options.sequence_regexp: |
87 regexp = re.compile(options.sequence_regexp) | 83 sequence_regexp = re.compile(options.sequence_regexp) |
88 | 84 |
89 work_summary = {'found': 0} | 85 work_summary = {'found': 0, 'discarded': 0} |
90 | 86 |
91 if options.dedup: | 87 if options.dedup: |
92 used_sequences = set() | 88 used_sequences = set() |
93 work_summary['duplicates'] = 0 | 89 work_summary['duplicates'] = 0 |
94 | 90 |
95 if options.id_list: | 91 if options.id_list: |
96 targets = [] | 92 targets = [] |
97 with open(options.id_list) as f_target: | 93 with open(options.id_list) as f_target: |
98 for line in f_target.readlines(): | 94 for line in f_target: |
99 targets.append(line.strip().upper()) | 95 targets.append(line.strip().upper()) |
100 work_summary['wanted'] = len(targets) | 96 work_summary['wanted'] = len(targets) |
101 | 97 |
102 homd_db = FASTAReader_gen(options.i) | 98 homd_db = FASTAReader_gen(options.i) |
103 if options.d: | 99 if options.d: |
105 | 101 |
106 with open(options.o, "w") as output: | 102 with open(options.o, "w") as output: |
107 for entry in homd_db: | 103 for entry in homd_db: |
108 print_entry = True | 104 print_entry = True |
109 if options.id_list: | 105 if options.id_list: |
110 target_matched_results = target_match(targets, entry.header, pattern=pattern) | 106 target_matched_results = target_match(targets, entry.header, pattern) |
111 if target_matched_results: | 107 if target_matched_results: |
112 work_summary['found'] += 1 | |
113 targets.remove(target_matched_results) | 108 targets.remove(target_matched_results) |
114 else: | 109 else: |
115 print_entry = False | 110 print_entry = False |
116 | |
117 elif options.header_regexp: | 111 elif options.header_regexp: |
118 if regexp.search(entry.header) is None: | 112 if header_regexp.search(entry.header) is None: |
119 print_entry = False | 113 print_entry = False |
120 if options.min_length is not None: | 114 if options.min_length is not None: |
121 sequence_length = len(entry.sequence) | 115 sequence_length = len(entry.sequence) |
122 if not(options.min_length <= sequence_length <= options.max_length): | 116 if not(options.min_length <= sequence_length <= options.max_length): |
123 print_entry = False | 117 print_entry = False |
124 elif options.sequence_regexp: | 118 elif options.sequence_regexp: |
125 if regexp.search(entry.sequence) is None: | 119 if sequence_regexp.search(entry.sequence) is None: |
126 print_entry = False | 120 print_entry = False |
127 if print_entry: | 121 if print_entry: |
128 if options.dedup: | 122 if options.dedup: |
129 if entry.sequence in used_sequences: | 123 if entry.sequence in used_sequences: |
130 work_summary['duplicates'] += 1 | 124 work_summary['duplicates'] += 1 |
131 continue | 125 continue |
132 else: | 126 else: |
133 used_sequences.add(entry.sequence) | 127 used_sequences.add(entry.sequence) |
128 work_summary['found'] += 1 | |
134 entry.print(output) | 129 entry.print(output) |
135 elif options.d: | 130 else: |
136 entry.print(discarded) | 131 work_summary['discarded'] += 1 |
132 if options.d: | |
133 entry.print(discarded) | |
134 | |
135 if options.d: | |
136 discarded.close() | |
137 | 137 |
138 for parm, count in work_summary.items(): | 138 for parm, count in work_summary.items(): |
139 print('%s ==> %d' % (parm, count)) | 139 print('%s ==> %d' % (parm, count)) |
140 | 140 |
141 | 141 |