Mercurial > repos > galaxy-australia > alphafold2
comparison scripts/validate_fasta.py @ 23:2891385d6ace draft
planemo upload for repository https://github.com/usegalaxy-au/tools-au commit b347c6ccc82b14fcbff360b3357050d1d43e3ef5-dirty
| author | galaxy-australia |
|---|---|
| date | Wed, 16 Apr 2025 05:46:58 +0000 |
| parents | 2f7702fd0a4c |
| children | 31f648b7555a |
comparison
equal
deleted
inserted
replaced
| 22:3f188450ca4f | 23:2891385d6ace |
|---|---|
| 10 | 10 |
| 11 | 11 |
| 12 class Fasta: | 12 class Fasta: |
| 13 def __init__(self, header_str: str, seq_str: str): | 13 def __init__(self, header_str: str, seq_str: str): |
| 14 self.header = header_str | 14 self.header = header_str |
| 15 self.aa_seq = seq_str | 15 self.sequence = seq_str |
| 16 | 16 |
| 17 | 17 |
| 18 class FastaLoader: | 18 class FastaLoader: |
| 19 def __init__(self, fasta_path: str): | 19 def __init__(self, fasta_path: str): |
| 20 """Initialize from FASTA file.""" | 20 """Initialize from FASTA file.""" |
| 138 | 138 |
| 139 def validate_length(self): | 139 def validate_length(self): |
| 140 """Confirm whether sequence length is valid.""" | 140 """Confirm whether sequence length is valid.""" |
| 141 fasta = self.fasta_list[0] | 141 fasta = self.fasta_list[0] |
| 142 if self.min_length: | 142 if self.min_length: |
| 143 if len(fasta.aa_seq) < self.min_length: | 143 if len(fasta.sequence) < self.min_length: |
| 144 raise ValueError( | 144 raise ValueError( |
| 145 'Error encountered validating FASTA:\n Sequence too short' | 145 'Error encountered validating FASTA:\n Sequence too short' |
| 146 f' ({len(fasta.aa_seq)}AA).' | 146 f' ({len(fasta.sequence)}AA).' |
| 147 f' Minimum length is {self.min_length}AA.') | 147 f' Minimum length is {self.min_length}AA.') |
| 148 if self.max_length: | 148 if self.max_length: |
| 149 if len(fasta.aa_seq) > self.max_length: | 149 if len(fasta.sequence) > self.max_length: |
| 150 raise ValueError( | 150 raise ValueError( |
| 151 'Error encountered validating FASTA:\n' | 151 'Error encountered validating FASTA:\n' |
| 152 f' Sequence too long ({len(fasta.aa_seq)}AA).' | 152 f' Sequence too long ({len(fasta.sequence)}AA).' |
| 153 f' Maximum length is {self.max_length}AA.') | 153 f' Maximum length is {self.max_length}AA.') |
| 154 | 154 |
| 155 def validate_alphabet(self): | 155 def validate_alphabet(self): |
| 156 """Confirm whether the sequence conforms to IUPAC codes. | 156 """Confirm whether the sequence conforms to IUPAC codes. |
| 157 | 157 |
| 158 If not, report the offending character and its position. | 158 If not, report the offending character and its position. |
| 159 """ | 159 """ |
| 160 fasta = self.fasta_list[0] | 160 fasta = self.fasta_list[0] |
| 161 for i, char in enumerate(fasta.aa_seq.upper()): | 161 for i, char in enumerate(fasta.sequence.upper()): |
| 162 if char not in self.iupac_characters: | 162 if char not in self.iupac_characters: |
| 163 raise ValueError( | 163 raise ValueError( |
| 164 'Error encountered validating FASTA:\n Invalid amino acid' | 164 'Error encountered validating FASTA:\n Invalid amino acid' |
| 165 f' found at pos {i}: "{char}"') | 165 f' found at pos {i}: "{char}"') |
| 166 | 166 |
| 167 def validate_x(self): | 167 def validate_x(self): |
| 168 """Check for X bases.""" | 168 """Check for X bases.""" |
| 169 fasta = self.fasta_list[0] | 169 fasta = self.fasta_list[0] |
| 170 for i, char in enumerate(fasta.aa_seq.upper()): | 170 for i, char in enumerate(fasta.sequence.upper()): |
| 171 if char == 'X': | 171 if char == 'X': |
| 172 raise ValueError( | 172 raise ValueError( |
| 173 'Error encountered validating FASTA:\n Unsupported AA code' | 173 'Error encountered validating FASTA:\n Unsupported AA code' |
| 174 f' "X" found at pos {i}') | 174 f' "X" found at pos {i}') |
| 175 | 175 |
| 178 def __init__(self) -> None: | 178 def __init__(self) -> None: |
| 179 self.line_wrap = 60 | 179 self.line_wrap = 60 |
| 180 | 180 |
| 181 def write(self, fasta: Fasta): | 181 def write(self, fasta: Fasta): |
| 182 header = fasta.header | 182 header = fasta.header |
| 183 seq = self.format_sequence(fasta.aa_seq) | 183 seq = self.format_sequence(fasta.sequence) |
| 184 sys.stdout.write(header + '\n') | 184 sys.stdout.write(header + '\n') |
| 185 sys.stdout.write(seq) | 185 sys.stdout.write(seq) |
| 186 | 186 |
| 187 def format_sequence(self, aa_seq: str): | 187 def format_sequence(self, sequence: str): |
| 188 formatted_seq = '' | 188 formatted_seq = '' |
| 189 for i in range(0, len(aa_seq), self.line_wrap): | 189 for i in range(0, len(sequence), self.line_wrap): |
| 190 formatted_seq += aa_seq[i: i + self.line_wrap] + '\n' | 190 formatted_seq += sequence[i: i + self.line_wrap] + '\n' |
| 191 return formatted_seq.upper() | 191 return formatted_seq.upper() |
| 192 | 192 |
| 193 | 193 |
| 194 def main(): | 194 def main(): |
| 195 # load fasta file | 195 # load fasta file |
| 212 fw.write(fas) | 212 fw.write(fas) |
| 213 | 213 |
| 214 sys.stderr.write("Validated FASTA sequence(s):\n\n") | 214 sys.stderr.write("Validated FASTA sequence(s):\n\n") |
| 215 for fas in clean_fastas: | 215 for fas in clean_fastas: |
| 216 sys.stderr.write(fas.header + '\n') | 216 sys.stderr.write(fas.header + '\n') |
| 217 sys.stderr.write(fas.aa_seq + '\n\n') | 217 sys.stderr.write(fas.sequence + '\n\n') |
| 218 | 218 |
| 219 except ValueError as exc: | 219 except ValueError as exc: |
| 220 sys.stderr.write(f"{exc}\n\n") | 220 sys.stderr.write(f"{exc}\n\n") |
| 221 raise exc | 221 raise exc |
| 222 | 222 |
