Mercurial > repos > galaxy-australia > alphafold2
annotate validate_fasta.py @ 0:7ae9d78b06f5 draft
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
author | galaxy-australia |
---|---|
date | Fri, 28 Jan 2022 04:56:29 +0000 |
parents | |
children | 6c92e000d684 |
rev | line source |
---|---|
0
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
1 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
2 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
3 import argparse |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
4 from typing import List, TextIO |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
5 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
6 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
class Fasta:
    """A single FASTA record.

    Attributes:
        header: the header text exactly as passed to the constructor.
        aa_seq: the sequence string exactly as passed to the constructor.
    """

    def __init__(self, header_str: str, seq_str: str):
        # Both values are stored untouched; no parsing or validation here.
        self.header, self.aa_seq = header_str, seq_str
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
11 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
12 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
class FastaLoader:
    """Reads FASTA records from a file into a list of ``Fasta`` objects.

    Records accumulate in ``self.fastas``; calling ``load`` more than once on
    the same instance appends to the records already collected.
    """

    def __init__(self):
        self.fastas: List["Fasta"] = []

    def load(self, fasta_path: str) -> List["Fasta"]:
        """Parse ``fasta_path`` and return the accumulated record list.

        The input may be regular FASTA (header line starting with '>' followed
        by sequence lines) or just a bare sequence with no header at all.

        Blank lines are skipped rather than treated as end-of-file, so
        sequence lines or further records that follow an empty line are not
        silently dropped. Surrounding whitespace is stripped from every line.

        Args:
            fasta_path: path of the text file to read.

        Returns:
            ``self.fastas`` (the same list object held by this loader).
        """
        with open(fasta_path, 'r') as fp:
            header, first_chunk = self.interpret_first_line(fp)
            # Collect sequence pieces and join once per record.
            chunks = [first_chunk]

            for raw_line in fp:
                line = raw_line.strip()
                if not line:
                    # An empty line carries no data; keep reading.
                    continue
                if line.startswith('>'):
                    # A new header closes the record gathered so far.
                    self.update_fastas(header, ''.join(chunks))
                    header = line
                    chunks = []
                else:
                    chunks.append(line)

        # After reading the whole file the buffers may still hold a record.
        self.update_fastas(header, ''.join(chunks))
        return self.fastas

    def interpret_first_line(self, fp: TextIO):
        """Read one line from ``fp`` and classify it.

        Returns:
            A ``(header, sequence)`` tuple: ``(line, '')`` when the line
            starts with '>', otherwise ``('', line)``. The line is stripped
            of surrounding whitespace first.
        """
        line = fp.readline().strip()
        if line.startswith('>'):
            return line, ''
        return '', line

    def update_fastas(self, header: str, sequence: str):
        """Append a ``Fasta`` built from the buffers, if there is a sequence.

        Nothing is added when ``sequence`` is empty. When ``header`` is empty
        a generic ``>sequence_<n>`` header is generated, where ``<n>`` is the
        number of records collected so far.
        """
        if not sequence:
            return
        if not header:
            header = f'>sequence_{len(self.fastas)}'
        self.fastas.append(Fasta(header, sequence))
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
60 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
61 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
class FastaValidator:
    """Validates that the loaded input is one acceptable protein sequence.

    Args:
        fasta_list: records to validate; each must expose ``aa_seq``.
        min_length: smallest accepted sequence length (inclusive).
        max_length: largest accepted sequence length (inclusive).

    Every check raises ``Exception`` with a descriptive message on failure
    and returns ``None`` on success.
    """

    def __init__(self, fasta_list: List["Fasta"], min_length: int = 30,
                 max_length: int = 2000):
        self.fasta_list = fasta_list
        self.min_length = min_length
        self.max_length = max_length
        # Characters accepted by validate_alphabet (compared after upper()).
        self.iupac_characters = {
            'A', 'B', 'C', 'D', 'E', 'F', 'G',
            'H', 'I', 'K', 'L', 'M', 'N', 'P',
            'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
            'Y', 'Z', '-',
        }

    def validate(self):
        """Run the record-count, length and alphabet checks, in that order."""
        self.validate_num_seqs()
        self.validate_length()
        self.validate_alphabet()
        # 'X' residues are deliberately not rejected at the moment:
        # alphafold can throw an error if it doesn't like it.
        # validate_x() remains available but is not called here.

    def validate_num_seqs(self) -> None:
        """Require exactly one record in ``fasta_list``."""
        count = len(self.fasta_list)
        if count > 1:
            raise Exception(f'Error encountered validating fasta: More than 1 sequence detected ({count}). Please use single fasta sequence as input')
        if count == 0:
            raise Exception('Error encountered validating fasta: input file has no fasta sequences')

    def validate_length(self):
        """Require min_length <= len(sequence) <= max_length for record 0.

        The reported limits come from the configured attributes so the
        message always matches the (inclusive) check that was applied.
        """
        seq_len = len(self.fasta_list[0].aa_seq)
        if seq_len < self.min_length:
            raise Exception(f'Error encountered validating fasta: Sequence too short ({seq_len}aa). Must be >= {self.min_length}aa')
        if seq_len > self.max_length:
            raise Exception(f'Error encountered validating fasta: Sequence too long ({seq_len}aa). Must be <= {self.max_length}aa')

    def validate_alphabet(self):
        """Require every character of record 0 to be an accepted code.

        The comparison is case-insensitive. On failure the message reports
        the offending (upper-cased) character and its 0-based position.
        """
        fasta = self.fasta_list[0]
        for i, char in enumerate(fasta.aa_seq.upper()):
            if char not in self.iupac_characters:
                raise Exception(f'Error encountered validating fasta: Invalid amino acid found at pos {i}: {char}')

    def validate_x(self):
        """Reject record 0 if any residue is 'X' (case-insensitive).

        TODO check whether alphafold accepts X residues.
        """
        fasta = self.fasta_list[0]
        for i, char in enumerate(fasta.aa_seq.upper()):
            if char == 'X':
                raise Exception(f'Error encountered validating fasta: Unsupported aa code "X" found at pos {i}')
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
113 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
114 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
class FastaWriter:
    """Writes one FASTA record to disk with fixed-width sequence lines.

    Args:
        outfile: path of the file to create or overwrite.
        formatted_line_len: maximum number of residues per sequence line.
    """

    def __init__(self, outfile: str = 'alphafold.fasta',
                 formatted_line_len: int = 60) -> None:
        self.outfile = outfile
        self.formatted_line_len = formatted_line_len

    def write(self, fasta: "Fasta"):
        """Write ``fasta.header`` and the wrapped ``fasta.aa_seq`` to outfile.

        The formatted sequence already ends each line with a newline, so no
        extra newline is appended; the file does not end in a blank line.
        """
        body = self.format_sequence(fasta.aa_seq)
        with open(self.outfile, 'w') as fp:
            fp.write(fasta.header + '\n')
            fp.write(body)

    def format_sequence(self, aa_seq: str):
        """Return ``aa_seq`` split into newline-terminated lines.

        Each line holds at most ``formatted_line_len`` characters; an empty
        sequence yields an empty string.
        """
        width = self.formatted_line_len
        return ''.join(
            aa_seq[start:start + width] + '\n'
            for start in range(0, len(aa_seq), width)
        )
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
132 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
133 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
def main():
    """Load the input FASTA, validate it, and write the first record out."""
    cli_args = parse_args()

    # load fasta file
    records = FastaLoader().load(cli_args.input_fasta)

    # validate (raises on the first failed check)
    FastaValidator(records).validate()

    # write cleaned version
    FastaWriter().write(records[0])
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
147 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
148 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
def parse_args() -> argparse.Namespace:
    """Parse the process command line.

    Exactly one positional argument is accepted: ``input_fasta``, the
    input FASTA file, kept as a string.

    Returns:
        The populated namespace, with the value available as
        ``input_fasta``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("input_fasta", type=str, help="input fasta file")
    namespace = cli.parse_args()
    return namespace
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
157 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
158 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
159 |
7ae9d78b06f5
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty"
galaxy-australia
parents:
diff
changeset
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()