annotate BIO_FIX_TOPO.py @ 1:1bdd481d5c25 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:42:57 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
1 import Bio.GenBank
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
2
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
3
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
4 def record_end(self, content):
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
5 """Clean up when we've finished the record."""
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
6 # from Bio import Alphabet
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
7 # from Bio.Alphabet import IUPAC
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
8 from Bio.Seq import Seq, UnknownSeq
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
9
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
10 # Try and append the version number to the accession for the full id
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
11 if not self.data.id:
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
12 assert "accessions" not in self.data.annotations, self.data.annotations[
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
13 "accessions"
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
14 ]
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
15 self.data.id = self.data.name # Good fall back?
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
16 elif self.data.id.count(".") == 0:
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
17 try:
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
18 self.data.id += ".%i" % self.data.annotations["sequence_version"]
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
19 except KeyError:
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
20 pass
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
21
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
22 # add the sequence information
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
23 # first, determine the alphabet
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
24 # we default to an generic alphabet if we don't have a
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
25 # seq type or have strange sequence information.
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
26
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
27 # seq_alphabet = Alphabet.generic_alphabet
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
28
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
29 # now set the sequence
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
30 sequence = "".join(self._seq_data)
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
31
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
32 if (
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
33 self._expected_size is not None
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
34 and len(sequence) != 0
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
35 and self._expected_size != len(sequence)
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
36 ):
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
37 import warnings
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
38 from Bio import BiopythonParserWarning
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
39
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
40 warnings.warn(
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
41 "Expected sequence length %i, found %i (%s)."
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
42 % (self._expected_size, len(sequence), self.data.id),
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
43 BiopythonParserWarning,
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
44 )
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
45 """
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
46 if self._seq_type:
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
47 # mRNA is really also DNA, since it is actually cDNA
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
48 if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper():
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
49 seq_alphabet = IUPAC.ambiguous_dna
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
50 # are there ever really RNA sequences in GenBank?
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
51 elif "RNA" in self._seq_type.upper():
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
52 # Even for data which was from RNA, the sequence string
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
53 # is usually given as DNA (T not U). Bug 2408
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
54 if "T" in sequence and "U" not in sequence:
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
55 seq_alphabet = IUPAC.ambiguous_dna
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
56 else:
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
57 seq_alphabet = IUPAC.ambiguous_rna
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
58 elif (
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
59 "PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT"
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
60 ): # PRT is used in EMBL-bank for patents
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
61 seq_alphabet = IUPAC.protein # or extended protein?
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
62 # work around ugly GenBank records which have circular or
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
63 # linear but no indication of sequence type
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
64 elif self._seq_type in ["circular", "linear", "unspecified"]:
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
65 pass
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
66 # we have a bug if we get here
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
67 else:
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
68 raise ValueError(
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
69 "Could not determine alphabet for seq_type %s" % self._seq_type
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
70 )
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
71
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
72 # Also save the chomosome layout
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
73 if "circular" in self._seq_type.lower():
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
74 self.data.annotations["topology"] = "circular"
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
75 elif "linear" in self._seq_type.lower():
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
76 self.data.annotations["topology"] = "linear"
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
77 """
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
78 if not sequence and self.__expected_size:
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
79 self.data.seq = UnknownSeq(self._expected_size) # , seq_alphabet)
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
80 else:
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
81 self.data.seq = Seq(sequence) # , seq_alphabet)
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
82
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
83
1bdd481d5c25 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
cpt
parents:
diff changeset
84 Bio.GenBank._FeatureConsumer.record_end = record_end