Mercurial > repos > cpt > cpt_gbk_renumber
comparison cpt_renumber_gbk/BIO_FIX_TOPO.py @ 0:8cac332dbc77 draft default tip
Uploaded
author | cpt |
---|---|
date | Fri, 17 Jun 2022 13:13:47 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8cac332dbc77 |
---|---|
1 import Bio.GenBank | |
2 | |
3 | |
def record_end(self, content):
    """Clean up when we've finished the record.

    Finalises ``self.data.id`` and builds ``self.data.seq`` from the sequence
    chunks collected in ``self._seq_data``.

    This implementation does not inspect ``self._seq_type``: no alphabet is
    selected, no ``ValueError`` is raised for unrecognised sequence types, and
    no ``"topology"`` annotation is written here.

    Args:
        content: Unused by this implementation.

    Raises:
        AssertionError: If the record has no id but already carries an
            ``"accessions"`` annotation.
    """
    import warnings

    from Bio import BiopythonParserWarning
    from Bio.Seq import Seq

    # Some Biopython releases no longer ship UnknownSeq; on those,
    # Seq(None, length=n) is used for a sequence of known length but
    # unknown content.
    try:
        from Bio.Seq import UnknownSeq
    except ImportError:
        UnknownSeq = None

    record = self.data

    # Try and append the version number to the accession for the full id.
    if not record.id:
        assert "accessions" not in record.annotations, record.annotations[
            "accessions"
        ]
        record.id = record.name  # Good fall back?
    elif "." not in record.id:
        try:
            record.id += ".%i" % record.annotations["sequence_version"]
        except KeyError:
            # No version information available; keep the bare accession.
            pass

    # Now set the sequence.
    sequence = "".join(self._seq_data)
    expected_size = self._expected_size

    # Only warn when sequence data is present and its length disagrees with
    # the expected size; an empty sequence is handled separately below.
    if expected_size is not None and sequence and expected_size != len(sequence):
        warnings.warn(
            "Expected sequence length %i, found %i (%s)."
            % (expected_size, len(sequence), record.id),
            BiopythonParserWarning,
        )

    # This function is defined at module level, so a double-underscore
    # attribute would not be name-mangled; read ``_expected_size`` directly.
    if not sequence and expected_size:
        # No residues were given but the length is known: store a
        # placeholder sequence of that length.
        if UnknownSeq is not None:
            record.seq = UnknownSeq(expected_size)
        else:
            record.seq = Seq(None, length=expected_size)
    else:
        record.seq = Seq(sequence)
84 | |
# Install the replacement: from here on, Bio.GenBank._FeatureConsumer uses the
# record_end function defined in this module.
setattr(Bio.GenBank._FeatureConsumer, "record_end", record_end)