0
|
1 import Bio.GenBank
|
|
2
|
|
3
|
|
def record_end(self, content):
    """Clean up when we've finished the record.

    Intended as a replacement for ``Bio.GenBank._FeatureConsumer.record_end``
    that builds the record's sequence without assigning an alphabet.

    The record held in ``self.data`` is finalised in two steps:

    1. Settle the id. If no id was set, fall back to ``self.data.name``.
       Otherwise, if the id contains no ``"."``, append
       ``.<sequence_version>`` when that annotation is present.
    2. Join the collected sequence pieces in ``self._seq_data`` and store
       them as ``self.data.seq``. When nothing was collected but a non-zero
       expected size is known, an ``UnknownSeq`` of that length is stored
       instead.

    Args:
        content: Unused; kept so the signature matches the callback this
            function replaces.

    Warns:
        BiopythonParserWarning: if a non-empty sequence was collected and
            its length differs from ``self._expected_size``.
    """
    from Bio.Seq import Seq, UnknownSeq

    # Try and append the version number to the accession for the full id
    if not self.data.id:
        # Without an id there should be no accession list either; surface
        # the unexpected accessions in the assertion message if there is.
        assert "accessions" not in self.data.annotations, self.data.annotations[
            "accessions"
        ]
        self.data.id = self.data.name  # Good fall back?
    elif self.data.id.count(".") == 0:
        try:
            self.data.id += ".%i" % self.data.annotations["sequence_version"]
        except KeyError:
            # No version annotation: leave the bare accession as the id.
            pass

    # Alphabet selection is deliberately skipped here: the sequence objects
    # below are created without an alphabet argument.  The topology
    # annotation that used to be derived alongside the alphabet is not set
    # by this function either.
    sequence = "".join(self._seq_data)

    if (
        self._expected_size is not None
        and len(sequence) != 0
        and self._expected_size != len(sequence)
    ):
        import warnings
        from Bio import BiopythonParserWarning

        warnings.warn(
            "Expected sequence length %i, found %i (%s)."
            % (self._expected_size, len(sequence), self.data.id),
            BiopythonParserWarning,
        )

    # This function is defined at module level, so a double-underscore
    # attribute name would NOT be name-mangled and would fail with
    # AttributeError; read the same ``_expected_size`` attribute that the
    # length check above uses.
    if not sequence and self._expected_size:
        self.data.seq = UnknownSeq(self._expected_size)
    else:
        self.data.seq = Seq(sequence)
|
|
83
|
|
84
|
|
# Monkey-patch Biopython's GenBank feature consumer so that it uses the
# module-level ``record_end`` defined in this file.
setattr(Bio.GenBank._FeatureConsumer, "record_end", record_end)
|