comparison dna_features_viewer/biotools.py @ 1:e923c686ead9 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:45:31 +0000
parents
children b79e98299a78
comparison
equal deleted inserted replaced
0:621754dd31f8 1:e923c686ead9
1 import textwrap
2 from Bio.Seq import Seq
3 from Bio.SeqFeature import SeqFeature, FeatureLocation
4 from Bio.PDB.Polypeptide import aa1, aa3
5 from Bio import SeqIO
6
7 try:
8 from BCBio import GFF
9 except ImportError:
10
11 class GFF:
12 def parse(*a):
13 """Not available. Please install bcbio-gff."""
14 raise ImportError("Please install the bcbio-gff library to parse GFF data")
15
16
17 def complement(dna_sequence):
18 """Return the complement of the DNA sequence.
19
20 For instance ``complement("ATGCCG")`` returns ``"TACGGC"``.
21
22 Uses BioPython for speed.
23 """
24 return str(Seq(dna_sequence).complement())
25
26
27 def reverse_complement(sequence):
28 """Return the reverse-complement of the DNA sequence.
29
30 For instance ``complement("ATGCCG")`` returns ``"GCCGTA"``.
31
32 Uses BioPython for speed.
33 """
34 return complement(sequence)[::-1]
35
36
37 aa_short_to_long_form_dict = {
38 _aa1: _aa3[0] + _aa3[1:].lower() for (_aa1, _aa3) in zip(aa1 + "*", aa3 + ["*"])
39 }
40
41
42 def translate(dna_sequence, long_form=False):
43 """Translate the DNA sequence into an amino-acids sequence MLKYQT...
44
45 If long_form is true, a list of 3-letter amino acid representations
46 is returned instead (['Ala', 'Ser', ...]).
47 """
48 result = str(Seq(dna_sequence).translate())
49 if long_form:
50 result = [aa_short_to_long_form_dict[aa] for aa in result]
51 return result
52
53
54 def extract_graphical_translation(sequence, location, long_form=False):
55 """Return a string of the "graphical" translation of a sequence's subsegment.
56
57 Here "graphical" means that the amino acid sequence is always given
58 left-to-right, as it will appear under the sequence in the plot. This matters
59 when the location is on the -1 strand. In this case, the amino-acids are
60 determined by (part of) the reverse-complement of the sequence, however
61 the sequence returned will be the mirror of the translated sequence, as
62 this is the left-to-right order in which the codons corresponding to the
63 amino-acids appear in the sequence.
64
65 Parameters
66 ----------
67 sequence
68 An "ATGC" string.
69
70 location
71 Either (start, end) or (start, end, strand), with strand in (0, 1, -1).
72
73 long_form
74 if True, a list of 3-letter amino acid representations is returned instead
75 (['Ala', 'Ser', ...]).
76
77 """
78 if len(location) == 3:
79 start, end, strand = location
80 else:
81 start, end = location
82 strand = 1
83 subsequence = sequence[start:end]
84 if strand == -1:
85 subsequence = reverse_complement(subsequence)
86 translation = translate(subsequence, long_form=long_form)
87 if strand == -1:
88 translation = translation[::-1]
89 return translation
90
91
92 def load_record(path):
93 """Load a Genbank file"""
94 if isinstance(path, str):
95 # Input is a file path
96 if path.lower().endswith(".gff"):
97 return list(GFF.parse(path))[0]
98 else:
99 return SeqIO.read(path, "genbank")
100 else:
101 # Input is a file-like object
102 try:
103 return SeqIO.read(path, "genbank")
104 except:
105 path.seek(0)
106 return list(GFF.parse(path))[0]
107
108
109 def annotate_biopython_record(
110 seqrecord, location="full", feature_type="misc_feature", margin=0, **qualifiers
111 ):
112 """Add a feature to a Biopython SeqRecord.
113
114 Parameters
115 ----------
116
117 seqrecord
118 The biopython seqrecord to be annotated.
119
120 location
121 Either (start, end) or (start, end, strand). (strand defaults to +1)
122
123 feature_type
124 The type associated with the feature
125
126 margin
127 Number of extra bases added on each side of the given location.
128
129 qualifiers
130 Dictionnary that will be the Biopython feature's `qualifiers` attribute.
131 """
132 if location == "full":
133 location = (margin, len(seqrecord) - margin)
134
135 strand = location[2] if len(location) == 3 else 1
136 seqrecord.features.append(
137 SeqFeature(
138 FeatureLocation(location[0], location[1], strand),
139 qualifiers=qualifiers,
140 type=feature_type,
141 )
142 )
143
144
145 def find_narrowest_text_wrap(text, max_line_length):
146 """Wrap the text into a multi-line text minimizing the longest line length.
147
148 This is done by first wrapping the text using max_line_length, then
149 attempt new wraps by iteratively decreasing the line_length, as long as the
150 number of lines stays the same as with max_line_length.
151 """
152 narrowest_wrap = textwrap.wrap(text, max_line_length)
153 narrowest_width = max([len(l) for l in narrowest_wrap])
154 for line_length in range(max_line_length - 1, 0, -1):
155 wrap = textwrap.wrap(text, line_length)
156 if len(wrap) <= len(narrowest_wrap):
157 width = max([len(l) for l in wrap])
158 if width < narrowest_width:
159 narrowest_wrap = wrap
160 narrowest_width = width
161 else:
162 break
163 return "\n".join(narrowest_wrap)