Mercurial > repos > cpt > cpt_linear_genome_plot
comparison dna_features_viewer/biotools.py @ 1:e923c686ead9 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:45:31 +0000 |
parents | |
children | b79e98299a78 |
comparison
equal
deleted
inserted
replaced
0:621754dd31f8 | 1:e923c686ead9 |
---|---|
1 import textwrap | |
2 from Bio.Seq import Seq | |
3 from Bio.SeqFeature import SeqFeature, FeatureLocation | |
4 from Bio.PDB.Polypeptide import aa1, aa3 | |
5 from Bio import SeqIO | |
6 | |
7 try: | |
8 from BCBio import GFF | |
9 except ImportError: | |
10 | |
11 class GFF: | |
12 def parse(*a): | |
13 """Not available. Please install bcbio-gff.""" | |
14 raise ImportError("Please install the bcbio-gff library to parse GFF data") | |
15 | |
16 | |
17 def complement(dna_sequence): | |
18 """Return the complement of the DNA sequence. | |
19 | |
20 For instance ``complement("ATGCCG")`` returns ``"TACGGC"``. | |
21 | |
22 Uses BioPython for speed. | |
23 """ | |
24 return str(Seq(dna_sequence).complement()) | |
25 | |
26 | |
27 def reverse_complement(sequence): | |
28 """Return the reverse-complement of the DNA sequence. | |
29 | |
30 For instance ``complement("ATGCCG")`` returns ``"GCCGTA"``. | |
31 | |
32 Uses BioPython for speed. | |
33 """ | |
34 return complement(sequence)[::-1] | |
35 | |
36 | |
37 aa_short_to_long_form_dict = { | |
38 _aa1: _aa3[0] + _aa3[1:].lower() for (_aa1, _aa3) in zip(aa1 + "*", aa3 + ["*"]) | |
39 } | |
40 | |
41 | |
42 def translate(dna_sequence, long_form=False): | |
43 """Translate the DNA sequence into an amino-acids sequence MLKYQT... | |
44 | |
45 If long_form is true, a list of 3-letter amino acid representations | |
46 is returned instead (['Ala', 'Ser', ...]). | |
47 """ | |
48 result = str(Seq(dna_sequence).translate()) | |
49 if long_form: | |
50 result = [aa_short_to_long_form_dict[aa] for aa in result] | |
51 return result | |
52 | |
53 | |
54 def extract_graphical_translation(sequence, location, long_form=False): | |
55 """Return a string of the "graphical" translation of a sequence's subsegment. | |
56 | |
57 Here "graphical" means that the amino acid sequence is always given | |
58 left-to-right, as it will appear under the sequence in the plot. This matters | |
59 when the location is on the -1 strand. In this case, the amino-acids are | |
60 determined by (part of) the reverse-complement of the sequence, however | |
61 the sequence returned will be the mirror of the translated sequence, as | |
62 this is the left-to-right order in which the codons corresponding to the | |
63 amino-acids appear in the sequence. | |
64 | |
65 Parameters | |
66 ---------- | |
67 sequence | |
68 An "ATGC" string. | |
69 | |
70 location | |
71 Either (start, end) or (start, end, strand), with strand in (0, 1, -1). | |
72 | |
73 long_form | |
74 if True, a list of 3-letter amino acid representations is returned instead | |
75 (['Ala', 'Ser', ...]). | |
76 | |
77 """ | |
78 if len(location) == 3: | |
79 start, end, strand = location | |
80 else: | |
81 start, end = location | |
82 strand = 1 | |
83 subsequence = sequence[start:end] | |
84 if strand == -1: | |
85 subsequence = reverse_complement(subsequence) | |
86 translation = translate(subsequence, long_form=long_form) | |
87 if strand == -1: | |
88 translation = translation[::-1] | |
89 return translation | |
90 | |
91 | |
92 def load_record(path): | |
93 """Load a Genbank file""" | |
94 if isinstance(path, str): | |
95 # Input is a file path | |
96 if path.lower().endswith(".gff"): | |
97 return list(GFF.parse(path))[0] | |
98 else: | |
99 return SeqIO.read(path, "genbank") | |
100 else: | |
101 # Input is a file-like object | |
102 try: | |
103 return SeqIO.read(path, "genbank") | |
104 except: | |
105 path.seek(0) | |
106 return list(GFF.parse(path))[0] | |
107 | |
108 | |
109 def annotate_biopython_record( | |
110 seqrecord, location="full", feature_type="misc_feature", margin=0, **qualifiers | |
111 ): | |
112 """Add a feature to a Biopython SeqRecord. | |
113 | |
114 Parameters | |
115 ---------- | |
116 | |
117 seqrecord | |
118 The biopython seqrecord to be annotated. | |
119 | |
120 location | |
121 Either (start, end) or (start, end, strand). (strand defaults to +1) | |
122 | |
123 feature_type | |
124 The type associated with the feature | |
125 | |
126 margin | |
127 Number of extra bases added on each side of the given location. | |
128 | |
129 qualifiers | |
130 Dictionnary that will be the Biopython feature's `qualifiers` attribute. | |
131 """ | |
132 if location == "full": | |
133 location = (margin, len(seqrecord) - margin) | |
134 | |
135 strand = location[2] if len(location) == 3 else 1 | |
136 seqrecord.features.append( | |
137 SeqFeature( | |
138 FeatureLocation(location[0], location[1], strand), | |
139 qualifiers=qualifiers, | |
140 type=feature_type, | |
141 ) | |
142 ) | |
143 | |
144 | |
145 def find_narrowest_text_wrap(text, max_line_length): | |
146 """Wrap the text into a multi-line text minimizing the longest line length. | |
147 | |
148 This is done by first wrapping the text using max_line_length, then | |
149 attempt new wraps by iteratively decreasing the line_length, as long as the | |
150 number of lines stays the same as with max_line_length. | |
151 """ | |
152 narrowest_wrap = textwrap.wrap(text, max_line_length) | |
153 narrowest_width = max([len(l) for l in narrowest_wrap]) | |
154 for line_length in range(max_line_length - 1, 0, -1): | |
155 wrap = textwrap.wrap(text, line_length) | |
156 if len(wrap) <= len(narrowest_wrap): | |
157 width = max([len(l) for l in wrap]) | |
158 if width < narrowest_width: | |
159 narrowest_wrap = wrap | |
160 narrowest_width = width | |
161 else: | |
162 break | |
163 return "\n".join(narrowest_wrap) |