Mercurial > repos > cpt > cpt_linear_genome_plot
diff dna_features_viewer/biotools.py @ 1:e923c686ead9 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:45:31 +0000 |
parents | |
children | b79e98299a78 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dna_features_viewer/biotools.py Mon Jun 05 02:45:31 2023 +0000 @@ -0,0 +1,163 @@ +import textwrap +from Bio.Seq import Seq +from Bio.SeqFeature import SeqFeature, FeatureLocation +from Bio.PDB.Polypeptide import aa1, aa3 +from Bio import SeqIO + +try: + from BCBio import GFF +except ImportError: + + class GFF: + def parse(*a): + """Not available. Please install bcbio-gff.""" + raise ImportError("Please install the bcbio-gff library to parse GFF data") + + +def complement(dna_sequence): + """Return the complement of the DNA sequence. + + For instance ``complement("ATGCCG")`` returns ``"TACGGC"``. + + Uses BioPython for speed. + """ + return str(Seq(dna_sequence).complement()) + + +def reverse_complement(sequence): + """Return the reverse-complement of the DNA sequence. + + For instance ``complement("ATGCCG")`` returns ``"GCCGTA"``. + + Uses BioPython for speed. + """ + return complement(sequence)[::-1] + + +aa_short_to_long_form_dict = { + _aa1: _aa3[0] + _aa3[1:].lower() for (_aa1, _aa3) in zip(aa1 + "*", aa3 + ["*"]) +} + + +def translate(dna_sequence, long_form=False): + """Translate the DNA sequence into an amino-acids sequence MLKYQT... + + If long_form is true, a list of 3-letter amino acid representations + is returned instead (['Ala', 'Ser', ...]). + """ + result = str(Seq(dna_sequence).translate()) + if long_form: + result = [aa_short_to_long_form_dict[aa] for aa in result] + return result + + +def extract_graphical_translation(sequence, location, long_form=False): + """Return a string of the "graphical" translation of a sequence's subsegment. + + Here "graphical" means that the amino acid sequence is always given + left-to-right, as it will appear under the sequence in the plot. This matters + when the location is on the -1 strand. In this case, the amino-acids are + determined by (part of) the reverse-complement of the sequence, however + the sequence returned will be the mirror of the translated sequence, as + this is the left-to-right order in which the codons corresponding to the + amino-acids appear in the sequence. + + Parameters + ---------- + sequence + An "ATGC" string. + + location + Either (start, end) or (start, end, strand), with strand in (0, 1, -1). + + long_form + if True, a list of 3-letter amino acid representations is returned instead + (['Ala', 'Ser', ...]). + + """ + if len(location) == 3: + start, end, strand = location + else: + start, end = location + strand = 1 + subsequence = sequence[start:end] + if strand == -1: + subsequence = reverse_complement(subsequence) + translation = translate(subsequence, long_form=long_form) + if strand == -1: + translation = translation[::-1] + return translation + + +def load_record(path): + """Load a Genbank file""" + if isinstance(path, str): + # Input is a file path + if path.lower().endswith(".gff"): + return list(GFF.parse(path))[0] + else: + return SeqIO.read(path, "genbank") + else: + # Input is a file-like object + try: + return SeqIO.read(path, "genbank") + except: + path.seek(0) + return list(GFF.parse(path))[0] + + +def annotate_biopython_record( + seqrecord, location="full", feature_type="misc_feature", margin=0, **qualifiers +): + """Add a feature to a Biopython SeqRecord. + + Parameters + ---------- + + seqrecord + The biopython seqrecord to be annotated. + + location + Either (start, end) or (start, end, strand). (strand defaults to +1) + + feature_type + The type associated with the feature + + margin + Number of extra bases added on each side of the given location. + + qualifiers + Dictionnary that will be the Biopython feature's `qualifiers` attribute. + """ + if location == "full": + location = (margin, len(seqrecord) - margin) + + strand = location[2] if len(location) == 3 else 1 + seqrecord.features.append( + SeqFeature( + FeatureLocation(location[0], location[1], strand), + qualifiers=qualifiers, + type=feature_type, + ) + ) + + +def find_narrowest_text_wrap(text, max_line_length): + """Wrap the text into a multi-line text minimizing the longest line length. + + This is done by first wrapping the text using max_line_length, then + attempt new wraps by iteratively decreasing the line_length, as long as the + number of lines stays the same as with max_line_length. + """ + narrowest_wrap = textwrap.wrap(text, max_line_length) + narrowest_width = max([len(l) for l in narrowest_wrap]) + for line_length in range(max_line_length - 1, 0, -1): + wrap = textwrap.wrap(text, line_length) + if len(wrap) <= len(narrowest_wrap): + width = max([len(l) for l in wrap]) + if width < narrowest_width: + narrowest_wrap = wrap + narrowest_width = width + else: + break + return "\n".join(narrowest_wrap)