Mercurial > repos > cpt > cpt_linear_genome_plot
diff linear_genome_plot.py @ 1:e923c686ead9 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:45:31 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linear_genome_plot.py Mon Jun 05 02:45:31 2023 +0000 @@ -0,0 +1,396 @@ +#!/usr/bin/env python +from Bio import SeqIO +from dna_features_viewer import BiopythonTranslator, GraphicRecord +from matplotlib import rc_context +import matplotlib +import matplotlib.pyplot as plt +from itertools import cycle +import re +import sys +import argparse + + +class CPTTranslator(BiopythonTranslator): + """ + This is a customized translator from the dna_features_viewer module to fit Galaxy + """ + + global custom_feature_colors + global box_status + global label_fields + global custom_name_colors + global ignored_features_types + global ignored_gene_labels + global ignored_feature_labels + + def compute_feature_color(self, feature): + if feature.type == "CDS": + if "product" in feature.qualifiers: + color_specific = any( + re.search( + ("(\\b" + str(item) + "\\b)"), feature.qualifiers["product"][0] + ) + for item in custom_name_colors.keys() + ) or any( + re.search((item), feature.qualifiers["product"][0]) + for item in custom_name_colors.keys() + ) + if color_specific: + try: + return custom_name_colors[feature.qualifiers["product"][0]] + except KeyError: + for item in custom_name_colors.keys(): + if item in feature.qualifiers["product"][0]: + custom_name_colors[ + feature.qualifiers["product"][0] + ] = custom_name_colors[item] + return custom_name_colors[ + feature.qualifiers["product"][0] + ] + # print(feature.qualifiers["product"][0]) + else: + try: + return custom_feature_colors[feature.type] + except KeyError: + return BiopythonTranslator.compute_feature_color(self, feature) + else: + if feature.type not in ignored_features_types: + try: + return custom_feature_colors[feature.type] + except KeyError: + return BiopythonTranslator.compute_feature_color(self, feature) + + def compute_feature_label(self, feature): # remove the chop_blocks + self.label_fields = label_fields + if feature.type == "CDS": + if "product" in feature.qualifiers: + if ignored_gene_labels: # product name drop + verify_chops = any( + re.search( + ("(\\b" + str(item) + "\\b)"), + feature.qualifiers["product"][0], + ) + for item in ignored_gene_labels + ) or any( + re.search((item), feature.qualifiers["product"][0]) + for item in ignored_gene_labels + ) + if verify_chops: + return None + else: + return BiopythonTranslator.compute_feature_label(self, feature) + else: + return BiopythonTranslator.compute_feature_label(self, feature) + elif feature.type in ignored_feature_labels: + return None + else: + return BiopythonTranslator.compute_feature_label(self, feature) + + def compute_filtered_features(self, features): + return [ + feature + for feature in features + if feature.type not in ignored_features_types + ] + + def compute_feature_legend_text(self, feature): + return feature.type + + def compute_feature_box_color(self, feature): + if feature.type == "CDS": + return "white" + + def compute_feature_label_link_color(self, feature): + return "black" + + def compute_feature_box_linewidth(self, feature): + if box_status: + return 0.5 + else: + return 0 + + +def parse_gbk(file): + """simple function to parse out the feature information AND products""" + + record = SeqIO.read(file, "genbank") + count = 0 + feature_types = {} + product_names = [] + for feat in record.features: + if feat.type not in feature_types: + feature_types[feat.type] = 1 + else: + feature_types[feat.type] += 1 + if "product" in feat.qualifiers: + product_names.append(feat.qualifiers["product"][0]) + + return feature_types, product_names, record + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Linear Genome Plot") + # Input and Parameters + parser.add_argument( + "input_file", type=argparse.FileType("r"), help="genbank or gff3 file" + ) + parser.add_argument("--plot_width", type=int, default=20) + # parser.add_argument("--plot_height",type=int,default=4) + parser.add_argument( + "--title", type=str, default="genome plot" + ) # NEED TO ADD TO XML + parser.add_argument( + "--common_features_excluded", default="", help="common features to be excluded" + ) + parser.add_argument( + "--features_excluded", + default="", + help="features to be excluded from plot, separate by commas", + ) + parser.add_argument( + "--common_ignore_feature_labels", + default="", + help="common feature labels to be excluded", + ) + parser.add_argument( + "--ignored_feature_labels", + default="", + help="ignore labeling of specific features", + ) + parser.add_argument( + "--common_ignore_product_labels", + default="", + help="common product names to not label", + ) + parser.add_argument( + "--ignore_labeling", + default="", + help="labeling for specific products to ignore, separate by commas", + ) + parser.add_argument( + "--feature_label_order", + default="locus_tag", + help="label order, where the first choice is the first feature listed to pull name labels from", + ) # NEED TO ADD TO XML + parser.add_argument( + "--label_box", + action="store_true", + help="Use to have label box around feature labels", + ) + parser.add_argument( + "--label_algo", + action="store_true", + help="use dna features spacing algo for label placement (in or above feature)", + ) + # parser.add_argument("--level_offset",type=int,default=0,help="All features and annotations will be pushed up by the input amount. Useful for when plotting several sets of features successively on the same axis.") # Will exclude for now + # parser.add_argument("--custom_region",action="store_true",help="cropped region for plot") + parser.add_argument("--sz", type=int, help="beginning location for crop") + parser.add_argument("--ez", type=int, help="end location for crop") + parser.add_argument("--st", type=int, help="start site of translation") + parser.add_argument("--et", type=int, help="end site of translation") + parser.add_argument( + "--translation_on", action="store_true", help="plot the translation sub-axis" + ) + parser.add_argument( + "--feature_id", + nargs="*", + action="append", + help="feature label to have custom color", + ) # NEED TO ADD TO XML + parser.add_argument( + "--feature_id_color", + nargs="*", + action="append", + help="feature's accompanying color", + ) + parser.add_argument( + "--gene_id", + nargs="*", + action="append", + help="gene/cds label to have custom color", + ) + parser.add_argument( + "--gene_id_color", + nargs="*", + action="append", + help="gene/cds's accompanying color", + ) + parser.add_argument("--multiline", action="store_true", help="Plot multiline plot") + parser.add_argument( + "--nucl_per_line", type=int, help="nucleotides per line of multiline" + ) + # Output + parser.add_argument( + "--file_stats", + type=argparse.FileType("w"), + default="out_stats.txt", + help="output stat file", + ) + # parser.add_argument("--tmp_img",dest="tmp_img",type=argparse.FileType("wb"),default="out_tmp.svg") + parser.add_argument( + "--out_img", + dest="out_img", + type=argparse.FileType("wb"), + default="out_img.svg", + help="svg genome plot", + ) + args = parser.parse_args() + + ## Part I ; Parse and send output of features count and the list of product names + feature_counts, products, genome = parse_gbk(args.input_file) + with args.file_stats as f: + f.writelines("---::: FILE BREAKDOWN :::---\n\n") + f.writelines("------::: Feature Count :::------\n") + for feature, count in feature_counts.items(): + f.writelines(f"Feature: {feature} ::::: Count: {count}\n") + f.writelines("------::: Product Names :::------\n") + if products != []: + for product in products: + f.writelines(f"Product Name: {product}\n") + else: + f.writelines("No Annotated Product Names Found") + + ## Part II ; Prep Global Variables + ## Make K:V pairs for Feature Colors + if args.label_box: + box_status = True + else: + box_status = False + + if args.feature_id: + feature_ids = [f for listed_obj in args.feature_id for f in listed_obj] + feature_ids_colors = [ + f for listed_obj in args.feature_id_color for f in listed_obj + ] + custom_feature_colors = dict(zip(feature_ids, feature_ids_colors)) + else: + custom_feature_colors = {} + + ## Make K:V pairs for Name Colors (as above) + if args.gene_id: + gene_ids = [g for listed_obj in args.gene_id for g in listed_obj] + gene_ids_colors = [g for listed_obj in args.gene_id_color for g in listed_obj] + custom_name_colors = dict(zip(gene_ids, gene_ids_colors)) + else: + custom_name_colors = {} + + ## Ignored Features + # ignored_features_types = str.split(args.features_excluded,",") + if args.common_features_excluded: + ignored_features_types = str.split(args.common_features_excluded, ",") + if args.features_excluded: + ignored_features_types += str.split(args.features_excluded, ",") + elif args.features_excluded: + ignored_features_types = str.split(args.features_excluded, ",") + else: + ignored_features_types = False + + print(ignored_features_types) + + ## product labels + if args.common_ignore_product_labels: + ignored_gene_labels = str.split(args.common_ignore_product_labels, ",") + if args.ignore_labeling: + ignored_gene_labels += str.split(args.ignore_labeling, ",") + elif args.ignore_labeling: + ignored_gene_labels = str.split(args.ignore_labeling, ",") + else: + ignored_gene_labels = False + + print(ignored_gene_labels) + + if args.feature_label_order != [""]: + label_fields = str.split(args.feature_label_order, ",") + + # if ignored_gene_labels == ['']: + # ignored_gene_labels = False + + ## Ignored Labeling + if args.common_ignore_feature_labels: + ignored_feature_labels = str.split(args.common_ignore_feature_labels, ",") + if args.ignored_feature_labels: + ignored_feature_labels += str.split(args.ignored_feature_labels, ",") + elif args.ignored_feature_labels: + ignored_feature_labels = str.split(args.ignored_feature_labels, ",") + else: + ignored_feature_labels = False + + print(ignored_feature_labels) + ## Print Statements for Debugging + # print(custom_feature_colors) + # print(custom_name_colors) + # print(ignored_features_types) + # print(ignored_gene_labels) + # print(label_fields) + + ## Part III ; PLOT + # Housekeeping + rc_context( + { + "font.family": ["monospace"], + } + ) # courier-like + matplotlib.use("Agg") # I think this has to be used... + + if args.label_algo: + lab_algo = True + else: + lab_algo = False + + translator = CPTTranslator() + graphic_record = translator.translate_record(genome) + + with open("tmp.svg", "wb") as img: + img.truncate(0) + img.close() + + if ( + args.sz and not args.multiline + ): # if user is wanting to look at a subset region of the genome + zoom_start, zoom_end = args.sz, args.ez + cropped = graphic_record.crop((zoom_start, zoom_end)) + ax, _ = cropped.plot( + figure_width=args.plot_width, annotate_inline=lab_algo, figure_height=None + ) + if args.translation_on: + crop_seq = (args.st - 1, args.et) + cropped.plot_translation( + ax, + location=crop_seq, + fontdict={"size": 8, "weight": "bold"}, + y_offset=1, + ) + ax.set_title(args.title) + # Galaxy specific shenanigans + tmp_fig = "./tmp.svg" + plt.savefig(tmp_fig) + plt.close() + elif args.multiline: + if args.sz: + zoom_start, zoom_end = args.sz, args.ez + else: + zoom_start, zoom_end = 1, graphic_record.sequence_length + cropped = graphic_record.crop((zoom_start, zoom_end)) + ax, _ = cropped.plot_on_multiple_lines( + figure_width=args.plot_width, + annotate_inline=lab_algo, + figure_height=None, + nucl_per_line=args.nucl_per_line, + plot_sequence=False, + ) + # ax.set_title(args.title) + tmp_fig = "./tmp.svg" + plt.savefig(tmp_fig) + plt.close() + else: + ax, _ = graphic_record.plot( + figure_width=args.plot_width, annotate_inline=lab_algo + ) + ax.set_title(args.title) + tmp_fig = "./tmp.svg" + # Galaxy specific shenanigans + plt.savefig(tmp_fig) + plt.close() + with open("tmp.svg", "rb") as img: + for line in img: + args.out_img.write(line)