Previous changeset 42:950982dda2aa (2017-12-21) Next changeset 44:3a051528e47d (2017-12-21) |
Commit message:
Uploaded |
modified:
alfa/.shed.yml alfa/ALFA.py alfa/ALFA.xml alfa/tool_dependencies.xml |
b |
diff -r 950982dda2aa -r c9929240f796 alfa/.shed.yml --- a/alfa/.shed.yml Thu Dec 21 10:01:31 2017 -0500 +++ b/alfa/.shed.yml Thu Dec 21 11:54:18 2017 -0500 |
b |
@@ -3,7 +3,7 @@ - Next Gen Mappers - Sequence Analysis - Visualization -description: A tool to Compute and display distribution of reads by genomic categories +description: Plot the distribution of the genomic features captured by aligned reads long_description: | ALFA provides a global overview of features distribution composing New Generation Sequencing dataset(s). Given a set of aligned reads (BAM files) and an annotation file (GTF format), the tool produces plots of the raw and normalized distributions of those reads among genomic categories (stop codon, 5'-UTR, CDS, intergenic, etc.) and biotypes (protein coding genes, miRNA, tRNA, etc.). Whatever the sequencing technique, whatever the organism. |
b |
diff -r 950982dda2aa -r c9929240f796 alfa/ALFA.py --- a/alfa/ALFA.py Thu Dec 21 10:01:31 2017 -0500 +++ b/alfa/ALFA.py Thu Dec 21 11:54:18 2017 -0500 |
[ |
b'@@ -1,12 +1,13 @@\n #!/usr/bin/python\n-#-*- coding: utf-8 -*-\n+# -*- coding: utf-8 -*-\n \n-__author__ = \'noel & bahin\'\n-\'\'\' <decription> \'\'\'\n+__author__ = "noel & bahin"\n+""" ALFA provides a global overview of features distribution composing NGS dataset(s). """\n \n import argparse\n import os\n import numpy\n+import copy\n import sys\n import subprocess\n import matplotlib.pyplot as plt\n@@ -15,856 +16,1151 @@\n import matplotlib.patheffects as PathEffects\n import re\n from matplotlib.backends.backend_pdf import PdfPages\n-# To correctly embbed the texts when saving plots in svg format\n+# To correctly embed the texts when saving plots in svg format\n import matplotlib\n-matplotlib.rcParams[\'svg.fonttype\'] = \'none\'\n+import progressbar\n+import collections\n+import matplotlib as mpl\n+import numpy as np\n+\n+matplotlib.rcParams["svg.fonttype"] = "none"\n+\n \n ##########################################################################\n # FUNCTIONS #\n ##########################################################################\n \n def init_dict(d, key, init):\n-\tif key not in d:\n-\t\td[key] = init\n+ if key not in d:\n+ d[key] = init\n+\n+\n+def tryint(s):\n+ """ Function called by "alphanum_key" function to sort the chromosome names. """\n+ try:\n+ return int(s)\n+ except ValueError:\n+ return s\n+\n \n-def get_chromosome_lengths(args):\n-\t"""\n-\tParse the file containing the chromosomes lengths.\n-\tIf no length file is provided, browse the annotation file (GTF) to estimate the chromosome sizes (\n-\t"""\n-\tlengths={}\n-\tgtf_chrom_names=set()\n-\tforce_get_lengths = False\n-\t# If the user provided the chromosome length file\n-\tif args.chr_len:\n-\t\twith open(args.chr_len, \'r\') as chr_len_file:\n-\t\t\tfor line in chr_len_file:\n-\t\t\t\tlengths[line.split(\'\\t\')[0]] = int(line.rstrip().split(\'\\t\')[1])\n-\t\twith open(args.annotation,\'r\') as gtf_file:\n-\t\t\tfor line in gtf_file:\n-\t\t\t\tif not line.startswith(\'#\'):\n-\t\t\t\t\tchrom = line.split(\'\\t\')[0]\n-\t\t\t\t\tif chrom not in gtf_chrom_names:\n-\t\t\t\t\t\tgtf_chrom_names.add(chrom)\n-\t\tfor chrom in lengths.keys():\n-\t\t\tif chrom not in gtf_chrom_names:\n-\t\t\t\tprint "Warning: at least one chromosome name (\'"+chrom+"\') of the file \'"+args.chr_len+"\'does not match any chromosome name if GTF and was ignored."\n-\t\t\t\t#del lengths[chrom]\n-\t\t\t\tbreak\n-\t\tfor chrom in gtf_chrom_names:\n-\t\t\tif force_get_lengths: break\n-\t\t\tif chrom not in lengths.keys():\n-\t\t\t\tprint "WARNING: chromosome name \'"+chrom+"\' was found in gtf and does not match any chromosome name provided in",args.chr_len+". "\n-\t\t\t\tprint "\\t=> The chromosome lenghts will be approximated using annotations in the GTF file."\n-\t\t\t\tcontinue_value =""\n-\t\t\t\twhile continue_value not in {"yes","y","no","n"}:\n-\t\t\t\t\tcontinue_value = raw_input("\\tDo you want to continue (\'yes\' or \'y\')?\\n\\tElse write \'no\' or \'n\' to exit the script and check your file of lengths.\\n")\n-\t\t\t\t\tif continue_value == "no" or continue_value == "n":\n-\t\t\t\t\t\tsys.exit("Exiting")\n-\t\t\t\t\telif continue_value == "yes" or continue_value == "y":\n-\t\t\t\t\t\tforce_get_lengths = True\n-\t\t\t\t\t\tbreak\n-\t\t\t\t\tprint "Error: use \'yes/y/no/n\' only"\n-\t\tif not force_get_lengths:\n-\t\t\treturn lengths\n-\t# Otherwise, (or if at least one chromosome was missing in chromosome lengths file) we consider the end of the last annotation of the chromosome in the GTF file as the chromosome length\n-\twith open(args.annotation, \'r\') as gtf_file:\n-\t\tfor line in gtf_file:\n-\t\t\tif not line.startswith(\'#\'):\n-\t\t\t\tchrom = line.split(\'\\t\')[0]\n-\t\t\t\tend = int(line.split(\'\\t\')[4])\n-\t\t\t\tinit_dict(lengths, chrom, 0)\n-\t\t\t\tlengths[chrom] = max(lengths[chrom], end)\n-\t\tif force_get_lengths:\n-\t\t\tprint "The chromosome lenghts have been approximated using the last annotations in the GTF file."\n-\t\treturn lengths\n+def alphanum_key(s):\n+ """ Turn a string into a list of string and number chunks.\n+ "z23a" -> ["z", 23, "a"]\n+ """\n+ return [ tryint(c) for c in re.split("([0-9]+)", s) ]\n+\n \n-def write_feature_on_index(feat,ch'..b' #make_plot(labels, cat_list, sample_labels, filtered_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold, title="Categories distribution for \'" + filtered_biotype + "\' biotype", svg=options.svg, png=options.png)\n+ make_plot(labels, cat_list, filtered_cat_cpt, final_genome_cpt, pdf, "Categories", options.threshold, title="Categories distribution for \'" + filtered_biotype + "\' biotype", svg=options.svg, png=options.png, categ_groups= parent_categs)\n+ ## Generate the biotypes plot\n+ # Recategorization within the final biotypes and plot generation\n+ final_cat_cpt, final_genome_cpt = group_counts_by_biotype(cpt, cpt_genome, biotypes)\n+ #make_plot(biotypes, sample_labels, final_cat_cpt, final_genome_cpt, pdf, "biotypes", options.threshold, svg=options.svg, png=options.png)\n+ make_plot(labels, biotypes, final_cat_cpt, final_genome_cpt, pdf, "Biotypes", options.threshold, svg=options.svg, png=options.png)\n \n \n-#print \'\\nCounts for every category/biotype pair: \',cpt\n-\n-# Generating plots\n-if options.pdf != False:\n-\tif options.pdf == None:\n-\t\toptions.pdf = "categories_plots.pdf"\n-\tpdf = PdfPages(options.pdf)\n-else:\n-\tpdf = False\n-\n-selected_biotype = None\n-if options.biotype_filter:\n-\toptions.biotype_filter = options.biotype_filter[0]\n-\tfor sample in cpt:\n-\t\tfor feature in cpt[sample]:\n-\t\t\tbiotype = feature[1]\n-\t\t\tif options.biotype_filter.lower() == biotype.lower():\n-\t\t\t\tselected_biotype=biotype\n-\t\t\t\tbreak\n-\tif selected_biotype == None :\n-\t\tprint "\\nError: biotype \'"+options.biotype_filter+"\' not found. Please check the biotype name and that this biotype exists in your sample(s)."\n-\t\tsys.exit()\n-\n-#Print a warning message if the UTRs are not specified as 5\' or 3\' (they will be ploted as 5\'UTR)\n-if \'UTR\' in [categ[0] for counts in cpt.values() for categ in counts.keys()]:\n-\tprint \'\'\'\\nWARNING: (some) 5\'UTR/3\'UTR are not precisely defined. Consequently, positions annotated as "UTR" will be counted as "5\'UTR"\\n\'\'\'\n-\n-#### Make the plot by categories\n-\t#### Recategorizing with the final categories\n-final_cats=categs_groups[options.categories_depth-1]\n-final_cat_cpt,final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt,cpt_genome,final_cats,selected_biotype)\n-\t#### Display the distribution of specified categories (or biotypes) in samples on a barplot\n-# Remove the \'antisense\' category if the library type is \'unstranded\'\n-for dic in cpt.values():\n-\tif (\'antisense\',\'antisense\') in dic.keys(): break\n-else:\n-\tcat_list.remove(\'antisense\')\n-make_plot(cat_list,samples_names,final_cat_cpt,final_genome_cpt,pdf, "categories",options.threshold, svg = options.svg, png = options.png)\n-if selected_biotype :\n-\tmake_plot(cat_list,samples_names,filtered_cat_cpt,final_genome_cpt,pdf, "categories",options.threshold,title="Categories distribution for \'"+selected_biotype+"\' biotype", svg = options.svg, png = options.png)\n-\n-#### Make the plot by biotypes\n-\t#### Recategorizing with the final categories\n-final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes)\n-\t#### Display the distribution of specified categories (or biotypes) in samples on a barplot\n-make_plot(biotypes,samples_names,final_cat_cpt,final_genome_cpt,pdf, "biotypes",options.threshold, svg = options.svg, png = options.png)\n-\n-\n-\n-\t##### Recategorizing with the final categories\n-#final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes_group1)\n-\t##### Display the distribution of specified categories (or biotypes) in samples on a barplot\n-#make_plot(biotypes_group1,samples_names,final_cat_cpt,final_genome_cpt,pdf,"Biotype groups", options.threshold, title="Biotypes distribution in mapped reads \\n(biotypes are grouped by \'family\')", svg = options.svg, png = options.png)\n-\n-\n-if options.pdf:\n-\tpdf.close()\n-\tprint "\\n### Plots saved in pdf file: %s" %options.pdf\n-\t\n-print "\\n### End of program"\n\\ No newline at end of file\n+ print "### End of program ###"\n\\ No newline at end of file\n' |
b |
diff -r 950982dda2aa -r c9929240f796 alfa/ALFA.xml --- a/alfa/ALFA.xml Thu Dec 21 10:01:31 2017 -0500 +++ b/alfa/ALFA.xml Thu Dec 21 11:54:18 2017 -0500 |
b |
@@ -1,5 +1,5 @@ <tool id="alfa" name="ALFA" version="0.1.0"> - <description>- Plot the distribution of genomic features in your aligned reads </description> + <description>- Plot the distribution of the genomic features captured by aligned reads </description> <!-- ALFA requires bedtools suite v2.20.0 and above --> <requirements> |
b |
diff -r 950982dda2aa -r c9929240f796 alfa/tool_dependencies.xml --- a/alfa/tool_dependencies.xml Thu Dec 21 10:01:31 2017 -0500 +++ b/alfa/tool_dependencies.xml Thu Dec 21 11:54:18 2017 -0500 |
b |
@@ -1,12 +1,12 @@ <?xml version="1.0"?> <tool_dependency> - <package name="bedtools"> + <package name="bedtools" version="2.24"> <repository changeset_revision="3416a1d4a582" name="package_bedtools_2_24" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> - <package name="samtools"> + <package name="samtools" version="1.2"> <repository changeset_revision="f6ae3ba3f3c1" name="package_samtools_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> - <package name="matplotlib"> + <package name="matplotlib" version="1.4"> <repository changeset_revision="f7424e1cf115" name="package_python_2_7_matplotlib_1_4" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency> |