Mercurial > repos > peterjc > venn_list
changeset 0:baf7031d470e
Migrated tool version 0.0.3 from old tool shed archive to new tool shed repository
author | peterjc |
---|---|
date | Tue, 07 Jun 2011 18:08:11 -0400 |
parents | |
children | f1323a651777 |
files | tools/plotting/venn_list.py tools/plotting/venn_list.txt tools/plotting/venn_list.xml |
diffstat | 3 files changed, 323 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/plotting/venn_list.py Tue Jun 07 18:08:11 2011 -0400 @@ -0,0 +1,135 @@ +#!/usr/bin/env python +"""Plot up to 3-way Venn Diagram using R limma vennDiagram (via rpy) + +This script is copyright 2010 by Peter Cock, The James Hutton Institute +(formerly SCRI), UK. All rights reserved. +See accompanying text file for licence details (MIT/BSD style). + +This is version 0.0.3 of the script. +""" + + +import sys +import rpy + +def stop_err(msg, error_level=1): + """Print error message to stdout and quit with given error level.""" + sys.stderr.write("%s\n" % msg) + sys.exit(error_level) + +try: + import rpy +except ImportError: + stop_err("Requires the Python library rpy (to call R)") + +try: + rpy.r.library("limma") +except: + stop_err("Requires the R library limma (for vennDiagram function)") + + +if len(sys.argv)-1 not in [7, 10, 13]: + stop_err("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv)-1)) + +all_file, all_type, all_label = sys.argv[1:4] +set_data = [] +if len(sys.argv)-1 >= 7: + set_data.append(tuple(sys.argv[4:7])) +if len(sys.argv)-1 >= 10: + set_data.append(tuple(sys.argv[7:10])) +if len(sys.argv)-1 >= 13: + set_data.append(tuple(sys.argv[10:13])) +pdf_file = sys.argv[-1] +n = len(set_data) +print "Doing %i-way Venn Diagram" % n + +def load_ids(filename, filetype): + if filetype=="tabular": + for line in open(filename): + if not line.startswith("#"): + yield line.rstrip("\n").split("\t",1)[0] + elif filetype=="fasta": + for line in open(filename): + if line.startswith(">"): + yield line[1:].rstrip("\n").split(None,1)[0] + elif filetype.startswith("fastq"): + #Use the Galaxy library not Biopython to cope with CS + from galaxy_utils.sequence.fastq import fastqReader + handle = open(filename, "rU") + for record in fastqReader(handle): + #The [1:] is because the fastaReader leaves the @ on the identifer. + yield record.identifier.split()[0][1:] + handle.close() + elif filetype=="sff": + try: + from Bio.SeqIO import index + except ImportError: + stop_err("Require Biopython 1.54 or later (to read SFF files)") + #This will read the SFF index block if present (very fast) + for name in index(filename, "sff"): + yield name + else: + stop_err("Unexpected file type %s" % filetype) + +def load_ids_whitelist(filename, filetype, whitelist): + for name in load_ids(filename, filetype): + if name in whitelist: + yield name + else: + stop_err("Unexpected ID %s in %s file %s" % (name, filetype, filename)) + +if all_file in ["", "-", '""', '"-"']: + #Load without white list + sets = [set(load_ids(f,t)) for (f,t,c) in set_data] + #Take union + all = set() + for s in sets: + all.update(s) + print "Inferred total of %i IDs" % len(all) +else: + all = set(load_ids(all_file, all_type)) + print "Total of %i IDs" % len(all) + sets = [set(load_ids_whitelist(f,t,all)) for (f,t,c) in set_data] + +for s, (f,t,c) in zip(sets, set_data): + print "%i in %s" % (len(s), c) + +#Now call R library to draw simple Venn diagram +try: + #Create dummy Venn diagram counts object for three groups + cols = 'c("%s")' % '","'.join("Set%i" % (i+1) for i in range(n)) + rpy.r('groups <- cbind(%s)' % ','.join(['1']*n)) + rpy.r('colnames(groups) <- %s' % cols) + rpy.r('vc <- vennCounts(groups)') + #Populate the 2^n classes with real counts + #Don't make any assumptions about the class order + #print rpy.r('vc') + for index, row in enumerate(rpy.r('vc[,%s]' % cols)): + if isinstance(row, int) or isinstance(row, float): + #Hack for rpy being too clever for single element row + row = [row] + names = all + for wanted, s in zip(row, sets): + if wanted: + names = names.intersection(s) + else: + names = names.difference(s) + rpy.r('vc[%i,"Counts"] <- %i' % (index+1, len(names))) + #print rpy.r('vc') + if n == 1: + #Single circle, don't need to add (Total XXX) line + names = [c for (t,f,c) in set_data] + else: + names = ["%s\n(Total %i)" % (c, len(s)) for s, (f,t,c) in zip(sets, set_data)] + rpy.r.assign("names", names) + rpy.r.assign("colors", ["red","green","blue"][:n]) + rpy.r.pdf(pdf_file, 8, 8) + rpy.r("""vennDiagram(vc, include="both", names=names, + main="%s", sub="(Total %i)", + circle.col=colors) + """ % (all_label, len(all))) + rpy.r.dev_off() +except Exception, exc: + stop_err( "%s" %str( exc ) ) +rpy.r.quit( save="no" ) +print "Done"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/plotting/venn_list.txt Tue Jun 07 18:08:11 2011 -0400 @@ -0,0 +1,75 @@ +Galaxy tool to draw a Venn Diagram with up to 3 sets +==================================================== + +This tool is copyright 2011 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below. + +This tool is a short Python script (using both the Galaxy and Biopython library +functions) to extract ID lists from tabular, FASTA, FASTQ or SFF files to build +sets, which are then drawn using the R limma package function vennDiagram +(called from Python using rpy). + +There are just two files to install: + +* venn_list.py (the Python script) +* venn_list.xml (the Galaxy tool definition) + +The suggested location is in the Galaxy folder tools/plotting next to other +graph drawing tools. + +You will also need to modify the tools_conf.xml file to tell Galaxy to offer the +tool. The suggested location is in the "Graph/Display Data" section. Simply add +the line: + +<tool file="plotting/venn_list.xml" /> + +You will also need to install Biopython 1.54 or later, and the R/Bioconductor +pacakge limma. You should already have rpy installed for other Galaxy tools. + + +History +======= + +v0.0.3 - Initial public release. + + +Developers +========== + +This script and related tools are being developed on the following hg branch: +http://bitbucket.org/peterjc/galaxy-central/src/tools + +For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder: + +tar -czf venn_list.tar.gz tools/plotting/venn_list.* + +Check this worked: + +$ tar -tzf venn_list.tar.gz +tools/plotting/venn_list.py +tools/plotting/venn_list.txt +tools/plotting/venn_list.xml + + +Licence (MIT/BSD style) +======================= + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. + +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/plotting/venn_list.xml Tue Jun 07 18:08:11 2011 -0400 @@ -0,0 +1,113 @@ +<tool id="venn_list" name="Venn Diagram" version="0.0.3"> + <description>from lists</description> + <command interpreter="python"> +venn_list.py +#if $universe.type_select=="implicit": + - - +#else: + $main $main.ext +#end if +"$main_lab" +#for $s in $sets: + $s.set $s.set.ext "$s.lab" +#end for +$PDF</command> + <inputs> + <param name="main_lab" size="30" type="text" value="Venn Diagram" label="Plot title"/> + <conditional name="universe"> + <param name="type_select" type="select" label="Implicit or explicit full ID list?"> + <option value="explicit">Explicit</option> + <option value="implicit">Implicit (use union of sets below)</option> + </param> + <when value="explicit"> + <param name="main" type="data" format="tabular,fasta,fastq,sff" label="Full dataset (with all identifiers)" help="Tabular file (uses column one), FASTA, FASTQ or SFF file."/> + </when> + <when value="implicit"/> + </conditional> + <repeat name="sets" min="1" max="3" title="Sets"> + <param name="set" type="data" format="tabular,fasta,fastq,sff" label="Members of set" help="Tabular file (uses column one), FASTA, FASTQ or SFF file."/> + <param name="lab" size="30" type="text" value="Group" label="Caption for set"/> + </repeat> + </inputs> + <outputs> + <data format="pdf" name="PDF" /> + </outputs> + <requirements> + <requirement type="python-module">rpy</requirement> + <requirement type="python-module">Bio</requirement> + </requirements> + <tests> + <!-- Doesn't seem to work properly, manages to get two sets, both + with same FASTA file, but second with default "Group" label. + <test> + <param name="type_select" value="explicit"/> + <param name="main" value="venn_list.tabular" ftype="tabular"/> + <param name="main_lab" value="Some Proteins"/> + <param name="set" value="rhodopsin_proteins.fasta"/> + <param name="lab" value="Rhodopsins"/> + <output name="PDF" file="venn_list1.pdf" ftype="pdf"/> + </test> + --> + <!-- Can't use more than one repeat value in tests (yet) + <test> + <param name="type_select" value="explicit"/> + <param name="main" value="venn_list.tabular" ftype="tabular"/> + <param name="main_lab" value="Some Proteins"/> + <param name="count" value="3"/> + <param name="set" value="rhodopsin_proteins.fasta"/> + <param name="lab" value="Rhodopsins"/> + <param name="set" value="four_human_proteins.fasta"/> + <param name="lab" value="Human"/> + <param name="set" value="blastp_four_human_vs_rhodopsin.tabular"/> + <param name="lab" value="Human vs Rhodopsin BLAST"/> + <output name="PDF" file="venn_list3.pdf" ftype="pdf"/> + </test> + --> + </tests> + <help> + +.. class:: infomark + +**TIP:** If your data is in tabular files, the identifier is assumed to be in column one. + +**What it does** + +Draws Venn Diagram for one, two or three sets (as a PDF file). + +You must supply one, two or three sets of identifiers -- corresponding +to one, two or three circles on the Venn Diagram. + +In general you should also give the full list of all the identifiers +explicitly. This is used to calculate the number of identifers outside +the circles (and check the identifiers in the other files match up). +The full list can be omitted by implicitly taking the union of the +category sets. In this case, the count outside the categories (circles) +will always be zero. + +The identifiers can be taken from the first column of a tabular file +(e.g. query names in BLAST tabular output, or signal peptide predictions +after filtering, etc), or from a sequence file (FASTA, FASTQ, SFF). + +For example, you may have a set of NGS reads (as a FASTA, FASTQ or SFF +file), and the results of several different read mappings (e.g. to +different references) as tabular files (filtered to have just the mapped +reads). You could then show the different mappings (and their overlaps) +as a Venn Diagram, and the outside count would be the unmapped reads. + +**Citations** + +The Venn Diagrams are drawn using Gordon Smyth's limma package from +R/Bioconductor, http://www.bioconductor.org/ + +The R library is called from Python via rpy, http://rpy.sourceforge.net/ + +This tool uses Biopython to read SFF files. If you use this tool with +SFF files in scientific work leading to a publication, please cite the +Biopython application note: + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + + </help> +</tool>