Mercurial > repos > peterjc > venn_list
changeset 6:ea68a1a4c1d9 draft
v0.0.10 explicit galaxy_sequence_utils dependency etc
author | peterjc |
---|---|
date | Thu, 02 Feb 2017 11:17:31 -0500 |
parents | 26e35d5133a1 |
children | ba31415fedc5 |
files | tools/venn_list/README.rst tools/venn_list/tool_dependencies.xml tools/venn_list/venn_list.py tools/venn_list/venn_list.xml |
diffstat | 4 files changed, 217 insertions(+), 213 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/venn_list/README.rst Sat Oct 10 08:52:01 2015 -0400 +++ b/tools/venn_list/README.rst Thu Feb 02 11:17:31 2017 -0500 @@ -1,7 +1,7 @@ Galaxy tool to draw a Venn Diagram with up to 3 sets ==================================================== -This tool is copyright 2011-2015 by Peter Cock, The James Hutton Institute +This tool is copyright 2011-2017 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. See the licence text below. @@ -72,6 +72,9 @@ - Includes testing of failure mode. - Planemo for Tool Shed upload (``.shed.yml``, internal change only). - Tool Shed dependency for rpy and limma (thanks to Björn Grüning). +v0.0.10 - Updated to point at Biopython 1.67 (latest version in Tool Shed). + - Explicit dependency on ``galaxy_sequence_utils``. + - Python style updates (internal change only). ======= ======================================================================
--- a/tools/venn_list/tool_dependencies.xml Sat Oct 10 08:52:01 2015 -0400 +++ b/tools/venn_list/tool_dependencies.xml Thu Feb 02 11:17:31 2017 -0500 @@ -1,12 +1,15 @@ <?xml version="1.0"?> <tool_dependency> + <package name="galaxy_sequence_utils" version="1.0.1"> + <repository changeset_revision="c1ab450748ba" name="package_galaxy_sequence_utils_1_0_1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> <package name="limma" version="3.25.3"> <repository changeset_revision="b19c06e97bce" name="package_r2_limma_3_25_3" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> <package name="rpy" version="1.0.3"> <repository changeset_revision="82170c94ca7c" name="package_rpy_1_0_3" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> - <package name="biopython" version="1.65"> - <repository changeset_revision="dc595937617c" name="package_biopython_1_65" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + <package name="biopython" version="1.67"> + <repository changeset_revision="a42f244cce44" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency>
--- a/tools/venn_list/venn_list.py Sat Oct 10 08:52:01 2015 -0400 +++ b/tools/venn_list/venn_list.py Thu Feb 02 11:17:31 2017 -0500 @@ -11,127 +11,124 @@ import sys -def sys_exit(msg, error_level=1): - """Print error message to stdout and quit with given error level.""" - sys.stderr.write("%s\n" % msg) - sys.exit(error_level) - try: import rpy except ImportError: - sys_exit("Requires the Python library rpy (to call R)") + sys.exit("Requires the Python library rpy (to call R)") except RuntimeError, e: - sys_exit("The Python library rpy is not availble for the current R version\n\n%s" % e) + sys.exit("The Python library rpy is not availble for the current R version\n\n%s" % e) try: rpy.r.library("limma") -except: - sys_exit("Requires the R library limma (for vennDiagram function)") +except Exception: + sys.exit("Requires the R library limma (for vennDiagram function)") -if len(sys.argv)-1 not in [7, 10, 13]: - sys_exit("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv)-1)) +if len(sys.argv) - 1 not in [7, 10, 13]: + sys.exit("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv) - 1)) all_file, all_type, all_label = sys.argv[1:4] set_data = [] -if len(sys.argv)-1 >= 7: +if len(sys.argv) - 1 >= 7: set_data.append(tuple(sys.argv[4:7])) -if len(sys.argv)-1 >= 10: +if len(sys.argv) - 1 >= 10: set_data.append(tuple(sys.argv[7:10])) -if len(sys.argv)-1 >= 13: +if len(sys.argv) - 1 >= 13: set_data.append(tuple(sys.argv[10:13])) pdf_file = sys.argv[-1] n = len(set_data) print "Doing %i-way Venn Diagram" % n + def load_ids(filename, filetype): - if filetype=="tabular": + if filetype == "tabular": for line in open(filename): line = line.rstrip("\n") if line and not line.startswith("#"): - yield line.split("\t",1)[0] - elif filetype=="fasta": + yield line.split("\t", 1)[0] + elif filetype == "fasta": for line in open(filename): if line.startswith(">"): - yield line[1:].rstrip("\n").split(None,1)[0] + yield line[1:].rstrip("\n").split(None, 1)[0] elif filetype.startswith("fastq"): - #Use the Galaxy library not Biopython to cope with CS + # Use the Galaxy library not Biopython to cope with CS from galaxy_utils.sequence.fastq import fastqReader handle = open(filename, "rU") for record in fastqReader(handle): - #The [1:] is because the fastaReader leaves the @ on the identifer. + # The [1:] is because the fastaReader leaves the @ on the identifer. yield record.identifier.split()[0][1:] handle.close() - elif filetype=="sff": + elif filetype == "sff": try: from Bio.SeqIO import index except ImportError: - sys_exit("Require Biopython 1.54 or later (to read SFF files)") - #This will read the SFF index block if present (very fast) + sys.exit("Require Biopython 1.54 or later (to read SFF files)") + # This will read the SFF index block if present (very fast) for name in index(filename, "sff"): yield name else: - sys_exit("Unexpected file type %s" % filetype) + sys.exit("Unexpected file type %s" % filetype) + def load_ids_whitelist(filename, filetype, whitelist): for name in load_ids(filename, filetype): if name in whitelist: yield name else: - sys_exit("Unexpected ID %s in %s file %s" % (name, filetype, filename)) + sys.exit("Unexpected ID %s in %s file %s" % (name, filetype, filename)) if all_file in ["", "-", '""', '"-"']: - #Load without white list - sets = [set(load_ids(f,t)) for (f,t,c) in set_data] - #Take union - all = set() + # Load without white list + sets = [set(load_ids(f, t)) for (f, t, c) in set_data] + # Take union + all_ids = set() for s in sets: - all.update(s) - print "Inferred total of %i IDs" % len(all) + all_ids.update(s) + print "Inferred total of %i IDs" % len(all_ids) else: - all = set(load_ids(all_file, all_type)) - print "Total of %i IDs" % len(all) - sets = [set(load_ids_whitelist(f,t,all)) for (f,t,c) in set_data] + all_ids = set(load_ids(all_file, all_type)) + print "Total of %i IDs" % len(all_ids) + sets = [set(load_ids_whitelist(f, t, all_ids)) for (f, t, c) in set_data] -for s, (f,t,c) in zip(sets, set_data): +for s, (f, t, c) in zip(sets, set_data): print "%i in %s" % (len(s), c) -#Now call R library to draw simple Venn diagram +# Now call R library to draw simple Venn diagram try: - #Create dummy Venn diagram counts object for three groups - cols = 'c("%s")' % '","'.join("Set%i" % (i+1) for i in range(n)) - rpy.r('groups <- cbind(%s)' % ','.join(['1']*n)) + # Create dummy Venn diagram counts object for three groups + cols = 'c("%s")' % '","'.join("Set%i" % (i + 1) for i in range(n)) + rpy.r('groups <- cbind(%s)' % ','.join(['1'] * n)) rpy.r('colnames(groups) <- %s' % cols) rpy.r('vc <- vennCounts(groups)') - #Populate the 2^n classes with real counts - #Don't make any assumptions about the class order - #print rpy.r('vc') + # Populate the 2^n classes with real counts + # Don't make any assumptions about the class order + # print rpy.r('vc') for index, row in enumerate(rpy.r('vc[,%s]' % cols)): if isinstance(row, int) or isinstance(row, float): - #Hack for rpy being too clever for single element row + # Hack for rpy being too clever for single element row row = [row] - names = all + names = all_ids for wanted, s in zip(row, sets): if wanted: names = names.intersection(s) else: names = names.difference(s) - rpy.r('vc[%i,"Counts"] <- %i' % (index+1, len(names))) - #print rpy.r('vc') + rpy.r('vc[%i,"Counts"] <- %i' % (index + 1, len(names))) + # print rpy.r('vc') if n == 1: - #Single circle, don't need to add (Total XXX) line - names = [c for (t,f,c) in set_data] + # Single circle, don't need to add (Total XXX) line + names = [c for (t, f, c) in set_data] else: - names = ["%s\n(Total %i)" % (c, len(s)) for s, (f,t,c) in zip(sets, set_data)] + names = ["%s\n(Total %i)" % (c, len(s)) for s, (f, t, c) in zip(sets, set_data)] rpy.r.assign("names", names) - rpy.r.assign("colors", ["red","green","blue"][:n]) + rpy.r.assign("colors", ["red", "green", "blue"][:n]) rpy.r.pdf(pdf_file, 8, 8) rpy.r("""vennDiagram(vc, include="both", names=names, main="%s", sub="(Total %i)", circle.col=colors) - """ % (all_label, len(all))) + """ % (all_label, len(all_ids))) rpy.r.dev_off() except Exception, exc: - sys_exit( "%s" %str( exc ) ) -rpy.r.quit( save="no" ) + sys.exit("%s" % str(exc)) +rpy.r.quit(save="no") print "Done"
--- a/tools/venn_list/venn_list.xml Sat Oct 10 08:52:01 2015 -0400 +++ b/tools/venn_list/venn_list.xml Thu Feb 02 11:17:31 2017 -0500 @@ -1,157 +1,158 @@ -<tool id="venn_list" name="Venn Diagram" version="0.0.9"> - <description>from lists</description> - <requirements> - <requirement type="python-module">rpy</requirement> - <requirement type="python-module">Bio</requirement> - <requirement type="package" version="1.0.3">rpy</requirement> - <requirement type="package" version="3.25.3">limma</requirement> - <requirement type="package" version="1.65">biopython</requirement> - </requirements> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> - <command interpreter="python"> -venn_list.py -#if $universe.type_select=="implicit": - - - -#else: - "$main" $main.ext -#end if -"$main_lab" -#for $s in $sets: - "$s.set" $s.set.ext "$s.lab" -#end for -$PDF - </command> - <inputs> - <param name="main_lab" size="30" type="text" value="Venn Diagram" label="Plot title"/> - <conditional name="universe"> - <param name="type_select" type="select" label="Implicit or explicit full ID list?"> - <option value="explicit">Explicit</option> - <option value="implicit">Implicit (use union of sets below)</option> - </param> - <when value="explicit"> - <param name="main" type="data" format="tabular,fasta,fastq,sff" label="Full dataset (with all identifiers)" help="Tabular file (uses column one), FASTA, FASTQ or SFF file."/> - </when> - <when value="implicit"/> - </conditional> - <repeat name="sets" min="1" max="3" title="Sets"> - <param name="set" type="data" format="tabular,fasta,fastq,sff" label="Members of set" help="Tabular file (uses column one), FASTA, FASTQ or SFF file."/> - <param name="lab" size="30" type="text" value="Group" label="Caption for set"/> - </repeat> - </inputs> - <outputs> - <data format="pdf" name="PDF" /> - </outputs> - <tests> - <!-- Doesn't seem to work properly, manages to get two sets, both - with same FASTA file, but second with default "Group" label. --> - <test> - <param name="type_select" value="explicit"/> - <param name="main" value="venn_list.tabular" ftype="tabular"/> - <param name="main_lab" value="Some Proteins"/> - <param name="set" value="rhodopsin_proteins.fasta"/> - <param name="lab" value="Rhodopsins"/> - <output name="PDF" file="magic.pdf" ftype="pdf" compare="contains" /> - <assert_stdout> - <has_line line="Doing 1-way Venn Diagram" /> - <has_line line="Total of 10 IDs" /> - <has_line line="6 in Rhodopsins" /> - </assert_stdout> - </test> - <test> - <param name="type_select" value="implicit"/> - <param name="sets_0|set" value="rhodopsin_proteins.fasta"/> - <param name="sets_0|lab" value="Rhodopsins"/> - <param name="sets_1|set" value="four_human_proteins.fasta"/> - <param name="sets_1|lab" value="Human"/> - <param name="sets_2|set" value="blastp_four_human_vs_rhodopsin.tabular"/> - <param name="sets_2|lab" value="Human vs Rhodopsin BLAST"/> - <output name="PDF" file="magic.pdf" ftype="pdf" compare="contains" /> - <assert_stdout> - <has_line line="Doing 3-way Venn Diagram" /> - <has_line line="Inferred total of 10 IDs" /> - <has_line line="6 in Rhodopsins" /> - <has_line line="4 in Human" /> - <has_line line="1 in Human vs Rhodopsin BLAST" /> - </assert_stdout> - </test> - <test expect_failure="true" expect_exit_code="1"> - <param name="type_select" value="explicit"/> - <param name="main" value="venn_list.tabular" ftype="tabular"/> - <param name="main_lab" value="Some Proteins"/> - <param name="sets_0|set" value="rhodopsin_proteins.fasta"/> - <param name="sets_0|lab" value="Rhodopsins"/> - <param name="sets_1|set" value="four_human_proteins.fasta"/> - <param name="sets_1|lab" value="Human"/> - <param name="sets_2|set" value="blastp_four_human_vs_rhodopsin.tabular"/> - <param name="sets_2|lab" value="Human vs Rhodopsin BLAST"/> - <assert_stdout> - <has_line line="Doing 3-way Venn Diagram" /> - <has_line line="Total of 10 IDs" /> - </assert_stdout> - <assert_stderr> - <has_text_matching expression="Unexpected ID sp|Q9BS26|ERP44_HUMAN in fasta file *" /> - </assert_stderr> - </test> - </tests> - <help> - -.. class:: infomark - -**TIP:** If your data is in tabular files, the identifier is assumed to be in column one. - -**What it does** - -Draws Venn Diagram for one, two or three sets (as a PDF file). - -You must supply one, two or three sets of identifiers -- corresponding -to one, two or three circles on the Venn Diagram. - -In general you should also give the full list of all the identifiers -explicitly. This is used to calculate the number of identifers outside -the circles (and check the identifiers in the other files match up). -The full list can be omitted by implicitly taking the union of the -category sets. In this case, the count outside the categories (circles) -will always be zero. - -The identifiers can be taken from the first column of a tabular file -(e.g. query names in BLAST tabular output, or signal peptide predictions -after filtering, etc), or from a sequence file (FASTA, FASTQ, SFF). - -For example, you may have a set of NGS reads (as a FASTA, FASTQ or SFF -file), and the results of several different read mappings (e.g. to -different references) as tabular files (filtered to have just the mapped -reads). You could then show the different mappings (and their overlaps) -as a Venn Diagram, and the outside count would be the unmapped reads. - -**Citations** - -The Venn Diagrams are drawn using Gordon Smyth's limma package from -R/Bioconductor, http://www.bioconductor.org/ - -The R library is called from Python via rpy, http://rpy.sourceforge.net/ - -If you use this Galaxy tool in work leading to a scientific publication please -cite: - -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -This tool uses Biopython to read and write SFF files, so you may also wish to -cite the Biopython application note (and Galaxy too of course): - -Cock et al 2009. Biopython: freely available Python tools for computational -molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. - - </help> - <citations> - <citation type="doi">10.7717/peerj.167</citation> - <citation type="doi">10.1093/bioinformatics/15.5.356</citation> - </citations> -</tool> +<tool id="venn_list" name="Venn Diagram" version="0.0.10"> + <description>from lists</description> + <requirements> + <requirement type="package" version="1.0.1">galaxy_sequence_utils</requirement> + <requirement type="python-module">rpy</requirement> + <requirement type="python-module">Bio</requirement> + <requirement type="package" version="1.0.3">rpy</requirement> + <requirement type="package" version="3.25.3">limma</requirement> + <requirement type="package" version="1.65">biopython</requirement> + </requirements> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> + <command interpreter="python"> +venn_list.py +#if $universe.type_select=="implicit": + - - +#else: + "$main" $main.ext +#end if +"$main_lab" +#for $s in $sets: + "$s.set" $s.set.ext "$s.lab" +#end for +$PDF + </command> + <inputs> + <param name="main_lab" size="30" type="text" value="Venn Diagram" label="Plot title"/> + <conditional name="universe"> + <param name="type_select" type="select" label="Implicit or explicit full ID list?"> + <option value="explicit">Explicit</option> + <option value="implicit">Implicit (use union of sets below)</option> + </param> + <when value="explicit"> + <param name="main" type="data" format="tabular,fasta,fastq,sff" label="Full dataset (with all identifiers)" help="Tabular file (uses column one), FASTA, FASTQ or SFF file."/> + </when> + <when value="implicit"/> + </conditional> + <repeat name="sets" min="1" max="3" title="Sets"> + <param name="set" type="data" format="tabular,fasta,fastq,sff" label="Members of set" help="Tabular file (uses column one), FASTA, FASTQ or SFF file."/> + <param name="lab" size="30" type="text" value="Group" label="Caption for set"/> + </repeat> + </inputs> + <outputs> + <data format="pdf" name="PDF" /> + </outputs> + <tests> + <!-- Doesn't seem to work properly, manages to get two sets, both + with same FASTA file, but second with default "Group" label. --> + <test> + <param name="type_select" value="explicit"/> + <param name="main" value="venn_list.tabular" ftype="tabular"/> + <param name="main_lab" value="Some Proteins"/> + <param name="set" value="rhodopsin_proteins.fasta"/> + <param name="lab" value="Rhodopsins"/> + <output name="PDF" file="magic.pdf" ftype="pdf" compare="contains" /> + <assert_stdout> + <has_line line="Doing 1-way Venn Diagram" /> + <has_line line="Total of 10 IDs" /> + <has_line line="6 in Rhodopsins" /> + </assert_stdout> + </test> + <test> + <param name="type_select" value="implicit"/> + <param name="sets_0|set" value="rhodopsin_proteins.fasta"/> + <param name="sets_0|lab" value="Rhodopsins"/> + <param name="sets_1|set" value="four_human_proteins.fasta"/> + <param name="sets_1|lab" value="Human"/> + <param name="sets_2|set" value="blastp_four_human_vs_rhodopsin.tabular"/> + <param name="sets_2|lab" value="Human vs Rhodopsin BLAST"/> + <output name="PDF" file="magic.pdf" ftype="pdf" compare="contains" /> + <assert_stdout> + <has_line line="Doing 3-way Venn Diagram" /> + <has_line line="Inferred total of 10 IDs" /> + <has_line line="6 in Rhodopsins" /> + <has_line line="4 in Human" /> + <has_line line="1 in Human vs Rhodopsin BLAST" /> + </assert_stdout> + </test> + <test expect_failure="true" expect_exit_code="1"> + <param name="type_select" value="explicit"/> + <param name="main" value="venn_list.tabular" ftype="tabular"/> + <param name="main_lab" value="Some Proteins"/> + <param name="sets_0|set" value="rhodopsin_proteins.fasta"/> + <param name="sets_0|lab" value="Rhodopsins"/> + <param name="sets_1|set" value="four_human_proteins.fasta"/> + <param name="sets_1|lab" value="Human"/> + <param name="sets_2|set" value="blastp_four_human_vs_rhodopsin.tabular"/> + <param name="sets_2|lab" value="Human vs Rhodopsin BLAST"/> + <assert_stdout> + <has_line line="Doing 3-way Venn Diagram" /> + <has_line line="Total of 10 IDs" /> + </assert_stdout> + <assert_stderr> + <has_text_matching expression="Unexpected ID sp|Q9BS26|ERP44_HUMAN in fasta file *" /> + </assert_stderr> + </test> + </tests> + <help> + +.. class:: infomark + +**TIP:** If your data is in tabular files, the identifier is assumed to be in column one. + +**What it does** + +Draws Venn Diagram for one, two or three sets (as a PDF file). + +You must supply one, two or three sets of identifiers -- corresponding +to one, two or three circles on the Venn Diagram. + +In general you should also give the full list of all the identifiers +explicitly. This is used to calculate the number of identifers outside +the circles (and check the identifiers in the other files match up). +The full list can be omitted by implicitly taking the union of the +category sets. In this case, the count outside the categories (circles) +will always be zero. + +The identifiers can be taken from the first column of a tabular file +(e.g. query names in BLAST tabular output, or signal peptide predictions +after filtering, etc), or from a sequence file (FASTA, FASTQ, SFF). + +For example, you may have a set of NGS reads (as a FASTA, FASTQ or SFF +file), and the results of several different read mappings (e.g. to +different references) as tabular files (filtered to have just the mapped +reads). You could then show the different mappings (and their overlaps) +as a Venn Diagram, and the outside count would be the unmapped reads. + +**Citations** + +The Venn Diagrams are drawn using Gordon Smyth's limma package from +R/Bioconductor, http://www.bioconductor.org/ + +The R library is called from Python via rpy, http://rpy.sourceforge.net/ + +If you use this Galaxy tool in work leading to a scientific publication please +cite: + +Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). +Galaxy tools and workflows for sequence analysis with applications +in molecular plant pathology. PeerJ 1:e167 +http://dx.doi.org/10.7717/peerj.167 + +This tool uses Biopython to read and write SFF files, so you may also wish to +cite the Biopython application note (and Galaxy too of course): + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + + </help> + <citations> + <citation type="doi">10.7717/peerj.167</citation> + <citation type="doi">10.1093/bioinformatics/15.5.356</citation> + </citations> +</tool>