Galaxy |

Changeset 0:b35ec780aac1 (2011-09-19)

Next changeset 1:7d26d64539b2 (2011-09-19)

Commit message:
Uploaded

added:
tools/grinder.xml
tools/grinder_multiple_outputs.py
tools/stderr_wrapper.py

diff -r 000000000000 -r b35ec780aac1 tools/grinder.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/grinder.xml Mon Sep 19 01:01:58 2011 -0400

[

b'@@ -0,0 +1,334 @@\n+<tool id="grinder" name="Grinder" version="0.3.7" force_history_refresh="True">\n+\n+ \n+\n+ <description>genomic, metagenomic and amplicon read simulator (BETA)</description>\n+\n+ <requirements>\n+ <requirement type="binary">grinder</requirement>\n+ </requirements>\n+\n+ <version_string>grinder --version</version_string>\n+\n+ <command>\n+ #set $tool_dir = os.path.join( os.path.abspath($__root_dir__), \'tools\', \'ngs_simulation\' )\n+ #set $script1 = os.path.join( $tool_dir, \'stderr_wrapper.py\' )\n+ #set $script2 = os.path.join( $tool_dir, \'grinder_multiple_outputs.py\' )\n+\n+ $script1\n+ grinder\n+ #if $reference_file.specify == "builtin":\n+ -reference_file ${ filter( lambda x: str( x[0] ) == str( $reference_file.value ), $__app__.tool_data_tables[ \'all_fasta\' ].get_fields() )[0][-1] }\n+ #else if $reference_file.specify == "uploaded":\n+ -reference_file $reference_file.value\n+ #end if\n+ #if str($coverage_fold):\n+ -coverage_fold $coverage_fold\n+ #end if\n+ #if str($total_reads):\n+ -total_reads $total_reads\n+ #end if\n+ #if str($read_dist):\n+ -read_dist $read_dist\n+ #end if\n+ #if str($insert_dist):\n+ -insert_dist $insert_dist\n+ #end if\n+ #if str($exclude_chars):\n+ -exclude_chars $exclude_chars\n+ #end if\n+ #if str($delete_chars):\n+ -delete_chars $delete_chars\n+ #end if\n+ #if str($forward_reverse) != "None":\n+ -forward_reverse $forward_reverse\n+ #end if\n+ #if str($unidirectional):\n+ -unidirectional $unidirectional\n+ #end if\n+ #if str($length_bias):\n+ -length_bias $length_bias\n+ #end if\n+ #if str($copy_bias):\n+ -copy_bias $copy_bias\n+ #end if\n+ #if str($mutation_dist):\n+ -mutation_dist $mutation_dist\n+ #end if\n+ #if str($mutation_ratio):\n+ -mutation_ratio $mutation_ratio\n+ #end if\n+ #if str($homopolymer_dist):\n+ -homopolymer_dist $homopolymer_dist\n+ #end if\n+ #if str($chimera_perc):\n+ -chimera_perc $chimera_perc\n+ #end if\n+ #if str($abundance_file) != "None":\n+ -abundance_file 
$abundance_file\n+ #end if\n+ #if str($abundance_model):\n+ -abundance_model $abundance_model\n+ #end if\n+ #if str($num_libraries):\n+ -num_libraries $num_libraries\n+ #end if\n+ #if str($multiplex_ids) != "None":\n+ -multiplex_ids $multiplex_ids\n+ #end if\n+ #if str($diversity):\n+ -diversity $diversity\n+ #end if\n+ #if str($shared_perc):\n+ -shared_perc $shared_perc\n+ #end if\n+ #if str($permuted_perc):\n+ -permuted_perc $permuted_perc\n+ #end if\n+ #if str($random_seed):\n+ -random_seed $random_seed\n+ #end if\n+ #if str($permuted_perc):\n+ -desc_track $desc_track\n+ #end if\n+ #if str($qual_levels):\n+ -qual_levels $qual_levels\n+ #end if\n+ #if str($profile_file) != "None":\n+ -profile_file $profile_file.value\n+ #end if\n+ \n+ \n+ \n+\n+ #set $output_dir = $__new_file_path__\n+ -output_dir $output_dir\n+\n+ #set $base_name = $output.id\n+ -base_name $base_name\n+ ;\n+\n+ $script2 '..b' <conditional/>\n+ <data format="fasta" name="fasta" from_work_dir="grinder-reads.fa" label="${tool.name} reads from ${on_string}" />\n+ <data format="qual" name="qual" from_work_dir="grinder-reads.qual" label="${tool.name} read quals from ${on_string}" >\n+ <filter>(str(qual_levels))</filter>\n+ </data>\n+ </outputs>\n+ -->\n+\n+ <outputs>\n+ <data format="text" name="output" />\n+ </outputs>\n+\n+ <tests>\n+ \n+ \n+\n+ </tests>\n+\n+ <help>\n+\n+**What it does**\n+\n+Grinder is a program to create random shotgun and amplicon sequence libraries\n+based on reference sequences in a FASTA file. 
Features include:\n+\n+ * shotgun library or amplicon library\n+ * arbitrary read length distribution and number of reads\n+ * simulation of PCR and sequencing errors (chimeras, point mutations, homopolymers)\n+ * support for creating paired-end (mate pair) datasets\n+ * specific rank-abundance settings or manually given abundance for each genome\n+ * creation of datasets with a given richness (alpha diversity)\n+ * independent datasets can share a variable number of genomes (beta diversity)\n+ * modeling of the bias created by varying genome lengths or gene copy number\n+ * profile mechanism to store preferred options\n+ * API to automate the creation of a large number of simulated datasets\n+\n+\n+**Input**\n+\n+A variety of FASTA databases containing genes or genomes can be used as input\n+for Grinder, such as the NCBI RefSeq collection (ftp://ftp.ncbi.nih.gov/refseq/release/microbial/),\n+the GreenGenes 16S rRNA database (http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta), the human genome and transcriptome (ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/, ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz), ...\n+\n+These input files can either be provided as a Galaxy dataset, or can be uploaded\n+by Galaxy users in their history.\n+ \n+\n+**Output**\n+\n+For each library requested, a first file contains the abundance of the species\n+in the simulated community created, e.g.::\n+\n+ # rank seqID rel. 
abundance\n+ 1 86715_Lachnospiraceae 0.367936925098555 \n+ 2 6439_Neisseria_polysaccharea 0.183968462549277 \n+ 3 103712_Fusobacterium_nucleatum 0.122645641699518 \n+ 4 103024_Frigoribacterium 0.0919842312746386 \n+ 5 129066_Streptococcus_pyogenes 0.0735873850197109 \n+ 6 106485_Pseudomonas_aeruginosa 0.0613228208497591 \n+ 7 13824_Veillonella_criceti 0.0525624178712221 \n+ 8 28044_Lactosphaera 0.0459921156373193 \n+\n+The second file is a FASTA file containing shotgun or amplicon reads, e.g.::\n+\n+ >1 reference=13824_Veillonella_criceti position=89-1088 strand=+\n+ ACCAACCTGCCCTTCAGAGGGGGATAACAACGGGAAACCGTTGCTAATACCGCGTACGAA\n+ TGGACTTCGGCATCGGAGTTCATTGAAAGGTGGCCTCTATTTATAAGCTATCGCTGAAGG\n+ AGGGGGTTGCGTCTGATTAGCTAGTTGGAGGGGTAATGGCCCACCAAGGCAA\n+\n+ >2 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+\n+ TGAACGAAGAGTTTGATCCTGGCTCAGGATGAACGCTGACAGAATGCTTAACACATGCAA\n+ GTCAACTTGAATTTGGGTTTTTAACTTAGGTTTGGG\n+\n+If you specify the quality score levels option, a third file representing the\n+quality scores of the reads is created::\n+\n+ >1 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+\n+ 30 30 30 10 30 30 ...\n+\n+\n+ </help>\n+\n+</tool>\n+\n'

diff -r 000000000000 -r b35ec780aac1 tools/grinder_multiple_outputs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/grinder_multiple_outputs.py Mon Sep 19 01:01:58 2011 -0400

[

@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+"""
+Move files created by Grinder to a location where they are going to be recognized by
+Galaxy as multiple output files with the right format. See
+http://wiki.g2.bx.psu.edu/Admin/Tools/Multiple Output Files
+Example: python grinder_multiple_outputs.py output_dir output_id
+Author: Florent Angly
+"""
+
+import sys, os, re
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+    # Get output dir and ID
+    args = sys.argv
+    output_dir = args[1]
+    output_id  = args[2]
+
+    # Move Grinder files to the proper output
+    # Grinder filenames look like this
+    #   grinder-ranks.txt
+    #   grinder-reads.fa
+    #   grinder-reads.qual
+    #   grinder-1-ranks.txt
+    #   grinder-1-reads.fa
+    #   grinder-1-reads.qual
+    #   grinder-2-ranks.txt
+    #   grinder-2-reads.fa
+    #   grinder-2-reads.qual
+
+    p = re.compile(output_id)
+    q = re.compile('-(\d+)-')
+    r = re.compile('-(\w+)$')
+
+
+    for fname in os.listdir(output_dir):
+
+        # Skip files that do not start with the output_id
+        source = os.path.join( output_dir, fname )
+        basename, extension = os.path.splitext(fname)
+        if not p.match(fname):
+           continue
+
+        # Assign the dataset format
+        if extension == '.txt':
+           format = 'text'
+        elif extension == '.fa':
+           format = 'fasta'
+        elif extension == '.fna':
+           format = 'fasta'
+        elif extension == '.faa':
+           format = 'fasta'
+        elif extension == '.fasta':
+           format = 'fasta'
+        elif extension == '.fq':
+           format = 'fastq'
+        elif extension == '.fastq':
+           format = 'fastq'
+        elif extension == '.qual':
+           format = 'qual'
+        else:
+           stop_err( 'Error: File %s had the unknown extension %s' % ( fname, extension ) )
+
+        # Assign the dataset name
+        name = ''
+        match = q.search(basename)
+        if match != None:
+          lib_num = match.group(1)
+          name = 'lib%s' % lib_num
+
+        match = r.search(basename)
+        if match == None:
+          stop_err( 'Error: File with basename %s did not have a recognized name' % (basename) )
+
+        lib_type = match.group(1)
+        if format == 'qual':
+          lib_type = 'qual'
+
+        name = name + '-' + lib_type
+
+        # Move the dataset to the proper place
+        optional_spec = 'asdf'
+        destination = os.path.join( output_dir, 'primary_%s_%s_visible_%s_%s' % ( output_id, name, format, optional_spec ) )
+
+        print "moving %s to %s" % (source, destination)
+
+        try:
+          os.rename(source, destination)
+        except Exception, e:
+          stop_err( 'Error: ' + str( e ) )
+
+if __name__ == "__main__": __main__()

diff -r 000000000000 -r b35ec780aac1 tools/stderr_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stderr_wrapper.py Mon Sep 19 01:01:58 2011 -0400

[

@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+"""
+Wrapper that executes a program with its arguments but reports standard error
+messages only if the program exit status was not 0. This is useful to prevent
+Galaxy from interpreting output printed on stderr as an error, e.g. if it was
+simply a warning.
+Example: ./stderr_wrapper.py myprog arg1 -f arg2
+Author: Florent Angly
+"""
+
+import sys, subprocess
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+    # Get command-line arguments
+    args = sys.argv
+    # Remove name of calling program, i.e. ./stderr_wrapper.py
+    args.pop(0)
+    # If there are no arguments left, we're done
+    if len(args) == 0:
+        return
+
+    # If one needs to silence stdout
+    #args.append( ">" )
+    #args.append( "/dev/null" )
+
+    #cmdline = " ".join(args)
+    #print cmdline
+    try:
+        # Run program
+        proc = subprocess.Popen( args=args, shell=False, stderr=subprocess.PIPE )
+        returncode = proc.wait()
+        # Capture stderr, allowing for case where it's very large
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += proc.stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        # Running Grinder failed: write error message to stderr
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        # Running Grinder failed: write error message to stderr
+        stop_err( 'Error: ' + str( e ) )
+
+
+if __name__ == "__main__": __main__()