Repository 'grinder'
hg clone https://toolshed.g2.bx.psu.edu/repos/fangly/grinder

Changeset 0:b35ec780aac1 (2011-09-19)
Next changeset 1:7d26d64539b2 (2011-09-19)
Commit message:
Uploaded
added:
tools/grinder.xml
tools/grinder_multiple_outputs.py
tools/stderr_wrapper.py
b
diff -r 000000000000 -r b35ec780aac1 tools/grinder.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/grinder.xml Mon Sep 19 01:01:58 2011 -0400
[
b'@@ -0,0 +1,334 @@\n+<tool id="grinder" name="Grinder" version="0.3.7" force_history_refresh="True">\n+\n+  <!--\n+    Author: florent.angly@gmail.com\n+    TODO:\n+    \xe2\x80\xa2 See bfast tool (tools/sr_mapping/bfast_wrapper.xml) for how to use datatables easily\n+    \xe2\x80\xa2 Basic tests\n+    \xe2\x80\xa2 Link to full manual\n+    \xe2\x80\xa2 Better sync with Grinder parameters, defaults and help\n+  -->\n+\n+  <description>genomic, metagenomic and amplicon read simulator (BETA)</description>\n+\n+  <requirements>\n+    <requirement type="binary">grinder</requirement>\n+  </requirements>\n+\n+  <version_string>grinder --version</version_string>\n+\n+  <command>\n+    #set $tool_dir = os.path.join( os.path.abspath($__root_dir__), \'tools\', \'ngs_simulation\' )\n+    #set $script1  = os.path.join( $tool_dir, \'stderr_wrapper.py\' )\n+    #set $script2  = os.path.join( $tool_dir, \'grinder_multiple_outputs.py\' )\n+\n+    $script1\n+      grinder\n+      #if $reference_file.specify == "builtin":\n+        -reference_file   ${ filter( lambda x: str( x[0] ) == str( $reference_file.value ), $__app__.tool_data_tables[ \'all_fasta\' ].get_fields() )[0][-1] }\n+      #else if $reference_file.specify == "uploaded":\n+        -reference_file   $reference_file.value\n+      #end if\n+      #if str($coverage_fold):\n+        -coverage_fold    $coverage_fold\n+      #end if\n+      #if str($total_reads):\n+        -total_reads      $total_reads\n+      #end if\n+      #if str($read_dist):\n+        -read_dist        $read_dist\n+      #end if\n+      #if str($insert_dist):\n+        -insert_dist      $insert_dist\n+      #end if\n+      #if str($exclude_chars):\n+        -exclude_chars    $exclude_chars\n+      #end if\n+      #if str($delete_chars):\n+        -delete_chars     $delete_chars\n+      #end if\n+      #if str($forward_reverse) != "None":\n+        -forward_reverse  $forward_reverse\n+      #end if\n+      #if str($unidirectional):\n+        
-unidirectional   $unidirectional\n+      #end if\n+      #if str($length_bias):\n+        -length_bias      $length_bias\n+      #end if\n+      #if str($copy_bias):\n+        -copy_bias        $copy_bias\n+      #end if\n+      #if str($mutation_dist):\n+        -mutation_dist    $mutation_dist\n+      #end if\n+      #if str($mutation_ratio):\n+        -mutation_ratio   $mutation_ratio\n+      #end if\n+      #if str($homopolymer_dist):\n+        -homopolymer_dist $homopolymer_dist\n+      #end if\n+      #if str($chimera_perc):\n+        -chimera_perc     $chimera_perc\n+      #end if\n+      #if str($abundance_file) != "None":\n+        -abundance_file   $abundance_file\n+      #end if\n+      #if str($abundance_model):\n+        -abundance_model  $abundance_model\n+      #end if\n+      #if str($num_libraries):\n+        -num_libraries    $num_libraries\n+      #end if\n+      #if str($multiplex_ids) != "None":\n+        -multiplex_ids    $multiplex_ids\n+      #end if\n+      #if str($diversity):\n+        -diversity        $diversity\n+      #end if\n+      #if str($shared_perc):\n+        -shared_perc      $shared_perc\n+      #end if\n+      #if str($permuted_perc):\n+        -permuted_perc    $permuted_perc\n+      #end if\n+      #if str($random_seed):\n+        -random_seed      $random_seed\n+      #end if\n+      #if str($permuted_perc):\n+        -desc_track       $desc_track\n+      #end if\n+      #if str($qual_levels):\n+        -qual_levels      $qual_levels\n+      #end if\n+      #if str($profile_file) != "None":\n+        -profile_file     $profile_file.value\n+      #end if\n+      <!-- When Galaxy bug #661 is resolved, then we can use the same method to check for all optional argument  -->\n+      <!-- i.e. 
either   if str($param) != "None":   or   if str($param):                                        -->\n+      <!-- URL: https://bitbucket.org/galaxy/galaxy-central/issue/661/optional-arguments-problems#comment-655611 -->\n+\n+      #set $output_dir = $__new_file_path__\n+      -output_dir         $output_dir\n+\n+      #set $base_name  = $output.id\n+      -base_name          $base_name\n+    ;\n+\n+    $script2 '..b' <conditional/>\n+    <data format="fasta"   name="fasta" from_work_dir="grinder-reads.fa"   label="${tool.name} reads from ${on_string}"      />\n+    <data format="qual"    name="qual"  from_work_dir="grinder-reads.qual" label="${tool.name} read quals from ${on_string}"  >\n+      <filter>(str(qual_levels))</filter>\n+    </data>\n+  </outputs>\n+  -->\n+\n+  <outputs>\n+    <data format="text" name="output" />\n+  </outputs>\n+\n+  <tests>\n+    <!-- no tests since they would not not always return the same results -->\n+    <!--\n+    <test>\n+      <param name="specify" value="uploaded" />\n+      <param name="value" value="ngs_simulation_in1.fasta" ftype="fasta" />\n+      <output name="ranks" file="" />\n+      <output name="fasta" file="" />\n+      <output name="qual" file="" />\n+    </test>\n+\n+    <test>\n+      <param name="specify" value="builtin" />\n+      <param name="builtin" value="pUC18" />\n+      <output name="ranks" file="" />\n+      <output name="fasta" file="" />\n+      <output name="qual" file="" />\n+    </test>\n+    -->\n+\n+  </tests>\n+\n+  <help>\n+\n+**What it does**\n+\n+Grinder is a program to create random shotgun and amplicon sequence libraries\n+based on reference sequences in a FASTA file. 
Features include:\n+\n+  * shotgun library or amplicon library\n+  * arbitrary read length distribution and number of reads\n+  * simulation of PCR and sequencing errors (chimeras, point mutations, homopolymers)\n+  * support for creating paired-end (mate pair) datasets\n+  * specific rank-abundance settings or manually given abundance for each genome\n+  * creation of datasets with a given richness (alpha diversity)\n+  * independent datasets can share a variable number of genomes (beta diversity)\n+  * modeling of the bias created by varying genome lengths or gene copy number\n+  * profile mechanism to store preferred options\n+  * API to automate the creation of a large number of simulated datasets\n+\n+\n+**Input**\n+\n+A variety of FASTA databases containing genes or genomes can be used as input\n+for Grinder, such as the NCBI RefSeq collection (ftp://ftp.ncbi.nih.gov/refseq/release/microbial/),\n+the GreenGenes 16S rRNA database (http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta), the human genome and transcriptome (ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/, ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz), ...\n+\n+These input files can either be provided as a Galaxy dataset, or can be uploaded\n+by Galaxy users in their history.\n+ \n+\n+**Output**\n+\n+For each library requested, a first file contains the abundance of the species\n+in the simulated community created, e.g.::\n+\n+  # rank  seqID                           rel. 
abundance\n+  1       86715_Lachnospiraceae           0.367936925098555 \n+  2       6439_Neisseria_polysaccharea    0.183968462549277 \n+  3       103712_Fusobacterium_nucleatum  0.122645641699518 \n+  4       103024_Frigoribacterium         0.0919842312746386 \n+  5       129066_Streptococcus_pyogenes   0.0735873850197109 \n+  6       106485_Pseudomonas_aeruginosa   0.0613228208497591 \n+  7       13824_Veillonella_criceti       0.0525624178712221 \n+  8       28044_Lactosphaera              0.0459921156373193 \n+\n+The second file is a FASTA file containing shotgun or amplicon reads, e.g.::\n+\n+  >1 reference=13824_Veillonella_criceti position=89-1088 strand=+\n+  ACCAACCTGCCCTTCAGAGGGGGATAACAACGGGAAACCGTTGCTAATACCGCGTACGAA\n+  TGGACTTCGGCATCGGAGTTCATTGAAAGGTGGCCTCTATTTATAAGCTATCGCTGAAGG\n+  AGGGGGTTGCGTCTGATTAGCTAGTTGGAGGGGTAATGGCCCACCAAGGCAA\n+\n+  >2 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+\n+  TGAACGAAGAGTTTGATCCTGGCTCAGGATGAACGCTGACAGAATGCTTAACACATGCAA\n+  GTCAACTTGAATTTGGGTTTTTAACTTAGGTTTGGG\n+\n+If you specify the quality score levels option, a third file representing the\n+quality scores of the reads is created::\n+\n+  >1 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+\n+  30 30 30 10 30 30 ...\n+\n+\n+  </help>\n+\n+</tool>\n+\n'
b
diff -r 000000000000 -r b35ec780aac1 tools/grinder_multiple_outputs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/grinder_multiple_outputs.py Mon Sep 19 01:01:58 2011 -0400
[
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+"""
+Move files created by Grinder to a location where they are going to be recognized by
+Galaxy as multiple output files with the right format. See
+http://wiki.g2.bx.psu.edu/Admin/Tools/Multiple Output Files
+Example: python grinder_multiple_outputs.py output_dir output_id
+Author: Florent Angly
+"""
+
+import sys, os, re
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+    # Get output dir and ID
+    args = sys.argv
+    output_dir = args[1]
+    output_id  = args[2]
+
+    # Move Grinder files to the proper output
+    # Grinder filenames look like this
+    #   grinder-ranks.txt
+    #   grinder-reads.fa
+    #   grinder-reads.qual
+    #   grinder-1-ranks.txt
+    #   grinder-1-reads.fa
+    #   grinder-1-reads.qual
+    #   grinder-2-ranks.txt
+    #   grinder-2-reads.fa
+    #   grinder-2-reads.qual
+
+    p = re.compile(output_id)
+    q = re.compile('-(\d+)-')
+    r = re.compile('-(\w+)$')
+    
+
+    for fname in os.listdir(output_dir):
+
+        # Skip files that do not start with the output_id
+        source = os.path.join( output_dir, fname )
+        basename, extension = os.path.splitext(fname)
+        if not p.match(fname):
+           continue
+
+        # Assign the dataset format
+        if extension == '.txt': 
+           format = 'text'
+        elif extension == '.fa':
+           format = 'fasta'
+        elif extension == '.fna':
+           format = 'fasta'
+        elif extension == '.faa':
+           format = 'fasta'
+        elif extension == '.fasta':
+           format = 'fasta'
+        elif extension == '.fq':
+           format = 'fastq'
+        elif extension == '.fastq':
+           format = 'fastq'
+        elif extension == '.qual':
+           format = 'qual'
+        else:
+           stop_err( 'Error: File %s had the unknown extension %s' % ( fname, extension ) )
+        
+        # Assign the dataset name
+        name = ''
+        match = q.search(basename)
+        if match != None:
+          lib_num = match.group(1)
+          name = 'lib%s' % lib_num
+
+        match = r.search(basename)
+        if match == None:
+          stop_err( 'Error: File with basename %s did not have a recognized name' % (basename) )
+        
+        lib_type = match.group(1)
+        if format == 'qual':
+          lib_type = 'qual'
+
+        name = name + '-' + lib_type        
+
+        # Move the dataset to the proper place
+        optional_spec = 'asdf'
+        destination = os.path.join( output_dir, 'primary_%s_%s_visible_%s_%s' % ( output_id, name, format, optional_spec ) )
+
+        print "moving %s to %s" % (source, destination)
+
+        try:
+          os.rename(source, destination)
+        except Exception, e:
+          stop_err( 'Error: ' + str( e ) )
+
+if __name__ == "__main__": __main__()
b
diff -r 000000000000 -r b35ec780aac1 tools/stderr_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stderr_wrapper.py Mon Sep 19 01:01:58 2011 -0400
[
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+"""
+Wrapper that executes a program with its arguments but reports standard error
+messages only if the program exit status was not 0. This is useful to prevent
+Galaxy from interpreting anything printed on stderr as an error, e.g. if this
+was simply a warning.
+Example: ./stderr_wrapper.py myprog arg1 -f arg2
+Author: Florent Angly
+"""
+
+import sys, subprocess
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+    # Get command-line arguments
+    args = sys.argv
+    # Remove name of calling program, i.e. ./stderr_wrapper.py
+    args.pop(0)
+    # If there are no arguments left, we're done
+    if len(args) == 0:
+        return
+   
+    # If one needs to silence stdout 
+    #args.append( ">" )
+    #args.append( "/dev/null" )
+
+    #cmdline = " ".join(args)
+    #print cmdline
+    try:
+        # Run program
+        proc = subprocess.Popen( args=args, shell=False, stderr=subprocess.PIPE )
+        returncode = proc.wait()
+        # Capture stderr, allowing for case where it's very large
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += proc.stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        # Running Grinder failed: write error message to stderr
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        # Running Grinder failed: write error message to stderr
+        stop_err( 'Error: ' + str( e ) )
+
+
+if __name__ == "__main__": __main__()