# HG changeset patch
# User fangly
# Date 1316408848 14400
# Node ID 7d26d64539b2197baf260a9b1f89b9d8b15db567
# Parent b35ec780aac1e8535f35e34a541830aaf4d1676b
Uploaded
diff -r b35ec780aac1 -r 7d26d64539b2 tools/grinder.xml
--- a/tools/grinder.xml Mon Sep 19 01:01:58 2011 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,334 +0,0 @@
-
-
-
-
- genomic, metagenomic and amplicon read simulator (BETA)
-
-
- grinder
-
-
- grinder --version
-
-
- #set $tool_dir = os.path.join( os.path.abspath($__root_dir__), 'tools', 'ngs_simulation' )
- #set $script1 = os.path.join( $tool_dir, 'stderr_wrapper.py' )
- #set $script2 = os.path.join( $tool_dir, 'grinder_multiple_outputs.py' )
-
- $script1
- grinder
- #if $reference_file.specify == "builtin":
- -reference_file ${ filter( lambda x: str( x[0] ) == str( $reference_file.value ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }
- #else if $reference_file.specify == "uploaded":
- -reference_file $reference_file.value
- #end if
- #if str($coverage_fold):
- -coverage_fold $coverage_fold
- #end if
- #if str($total_reads):
- -total_reads $total_reads
- #end if
- #if str($read_dist):
- -read_dist $read_dist
- #end if
- #if str($insert_dist):
- -insert_dist $insert_dist
- #end if
- #if str($exclude_chars):
- -exclude_chars $exclude_chars
- #end if
- #if str($delete_chars):
- -delete_chars $delete_chars
- #end if
- #if str($forward_reverse) != "None":
- -forward_reverse $forward_reverse
- #end if
- #if str($unidirectional):
- -unidirectional $unidirectional
- #end if
- #if str($length_bias):
- -length_bias $length_bias
- #end if
- #if str($copy_bias):
- -copy_bias $copy_bias
- #end if
- #if str($mutation_dist):
- -mutation_dist $mutation_dist
- #end if
- #if str($mutation_ratio):
- -mutation_ratio $mutation_ratio
- #end if
- #if str($homopolymer_dist):
- -homopolymer_dist $homopolymer_dist
- #end if
- #if str($chimera_perc):
- -chimera_perc $chimera_perc
- #end if
- #if str($abundance_file) != "None":
- -abundance_file $abundance_file
- #end if
- #if str($abundance_model):
- -abundance_model $abundance_model
- #end if
- #if str($num_libraries):
- -num_libraries $num_libraries
- #end if
- #if str($multiplex_ids) != "None":
- -multiplex_ids $multiplex_ids
- #end if
- #if str($diversity):
- -diversity $diversity
- #end if
- #if str($shared_perc):
- -shared_perc $shared_perc
- #end if
- #if str($permuted_perc):
- -permuted_perc $permuted_perc
- #end if
- #if str($random_seed):
- -random_seed $random_seed
- #end if
- #if str($permuted_perc):
- -desc_track $desc_track
- #end if
- #if str($qual_levels):
- -qual_levels $qual_levels
- #end if
- #if str($profile_file) != "None":
- -profile_file $profile_file.value
- #end if
-
-
-
-
- #set $output_dir = $__new_file_path__
- -output_dir $output_dir
-
- #set $base_name = $output.id
- -base_name $base_name
- ;
-
- $script2 $output_dir $base_name
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-Grinder is a program to create random shotgun and amplicon sequence libraries
-based on reference sequences in a FASTA file. Features include:
-
- * shotgun library or amplicon library
- * arbitrary read length distribution and number of reads
- * simulation of PCR and sequencing errors (chimeras, point mutations, homopolymers)
- * support for creating paired-end (mate pair) datasets
- * specific rank-abundance settings or manually given abundance for each genome
- * creation of datasets with a given richness (alpha diversity)
- * independent datasets can share a variable number of genomes (beta diversity)
- * modeling of the bias created by varying genome lengths or gene copy number
- * profile mechanism to store preferred options
- * API to automate the creation of a large number of simulated datasets
-
-
-**Input**
-
-A variety of FASTA databases containing genes or genomes can be used as input
-for Grinder, such as the NCBI RefSeq collection (ftp://ftp.ncbi.nih.gov/refseq/release/microbial/),
-the GreenGenes 16S rRNA database (http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta), theh uman genome and transcriptome (ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/, ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz), ...
-
-These input files can either be provided as a Galaxy dataset, or can be uploaded
-by Galaxy users in their history.
-
-
-**Output**
-
-For each library requested, a first file contains the abundance of the species
-in the simulated community created, e.g.::
-
- # rank seqID rel. abundance
- 1 86715_Lachnospiraceae 0.367936925098555
- 2 6439_Neisseria_polysaccharea 0.183968462549277
- 3 103712_Fusobacterium_nucleatum 0.122645641699518
- 4 103024_Frigoribacterium 0.0919842312746386
- 5 129066_Streptococcus_pyogenes 0.0735873850197109
- 6 106485_Pseudomonas_aeruginosa 0.0613228208497591
- 7 13824_Veillonella_criceti 0.0525624178712221
- 8 28044_Lactosphaera 0.0459921156373193
-
-The second file is a FASTA file containing shotgun or amplicon reads, e.g.::
-
- >1 reference=13824_Veillonella_criceti position=89-1088 strand=+
- ACCAACCTGCCCTTCAGAGGGGGATAACAACGGGAAACCGTTGCTAATACCGCGTACGAA
- TGGACTTCGGCATCGGAGTTCATTGAAAGGTGGCCTCTATTTATAAGCTATCGCTGAAGG
- AGGGGGTTGCGTCTGATTAGCTAGTTGGAGGGGTAATGGCCCACCAAGGCAA
-
- >2 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+
- TGAACGAAGAGTTTGATCCTGGCTCAGGATGAACGCTGACAGAATGCTTAACACATGCAA
- GTCAACTTGAATTTGGGTTTTTAACTTAGGTTTGGG
-
-If you specify the quality score levels option, a third file representing the
-quality scores of the reads is created::
-
- >1 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+
- 30 30 30 10 30 30 ...
-
-
-
-
-
-
diff -r b35ec780aac1 -r 7d26d64539b2 tools/grinder_multiple_outputs.py
--- a/tools/grinder_multiple_outputs.py Mon Sep 19 01:01:58 2011 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,98 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Move files create by Grinder to a location where it is going to be recognized by
-Galaxy as multiple output files with the right format. See
-http://wiki.g2.bx.psu.edu/Admin/Tools/Multiple Output Files
-Example: python grinder_move_outputs output_dir output_id
-Author: Florent Angly
-"""
-
-import sys, os, re
-
-assert sys.version_info[:2] >= ( 2, 4 )
-
-def stop_err( msg ):
- sys.stderr.write( "%s\n" % msg )
- sys.exit()
-
-def __main__():
- # Get output dir and ID
- args = sys.argv
- output_dir = args[1]
- output_id = args[2]
-
- # Move Grinder files to the proper output
- # Grinder filenames look like this
- # grinder-ranks.txt
- # grinder-reads.fa
- # grinder-reads.qual
- # grinder-1-ranks.txt
- # grinder-1-reads.fa
- # grinder-1-reads.qual
- # grinder-2-ranks.txt
- # grinder-2-reads.fa
- # grinder-2-reads.qual
-
- p = re.compile(output_id)
- q = re.compile('-(\d+)-')
- r = re.compile('-(\w+)$')
-
-
- for fname in os.listdir(output_dir):
-
- # Skip files that do not start with the output_id
- source = os.path.join( output_dir, fname )
- basename, extension = os.path.splitext(fname)
- if not p.match(fname):
- continue
-
- # Assign the dataset format
- if extension == '.txt':
- format = 'text'
- elif extension == '.fa':
- format = 'fasta'
- elif extension == '.fna':
- format = 'fasta'
- elif extension == '.faa':
- format = 'fasta'
- elif extension == '.fasta':
- format = 'fasta'
- elif extension == '.fq':
- format = 'fastq'
- elif extension == '.fastq':
- format = 'fastq'
- elif extension == '.qual':
- format = 'qual'
- else:
- stop_err( 'Error: File %s had the unknown extension %s' % ( fname, extension ) )
-
- # Assign the dataset name
- name = ''
- match = q.search(basename)
- if match != None:
- lib_num = match.group(1)
- name = 'lib%s' % lib_num
-
- match = r.search(basename)
- if match == None:
- stop_err( 'Error: File with basename %s did not have a recognized name' % (basename) )
-
- lib_type = match.group(1)
- if format == 'qual':
- lib_type = 'qual'
-
- name = name + '-' + lib_type
-
- # Move the dataset to the proper place
- optional_spec = 'asdf'
- destination = os.path.join( output_dir, 'primary_%s_%s_visible_%s_%s' % ( output_id, name, format, optional_spec ) )
-
- print "moving %s to %s" % (source, destination)
-
- try:
- os.rename(source, destination)
- except Exception, e:
- stop_err( 'Error: ' + str( e ) )
-
-if __name__ == "__main__": __main__()
diff -r b35ec780aac1 -r 7d26d64539b2 tools/ngs_simulation/grinder.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_simulation/grinder.xml Mon Sep 19 01:07:28 2011 -0400
@@ -0,0 +1,334 @@
+
+
+
+
+ genomic, metagenomic and amplicon read simulator (BETA)
+
+
+ grinder
+
+
+ grinder --version
+
+
+ #set $tool_dir = os.path.join( os.path.abspath($__root_dir__), 'tools', 'ngs_simulation' )
+ #set $script1 = os.path.join( $tool_dir, 'stderr_wrapper.py' )
+ #set $script2 = os.path.join( $tool_dir, 'grinder_multiple_outputs.py' )
+
+ $script1
+ grinder
+ #if $reference_file.specify == "builtin":
+ -reference_file ${ filter( lambda x: str( x[0] ) == str( $reference_file.value ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }
+ #else if $reference_file.specify == "uploaded":
+ -reference_file $reference_file.value
+ #end if
+ #if str($coverage_fold):
+ -coverage_fold $coverage_fold
+ #end if
+ #if str($total_reads):
+ -total_reads $total_reads
+ #end if
+ #if str($read_dist):
+ -read_dist $read_dist
+ #end if
+ #if str($insert_dist):
+ -insert_dist $insert_dist
+ #end if
+ #if str($exclude_chars):
+ -exclude_chars $exclude_chars
+ #end if
+ #if str($delete_chars):
+ -delete_chars $delete_chars
+ #end if
+ #if str($forward_reverse) != "None":
+ -forward_reverse $forward_reverse
+ #end if
+ #if str($unidirectional):
+ -unidirectional $unidirectional
+ #end if
+ #if str($length_bias):
+ -length_bias $length_bias
+ #end if
+ #if str($copy_bias):
+ -copy_bias $copy_bias
+ #end if
+ #if str($mutation_dist):
+ -mutation_dist $mutation_dist
+ #end if
+ #if str($mutation_ratio):
+ -mutation_ratio $mutation_ratio
+ #end if
+ #if str($homopolymer_dist):
+ -homopolymer_dist $homopolymer_dist
+ #end if
+ #if str($chimera_perc):
+ -chimera_perc $chimera_perc
+ #end if
+ #if str($abundance_file) != "None":
+ -abundance_file $abundance_file
+ #end if
+ #if str($abundance_model):
+ -abundance_model $abundance_model
+ #end if
+ #if str($num_libraries):
+ -num_libraries $num_libraries
+ #end if
+ #if str($multiplex_ids) != "None":
+ -multiplex_ids $multiplex_ids
+ #end if
+ #if str($diversity):
+ -diversity $diversity
+ #end if
+ #if str($shared_perc):
+ -shared_perc $shared_perc
+ #end if
+ #if str($permuted_perc):
+ -permuted_perc $permuted_perc
+ #end if
+ #if str($random_seed):
+ -random_seed $random_seed
+ #end if
+ #if str($permuted_perc):
+ -desc_track $desc_track
+ #end if
+ #if str($qual_levels):
+ -qual_levels $qual_levels
+ #end if
+ #if str($profile_file) != "None":
+ -profile_file $profile_file.value
+ #end if
+
+
+
+
+ #set $output_dir = $__new_file_path__
+ -output_dir $output_dir
+
+ #set $base_name = $output.id
+ -base_name $base_name
+ ;
+
+ $script2 $output_dir $base_name
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Grinder is a program to create random shotgun and amplicon sequence libraries
+based on reference sequences in a FASTA file. Features include:
+
+ * shotgun library or amplicon library
+ * arbitrary read length distribution and number of reads
+ * simulation of PCR and sequencing errors (chimeras, point mutations, homopolymers)
+ * support for creating paired-end (mate pair) datasets
+ * specific rank-abundance settings or manually given abundance for each genome
+ * creation of datasets with a given richness (alpha diversity)
+ * independent datasets can share a variable number of genomes (beta diversity)
+ * modeling of the bias created by varying genome lengths or gene copy number
+ * profile mechanism to store preferred options
+ * API to automate the creation of a large number of simulated datasets
+
+
+**Input**
+
+A variety of FASTA databases containing genes or genomes can be used as input
+for Grinder, such as the NCBI RefSeq collection (ftp://ftp.ncbi.nih.gov/refseq/release/microbial/),
+the GreenGenes 16S rRNA database (http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta), theh uman genome and transcriptome (ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/, ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz), ...
+
+These input files can either be provided as a Galaxy dataset, or can be uploaded
+by Galaxy users in their history.
+
+
+**Output**
+
+For each library requested, a first file contains the abundance of the species
+in the simulated community created, e.g.::
+
+ # rank seqID rel. abundance
+ 1 86715_Lachnospiraceae 0.367936925098555
+ 2 6439_Neisseria_polysaccharea 0.183968462549277
+ 3 103712_Fusobacterium_nucleatum 0.122645641699518
+ 4 103024_Frigoribacterium 0.0919842312746386
+ 5 129066_Streptococcus_pyogenes 0.0735873850197109
+ 6 106485_Pseudomonas_aeruginosa 0.0613228208497591
+ 7 13824_Veillonella_criceti 0.0525624178712221
+ 8 28044_Lactosphaera 0.0459921156373193
+
+The second file is a FASTA file containing shotgun or amplicon reads, e.g.::
+
+ >1 reference=13824_Veillonella_criceti position=89-1088 strand=+
+ ACCAACCTGCCCTTCAGAGGGGGATAACAACGGGAAACCGTTGCTAATACCGCGTACGAA
+ TGGACTTCGGCATCGGAGTTCATTGAAAGGTGGCCTCTATTTATAAGCTATCGCTGAAGG
+ AGGGGGTTGCGTCTGATTAGCTAGTTGGAGGGGTAATGGCCCACCAAGGCAA
+
+ >2 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+
+ TGAACGAAGAGTTTGATCCTGGCTCAGGATGAACGCTGACAGAATGCTTAACACATGCAA
+ GTCAACTTGAATTTGGGTTTTTAACTTAGGTTTGGG
+
+If you specify the quality score levels option, a third file representing the
+quality scores of the reads is created::
+
+ >1 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+
+ 30 30 30 10 30 30 ...
+
+
+
+
+
+
diff -r b35ec780aac1 -r 7d26d64539b2 tools/ngs_simulation/grinder_multiple_outputs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_simulation/grinder_multiple_outputs.py Mon Sep 19 01:07:28 2011 -0400
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+"""
+Move files create by Grinder to a location where it is going to be recognized by
+Galaxy as multiple output files with the right format. See
+http://wiki.g2.bx.psu.edu/Admin/Tools/Multiple Output Files
+Example: python grinder_move_outputs output_dir output_id
+Author: Florent Angly
+"""
+
+import sys, os, re
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+ sys.stderr.write( "%s\n" % msg )
+ sys.exit()
+
+def __main__():
+ # Get output dir and ID
+ args = sys.argv
+ output_dir = args[1]
+ output_id = args[2]
+
+ # Move Grinder files to the proper output
+ # Grinder filenames look like this
+ # grinder-ranks.txt
+ # grinder-reads.fa
+ # grinder-reads.qual
+ # grinder-1-ranks.txt
+ # grinder-1-reads.fa
+ # grinder-1-reads.qual
+ # grinder-2-ranks.txt
+ # grinder-2-reads.fa
+ # grinder-2-reads.qual
+
+ p = re.compile(output_id)
+ q = re.compile('-(\d+)-')
+ r = re.compile('-(\w+)$')
+
+
+ for fname in os.listdir(output_dir):
+
+ # Skip files that do not start with the output_id
+ source = os.path.join( output_dir, fname )
+ basename, extension = os.path.splitext(fname)
+ if not p.match(fname):
+ continue
+
+ # Assign the dataset format
+ if extension == '.txt':
+ format = 'text'
+ elif extension == '.fa':
+ format = 'fasta'
+ elif extension == '.fna':
+ format = 'fasta'
+ elif extension == '.faa':
+ format = 'fasta'
+ elif extension == '.fasta':
+ format = 'fasta'
+ elif extension == '.fq':
+ format = 'fastq'
+ elif extension == '.fastq':
+ format = 'fastq'
+ elif extension == '.qual':
+ format = 'qual'
+ else:
+ stop_err( 'Error: File %s had the unknown extension %s' % ( fname, extension ) )
+
+ # Assign the dataset name
+ name = ''
+ match = q.search(basename)
+ if match != None:
+ lib_num = match.group(1)
+ name = 'lib%s' % lib_num
+
+ match = r.search(basename)
+ if match == None:
+ stop_err( 'Error: File with basename %s did not have a recognized name' % (basename) )
+
+ lib_type = match.group(1)
+ if format == 'qual':
+ lib_type = 'qual'
+
+ name = name + '-' + lib_type
+
+ # Move the dataset to the proper place
+ optional_spec = 'asdf'
+ destination = os.path.join( output_dir, 'primary_%s_%s_visible_%s_%s' % ( output_id, name, format, optional_spec ) )
+
+ print "moving %s to %s" % (source, destination)
+
+ try:
+ os.rename(source, destination)
+ except Exception, e:
+ stop_err( 'Error: ' + str( e ) )
+
+if __name__ == "__main__": __main__()
diff -r b35ec780aac1 -r 7d26d64539b2 tools/ngs_simulation/stderr_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_simulation/stderr_wrapper.py Mon Sep 19 01:07:28 2011 -0400
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+"""
+Wrapper that executes a program with its arguments but reports standard error
+messages only if the program exit status was not 0. This is useful to prevent
+Galaxy to interpret that there was an error if something was printed on stderr,
+e.g. if this was simply a warning.
+Example: ./stderr_wrapper.py myprog arg1 -f arg2
+Author: Florent Angly
+"""
+
+import sys, subprocess
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+ sys.stderr.write( "%s\n" % msg )
+ sys.exit()
+
+def __main__():
+ # Get command-line arguments
+ args = sys.argv
+ # Remove name of calling program, i.e. ./stderr_wrapper.py
+ args.pop(0)
+ # If there are no arguments left, we're done
+ if len(args) == 0:
+ return
+
+ # If one needs to silence stdout
+ #args.append( ">" )
+ #args.append( "/dev/null" )
+
+ #cmdline = " ".join(args)
+ #print cmdline
+ try:
+ # Run program
+ proc = subprocess.Popen( args=args, shell=False, stderr=subprocess.PIPE )
+ returncode = proc.wait()
+ # Capture stderr, allowing for case where it's very large
+ stderr = ''
+ buffsize = 1048576
+ try:
+ while True:
+ stderr += proc.stderr.read( buffsize )
+ if not stderr or len( stderr ) % buffsize != 0:
+ break
+ except OverflowError:
+ pass
+ # Running Grinder failed: write error message to stderr
+ if returncode != 0:
+ raise Exception, stderr
+ except Exception, e:
+ # Running Grinder failed: write error message to stderr
+ stop_err( 'Error: ' + str( e ) )
+
+
+if __name__ == "__main__": __main__()
diff -r b35ec780aac1 -r 7d26d64539b2 tools/stderr_wrapper.py
--- a/tools/stderr_wrapper.py Mon Sep 19 01:01:58 2011 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,57 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Wrapper that executes a program with its arguments but reports standard error
-messages only if the program exit status was not 0. This is useful to prevent
-Galaxy to interpret that there was an error if something was printed on stderr,
-e.g. if this was simply a warning.
-Example: ./stderr_wrapper.py myprog arg1 -f arg2
-Author: Florent Angly
-"""
-
-import sys, subprocess
-
-assert sys.version_info[:2] >= ( 2, 4 )
-
-def stop_err( msg ):
- sys.stderr.write( "%s\n" % msg )
- sys.exit()
-
-def __main__():
- # Get command-line arguments
- args = sys.argv
- # Remove name of calling program, i.e. ./stderr_wrapper.py
- args.pop(0)
- # If there are no arguments left, we're done
- if len(args) == 0:
- return
-
- # If one needs to silence stdout
- #args.append( ">" )
- #args.append( "/dev/null" )
-
- #cmdline = " ".join(args)
- #print cmdline
- try:
- # Run program
- proc = subprocess.Popen( args=args, shell=False, stderr=subprocess.PIPE )
- returncode = proc.wait()
- # Capture stderr, allowing for case where it's very large
- stderr = ''
- buffsize = 1048576
- try:
- while True:
- stderr += proc.stderr.read( buffsize )
- if not stderr or len( stderr ) % buffsize != 0:
- break
- except OverflowError:
- pass
- # Running Grinder failed: write error message to stderr
- if returncode != 0:
- raise Exception, stderr
- except Exception, e:
- # Running Grinder failed: write error message to stderr
- stop_err( 'Error: ' + str( e ) )
-
-
-if __name__ == "__main__": __main__()