# HG changeset patch # User fangly # Date 1316408848 14400 # Node ID 7d26d64539b2197baf260a9b1f89b9d8b15db567 # Parent b35ec780aac1e8535f35e34a541830aaf4d1676b Uploaded diff -r b35ec780aac1 -r 7d26d64539b2 tools/grinder.xml --- a/tools/grinder.xml Mon Sep 19 01:01:58 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,334 +0,0 @@ - - - - - genomic, metagenomic and amplicon read simulator (BETA) - - - grinder - - - grinder --version - - - #set $tool_dir = os.path.join( os.path.abspath($__root_dir__), 'tools', 'ngs_simulation' ) - #set $script1 = os.path.join( $tool_dir, 'stderr_wrapper.py' ) - #set $script2 = os.path.join( $tool_dir, 'grinder_multiple_outputs.py' ) - - $script1 - grinder - #if $reference_file.specify == "builtin": - -reference_file ${ filter( lambda x: str( x[0] ) == str( $reference_file.value ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] } - #else if $reference_file.specify == "uploaded": - -reference_file $reference_file.value - #end if - #if str($coverage_fold): - -coverage_fold $coverage_fold - #end if - #if str($total_reads): - -total_reads $total_reads - #end if - #if str($read_dist): - -read_dist $read_dist - #end if - #if str($insert_dist): - -insert_dist $insert_dist - #end if - #if str($exclude_chars): - -exclude_chars $exclude_chars - #end if - #if str($delete_chars): - -delete_chars $delete_chars - #end if - #if str($forward_reverse) != "None": - -forward_reverse $forward_reverse - #end if - #if str($unidirectional): - -unidirectional $unidirectional - #end if - #if str($length_bias): - -length_bias $length_bias - #end if - #if str($copy_bias): - -copy_bias $copy_bias - #end if - #if str($mutation_dist): - -mutation_dist $mutation_dist - #end if - #if str($mutation_ratio): - -mutation_ratio $mutation_ratio - #end if - #if str($homopolymer_dist): - -homopolymer_dist $homopolymer_dist - #end if - #if str($chimera_perc): - -chimera_perc $chimera_perc - #end if - #if str($abundance_file) != "None": - -abundance_file $abundance_file - #end if - #if str($abundance_model): - -abundance_model $abundance_model - #end if - #if str($num_libraries): - -num_libraries $num_libraries - #end if - #if str($multiplex_ids) != "None": - -multiplex_ids $multiplex_ids - #end if - #if str($diversity): - -diversity $diversity - #end if - #if str($shared_perc): - -shared_perc $shared_perc - #end if - #if str($permuted_perc): - -permuted_perc $permuted_perc - #end if - #if str($random_seed): - -random_seed $random_seed - #end if - #if str($permuted_perc): - -desc_track $desc_track - #end if - #if str($qual_levels): - -qual_levels $qual_levels - #end if - #if str($profile_file) != "None": - -profile_file $profile_file.value - #end if - - - - - #set $output_dir = $__new_file_path__ - -output_dir $output_dir - - #set $base_name = $output.id - -base_name $base_name - ; - - $script2 $output_dir $base_name - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Grinder is a program to create random shotgun and amplicon sequence libraries -based on reference sequences in a FASTA file. Features include: - - * shotgun library or amplicon library - * arbitrary read length distribution and number of reads - * simulation of PCR and sequencing errors (chimeras, point mutations, homopolymers) - * support for creating paired-end (mate pair) datasets - * specific rank-abundance settings or manually given abundance for each genome - * creation of datasets with a given richness (alpha diversity) - * independent datasets can share a variable number of genomes (beta diversity) - * modeling of the bias created by varying genome lengths or gene copy number - * profile mechanism to store preferred options - * API to automate the creation of a large number of simulated datasets - - -**Input** - -A variety of FASTA databases containing genes or genomes can be used as input -for Grinder, such as the NCBI RefSeq collection (ftp://ftp.ncbi.nih.gov/refseq/release/microbial/), -the GreenGenes 16S rRNA database (http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta), theh uman genome and transcriptome (ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/, ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz), ... - -These input files can either be provided as a Galaxy dataset, or can be uploaded -by Galaxy users in their history. - - -**Output** - -For each library requested, a first file contains the abundance of the species -in the simulated community created, e.g.:: - - # rank seqID rel. abundance - 1 86715_Lachnospiraceae 0.367936925098555 - 2 6439_Neisseria_polysaccharea 0.183968462549277 - 3 103712_Fusobacterium_nucleatum 0.122645641699518 - 4 103024_Frigoribacterium 0.0919842312746386 - 5 129066_Streptococcus_pyogenes 0.0735873850197109 - 6 106485_Pseudomonas_aeruginosa 0.0613228208497591 - 7 13824_Veillonella_criceti 0.0525624178712221 - 8 28044_Lactosphaera 0.0459921156373193 - -The second file is a FASTA file containing shotgun or amplicon reads, e.g.:: - - >1 reference=13824_Veillonella_criceti position=89-1088 strand=+ - ACCAACCTGCCCTTCAGAGGGGGATAACAACGGGAAACCGTTGCTAATACCGCGTACGAA - TGGACTTCGGCATCGGAGTTCATTGAAAGGTGGCCTCTATTTATAAGCTATCGCTGAAGG - AGGGGGTTGCGTCTGATTAGCTAGTTGGAGGGGTAATGGCCCACCAAGGCAA - - >2 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+ - TGAACGAAGAGTTTGATCCTGGCTCAGGATGAACGCTGACAGAATGCTTAACACATGCAA - GTCAACTTGAATTTGGGTTTTTAACTTAGGTTTGGG - -If you specify the quality score levels option, a third file representing the -quality scores of the reads is created:: - - >1 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+ - 30 30 30 10 30 30 ... - - - - - - diff -r b35ec780aac1 -r 7d26d64539b2 tools/grinder_multiple_outputs.py --- a/tools/grinder_multiple_outputs.py Mon Sep 19 01:01:58 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -#!/usr/bin/env python - -""" -Move files create by Grinder to a location where it is going to be recognized by -Galaxy as multiple output files with the right format. See -http://wiki.g2.bx.psu.edu/Admin/Tools/Multiple Output Files -Example: python grinder_move_outputs output_dir output_id -Author: Florent Angly -""" - -import sys, os, re - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - # Get output dir and ID - args = sys.argv - output_dir = args[1] - output_id = args[2] - - # Move Grinder files to the proper output - # Grinder filenames look like this - # grinder-ranks.txt - # grinder-reads.fa - # grinder-reads.qual - # grinder-1-ranks.txt - # grinder-1-reads.fa - # grinder-1-reads.qual - # grinder-2-ranks.txt - # grinder-2-reads.fa - # grinder-2-reads.qual - - p = re.compile(output_id) - q = re.compile('-(\d+)-') - r = re.compile('-(\w+)$') - - - for fname in os.listdir(output_dir): - - # Skip files that do not start with the output_id - source = os.path.join( output_dir, fname ) - basename, extension = os.path.splitext(fname) - if not p.match(fname): - continue - - # Assign the dataset format - if extension == '.txt': - format = 'text' - elif extension == '.fa': - format = 'fasta' - elif extension == '.fna': - format = 'fasta' - elif extension == '.faa': - format = 'fasta' - elif extension == '.fasta': - format = 'fasta' - elif extension == '.fq': - format = 'fastq' - elif extension == '.fastq': - format = 'fastq' - elif extension == '.qual': - format = 'qual' - else: - stop_err( 'Error: File %s had the unknown extension %s' % ( fname, extension ) ) - - # Assign the dataset name - name = '' - match = q.search(basename) - if match != None: - lib_num = match.group(1) - name = 'lib%s' % lib_num - - match = r.search(basename) - if match == None: - stop_err( 'Error: File with basename %s did not have a recognized name' % (basename) ) - - lib_type = match.group(1) - if format == 'qual': - lib_type = 'qual' - - name = name + '-' + lib_type - - # Move the dataset to the proper place - optional_spec = 'asdf' - destination = os.path.join( output_dir, 'primary_%s_%s_visible_%s_%s' % ( output_id, name, format, optional_spec ) ) - - print "moving %s to %s" % (source, destination) - - try: - os.rename(source, destination) - except Exception, e: - stop_err( 'Error: ' + str( e ) ) - -if __name__ == "__main__": __main__() diff -r b35ec780aac1 -r 7d26d64539b2 tools/ngs_simulation/grinder.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ngs_simulation/grinder.xml Mon Sep 19 01:07:28 2011 -0400 @@ -0,0 +1,334 @@ + + + + + genomic, metagenomic and amplicon read simulator (BETA) + + + grinder + + + grinder --version + + + #set $tool_dir = os.path.join( os.path.abspath($__root_dir__), 'tools', 'ngs_simulation' ) + #set $script1 = os.path.join( $tool_dir, 'stderr_wrapper.py' ) + #set $script2 = os.path.join( $tool_dir, 'grinder_multiple_outputs.py' ) + + $script1 + grinder + #if $reference_file.specify == "builtin": + -reference_file ${ filter( lambda x: str( x[0] ) == str( $reference_file.value ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] } + #else if $reference_file.specify == "uploaded": + -reference_file $reference_file.value + #end if + #if str($coverage_fold): + -coverage_fold $coverage_fold + #end if + #if str($total_reads): + -total_reads $total_reads + #end if + #if str($read_dist): + -read_dist $read_dist + #end if + #if str($insert_dist): + -insert_dist $insert_dist + #end if + #if str($exclude_chars): + -exclude_chars $exclude_chars + #end if + #if str($delete_chars): + -delete_chars $delete_chars + #end if + #if str($forward_reverse) != "None": + -forward_reverse $forward_reverse + #end if + #if str($unidirectional): + -unidirectional $unidirectional + #end if + #if str($length_bias): + -length_bias $length_bias + #end if + #if str($copy_bias): + -copy_bias $copy_bias + #end if + #if str($mutation_dist): + -mutation_dist $mutation_dist + #end if + #if str($mutation_ratio): + -mutation_ratio $mutation_ratio + #end if + #if str($homopolymer_dist): + -homopolymer_dist $homopolymer_dist + #end if + #if str($chimera_perc): + -chimera_perc $chimera_perc + #end if + #if str($abundance_file) != "None": + -abundance_file $abundance_file + #end if + #if str($abundance_model): + -abundance_model $abundance_model + #end if + #if str($num_libraries): + -num_libraries $num_libraries + #end if + #if str($multiplex_ids) != "None": + -multiplex_ids $multiplex_ids + #end if + #if str($diversity): + -diversity $diversity + #end if + #if str($shared_perc): + -shared_perc $shared_perc + #end if + #if str($permuted_perc): + -permuted_perc $permuted_perc + #end if + #if str($random_seed): + -random_seed $random_seed + #end if + #if str($permuted_perc): + -desc_track $desc_track + #end if + #if str($qual_levels): + -qual_levels $qual_levels + #end if + #if str($profile_file) != "None": + -profile_file $profile_file.value + #end if + + + + + #set $output_dir = $__new_file_path__ + -output_dir $output_dir + + #set $base_name = $output.id + -base_name $base_name + ; + + $script2 $output_dir $base_name + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Grinder is a program to create random shotgun and amplicon sequence libraries +based on reference sequences in a FASTA file. Features include: + + * shotgun library or amplicon library + * arbitrary read length distribution and number of reads + * simulation of PCR and sequencing errors (chimeras, point mutations, homopolymers) + * support for creating paired-end (mate pair) datasets + * specific rank-abundance settings or manually given abundance for each genome + * creation of datasets with a given richness (alpha diversity) + * independent datasets can share a variable number of genomes (beta diversity) + * modeling of the bias created by varying genome lengths or gene copy number + * profile mechanism to store preferred options + * API to automate the creation of a large number of simulated datasets + + +**Input** + +A variety of FASTA databases containing genes or genomes can be used as input +for Grinder, such as the NCBI RefSeq collection (ftp://ftp.ncbi.nih.gov/refseq/release/microbial/), +the GreenGenes 16S rRNA database (http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta), theh uman genome and transcriptome (ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/, ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz), ... + +These input files can either be provided as a Galaxy dataset, or can be uploaded +by Galaxy users in their history. + + +**Output** + +For each library requested, a first file contains the abundance of the species +in the simulated community created, e.g.:: + + # rank seqID rel. abundance + 1 86715_Lachnospiraceae 0.367936925098555 + 2 6439_Neisseria_polysaccharea 0.183968462549277 + 3 103712_Fusobacterium_nucleatum 0.122645641699518 + 4 103024_Frigoribacterium 0.0919842312746386 + 5 129066_Streptococcus_pyogenes 0.0735873850197109 + 6 106485_Pseudomonas_aeruginosa 0.0613228208497591 + 7 13824_Veillonella_criceti 0.0525624178712221 + 8 28044_Lactosphaera 0.0459921156373193 + +The second file is a FASTA file containing shotgun or amplicon reads, e.g.:: + + >1 reference=13824_Veillonella_criceti position=89-1088 strand=+ + ACCAACCTGCCCTTCAGAGGGGGATAACAACGGGAAACCGTTGCTAATACCGCGTACGAA + TGGACTTCGGCATCGGAGTTCATTGAAAGGTGGCCTCTATTTATAAGCTATCGCTGAAGG + AGGGGGTTGCGTCTGATTAGCTAGTTGGAGGGGTAATGGCCCACCAAGGCAA + + >2 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+ + TGAACGAAGAGTTTGATCCTGGCTCAGGATGAACGCTGACAGAATGCTTAACACATGCAA + GTCAACTTGAATTTGGGTTTTTAACTTAGGTTTGGG + +If you specify the quality score levels option, a third file representing the +quality scores of the reads is created:: + + >1 reference=103712_Fusobacterium_nucleatum position=2-1001 strand=+ + 30 30 30 10 30 30 ... + + + + + + diff -r b35ec780aac1 -r 7d26d64539b2 tools/ngs_simulation/grinder_multiple_outputs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ngs_simulation/grinder_multiple_outputs.py Mon Sep 19 01:07:28 2011 -0400 @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +""" +Move files create by Grinder to a location where it is going to be recognized by +Galaxy as multiple output files with the right format. See +http://wiki.g2.bx.psu.edu/Admin/Tools/Multiple Output Files +Example: python grinder_move_outputs output_dir output_id +Author: Florent Angly +""" + +import sys, os, re + +assert sys.version_info[:2] >= ( 2, 4 ) + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def __main__(): + # Get output dir and ID + args = sys.argv + output_dir = args[1] + output_id = args[2] + + # Move Grinder files to the proper output + # Grinder filenames look like this + # grinder-ranks.txt + # grinder-reads.fa + # grinder-reads.qual + # grinder-1-ranks.txt + # grinder-1-reads.fa + # grinder-1-reads.qual + # grinder-2-ranks.txt + # grinder-2-reads.fa + # grinder-2-reads.qual + + p = re.compile(output_id) + q = re.compile('-(\d+)-') + r = re.compile('-(\w+)$') + + + for fname in os.listdir(output_dir): + + # Skip files that do not start with the output_id + source = os.path.join( output_dir, fname ) + basename, extension = os.path.splitext(fname) + if not p.match(fname): + continue + + # Assign the dataset format + if extension == '.txt': + format = 'text' + elif extension == '.fa': + format = 'fasta' + elif extension == '.fna': + format = 'fasta' + elif extension == '.faa': + format = 'fasta' + elif extension == '.fasta': + format = 'fasta' + elif extension == '.fq': + format = 'fastq' + elif extension == '.fastq': + format = 'fastq' + elif extension == '.qual': + format = 'qual' + else: + stop_err( 'Error: File %s had the unknown extension %s' % ( fname, extension ) ) + + # Assign the dataset name + name = '' + match = q.search(basename) + if match != None: + lib_num = match.group(1) + name = 'lib%s' % lib_num + + match = r.search(basename) + if match == None: + stop_err( 'Error: File with basename %s did not have a recognized name' % (basename) ) + + lib_type = match.group(1) + if format == 'qual': + lib_type = 'qual' + + name = name + '-' + lib_type + + # Move the dataset to the proper place + optional_spec = 'asdf' + destination = os.path.join( output_dir, 'primary_%s_%s_visible_%s_%s' % ( output_id, name, format, optional_spec ) ) + + print "moving %s to %s" % (source, destination) + + try: + os.rename(source, destination) + except Exception, e: + stop_err( 'Error: ' + str( e ) ) + +if __name__ == "__main__": __main__() diff -r b35ec780aac1 -r 7d26d64539b2 tools/ngs_simulation/stderr_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ngs_simulation/stderr_wrapper.py Mon Sep 19 01:07:28 2011 -0400 @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +""" +Wrapper that executes a program with its arguments but reports standard error +messages only if the program exit status was not 0. This is useful to prevent +Galaxy to interpret that there was an error if something was printed on stderr, +e.g. if this was simply a warning. +Example: ./stderr_wrapper.py myprog arg1 -f arg2 +Author: Florent Angly +""" + +import sys, subprocess + +assert sys.version_info[:2] >= ( 2, 4 ) + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def __main__(): + # Get command-line arguments + args = sys.argv + # Remove name of calling program, i.e. ./stderr_wrapper.py + args.pop(0) + # If there are no arguments left, we're done + if len(args) == 0: + return + + # If one needs to silence stdout + #args.append( ">" ) + #args.append( "/dev/null" ) + + #cmdline = " ".join(args) + #print cmdline + try: + # Run program + proc = subprocess.Popen( args=args, shell=False, stderr=subprocess.PIPE ) + returncode = proc.wait() + # Capture stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + # Running Grinder failed: write error message to stderr + if returncode != 0: + raise Exception, stderr + except Exception, e: + # Running Grinder failed: write error message to stderr + stop_err( 'Error: ' + str( e ) ) + + +if __name__ == "__main__": __main__() diff -r b35ec780aac1 -r 7d26d64539b2 tools/stderr_wrapper.py --- a/tools/stderr_wrapper.py Mon Sep 19 01:01:58 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -#!/usr/bin/env python - -""" -Wrapper that executes a program with its arguments but reports standard error -messages only if the program exit status was not 0. This is useful to prevent -Galaxy to interpret that there was an error if something was printed on stderr, -e.g. if this was simply a warning. -Example: ./stderr_wrapper.py myprog arg1 -f arg2 -Author: Florent Angly -""" - -import sys, subprocess - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - # Get command-line arguments - args = sys.argv - # Remove name of calling program, i.e. ./stderr_wrapper.py - args.pop(0) - # If there are no arguments left, we're done - if len(args) == 0: - return - - # If one needs to silence stdout - #args.append( ">" ) - #args.append( "/dev/null" ) - - #cmdline = " ".join(args) - #print cmdline - try: - # Run program - proc = subprocess.Popen( args=args, shell=False, stderr=subprocess.PIPE ) - returncode = proc.wait() - # Capture stderr, allowing for case where it's very large - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += proc.stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - # Running Grinder failed: write error message to stderr - if returncode != 0: - raise Exception, stderr - except Exception, e: - # Running Grinder failed: write error message to stderr - stop_err( 'Error: ' + str( e ) ) - - -if __name__ == "__main__": __main__()