Repository 'sharplabtool'
hg clone https://toolshed.g2.bx.psu.edu/repos/xuebing/sharplabtool

Changeset 0:9071e359b9a3 (2012-03-09)
Next changeset 1:cdcb0ce84a1b (2012-03-09)
Commit message:
Uploaded
added:
tools/.DS_Store
tools/._.DS_Store
tools/._mytools
tools/._tool_conf.xml
tools/annotation_profiler/annotation_profiler.xml
tools/annotation_profiler/annotation_profiler_for_interval.py
tools/bedtools/._bedToBam.xml
tools/bedtools/bedToBam.xml
tools/data_destination/epigraph.xml
tools/data_destination/epigraph_test.xml
tools/data_source/access_libraries.xml
tools/data_source/bed_convert.xml
tools/data_source/biomart.xml
tools/data_source/biomart_test.xml
tools/data_source/bx_browser.xml
tools/data_source/cbi_rice_mart.xml
tools/data_source/data_source.py
tools/data_source/echo.py
tools/data_source/echo.xml
tools/data_source/encode_db.xml
tools/data_source/epigraph_import.xml
tools/data_source/epigraph_import_test.xml
tools/data_source/eupathdb.xml
tools/data_source/fetch.py
tools/data_source/fly_modencode.xml
tools/data_source/flymine.xml
tools/data_source/flymine_test.xml
tools/data_source/genbank.py
tools/data_source/genbank.xml
tools/data_source/gramene_mart.xml
tools/data_source/hapmapmart.xml
tools/data_source/hbvar.xml
tools/data_source/hbvar_filter.py
tools/data_source/import.py
tools/data_source/import.xml
tools/data_source/metabolicmine.xml
tools/data_source/microbial_import.py
tools/data_source/microbial_import.xml
tools/data_source/microbial_import_code.py
tools/data_source/modmine.xml
tools/data_source/ratmine.xml
tools/data_source/ucsc_archaea.xml
tools/data_source/ucsc_filter.py
tools/data_source/ucsc_proxy.py
tools/data_source/ucsc_proxy.xml
tools/data_source/ucsc_tablebrowser.xml
tools/data_source/ucsc_tablebrowser_archaea.xml
tools/data_source/ucsc_tablebrowser_test.xml
tools/data_source/ucsc_testproxy.xml
tools/data_source/upload.py
tools/data_source/upload.xml
tools/data_source/worm_modencode.xml
tools/data_source/wormbase.xml
tools/data_source/wormbase_test.xml
tools/data_source/yeastmine.xml
tools/discreteWavelet/execute_dwt_IvC_all.pl
tools/discreteWavelet/execute_dwt_IvC_all.xml
tools/discreteWavelet/execute_dwt_cor_aVa_perClass.pl
tools/discreteWavelet/execute_dwt_cor_aVa_perClass.xml
tools/discreteWavelet/execute_dwt_cor_aVb_all.pl
tools/discreteWavelet/execute_dwt_cor_aVb_all.xml
tools/discreteWavelet/execute_dwt_var_perClass.pl
tools/discreteWavelet/execute_dwt_var_perClass.xml
tools/discreteWavelet/execute_dwt_var_perFeature.pl
tools/discreteWavelet/execute_dwt_var_perFeature.xml
tools/emboss_5/emboss_antigenic.xml
tools/emboss_5/emboss_backtranseq.xml
tools/emboss_5/emboss_banana.pl
tools/emboss_5/emboss_banana.xml
tools/emboss_5/emboss_biosed.xml
tools/emboss_5/emboss_btwisted.xml
tools/emboss_5/emboss_cai.xml
tools/emboss_5/emboss_cai_custom.xml
tools/emboss_5/emboss_chaos.xml
tools/emboss_5/emboss_charge.xml
tools/emboss_5/emboss_checktrans.xml
tools/emboss_5/emboss_chips.xml
tools/emboss_5/emboss_cirdna.xml
tools/emboss_5/emboss_codcmp.xml
tools/emboss_5/emboss_coderet.xml
tools/emboss_5/emboss_compseq.xml
tools/emboss_5/emboss_cpgplot.xml
tools/emboss_5/emboss_cpgplot_wrapper.pl
tools/emboss_5/emboss_cpgreport.xml
tools/emboss_5/emboss_cusp.xml
tools/emboss_5/emboss_cutseq.xml
tools/emboss_5/emboss_dan.xml
tools/emboss_5/emboss_degapseq.xml
tools/emboss_5/emboss_descseq.xml
tools/emboss_5/emboss_diffseq.xml
tools/emboss_5/emboss_digest.xml
tools/emboss_5/emboss_dotmatcher.xml
tools/emboss_5/emboss_dotpath.xml
tools/emboss_5/emboss_dottup.xml
tools/emboss_5/emboss_dreg.xml
tools/emboss_5/emboss_einverted.xml
tools/emboss_5/emboss_epestfind.xml
tools/emboss_5/emboss_equicktandem.xml
tools/emboss_5/emboss_est2genome.xml
tools/emboss_5/emboss_etandem.xml
tools/emboss_5/emboss_extractfeat.xml
tools/emboss_5/emboss_extractseq.xml
tools/emboss_5/emboss_format_corrector.py
tools/emboss_5/emboss_freak.xml
tools/emboss_5/emboss_fuzznuc.xml
tools/emboss_5/emboss_fuzzpro.xml
tools/emboss_5/emboss_fuzztran.xml
tools/emboss_5/emboss_garnier.xml
tools/emboss_5/emboss_geecee.xml
tools/emboss_5/emboss_getorf.xml
tools/emboss_5/emboss_helixturnhelix.xml
tools/emboss_5/emboss_hmoment.xml
tools/emboss_5/emboss_iep.xml
tools/emboss_5/emboss_infoseq.xml
tools/emboss_5/emboss_infoseq_wrapper.pl
tools/emboss_5/emboss_isochore.xml
tools/emboss_5/emboss_lindna.xml
tools/emboss_5/emboss_marscan.xml
tools/emboss_5/emboss_maskfeat.xml
tools/emboss_5/emboss_maskseq.xml
tools/emboss_5/emboss_matcher.xml
tools/emboss_5/emboss_megamerger.xml
tools/emboss_5/emboss_merger.xml
tools/emboss_5/emboss_msbar.xml
tools/emboss_5/emboss_multiple_outputfile_wrapper.pl
tools/emboss_5/emboss_needle.xml
tools/emboss_5/emboss_newcpgreport.xml
tools/emboss_5/emboss_newcpgseek.xml
tools/emboss_5/emboss_newseq.xml
tools/emboss_5/emboss_noreturn.xml
tools/emboss_5/emboss_notseq.xml
tools/emboss_5/emboss_nthseq.xml
tools/emboss_5/emboss_octanol.xml
tools/emboss_5/emboss_oddcomp.xml
tools/emboss_5/emboss_palindrome.xml
tools/emboss_5/emboss_pasteseq.xml
tools/emboss_5/emboss_patmatdb.xml
tools/emboss_5/emboss_pepcoil.xml
tools/emboss_5/emboss_pepinfo.xml
tools/emboss_5/emboss_pepnet.xml
tools/emboss_5/emboss_pepstats.xml
tools/emboss_5/emboss_pepwheel.xml
tools/emboss_5/emboss_pepwindow.xml
tools/emboss_5/emboss_pepwindowall.xml
tools/emboss_5/emboss_plotcon.xml
tools/emboss_5/emboss_plotorf.xml
tools/emboss_5/emboss_polydot.xml
tools/emboss_5/emboss_preg.xml
tools/emboss_5/emboss_prettyplot.xml
tools/emboss_5/emboss_prettyseq.xml
tools/emboss_5/emboss_primersearch.xml
tools/emboss_5/emboss_revseq.xml
tools/emboss_5/emboss_seqmatchall.xml
tools/emboss_5/emboss_seqret.xml
tools/emboss_5/emboss_showfeat.xml
tools/emboss_5/emboss_shuffleseq.xml
tools/emboss_5/emboss_sigcleave.xml
tools/emboss_5/emboss_single_outputfile_wrapper.pl
tools/emboss_5/emboss_sirna.xml
tools/emboss_5/emboss_sixpack.xml
tools/emboss_5/emboss_skipseq.xml
tools/emboss_5/emboss_splitter.xml
tools/emboss_5/emboss_supermatcher.xml
tools/emboss_5/emboss_syco.xml
tools/emboss_5/emboss_tcode.xml
tools/emboss_5/emboss_textsearch.xml
tools/emboss_5/emboss_tmap.xml
tools/emboss_5/emboss_tranalign.xml
tools/emboss_5/emboss_transeq.xml
tools/emboss_5/emboss_trimest.xml
tools/emboss_5/emboss_trimseq.xml
tools/emboss_5/emboss_twofeat.xml
tools/emboss_5/emboss_union.xml
tools/emboss_5/emboss_vectorstrip.xml
tools/emboss_5/emboss_water.xml
tools/emboss_5/emboss_wobble.xml
tools/emboss_5/emboss_wordcount.xml
tools/emboss_5/emboss_wordmatch.xml
tools/encode/gencode_partition.xml
tools/encode/random_intervals.xml
tools/encode/random_intervals_no_bits.py
tools/encode/split_by_partitions.py
tools/evolution/add_scores.xml
tools/evolution/codingSnps.pl
tools/evolution/codingSnps.xml
tools/evolution/codingSnps_filter.py
tools/evolution/mutate_snp_codon.py
tools/evolution/mutate_snp_codon.xml
tools/extract/extract_genomic_dna.py
tools/extract/extract_genomic_dna.xml
tools/extract/liftOver_wrapper.py
tools/extract/liftOver_wrapper.xml
tools/extract/phastOdds/get_scores_galaxy.py
tools/extract/phastOdds/phastOdds_tool.xml
tools/fasta_tools/fasta_compute_length.py
tools/fasta_tools/fasta_compute_length.xml
tools/fasta_tools/fasta_concatenate_by_species.py
tools/fasta_tools/fasta_concatenate_by_species.xml
tools/fasta_tools/fasta_filter_by_length.py
tools/fasta_tools/fasta_filter_by_length.xml
tools/fasta_tools/fasta_to_tabular.py
tools/fasta_tools/fasta_to_tabular.xml
tools/fasta_tools/tabular_to_fasta.py
tools/fasta_tools/tabular_to_fasta.xml
tools/fastq/fastq_combiner.py
tools/fastq/fastq_combiner.xml
tools/fastq/fastq_filter.py
tools/fastq/fastq_filter.xml
tools/fastq/fastq_groomer.py
tools/fastq/fastq_groomer.xml
tools/fastq/fastq_manipulation.py
tools/fastq/fastq_manipulation.xml
tools/fastq/fastq_masker_by_quality.py
tools/fastq/fastq_masker_by_quality.xml
tools/fastq/fastq_paired_end_deinterlacer.py
tools/fastq/fastq_paired_end_deinterlacer.xml
tools/fastq/fastq_paired_end_interlacer.py
tools/fastq/fastq_paired_end_interlacer.xml
tools/fastq/fastq_paired_end_joiner.py
tools/fastq/fastq_paired_end_joiner.xml
tools/fastq/fastq_paired_end_splitter.py
tools/fastq/fastq_paired_end_splitter.xml
tools/fastq/fastq_stats.py
tools/fastq/fastq_stats.xml
tools/fastq/fastq_to_fasta.py
tools/fastq/fastq_to_fasta.xml
tools/fastq/fastq_to_tabular.py
tools/fastq/fastq_to_tabular.xml
tools/fastq/fastq_trimmer.py
tools/fastq/fastq_trimmer.xml
tools/fastq/fastq_trimmer_by_quality.py
tools/fastq/fastq_trimmer_by_quality.xml
tools/fastq/tabular_to_fastq.py
tools/fastq/tabular_to_fastq.xml
tools/fastx_toolkit/fasta_clipping_histogram.xml
tools/fastx_toolkit/fasta_formatter.xml
tools/fastx_toolkit/fasta_nucleotide_changer.xml
tools/fastx_toolkit/fastq_quality_boxplot.xml
tools/fastx_toolkit/fastq_quality_converter.xml
tools/fastx_toolkit/fastq_quality_filter.xml
tools/fastx_toolkit/fastq_to_fasta.xml
tools/fastx_toolkit/fastx_artifacts_filter.xml
tools/fastx_toolkit/fastx_barcode_splitter.xml
tools/fastx_toolkit/fastx_barcode_splitter_galaxy_wrapper.sh
tools/fastx_toolkit/fastx_clipper.xml
tools/fastx_toolkit/fastx_collapser.xml
tools/fastx_toolkit/fastx_nucleotides_distribution.xml
tools/fastx_toolkit/fastx_quality_statistics.xml
tools/fastx_toolkit/fastx_renamer.xml
tools/fastx_toolkit/fastx_reverse_complement.xml
tools/fastx_toolkit/fastx_trimmer.xml
tools/filters/CreateInterval.pl
tools/filters/CreateInterval.xml
tools/filters/axt_to_concat_fasta.py
tools/filters/axt_to_concat_fasta.xml
tools/filters/axt_to_fasta.py
tools/filters/axt_to_fasta.xml
tools/filters/axt_to_lav.py
tools/filters/axt_to_lav.xml
tools/filters/axt_to_lav_code.py
tools/filters/bed2gff.xml
tools/filters/bed_to_bigbed.xml
tools/filters/bed_to_gff_converter.py
tools/filters/catWrapper.py
tools/filters/catWrapper.xml
tools/filters/changeCase.pl
tools/filters/changeCase.xml
tools/filters/commWrapper.pl
tools/filters/commWrapper.xml
tools/filters/compare.xml
tools/filters/condense_characters.pl
tools/filters/condense_characters.xml
tools/filters/convert_characters.pl
tools/filters/convert_characters.py
tools/filters/convert_characters.xml
tools/filters/cutWrapper.pl
tools/filters/cutWrapper.xml
tools/filters/fileGrep.xml
tools/filters/fixedValueColumn.pl
tools/filters/fixedValueColumn.xml
tools/filters/gff/extract_GFF_Features.py
tools/filters/gff/extract_GFF_Features.xml
tools/filters/gff/gff_filter_by_attribute.py
tools/filters/gff/gff_filter_by_attribute.xml
tools/filters/gff/gff_filter_by_feature_count.py
tools/filters/gff/gff_filter_by_feature_count.xml
tools/filters/gff/gtf_filter_by_attribute_values_list.py
tools/filters/gff/gtf_filter_by_attribute_values_list.xml
tools/filters/gff2bed.xml
tools/filters/gff_to_bed_converter.py
tools/filters/grep.py
tools/filters/grep.xml
tools/filters/gtf2bedgraph.xml
tools/filters/gtf_to_bedgraph_converter.py
tools/filters/headWrapper.pl
tools/filters/headWrapper.xml
tools/filters/join.py
tools/filters/joinWrapper.pl
tools/filters/joinWrapper.py
tools/filters/joiner.xml
tools/filters/joiner2.xml
tools/filters/lav_to_bed.py
tools/filters/lav_to_bed.xml
tools/filters/lav_to_bed_code.py
tools/filters/mergeCols.py
tools/filters/mergeCols.xml
tools/filters/pasteWrapper.pl
tools/filters/pasteWrapper.xml
tools/filters/randomlines.py
tools/filters/randomlines.xml
tools/filters/remove_beginning.pl
tools/filters/remove_beginning.xml
tools/filters/sff_extract.py
tools/filters/sff_extractor.xml
tools/filters/sorter.py
tools/filters/sorter.xml
tools/filters/tailWrapper.pl
tools/filters/tailWrapper.xml
tools/filters/trimmer.py
tools/filters/trimmer.xml
tools/filters/ucsc_gene_bed_to_exon_bed.py
tools/filters/ucsc_gene_bed_to_exon_bed.xml
tools/filters/ucsc_gene_bed_to_intron_bed.py
tools/filters/ucsc_gene_bed_to_intron_bed.xml
tools/filters/ucsc_gene_table_to_intervals.py
tools/filters/ucsc_gene_table_to_intervals.xml
tools/filters/uniq.py
tools/filters/uniq.xml
tools/filters/wc_gnu.xml
tools/filters/wig_to_bigwig.xml
tools/filters/wiggle_to_simple.py
tools/filters/wiggle_to_simple.xml
tools/galaxy-loc.tar.gz
tools/gatk/analyze_covariates.xml
tools/gatk/count_covariates.xml
tools/gatk/gatk_wrapper.py
tools/gatk/indel_realigner.xml
tools/gatk/realigner_target_creator.xml
tools/gatk/table_recalibration.xml
tools/gatk/unified_genotyper.xml
tools/genetrack/genetrack_indexer.py
tools/genetrack/genetrack_indexer.xml
tools/genetrack/genetrack_peak_prediction.py
tools/genetrack/genetrack_peak_prediction.xml
tools/genome_diversity/cdblib.py
tools/genome_diversity/extract_flanking_dna.py
tools/genome_diversity/extract_flanking_dna.xml
tools/genome_diversity/extract_primers.py
tools/genome_diversity/extract_primers.xml
tools/genome_diversity/genome_diversity.py
tools/genome_diversity/select_restriction_enzymes.py
tools/genome_diversity/select_restriction_enzymes.xml
tools/genome_diversity/select_snps.py
tools/genome_diversity/select_snps.xml
tools/human_genome_variation/BEAM2_wrapper.sh
tools/human_genome_variation/beam.xml
tools/human_genome_variation/ctd.pl
tools/human_genome_variation/ctd.xml
tools/human_genome_variation/disease_ontology_gene_fuzzy_selector.pl
tools/human_genome_variation/freebayes.xml
tools/human_genome_variation/funDo.xml
tools/human_genome_variation/gpass.pl
tools/human_genome_variation/gpass.xml
tools/human_genome_variation/hilbertvis.sh
tools/human_genome_variation/hilbertvis.xml
tools/human_genome_variation/ldtools.xml
tools/human_genome_variation/ldtools_wrapper.sh
tools/human_genome_variation/linkToDavid.pl
tools/human_genome_variation/linkToDavid.xml
tools/human_genome_variation/linkToGProfile.pl
tools/human_genome_variation/linkToGProfile.xml
tools/human_genome_variation/lped_to_geno.pl
tools/human_genome_variation/lps.xml
tools/human_genome_variation/lps_tool_wrapper.sh
tools/human_genome_variation/mergeSnps.pl
tools/human_genome_variation/pagetag.py
tools/human_genome_variation/pass.xml
tools/human_genome_variation/pass_wrapper.sh
tools/human_genome_variation/senatag.py
tools/human_genome_variation/sift.xml
tools/human_genome_variation/sift_variants_wrapper.sh
tools/human_genome_variation/snpFreq.xml
tools/human_genome_variation/snpFreq2.pl
tools/hyphy/hyphy_branch_lengths_wrapper.py
tools/hyphy/hyphy_branch_lengths_wrapper.xml
tools/hyphy/hyphy_dnds_wrapper.py
tools/hyphy/hyphy_dnds_wrapper.xml
tools/hyphy/hyphy_nj_tree_wrapper.py
tools/hyphy/hyphy_nj_tree_wrapper.xml
tools/ilmn_pacbio/abyss.xml
tools/ilmn_pacbio/assembly_stats.py
tools/ilmn_pacbio/assembly_stats.xml
tools/ilmn_pacbio/cov_model.py
tools/ilmn_pacbio/quake.py
tools/ilmn_pacbio/quake.xml
tools/ilmn_pacbio/quake_pe.xml
tools/ilmn_pacbio/quake_wrapper.py
tools/ilmn_pacbio/smrtpipe.py
tools/ilmn_pacbio/smrtpipe_filter.xml
tools/ilmn_pacbio/smrtpipe_galaxy.py
tools/ilmn_pacbio/smrtpipe_hybrid.xml
tools/ilmn_pacbio/soap_denovo.xml
tools/indels/indel_analysis.py
tools/indels/indel_analysis.xml
tools/indels/indel_sam2interval.py
tools/indels/indel_sam2interval.xml
tools/indels/indel_table.py
tools/indels/indel_table.xml
tools/indels/sam_indel_filter.py
tools/indels/sam_indel_filter.xml
tools/maf/genebed_maf_to_fasta.xml
tools/maf/interval2maf.py
tools/maf/interval2maf.xml
tools/maf/interval2maf_pairwise.xml
tools/maf/interval_maf_to_merged_fasta.py
tools/maf/interval_maf_to_merged_fasta.xml
tools/maf/maf_by_block_number.py
tools/maf/maf_by_block_number.xml
tools/maf/maf_filter.py
tools/maf/maf_filter.xml
tools/maf/maf_limit_size.py
tools/maf/maf_limit_size.xml
tools/maf/maf_limit_to_species.py
tools/maf/maf_limit_to_species.xml
tools/maf/maf_reverse_complement.py
tools/maf/maf_reverse_complement.xml
tools/maf/maf_split_by_species.py
tools/maf/maf_split_by_species.xml
tools/maf/maf_stats.py
tools/maf/maf_stats.xml
tools/maf/maf_thread_for_species.py
tools/maf/maf_thread_for_species.xml
tools/maf/maf_to_bed.py
tools/maf/maf_to_bed.xml
tools/maf/maf_to_bed_code.py
tools/maf/maf_to_fasta.xml
tools/maf/maf_to_fasta_concat.py
tools/maf/maf_to_fasta_multiple_sets.py
tools/maf/maf_to_interval.py
tools/maf/maf_to_interval.xml
tools/maf/vcf_to_maf_customtrack.py
tools/maf/vcf_to_maf_customtrack.xml
tools/meme/._meme.xml
tools/meme/fimo.xml
tools/meme/fimo_wrapper.py
tools/meme/meme.xml
tools/metag_tools/blat_coverage_report.py
tools/metag_tools/blat_coverage_report.xml
tools/metag_tools/blat_mapping.py
tools/metag_tools/blat_mapping.xml
tools/metag_tools/blat_wrapper.py
tools/metag_tools/blat_wrapper.xml
tools/metag_tools/convert_SOLiD_color2nuc.py
tools/metag_tools/convert_SOLiD_color2nuc.xml
tools/metag_tools/fastqsolexa_to_fasta_qual.py
tools/metag_tools/fastqsolexa_to_fasta_qual.xml
tools/metag_tools/mapping_to_ucsc.py
tools/metag_tools/mapping_to_ucsc.xml
tools/metag_tools/megablast_wrapper.py
tools/metag_tools/megablast_wrapper.xml
tools/metag_tools/megablast_xml_parser.py
tools/metag_tools/megablast_xml_parser.xml
tools/metag_tools/rmap_wrapper.py
tools/metag_tools/rmap_wrapper.xml
tools/metag_tools/rmapq_wrapper.py
tools/metag_tools/rmapq_wrapper.xml
tools/metag_tools/short_reads_figure_high_quality_length.py
tools/metag_tools/short_reads_figure_high_quality_length.xml
tools/metag_tools/short_reads_figure_score.py
tools/metag_tools/short_reads_figure_score.xml
tools/metag_tools/short_reads_trim_seq.py
tools/metag_tools/short_reads_trim_seq.xml
tools/metag_tools/shrimp_color_wrapper.py
tools/metag_tools/shrimp_color_wrapper.xml
tools/metag_tools/shrimp_wrapper.py
tools/metag_tools/shrimp_wrapper.xml
tools/metag_tools/split_paired_reads.py
tools/metag_tools/split_paired_reads.xml
tools/multivariate_stats/cca.py
tools/multivariate_stats/cca.xml
tools/multivariate_stats/kcca.py
tools/multivariate_stats/kcca.xml
tools/multivariate_stats/kpca.py
tools/multivariate_stats/kpca.xml
tools/multivariate_stats/pca.py
tools/multivariate_stats/pca.xml
tools/mutation/visualize.py
tools/mutation/visualize.xml
tools/mytools/.DS_Store
tools/mytools/._.DS_Store
tools/mytools/._StartGenometriCorr.xml
tools/mytools/._Start_GenometriCorr.R
tools/mytools/._align2database.py
tools/mytools/._align2database.xml
tools/mytools/._align2multiple.xml
tools/mytools/._alignr.py
tools/mytools/._alignr.xml
tools/mytools/._alignvis.xml
tools/mytools/._altschulEriksonDinuclShuffle.py
tools/mytools/._bed_to_bam.xml
tools/mytools/._bedclean.xml
tools/mytools/._bedsort.xml
tools/mytools/._bigWigAverageOverBed.xml
tools/mytools/._binaverage.xml
tools/mytools/._bowtie2bed.pl
tools/mytools/._bowtie2bed.xml
tools/mytools/._bwBinavg.xml
tools/mytools/._cdf.r
tools/mytools/._cdf.xml
tools/mytools/._closestBed.xml
tools/mytools/._collapseBed.py
tools/mytools/._collapseBed.xml
tools/mytools/._collapseTab.xml
tools/mytools/._convertEnsembl.xml
tools/mytools/._dreme.xml
tools/mytools/._endbias.xml
tools/mytools/._fastamarkov.xml
tools/mytools/._fastashuffle1.xml
tools/mytools/._fastashuffle2.xml
tools/mytools/._fastqdump.xml
tools/mytools/._fimo2-old.xml
tools/mytools/._fimo2.xml
tools/mytools/._fimo2bed.py
tools/mytools/._fimo2bed.xml
tools/mytools/._genomeView.xml
tools/mytools/._genomeview-old2.r
tools/mytools/._genomeview.r
tools/mytools/._genomeview_notused
tools/mytools/._headtail.xml
tools/mytools/._intersectSig.xml
tools/mytools/._intersectbed.xml
tools/mytools/._intervalSize.xml
tools/mytools/._iupac2meme.xml
tools/mytools/._makebigwig.sh
tools/mytools/._makebigwig.sh-old
tools/mytools/._makebigwig.xml
tools/mytools/._makewindow.xml
tools/mytools/._meme.xml
tools/mytools/._memelogo.xml
tools/mytools/._metaintv.xml
tools/mytools/._metaintv_ext.xml
tools/mytools/._phastCons.xml
tools/mytools/._plotmatrix.xml
tools/mytools/._r_wrapper.sh
tools/mytools/._r_wrapper_old.sh
tools/mytools/._random_interval.py
tools/mytools/._random_interval.xml
tools/mytools/._removeDuplicate.xml
tools/mytools/._resize.xml
tools/mytools/._revcompl.py
tools/mytools/._revcompl.xml
tools/mytools/._sampline.py
tools/mytools/._seq2meme.py
tools/mytools/._seq2meme.xml
tools/mytools/._seqshuffle.py
tools/mytools/._shuffleBed.py
tools/mytools/._shuffleBed.xml
tools/mytools/._shuffleSequenceUsingAltschulErikson.txt
tools/mytools/._spatial_proximity.xml
tools/mytools/._splicesite.xml
tools/mytools/._splicesitescore
tools/mytools/._stats.txt
tools/mytools/._venn.xml
tools/mytools/.sorted.bed
tools/mytools/AATAAA.motif
tools/mytools/StartGenometriCorr.xml
tools/mytools/Start_GenometriCorr.R
tools/mytools/align2database.py
tools/mytools/align2database.xml
tools/mytools/align2multiple.xml
tools/mytools/alignr.py
tools/mytools/alignr.xml
tools/mytools/alignvis.py
tools/mytools/alignvis.r
tools/mytools/alignvis.xml
tools/mytools/altschulEriksonDinuclShuffle.py
tools/mytools/bedClean.py
tools/mytools/bed_to_bam.xml
tools/mytools/bedclean.xml
tools/mytools/bedsort.xml
tools/mytools/bigWigAverageOverBed.xml
tools/mytools/binaverage.xml
tools/mytools/binnedAverage.py
tools/mytools/bowtie2bed.pl
tools/mytools/bowtie2bed.xml
tools/mytools/bwBinavg.xml
tools/mytools/cdf-old-not-used/._cdf.xml
tools/mytools/cdf-old-not-used/._cdf2-old.xml
tools/mytools/cdf-old-not-used/cdf.py
tools/mytools/cdf-old-not-used/cdf.xml
tools/mytools/cdf-old-not-used/cdf2-old.xml
tools/mytools/cdf-old-not-used/cdf2.py
tools/mytools/cdf.r
tools/mytools/cdf.xml
tools/mytools/closestBed.py
tools/mytools/closestBed.xml
tools/mytools/collapseBed.py
tools/mytools/collapseBed.xml
tools/mytools/collapseBed2.py
tools/mytools/collapseTab.py
tools/mytools/collapseTab.xml
tools/mytools/convertEnsembl.py
tools/mytools/convertEnsembl.xml
tools/mytools/dreme.xml
tools/mytools/dreme_out/dreme.html
tools/mytools/dreme_out/dreme.txt
tools/mytools/dreme_out/dreme.xml
tools/mytools/endbias.py
tools/mytools/endbias.xml
tools/mytools/fasta-dinucleotide-shuffle.py
tools/mytools/fastamarkov.xml
tools/mytools/fastashuffle1.xml
tools/mytools/fastashuffle2.xml
tools/mytools/fastqdump.xml
tools/mytools/fimo2-old.xml
tools/mytools/fimo2.xml
tools/mytools/fimo2bed.py
tools/mytools/fimo2bed.xml
tools/mytools/fimo_out/cisml.css
tools/mytools/fimo_out/cisml.xml
tools/mytools/fimo_out/fimo-to-html.xsl
tools/mytools/fimo_out/fimo.gff
tools/mytools/fimo_out/fimo.html
tools/mytools/fimo_out/fimo.txt
tools/mytools/fimo_out/fimo.wig
tools/mytools/fimo_out/fimo.xml
tools/mytools/genomeView.xml
tools/mytools/genomeview-old2.r
tools/mytools/genomeview.r
tools/mytools/genomeview_notused
tools/mytools/getGenomicScore.py
tools/mytools/headtail.xml
tools/mytools/intersectSig.py
tools/mytools/intersectSig.xml
tools/mytools/intersectbed.xml
tools/mytools/intervalOverlap.py
tools/mytools/intervalSize.py
tools/mytools/intervalSize.xml
tools/mytools/iupac2meme.xml
tools/mytools/makebigwig.sh
tools/mytools/makebigwig.sh-old
tools/mytools/makebigwig.xml
tools/mytools/makewindow.py
tools/mytools/makewindow.xml
tools/mytools/meme.xml
tools/mytools/memelogo.xml
tools/mytools/metaintv.py
tools/mytools/metaintv.xml
tools/mytools/metaintv2.py
tools/mytools/metaintv3.py
tools/mytools/metaintv_ext.py
tools/mytools/metaintv_ext.xml
tools/mytools/phastCons.xml
tools/mytools/plotmatrix.py
tools/mytools/plotmatrix.xml
tools/mytools/ptb-3t3
tools/mytools/ptb-ptb
tools/mytools/r_wrapper.sh
tools/mytools/r_wrapper_old.sh
tools/mytools/random_interval.py
tools/mytools/random_interval.xml
tools/mytools/removeDuplicate.xml
tools/mytools/resize.py
tools/mytools/resize.xml
tools/mytools/revcompl.py
tools/mytools/revcompl.xml
tools/mytools/sampline.py
tools/mytools/sampline.xml
tools/mytools/seq2meme.py
tools/mytools/seq2meme.xml
tools/mytools/seqshuffle.py
tools/mytools/sequence.py
tools/mytools/shuffleBed.py
tools/mytools/shuffleBed.xml
tools/mytools/shuffleSequenceUsingAltschulErikson.txt
tools/mytools/spatial_proximity.py
tools/mytools/spatial_proximity.xml
tools/mytools/splicesite.xml
tools/mytools/splicesitescore/._me2x5
tools/mytools/splicesitescore/._score3.pl
tools/mytools/splicesitescore/._score5.pl
tools/mytools/splicesitescore/._splicemodels
tools/mytools/splicesitescore/._test3
tools/mytools/splicesitescore/._test3.fa
tools/mytools/splicesitescore/._test5
tools/mytools/splicesitescore/._test5.fa
tools/mytools/splicesitescore/me2x5
tools/mytools/splicesitescore/score3.pl
tools/mytools/splicesitescore/score5.pl
tools/mytools/splicesitescore/splicemodels/._hashseq.m
tools/mytools/splicesitescore/splicemodels/._hashseq.m~
tools/mytools/splicesitescore/splicemodels/._me1s0acc1
tools/mytools/splicesitescore/splicemodels/._me1s0acc2
tools/mytools/splicesitescore/splicemodels/._me1s0acc3
tools/mytools/splicesitescore/splicemodels/._me1s0acc4
tools/mytools/splicesitescore/splicemodels/._me1s0acc5
tools/mytools/splicesitescore/splicemodels/._me1s0acc6
tools/mytools/splicesitescore/splicemodels/._me1s0acc7
tools/mytools/splicesitescore/splicemodels/._me1s0acc8
tools/mytools/splicesitescore/splicemodels/._me1s0acc9
tools/mytools/splicesitescore/splicemodels/._me2s0
tools/mytools/splicesitescore/splicemodels/._me2s0acc1
tools/mytools/splicesitescore/splicemodels/._me2s0acc2
tools/mytools/splicesitescore/splicemodels/._me2s0acc3
tools/mytools/splicesitescore/splicemodels/._me2s0acc4
tools/mytools/splicesitescore/splicemodels/._me2s0acc5
tools/mytools/splicesitescore/splicemodels/._me2s0acc6
tools/mytools/splicesitescore/splicemodels/._me2s0acc7
tools/mytools/splicesitescore/splicemodels/._me2s0acc8
tools/mytools/splicesitescore/splicemodels/._me2s0acc9
tools/mytools/splicesitescore/splicemodels/._me2x3acc1
tools/mytools/splicesitescore/splicemodels/._me2x3acc2
tools/mytools/splicesitescore/splicemodels/._me2x3acc3
tools/mytools/splicesitescore/splicemodels/._me2x3acc4
tools/mytools/splicesitescore/splicemodels/._me2x3acc5
tools/mytools/splicesitescore/splicemodels/._me2x3acc6
tools/mytools/splicesitescore/splicemodels/._me2x3acc7
tools/mytools/splicesitescore/splicemodels/._me2x3acc8
tools/mytools/splicesitescore/splicemodels/._me2x3acc9
tools/mytools/splicesitescore/splicemodels/._me2x5
tools/mytools/splicesitescore/splicemodels/._splice5sequences
tools/mytools/splicesitescore/splicemodels/hashseq.m
tools/mytools/splicesitescore/splicemodels/hashseq.m~
tools/mytools/splicesitescore/splicemodels/me1s0acc1
tools/mytools/splicesitescore/splicemodels/me1s0acc2
tools/mytools/splicesitescore/splicemodels/me1s0acc3
tools/mytools/splicesitescore/splicemodels/me1s0acc4
tools/mytools/splicesitescore/splicemodels/me1s0acc5
tools/mytools/splicesitescore/splicemodels/me1s0acc6
tools/mytools/splicesitescore/splicemodels/me1s0acc7
tools/mytools/splicesitescore/splicemodels/me1s0acc8
tools/mytools/splicesitescore/splicemodels/me1s0acc9
tools/mytools/splicesitescore/splicemodels/me2s0
tools/mytools/splicesitescore/splicemodels/me2s0acc1
tools/mytools/splicesitescore/splicemodels/me2s0acc2
tools/mytools/splicesitescore/splicemodels/me2s0acc3
tools/mytools/splicesitescore/splicemodels/me2s0acc4
tools/mytools/splicesitescore/splicemodels/me2s0acc5
tools/mytools/splicesitescore/splicemodels/me2s0acc6
tools/mytools/splicesitescore/splicemodels/me2s0acc7
tools/mytools/splicesitescore/splicemodels/me2s0acc8
tools/mytools/splicesitescore/splicemodels/me2s0acc9
tools/mytools/splicesitescore/splicemodels/me2x3acc1
tools/mytools/splicesitescore/splicemodels/me2x3acc2
tools/mytools/splicesitescore/splicemodels/me2x3acc3
tools/mytools/splicesitescore/splicemodels/me2x3acc4
tools/mytools/splicesitescore/splicemodels/me2x3acc5
tools/mytools/splicesitescore/splicemodels/me2x3acc6
tools/mytools/splicesitescore/splicemodels/me2x3acc7
tools/mytools/splicesitescore/splicemodels/me2x3acc8
tools/mytools/splicesitescore/splicemodels/me2x3acc9
tools/mytools/splicesitescore/splicemodels/me2x5
tools/mytools/splicesitescore/splicemodels/splice5sequences
tools/mytools/splicesitescore/test3
tools/mytools/splicesitescore/test3.fa
tools/mytools/splicesitescore/test5
tools/mytools/splicesitescore/test5.fa
tools/mytools/stats.txt
tools/mytools/venn.xml
tools/ncbi_blast_plus/blastxml_to_tabular.py
tools/ncbi_blast_plus/blastxml_to_tabular.xml
tools/ncbi_blast_plus/hide_stderr.py
tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml
tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml
tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml
tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml
tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml
tools/new_operations/basecoverage.xml
tools/new_operations/cluster.xml
tools/new_operations/column_join.py
tools/new_operations/column_join.xml
tools/new_operations/complement.xml
tools/new_operations/concat.xml
tools/new_operations/coverage.xml
tools/new_operations/flanking_features.py
tools/new_operations/flanking_features.xml
tools/new_operations/get_flanks.py
tools/new_operations/get_flanks.xml
tools/new_operations/gops_basecoverage.py
tools/new_operations/gops_cluster.py
tools/new_operations/gops_complement.py
tools/new_operations/gops_concat.py
tools/new_operations/gops_coverage.py
tools/new_operations/gops_intersect.py
tools/new_operations/gops_join.py
tools/new_operations/gops_merge.py
tools/new_operations/gops_subtract.py
tools/new_operations/intersect.xml
tools/new_operations/join.xml
tools/new_operations/merge.xml
tools/new_operations/operation_filter.py
tools/new_operations/subtract.xml
tools/new_operations/subtract_query.py
tools/new_operations/subtract_query.xml
tools/new_operations/tables_arithmetic_operations.pl
tools/new_operations/tables_arithmetic_operations.xml
tools/next_gen_conversion/bwa_solid2fastq_modified.pl
tools/next_gen_conversion/fastq_conversions.py
tools/next_gen_conversion/fastq_conversions.xml
tools/next_gen_conversion/fastq_gen_conv.py
tools/next_gen_conversion/fastq_gen_conv.xml
tools/next_gen_conversion/solid2fastq.py
tools/next_gen_conversion/solid2fastq.xml
tools/next_gen_conversion/solid_to_fastq.py
tools/next_gen_conversion/solid_to_fastq.xml
tools/ngs_rna/cuffcompare_wrapper.py
tools/ngs_rna/cuffcompare_wrapper.xml
tools/ngs_rna/cuffdiff_wrapper.py
tools/ngs_rna/cuffdiff_wrapper.xml
tools/ngs_rna/cufflinks_wrapper.py
tools/ngs_rna/cufflinks_wrapper.xml
tools/ngs_rna/filter_transcripts_via_tracking.py
tools/ngs_rna/filter_transcripts_via_tracking.xml
tools/ngs_rna/tophat_color_wrapper.xml
tools/ngs_rna/tophat_wrapper.py
tools/ngs_rna/tophat_wrapper.xml
tools/ngs_rna/trinity_all.xml
tools/ngs_simulation/ngs_simulation.py
tools/ngs_simulation/ngs_simulation.xml
tools/peak_calling/ccat_2_wrapper.xml
tools/peak_calling/ccat_wrapper.py
tools/peak_calling/ccat_wrapper.xml
tools/peak_calling/macs_wrapper.py
tools/peak_calling/macs_wrapper.xml
tools/peak_calling/sicer_wrapper.py
tools/peak_calling/sicer_wrapper.xml
tools/picard/picard_AddOrReplaceReadGroups.xml
tools/picard/picard_BamIndexStats.xml
tools/picard/picard_MarkDuplicates.xml
tools/picard/picard_ReorderSam.xml
tools/picard/picard_ReplaceSamHeader.xml
tools/picard/picard_wrapper.py
tools/picard/rgPicardASMetrics.xml
tools/picard/rgPicardFixMate.xml
tools/picard/rgPicardGCBiasMetrics.xml
tools/picard/rgPicardHsMetrics.xml
tools/picard/rgPicardInsertSize.xml
tools/picard/rgPicardLibComplexity.xml
tools/picard/rgPicardMarkDups.xml
tools/plotting/bar_chart.py
tools/plotting/bar_chart.xml
tools/plotting/boxplot.xml
tools/plotting/histogram.py
tools/plotting/histogram2.xml
tools/plotting/plot_filter.py
tools/plotting/plotter.py
tools/plotting/r_wrapper.sh
tools/plotting/scatterplot.py
tools/plotting/scatterplot.xml
tools/plotting/xy_plot.xml
tools/regVariation/best_regression_subsets.py
tools/regVariation/best_regression_subsets.xml
tools/regVariation/categorize_elements_satisfying_criteria.pl
tools/regVariation/categorize_elements_satisfying_criteria.xml
tools/regVariation/compute_motif_frequencies_for_all_motifs.pl
tools/regVariation/compute_motif_frequencies_for_all_motifs.xml
tools/regVariation/compute_motifs_frequency.pl
tools/regVariation/compute_motifs_frequency.xml
tools/regVariation/compute_q_values.pl
tools/regVariation/compute_q_values.xml
tools/regVariation/delete_overlapping_indels.pl
tools/regVariation/delete_overlapping_indels.xml
tools/regVariation/draw_stacked_barplots.pl
tools/regVariation/draw_stacked_barplots.xml
tools/regVariation/featureCounter.py
tools/regVariation/featureCounter.xml
tools/regVariation/getIndelRates_3way.py
tools/regVariation/getIndelRates_3way.xml
tools/regVariation/getIndels.py
tools/regVariation/getIndels_2way.xml
tools/regVariation/getIndels_3way.xml
tools/regVariation/linear_regression.py
tools/regVariation/linear_regression.xml
tools/regVariation/maf_cpg_filter.py
tools/regVariation/maf_cpg_filter.xml
tools/regVariation/microsatellite_birthdeath.pl
tools/regVariation/microsatellite_birthdeath.xml
tools/regVariation/microsats_alignment_level.py
tools/regVariation/microsats_alignment_level.xml
tools/regVariation/microsats_mutability.py
tools/regVariation/microsats_mutability.xml
tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl
tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml
tools/regVariation/parseMAF_smallIndels.pl
tools/regVariation/quality_filter.py
tools/regVariation/quality_filter.xml
tools/regVariation/qv_to_bqv.py
tools/regVariation/qv_to_bqv.xml
tools/regVariation/rcve.py
tools/regVariation/rcve.xml
tools/regVariation/substitution_rates.py
tools/regVariation/substitution_rates.xml
tools/regVariation/substitutions.py
tools/regVariation/substitutions.xml
tools/regVariation/t_test_two_samples.pl
tools/regVariation/t_test_two_samples.xml
tools/regVariation/windowSplitter.py
tools/regVariation/windowSplitter.xml
tools/rgenetics/listFiles.py
tools/rgenetics/plinkbinJZ.py
tools/rgenetics/plinkbinJZ.pyc
tools/rgenetics/rgCaCo.py
tools/rgenetics/rgCaCo.xml
tools/rgenetics/rgClean.py
tools/rgenetics/rgClean.xml
tools/rgenetics/rgClustalw.py
tools/rgenetics/rgClustalw.xml
tools/rgenetics/rgEigPCA.py
tools/rgenetics/rgEigPCA.xml
tools/rgenetics/rgFastQC.py
tools/rgenetics/rgFastQC.xml
tools/rgenetics/rgGLM.py
tools/rgenetics/rgGLM.xml
tools/rgenetics/rgGLM_code.py
tools/rgenetics/rgGRR.py
tools/rgenetics/rgGRR.xml
tools/rgenetics/rgGTOOL.py
tools/rgenetics/rgGTOOL.xml
tools/rgenetics/rgHaploView.py
tools/rgenetics/rgHaploView.xml
tools/rgenetics/rgLDIndep.py
tools/rgenetics/rgLDIndep.xml
tools/rgenetics/rgLDIndep_code.py
tools/rgenetics/rgManQQ.py
tools/rgenetics/rgManQQ.xml
tools/rgenetics/rgManQQ_code.py
tools/rgenetics/rgPedSub.py
tools/rgenetics/rgPedSub.xml
tools/rgenetics/rgQC.py
tools/rgenetics/rgQC.xml
tools/rgenetics/rgQQ.py
tools/rgenetics/rgQQ.xml
tools/rgenetics/rgQQ_code.py
tools/rgenetics/rgRegion.py
tools/rgenetics/rgRegion.xml
tools/rgenetics/rgTDT.py
tools/rgenetics/rgTDT.xml
tools/rgenetics/rgWebLogo3.py
tools/rgenetics/rgWebLogo3.xml
tools/rgenetics/rgfakePed.py
tools/rgenetics/rgfakePed.xml
tools/rgenetics/rgfakePhe.py
tools/rgenetics/rgfakePhe.xml
tools/rgenetics/rgtest.sh
tools/rgenetics/rgtest_one_tool.sh
tools/rgenetics/rgutils.py
tools/rgenetics/rgutils.pyc
tools/rgenetics/test
tools/rgenetics/test.eps
tools/rgenetics/test.pdf
tools/rgenetics/test.png
tools/samtools/bam_to_sam.py
tools/samtools/bam_to_sam.xml
tools/samtools/pileup_interval.py
tools/samtools/pileup_interval.xml
tools/samtools/pileup_parser.pl
tools/samtools/pileup_parser.xml
tools/samtools/sam2interval.py
tools/samtools/sam2interval.xml
tools/samtools/sam_bitwise_flag_filter.py
tools/samtools/sam_bitwise_flag_filter.xml
tools/samtools/sam_merge.py
tools/samtools/sam_merge.xml
tools/samtools/sam_merge_code.py
tools/samtools/sam_pileup.py
tools/samtools/sam_pileup.xml
tools/samtools/sam_to_bam.py
tools/samtools/sam_to_bam.xml
tools/samtools/samtools_flagstat.xml
tools/solid_tools/maq_cs_wrapper.py
tools/solid_tools/maq_cs_wrapper.xml
tools/solid_tools/maq_cs_wrapper_code.py
tools/solid_tools/qualsolid_boxplot_graph.sh
tools/solid_tools/solid_qual_boxplot.xml
tools/solid_tools/solid_qual_stats.py
tools/solid_tools/solid_qual_stats.xml
tools/sr_assembly/velvetg.xml
tools/sr_assembly/velvetg_wrapper.py
tools/sr_assembly/velveth.xml
tools/sr_assembly/velveth_wrapper.py
tools/sr_mapping/PerM.xml
tools/sr_mapping/bfast_wrapper.py
tools/sr_mapping/bfast_wrapper.xml
tools/sr_mapping/bowtie_color_wrapper.xml
tools/sr_mapping/bowtie_wrapper.py
tools/sr_mapping/bowtie_wrapper.xml
tools/sr_mapping/bwa_color_wrapper.xml
tools/sr_mapping/bwa_wrapper.py
tools/sr_mapping/bwa_wrapper.xml
tools/sr_mapping/fastq_statistics.xml
tools/sr_mapping/lastz_paired_reads_wrapper.py
tools/sr_mapping/lastz_paired_reads_wrapper.xml
tools/sr_mapping/lastz_wrapper.py
tools/sr_mapping/lastz_wrapper.xml
tools/sr_mapping/mosaik.xml
tools/sr_mapping/srma_wrapper.py
tools/sr_mapping/srma_wrapper.xml
tools/stats/aggregate_binned_scores_in_intervals.xml
tools/stats/aggregate_scores_in_intervals.py
tools/stats/column_maker.py
tools/stats/column_maker.xml
tools/stats/cor.py
tools/stats/cor.xml
tools/stats/correlation.pl
tools/stats/correlation.xml
tools/stats/count_gff_features.py
tools/stats/count_gff_features.xml
tools/stats/dna_filtering.py
tools/stats/dna_filtering.xml
tools/stats/filtering.py
tools/stats/filtering.xml
tools/stats/generate_matrix_for_pca_lda.pl
tools/stats/generate_matrix_for_pca_lda.xml
tools/stats/grouping.py
tools/stats/grouping.xml
tools/stats/gsummary.py
tools/stats/gsummary.xml
tools/stats/gsummary.xml.groups
tools/stats/lda_analy.xml
tools/stats/plot_from_lda.xml
tools/stats/r_wrapper.sh
tools/stats/wiggle_to_simple.py
tools/stats/wiggle_to_simple.xml
tools/taxonomy/find_diag_hits.py
tools/taxonomy/find_diag_hits.xml
tools/taxonomy/gi2taxonomy.py
tools/taxonomy/gi2taxonomy.xml
tools/taxonomy/lca.py
tools/taxonomy/lca.xml
tools/taxonomy/poisson2test.py
tools/taxonomy/poisson2test.xml
tools/taxonomy/t2ps_wrapper.py
tools/taxonomy/t2ps_wrapper.xml
tools/taxonomy/t2t_report.xml
tools/tool_conf.xml
tools/unix_tools/._awk_tool.xml
tools/unix_tools/._awk_wrapper.sh
tools/unix_tools/._cut_tool.xml
tools/unix_tools/._cut_wrapper.sh
tools/unix_tools/._find_and_replace.pl
tools/unix_tools/._find_and_replace.xml
tools/unix_tools/._grep_tool.xml
tools/unix_tools/._grep_wrapper.sh
tools/unix_tools/._grep_wrapper_old.sh
tools/unix_tools/._join_tool.sh
tools/unix_tools/._join_tool.xml
tools/unix_tools/._remove_ending.sh
tools/unix_tools/._remove_ending.xml
tools/unix_tools/._sed_tool.xml
tools/unix_tools/._sed_wrapper.sh
tools/unix_tools/._sort_tool.xml
tools/unix_tools/._uniq_tool.xml
tools/unix_tools/._word_list_grep.pl
tools/unix_tools/._word_list_grep.xml
tools/unix_tools/awk_tool.xml
tools/unix_tools/awk_wrapper.sh
tools/unix_tools/cut_tool.xml
tools/unix_tools/cut_wrapper.sh
tools/unix_tools/find_and_replace.pl
tools/unix_tools/find_and_replace.xml
tools/unix_tools/grep_tool.xml
tools/unix_tools/grep_wrapper.sh
tools/unix_tools/grep_wrapper_old.sh
tools/unix_tools/join_tool.sh
tools/unix_tools/join_tool.xml
tools/unix_tools/remove_ending.sh
tools/unix_tools/remove_ending.xml
tools/unix_tools/sed_tool.xml
tools/unix_tools/sed_wrapper.sh
tools/unix_tools/sort_tool.xml
tools/unix_tools/uniq_tool.xml
tools/unix_tools/word_list_grep.pl
tools/unix_tools/word_list_grep.xml
tools/validation/fix_errors.py
tools/validation/fix_errors.xml
tools/validation/fix_errors_code.py
tools/validation/validate.py
tools/vcf_tools/annotate.py
tools/vcf_tools/annotate.xml
tools/vcf_tools/bedClass.py
tools/vcf_tools/extract.py
tools/vcf_tools/extract.xml
tools/vcf_tools/filter.py
tools/vcf_tools/filter.xml
tools/vcf_tools/intersect.py
tools/vcf_tools/intersect.xml
tools/vcf_tools/tools.py
tools/vcf_tools/vcfClass.py
tools/vcf_tools/vcfPytools.py
tools/visualization/GMAJ.py
tools/visualization/GMAJ.xml
tools/visualization/LAJ.py
tools/visualization/LAJ.xml
tools/visualization/LAJ_code.py
tools/visualization/build_ucsc_custom_track.py
tools/visualization/build_ucsc_custom_track.xml
tools/visualization/build_ucsc_custom_track_code.py
diff -r 000000000000 -r 9071e359b9a3 tools/.DS_Store
Binary file tools/.DS_Store has changed
diff -r 000000000000 -r 9071e359b9a3 tools/._.DS_Store
Binary file tools/._.DS_Store has changed
diff -r 000000000000 -r 9071e359b9a3 tools/._mytools
Binary file tools/._mytools has changed
diff -r 000000000000 -r 9071e359b9a3 tools/._tool_conf.xml
Binary file tools/._tool_conf.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/annotation_profiler/annotation_profiler.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/annotation_profiler/annotation_profiler.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,136 @@
+<tool id="Annotation_Profiler_0" name="Profile Annotations" version="1.0.0">
+  <description>for a set of genomic intervals</description>
+  <command interpreter="python">annotation_profiler_for_interval.py -i $input1 -c ${input1.metadata.chromCol} -s ${input1.metadata.startCol} -e ${input1.metadata.endCol} -o $out_file1 $keep_empty -p ${GALAXY_DATA_INDEX_DIR}/annotation_profiler/$dbkey $summary -b 3 -t $table_names</command>
+  <inputs>
+    <param format="interval" name="input1" type="data" label="Choose Intervals">
+      <validator type="dataset_metadata_in_file" filename="annotation_profiler_valid_builds.txt" metadata_name="dbkey" metadata_column="0" message="Profiling is not currently available for this species."/>
+    </param>
+    <param name="keep_empty" type="select" label="Keep Region/Table Pairs with 0 Coverage">
+      <option value="-k">Keep</option>
+      <option value="" selected="true">Discard</option>
+    </param>
+    <param name="summary" type="select" label="Output per Region/Summary">
+      <option value="-S">Summary</option>
+      <option value="" selected="true">Per Region</option>
+    </param>
+    <param name="table_names" type="drill_down" display="checkbox" hierarchy="recurse" multiple="true" label="Choose Tables to Use" help="Selecting no tables will result in using all tables." from_file="annotation_profiler_options.xml"/>
+   </inputs>
+   <outputs>
+     <data format="input" name="out_file1">
+       <change_format>
+         <when input="summary" value="-S" format="tabular" />
+       </change_format>
+     </data>
+   </outputs>
+   <tests>
+     <test>
+       <param name="input1" value="4.bed" dbkey="hg18"/>
+       <param name="keep_empty" value=""/>
+       <param name="summary" value=""/>
+       <param name="table_names" value="acembly,affyGnf1h,knownAlt,knownGene,mrna,multiz17way,multiz28way,refGene,snp126"/>
+       <output name="out_file1" file="annotation_profiler_1.out" />
+     </test>
+     <test>
+       <param name="input1" value="3.bed" dbkey="hg18"/>
+       <param name="keep_empty" value=""/>
+       <param name="summary" value="Summary"/>
+       <param name="table_names" value="acembly,affyGnf1h,knownAlt,knownGene,mrna,multiz17way,multiz28way,refGene,snp126"/>
+       <output name="out_file1" file="annotation_profiler_2.out" />
+     </test>
+   </tests>
+   <help>
+**What it does**
+
+Takes an input set of intervals and for each interval determines the base coverage of the interval by a set of features (tables) available from UCSC. Genomic regions from the input feature data have been merged by overlap / direct adjacency (e.g. a table having ranges of: 1-10, 6-12, 12-20 and 25-28 results in two merged ranges of: 1-20 and 25-28).
+
+By default, this tool will check the coverage of your intervals against all available features; you may, however, choose to select only those tables that you want to include. Selecting a section heading will effectively cause all of its children to be selected.
+
+You may alternatively choose to receive a summary across all of the intervals that you provide.
+
+-----
+
+**Example**
+
+Using the interval below and selecting several tables::
+
+ chr1 4558 14764 uc001aab.1 0 -
+
+results in::
+
+ chr1 4558 14764 uc001aab.1 0 - snp126Exceptions 151 142
+ chr1 4558 14764 uc001aab.1 0 - genomicSuperDups 10206 1
+ chr1 4558 14764 uc001aab.1 0 - chainOryLat1 3718 1
+ chr1 4558 14764 uc001aab.1 0 - multiz28way 10206 1
+ chr1 4558 14764 uc001aab.1 0 - affyHuEx1 3553 32
+ chr1 4558 14764 uc001aab.1 0 - netXenTro2 3050 1
+ chr1 4558 14764 uc001aab.1 0 - intronEst 10206 1
+ chr1 4558 14764 uc001aab.1 0 - xenoMrna 10203 1
+ chr1 4558 14764 uc001aab.1 0 - ctgPos 10206 1
+ chr1 4558 14764 uc001aab.1 0 - clonePos 10206 1
+ chr1 4558 14764 uc001aab.1 0 - chainStrPur2Link 1323 29
+ chr1 4558 14764 uc001aab.1 0 - affyTxnPhase3HeLaNuclear 9011 8
+ chr1 4558 14764 uc001aab.1 0 - snp126orthoPanTro2RheMac2 61 58
+ chr1 4558 14764 uc001aab.1 0 - snp126 205 192
+ chr1 4558 14764 uc001aab.1 0 - chainEquCab1 10206 1
+ chr1 4558 14764 uc001aab.1 0 - netGalGal3 3686 1
+ chr1 4558 14764 uc001aab.1 0 - phastCons28wayPlacMammal 10172 3
+
+Where::
+
+ The first added column is the table name.
+ The second added column is the number of bases covered by the table.
+ The third added column is the number of regions from the table that are covered by the interval.
+
+Alternatively, requesting a summary, using the intervals below and selecting several tables::
+
+ chr1 4558 14764 uc001aab.1 0 -
+ chr1 4558 19346 uc001aac.1 0 -
+
+results in::
+
+ #tableName tableSize tableRegionCount allIntervalCount allIntervalSize allCoverage allTableRegionsOverlaped allIntervalsOverlapingTable nrIntervalCount nrIntervalSize nrCoverage nrTableRegionsOverlaped nrIntervalsOverlapingTable
+ snp126Exceptions 133601 92469 2 24994 388 359 2 1 14788 237 217 1
+ genomicSuperDups 12268847 657 2 24994 24994 2 2 1 14788 14788 1 1
+ chainOryLat1 70337730 2542 2 24994 7436 2 2 1 14788 3718 1 1
+ affyHuEx1 15703901 112274 2 24994 7846 70 2 1 14788 4293 38 1
+ netXenTro2 111440392 1877 2 24994 6100 2 2 1 14788 3050 1 1
+ snp126orthoPanTro2RheMac2 700436 690674 2 24994 124 118 2 1 14788 63 60 1
+ intronEst 135796064 2332 2 24994 24994 2 2 1 14788 14788 1 1
+ xenoMrna 129031327 1586 2 24994 20406 2 2 1 14788 10203 1 1
+ snp126 956976 838091 2 24994 498 461 2 1 14788 293 269 1
+ clonePos 224999719 39 2 24994 24994 2 2 1 14788 14788 1 1
+ chainStrPur2Link 7948016 119841 2 24994 2646 58 2 1 14788 1323 29 1
+ affyTxnPhase3HeLaNuclear 136797870 140244 2 24994 22601 17 2 1 14788 13590 9 1
+ multiz28way 225928588 38 2 24994 24994 2 2 1 14788 14788 1 1
+ ctgPos 224999719 39 2 24994 24994 2 2 1 14788 14788 1 1
+ chainEquCab1 246306414 141 2 24994 24994 2 2 1 14788 14788 1 1
+ netGalGal3 203351973 461 2 24994 7372 2 2 1 14788 3686 1 1
+ phastCons28wayPlacMammal 221017670 22803 2 24994 24926 6 2 1 14788 14754 3 1
+
+Where::
+
+ tableName is the name of the table
+ tableChromosomeCoverage is the number of positions existing in the table for only the chromosomes that were referenced by the interval file
+ tableChromosomeCount is the number of regions existing in the table for only the chromosomes that were referenced by the interval file
+ tableRegionCoverage is the number of positions existing in the table between the minimal and maximal bounding regions that were referenced by the interval file
+ tableRegionCount is the number of regions existing in the table between the minimal and maximal bounding regions that were referenced by the interval file
+
+ allIntervalCount is the number of provided intervals
+ allIntervalSize is the sum of the lengths of the provided interval file
+ allCoverage is the sum of the coverage for each provided interval
+ allTableRegionsOverlapped is the sum of the number of regions of the table (non-unique) that were overlapped for each interval
+ allIntervalsOverlappingTable is the number of provided intervals which overlap the table
+
+ nrIntervalCount is the number of non-redundant intervals
+ nrIntervalSize is the sum of the lengths of non-redundant intervals
+ nrCoverage is the sum of the coverage of non-redundant intervals
+ nrTableRegionsOverlapped is the number of regions of the table (unique) that were overlapped by the non-redundant intervals
+ nrIntervalsOverlappingTable is the number of non-redundant intervals which overlap the table
+
+.. class:: infomark
+
+**TIP:** non-redundant (nr) refers to the set of intervals that remains after the intervals provided have been merged to resolve overlaps
+
+  </help>
+</tool>
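
The merge-and-count behavior described in the profiler help above can be sketched in a few lines of Python. This is an illustrative sketch only (the function names are ours, not the tool's); the actual implementation reads pre-merged, struct-packed range files, as the next diff shows::

    # Merge ranges that overlap or are directly adjacent, then count how many
    # bases and how many merged regions of a table an interval covers.
    def merge_ranges(ranges):
        merged = []
        for start, end in sorted(ranges):
            if merged and start <= merged[-1][1]:   # overlap or adjacency
                merged[-1] = (merged[-1][0], max(merged[-1][1], end))
            else:
                merged.append((start, end))
        return merged

    def coverage(interval, table_ranges):
        # Returns (bases covered, table regions overlapped); half-open coords.
        start, end = interval
        bases = regions = 0
        for r_start, r_end in merge_ranges(table_ranges):
            if r_start < end and r_end > start:
                bases += min(end, r_end) - max(start, r_start)
                regions += 1
        return bases, regions

    # Reproduces the example from the help text: 1-10, 6-12, 12-20 and 25-28
    # collapse into the two merged ranges 1-20 and 25-28.
    assert merge_ranges([(1, 10), (6, 12), (12, 20), (25, 28)]) == [(1, 20), (25, 28)]
    assert coverage((5, 30), [(1, 10), (6, 12), (12, 20), (25, 28)]) == (18, 2)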
diff -r 000000000000 -r 9071e359b9a3 tools/annotation_profiler/annotation_profiler_for_interval.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/annotation_profiler/annotation_profiler_for_interval.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,360 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+#For a set of intervals, this tool returns the same set of intervals
+#with 2 additional fields: the name of a Table/Feature and the number of
+#bases covered. The original intervals are repeated for each Table/Feature.
+
+import sys, struct, optparse, os, random
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.intervals.io
+import bx.bitset
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+class CachedRangesInFile:
+    DEFAULT_STRUCT_FORMAT = '<I'
+    def __init__( self, filename, profiler_info ):
+        self.file_size = os.stat( filename ).st_size
+        self.file = open( filename, 'rb' )
+        self.filename = filename
+        self.fmt = profiler_info.get( 'profiler_struct_format', self.DEFAULT_STRUCT_FORMAT )
+        self.fmt_size = int( profiler_info.get( 'profiler_struct_size', struct.calcsize( self.fmt ) ) )
+        self.length = int( self.file_size / self.fmt_size / 2 )
+        self._cached_ranges = [ None for i in xrange( self.length ) ]
+    def __getitem__( self, i ):
+        if self._cached_ranges[i] is not None:
+            return self._cached_ranges[i]
+        if i < 0: i = self.length + i
+        offset = i * self.fmt_size * 2
+        self.file.seek( offset )
+        try:
+            start = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
+            end = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
+        except Exception, e:
+            raise IndexError, e
+        self._cached_ranges[i] = ( start, end )
+        return start, end
+    def __len__( self ):
+        return self.length
+
+class RegionCoverage:
+    def __init__( self, filename_base, profiler_info ):
+        try:
+            self._coverage = CachedRangesInFile( "%s.covered" % filename_base, profiler_info )
+        except Exception, e:
+            #print "Error loading coverage file %s: %s" % ( "%s.covered" % filename_base, e )
+            self._coverage = []
+        try:
+            self._total_coverage = int( open( "%s.total_coverage" % filename_base ).read() )
+        except Exception, e:
+            #print "Error loading total coverage file %s: %s" % ( "%s.total_coverage" % filename_base, e )
+            self._total_coverage = 0
+    def get_start_index( self, start ):
+        #binary search: returns index of range closest to start
+        if start > self._coverage[-1][1]:
+            return len( self._coverage ) - 1
+        i = 0
+        j = len( self._coverage ) - 1
+        while i < j:
+            k = ( i + j ) / 2
+            if start <= self._coverage[k][1]:
+                j = k
+            else:
+                i = k + 1
+        return i
+    def get_coverage( self, start, end ):
+        return self.get_coverage_regions_overlap( start, end )[0]
+    def get_coverage_regions_overlap( self, start, end ):
+        return self.get_coverage_regions_index_overlap( start, end )[0:2]
+    def get_coverage_regions_index_overlap( self, start, end ):
+        if len( self._coverage ) < 1 or start > self._coverage[-1][1] or end < self._coverage[0][0]:
+            return 0, 0, 0
+        if self._total_coverage and start <= self._coverage[0][0] and end >= self._coverage[-1][1]:
+            return self._total_coverage, len( self._coverage ), 0
+        coverage = 0
+        region_count = 0
+        start_index = self.get_start_index( start )
+        for i in xrange( start_index, len( self._coverage ) ):
+            c_start, c_end = self._coverage[i]
+            if c_start > end:
+                break
+            if c_start <= end and c_end >= start:
+                coverage += min( end, c_end ) - max( start, c_start )
+                region_count += 1
+        return coverage, region_count, start_index
+
+class Cac
[...]
                if len( fields ) == 2:
+                        self.chroms[ fields[0] ] = int( fields[1] )
+                    else:
+                        self.chroms[ fields[0] ] = self.default_bitset_size
+    def get( self, name ):
+        return self.chroms.get( name, self.default_bitset_size )
+
+def parse_profiler_info( filename ):
+    profiler_info = {}
+    try:
+        for line in open( filename ):
+            fields = line.rstrip( '\n\r' ).split( '\t', 1 )
+            if len( fields ) == 2:
+                if fields[0] in profiler_info:
+                    if not isinstance( profiler_info[ fields[0] ], list ):
+                        profiler_info[ fields[0] ] = [ profiler_info[ fields[0] ] ]
+                    profiler_info[ fields[0] ].append( fields[1] )
+                else:
+                    profiler_info[ fields[0] ] = fields[1]
+    except:
+        pass #likely missing file
+    return profiler_info
+
+def __main__():
+    parser = optparse.OptionParser()
+    parser.add_option(
+        '-k','--keep_empty',
+        action="store_true",
+        dest='keep_empty',
+        default=False,
+        help='Keep tables with 0 coverage'
+    )
+    parser.add_option(
+        '-b','--buffer',
+        dest='buffer',
+        type='int',default=10,
+        help='Number of Chromosomes to keep buffered'
+    )
+    parser.add_option(
+        '-c','--chrom_col',
+        dest='chrom_col',
+        type='int',default=1,
+        help='Chromosome column'
+    )
+    parser.add_option(
+        '-s','--start_col',
+        dest='start_col',
+        type='int',default=2,
+        help='Start Column'
+    )
+    parser.add_option(
+        '-e','--end_col',
+        dest='end_col',
+        type='int',default=3,
+        help='End Column'
+    )
+    parser.add_option(
+        '-p','--path',
+        dest='path',
+        type='str',default='/galaxy/data/annotation_profiler/hg18',
+        help='Path to profiled data for this organism'
+    )
+    parser.add_option(
+        '-t','--table_names',
+        dest='table_names',
+        type='str',default='None',
+        help='Table names requested'
+    )
+    parser.add_option(
+        '-i','--input',
+        dest='interval_filename',
+        type='str',
+        help='Input Interval File'
+    )
+    parser.add_option(
+        '-o','--output',
+        dest='out_filename',
+        type='str',
+        help='Input Interval File'
+    )
+    parser.add_option(
+        '-S','--summary',
+        action="store_true",
+        dest='summary',
+        default=False,
+        help='Display Summary Results'
+    )
+
+    options, args = parser.parse_args()
+
+    assert os.path.isdir( options.path ), IOError( "Configuration error: Table directory is missing (%s)" % options.path )
+
+    #get profiler_info
+    profiler_info = parse_profiler_info( os.path.join( options.path, 'profiler_info.txt' ) )
+
+    table_names = options.table_names.split( "," )
+    if table_names == ['None']: table_names = None
+    coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names, profiler_info = profiler_info )
+
+    if options.summary:
+        profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( profiler_info ) )
+    else:
+        profile_per_interval( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader )
+
+    #print out data version info
+    print 'Data version (%s:%s:%s)' % ( profiler_info.get( 'dbkey', 'unknown' ), profiler_info.get( 'profiler_hash', 'unknown' ), profiler_info.get( 'dump_time', 'unknown' ) )
+
+if __name__ == "__main__": __main__()
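
Judging from CachedRangesInFile above, the per-table ".covered" files are flat binary arrays of (start, end) pairs packed with the default struct format '<I' (little-endian unsigned 32-bit integers), read back by seeking to index * record size * 2. A minimal round-trip sketch under that assumption (the file name is illustrative)::

    import struct

    FMT = '<I'                    # little-endian unsigned 32-bit int
    SIZE = struct.calcsize(FMT)   # 4 bytes per value

    # Write merged ranges as consecutive packed (start, end) pairs.
    with open('example.covered', 'wb') as f:
        for start, end in [(1, 20), (25, 28)]:
            f.write(struct.pack(FMT, start) + struct.pack(FMT, end))

    # Random access to the i-th pair, mirroring CachedRangesInFile.__getitem__.
    def read_range(path, i):
        with open(path, 'rb') as f:
            f.seek(i * SIZE * 2)
            return (struct.unpack(FMT, f.read(SIZE))[0],
                    struct.unpack(FMT, f.read(SIZE))[0])

    assert read_range('example.covered', 1) == (25, 28)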
diff -r 000000000000 -r 9071e359b9a3 tools/bedtools/._bedToBam.xml
Binary file tools/bedtools/._bedToBam.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/bedtools/bedToBam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/bedtools/bedToBam.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,29 @@
+<tool id="bedToBam" name="bedToBam">
+  <description>convert BED or GFF or VCF to BAM</description>
+  <command>bedToBam -i $input -g $genome $bed12 -mapq $mapq $ubam > $outfile </command>
+  <inputs>
+    <param name="input" format="bed,gff,vcf" type="data" label="Input file (BED,GFF,VCF)" help="BED files must be at least BED4 to be amenable to BAM (needs name field)"/>
+    <param name="genome" type="select" label="Select genome">
+     <option value="/Users/xuebing/tools/BEDTools-Version-2.13.3/genomes/mouse.mm9.genome" selected="true">mm9</option>
+     <option value="/Users/xuebing/tools/BEDTools-Version-2.13.3/genomes/mouse.mm8.genome">mm8</option>
+     <option value="/Users/xuebing/tools/BEDTools-Version-2.13.3/genomes/mouse.hg18.genome">hg18</option>
+     <option value="/Users/xuebing/tools/BEDTools-Version-2.13.3/genomes/mouse.hg19.genome">hg19</option>
+    </param>
+    <param name="mapq" size="10" type="integer" value="255" label="Set the mappinq quality for the BAM records"/>
+    <param name="bed12" label="The BED file is in BED12 format" help="The BAM CIGAR string will reflect BED blocks" type="boolean" truevalue="-bed12" falsevalue="" checked="False"/>
+    <param name="ubam" label="Write uncompressed BAM output" help="Default is to write compressed BAM" type="boolean" truevalue="-ubam" falsevalue="" checked="False"/>
+  </inputs>
+  <outputs>
+    <data format="bam" name="outfile" />
+  </outputs>
+  <help>
+
+**What it does**
+
+Program: bedToBam (v2.13.3)
+Author:  Aaron Quinlan (aaronquinlan@gmail.com)
+Summary: Converts feature records to BAM format.
+
+
+  </help>
+</tool>
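For illustration, the Cheetah command template above might expand as follows once Galaxy substitutes the parameters. This is only a sketch of the resulting shell call, not Galaxy's job runner, and the file names are hypothetical:

    import subprocess

    # Hypothetical values standing in for $input, $genome and $mapq:
    cmd = [ 'bedToBam', '-i', 'reads.bed',
            '-g', '/Users/xuebing/tools/BEDTools-Version-2.13.3/genomes/mouse.mm9.genome',
            '-mapq', '255' ] # plus '-bed12' and/or '-ubam' when those booleans are checked
    with open( 'out.bam', 'wb' ) as outfile: # the template's "> $outfile" redirection
        subprocess.check_call( cmd, stdout=outfile )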
diff -r 000000000000 -r 9071e359b9a3 tools/data_destination/epigraph.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_destination/epigraph.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,41 @@
+<?xml version="1.0"?>
+<tool name="Perform genome analysis" id="epigraph_export">
+    <description> and prediction with EpiGRAPH</description>
+    <redirect_url_params>GENOME=${input1.dbkey} NAME=${input1.name} INFO=${input1.info}</redirect_url_params>
+    <inputs>
+        <param format="bed" name="input1" type="data" label="Send this dataset to EpiGRAPH">
+            <validator type="unspecified_build" />
+        </param>
+        <param name="REDIRECT_URL" type="hidden" value="http://epigraph.mpi-inf.mpg.de/WebGRAPH/faces/DataImport.jsp" />
+        <param name="DATA_URL" type="baseurl" value="/datasets" />
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=epigraph_import" />
+    </inputs>
+    <outputs/>
+    <help>
+
+.. class:: warningmark
+
+After clicking the **Execute** button, you will be redirected to the EpiGRAPH website. Please be patient while the dataset is being imported. Inside EpiGRAPH, buttons are available to send the results of the EpiGRAPH analysis back to Galaxy. In addition, you can always abandon an EpiGRAPH session and return to Galaxy by directing your browser to your current Galaxy instance.
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool sends the selected dataset to EpiGRAPH in order to perform an in-depth analysis with statistical and machine learning methods.
+
+-----
+
+.. class:: infomark
+
+**EpiGRAPH outline**
+
+The EpiGRAPH_ web service enables biologists to uncover hidden associations in vertebrate genome and epigenome datasets. Users can upload or import sets of genomic regions and EpiGRAPH will test a wide range of attributes (including DNA sequence and structure, gene density, chromatin modifications and evolutionary conservation) for enrichment or depletion among these regions. Furthermore, EpiGRAPH learns to predictively identify genomic regions that exhibit similar properties.
+
+.. _EpiGRAPH: http://epigraph.mpi-inf.mpg.de/
+
+    </help>
+</tool>
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/data_destination/epigraph_test.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_destination/epigraph_test.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,40 @@
+<?xml version="1.0"?>
+<tool name="Perform genome analysis" id="epigraph_test_export">
+    <description> and prediction with EpiGRAPH Test</description>
+    <redirect_url_params>GENOME=${input1.dbkey} NAME=${input1.name} INFO=${input1.info}</redirect_url_params>
+    <inputs>
+        <param format="bed" name="input1" type="data" label="Send this dataset to EpiGRAPH">
+            <validator type="unspecified_build" />
+        </param>
+        <param name="REDIRECT_URL" type="hidden" value="http://epigraph.mpi-inf.mpg.de/WebGRAPH_Public_Test/faces/DataImport.jsp" />
+        <param name="DATA_URL" type="baseurl" value="/datasets" />
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=epigraph_import" />
+    </inputs>
+    <outputs/>
+    <help>
+
+.. class:: warningmark
+
+After clicking the **Execute** button, you will be redirected to the EpiGRAPH test website. Please be patient while the dataset is being imported. Inside EpiGRAPH, buttons are available to send the results of the EpiGRAPH analysis back to Galaxy. In addition, you can always abandon an EpiGRAPH session and return to Galaxy by directing your browser to your current Galaxy instance.
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool sends the selected dataset to EpiGRAPH in order to perform an in-depth analysis with statistical and machine learning methods.
+
+-----
+
+.. class:: infomark
+
+**EpiGRAPH outline**
+
+The EpiGRAPH_ web service enables biologists to uncover hidden associations in vertebrate genome and epigenome datasets. Users can upload or import sets of genomic regions and EpiGRAPH will test a wide range of attributes (including DNA sequence and structure, gene density, chromatin modifications and evolutionary conservation) for enrichment or depletion among these regions. Furthermore, EpiGRAPH learns to predictively identify genomic regions that exhibit similar properties.
+
+.. _EpiGRAPH: http://epigraph.mpi-inf.mpg.de/
+
+    </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/access_libraries.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/access_libraries.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tool name="Access Libraries" id="library_access1">
+    <description>stored locally</description>
+    <inputs action="/library/index" method="get" target="_parent">
+        <param name="default_action" type="hidden" value="import_to_histories" />
+    </inputs>
+    <uihints minwidth="800"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/bed_convert.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/bed_convert.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,14 @@
+<tool id="BED File Converter1" name="BED File Converter">
+  <description>creates a BED or xBED file from a text query</description>
+  <command>noop</command>
+  <inputs>
+    <display>creates a BED or xBED file from the user-assigned input $input</display>
+    <param format="tabular" name="input" type="data" />
+    <param name="chrom" size="4" type="text" value="all" />
+  </inputs>
+  <outputs>
+    <data format="bed" name="out_file1" />
+  </outputs>
+  <help>User specifies delimiter, header information, and column assignments and the file will be converted to BED or xBED.
+</help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/biomart.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/biomart.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+
+    TODO: Hack to get biomart to work - the 'add_to_URL' param can be eliminated when the Biomart team encodes URL prior to sending, meanwhile
+    everything including and beyond the first '&' is truncated from URL.  They said they'll let us know when this is fixed at their end.
+-->
+<tool name="BioMart" id="biomart" tool_type="data_source" version="1.0.1">
+ <description>Central server</description>
+ <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+ <inputs action="http://www.biomart.org/biomart/martview" check_values="false" method="get" target="_top">
+ <display>go to BioMart Central $GALAXY_URL</display>
+ <param name="GALAXY_URL" type="baseurl" value="/tool_runner/biomart" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="_export" missing="1" />
+                <value name="GALAXY_URL" missing="0" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="data_type" remote_name="exportView_outputformat" missing="tabular" >
+            <value_translation>
+                <value galaxy_value="tabular" remote_value="TSV" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="Biomart query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+    </request_param_translation>
+ <uihints minwidth="800"/>
+ <outputs>
+ <data name="output" format="tabular" />
+ </outputs>
+ <options sanitize="False" refresh="True"/>
+</tool>
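A minimal sketch of the append_param semantics used above (Galaxy's actual request_param machinery differs; this only illustrates how _export=1 and GALAXY_URL=0 end up appended to the returned URL, using the separator, first_separator and join attributes from the XML):

    def append_params( url, values, first_separator='?', separator='&', join='=' ):
        parts = [ name + join + str( value ) for name, value in values ]
        return url + first_separator + separator.join( parts )

    # append_params( 'http://www.biomart.org/biomart/martview',
    #                [ ('_export', 1), ('GALAXY_URL', 0) ] )
    # -> 'http://www.biomart.org/biomart/martview?_export=1&GALAXY_URL=0'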
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/biomart_test.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/biomart_test.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+
+    TODO: Hack to get biomart to work - the 'add_to_URL' param can be eliminated when the Biomart team encodes URL prior to sending, meanwhile
+    everything including and beyond the first '&' is truncated from URL.  They said they'll let us know when this is fixed at their end.
+-->
+<tool name="BioMart" id="biomart_test" tool_type="data_source" version="1.0.1">
+ <description>Test server</description>
+ <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+ <inputs action="http://test.biomart.org/biomart/martview" check_values="false" method="get" target="_top">
+ <display>go to BioMart Central $GALAXY_URL</display>
+ <param name="GALAXY_URL" type="baseurl" value="/tool_runner/biomart" />
+ </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="_export" missing="1" />
+                <value name="GALAXY_URL" missing="0" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="data_type" remote_name="exportView_outputformat" missing="tabular" >
+            <value_translation>
+                <value galaxy_value="tabular" remote_value="TSV" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="Biomart test query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+    </request_param_translation>
+ <uihints minwidth="800"/>
+ <outputs>
+ <data name="output" format="tabular" />
+ </outputs>
+ <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/bx_browser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/bx_browser.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,41 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="BX main" id="bx_browser" tool_type="data_source">
+    <description>browser</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://main.genome-browser.bx.psu.edu/cgi-bin/hgTables" check_values="false" method="get">
+        <display>go to BX Browser $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner" />
+        <param name="tool_id" type="hidden" value="bx_browser" />
+        <param name="sendToGalaxy" type="hidden" value="1" />
+        <param name="hgta_compressType" type="hidden" value="none" />
+        <param name="hgta_outputType" type="hidden" value="bed" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="org" missing="unknown species" />
+        <request_param galaxy_name="table" remote_name="hgta_table" missing="unknown table" />
+        <request_param galaxy_name="description" remote_name="hgta_regionType" missing="no description" />
+        <request_param galaxy_name="data_type" remote_name="hgta_outputType" missing="tabular" >
+            <value_translation>
+                <value galaxy_value="tabular" remote_value="primaryTable" />
+                <value galaxy_value="tabular" remote_value="selectedFields" />
+                <value galaxy_value="wig" remote_value="wigData" />
+                <value galaxy_value="interval" remote_value="tab" />
+                <value galaxy_value="html" remote_value="hyperlinks" />
+                <value galaxy_value="fasta" remote_value="sequence" />
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
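For reference, the data_type value_translation above written out as a plain mapping (a sketch, not Galaxy's implementation): the remote hgta_outputType value selects the Galaxy datatype of the result, falling back to tabular when missing:

    HGTA_OUTPUT_TO_GALAXY_FORMAT = {
        'primaryTable'  : 'tabular',
        'selectedFields': 'tabular',
        'wigData'       : 'wig',
        'tab'           : 'interval',
        'hyperlinks'    : 'html',
        'sequence'      : 'fasta',
    }

    def translate_data_type( remote_value, missing='tabular' ):
        return HGTA_OUTPUT_TO_GALAXY_FORMAT.get( remote_value, missing )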
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/cbi_rice_mart.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/cbi_rice_mart.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,39 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="CBI Rice Mart" id="cbi_rice_mart" tool_type="data_source" version="1.0.1">
+    <description>rice mart</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://ricemart.cbi.edu.cn/biomart/martview/" check_values="false" method="get" target="_top">
+        <display>go to RMap rice mart $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner/biomart" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="_export" missing="1" />
+                <value name="GALAXY_URL" missing="0" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="data_type" remote_name="exportView_outputformat" missing="tabular" >
+            <value_translation>
+                <value galaxy_value="tabular" remote_value="TSV" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="Rice mart query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/data_source.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/data_source.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+# Retrieves data from external data source applications and stores in a dataset file.
+# Data source application parameters are temporarily stored in the dataset file.
+import socket, urllib, sys, os
+from galaxy import eggs #eggs needs to be imported so that galaxy.util can find docutils egg...
+from galaxy.util.json import from_json_string, to_json_string
+import galaxy.model # need to import model before sniff to resolve a circular import dependency
+from galaxy.datatypes import sniff
+from galaxy.datatypes.registry import Registry
+from galaxy.jobs import TOOL_PROVIDED_JOB_METADATA_FILE
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+GALAXY_PARAM_PREFIX = 'GALAXY'
+GALAXY_ROOT_DIR = os.path.realpath( os.path.join( os.path.split( os.path.realpath( __file__ ) )[0], '..', '..' ) )
+GALAXY_DATATYPES_CONF_FILE = os.path.join( GALAXY_ROOT_DIR, 'datatypes_conf.xml' )
+
+def load_input_parameters( filename, erase_file = True ):
+    datasource_params = {}
+    try:
+        json_params = from_json_string( open( filename, 'r' ).read() )
+        datasource_params = json_params.get( 'param_dict' )
+    except:
+        json_params = None
+        for line in open( filename, 'r' ):
+            try:
+                line = line.strip()
+                fields = line.split( '\t' )
+                datasource_params[ fields[0] ] = fields[1]
+            except:
+                continue
+    if erase_file:
+        open( filename, 'w' ).close() #open file for writing, then close, removes params from file
+    return json_params, datasource_params
+
+def __main__():
+    filename = sys.argv[1]
+    try:
+        max_file_size = int( sys.argv[2] )
+    except:
+        max_file_size = 0
+    
+    job_params, params = load_input_parameters( filename )
+    if job_params is None: #using an older tabular file
+        enhanced_handling = False
+        job_params = dict( param_dict = params )
+        job_params[ 'output_data' ] =  [ dict( out_data_name = 'output',
+                                               ext = 'data',
+                                               file_name = filename,
+                                               extra_files_path = None ) ]
+        job_params[ 'job_config' ] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE = TOOL_PROVIDED_JOB_METADATA_FILE )
+    else:
+        enhanced_handling = True
+        json_file = open( job_params[ 'job_config' ][ 'TOOL_PROVIDED_JOB_METADATA_FILE' ], 'w' ) #specially named file for output junk to pass onto set metadata
+    
+    datatypes_registry = Registry( root_dir = job_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = job_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )
+    
+    URL = params.get( 'URL', None ) #using exactly URL indicates that only one dataset is being downloaded
+    URL_method = params.get( 'URL_method', None )
+    
+    # The Python support for fetching resources from the web is layered. urllib uses the httplib
+    # library, which in turn uses the socket library.  As of Python 2.3 you can specify how long
+    # a socket should wait for a response before timing out. By default the socket module has no
+    # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2
+    # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by
+    # doing the following.
+    socket.setdefaulttimeout( 600 )
+    
+    for data_dict in job_params[ 'output_data' ]:
+        cur_filename =  data_dict.get( 'file_name', filename )
+        cur_URL =  params.get( '%s|%s|URL' % ( GALAXY_PARAM_PREFIX, data_dict[ 'out_data_name' ] ), URL )
+        if not cur_URL:
+            open( cur_filename, 'w' ).write( "" )
+            stop_err( 'The remote data source application has not sent back a URL parameter in the request.' )
+        
+        # The following calls to urllib.urlopen() will use the above default timeout
+        try:
+            if not URL_method or URL_method == 'get':
+                page = urllib.urlopen( cur_URL )
+            elif URL_method == 'post':
+                page = urllib.urlopen( cur_URL, urllib.urlencode( params ) )
+        except Exception, e:
+            stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )
+        if max_file_size:
+            file_size = int( page.info().get( 'Content-Length', 0 ) )
+            if file_size > max_file_size:
+                stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) )
+        #do sniff stream for multi_byte
+        try:
+            cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename )
+        except Exception, e:
+            stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) )
+        
+        #here import checks that upload tool performs
+        if enhanced_handling:
+            try:
+                ext = sniff.handle_uploaded_dataset_file( filename, datatypes_registry, ext = data_dict[ 'ext' ], is_multi_byte = is_multi_byte )
+            except Exception, e:
+                stop_err( str( e ) )
+            info = dict( type = 'dataset',
+                         dataset_id = data_dict[ 'dataset_id' ],
+                         ext = ext)
+            
+            json_file.write( "%s\n" % to_json_string( info ) )
+    
+if __name__ == "__main__": __main__()
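For reference, the older tabular parameter file that load_input_parameters() falls back to is just tab-separated key/value pairs, one per line. A sketch of the parse with illustrative values (not taken from a real job):

    params_txt = 'URL\thttp://example.org/export?id=42\nURL_method\tget\n'

    datasource_params = {}
    for line in params_txt.splitlines():
        fields = line.strip().split( '\t' )
        if len( fields ) == 2:
            datasource_params[ fields[0] ] = fields[1]

    print( datasource_params ) # {'URL': 'http://example.org/export?id=42', 'URL_method': 'get'}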
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/echo.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/echo.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+
+"""
Script that just echoes the command line.
+"""
+
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+print '-' * 20, "<br>"
+for elem in sys.argv:
+    print elem, "<br>"
+print '-' * 20, "<br>"
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/echo.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/echo.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+
+<tool name="Echo" id="echo1">
+
+ <description>
+ echoes parameters  
+ </description>
+
+ <command interpreter="python">echo.py $input $database $output </command>
+
+ <inputs>
+ <param format="tabular" name="input" type="data" label="Input stuff"/>
+        <param type="select" name="database" label="Database">
+            <option value="alignseq.loc">Human (hg18)</option>
+            <option value="faseq.loc">Fly (dm3)</option>
+        </param>
+ </inputs>
+
+ <outputs>
+ <data format="input" name="output" label="Blat on ${database.value_label}" />
+ </outputs>
+
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/encode_db.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/encode_db.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+
+<tool name="EncodeDB" id="encode_db1">
+
+ <description>
+ at NHGRI 
+ </description>
+
+ <command interpreter="python">
+ fetch.py "$url" $output
+ </command>
+
+ <inputs action="http://research.nhgri.nih.gov/projects/ENCODEdb/cgi-bin/power_query.cgi" target="_top"> 
+<!-- <inputs action="http://localhost:9000/prepared"> -->
+ <display>go to EncodeDB $GALAXY_URL</display>
+ <param name="GALAXY_URL" type="baseurl" value="/async/encode_db1" />
+ </inputs>
+
+  <uihints minwidth="800"/>
+  
+  <outputs>
+    <data format="bed" name="output" />
+  </outputs>
+
+ <options sanitize="False" refresh="True"/>
+
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/epigraph_import.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/epigraph_import.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="EpiGRAPH" id="epigraph_import" tool_type="data_source">
+    <description> server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://epigraph.mpi-inf.mpg.de/WebGRAPH/faces/Login.jsp" check_values="false" method="get">
+        <display>go to EpiGRAPH server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=epigraph_import" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="GENOME" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="NAME" missing="EpiGRAPH query" />
+        <request_param galaxy_name="info" remote_name="INFO" missing="" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="txt" />
+    </request_param_translation>
+    <uihints minwidth="800"/>  
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/epigraph_import_test.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/epigraph_import_test.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="EpiGRAPH" id="epigraph_import_test" tool_type="data_source">
+    <description> test server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://epigraph.mpi-inf.mpg.de/WebGRAPH_Public_Test/faces/Login.jsp" check_values="false" method="get">
+        <display>go to EpiGRAPH server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=epigraph_import_test" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="GENOME" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="NAME" missing="EpiGRAPH query" />
+        <request_param galaxy_name="info" remote_name="INFO" missing="" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="txt" />
+    </request_param_translation>
+    <uihints minwidth="800"/>  
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/eupathdb.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/eupathdb.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,13 @@
+<tool name="EuPathDB" id="eupathdb" tool_type="data_source" url_method="post">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://eupathdb.org/eupathdb/queries_tools.jsp" check_values="false" method="get"> 
+        <display>go to EuPathDB server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=eupathdb" />
+    </inputs>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/fetch.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/fetch.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+
+"""
Script that fetches a URL and writes its contents to a file.
Usage: fetch.py url outputfile
+"""
+
+import sys, os, urllib
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+BUFFER = 1048576
+
+url      = sys.argv[1]
+out_name = sys.argv[2]
+
+out = open(out_name, 'wt')
+try:
+    page = urllib.urlopen(url)
+    while 1:
+        data = page.read(BUFFER)
+        if not data:
+            break
+        out.write(data)
+except Exception, e:
+    print 'Error getting the data -> %s' % e
+out.close()
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/fly_modencode.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/fly_modencode.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<tool name="modENCODE fly" id="modENCODEfly" tool_type="data_source">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://modencode.oicr.on.ca/fgb2/gbrowse/fly" check_values="false" target="_top"> 
+        <display>go to modENCODE fly server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=modENCODEfly" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="dm2" >
+            <value_translation>
+                <value galaxy_value="dm2" remote_value="fly" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="d" missing="" />
+                <value name="dbkey" missing="dm2" />
+                <value name="q" missing="" />
+                <value name="s" missing="" />
+                <value name="t" missing="" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" label="${tool.name} on $getVar( 'q', 'unknown position' )"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/flymine.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/flymine.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,35 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="Flymine" id="flymine" tool_type="data_source">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://www.flymine.org" check_values="false" method="get"> 
+        <display>go to Flymine server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=flymine" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="FlyMine query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="txt" /> <!-- intermine currently always provides 'txt', make this auto detect -->
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/flymine_test.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/flymine_test.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,31 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+-->
+<tool name="Flymine test" id="flymine_test" tool_type="data_source">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://preview.flymine.org/preview/begin.do" check_values="false" method="get"> 
+        <display>go to Flymine server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=flymine" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="FlyMine query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="txt" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/genbank.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/genbank.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+from Bio import GenBank
+import sys, os, textwrap
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def make_fasta(rec):
+    '''Creates fasta format from a record'''
+    gi   = rec.annotations.get('gi','')
+    org  = rec.annotations.get('organism','')
+    date = rec.annotations.get('date','')
+    head = '>gi:%s, id:%s, org:%s, date:%s\n' % (gi, rec.id, org, date)
+    body = '\n'.join(textwrap.wrap(rec.seq.data, width=80))
+    return head, body
+    
+if __name__ == '__main__':
+    
+    mode  = sys.argv[1]
+    text  = sys.argv[2]
+    output_file = sys.argv[3]
+
+    print 'Searching for %s <br>' % text
+    
+    # check if inputs are all numbers
+    try:
+        gi_list = text.split()
+        tmp = map(int, gi_list)
+    except ValueError:
+        gi_list = GenBank.search_for(text, max_ids=10)
+    
+    fp = open(output_file, 'wt')
+    record_parser = GenBank.FeatureParser()
+    ncbi_dict = GenBank.NCBIDictionary(mode, 'genbank', parser = record_parser)
+    for gid in gi_list:
+        res = ncbi_dict[gid]
+        head, body =  make_fasta(res)
+        fp.write(head+body+'\n')
+        print head
+    fp.close()
+
+   
+
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/genbank.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/genbank.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,25 @@
+<tool id="genbank" name="Connect to Genbank">
+<!--  <description>queries genbank</description> -->
+  <command interpreter="python">genbank.py $mode "$text" $output</command>
+  <inputs>
+    <param name="mode" type="select">
+      <option value="nucleotide">nucleotide database</option>
+      <option value="protein">proteins database</option>
+      <label>Get sequences from the</label>
+    </param>
+    <param name="text" size="40" type="text" value="6273291">
+      <label>with accession ID</label>
+    </param>   
+  </inputs>
+  <outputs>
+    <data format="fasta" name="output" />
+  </outputs>
+  <help>
+At the moment this tool allows the following simple searches:
+
+- by GI: **51594135**
+- by accession: **CF622840**
+- using text: **human hbb1** (this feature is experimental)
+  </help>
+
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/gramene_mart.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/gramene_mart.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+
+    TODO: Hack to get biomart to work - the 'add_to_URL' param can be eliminated when the Biomart team encodes URL prior to sending, meanwhile
+    everything including and beyond the first '&' is truncated from URL.  They said they'll let us know when this is fixed at their end.
+-->
+<tool name="GrameneMart" id="gramenemart" tool_type="data_source" version="1.0.1">
+    <description> Central server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://www.gramene.org/biomart/martview" check_values="false" method="get" target="_top">
+        <display>go to GrameneMart Central $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner/biomart" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="_export" missing="1" />
+                <value name="GALAXY_URL" missing="0" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="data_type" remote_name="exportView_outputformat" missing="tabular">
+            <value_translation>
+                <value galaxy_value="tabular" remote_value="TSV" />
+            </value_translation> 
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="Biomart query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/hapmapmart.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/hapmapmart.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,46 @@
+<?xml version="1.0"?>
+<!--
+    hacked from biomart.xml - testing hapmap biomart - problem is going to be converting these to lped/pbed
+    the data returned will be in all sorts of different shapes - and the sample ids need to be obtained separately
+    to create reliable pedigrees. eesh...
+
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If value of 'URL_method' is 'post', any additional params coming back in the
+    initial response ( in addition to 'URL' ) will be encoded and appended to URL and a post will be performed.
+
+    TODO: Hack to get biomart to work - the 'add_to_URL' param can be eliminated when the Biomart team encodes URL prior to sending, meanwhile
+    everything including and beyond the first '&' is truncated from URL.  They said they'll let us know when this is fixed at their end.
+-->
+<tool name="HapMapMart" id="hapmapmart" tool_type="data_source" version="0.0.01">
+ <description>HapMap Biomart</description>
+ <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+ <inputs action="http://hapmap.ncbi.nlm.nih.gov/biomart/martview" check_values="false" method="get" target="_top">
+ <display>go to HapMap BioMart $GALAXY_URL</display>
+ <param name="GALAXY_URL" type="baseurl" value="/tool_runner/hapmapmart" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="_export" missing="1" />
+                <value name="GALAXY_URL" missing="0" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="data_type" remote_name="exportView_outputformat" missing="tabular" >
+            <value_translation>
+                <value galaxy_value="tabular" remote_value="TSV" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="get" />
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="hg18" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="human" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="HapMap query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+    </request_param_translation>
+ <uihints minwidth="800"/>
+ <outputs>
+ <data name="output" format="tabular" />
+ </outputs>
+ <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/hbvar.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/hbvar.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<tool name="HbVar" id="hbvar">
+
+ <description>Human Hemoglobin Variants and Thalassemias</description>
+
+ <command/>
+
+ <inputs action="http://globin.bx.psu.edu/cgi-bin/hbvar/query_vars3" check_values="false" method="get" target="_top">
+ <display>go to HbVar database $GALAXY_URL $tool_id</display>
+ <param name="GALAXY_URL" type="baseurl" value="/tool_runner/hbvar" />
+ <param name="tool_id" type="hidden" value = "hbvar"/>
+ </inputs>
+
+ <uihints minwidth="800"/>
+
+ <code file="hbvar_filter.py"/>
+
+ <outputs>
+ <data name="output" format="txt" />
+ </outputs>
+
+ <options sanitize="False" refresh="True"/>
+
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/hbvar_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/hbvar_filter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,77 @@
+#TODO: Set dbkey to proper UCSC build, if known
+import urllib
+
+from galaxy import datatypes, config
+import tempfile, shutil
+
+def exec_before_job( app, inp_data, out_data, param_dict, tool=None):
+    """Sets the name of the data"""
+    data_name = param_dict.get( 'name', 'HbVar query' )
+    data_type = param_dict.get( 'type', 'txt' )
+    if data_type == 'txt': data_type='interval' #All data is TSV, assume interval
+    name, data = out_data.items()[0]
+    data = app.datatypes_registry.change_datatype(data, data_type)
+    data.name = data_name
+    out_data[name] = data
+
+def exec_after_process(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None):
+    """Verifies the data after the run"""
+
+    URL = param_dict.get( 'URL', None )
+    if not URL:
+        raise Exception('Datasource has not sent back a URL parameter')
+    URL = URL + '&_export=1&GALAXY_URL=0'
+
+    CHUNK_SIZE = 2**20 # 1Mb 
+    MAX_SIZE   = CHUNK_SIZE * 100
+    
+    try:
+        page = urllib.urlopen(URL)
+    except Exception, exc:
+        raise Exception('Problems connecting to %s (%s)' % (URL, exc) )
+
+    name, data = out_data.items()[0]
+    
+    fp = open(data.file_name, 'wb')
+    size = 0
+    while 1:
+        chunk = page.read(CHUNK_SIZE)
+        if not chunk:
+            break
+        if size > MAX_SIZE:
+            raise Exception('----- maximum datasize exceeded ---')
+        size += len(chunk)
+        fp.write(chunk)
+
+    fp.close()
+    #Set meta data, format file to be valid interval type
+    if isinstance(data.datatype, datatypes.interval.Interval):
+        data.set_meta(first_line_is_header=True)
+        #check for missing meta data, if all there, comment first line and process file
+        if not data.missing_meta():
+            line_ctr = -1
+            temp = tempfile.NamedTemporaryFile('w')
+            temp_filename = temp.name
+            temp.close()
+            temp = open(temp_filename,'w')
+            chromCol = int(data.metadata.chromCol) - 1
+            startCol = int(data.metadata.startCol) - 1
+            strandCol = int(data.metadata.strandCol) - 1
+            
+            
+            for line in open(data.file_name, 'r'):
+                line_ctr += 1
+                
+                fields = line.strip().split('\t')
+                
+                temp.write("%s\n" % '\t'.join(fields))
+            
+            temp.close()
+            shutil.move(temp_filename,data.file_name)
+            
+        else:
+            data = app.datatypes_registry.change_datatype(data, 'tabular')
+    data.set_size()
+    data.set_peek()
+    app.model.context.add( data )
+    app.model.context.flush()
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/import.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/import.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+"""
+Script that imports locally stored data as a new dataset for the user
+Usage: import id outputfile
+"""
+import sys, os
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+BUFFER = 1048576
+
+dataid   = sys.argv[1]
+out_name = sys.argv[2]
+
+
+id2name = {
+    'eryth'         : 'ErythPreCRMmm3_cusTrk.txt',
+    'cishg16'       : 'ReglRegHBBhg16CusTrk.txt',
+    'cishg17'       : 'ReglRegHBBhg17CusTrk.txt',
+    'exons'         : 'ExonsKnownGenes_mm3.txt',
+    'krhg16'        : 'known_regulatory_hg16.bed',
+    'krhg17'        : 'known_regulatory_hg17.bed',
+    'tARhg16mmc'    : 'hg16.mouse.t_AR.cold.bed',
+    'tARhg16mmm'    : 'hg16.mouse.t_AR.medium.bed',
+    'tARhg16mmh'    : 'hg16.mouse.t_AR.hot.bed',
+    'tARhg16rnc'    : 'hg16.rat.t_AR.cold.bed',
+    'tARhg16rnm'    : 'hg16.rat.t_AR.medium.bed',
+    'tARhg16rnh'    : 'hg16.rat.t_AR.hot.bed',
+    'phastConsHg16' : 'phastConsMost_hg16.bed',
+    'omimhg16'      : 'omimDisorders_hg16.tab',
+    'omimhg17'      : 'omimDisorders_hg17.tab',
+
+}
+
+fname = id2name.get(dataid, '')
+if not fname:
+    print 'Importing invalid data %s' % dataid
+    sys.exit()
+else:
+    print 'Imported %s' % fname
+
+# this path is hardcoded
+inp_name = os.path.join('database', 'import', fname)
+
+try:
+    inp = open(inp_name, 'rt')
+except:
+    print 'Could not find file %s' % inp_name
+    sys.exit()
+
+out = open(out_name, 'wt')
+
+while 1:
+    data = inp.read(BUFFER)
+    if not data:
+        break
+    out.write(data)
+
+inp.close()
+out.close()
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/import.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/import.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,27 @@
+<tool id="Featured datasets4" name="Featured datasets">
+  <description>(PSU prepared queries)</description>
+  <command interpreter="python">import.py $data $output</command>
+  <inputs>
+ <display>$data</display>
+ <param name="data" type="select" display="radio">
+      <option value="eryth">Erythroid predicted cis-regulatory modules</option>
+      <option value="exons">Exons of protein-coding genes in the mouse genome, assembly mm3</option>
+      <option value="cishg16 ">Known cis-regulatory modules in the human HBB gene complex (hg16)</option>
+      <option value="cishg17">Known cis-regulatory modules in the human HBB gene complex (hg17)</option>
+      <option value="krhg16">Known regulatory regions (hg16)</option>
+      <option value="krhg17">Known regulatory regions (hg17)</option>
+      <option value="tARhg16mmc">Human (hg16) evolutionary cold region (vs mouse)</option>
+      <option value="tARhg16mmm">Human (hg16) evolutionary medium region (vs mouse)</option>
+      <option value="tARhg16mmh">Human (hg16) evolutionary hot region (vs mouse)</option>
+      <option value="tARhg16rnc">Human (hg16) evolutionary cold region (vs rat)</option>
+      <option value="tARhg16rnm">Human (hg16) evolutionary medium region (vs rat)</option>
+      <option value="tARhg16rnh">Human (hg16) evolutionary hot region (vs rat)</option>
+      <option value="phastConsHg16">phastCons hg16 (stringent, top ~5%) from UCSC</option>
+      <option value="omimhg16">OMIM disorders (hg16)</option>
+      <option value="omimhg17">OMIM disorders (hg17)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="bed" name="output" />
+  </outputs>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/metabolicmine.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/metabolicmine.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<tool name="metabolicMine" id="metabolicmine" tool_type="data_source">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://www.metabolicmine.org/beta/begin.do" check_values="false" method="get"> 
+        <display>go to metabolicMine server $GALAXY_URL</display>
+    </inputs>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/microbial_import.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/microbial_import.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+
+"""
+Script that imports locally stored data as a new dataset for the user
+Usage: import id outputfile
+"""
+import sys, os
+from shutil import copyfile
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+BUFFER = 1048576
+
+uids = sys.argv[1].split(",")
+out_file1 = sys.argv[2]
+
+#remove NONE from uids
+have_none = True
+while have_none:
+    try:
+        uids.remove('None')
+    except:
+        have_none = False
+
+
+#create dictionary keyed by uid of tuples of (displayName,filePath,build) for all files
+available_files = {}
+try:
+    filename = sys.argv[-1]
+    for i, line in enumerate( file( filename ) ):
+        if not line or line[0:1] == "#" : continue
+        fields = line.split('\t')
+        try:
+            info_type = fields.pop(0)
+            
+            if info_type.upper()=="DATA":
+                uid = fields.pop(0)
+                org_num = fields.pop(0)
+                chr_acc = fields.pop(0)
+                feature = fields.pop(0)
+                filetype = fields.pop(0)
+                path = fields.pop(0).replace("\r","").replace("\n","")
+                
+                file_type = filetype
+                build = org_num
+                description = uid
+            else:
+                continue
+        except:
+            continue
+
+        available_files[uid]=(description,path,build,file_type,chr_acc)
+except:
+    print >>sys.stderr, "It appears that the configuration file for this tool is missing."
+
+#create list of tuples of (displayName,FileName,build) for desired files
+desired_files = []
+for uid in uids:
+    try:
+        desired_files.append(available_files[uid])
+    except:
+        continue
+
+#copy first file to contents of given output file
+file1_copied = False
+while not file1_copied:
+    try:
+        first_file = desired_files.pop(0)
+    except:
+        print >>sys.stderr, "There were no valid files requested."
+        sys.exit()
+    file1_desc, file1_path, file1_build, file1_type,file1_chr_acc = first_file
+    try:
+        copyfile(file1_path,out_file1)
+        print "#File1\t"+file1_desc+"\t"+file1_chr_acc+"\t"+file1_build+"\t"+file1_type
+        file1_copied = True
+    except:
+        print >>sys.stderr, "The file specified is missing."
+        continue
+        #print >>sys.stderr, "The file specified is missing."
+    
+
+#Tell post-process filter where remaining files reside
+for extra_output in desired_files:
+    file_desc, file_path, file_build, file_type,file_chr_acc = extra_output
+    print "#NewFile\t"+file_desc+"\t"+file_chr_acc+"\t"+file_build+"\t"+file_path+"\t"+file_type
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/microbial_import.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/microbial_import.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,115 @@
+<tool id="microbial_import1" name="Get Microbial Data">
+  <command interpreter="python">microbial_import.py $CDS,$tRNA,$rRNA,$sequence,$GeneMark,$GeneMarkHMM,$Glimmer3 $output ${GALAXY_DATA_INDEX_DIR}/microbial_data.loc</command>
+  <inputs>
+      <param name="kingdom" type="select" label="Select the Desired Kingdom">
+        <options from_file="microbial_data.loc" startswith="ORG">
+          <column name="name" index="3"/>
+          <column name="value" index="3"/>
+          <filter type="unique_value" name="unique" column="3"/>
+        </options>
+      </param>
+      <param name="org" type="select" label="Select the Desired Organism">
+        <options from_file="microbial_data.loc" startswith="ORG">
+          <column name="name" index="2"/>
+          <column name="value" index="1"/>
+          <filter type="param_value" ref="kingdom" name="kingdom" column="3"/>
+          <filter type="sort_by" column="2"/>
+        </options>
+      </param>
+      <param name="CDS" type="select" label="Select Desired Coding Sequences" display="checkboxes" multiple="True">
+        <options from_file="microbial_data.loc" startswith="DATA">
+          <column name="name" index="3"/>
+          <column name="value" index="1"/>
+          <column name="feature" index="4"/>
+          <filter type="param_value" ref="org" name="kingdom" column="2"/>
+          <filter type="static_value" name="feature" value="CDS" column="4"/>
+        </options>
+      </param>
+      <param name="tRNA" type="select" label="Select Desired tRNA" display="checkboxes" multiple="True">
+        <options from_file="microbial_data.loc" startswith="DATA">
+          <column name="name" index="3"/>
+          <column name="value" index="1"/>
+          <column name="feature" index="4"/>
+          <filter type="param_value" ref="org" name="kingdom" column="2"/>
+          <filter type="static_value" name="feature" value="tRNA" column="4"/>
+        </options>
+      </param>
+      <param name="rRNA" type="select" label="Select Desired rRNA" display="checkboxes" multiple="True">
+        <options from_file="microbial_data.loc" startswith="DATA">
+          <column name="name" index="3"/>
+          <column name="value" index="1"/>
+          <column name="feature" index="4"/>
+          <filter type="param_value" ref="org" name="kingdom" column="2"/>
+          <filter type="static_value" name="feature" value="rRNA" column="4"/>
+        </options>
+      </param>
+      <param name="sequence" type="select" label="Select Desired DNA Sequences" display="checkboxes" multiple="True">
+        <options from_file="microbial_data.loc" startswith="DATA">
+          <column name="name" index="3"/>
+          <column name="value" index="1"/>
+          <column name="feature" index="4"/>
+          <filter type="param_value" ref="org" name="kingdom" column="2"/>
+          <filter type="static_value" name="feature" value="sequence" column="4"/>
+        </options>
+      </param>
+      <param name="GeneMark" type="select" label="Select Desired GeneMark Annotations" display="checkboxes" multiple="True">
+        <options from_file="microbial_data.loc" startswith="DATA">
+          <column name="name" index="3"/>
+          <column name="value" index="1"/>
+          <column name="feature" index="4"/>
+          <filter type="param_value" ref="org" name="kingdom" column="2"/>
+          <filter type="static_value" name="feature" value="GeneMark" column="4"/>
+        </options>
+      </param>
+      <param name="GeneMarkHMM" type="select" label="Select Desired GeneMarkHMM Annotations" display="checkboxes" multiple="True">
+        <options from_file="microbial_data.loc" startswith="DATA">
+          <column name="name" index="3"/>
+          <column name="value" index="1"/>
+          <column name="feature" index="4"/>
+          <filter type="param_value" ref="org" name="kingdom" column="2"/>
+          <filter type="static_value" name="feature" value="GeneMarkHMM" column="4"/>
+        </options>
+      </param>
+      <param name="Glimmer3" type="select" label="Select Desired Glimmer3 Annotations" display="checkboxes" multiple="True">
+        <options from_file="microbial_data.loc" startswith="DATA">
+          <column name="name" index="3"/>
+          <column name="value" index="1"/>
+          <column name="feature" index="4"/>
+          <filter type="param_value" ref="org" name="kingdom" column="2"/>
+          <filter type="static_value" name="feature" value="Glimmer3" column="4"/>
+        </options>
+      </param>
+  </inputs>
+  <outputs>
+    <data format="bed" name="output"/>
+  </outputs>
+  <code file="microbial_import_code.py"/>
+  <help>
+
+This tool will allow you to obtain various genomic datasets for any completed Microbial Genome Project as listed at NCBI_.
+
+.. _NCBI: http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi?view=1
+
+Currently available datasets include:
+  1. CDS
+  2. tRNA
+  3. rRNA
+  4. FASTA Sequences
+  5. GeneMark Annotations
+  6. GeneMarkHMM Annotations
+  7. Glimmer3 Annotations
+
+-----
+
+Organisms in **bold** are available at the UCSC Browser.
+
+-----
+
+.. class:: infomark
+
+**Note:** Having trouble locating your organism?  Click here_ for a list of available species and their location.
+
+.. _here: http://wiki.g2.bx.psu.edu/Main/Data%20Libraries/Microbes
+
+  </help>
+</tool>
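
The three-level cascade in this tool (kingdom, then organism, then per-feature datasets) is driven entirely by column filters over microbial_data.loc. A rough Python 3 emulation of those filters, assuming the ORG and DATA rows have already been split into tuples of the named columns::

    # Hedged emulation of the <options>/<filter> cascade; row shapes are assumptions.
    def orgs_for_kingdom(org_rows, kingdom):
        # param_value filter on column 3 (kingdom), then sort_by column 2 (name)
        return sorted((name, num) for num, name, king in org_rows if king == kingdom)

    def datasets_for_org(data_rows, org_num, feature):
        # param_value filter on column 2 (org) plus the static feature filter on column 4
        return [(name, uid) for uid, org, name, feat in data_rows
                if org == org_num and feat == feature]
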
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/microbial_import_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/microbial_import_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,154 @@
+
+def load_microbial_data( GALAXY_DATA_INDEX_DIR, sep='\t' ):
+    # FIXME: this function is duplicated in the DynamicOptions class.  It is used here only to
+    # set data.name in exec_after_process(). 
+    microbe_info= {}
+    orgs = {}
+    
+    filename = "%s/microbial_data.loc" % GALAXY_DATA_INDEX_DIR
+    for i, line in enumerate( open( filename ) ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ):
+            fields = line.split( sep )
+            #read each line, if not enough fields, go to next line
+            try:
+                info_type = fields.pop(0)
+                if info_type.upper() == "ORG":
+                    #ORG     12521   Clostridium perfringens SM101   bacteria        Firmicutes      CP000312,CP000313,CP000314,CP000315     http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=12521
+                    org_num = fields.pop(0)
+                    name = fields.pop(0)
+                    kingdom = fields.pop(0)
+                    group = fields.pop(0)
+                    chromosomes = fields.pop(0)
+                    info_url = fields.pop(0)
+                    link_site = fields.pop(0)
+                    if org_num not in orgs:
+                        orgs[ org_num ] = {}
+                        orgs[ org_num ][ 'chrs' ] = {}
+                    orgs[ org_num ][ 'name' ] = name
+                    orgs[ org_num ][ 'kingdom' ] = kingdom
+                    orgs[ org_num ][ 'group' ] = group
+                    orgs[ org_num ][ 'chromosomes' ] = chromosomes
+                    orgs[ org_num ][ 'info_url' ] = info_url
+                    orgs[ org_num ][ 'link_site' ] = link_site
+                elif info_type.upper() == "CHR":
+                    #CHR     12521   CP000315        Clostridium perfringens phage phiSM101, complete genome 38092   110684521       CP000315.1
+                    org_num = fields.pop(0)
+                    chr_acc = fields.pop(0)
+                    name = fields.pop(0)
+                    length = fields.pop(0)
+                    gi = fields.pop(0)
+                    gb = fields.pop(0)
+                    info_url = fields.pop(0)
+                    chr = {}
+                    chr[ 'name' ] = name
+                    chr[ 'length' ] = length
+                    chr[ 'gi' ] = gi
+                    chr[ 'gb' ] = gb
+                    chr[ 'info_url' ] = info_url
+                    if org_num not in orgs:
+                        orgs[ org_num ] = {}
+                        orgs[ org_num ][ 'chrs' ] = {}
+                    orgs[ org_num ][ 'chrs' ][ chr_acc ] = chr
+                elif info_type.upper() == "DATA":
+                    #DATA    12521_12521_CDS 12521   CP000315        CDS     bed     /home/djb396/alignments/playground/bacteria/12521/CP000315.CDS.bed
+                    uid = fields.pop(0)
+                    org_num = fields.pop(0)
+                    chr_acc = fields.pop(0)
+                    feature = fields.pop(0)
+                    filetype = fields.pop(0)
+                    path = fields.pop(0)
+                    data = {}
+                    data[ 'filetype' ] = filetype
+                    data[ 'path' ] = path
+                    data[ 'feature' ] = feature
+
+                    if org_num not in orgs:
+                        orgs[ org_num ] = {}
+                        orgs[ org_num ][ 'chrs' ] = {}
+                    if 'data' not in orgs[ org_num ][ 'chrs' ][ chr_acc ]:
+                        orgs[ org_num ][ 'chrs' ][ chr_acc ][ 'data' ] = {}
+                    orgs[ org_num ][ 'chrs' ][ chr_acc ][ 'data' ][ uid ] = data
+                else: continue
+            except: continue
+    for org_num in orgs:
+        org = orgs[ org_num ]
+        if org[ 'kingdom' ] not in microbe_info:
+            microbe_info[ org[ 'kingdom' ] ] = {}
+        if org_num not in microbe_info[ org[ 'kingdom' ] ]:
+            microbe_info[ org[ 'kingdom' ] ][org_num] = org
+    return microbe_info
+
+#post processing, set build for data and add additional data to history
+from galaxy import datatypes, config, jobs, tools
+from shutil import copyfile
+
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+    base_dataset = out_data.items()[0][1]
+    history = base_dataset.history
+    if history == None:
+        print "unknown history!"
+        return
+    kingdom = param_dict.get( 'kingdom', None )
+    #group = param_dict.get( 'group', None )
+    org = param_dict.get( 'org', None )
+    
+    #if not (kingdom or group or org):
+    if not (kingdom or org):
+        print "Parameters are not available."
+    #workflow passes galaxy.tools.parameters.basic.UnvalidatedValue instead of values
+    if isinstance( kingdom, tools.parameters.basic.UnvalidatedValue ):
+        kingdom = kingdom.value
+    if isinstance( org, tools.parameters.basic.UnvalidatedValue ):
+        org = org.value
+    
+    GALAXY_DATA_INDEX_DIR = app.config.tool_data_path
+    microbe_info = load_microbial_data( GALAXY_DATA_INDEX_DIR, sep='\t' )
+    new_stdout = ""
+    split_stdout = stdout.split("\n")
+    basic_name = ""
+    for line in split_stdout:
+        fields = line.split("\t")
+        if fields[0] == "#File1":
+            description = fields[1]
+            chr = fields[2]
+            dbkey = fields[3]
+            file_type = fields[4]
+            name, data = out_data.items()[0]
+            data.set_size()
+            basic_name = data.name
+            data.name = data.name + " (" + microbe_info[kingdom][org]['chrs'][chr]['data'][description]['feature'] +" for " + microbe_info[kingdom][org]['name'] + ":" + chr + ")"
+            data.dbkey = dbkey
+            data.info = data.name
+            data = app.datatypes_registry.change_datatype( data, file_type )
+            data.init_meta()
+            data.set_peek()
+            app.model.context.add( data )
+            app.model.context.flush()
+        elif fields[0] == "#NewFile":
+            description = fields[1]
+            chr = fields[2]
+            dbkey = fields[3]
+            filepath = fields[4]
+            file_type = fields[5]
+            newdata = app.model.HistoryDatasetAssociation( create_dataset = True, sa_session = app.model.context ) #This import should become a library
+            newdata.set_size()
+            newdata.extension = file_type
+            newdata.name = basic_name + " (" + microbe_info[kingdom][org]['chrs'][chr]['data'][description]['feature'] +" for "+microbe_info[kingdom][org]['name']+":"+chr + ")"
+            app.model.context.add( newdata )
+            app.model.context.flush()
+            app.security_agent.copy_dataset_permissions( base_dataset.dataset, newdata.dataset )
+            history.add_dataset( newdata )
+            app.model.context.add( history )
+            app.model.context.flush()
+            try:
+                copyfile(filepath,newdata.file_name)
+                newdata.info = newdata.name
+                newdata.state = jobs.JOB_OK
+            except:
+                newdata.info = "The requested file is missing from the system."
+                newdata.state = jobs.JOB_ERROR
+            newdata.dbkey = dbkey
+            newdata.init_meta()
+            newdata.set_peek()
+            app.model.context.flush()
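
exec_after_process communicates with microbial_import.py purely through stdout: the script prints one #File1 line for the primary output and one #NewFile line per extra dataset, and the hook above re-parses those lines. A minimal Python 3 illustration of that handshake, with field values taken from the sample rows in the comments above::

    # Parse one "#NewFile" handshake line the way exec_after_process does.
    line = "#NewFile\t12521_12521_CDS\tCP000315\t12521\t/home/djb396/alignments/playground/bacteria/12521/CP000315.CDS.bed\tbed"
    fields = line.split("\t")
    if fields[0] == "#NewFile":
        description, chr_acc, dbkey, filepath, file_type = fields[1:6]
        # a new history dataset would be created here and filepath copied into it
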
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/modmine.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/modmine.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If the value of 'URL_method' is 'post', any additional params coming back in the
+    initial response (in addition to 'URL') will be encoded and appended to the URL, and a POST will be performed.
+-->
+<tool name="modENCODE modMine" id="modmine" tool_type="data_source">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://intermine.modencode.org/" check_values="false" method="get"> 
+        <display>go to modENCODE modMine server $GALAXY_URL</display>
+    </inputs>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/ratmine.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/ratmine.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,34 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If the value of 'URL_method' is 'post', any additional params coming back in the
+    initial response (in addition to 'URL') will be encoded and appended to the URL, and a POST will be performed.
+-->
+<tool name="Ratmine" id="ratmine" tool_type="data_source">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://ratmine.mcw.edu/ratmine/begin.do" check_values="false" method="get"> 
+        <display>go to Ratmine server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=ratmine" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="organism" missing="" />
+        <request_param galaxy_name="table" remote_name="table" missing="" />
+        <request_param galaxy_name="description" remote_name="description" missing="" />
+        <request_param galaxy_name="name" remote_name="name" missing="Ratmine query" />
+        <request_param galaxy_name="info" remote_name="info" missing="" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="txt" /> <!-- intermine currently always provides 'txt', make this auto detect -->
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
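
The <request_param_translation> block above is a declarative mapping: each remote parameter is renamed to a Galaxy name, a missing value falls back to the declared default, and a <value_translation> table rewrites known values (here InterMine's fixed 'txt' becomes 'auto'). A small Python 3 sketch of those semantics for a few of the params above (the 'rn4' input value is hypothetical)::

    # Hedged sketch of request_param translation, not Galaxy's implementation.
    SPEC = [  # (galaxy_name, remote_name, missing_default, value_translation)
        ("URL_method", "URL_method", "post", {}),
        ("dbkey", "db", "?", {}),
        ("name", "name", "Ratmine query", {}),
        ("data_type", "data_type", "auto", {"txt": "auto"}),
    ]

    def translate(remote_params):
        out = {}
        for galaxy_name, remote_name, missing, table in SPEC:
            value = remote_params.get(remote_name, missing)
            out[galaxy_name] = table.get(value, value)
        return out

    # translate({"db": "rn4", "data_type": "txt"})
    # -> {'URL_method': 'post', 'dbkey': 'rn4', 'name': 'Ratmine query', 'data_type': 'auto'}
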
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/ucsc_archaea.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/ucsc_archaea.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<tool name="UCSC Archaea" id="ucsc_proxy">
+
+ <description>table browser</description>
+
+ <command interpreter="python">
+ ucsc_proxy.py $param_file $output
+ </command>
+
+ <inputs action="/ucsc_proxy/index" check_values="false">
+ <display>go to UCSC $init $hgta_outputType</display>
+ <param type="hidden" name="init" value="3"/>
+ <param type="hidden" name="hgta_outputType" value="bed"/>
+ </inputs>
+
+ <code file="ucsc_filter.py"/>
+
+ <outputs>
+ <data name="output" format="bed" />
+ </outputs>
+
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/ucsc_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/ucsc_filter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,68 @@
+# runs after the job (and after the default post-filter)
+from galaxy import datatypes, jobs
+
+def validate(incoming):
+    """Validator"""
+    #raise Exception, 'not quite right'
+    pass
+
+def exec_before_job( app, inp_data, out_data, param_dict, tool=None):
+    """Sets the name of the data"""
+    outputType = param_dict.get( 'hgta_outputType', None )
+    if isinstance(outputType, list) and len(outputType)>0: outputType = outputType[-1]
+    items = out_data.items()
+    
+    for name, data in items:
+        data.name  = param_dict.get('display', data.name)
+        data.dbkey = param_dict.get('dbkey', '???')
+
+        if outputType == 'wigData':
+            ext = "wig"
+        elif outputType == 'maf':
+            ext = "maf"
+        elif outputType == 'gff':
+            ext = "gff"
+        elif outputType == 'gff3':
+            ext = "gff3"
+        else:
+            if 'hgta_doPrintSelectedFields' in param_dict:
+                ext = "interval"
+            elif 'hgta_doGetBed' in param_dict:
+                ext = "bed"
+            elif 'hgta_doGenomicDna' in param_dict:
+                ext = "fasta"
+            elif 'hgta_doGenePredSequence' in param_dict:
+                ext = "fasta"
+            else:
+                ext = "interval"
+        
+        data = app.datatypes_registry.change_datatype(data, ext)
+        out_data[name] = data
+        
+def exec_after_process( app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None):
+    """Verifies the data after the run"""
+    items = out_data.items()
+    for name, data in items:
+        data.set_size()
+        try:            
+            err_msg, err_flag = 'Errors:', False
+            line_count = 0
+            num_lines = len(file(data.file_name).readlines())
+            for line in file(data.file_name):
+                line_count += 1
+                if line and line[0] == '-':
+                    if line_count + 3 == num_lines and not err_flag:
+                        err_flag = True
+                        err_msg = "Warning: It appears that your results have been truncated by UCSC. View the bottom of your result file for details."
+                        break
+                    err_flag = True
+                    err_msg = err_msg +" (line "+str(line_count)+")"+line
+            data.set_peek()
+            if isinstance(data.datatype, datatypes.interval.Interval) and data.missing_meta():
+                data = app.datatypes_registry.change_datatype(data, 'tabular')
+                out_data[name] = data
+            if err_flag:
+                raise Exception(err_msg)
+        except Exception, exc:
+            data.info  = data.info + "\n" + str(exc)
+            data.blurb = "error"
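
The if/elif ladder in exec_before_job above is effectively a lookup table. The same mapping, written table-driven as a sketch (not a replacement committed here)::

    # Equivalent table-driven form of the extension selection above.
    TYPE_TO_EXT = {"wigData": "wig", "maf": "maf", "gff": "gff", "gff3": "gff3"}
    FLAG_TO_EXT = [("hgta_doPrintSelectedFields", "interval"),
                   ("hgta_doGetBed", "bed"),
                   ("hgta_doGenomicDna", "fasta"),
                   ("hgta_doGenePredSequence", "fasta")]

    def pick_extension(output_type, param_dict):
        if output_type in TYPE_TO_EXT:
            return TYPE_TO_EXT[output_type]
        for flag, ext in FLAG_TO_EXT:
            if flag in param_dict:
                return ext
        return "interval"
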
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/ucsc_proxy.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/ucsc_proxy.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+import urllib
+import sys, os
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+CHUNK   = 2**20 # 1Mb 
+MAXSIZE = CHUNK * 100
+if __name__ == '__main__':
+
+    if len(sys.argv) != 3:
+        print 'Usage: ucsc_proxy.py input_params output_file'
+        sys.exit()
+
+    inp_file = sys.argv[1]
+    out_file = sys.argv[2]
+
+    DEFAULT_URL = "http://genome.ucsc.edu/hgTables?"
+    
+    # this must stay a list to allow multiple selections for the same widget name (checkboxes)
+    params  = []
+    for line in file(inp_file):
+        line = line.strip()
+        if line:
+            parts = line.split('=')
+            if len(parts) == 0:
+                key = ""
+                value = ""
+            elif len(parts) == 1:
+                key = parts[0]
+                value = ""
+            else:
+                key = parts[0]
+                value = parts[1]
+            if key == 'display':
+                print value
+            # get the base url from params; referred from proxy.py, initialized by the tool xml
+            elif key == 'proxy_url':
+                DEFAULT_URL = value
+            else:
+                params.append( (key, value) )
+    
+    #print params
+    
+    encoded_params = urllib.urlencode(params)
+    url = DEFAULT_URL + encoded_params
+
+    #print url
+
+    page = urllib.urlopen(url)
+
+    fp = open(out_file, 'wt')
+    size = 0
+    while 1:
+        data = page.read(CHUNK)
+        if not data:
+            break
+        if size > MAXSIZE:
+            fp.write('----- maximum datasize exceeded ---\n')
+            break
+        size += len(data)
+        fp.write(data)
+
+    fp.close()
+
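
The proxy reads a key=value parameter file written by Galaxy, prints the 'display' value, lets 'proxy_url' override the hgTables base URL, and URL-encodes everything else. The same flow in modern Python 3 terms (the parameter file contents here are hypothetical)::

    # Sketch of the param-file handling above, using urllib.parse.
    from urllib.parse import urlencode

    base = "http://genome.ucsc.edu/hgTables?"
    params = []
    for line in ["init=1", "hgta_outputType=bed"]:   # stand-in for the param file
        key, _, value = line.partition("=")
        if key == "proxy_url":
            base = value
        elif key != "display":
            params.append((key, value))
    url = base + urlencode(params)
    # -> http://genome.ucsc.edu/hgTables?init=1&hgta_outputType=bed
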
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/ucsc_proxy.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/ucsc_proxy.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<tool name="UCSC Main" id="ucsc_proxy">
+
+ <description>table browser proxy</description>
+
+ <command interpreter="python">
+ ucsc_proxy.py $param_file $output
+ </command>
+
+ <inputs action="/ucsc_proxy/index" check_values="false">
+ <display>go to UCSC $init $hgta_outputType</display>
+ <param type="hidden" name="init" value="1"/>
+ <param type="hidden" name="hgta_outputType" value="bed"/>
+ </inputs>
+
+ <code file="ucsc_filter.py"/>
+
+ <outputs>
+ <data name="output" format="bed" />
+ </outputs>
+
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/ucsc_tablebrowser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/ucsc_tablebrowser.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If the value of 'URL_method' is 'post', any additional params coming back in the
+    initial response (in addition to 'URL') will be encoded and appended to the URL, and a POST will be performed.
+-->
+<tool name="UCSC Main" id="ucsc_table_direct1" tool_type="data_source">
+    <description>table browser</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://genome.ucsc.edu/cgi-bin/hgTables" check_values="false" method="get">
+        <display>go to UCSC Table Browser $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner" />
+        <param name="tool_id" type="hidden" value="ucsc_table_direct1" />
+        <param name="sendToGalaxy" type="hidden" value="1" />
+        <param name="hgta_compressType" type="hidden" value="none" />
+        <param name="hgta_outputType" type="hidden" value="bed" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="org" missing="unknown species" />
+        <request_param galaxy_name="table" remote_name="hgta_table" missing="unknown table" />
+        <request_param galaxy_name="description" remote_name="hgta_regionType" missing="no description" />
+        <request_param galaxy_name="data_type" remote_name="hgta_outputType" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="primaryTable" />
+                <value galaxy_value="auto" remote_value="selectedFields" />
+                <value galaxy_value="wig" remote_value="wigData" />
+                <value galaxy_value="interval" remote_value="tab" />
+                <value galaxy_value="html" remote_value="hyperlinks" />
+                <value galaxy_value="fasta" remote_value="sequence" />
+                <value galaxy_value="gtf" remote_value="gff" />
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" label="${tool.name} on ${organism}: ${table} (#if $description == 'range' then $getVar( 'position', 'unknown position' ) else $description#)"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/ucsc_tablebrowser_archaea.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/ucsc_tablebrowser_archaea.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If the value of 'URL_method' is 'post', any additional params coming back in the
+    initial response (in addition to 'URL') will be encoded and appended to the URL, and a POST will be performed.
+-->
+<tool name="UCSC Archaea" id="ucsc_table_direct_archaea1" tool_type="data_source">
+    <description>table browser</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://archaea.ucsc.edu/cgi-bin/hgTables" check_values="false" method="get">
+        <display>go to UCSC Table Browser $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner" />
+        <param name="tool_id" type="hidden" value="ucsc_table_direct_archaea1" />
+        <param name="sendToGalaxy" type="hidden" value="1" />
+        <param name="hgta_compressType" type="hidden" value="none" />
+        <param name="hgta_outputType" type="hidden" value="bed" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="org" missing="unknown species" />
+        <request_param galaxy_name="table" remote_name="hgta_table" missing="unknown table" />
+        <request_param galaxy_name="description" remote_name="hgta_regionType" missing="no description" />
+        <request_param galaxy_name="data_type" remote_name="hgta_outputType" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="primaryTable" />
+                <value galaxy_value="auto" remote_value="selectedFields" />
+                <value galaxy_value="wig" remote_value="wigData" />
+                <value galaxy_value="interval" remote_value="tab" />
+                <value galaxy_value="html" remote_value="hyperlinks" />
+                <value galaxy_value="fasta" remote_value="sequence" />
+                <value galaxy_value="gtf" remote_value="gff" />
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" label="${tool.name} on ${organism}: ${table} (#if $description == 'range' then $getVar( 'position', 'unknown position' ) else $description#)"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/ucsc_tablebrowser_test.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/ucsc_tablebrowser_test.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<?xml version="1.0"?>
+<!--
+    If the value of 'URL_method' is 'get', the request will consist of the value of 'URL' coming back in
+    the initial response.  If the value of 'URL_method' is 'post', any additional params coming back in the
+    initial response (in addition to 'URL') will be encoded and appended to the URL, and a POST will be performed.
+-->
+<tool name="UCSC Test" id="ucsc_table_direct_test1" tool_type="data_source">
+    <description>table browser</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://genome-test.cse.ucsc.edu/cgi-bin/hgTables" check_values="false" method="get">
+        <display>go to UCSC Table Browser $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner" />
+        <param name="tool_id" type="hidden" value="ucsc_table_direct_test1" />
+        <param name="sendToGalaxy" type="hidden" value="1" />
+        <param name="hgta_compressType" type="hidden" value="none" />
+        <param name="hgta_outputType" type="hidden" value="bed" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="URL" remote_name="URL" missing="" />
+        <request_param galaxy_name="dbkey" remote_name="db" missing="?" />
+        <request_param galaxy_name="organism" remote_name="org" missing="unknown species" />
+        <request_param galaxy_name="table" remote_name="hgta_table" missing="unknown table" />
+        <request_param galaxy_name="description" remote_name="hgta_regionType" missing="no description" />
+        <request_param galaxy_name="data_type" remote_name="hgta_outputType" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="primaryTable" />
+                <value galaxy_value="auto" remote_value="selectedFields" />
+                <value galaxy_value="wig" remote_value="wigData" />
+                <value galaxy_value="interval" remote_value="tab" />
+                <value galaxy_value="html" remote_value="hyperlinks" />
+                <value galaxy_value="fasta" remote_value="sequence" />
+                <value galaxy_value="gtf" remote_value="gff" />
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="tabular" label="${tool.name} on ${organism}: ${table} (#if $description == 'range' then $getVar( 'position', 'unknown position' ) else $description#)"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/ucsc_testproxy.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/ucsc_testproxy.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<tool name="UCSC Test" id="ucsc_testproxy">
+
+ <description>table browser proxy</description>
+
+ <command interpreter="python">
+ ucsc_proxy.py $param_file $output
+ </command>
+
+ <inputs action="/ucsc_proxy/index" check_values="false">
+ <display>go to UCSC genome-test $init $hgta_outputType</display>
+ <param type="hidden" name="init" value="2"/>
+ <param type="hidden" name="hgta_outputType" value="bed"/>
+ </inputs>
+
+ <code file="ucsc_filter.py"/>
+
+ <outputs>
+ <data name="output" format="bed" />
+ </outputs>
+
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/upload.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/upload.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,394 @@
+#!/usr/bin/env python
+#Processes uploads from the user.
+
+# WARNING: Changes in this tool (particularly as related to parsing) may need
+# to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools
+
+import urllib, sys, os, gzip, tempfile, shutil, re, gzip, zipfile, codecs, binascii
[remainder of the 394-line upload.py hunk is byte-escaped and elided in the source]
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/upload.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/upload.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,213 @@
+<?xml version="1.0"?>
+
+<tool name="Upload File" id="upload1" version="1.1.3">
+  <description>
+    from your computer
+  </description>
[remainder of the 213-line upload.xml hunk is byte-escaped and elided in the source]
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/worm_modencode.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/worm_modencode.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<tool name="modENCODE worm" id="modENCODEworm" tool_type="data_source">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://modencode.oicr.on.ca/fgb2/gbrowse/worm" check_values="false" target="_top"> 
+        <display>go to modENCODE worm server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=modENCODEworm" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="dbkey" remote_name="dbkey" missing="ce6" >
+            <value_translation>
+                <value galaxy_value="ce6" remote_value="worm" />
+            </value_translation>
+        </request_param>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="d" missing="" />
+                <value name="dbkey" missing="ce6" />
+                <value name="q" missing="" />
+                <value name="s" missing="" />
+                <value name="t" missing="" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" label="${tool.name} on $getVar( 'q', 'unknown position' )"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
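
Unlike the hgTables tools, the modENCODE and WormBase tools rebuild the remote URL themselves: <append_param> joins each listed value with '=', chains the pairs with '&', and attaches the first with '?'. A one-function Python 3 sketch of that composition (the query values are hypothetical)::

    # Hedged sketch of <append_param separator="&" first_separator="?" join="=">.
    def append_params(url, values, first_sep="?", sep="&", join="="):
        parts = [name + join + value for name, value in values]
        return url + first_sep + sep.join(parts) if parts else url

    # append_params("http://modencode.oicr.on.ca/fgb2/gbrowse/worm",
    #               [("dbkey", "ce6"), ("q", "chrI:1000..2000")])
    # -> http://modencode.oicr.on.ca/fgb2/gbrowse/worm?dbkey=ce6&q=chrI:1000..2000
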
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/wormbase.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/wormbase.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<tool name="Wormbase" id="wormbase" tool_type="data_source">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://www.wormbase.org/db/seq/gbgff/c_elegans/" check_values="false" target="_top"> 
+        <display>go to Wormbase server $GALAXY_URL</display>
+        <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=wormbase" />
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="d" missing="" />
+                <value name="dbkey" missing="" />
+                <value name="q" missing="" />
+                <value name="s" missing="" />
+                <value name="t" missing="" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" label="${tool.name} on $getVar( 'q', 'unknown position' )"/>
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/wormbase_test.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/wormbase_test.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<tool name="Wormbase" id="wormbase_test" tool_type="data_source">
+ <description>test server</description>
+ <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+ <inputs action="http://dev.wormbase.org/db/seq/gbrowse/c_elegans/" check_values="false" target="_top"> 
+ <display>go to Wormbase test server $GALAXY_URL</display>
+ <param name="GALAXY_URL" type="baseurl" value="/tool_runner?tool_id=wormbase_test" />
+ </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="URL" remote_name="URL" missing="">
+            <append_param separator="&amp;" first_separator="?" join="=">
+                <value name="d" missing="" />
+                <value name="dbkey" missing="" />
+                <value name="q" missing="" />
+                <value name="s" missing="" />
+                <value name="t" missing="" />
+            </append_param>
+        </request_param>
+        <request_param galaxy_name="URL_method" remote_name="URL_method" missing="post" />
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" />
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" label="${tool.name} on $getVar( 'q', 'unknown position' )"/>
+ </outputs>
+ <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/data_source/yeastmine.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/data_source/yeastmine.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<tool name="YeastMine" id="yeastmine" tool_type="data_source">
+    <description>server</description>
+    <command interpreter="python">data_source.py $output $__app__.config.output_size_limit</command>
+    <inputs action="http://yeastmine.yeastgenome.org/yeastmine/begin.do" check_values="false" method="get"> 
+        <display>go to yeastMine server $GALAXY_URL</display>
+    </inputs>
+    <request_param_translation>
+        <request_param galaxy_name="data_type" remote_name="data_type" missing="auto" >
+            <value_translation>
+                <value galaxy_value="auto" remote_value="txt" /> <!-- intermine currently always provides 'txt', make this auto detect -->
+            </value_translation>
+        </request_param>
+    </request_param_translation>
+    <uihints minwidth="800"/>
+    <outputs>
+        <data name="output" format="txt" />
+    </outputs>
+    <options sanitize="False" refresh="True"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/discreteWavelet/execute_dwt_IvC_all.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/discreteWavelet/execute_dwt_IvC_all.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,210 @@
+#!/usr/bin/perl -w
+use warnings;
+use IO::Handle;
+
+$usage = "execute_dwt_IvC_all.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out]  \n";
+die $usage unless @ARGV == 4;
[remainder of the 210-line execute_dwt_IvC_all.pl hunk is byte-escaped and elided in the source]
     }\n+            }\n+            \n+            colnames(final_pvalue) <- title;\n+            write.table(final_pvalue, file = table, sep = \\"\\\\t\\", quote = FALSE, row.names = FALSE);\n+            dev.off();\n+        }\\n";\n+\n+print Rcmd "\n+        # execute\n+        # read in data \n+        \n+        inputData <- read.delim(\\"$firstInputFile\\");\n+        inputDataNames <- colnames(inputData);\n+        \n+        controlData <- read.delim(\\"$secondInputFile\\");\n+        controlDataNames <- colnames(controlData);\n+        \n+        # call the test function to implement IvC test\n+        dwt_cor(inputData, inputDataNames, controlData, controlDataNames, test = \\"$test\\", pdf = \\"$secondOutputFile\\", table = \\"$firstOutputFile\\");\n+        print (\\"done with the correlation test\\");\n+\\n";\n+\n+print Rcmd "#eof\\n";\n+\n+close Rcmd;\n+\n+system("echo \\"wavelet IvC test started on \\`hostname\\` at \\`date\\`\\"\\n");\n+system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\\n");\n+system("echo \\"wavelet IvC test ended on \\`hostname\\` at \\`date\\`\\"\\n");\n+\n+#close the input and output and error files\n+close(ERROR);\n+close(OUTPUT2);\n+close(OUTPUT1);\n+close(INPUT2);\n+close(INPUT1);\n\\ No newline at end of file\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/discreteWavelet/execute_dwt_IvC_all.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/discreteWavelet/execute_dwt_IvC_all.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,112 @@
+<tool id="compute_p-values_second_moments_feature_occurrences_between_two_datasets_using_discrete_wavelet_transfom" name="Compute P-values and Second Moments for Feature Occurrences" version="1.0.0">
+  <description>between two datasets using Discrete Wavelet Transforms</description>
+  
+  <command interpreter="perl">
+   execute_dwt_IvC_all.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2
+  </command>
+
+  <inputs>
+   <param format="tabular" name="inputFile1" type="data" label="Select the first input file"/>
+   <param format="tabular" name="inputFile2" type="data" label="Select the second input file"/>
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/> 
+    <data format="pdf" name="outputFile2"/>
+  </outputs>
+  
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This program generates plots and computes a table (matrix) of second moments, p-values, and test orientations at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another, using a multiscale wavelet analysis technique. 
+
+The program assumes that the user has two sets of DNA sequences, S1 and S2, each consisting of one or more sequences of equal length. Each sequence in each set is divided into the same number of intervals n, where n = 2^k for a positive integer k (k >= 1). Thus, n can be any value in the set {2, 4, 8, 16, 32, 64, 128, ...}; k is the number of scales.
+
+The program has two input files obtained as follows:
+
+For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S2, and builds two tabular files representing the counts in each interval of S1 and S2. These are the input files of the program. 
+
+The program gives two output files:
+
+- The first output file is a TABULAR format file representing the second moments, p-values, and test orientations for each feature at each scale.
+- The second output file is a PDF file containing one figure per feature; each figure plots the second moment for that feature at every scale.
+
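+For intuition, a minimal R sketch of the per-scale second moment the generated script computes (assuming the waveslim package, which the script loads; ind and ctl are hypothetical count vectors for one motif in the two inputs)::
+
+ library("waveslim")
+ d = ind - ctl                        # the signal is the difference I - C
+ d = (d - mean(d)) / sd(d)            # normalize, as the script does
+ w = dwt(d, wf = "haar", 4, boundary = "reflection")
+ v = wave.variance(w)                 # wavelet variance per scale
+ m2 = v[1:4, 1] + mean(d)^2           # 2nd moment = variance + squared mean
+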
+-----
+
+.. class:: warningmark
+
+**Note**
+
+To obtain empirical p-values, the program implements a random permutation test, so it gives slightly different results each time it is run on the same input file. 
+
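+A minimal sketch of the tail test applied at each scale (hypothetical names: null_col is one column of the sorted 1,000-row permutation matrix the script builds, m2 the observed second moment, med the null median)::
+
+ if (m2 >= med) {
+     # right-tail test ("R"): fraction of null values at or above m2
+     pv = length(which(null_col >= m2)) / length(na.exclude(null_col))
+ } else {
+     # left-tail test ("L"): fraction of null values at or below m2
+     pv = length(which(m2 >= null_col)) / length(na.exclude(null_col))
+ }
+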
+-----
+
+**Example**
+
+Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file::
+
+ deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget
+ 226 403 416 221 1165
+ 236 444 380 241 1223
+ 242 496 391 195 1116
+ 243 429 364 191 1118
+ 244 410 371 236 1063
+ 230 386 370 217 1087
+ 275 404 402 214 1044
+ 265 443 365 231 1086
+ 255 390 354 246 1114
+ 281 384 406 232 1102
+ 263 459 369 251 1135
+ 280 433 400 251 1159
+ 278 385 382 231 1147
+ 248 393 389 211 1162
+ 251 403 385 246 1114
+ 239 383 347 227 1172
+
+And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file:: 
+
+ deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget
+ 235 374 407 257 1159
+ 244 356 353 212 1128
+ 233 343 322 204 1110
+ 222 329 398 253 1054
+ 216 325 328 253 1129
+ 257 368 352 221 1115
+ 238 360 346 224 1102
+ 225 350 377 248 1107
+ 230 330 365 236 1132
+ 241 389 357 220 1120
+ 274 354 392 235 1120
+ 250 379 354 210 1102
+ 254 329 320 251 1080
+ 221 355 406 279 1127
+ 224 330 390 249 1129
+ 246 366 364 218 1176
+
+  
+We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output:
+
+The first output file::
+
+ motif 1_moment2 1_pval 1_test 2_moment2 2_pval 2_test 3_moment2 3_pval 3_test 4_moment2 4_pval 4_test
+
+ deletionHoptspot 0.8751 0.376 L 1.549 0.168 R 0.6152 0.434 L 0.5735 0.488 R
+ insertionHoptspot 0.902 0.396 L 1.172 0.332 R 0.6843 0.456 L 1.728 0.213 R
+ dnaPolPauseFrameshift 1.65 0.013 R 0.267 0.055 L 0.1387 0.124 L 0.4516 0.498 L
+ topoisomeraseCleavageSite 0.7443 0.233 L 1.023 0.432 R 1.933 0.155 R 1.09 0.3 R
+ translinTarget 0.5084 0.057 L 0.8219 0.446 L 3.604 0.019 R 0.4377 0.492 L
+
+The second output file:
+
+.. image:: ./static/operation_icons/dwt_IvC_1.png
+.. image:: ./static/operation_icons/dwt_IvC_2.png
+.. image:: ./static/operation_icons/dwt_IvC_3.png
+.. image:: ./static/operation_icons/dwt_IvC_4.png
+.. image:: ./static/operation_icons/dwt_IvC_5.png
+
+  </help>  
+  
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/discreteWavelet/execute_dwt_cor_aVa_perClass.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/discreteWavelet/execute_dwt_cor_aVa_perClass.pl Fri Mar 09 19:37:19 2012 -0500
[
b'@@ -0,0 +1,221 @@\n+#!/usr/bin/perl -w\n+\n+use warnings;\n+use IO::Handle;\n+\n+$usage = "execute_dwt_cor_aVa_perClass.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out]  \\n";\n+die $usage unless @ARGV == 4;\n+\n+#get the input arguments\n+my $firstInputFile = $ARGV[0];\n+my $secondInputFile = $ARGV[1];\n+my $firstOutputFile = $ARGV[2];\n+my $secondOutputFile = $ARGV[3];\n+\n+open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \\n");\n+open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \\n");\n+open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \\n");\n+open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \\n");\n+open (ERROR,  ">", "error.txt")  or die ("Could not open file error.txt \\n");\n+\n+#save all error messages into the error file $errorFile using the error file handle ERROR\n+STDERR -> fdopen( \\*ERROR,  "w" ) or die ("Could not direct errors to the error file error.txt \\n");\n+\n+print "There are two input data files: \\n";\n+print "The input data file is: $firstInputFile \\n";\n+print "The control data file is: $secondInputFile \\n";\n+\n+# IvC test\n+$test = "cor_aVa";\n+\n+# construct an R script to implement the IvC test\n+print "\\n";\n+\n+$r_script = "get_dwt_cor_aVa_test.r"; \n+print "$r_script \\n";\n+\n+open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \\n\\n";\n+print Rcmd "\n+\t##################################################################################\n+\t# code to do all correlation tests of form: motif(a) vs. motif(a)\n+\t# add code to create null bands by permuting the original data series\n+\t# generate plots and table matrix of correlation coefficients including p-values\n+\t##################################################################################\n+\tlibrary(\\"Rwave\\");\n+\tlibrary(\\"wavethresh\\");\n+\tlibrary(\\"waveslim\\");\n+\t\n+\toptions(echo = FALSE)\n+\t\n+\t# normalize data\n+\tnorm <- function(data){\n+        v <- (data - mean(data))/sd(data);\n+        if(sum(is.na(v)) >= 1){\n+        \tv <- data;\n+        }\n+        return(v);\n+\t}\n+\t\n+\tdwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \\"symmetric\\", method = \\"kendall\\", wf = \\"haar\\", boundary = \\"reflection\\") {\n+\t\tprint(test);\n+\t    print(pdf);\n+\t\tprint(table);\n+\t\t\n+\t    pdf(file = pdf);   \n+\t    final_pvalue = NULL;\n+\t\ttitle = NULL;\n+\t\t\n+\t    short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\\$nlevels;\n+\t\ttitle <- c(\\"motif\\");\n+        for (i in 1:short.levels){\n+\t        title <- c(title, paste(i, \\"cor\\", sep = \\"_\\"), paste(i, \\"pval\\", sep = \\"_\\"));\n+        }\n+        print(title);\n+\t\n+        # normalize the raw data\n+        data.short <- apply(data.short, 2, norm);\n+        data.long <- apply(data.long, 2, norm);\n+        \n+        for(i in 1:length(names.short)){\n+        \t# Kendall Tau\n+            # DWT wavelet correlation function\n+            # include significance to compare\n+            wave1.dwt = wave2.dwt = NULL;\n+            tau.dwt = NULL;\n+            out = NULL;\n+\n+            print(names.short[i]);\n+            print(names.long[i]);\n+            \n+            # need exit if not comparing motif(a) vs motif(a)\n+            if (names.short[i] != names.long[i]){\n+            \tstop(paste(\\"motif\\", names.short[i], \\"is not the same as\\", names.long[i], sep = \\" 
\\"));\n+            }\n+            else {\n+            \twave1.dwt <- dwt(data.short[, i], wf = wf, short.levels, boundary = boundary);\n+                wave2.dwt <- dwt(data.long[, i], wf = wf, short.levels, boundary = boundary);\n+                tau.dwt <- vector(length=short.levels)\n+                       \n+\t\t\t\t#perform cor test on wavelet coefficients per scale \n+\t\t\t\tfor(level in 1:short.levels){\n+                \tw1_level = w2_level = NULL;\n+                    w1_level <- (wave1.dwt[[level]]);\n+                    w2_level <- (wave2.dwt[[level]]);\n+                    tau.dwt[level] <- cor.test(w1_level, w2_lev'..b';\n+                        null_level1 <- (null_wave1[[level]]);\n+                        null_level2 <- (null_wave2[[level]]);\n+                        cor[level] <- cor.test(null_level1, null_level2, method = method)\\$estimate;\n+                    }\n+                    null = rbind(null, cor);\n+                }\n+                \n+                null <- apply(null, 2, sort, na.last = TRUE);\n+                print(paste(\\"NAs\\", length(which(is.na(null))), sep = \\" \\"));\n+                cor_25 <- null[25,];\n+                cor_975 <- null[975,];\n+                med <- (apply(null, 2, median, na.rm = TRUE));\n+\n+\t\t\t\t# plot\n+                results <- cbind(tau.dwt, cor_25, cor_975);\n+                matplot(results, type = \\"b\\", pch = \\"*\\" , lty = 1, col = c(1, 2, 2), ylim = c(-1, 1), xlab = \\"Wavelet Scale\\", ylab = \\"Wavelet Correlation Kendall\'s Tau\\", main = (paste(test, names.short[i], sep = \\" \\")), cex.main = 0.75);\n+                abline(h = 0);\n+\n+                # get pvalues by comparison to null distribution\n+ \t\t\t    ### modify pval calculation for error type II of T test ####\n+                out <- (names.short[i]);\n+                for (m in 1:length(tau.dwt)){\n+                \tprint(paste(\\"scale\\", m, sep = \\" \\"));\n+                    print(paste(\\"tau\\", tau.dwt[m], sep = \\" \\"));\n+                    print(paste(\\"med\\", med[m], sep = \\" \\"));\n+\t\t\t\t\tout <- c(out, format(tau.dwt[m], digits = 3));\t\n+                    pv = NULL;\n+                    if(is.na(tau.dwt[m])){\n+                    \tpv <- \\"NA\\"; \n+                    } \n+                    else {\n+                    \tif (tau.dwt[m] >= med[m]){\n+                        \t# R tail test\n+                            print(paste(\\"R\\"));\n+                            ### per sv ok to use inequality not strict\n+                            pv <- (length(which(null[, m] >= tau.dwt[m])))/(length(na.exclude(null[, m])));\n+                            if (tau.dwt[m] == med[m]){\n+\t\t\t\t\t\t\t\tprint(\\"tau == med\\");\n+                                print(summary(null[, m]));\n+                            }\n+                    \t}\n+                        else if (tau.dwt[m] < med[m]){\n+                        \t# L tail test\n+                            print(paste(\\"L\\"));\n+                            pv <- (length(which(null[, m] <= tau.dwt[m])))/(length(na.exclude(null[, m])));\n+                        }\n+\t\t\t\t\t}\n+\t\t\t\t\tout <- c(out, pv);\n+                    print(paste(\\"pval\\", pv, sep = \\" \\"));\n+                }\n+                final_pvalue <- rbind(final_pvalue, out);\n+\t\t\t\tprint(out);\n+        \t}\n+        }\n+        colnames(final_pvalue) <- title;\n+        write.table(final_pvalue, file = table, sep = \\"\\\\t\\", quote = FALSE, 
row.names = FALSE)\n+        dev.off();\n+\t}\\n";\n+\n+print Rcmd "\n+\t# execute\n+\t# read in data \n+\t\t\n+\tinputData1 = inputData2 = NULL;\n+\tinputData.short1 = inputData.short2 = NULL;\n+\tinputDataNames.short1 = inputDataNames.short2 = NULL;\n+\t\t\n+\tinputData1 <- read.delim(\\"$firstInputFile\\");\n+\tinputData.short1 <- inputData1[, +c(1:ncol(inputData1))];\n+\tinputDataNames.short1 <- colnames(inputData.short1);\n+\t\t\n+\tinputData2 <- read.delim(\\"$secondInputFile\\");\n+\tinputData.short2 <- inputData2[, +c(1:ncol(inputData2))];\n+\tinputDataNames.short2 <- colnames(inputData.short2);\n+\t\n+\t# cor test for motif(a) in inputData1 vs motif(a) in inputData2\n+\tdwt_cor(inputData.short1, inputDataNames.short1, inputData.short2, inputDataNames.short2, test = \\"$test\\", pdf = \\"$secondOutputFile\\", table = \\"$firstOutputFile\\");\n+\tprint (\\"done with the correlation test\\");\n+\t\n+\t#eof\\n";\n+close Rcmd;\n+\n+system("echo \\"wavelet IvC test started on \\`hostname\\` at \\`date\\`\\"\\n");\n+system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\\n");\n+system("echo \\"wavelet IvC test ended on \\`hostname\\` at \\`date\\`\\"\\n");\n+\n+#close the input and output and error files\n+close(ERROR);\n+close(OUTPUT2);\n+close(OUTPUT1);\n+close(INPUT2);\n+close(INPUT1);\n+\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/discreteWavelet/execute_dwt_cor_aVa_perClass.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/discreteWavelet/execute_dwt_cor_aVa_perClass.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,112 @@
+<tool id="compute_p-values_correlation_coefficients_feature_occurrences_between_two_datasets_using_discrete_wavelet_transfom" name="Compute P-values and Correlation Coefficients for Feature Occurrences" version="1.0.0">
+  <description>between two datasets using Discrete Wavelet Transforms</description>
+  
+  <command interpreter="perl">
+   execute_dwt_cor_aVa_perClass.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2
+  </command>
+
+  <inputs>
+   <param format="tabular" name="inputFile1" type="data" label="Select the first input file"/>
+   <param format="tabular" name="inputFile2" type="data" label="Select the second input file"/>
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/> 
+    <data format="pdf" name="outputFile2"/>
+  </outputs>
+  
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This program generates plots and computes a table (matrix) of correlation coefficients and p-values at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another, using a multiscale wavelet analysis technique. 
+
+The program assumes that the user has two sets of DNA sequences, S1 and S2, each consisting of one or more sequences of equal length. Each sequence in each set is divided into the same number of intervals n, where n = 2^k for a positive integer k (k >= 1). Thus, n can be any value in the set {2, 4, 8, 16, 32, 64, 128, ...}; k is the number of scales.
+
+The program has two input files obtained as follows:
+
+For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S2, and builds two tabular files representing the counts in each interval of S1 and S2. These are the input files of the program. 
+
+The program gives two output files:
+
+- The first output file is a TABULAR format file representing the correlation coefficients and p-values for each feature at each scale.
+- The second output file is a PDF file containing one figure per feature; each figure plots the correlation coefficient for that feature at every scale.
+
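+The per-scale statistic is Kendall's tau between the wavelet coefficients of the two datasets at each scale. A minimal R sketch (assuming the waveslim package, which the script loads; x and y are hypothetical count vectors for the same motif in S1 and S2, already normalized as the script does)::
+
+ library("waveslim")
+ wx = dwt(x, wf = "haar", 4, boundary = "reflection")
+ wy = dwt(y, wf = "haar", 4, boundary = "reflection")
+ # Kendall's tau between wavelet coefficients, scale by scale
+ tau = sapply(1:4, function(level)
+     cor.test(wx[[level]], wy[[level]], method = "kendall")$estimate)
+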
+-----
+
+.. class:: warningmark
+
+**Note**
+
+To obtain empirical p-values, the program implements a random permutation test, so it gives slightly different results each time it is run on the same input file. 
+
+-----
+
+**Example**
+
+Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file::
+
+ deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget
+ 269 366 330 238 1129
+ 239 328 327 283 1188
+ 254 351 358 297 1151
+ 262 371 355 256 1107
+ 254 361 352 234 1192
+ 265 354 367 240 1182
+ 255 359 333 235 1217
+ 271 389 387 272 1241
+ 240 305 341 249 1159
+ 272 351 337 257 1169
+ 275 351 337 233 1158
+ 305 331 361 253 1172
+ 277 341 343 253 1113
+ 266 362 355 267 1162
+ 235 326 329 241 1230
+ 254 335 360 251 1172
+
+And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file::
+
+ deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget
+ 104 146 142 113 478
+ 89 146 151 94 495
+ 100 176 151 88 435
+ 96 163 128 114 468
+ 99 138 144 91 513
+ 112 126 162 106 468
+ 86 127 145 83 491
+ 104 145 171 110 496
+ 91 121 147 104 469
+ 103 141 145 98 458
+ 92 134 142 117 468
+ 97 146 145 107 471
+ 115 121 136 109 470
+ 113 135 138 101 491
+ 111 150 138 102 451
+ 94 128 151 138 481
+
+  
+We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output:
+
+The first output file::
+
+ motif 1_cor 1_pval 2_cor 2_pval 3_cor 3_pval 4_cor 4_pval
+
+ deletionHoptspot 0.4 0.072 0.143 0.394 -0.667 0.244 1 0.491
+ insertionHoptspot 0.343 0.082 -0.0714 0.446 -1 0.12 1 0.502
+ dnaPolPauseFrameshift 0.617 0.004 -0.5 0.13 0.667 0.234 1 0.506
+ topoisomeraseCleavageSite -0.183 0.242 -0.286 0.256 0.333 0.353 -1 0.489
+ translinTarget 0.0167 0.503 -0.0714 0.469 1 0.136 1 0.485
+
+The second output file:
+
+.. image:: ./static/operation_icons/dwt_cor_aVa_1.png
+.. image:: ./static/operation_icons/dwt_cor_aVa_2.png
+.. image:: ./static/operation_icons/dwt_cor_aVa_3.png
+.. image:: ./static/operation_icons/dwt_cor_aVa_4.png
+.. image:: ./static/operation_icons/dwt_cor_aVa_5.png
+
+  </help>  
+  
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/discreteWavelet/execute_dwt_cor_aVb_all.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/discreteWavelet/execute_dwt_cor_aVb_all.pl Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,223 @@
+#!/usr/bin/perl -w
+
+use warnings;
+use IO::Handle;
+
+$usage = "execute_dwt_cor_aVb_all.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out]  \n";
+die $usage unless @ARGV == 4;
+
+#get the input arguments
+my $firstInputFile = $ARGV[0];
+my $secondInputFile = $ARGV[1];
+my $firstOutputFile = $ARGV[2];
+my $secondOutputFile = $ARGV[3];
+
+open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \n");
+open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \n");
+open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n");
+open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n");
+open (ERROR,  ">", "error.txt")  or die ("Could not open file error.txt \n");
+
+#save all error messages into the error file $errorFile using the error file handle ERROR
+STDERR -> fdopen( \*ERROR,  "w" ) or die ("Could not direct errors to the error file error.txt \n");
+
+print "There are two input data files: \n";
+print "The input data file is: $firstInputFile \n";
+print "The control data file is: $secondInputFile \n";
+
+# aVb correlation test
+$test = "cor_aVb_all";
+
+# construct an R script to implement the aVb correlation test
+print "\n";
+
+$r_script = "get_dwt_cor_aVa_test.r"; 
+print "$r_script \n";
+
+
+# R script
+open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n";
+print Rcmd "
+ #################################################################################
+ # code to do all correlation tests of form: motif(a) vs. motif(b)
+ # add code to create null bands by permuting the original data series
+ # generate plots and table matrix of correlation coefficients including p-values
+ #################################################################################
+ library(\"Rwave\");
+ library(\"wavethresh\");
+ library(\"waveslim\");
+
+ options(echo = FALSE)
+
+ # normalize data
+ norm <- function(data){
+ v <- (data - mean(data))/sd(data);
+ if(sum(is.na(v)) >= 1){
+ v <- data;
+ }
+ return(v);
+ }
+
+ dwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") {
+ print(test);
+ print(pdf);
+ print(table);
+
+ pdf(file = pdf);
+ final_pvalue = NULL;
+ title = NULL;
+
+ short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\$nlevels;
+ title <- c(\"motif1\", \"motif2\");
+ for (i in 1:short.levels){
+ title <- c(title, paste(i, \"cor\", sep = \"_\"), paste(i, \"pval\", sep = \"_\"));
+ }
+ print(title);
+
+ # normalize the raw data
+ data.short <- apply(data.short, 2, norm);
+ data.long <- apply(data.long, 2, norm);
+
+ # loop to compare a vs b
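+ # each unordered pair (i, j) with i < j is tested exactly once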
+ for(i in 1:length(names.short)){
+ for(j in 1:length(names.long)){
+ if(i >= j){
+ next;
+ } 
+ else { 
+ # Kendall Tau
+ # DWT wavelet correlation function
+ # include significance to compare
+ wave1.dwt = wave2.dwt = NULL;
+ tau.dwt = NULL;
+ out = NULL;
+
+ print(names.short[i]);
+ print(names.long[j]);
+
+ # exit if comparing motif(a) vs motif(a): this test compares distinct motifs only
+ if (names.short[i] == names.long[j]){
+ stop(paste(\"motif\", names.short[i], \"is the same as\", names.long[j], sep = \" \"));
+ }
+ else {
+ wave1.dwt <- dwt(data.short[, i], wf = wf, short.levels, boundary = boundary);
+ wave2.dwt <- dwt(data.long[, j], wf = wf, short.levels, boundary = boundary);
+ tau.dwt <-vector(length = short.levels)
+    
+ # perform cor test on wavelet coefficients per scale 
+ for(level in 1:short.levels){
+ w1_level = w2_level = NULL;
+ w1_level <- (wave1.dwt[[level]]);
+ w2_level <- (wave2.dwt[[level]]);
+ tau.dwt[level] <- cor.test(w1_level, w2_level, method = method)\$estimate;
+ }
+
+ # CI bands by permutation of time series
+ feature1 = feature2 = NULL;
+ feature1 = data.short[, i];
+ feature2 = data.long[, j];
+ null = results = med = NULL; 
+ cor_25 = cor_975 = NULL;
+
+ for (k in 1:1000) {
+ nk_1 = nk_2 = NULL;
+ null.levels = NULL;
+ cor = NULL;
+ null_wave1 = null_wave2 = NULL;
+
+ nk_1 <- sample(feature1, length(feature1), replace = FALSE);
+ nk_2 <- sample(feature2, length(feature2), replace = FALSE);
+ null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels;
+ cor <- vector(length = null.levels);
+ null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary);
+ null_wave2 <- dwt(nk_2, wf = wf, short.levels, boundary = boundary);
+
+ for(level in 1:null.levels){
+ null_level1 = null_level2 = NULL;
+ null_level1 <- (null_wave1[[level]]);
+ null_level2 <- (null_wave2[[level]]);
+ cor[level] <- cor.test(null_level1, null_level2, method = method)\$estimate;
+ }
+ null = rbind(null, cor);
+ }
+
+ null <- apply(null, 2, sort, na.last = TRUE);
+ cor_25 <- null[25, ];
+ cor_975 <- null[975, ];
+ med <- (apply(null, 2, median, na.rm = TRUE));
+
+ # plot
+ results <- cbind(tau.dwt, cor_25, cor_975);
+ matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), ylim = c(-1, 1), xlab = \"Wavelet Scale\", ylab = \"Wavelet Correlation Kendall's Tau\", main = (paste(test, names.short[i], \"vs.\", names.long[j], sep = \" \")), cex.main = 0.75);
+ abline(h = 0);
+
+ # get pvalues by comparison to null distribution
+ ### modify pval calculation for error type II of T test ####
+ out <- c(names.short[i],names.long[j]);
+ for (m in 1:length(tau.dwt)){
+ print(m);
+ print(tau.dwt[m]);
+ out <- c(out, format(tau.dwt[m], digits = 3));
+ pv = NULL;
+ if(is.na(tau.dwt[m])){
+ pv <- \"NA\"; 
+ } 
+ else{
+ if (tau.dwt[m] >= med[m]){
+ # R tail test
+ pv <- (length(which(null[, m] >= tau.dwt[m])))/(length(na.exclude(null[, m])));
+ }
+ else{
+ if (tau.dwt[m] < med[m]){
+ # L tail test
+ pv <- (length(which(null[, m] <= tau.dwt[m])))/(length(na.exclude(null[, m])));
+ }
+ }
+ }
+ out <- c(out, pv);
+ print(pv);
+ }
+ final_pvalue <-rbind(final_pvalue, out);
+ print(out);
+ }
+ }
+ }
+ }
+ colnames(final_pvalue) <- title;
+ write.table(final_pvalue, file = table, sep = \"\\t\", quote = FALSE, row.names = FALSE)
+ dev.off();
+ }\n";
+
+print Rcmd "
+ # execute
+ # read in data 
+
+ inputData1 = inputData2 = NULL;
+ inputData.short1 = inputData.short2 = NULL;
+ inputDataNames.short1 = inputDataNames.short2 = NULL;
+
+ inputData1 <- read.delim(\"$firstInputFile\");
+ inputData.short1 <- inputData1[, +c(1:ncol(inputData1))];
+ inputDataNames.short1 <- colnames(inputData.short1);
+
+ inputData2 <- read.delim(\"$secondInputFile\");
+ inputData.short2 <- inputData2[, +c(1:ncol(inputData2))];
+ inputDataNames.short2 <- colnames(inputData.short2);
+
+ # cor test for motif(a) in inputData1 vs motif(b) in inputData2
+ dwt_cor(inputData.short1, inputDataNames.short1, inputData.short2, inputDataNames.short2, test = \"$test\", pdf = \"$secondOutputFile\", table = \"$firstOutputFile\");
+ print (\"done with the correlation test\");
+
+ #eof\n";
+close Rcmd;
+
+system("echo \"wavelet IvC test started on \`hostname\` at \`date\`\"\n");
+system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\n");
+system("echo \"wavelet IvC test ended on \`hostname\` at \`date\`\"\n");
+
+#close the input and output and error files
+close(ERROR);
+close(OUTPUT2);
+close(OUTPUT1);
+close(INPUT2);
+close(INPUT1);
b
diff -r 000000000000 -r 9071e359b9a3 tools/discreteWavelet/execute_dwt_cor_aVb_all.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/discreteWavelet/execute_dwt_cor_aVb_all.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,123 @@
+<tool id="compute_p-values_correlation_coefficients_featureA_featureB_occurrences_between_two_datasets_using_discrete_wavelet_transfom" name="Compute P-values and Correlation Coefficients for Occurrences of Two Sets of Features" version="1.0.0">
+  <description>between two datasets using Discrete Wavelet Transforms</description>
+  
+  <command interpreter="perl">
+   execute_dwt_cor_aVb_all.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2
+  </command>
+
+  <inputs>
+   <param format="tabular" name="inputFile1" type="data" label="Select the first input file"/>
+   <param format="tabular" name="inputFile2" type="data" label="Select the second input file"/>
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/> 
+    <data format="pdf" name="outputFile2"/>
+  </outputs>
+  
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This program generates plots and computes a table (matrix) of correlation coefficients and p-values at multiple scales for the correlation between the occurrences of each feature in one dataset and the occurrences of every other feature in the other, using a multiscale wavelet analysis technique. 
+
+The program assumes that the user has two sets of DNA sequences, S1 and S2, each consisting of one or more sequences of equal length. Each sequence in each set is divided into the same number of intervals n, where n = 2^k for a positive integer k (k >= 1). Thus, n can be any value in the set {2, 4, 8, 16, 32, 64, 128, ...}; k is the number of scales.
+
+The program has two input files obtained as follows:
+
+For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S2, and builds two tabular files representing the counts in each interval of S1 and S2. These are the input files of the program. 
+
+The program gives two output files:
+
+- The first output file is a TABULAR format file representing the correlation coefficients and p-values for each pair of features at each scale.
+- The second output file is a PDF file containing one figure per pair of features; each figure plots the correlation coefficient for that pair at every scale.
+
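+Each row of the first output file corresponds to one unordered pair of distinct features. A minimal R sketch of the pair enumeration the generated script performs (names1 and names2 are hypothetical stand-ins for the two inputs' column names)::
+
+ for (i in 1:length(names1)) {
+     for (j in 1:length(names2)) {
+         if (i >= j) next     # skip self-pairs and duplicate orderings
+         # ...per-scale Kendall correlation of feature i vs feature j...
+     }
+ }
+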
+-----
+
+.. class:: warningmark
+
+**Note**
+
+To obtain empirical p-values, the program implements a random permutation test, so it gives slightly different results each time it is run on the same input file. 
+
+-----
+
+**Example**
+
+Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file::
+
+ deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget
+ 82 162 158 79 459
+ 111 196 154 75 459
+ 98 178 160 79 475
+ 113 201 170 113 436
+ 113 173 147 95 446
+ 107 150 155 84 436
+ 106 166 175 96 448
+ 113 176 135 106 514
+ 113 170 152 87 450
+ 95 152 167 93 467
+ 91 171 169 118 426
+ 84 139 160 100 459
+ 92 154 164 104 440
+ 100 145 154 98 472
+ 91 161 152 71 461
+ 117 164 139 97 463
+
+And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file::
+
+ deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget
+ 269 366 330 238 1129
+ 239 328 327 283 1188
+ 254 351 358 297 1151
+ 262 371 355 256 1107
+ 254 361 352 234 1192
+ 265 354 367 240 1182
+ 255 359 333 235 1217
+ 271 389 387 272 1241
+ 240 305 341 249 1159
+ 272 351 337 257 1169
+ 275 351 337 233 1158
+ 305 331 361 253 1172
+ 277 341 343 253 1113
+ 266 362 355 267 1162
+ 235 326 329 241 1230
+ 254 335 360 251 1172
+
+  
+We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output:
+
+The first output file::
+
+ motif1 motif2 1_cor 1_pval 2_cor 2_pval 3_cor 3_pval 4_cor 4_pval
+
+ deletionHoptspot insertionHoptspot -0.1 0.346 -0.214 0.338 1 0.127 1 0.467
+ deletionHoptspot dnaPolPauseFrameshift 0.167 0.267 -0.214 0.334 1 0.122 1 0.511
+ deletionHoptspot topoisomeraseCleavageSite 0.167 0.277 0.143 0.412 -0.667 0.243 1 0.521
+ deletionHoptspot translinTarget 0 0.505 0.0714 0.441 1 0.124 1 0.518
+ insertionHoptspot dnaPolPauseFrameshift -0.202 0.238 0.143 0.379 -1 0.122 1 0.517
+ insertionHoptspot topoisomeraseCleavageSite -0.0336 0.457 0.214 0.29 0.667 0.252 1 0.503
+ insertionHoptspot translinTarget 0.0672 0.389 0.429 0.186 -1 0.119 1 0.506
+ dnaPolPauseFrameshift topoisomeraseCleavageSite -0.353 0.101 0.357 0.228 0 0.612 -1 0.49
+ dnaPolPauseFrameshift translinTarget -0.151 0.303 -0.571 0.09 -0.333 0.37 -1 1
+ topoisomeraseCleavageSite translinTarget -0.37 0.077 -0.222 0.297 0.667 0.234 -1 0.471
+
+The second output file:
+
+.. image:: ./static/operation_icons/dwt_cor_aVb_all_1.png
+.. image:: ./static/operation_icons/dwt_cor_aVb_all_2.png
+.. image:: ./static/operation_icons/dwt_cor_aVb_all_3.png
+.. image:: ./static/operation_icons/dwt_cor_aVb_all_4.png
+.. image:: ./static/operation_icons/dwt_cor_aVb_all_5.png
+.. image:: ./static/operation_icons/dwt_cor_aVb_all_6.png
+.. image:: ./static/operation_icons/dwt_cor_aVb_all_7.png
+.. image:: ./static/operation_icons/dwt_cor_aVb_all_8.png
+.. image:: ./static/operation_icons/dwt_cor_aVb_all_9.png
+.. image:: ./static/operation_icons/dwt_cor_aVb_all_10.png
+
+
+  </help>  
+  
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/discreteWavelet/execute_dwt_var_perClass.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/discreteWavelet/execute_dwt_var_perClass.pl Fri Mar 09 19:37:19 2012 -0500
[
b'@@ -0,0 +1,320 @@\n+#!/usr/bin/perl -w\n+\n+use warnings;\n+use IO::Handle;\n+use POSIX qw(floor ceil);\n+\n+# example: perl execute_dwt_var_perClass.pl hg18_NCNR_10bp_3flanks_deletionHotspot_data_del.txt deletionHotspot 3flanks del\n+\n+$usage = "execute_dwt_var_perClass.pl [TABULAR.in] [TABULAR.out] [TABULAR.out] [PDF.out] \\n";\n+die $usage unless @ARGV == 4;\n+\n+#get the input arguments\n+my $inputFile = $ARGV[0];\n+my $firstOutputFile = $ARGV[1];\n+my $secondOutputFile = $ARGV[2];\n+my $thirdOutputFile = $ARGV[3];\n+\n+open (INPUT, "<", $inputFile) || die("Could not open file $inputFile \\n");\n+open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \\n");\n+open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \\n");\n+open (OUTPUT3, ">", $thirdOutputFile) || die("Could not open file $thirdOutputFile \\n");\n+open (ERROR,  ">", "error.txt")  or die ("Could not open file error.txt \\n");\n+\n+#save all error messages into the error file $errorFile using the error file handle ERROR\n+STDERR -> fdopen( \\*ERROR,  "w" ) or die ("Could not direct errors to the error file error.txt \\n");\n+\n+# choosing meaningful names for the output files\n+$max_dwt = $firstOutputFile; \n+$pvalue = $secondOutputFile; \n+$pdf = $thirdOutputFile; \n+\n+# count the number of columns in the input file\n+while($buffer = <INPUT>){\n+\t#if ($buffer =~ m/interval/){\n+\t\tchomp($buffer);\n+\t\t$buffer =~ s/^#\\s*//;\n+\t\t@contrl = split(/\\t/, $buffer);\n+\t\tlast;\n+\t#}\n+}\n+print "The number of columns in the input file is: " . (@contrl) . "\\n";\n+print "\\n";\n+\n+# count the number of motifs in the input file\n+$count = 0;\n+for ($i = 0; $i < @contrl; $i++){\n+\t$count++;\n+\tprint "# $contrl[$i]\\n";\n+}\n+print "The number of motifs in the input file is:  $count \\n";\n+\n+# check if the number of motifs is not a multiple of 12, and round up is so\n+$count2 = ($count/12);\n+if ($count2 =~ m/(\\D)/){\n+\tprint "the number of motifs is not a multiple of 12 \\n";\n+\t$count2 = ceil($count2);\n+}\n+else {\n+\tprint "the number of motifs is a multiple of 12 \\n";\n+}\n+print "There will be $count2 subfiles\\n\\n";\n+\n+# split infile into subfiles only 12 motif per file for R plotting\n+for ($x = 1; $x <= $count2; $x++){\n+\t$a = (($x - 1) * 12 + 1);\n+\t$b = $x * 12;\n+\t\n+\tif ($x < $count2){\n+\t\tprint "# data.short $x <- data_test[, +c($a:$b)]; \\n"; \n+\t}\n+\telse{\n+\t\tprint "# data.short $x <- data_test[, +c($a:ncol(data_test)]; \\n";\n+\t}\n+}\n+\n+print "\\n";\n+print "There are 4 output files: \\n";\n+print "The first output file is a pdf file\\n";\n+print "The second output file is a max_dwt file\\n";\n+print "The third output file is a pvalues file\\n";\n+print "The fourth output file is a test_final_pvalues file\\n";\n+\n+# write R script\n+$r_script = "get_dwt_varPermut_getMax.r"; \n+print "The R file name is: $r_script \\n";\n+\n+open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \\n\\n";\n+\n+print Rcmd "\n+\t######################################################################\n+\t# plot power spectra, i.e. 
wavelet variance by class\n+\t# add code to create null bands by permuting the original data series\n+\t# get class of maximum significant variance per feature\n+\t# generate plots and table matrix of variance including p-values\n+\t######################################################################\n+\tlibrary(\\"Rwave\\");\n+\tlibrary(\\"wavethresh\\");\n+\tlibrary(\\"waveslim\\");\n+\n+\toptions(echo = FALSE)\n+\n+\t# normalize data\n+\tnorm <- function(data){\n+\t\tv <- (data-mean(data))/sd(data);\n+    \tif(sum(is.na(v)) >= 1){\n+    \t\tv<-data;\n+    \t}\n+    \treturn(v);\n+\t}\n+\n+\tdwt_var_permut_getMax <- function(data, names, filter = 4, bc = \\"symmetric\\", method = \\"kendall\\", wf = \\"haar\\", boundary = \\"reflection\\") {\n+\t\tmax_var = NULL;\n+    \tmatrix = NULL;\n+\t\ttitle = NULL;\n+    \tfinal_pvalue = NULL;\n+\t\tshort.levels = NULL;\n+\t\tscale = NULL;\n+\t\n+    \tprint(names);\n+    \t\n+   \t \tpar(mfcol = c(length(names), length(names)), mar = c(0, 0, 0, 0), oma = c(4, 3, 3, 2), xaxt = \\"s\\", cex = 1, las = 1);\n+   \t \t\n+    \tshort.levels '..b'= med[m]){\n+                        \t\t# R tail test\n+                            \tprint(\\"R\\");\n+\t                        \ttail <- \\"R\\";\n+                            \tpv <- (length(which(null[, m] >= temp[m])))/(length(na.exclude(null[, m])));\n+\n+                        \t} else {\n+                        \t\tif (temp[m] < med[m]){\n+                                \t# L tail test\n+                                \tprint(\\"L\\");\n+\t                            \ttail <- \\"L\\";\n+                                \tpv <- (length(which(null[, m] <= temp[m])))/(length(na.exclude(null[, m])));\n+                        \t\t}\n+\t\t\t\t\t\t\t}\n+\t\t\t\t\t\t\tout <- c(out, pv);\n+\t\t\t\t\t\t\tprint(pv);\n+\t\t\t\t\t\t\tout <- c(out, tail);\n+                    \t}\n+                    \tfinal_pvalue <-rbind(final_pvalue, out);\n+                 \t\n+                 \n+                    \t# get variances outside null bands by comparing temp to null\n+                    \t## temp stores variance for each scale, and null stores permuted variances for null bands\n+                    \tfor (n in 1:length(temp)){\n+                    \t\tif (temp[n] <= var_975[n]){\n+                        \t\ttemp[n] <- NA;\n+                        \t} else {\n+                        \t\ttemp[n] <- temp[n];\n+                        \t}\n+                    \t}\n+                    \tmatrix <- rbind(matrix, temp)\n+            \t\t}\n+            \t}\n+\t        \t# labels\n+\t        \tif (i == 1){\n+\t        \t\tmtext(names[j], side = 2, line = 0.5, las = 3, cex = 0.25);\n+\t        \t}\n+\t        \tif (j == 1){\n+\t        \t\tmtext(names[i], side = 3, line = 0.5, cex = 0.25);\n+\t        \t}\n+\t        \tif (j == length(names)){\n+\t        \t\taxis(1, at = (1:short.levels), las = 3, cex.axis = 0.5);\n+\t        \t}\n+    \t\t}\n+    \t}\n+\t\tcolnames(final_pvalue) <- title;\n+    \t#write.table(final_pvalue, file = \\"test_final_pvalue.txt\\", sep = \\"\\\\t\\", quote = FALSE, row.names = FALSE, append = TRUE);\n+\n+\t\t# get maximum variance larger than expectation by comparison to null bands\n+    \tvarnames <- vector();\n+    \tfor(i in 1:length(names)){\n+    \t\tname1 = paste(names[i], \\"var\\", sep = \\"_\\")\n+        \tvarnames <- c(varnames, name1)\n+    \t}\n+   \t\trownames(matrix) <- varnames;\n+    \tcolnames(matrix) <- (1:short.levels);\n+    \tmax_var <- names;\n+    \tscale 
<- vector(length = length(names));\n+    \tfor (x in 1:nrow(matrix)){\n+        \tif (length(which.max(matrix[x, ])) == 0){\n+            \tscale[x] <- NA;\n+        \t}\n+        \telse{\n+        \t\tscale[x] <- colnames(matrix)[which.max(matrix[x, ])];\n+        \t}\n+    \t}\n+    \tmax_var <- cbind(max_var, scale);\n+    \twrite.table(max_var, file = \\"$max_dwt\\", sep = \\"\\\\t\\", quote = FALSE, row.names = FALSE, append = TRUE);\n+    \treturn(final_pvalue);\n+\t}\\n";\n+\n+print Rcmd "\n+\t# execute\n+\t# read in data \n+\t\n+\tdata_test = NULL;\n+\tdata_test <- read.delim(\\"$inputFile\\");\n+\t\n+\tpdf(file = \\"$pdf\\", width = 11, height = 8);\n+\t\n+\t# loop to read and execute on all $count2 subfiles\n+\tfinal = NULL;\n+\tfor (x in 1:$count2){\n+\t\tsub = NULL;\n+\t\tsub_names = NULL;\n+\t\ta = NULL;\n+\t\tb = NULL;\n+\t\t\n+    \ta = ((x - 1) * 12 + 1);\n+    \tb = x * 12;\n+    \n+    \tif (x < $count2){\n+    \t\tsub <- data_test[, +c(a:b)];\n+\t\t\tsub_names <- colnames(data_test)[a:b];\n+\t\t\tfinal <- rbind(final, dwt_var_permut_getMax(sub, sub_names));\n+    \t}\n+    \telse{\n+    \t\tsub <- data_test[, +c(a:ncol(data_test))];\n+\t\t\tsub_names <- colnames(data_test)[a:ncol(data_test)];\n+\t\t\tfinal <- rbind(final, dwt_var_permut_getMax(sub, sub_names));\n+\t\t\t\n+    \t}\n+\t}\n+\n+\tdev.off();\n+\n+\twrite.table(final, file = \\"$pvalue\\", sep = \\"\\\\t\\", quote = FALSE, row.names = FALSE);\n+\n+\t#eof\\n";\n+\n+close Rcmd;\n+\n+system("echo \\"wavelet ANOVA started on \\`hostname\\` at \\`date\\`\\"\\n");\n+system("R --no-restore --no-save --no-readline < $r_script > $r_script.out");\n+system("echo \\"wavelet ANOVA ended on \\`hostname\\` at \\`date\\`\\"\\n");\n+\n+#close the input and output and error files\n+close(ERROR);\n+close(OUTPUT3);\n+close(OUTPUT2);\n+close(OUTPUT1);\n+close(INPUT);\n\\ No newline at end of file\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/discreteWavelet/execute_dwt_var_perClass.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/discreteWavelet/execute_dwt_var_perClass.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,105 @@
+<tool id="compute_p-values_max_variances_feature_occurrences_in_one_dataset_using_discrete_wavelet_transfom" name="Compute P-values and Max Variances for Feature Occurrences" version="1.0.0">
+  <description>in one dataset using Discrete Wavelet Transforms</description>
+  
+  <command interpreter="perl">
+   execute_dwt_var_perClass.pl $inputFile $outputFile1 $outputFile2 $outputFile3
+  </command>
+  
+  <inputs>
+   <param format="tabular" name="inputFile" type="data" label="Select the input file"/>
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/> 
+    <data format="tabular" name="outputFile2"/>
+    <data format="pdf" name="outputFile3"/>
+  </outputs>
+  
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This program generates plots and computes a table (matrix) of maximum variances, p-values, and test orientations at multiple scales for the occurrences of a class of features in one dataset of DNA sequences, using a multiscale wavelet analysis technique. 
+
+The program assumes that the user has one set of DNA sequences, S, consisting of one or more sequences of equal length. Each sequence in S is divided into the same number of intervals n, where n = 2^k for a positive integer k (k >= 1). Thus, n can be any value in the set {2, 4, 8, 16, 32, 64, 128, ...}; k is the number of scales.
+
+The program has one input file obtained as follows:
+
+For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S, and builds a tabular file representing the count results in each interval of S. This is the input file of the program. 
+
+The program gives three output files:
+
+- The first output file is a TABULAR format file giving the scale at which each feature has its maximum variance (how that scale is selected is sketched below).
+- The second output file is a TABULAR format file representing the variances, p-values, and test orientations for the occurrences of features at each scale, based on a random permutation test using a multiscale wavelet analysis technique.
+- The third output file is a PDF file plotting the wavelet variance of each feature at each scale.
+
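+Per the generated script, a feature's maximum-variance scale is chosen among the scales whose variance exceeds the permutation null band; if no scale escapes the band, the entry is NA. A minimal R sketch with hypothetical names (v is one feature's per-scale variances, band_upper the 97.5% null band)::
+
+ # keep only variances above the null band, then pick the largest survivor
+ v = ifelse(v > band_upper, v, NA)
+ scale = if (all(is.na(v))) NA else which.max(v)
+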
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- If the number of features is greater than 12, the program divides each output file into subfiles, each covering a group of 12 features, except the last subfile, which covers the remaining features. For example, if the number of features is 17, the p-values file consists of two subfiles: the first for features 1-12 and the second for features 13-17 (the grouping arithmetic is sketched after this list). The PDF file likewise consists of two pages in this case.
+- To obtain empirical p-values, the program implements a random permutation test, so it gives slightly different results each time it is run on the same input file. 
+
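+A minimal sketch of the 12-feature grouping arithmetic (shown in R for consistency with the other sketches; count is a hypothetical feature count)::
+
+ n_groups = ceiling(count / 12)     # e.g. 17 features gives 2 groups
+ # group x covers columns ((x - 1) * 12 + 1) through min(x * 12, count)
+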
+-----
+
+
+**Example**
+
+Counting the occurrences of 8 features (motifs) in 16 intervals (one line per interval) of the set of DNA sequences S gives the following tabular file::
+
+ deletionHoptspot insertionHoptspot dnaPolPauseFrameshift indelHotspot topoisomeraseCleavageSite translinTarget vDjRecombinationSignal x-likeSite
+ 226 403 416 221 1165 832 749 1056
+ 236 444 380 241 1223 746 782 1207
+ 242 496 391 195 1116 643 770 1219
+ 243 429 364 191 1118 694 783 1223
+ 244 410 371 236 1063 692 805 1233
+ 230 386 370 217 1087 657 787 1215
+ 275 404 402 214 1044 697 831 1188
+ 265 443 365 231 1086 694 782 1184
+ 255 390 354 246 1114 642 773 1176
+ 281 384 406 232 1102 719 787 1191
+ 263 459 369 251 1135 643 810 1215
+ 280 433 400 251 1159 701 777 1151
+ 278 385 382 231 1147 697 707 1161
+ 248 393 389 211 1162 723 759 1183
+ 251 403 385 246 1114 752 776 1153
+ 239 383 347 227 1172 759 789 1141
+  
+We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input file gives the following 3 output files:
+
+The first output file::
+
+ motifs max_var at scale
+ deletionHoptspot NA
+ insertionHoptspot NA
+ dnaPolPauseFrameshift NA
+ indelHotspot NA
+ topoisomeraseCleavageSite 3
+ translinTarget NA
+ vDjRecombinationSignal NA
+ x.likeSite NA
+
+The second output file::
+
+ motif 1_var 1_pval 1_test 2_var 2_pval 2_test 3_var 3_pval 3_test 4_var 4_pval 4_test
+
+ deletionHoptspot 0.457 0.048 L 1.18 0.334 R 1.61 0.194 R 3.41 0.055 R
+ insertionHoptspot 0.556 0.109 L 1.34 0.272 R 1.59 0.223 R 2.02 0.157 R
+ dnaPolPauseFrameshift 1.42 0.089 R 0.66 0.331 L 0.421 0.305 L 0.121 0.268 L
+ indelHotspot 0.373 0.021 L 1.36 0.254 R 1.24 0.301 R 4.09 0.047 R
+ topoisomeraseCleavageSite 0.305 0.002 L 0.936 0.489 R 3.78 0.01 R 1.25 0.272 R
+ translinTarget 0.525 0.061 L 1.69 0.11 R 2.02 0.131 R 0.00891 0.069 L
+ vDjRecombinationSignal 0.68 0.138 L 0.957 0.46 R 2.35 0.071 R 1.03 0.357 R
+ x.likeSite 0.928 0.402 L 1.33 0.261 R 0.735 0.431 L 0.783 0.422 R
+
+The third output file:
+
+.. image:: ./static/operation_icons/dwt_var_perClass.png
+
+  </help>  
+  
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/discreteWavelet/execute_dwt_var_perFeature.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/discreteWavelet/execute_dwt_var_perFeature.pl Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,199 @@
+#!/usr/bin/perl -w
+# Author: Erika Kvikstad
+
+use warnings;
+use IO::Handle;
+use POSIX qw(floor ceil);
+
+$usage = "execute_dwt_var_perFeature.pl [TABULAR.in] [FEATURE] [ALPHA] [TABULAR.out] [PDF.out] \n";
+die $usage unless @ARGV == 5;
+
+#get the input arguments
+my $inputFile = $ARGV[0];
+my @features = split(/,/,$ARGV[1]);
+my $features_count = scalar(@features);
+my $alpha = $ARGV[2];
+my $outFile1 = $ARGV[3];
+my $outFile2 = $ARGV[4];
+
+open (INPUT, "<", $inputFile) || die("Could not open file $inputFile \n");
+open (OUTPUT2, ">", $outFile1) || die("Could not open file $outFile1 \n");
+open (OUTPUT3, ">", $outFile2) || die("Could not open file $outFile2 \n");
+#open (ERROR,  ">", "error.txt")  or die ("Could not open file error.txt \n");
+
+# choosing meaningful names for the output files
+$pvalue = $outFile1; 
+$pdf = $outFile2; 
+
+# write R script
+$r_script = "get_dwt_varPermut.r"; 
+
+open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n";
+
+print Rcmd "
+ ######################################################################
+ # plot multiscale wavelet variance 
+ # create null bands by permuting the original data series
+ # generate plots and table of wavelet variance including p-values
+ ######################################################################
+ options(echo = FALSE)
+ #library(\"Rwave\");
+ #library(\"wavethresh\");
+ #library(\"waveslim\");
+ # packages loaded via require() to suppress startup messages; for debugging, switch back to the library() calls above
+ require(\"Rwave\",quietly=TRUE,warn.conflicts = FALSE);
+ require(\"wavethresh\",quietly=TRUE,warn.conflicts = FALSE);
+ require(\"waveslim\",quietly=TRUE,warn.conflicts = FALSE);
+ require(\"bitops\",quietly=TRUE,warn.conflicts = FALSE);
+
+ # to determine if data is properly formatted 2^N observations
+ is.power2<- function(x){x && !(bitAnd(x,x - 1));}
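+ # bitAnd(x, x - 1) clears the lowest set bit, so it is 0 exactly when x is a
+ # power of two; e.g. is.power2(16) is TRUE, is.power2(12) is FALSE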
+
+ # dwt : discrete wavelet transform using Haar wavelet filter, simplest wavelet function but later can modify to let user-define the wavelet filter function
+ dwt_var_permut_getMax <- function(data, names, alpha, filter = 1,family=\"DaubExPhase\", bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") {
+ max_var = NULL;
+     matrix = NULL;
+ title = NULL;
+     final_pvalue = NULL;
+ J = NULL;
+ scale = NULL;
+ out = NULL;
+
+ print(class(data));
+     print(names);
+ print(alpha);
+    
+ par(mar=c(5,4,4,3),oma = c(4, 4, 3, 2), xaxt = \"s\", cex = 1, las = 1);
+   
+ title<-c(\"Wavelet\",\"Variance\",\"Pvalue\",\"Test\");
+ print(title);
+
+     for(i in 1:length(names)){
+ temp = NULL;
+ results = NULL;
+ wave1.dwt = NULL;
+
+ # if data fails formatting check, do something
+
+ print(is.numeric(as.matrix(data)[, i]));
+ if(!is.numeric(as.matrix(data)[, i]))
+ stop(\"data must be a numeric vector\");
+
+ print(length(as.matrix(data)[, i]));
+ print(is.power2(length(as.matrix(data)[, i])));
+ if(!is.power2(length(as.matrix(data)[, i])))
+ stop(\"data length must be a power of two\");
+
+
+     J <- wd(as.matrix(data)[, i], filter.number = filter, family=family, bc = bc)\$nlevels;
+ print(J);
+             temp <- vector(length = J);
+                wave1.dwt <- dwt(as.matrix(data)[, i], wf = wf, J, boundary = boundary); 
+ #print(wave1.dwt);
+                
+                temp <- wave.variance(wave1.dwt)[-(J+1), 1];
+ print(temp);
+
+                #permutations code :
+                feature1 = NULL;
+ null = NULL;
+ var_lower=limit_lower=NULL;
+ var_upper=limit_upper=NULL;
+ med = NULL;
+
+ limit_lower = alpha/2*1000;
+ print(limit_lower);
+ limit_upper = (1-alpha/2)*1000;
+ print(limit_upper);
+
+ feature1 = as.matrix(data)[,i];
+                for (k in 1:1000) {
+ nk_1 = NULL;
+ null.levels = NULL;
+ var = NULL;
+ null_wave1 = NULL;
+
+                        nk_1 = sample(feature1, length(feature1), replace = FALSE);
+                        null.levels <- wd(nk_1, filter.number = filter,family=family ,bc = bc)\$nlevels;
+                        var <- vector(length = null.levels);
+                        null_wave1 <- dwt(nk_1, wf = wf, J, boundary = boundary);
+                        var<- wave.variance(null_wave1)[-(null.levels+1), 1];
+                        null= rbind(null, var);
+               }
+               null <- apply(null, 2, sort, na.last = TRUE);
+               var_lower <- null[limit_lower, ];
+               var_upper <- null[limit_upper, ];
+               med <- (apply(null, 2, median, na.rm = TRUE));
+
+               # plot
+               results <- cbind(temp, var_lower, var_upper);
+ print(results);
+                matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2),xaxt='n',xlab=\"Wavelet Scale\",ylab=\"Wavelet variance\" );
+ mtext(names[i], side = 3, line = 0.5, cex = 1);
+ axis(1, at = 1:J , labels=c(2^(0:(J-1))), las = 3, cex.axis = 1);
+
+                # get pvalues by comparison to null distribution
+ #out <- (names[i]);
+                for (m in 1:length(temp)){
+                     print(paste(\"scale\", m, sep = \" \"));
+                        print(paste(\"var\", temp[m], sep = \" \"));
+                        print(paste(\"med\", med[m], sep = \" \"));
+                        pv = tail =scale = NULL;
+ scale=2^(m-1);
+ #out <- c(out, format(temp[m], digits = 3));
+                        if (temp[m] >= med[m]){
+                        # R tail test
+                            print(\"R\");
+                         tail <- \"R\";
+                             pv <- (length(which(null[, m] >= temp[m])))/(length(na.exclude(null[, m])));
+
+                        } else {
+                        if (temp[m] < med[m]){
+                                # L tail test
+                                print(\"L\");
+                              tail <- \"L\";
+                                 pv <- (length(which(null[, m] <= temp[m])))/(length(na.exclude(null[, m])));
+                         }
+ }
+ print(pv);
+ out<-rbind(out,c(paste(\"Scale\", scale, sep=\"_\"),format(temp[m], digits = 3),pv,tail));
+                }
+ final_pvalue <-rbind(final_pvalue, out);
+   }
+ colnames(final_pvalue) <- title;
+     return(final_pvalue);
+}\n";
+
+print Rcmd "
+# execute
+# read in data 
+data_test = final = NULL;
+sub = sub_names = NULL;
+data_test <- read.delim(\"$inputFile\",header=FALSE);
+pdf(file = \"$pdf\", width = 11, height = 8)\n";
+
+for ($x=0;$x<$features_count;$x++){
+ $feature=$features[$x];
+print Rcmd "
+ if ($feature > ncol(data_test))
+ stop(\"column $feature doesn't exist\");
+ sub<-data_test[,$feature];
+ #sub_names <- colnames(data_test);
+ sub_names<-colnames(data_test)[$feature];
+ final <- rbind(final,dwt_var_permut_getMax(sub, sub_names,$alpha));\n";
+}
+
+print Rcmd "
+
+ dev.off();
+ write.table(final, file = \"$pvalue\", sep = \"\\t\", quote = FALSE, row.names = FALSE);
+
+#eof\n";
+
+close Rcmd;
+system("R --no-restore --no-save --no-readline < $r_script > $r_script.out");
+
+#close the input and output and error files
+close(OUTPUT3);
+close(OUTPUT2);
+close(INPUT);
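For reference, the R driver that the Perl above prints into $r_script looks roughly like this after interpolation; the file paths, the selected column (4), and the alpha value (0.05) below are illustrative placeholders, not values from the changeset:

    # read in data, then run the permutation test on each selected column
    data_test = final = NULL;
    sub = sub_names = NULL;
    data_test <- read.delim("/tmp/input.tab", header = FALSE);
    pdf(file = "/tmp/plots.pdf", width = 11, height = 8)

    if (4 > ncol(data_test))
        stop("column 4 doesn't exist");
    sub <- data_test[, 4];
    sub_names <- colnames(data_test)[4];
    final <- rbind(final, dwt_var_permut_getMax(sub, sub_names, 0.05));

    dev.off();
    write.table(final, file = "/tmp/pvalues.tab", sep = "\t", quote = FALSE, row.names = FALSE);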
diff -r 000000000000 -r 9071e359b9a3 tools/discreteWavelet/execute_dwt_var_perFeature.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/discreteWavelet/execute_dwt_var_perFeature.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,57 @@
+<tool id="dwt_var1" name="Wavelet variance" version="1.0.0">
+  <description>using Discrete Wavelet Transforms</description>
+  
+  <command interpreter="perl">
+   execute_dwt_var_perFeature.pl $inputFile $feature $alpha $outputFile1 $outputFile2
+  </command>
+  
+  <inputs>
+   <param format="tabular" name="inputFile" type="data" label="Select data"/>
+ <param name="feature" label="Feature column" type="data_column" data_ref="inputFile" multiple="true" help="Please select at least one column"/>
+ <param name="alpha" size="10" type="float" value="0.05" label="alpha (significance level)" />
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/>
+    <data format="pdf" name="outputFile2"/>
+  </outputs>
+  <tests>
+    <test>
+ <param name="inputFile" value="discreteWavelet/dwt_var1/dwt_var_in.interval"/>
+ <param name="feature" value="4"/>
+ <param name="alpha" value="0.05"/>
+ <output name="outputFile1" file="discreteWavelet/dwt_var1/dwt_var_out1.tabular" compare="re_match"/>
+ <output name="outputFile2" file="discreteWavelet/dwt_var1/dwt_var_out2.pdf" compare="sim_size"/>
+    </test>
+  </tests>     
+  
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool computes the scale-specific variance in wavelet coefficients obtained from the discrete wavelet transform of a feature of interest.
+
+Input data consist of an ordered, equispaced series S of sample size N, where N = 2^k and k is a positive integer giving the number of levels of wavelet decomposition. S could be a time series or a set of DNA sequences. The user supplies a statistic of interest for each interval of S: say, the expression level of a particular gene in a time course, or the number of LINE elements per window across a chromosome. This tool performs a discrete wavelet transform of the selected feature and plots the resulting variance in wavelet coefficients at each wavelet scale. The statistical significance of each variance is assessed with 1,000 random permutations of the intervals in S, which generate null bands (at the user-provided alpha level) from the empirical distribution of wavelet variances under the null hypothesis of no inherent order in S. A minimal R sketch of this permutation scheme appears just after this tool definition.
+
+This tool generates two output files:
+
+- The first output file is a TABULAR format file representing the variances, p-values, and test orientation for the features at each wavelet scale based on a random permutation test.
+- The second output file is a PDF image plotting the wavelet variances of each feature at each scale.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Because the empirical p-values are obtained from a random permutation scheme, results may vary slightly each time the tool is run on the same input file.
+
+-----
+
+
+**Example**
+
+  </help>  
+  
+</tool>
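As referenced in the tool help above, here is a minimal, self-contained R sketch of the permutation scheme behind the null bands. It assumes the waveslim package supplies dwt() and wave.variance(), as the wrapped script does; the function name and defaults are illustrative, not part of the changeset:

    library(waveslim)

    # Per-scale wavelet variance with permutation null bands (sketch).
    perm_null_bands <- function(x, alpha = 0.05, nperm = 1000, wf = "haar") {
        J <- log2(length(x))                  # length(x) must be a power of two
        obs <- wave.variance(dwt(x, wf = wf, n.levels = J))[1:J, 1]
        null <- t(replicate(nperm, {
            xs <- sample(x)                   # permuting destroys any serial order
            wave.variance(dwt(xs, wf = wf, n.levels = J))[1:J, 1]
        }))
        null <- apply(null, 2, sort)          # sort the null variances per scale
        list(observed = obs,
             lower = null[ceiling(alpha / 2 * nperm), ],        # e.g. 25th of 1000
             upper = null[floor((1 - alpha / 2) * nperm), ])    # e.g. 975th of 1000
    }

An observed variance falling outside [lower, upper] at a given scale is significant at the chosen alpha; the wrapped script additionally reports an empirical p-value as the fraction of null variances at least as extreme as the observed one.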
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_antigenic.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_antigenic.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,49 @@
+<tool id="EMBOSS: antigenic1" name="antigenic" version="5.0.0">
+  <description>Predicts potentially antigenic regions of a protein sequence, using the method of Kolaskar and Tongaonkar.</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>antigenic -sequence $input1 -outfile $out_file1 -minlen $minlen -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="minlen" size="4" type="text" value="6">
+      <label>Minimum Length of region</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output format</label>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">diffseq</option>
+      <option value="excel">Excel (TAB Delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="nametable">NameTable</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS simple</option>
+      <option value="srs">SRS</option>
+      <option value="table">Table</option>
+      <option value="tagseq">Tagseq</option>
+      <option value="antigenic">Antigenic Output File</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="antigenic" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="minlen" value="6"/>
+      <param name="out_format1" value="excel"/>
+      <output name="out_file1" file="emboss_antigenic_out.tabular"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/antigenic.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_backtranseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_backtranseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,220 @@
+<tool id="EMBOSS: backtranseq2" name="backtranseq" version="5.0.0">
+  <description>Back translate a protein sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>backtranseq -sequence $input1 -outfile $out_file1 -cfile $cfile -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="cfile" type="select">
+      <label>Codon Usage File</label>
+      <option value="Ehum.cut">Ehum.cut</option>
[... remaining codon usage options (Eacc.cut through Ezma.cut) truncated in the source dump ...]
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
[... remaining output format options (ACeDB through Treecon) truncated in the source dump ...]
+    </param>
+  </inputs>
+  <outputs>
+    <data format="txt" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="cfile" value="Ehum.cut"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_backtranseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/backtranseq.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_banana.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_banana.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16 @@
+#! /usr/bin/perl -w
+use strict;
+
+# Run the EMBOSS command passed on our command line and echo the names of the
+# files it reports as created.
+my $cmd_string = join (" ",@ARGV);
+#my $cmd_string = "/home/djb396/temp/emboss/bin/banana -sequence /home/djb396/universe-prototype/test.fasta -outfile result.txt -graph png -goutfile results -auto";
+my $results = `$cmd_string`;
+my @files = split("\n",$results);
+foreach my $thisLine (@files)
+{
+    if ($thisLine =~ /Created /i)
+    {
+        # keep only the trailing filename (word characters and dots)
+        $thisLine =~ /[\w.]+$/;
+        $thisLine = $&;
+        print "outfile: $thisLine\n";
+    }
+}
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_banana.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_banana.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,24 @@
+<tool id="EMBOSS: banana3" name="banana" version="5.0.0">
+  <description>Bending and curvature plot in B-DNA</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>banana -sequence $input1 -outfile $out_file1 -graph none -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="txt" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <output name="out_file1" file="emboss_banana_out.txt"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/banana.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_biosed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_biosed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,73 @@
+<tool id="EMBOSS: biosed4" name="biosed" version="5.0.0">
+  <description>Replace or delete sequence sections</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>biosed -sequence $input1 -outseq $out_file1 -target $target -replace $replace -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="target" size="6" type="text" value="atg">
+      <label>Replace all</label>
+    </param>
+    <param name="replace" size="6" type="text" value="atg">
+      <label>with</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="txt" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="target" value="atg"/>
+      <param name="replace" value="agt"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_biosed_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark 
+
+The input dataset needs to be sequences. 
+
+----- 
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/biosed.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_btwisted.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_btwisted.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,24 @@
+<tool id="EMBOSS: btwisted5" name="btwisted" version="5.0.0">
+  <description>Calculates the twisting in a B-DNA sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>btwisted -sequence $input1 -outfile $out_file1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="btwisted" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <output name="out_file1" file="emboss_btwisted_out.btwisted"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/btwisted.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_cai.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_cai.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,185 @@
+<tool id="EMBOSS: cai6" name="cai" version="5.0.0">
+  <description>CAI codon adaptation index</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>cai -seqall $input1 -outfile $out_file1 -cfile $cfile -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="cfile" type="select">
+      <label>Codon Usage File</label>
+      <option value="Eyeastcai.cut">Eyeastcai.cut</option>
[... remaining codon usage options (Ehum.cut through Ezma.cut) truncated in the source dump ...]
+    </param>
+  </inputs>
+  <outputs>
+    <data format="cai" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="cfile" value="Eyeastcai.cut"/>
+      <output name="out_file1" file="emboss_cai_out.cai"/>
+    </test>
+  </tests>
+  <help>
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cai.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_cai_custom.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_cai_custom.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,27 @@
+<tool id="EMBOSS: cai_custom6" name="cai custom" version="5.0.0">
+  <description>CAI codon adaptation index using custom codon usage file</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>cai -seqall $input1 -outfile $out_file1 -cfile $input2 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param format="txt" name="input2" type="data">
+      <label>Codon Usage File</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="txt" name="out_file1" />
+  </outputs>
+  <help>
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cai_custom.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_chaos.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_chaos.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+<tool id="EMBOSS: chaos7" name="chaos" version="5.0.0">
+  <description>Create a chaos game representation plot for a sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl chaos -sequence $input1 -graph png -goutfile $out_file1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+<!--    <tests>
+    <test>
+         puts name of file into the png
+    </test>
+  </tests> -->
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/chaos.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_charge.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_charge.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,35 @@
+<tool id="EMBOSS: charge8" name="charge" version="5.0.0">
+  <description>Protein charge plot</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>charge -seqall $input1 -outfile $out_file1 -window $window -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="window" size="4" type="text" value="5">
+      <label>Window Size</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="charge" name="out_file1" />
+  </outputs>
+ <!--   <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="window" value="5"/>
+      <output name="out_file1" file="emboss_charge_out.charge"/>
+    </test>
+  </tests>--> 
+  <help>
+
+.. class:: warningmark 
+
+The input dataset needs to be sequences. 
+
+----- 
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/charge.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_checktrans.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_checktrans.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,87 @@
+<tool id="EMBOSS: checktrans9" name="checktrans" version="5.0.0">
+  <description>Reports STOP codons and ORF statistics of a protein</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>checktrans -sequence $input1 -outfile $out_file1 -outseq $out_file2 -osformat3 $out_format2 -outfeat $out_file3 -offormat4 $out_format3 -orfml $orfml -addlast $addlast -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="orfml" size="4" type="text" value="100">
+      <label>Minimum ORF Length to report</label>
+    </param>
+    <param name="addlast" type="select">
+      <label>An asterisk in the protein sequence indicates the position of a STOP codon. Checktrans assumes that all ORFs end in a STOP codon. Forcing the sequence to end with an asterisk, if there
+      is not one there already, makes checktrans treat the end as a potential ORF. If an asterisk is added, it is not included in the reported count of STOPs</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="out_format2" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+    <param name="out_format3" type="select">
+      <label>Output Feature File Format</label>
+      <option value="gff">GFF</option>
+      <option value="embl">EMBL</option>
+      <option value="swiss">SwissProt</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="checktrans" name="out_file1" />
+    <data format="fasta" name="out_file2" />
+    <data format="gff" name="out_file3" />
+  </outputs>
+ <!--   <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="orfml" value="100"/>
+      <param name="addlast" value="yes"/>
+      <param name="out_format2" value="fasta"/>
+      <param name="out_format3" value="gff"/>
+      <output name="out_file1" file="emboss_checktrans_out1.txt"/>
+      <output name="out_file2" file="emboss_checktrans_out2.fasta"/>
+      <output name="out_file3" file="emboss_checktrans_out3.gff"/>
+    </test>
+  </tests> -->
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark 
+
+The input dataset needs to be sequences. 
+
+----- 
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/checktrans.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_chips.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_chips.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,30 @@
+<tool id="EMBOSS: chips10" name="chips" version="5.0.0">
+  <description>Codon usage statistics</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>chips -seqall $input1 -outfile $out_file1 -sum $sum -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="sum" type="select">
+      <label>Sum codons over all sequences</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="chips" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="sum" value="yes"/>
+      <output name="out_file1" file="emboss_chips_out.chips"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/chips.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_cirdna.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_cirdna.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+<tool id="EMBOSS: cirdna11" name="cirdna" version="5.0.0">
+  <description>Draws circular maps of DNA constructs</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl cirdna -infile $input1 -graphout png -goutfile $out_file1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <!--    <tests>
+    <test>
+         puts name of file into the png
+    </test>
+  </tests> -->
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cirdna.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_codcmp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_codcmp.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,330 @@
+<tool id="EMBOSS: codcmp12" name="codcmp" version="5.0.0">
+  <description>Codon usage table comparison</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>codcmp -first $cfile1 -second $cfile2 -outfile $out_file1 -auto</command>
+  <inputs>
+    <param name="cfile1" type="select">
+      <label>Codon Usage File 1</label>
+      <option value="Ehum.cut">Ehum.cut</option>
[... remaining options for this select, and the whole cfile2 select, truncated in the source dump ...]
+    </param>
+  </inputs>
+  <outputs>
+    <data format="codcmp" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="cfile1" value="Ehum.cut"/>
+      <param name="cfile2" value="Eacc.cut"/>
+      <output name="out_file1" file="emboss_codcmp_out.codcmp"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/codcmp.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_coderet.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_coderet.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,73 @@
+<tool id="EMBOSS: coderet13" name="coderet" version="5.0.0">
+  <description>Extract CDS, mRNA and translations from feature tables</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <!--  <command>coderet -seqall $input1 -outfile $out_file1 -osformat2 $out_format1 -cds $cds -mrna $mrna -translation $translation -auto</command>-->
+  <command>coderet -seqall $input1 -outfile $out_file1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <!--
+    <param name="cds" type="boolean" truevalue="yes" falsevalue="no" checked="true">
+      <label>Extract CDS sequences</label>
+    </param>
+    <param name="mrna" type="select">
+      <label>Extract mRNA sequences</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="translation" type="select">
+      <label>Extract translated sequences</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param> 
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option> 
+    </param> 
+    -->
+  </inputs>
+  <outputs>
+    <data format="coderet" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <output name="out_file1" file="emboss_coderet_out.coderet"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/coderet.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_compseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_compseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<tool id="EMBOSS: compseq14" name="compseq" version="5.0.0">
+  <description>Count composition of dimer/trimer/etc words in a sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>compseq -sequence $input1 -outfile $out_file1 -word $word -frame $frame -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="word" size="4" type="text" value="2">
+      <label>Size of word (window) to count</label>
+    </param>
+    <param name="frame" type="select">
+      <label>Frame to inspect</label>
+      <option value="0">All Frames</option>
+      <option value="1">Frame 1</option>
+      <option value="2">Frame 2</option>
+      <option value="3">Frame 3</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="compseq" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="word" value="2"/>
+      <param name="frame" value="0"/>
+      <output name="out_file1" file="emboss_compseq_out.compseq"/>
+    </test>
+  </tests>
+  <help>
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/compseq.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_cpgplot.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_cpgplot.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,33 @@
+<tool id="EMBOSS: cpgplot15" name="cpgplot" version="5.0.0">
+  <description>Plot CpG rich areas</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_cpgplot_wrapper.pl cpgplot -sequence $input1 -window $window -minlen $minlen -minpc $minpc -outfile $outfile -graph png -goutfile $goutfile -outfeat $outfeat -minoe $minoe -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="window" size="4" type="integer" value="100">
+      <label>Window Size</label>
+    </param>
+    <param name="minlen" size="4" type="integer" value="200">
+      <label>Minimum length</label>
+    </param>
+    <param name="minoe" size="4" type="float" value="0.6">
+      <label>Minimum average observed to expected ratio</label>
+    </param>
+    <param name="minpc" size="4" type="float" value="50.0">
+      <label>Minimum average percentage of G plus C</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="cpgplot" name="outfile" />
+    <data format="png" name="goutfile" />
+    <data format="gff" name="outfeat" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cpgplot.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_cpgplot_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_cpgplot_wrapper.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,9 @@
+#! /usr/bin/perl -w
+use strict;
+use File::Copy;
+
+# Run cpgplot, then rename its PNG output: EMBOSS appends ".1.png" to the
+# -goutfile value ($ARGV[14] in the command line built by emboss_cpgplot.xml),
+# so move that file to the path Galaxy expects.
+my $cmd_string = join (" ",@ARGV);
+my $results = `$cmd_string`;
+my @files = split("\n",$results);
+my $fileNameOut = $ARGV[14];
+move($fileNameOut.".1.png",$fileNameOut);
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_cpgreport.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_cpgreport.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,49 @@
+<tool id="EMBOSS: cpgreport16" name="cpgreport" version="5.0.0">
+  <description>Reports all CpG rich regions</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>cpgreport -sequence $input1 -outfile $out_file1 -outfeat $out_file2 -offormat3 $out_format2 -score $score -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="score" size="4" type="text" value="17">
+      <label>Score for each CG sequence found (1-200)</label>
+    </param>
+    <param name="out_format2" type="select">
+      <label>Output Feature File Format</label>
+      <option value="gff">GFF</option>
+      <option value="embl">EMBL</option>
+      <option value="swiss">SwissProt</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="cpgreport" name="out_file1" />
+    <data format="gff" name="out_file2" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="score" value="17"/>
+      <param name="out_format2" value="gff"/>
+      <output name="out_file2" file="emboss_cpgreport_out2.cpgreport"/>
+    </test>
+    <!-- <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="score" value="17"/>
+      <param name="out_format2" value="gff"/>
+      <output name="out_file1" file="emboss_cpgreport_out1.gff"/>
+    </test>  -->
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cpgreport.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_cusp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_cusp.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,30 @@
+<tool id="EMBOSS: cusp17" name="cusp" version="5.0.0">
+  <description>Create a codon usage table</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>cusp -sequence $input1 -outfile $out_file1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="cusp" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <output name="out_file1" file="emboss_cusp_out.cusp"/>
+    </test>
+  </tests>
+  <help>
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cusp.html
+  </help>
+</tool>
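Once Galaxy substitutes dataset paths into the <command> template, cusp runs as a plain command line; a hypothetical expansion in Python (the dataset paths are invented placeholders)::

    import subprocess

    # Hypothetical expansion of the cusp <command> template above.
    subprocess.run(
        ["cusp",
         "-sequence", "/galaxy/datasets/dataset_001.dat",  # $input1
         "-outfile", "/galaxy/datasets/dataset_002.dat",   # $out_file1
         "-auto"],
        check=True,
    )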
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_cutseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_cutseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,72 @@
+<tool id="EMBOSS: cutseq18" name="cutseq" version="5.0.0">
+  <description>Removes a specified section from a sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>cutseq -sequence $input1 -outseq $out_file1 -from $from -to $to -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="from" size="8" type="text" value="1">
+      <label>Start of region to delete</label>
+    </param>
+    <param name="to" size="8" type="text" value="1">
+      <label>End of region to delete</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="from" value="1"/>
+      <param name="to" value="1"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_cutseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cutseq.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_dan.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_dan.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,84 @@
+<tool id="EMBOSS: dan19" name="dan" version="5.0.0">
+  <description>Calculates DNA and RNA/DNA melting temperature</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl dan -sequence $input1 -windowsize $window -goutfile $out_file1 -graph png -plot $plot1 -shiftincrement $shift -dnaconc $dnaconc
+  -saltconc $saltconc -product $product -formamide $formamide -mismatch $mismatch -prodlen $prodlen -thermo $thermo -temperature $temperature -rna $rna -outfile $out_file1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="window" size="4" type="text" value="20">
+      <label>Window Size</label>
+    </param>
+    <param name="shift" size="4" type="text" value="1">
+      <label>Step size (shift increment)</label>
+    </param>
+    <param name="dnaconc" size="4" type="text" value="50.0">
+      <label>DNA Concentration (nM)</label>
+    </param>
+    <param name="saltconc" size="4" type="text" value="50.0">
+      <label>Salt concentration (mM)</label>
+    </param>
+    <param name="thermo" type="select">
+      <label>Output the DeltaG, DeltaH and DeltaS values</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="temperature" size="4" type="text" value="25 ">
+      <label>Temperature at which to calculate the DeltaG, DeltaH and DeltaS values</label>
+    </param>
+    <param name="rna" type="select">
+      <label>Sequence is RNA</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="product" type="select">
+      <label>Include percent formamide, percent of mismatches allowed and product length</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="formamide" size="4" type="text" value="0 ">
+      <label>Formamide concentration (nM)</label>
+    </param>
+    <param name="mismatch" size="4" type="text" value="0 ">
+      <label>Percent mismatch to be used in calculations</label>
+    </param>
+    <param name="prodlen" size="4" type="text" value="20">
+      <label>Product length to be used in calculations</label>
+    </param>
+    <param name="plot1" type="select">
+      <label>Create a graph</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="dan" name="out_file1" />
+  </outputs>
+  <!--
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="window" value="20"/>
+      <param name="shift" value="1"/>
+      <param name="dnaconc" value="50"/>
+      <param name="saltconc" value="50"/>
+      <param name="thermo" value="yes"/>
+      <param name="temperature" value="25"/>
+      <param name="rna" value="no"/>
+      <param name="product" value="no"/>
+      <param name="formamide" value="0"/>
+      <param name="mismatch" value="0"/>
+      <param name="prodlen" value="20"/>
+      <param name="plot1" value="yes"/>
+      <output name="out_file1" file="emboss_dan_out.png"/>
+    </test>
+  </tests>
+  -->
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dan.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_degapseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_degapseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,58 @@
+<tool id="EMBOSS: degapseq20" name="degapseq" version="5.0.0">
+  <description>Removes gap characters from sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>degapseq -sequence $input1 -outseq $out_file1 -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_degapseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/degapseq.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_descseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_descseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,72 @@
+<tool id="EMBOSS: descseq21" name="descseq" version="5.0.0">
+  <description>Alter the name or description of a sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>descseq -sequence $input1 -outseq $out_file1 -name "$seqname" -description "$desc" -append $append -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="seqname" size="50" type="text" value="">
+      <label>Name of the sequence</label>
+    </param>
+    <param name="desc" size="50" type="text" value="">
+      <label>Description of the sequence</label>
+    </param>
+    <param name="append" type="select">
+      <label>Append the name or description on to the end of the existing name or description</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="seqname" value="test"/>
+      <param name="desc" value="test"/>
+      <param name="append" value="yes"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_descseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/descseq.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_diffseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_diffseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+<tool id="EMBOSS: diffseq22" name="diffseq" version="5.0.0">
+  <description>Find differences between nearly identical sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>diffseq -asequence $input1 -bsequence $input2 -outfile $out_file1 -aoutfeat $out_file2 -boutfeat $out_file3 -wordsize $wordsize -globaldifferences $globaldifferences -rformat3
+  $out_format1 -offormat4 $out_format2 -offormat5 $out_format3 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Sequence 2</label>
+    </param>
+    <param name="wordsize" size="4" type="text" value="20">
+      <label>Wordsize</label>
+    </param>
+    <param name="globaldifferences" type="select">
+      <label>Report differences at the ends</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="diffseq">Diffseq</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="table">Table</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+    <param name="out_format2" type="select">
+      <label>Sequence 1 Output Feature File Format</label>
+      <option value="gff">GFF</option>
+      <option value="embl">EMBL</option>
+      <option value="swiss">SwissProt</option>
+    </param>
+    <param name="out_format3" type="select">
+      <label>Sequence 2 Output Feature File Format</label>
+      <option value="gff">GFF</option>
+      <option value="embl">EMBL</option>
+      <option value="swiss">SwissProt</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="diffseq" name="out_file1" />
+    <data format="gff" name="out_file2" />
+    <data format="gff" name="out_file3" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/diffseq.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_digest.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_digest.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,65 @@
+<tool id="EMBOSS: digest23" name="digest" version="5.0.0">
+  <description>Protein proteolytic enzyme or reagent cleavage digest</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>digest -seqall $input1 -outfile $out_file1 -menu $menu -unfavoured $unfavoured -overlap $overlap -allpartials $allpartials -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="menu" type="select">
+      <label>Enzyme/Reagent</label>
+      <option value="1">Trypsin</option>
+      <option value="2">Lys-C</option>
+      <option value="3">Arg-C</option>
+      <option value="4">Asp-N</option>
+      <option value="5">V8-bicarb</option>
+      <option value="6">V8-phosph</option>
+      <option value="7">Chymotrypsin</option>
+      <option value="8">CNBr</option>
+    </param>
+    <param name="unfavoured" type="select">
+      <label>Trypsin will not normally cut after a K if it is followed by (e.g.) another K or a P. Specifying this shows those cuts, as well as the favoured ones.</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="overlap" type="select">
+      <label>Used for partial digestion. Shows all cuts from favoured cut sites plus 1..3, 2..4, 3..5 etc., but not (e.g.) 2..5. Overlaps are therefore fragments with exactly one potential cut site
+      within them.</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="allpartials" type="select">
+      <label>As for overlap but fragments containing more than one potential cut site are included.</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="seqtable">SeqTable</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="table">Table</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="digest" name="out_file1" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/digest.html
+  </help>
+</tool>
\ No newline at end of file
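The favoured/unfavoured distinction in the digest labels above follows the usual trypsin rule; a simplified Python illustration covering only the no-cut-before-proline case (the function name and example sequence are invented)::

    import re

    def favoured_trypsin_cuts(seq):
        # Cut after K or R unless the next residue is P (simplified;
        # the tool's unfavoured option also reports suppressed cuts).
        return [m.end() for m in re.finditer(r"[KR](?!P)", seq)]

    # The cut after K is suppressed by the following P; only R cuts.
    assert favoured_trypsin_cuts("AKPLRG") == [5]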
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_dotmatcher.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_dotmatcher.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,29 @@
+<tool id="EMBOSS: dotmatcher24" name="dotmatcher" version="5.0.0">
+  <description>Displays a thresholded dotplot of two sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl dotmatcher -asequence $input1 -bsequence $input2 -goutfile $out_file1 -windowsize $windowsize -threshold $threshold -graph png -xygraph png
+  -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Sequence 2</label>
+    </param>
+    <param name="windowsize" size="4" type="text" value="10">
+      <label>Window size</label>
+    </param>
+    <param name="threshold" size="4" type="text" value="23">
+      <label>Threshold</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <!-- functional tests not possible since image output contains file name information and timestamp -->
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dotmatcher.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_dotpath.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_dotpath.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,36 @@
+<tool id="EMBOSS: dotpath25" name="dotpath" version="5.0.0">
+  <description>Non-overlapping wordmatch dotplot of two sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl dotpath -asequence $input1 -bsequence $input2 -goutfile $out_file1 -wordsize $wordsize -overlaps $overlaps -boxit $boxit -graph png
+  -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Sequence 2</label>
+    </param>
+    <param name="wordsize" size="4" type="text" value="4">
+      <label>Word size (Integer 2 or more)</label>
+    </param>
+    <param name="overlaps" type="select">
+      <label>Display the overlapping matches (in red) as well as the minimal set of non-overlapping matches</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="boxit" type="select">
+      <label>Draw a box around dotplot</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <!-- functional tests not possible since image output contains file name information and timestamp -->
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dotpath.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_dottup.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_dottup.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,30 @@
+<tool id="EMBOSS: dottup26" name="dottup" version="5.0.0">
+  <description>Displays a wordmatch dotplot of two sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl dottup -asequence $input1 -bsequence $input2 -goutfile $out_file1 -wordsize $wordsize -boxit $boxit -graph png  -xygraph png -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Sequence 2</label>
+    </param>
+    <param name="wordsize" size="4" type="text" value="4">
+      <label>Word size</label>
+    </param>
+    <param name="boxit" type="select">
+      <label>Draw a box around dotplot</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <!-- functional tests not possible since image output contains file name information and timestamp -->
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dottup.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_dreg.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_dreg.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,22 @@
+<tool id="EMBOSS: dreg27" name="dreg" version="5.0.0">
+  <description>Regular expression search of a nucleotide sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>dreg -sequence $input1 -outfile $out_file1 -pattern "$pattern" -raccshow3 "no" -rusashow3 "no" -rdesshow3 "no" -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param name="pattern" size="50" type="text" value="(AUG)">
+      <label>Regular expression pattern</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="dreg" name="out_file1" />
+  </outputs>
+  <!-- tests not possible since dreg timestamps output file -->  
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dreg.html
+  </help>
+</tool>
\ No newline at end of file
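dreg applies the regular expression to each input nucleotide sequence; the default pattern (AUG) locates start codons, as an equivalent Python search illustrates (example sequence invented)::

    import re

    sequence = "GGAUGCCAUGA"  # invented RNA fragment
    # Report 1-based match positions, as EMBOSS reports do.
    hits = [(m.start() + 1, m.group()) for m in re.finditer("(AUG)", sequence)]
    assert hits == [(3, "AUG"), (8, "AUG")]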
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_einverted.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_einverted.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,50 @@
+<tool id="EMBOSS: einverted28" name="einverted" version="5.0.0">
+  <description>Finds DNA inverted repeats</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>einverted -sequence $input1 -outfile $out_file1 -gap $gap -threshold $threshold -match $match -mismatch $mismatch -maxrepeat $maxrepeat -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="gap" size="4" type="text" value="12">
+      <label>Gap penalty</label>
+    </param>
+    <param name="threshold" size="4" type="text" value="50">
+      <label>Minimum score threshold</label>
+    </param>
+    <param name="match" size="4" type="text" value="3">
+      <label>Match score</label>
+    </param>
+    <param name="mismatch" size="4" type="text" value="-4">
+      <label>Mismatch score</label>
+    </param>
+    <param name="maxrepeat" size="4" type="text" value="2000">
+      <label>Maximum separation between the start of the repeat and the end of the inverted repeat</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="einverted" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="1.fasta"/>
+      <param name="gap" value="12"/>
+      <param name="threshold" value="50"/>
+      <param name="match" value="3"/>
+      <param name="mismatch" value="-4"/>
+      <param name="maxrepeat" value="2000"/>
+      <output name="out_file1" file="emboss_einverted_out.einverted"/>
+    </test>
+  </tests>
+  <help>
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/einverted.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_epestfind.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_epestfind.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,65 @@
+<tool id="EMBOSS: epestfind29" name="epestfind" version="5.0.0">
+  <description>Finds PEST motifs as potential proteolytic cleavage sites</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl epestfind -sequence $input1 -goutfile $ofile2 -outfile $ofile1 -window $window -order $order -potential $potential -poor $poor
+  -invalid $invalid -map $map -threshold $threshold -graph png -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="window" size="4" type="text" value="10">
+      <label>Minimal distance between positively charged amino acids</label>
+    </param>
+    <param name="order" type="select">
+      <label>Sort by</label>
+      <option value="3">Score</option>
+      <option value="1">Length</option>
+      <option value="2">Position</option>
+    </param>
+    <param name="threshold" size="4" type="text" value="5.0">
+      <label>Threshold value to discriminate weak from potential PEST motifs.</label>
+    </param>
+    <param name="potential" type="select">
+      <label>Decide whether potential PEST motifs should be printed</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="poor" type="select">
+      <label>Decide whether poor PEST motifs should be printed</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="invalid" type="select">
+      <label>Decide whether invalid PEST motifs should be printed</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="map" type="select">
+      <label>Decide whether PEST motifs should be mapped to sequence</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="ofile2" />
+    <data format="epestfind" name="ofile1" />
+  </outputs>
+<!--    <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="window" value="10"/>
+      <param name="order" value="3"/>
+      <param name="threshold" value="5.0"/>
+      <param name="potential" value="yes"/>
+      <param name="poor" value="yes"/>
+      <param name="invalid" value="no"/>
+      <param name="map" value="yes"/>
+      <output name="ofile1" file="emboss_epestfind_out.epestfind"/>
+    </test>
+  </tests>  output file contains file location info  -->
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/epestfind.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_equicktandem.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_equicktandem.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,60 @@
+<tool id="EMBOSS: equicktandem31" name="equicktandem" version="5.0.0">
+  <description>Finds tandem repeats</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>equicktandem -sequence $input1 -outfile $out_file1 -origfile $ofile2 -maxrepeat $maxrepeat -threshold $threshold -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="maxrepeat" size="4" type="text" value="600">
+      <label>Maximum repeat size</label>
+    </param>
+    <param name="threshold" size="4" type="text" value="20">
+      <label>Threshold score</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="table">Table</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="table" name="out_file1" />
+    <data format="equicktandem" name="ofile2" />
+  </outputs>
+    <tests>
+    <test>
+      <param name="input1" value="1.fasta"/>
+      <param name="maxrepeat" value="600"/>
+      <param name="threshold" value="20"/>
+      <param name="out_format1" value="table"/>
+      <output name="ofile2" file="emboss_equicktandem_out.equicktandem"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/equicktandem.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_est2genome.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_est2genome.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,103 @@
+<tool id="EMBOSS: est2genome32" name="est2genome" version="5.0.0">
+  <description>Align EST and genomic DNA sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>est2genome -estsequence $input1 -genomesequence $input2 -outfile $out_file1 -match $match -mismatch $mismatch -gappenalty $gappenalty -intronpenalty $intronpenalty -splicepenalty
+  $splicepenalty -minscore $minscore -reverse $reverse -splice $splice -mode $mode -best $best -shuffle $shuffle -seed $seed -align $align -width $width -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>EST sequence(s)</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Genomic sequence</label>
+    </param>
+    <param name="match" size="4" type="text" value="1">
+      <label>Score for matching two bases</label>
+    </param>
+    <param name="mismatch" size="4" type="text" value="1">
+      <label>Cost for mismatching two bases</label>
+    </param>
+    <param name="gappenalty" size="4" type="text" value="2">
+      <label>Cost for deleting a single base in either sequence, excluding introns</label>
+    </param>
+    <param name="intronpenalty" size="4" type="text" value="40">
+      <label>Cost for an intron, independent of length</label>
+    </param>
+    <param name="splicepenalty" size="4" type="text" value="20">
+      <label>Cost for an intron, independent of length and starting/ending on donor-acceptor sites</label>
+    </param>
+    <param name="minscore" size="4" type="text" value="30">
+      <label>Exclude alignments with scores below this threshold</label>
+    </param>
+    <param name="reverse" type="select">
+      <label>Reverse the orientation of the EST sequence</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="splice" type="select">
+      <label>Use donor and acceptor splice sites</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="mode" type="select">
+      <label>Comparison mode</label>
+      <option value="both">Both strands</option>
+      <option value="forward">Forward strand only</option>
+      <option value="reverse">Reverse strand only</option>
+    </param>
+    <param name="best" type="select">
+      <label>Only best comparisons</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="shuffle" size="4" type="text" value="0">
+      <label>Shuffle</label>
+    </param>
+    <param name="seed" size="4" type="text" value="20825">
+      <label>Random number seed</label>
+    </param>
+    <param name="align" type="select">
+      <label>Show the alignment</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="width" size="4" type="text" value="50">
+      <label>Alignment width</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="est2genome" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="input2" value="1.fasta"/>
+      <param name="match" value="1"/>
+      <param name="mismatch" value="1"/>
+      <param name="match" value="1"/>
+      <param name="gappenalty" value="2"/>
+      <param name="intronpenalty" value="40"/>
+      <param name="splicepenalty" value="20"/>
+      <param name="minscore" value="30"/>
+      <param name="reverse" value="no"/>
+      <param name="splice" value="yes"/>
+      <param name="mode" value="both"/>
+      <param name="best" value="yes"/>
+      <param name="shuffle" value="0"/>
+      <param name="seed" value="20825"/>
+      <param name="align" value="no"/>
+      <param name="width" value="50"/>
+      <output name="out_file1" file="emboss_est2genome_out.est2genome"/>
+    </test>
+  </tests>
+  <help>
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/est2genome.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_etandem.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_etandem.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,78 @@
+<tool id="EMBOSS: etandem33" name="etandem" version="5.0.0">
+  <description>Looks for tandem repeats in a nucleotide sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>etandem -sequence $input1 -outfile $out_file1 -origfile $ofile2 -minrepeat $minrepeat -maxrepeat $maxrepeat -threshold $threshold -mismatch $mismatch -uniform $uniform -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="minrepeat" size="4" type="text" value="10">
+      <label>Minimum repeat size</label>
+    </param>
+    <param name="maxrepeat" size="4" type="text" value="10">
+      <label>Maximum repeat size</label>
+    </param>
+    <param name="threshold" size="4" type="text" value="20">
+      <label>Threshold score</label>
+    </param>
+    <param name="mismatch" type="select">
+      <label>Allow N as a mismatch</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="uniform" type="select">
+      <label>Allow uniform consensus</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="table">Table</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="etandem" name="out_file1" />
+    <data format="table" name="ofile2" />    
+  </outputs>
+    <tests>
+    <test>
+      <param name="input1" value="1.fasta"/>
+      <param name="minrepeat" value="10"/>
+      <param name="maxrepeat" value="10"/>
+      <param name="threshold" value="20"/>
+      <param name="mismatch" value="no"/>
+       <param name="uniform" value="no"/>
+      <param name="out_format1" value="table"/>
+      <output name="ofile2" file="emboss_etandem_out.table"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/etandem.html
+  </help>
+</tool>
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_extractfeat.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_extractfeat.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,96 @@
+<tool id="EMBOSS: extractfeat34" name="extractfeat" version="5.0.0">
+  <!-- tool tested with documentation, functional test not designed due to empty files resulting from test input sequences -->  
+  <description>Extract features from a sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>extractfeat -sequence $input1 -outseq $out_file1 -before $before -after $after -source "$source" -type "$type" -sense $sense -minscore $minscore -maxscore $maxscore -tag "$tag" -value
+  "$value" -join $join -featinname $featinname -describe "$describe" -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="before" size="4" type="text" value="0">
+      <label>Number of bases or residues before the feature to include in the extracted sequence</label>
+    </param>
+    <param name="after" size="4" type="text" value="0">
+      <label>Number of bases or residues after the feature to include in the extracted sequence</label>
+    </param>
+    <param name="source" size="4" type="text" value="*">
+      <label>Feature source</label>
+    </param>
+    <param name="type" size="4" type="text" value="*">
+      <label>Feature type</label>
+    </param>
+    <param name="sense" type="select">
+      <label>Feature sense</label>
+      <option value="0">Any sense</option>
+      <option value="1">Forward sense</option>
+      <option value="-1">Reverse sense</option>
+    </param>
+    <param name="minscore" size="4" type="text" value="0.0">
+      <label>Minimum score</label>
+    </param>
+    <param name="maxscore" size="4" type="text" value="0.0">
+      <label>Maximum score</label>
+    </param>
+    <param name="tag" size="4" type="text" value="*">
+      <label>Feature tags</label>
+    </param>
+    <param name="value" size="4" type="text" value="*">
+      <label>Tag values</label>
+    </param>
+    <param name="join" type="select">
+      <label>Join features</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="featinname" type="select">
+      <label>Put feature type in sequence name</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="describe" size="4" type="text" value="">
+      <label>Specify one or more tag names that should be added to the output sequence Description text</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/extractfeat.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_extractseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_extractseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,68 @@
+<tool id="EMBOSS: extractseq35" name="extractseq" version="5.0.0">
+  <description>Extract regions from a sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>extractseq -sequence $input1 -outseq $out_file1 -regions $regions -separate $separate -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="regions" size="20" type="text" value="1-9999999">
+      <label>Regions to extract</label>
+    </param>
+    <param name="separate" type="select">
+      <label>Write each specified region as a separate sequence</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="regions" value="1-9999999"/>
+      <param name="separate" value="no"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_extractseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/extractseq.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_format_corrector.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_format_corrector.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,53 @@
+#EMBOSS format corrector
+
+import operator
+#from galaxy import datatypes
+
+#Properly set file formats after job run
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+#Properly set file formats before job run
+#def exec_before_job(trans, inp_data, out_data, param_dict,tool):
+    #why isn't items an ordered list?
+    items = out_data.items()
+    #lets sort it ourselves....
+    items = sorted(items, key=operator.itemgetter(0))
+    #items is now sorted...
+    
+    #normal filetype correction
+    data_count=1
+    for name, data in items:
+        outputType = param_dict.get( 'out_format'+str(data_count), None )
+        #print "data_count",data_count, "name", name, "outputType", outputType
+        if outputType is not None:
+            if outputType == 'ncbi':
+                outputType = "fasta"
+            elif outputType == 'excel':
+                outputType = "tabular"
+            elif outputType == 'text':
+                outputType = "txt"
+            data = app.datatypes_registry.change_datatype(data, outputType)
+            app.model.context.add( data )
+            app.model.context.flush()
+        data_count+=1
+    
+    #html filetype correction
+    data_count=1
+    for name, data in items:
+        wants_plot = param_dict.get( 'html_out'+str(data_count), None )
+        ext = "html"
+        if wants_plot == "yes":
+            data = app.datatypes_registry.change_datatype(data, ext)
+            app.model.context.add( data )
+            app.model.context.flush()
+        data_count+=1
+    
+    #png file correction
+    data_count=1
+    for name, data in items:
+        wants_plot = param_dict.get( 'plot'+str(data_count), None )
+        ext = "png"
+        if wants_plot == "yes":
+            data = app.datatypes_registry.change_datatype(data, ext)
+            app.model.context.add( data )
+            app.model.context.flush()
+        data_count+=1
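Stripped of the Galaxy app/registry plumbing, the datatype correction this hook applies reduces to a small lookup; a minimal sketch::

    # EMBOSS format names that Galaxy stores under a different datatype;
    # anything not listed passes through unchanged.
    FORMAT_FIXES = {"ncbi": "fasta", "excel": "tabular", "text": "txt"}

    def corrected_datatype(selected_format):
        return FORMAT_FIXES.get(selected_format, selected_format)

    assert corrected_datatype("excel") == "tabular"
    assert corrected_datatype("gff") == "gff"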
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_freak.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_freak.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,36 @@
+<tool id="EMBOSS: freak36" name="freak" version="5.0.0">
+  <description>Residue/base frequency table or plot</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>freak -seqall $input1 -outfile $out_file1 -window $window -letters $letters -graph png -step $step -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="letters" size="5" type="text" value="gc">
+      <label>Residue letters</label>
+    </param>
+    <param name="step" size="5" type="text" value="1">
+      <label>Stepping value</label>
+    </param>
+    <param name="window" size="5" type="text" value="30">
+      <label>Averaging window</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="freak" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="letters" value="gc"/>
+      <param name="step" value="1"/>
+      <param name="window" value="30"/>
+      <output name="out_file1" file="emboss_freak_out.freak"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/freak.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_fuzznuc.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_fuzznuc.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,75 @@
+<tool id="EMBOSS: fuzznuc37" name="fuzznuc" version="5.0.1">
+  <description>Nucleic acid pattern search</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>fuzznuc -sequence $input1 -outfile $out_file1 -pattern '$pattern' -pmismatch $mismatch -complement $complement -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="pattern" size="5" type="text" value="">
+      <label>Search pattern</label>
+      <sanitizer>
+        <valid initial="string.printable">
+         <remove value="&apos;"/>
+        </valid>
+        <mapping initial="none">
+          <add source="&apos;" target=""/>
+        </mapping>
+      </sanitizer>
+    </param>
+    <param name="mismatch" size="5" type="text" value="0">
+      <label>Number of mismatches</label>
+    </param>
+    <param name="complement" type="select">
+      <label>Search complementary strand</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="seqtable">SeqTable</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="simple">SRS Simple</option>
+      <option value="fuzznuc">Fuzznuc Output File</option>
+      <option value="srs">SRS</option>
+      <option value="table">Table</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fuzznuc" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="pattern" value="AA"/>
+      <param name="mismatch" value="0"/>
+      <param name="complement" value="no"/>
+      <param name="out_format1" value="excel"/>
+      <output name="out_file1" file="emboss_fuzznuc_out.tabular"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset must contain sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/fuzznuc.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_fuzzpro.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_fuzzpro.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,44 @@
+<tool id="EMBOSS: fuzzpro38" name="fuzzpro" version="5.0.0">
+  <description>Protein pattern search</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>fuzzpro -sequence $input1 -outfile $out_file1 -pattern "$pattern" -pmismatch $mismatch -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="pattern" size="30" type="text" value="">
+      <label>Search pattern</label>
+    </param>
+    <param name="mismatch" size="5" type="text" value="0">
+      <label>Number of mismatches</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="seqtable">SeqTable</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="table">Table</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fuzzpro" name="out_file1" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/fuzzpro.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_fuzztran.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_fuzztran.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,95 @@
+<tool id="EMBOSS: fuzztran39" name="fuzztran" version="5.0.0">
+  <description>Protein pattern search after translation</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>fuzztran -sequence $input1 -outfile $out_file1 -pattern "$pattern" -pmismatch $mismatch -frame $frame -table $table -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="pattern" size="5" type="text" value="">
+      <label>Search pattern</label>
+    </param>
+    <param name="mismatch" size="5" type="text" value="0">
+      <label>Number of mismatches</label>
+    </param>
+    <param name="frame" type="select">
+      <label>Frame(s) to translate</label>
+      <option value="1">Frame 1</option>
+      <option value="2">Frame 2</option>
+      <option value="3">Frame 3</option>
+      <option value="F">Forward three frames</option>
+      <option value="-1">Frame -1</option>
+      <option value="-2">Frame -2</option>
+      <option value="-3">Frame -3</option>
+      <option value="R">Reverse three frames</option>
+      <option value="6">All six frames</option>
+    </param>
+    <param name="table" type="select">
+      <label>Code to use</label>
+      <option value="0">Standard</option>
+      <option value="1">Standard (with alternative initiation codons)</option>
+      <option value="2">Vertebrate Mitochondrial</option>
+      <option value="3">Yeast Mitochondrial</option>
+      <option value="4">Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option>
+      <option value="5">Invertebrate Mitochondrial</option>
+      <option value="6">Ciliate Macronuclear and Dasycladacean</option>
+      <option value="9">Echinoderm Mitochondrial</option>
+      <option value="10">Euplotid Nuclear</option>
+      <option value="11">Bacterial</option>
+      <option value="12">Alternative Yeast Nuclear</option>
+      <option value="13">Ascidian Mitochondrial</option>
+      <option value="14">Flatworm Mitochondrial</option>
+      <option value="15">Blepharisma Macronuclear</option>
+      <option value="16">Chlorophycean Mitochondrial</option>
+      <option value="21">Trematode Mitochondrial</option>
+      <option value="22">Scenedesmus obliquus</option>
+      <option value="23">Thraustochytrium Mitochondrial</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="table">Table</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fuzztran" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="1.fasta"/>
+      <param name="pattern" value="AA"/>
+      <param name="mismatch" value="0"/>
+      <param name="frame" value="6"/>
+      <param name="table" value="0"/>
+      <param name="out_format1" value="excel"/>
+      <output name="out_file1" file="emboss_fuzztran_out.tabular"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset must contain sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/fuzztran.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_garnier.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_garnier.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,58 @@
+<tool id="EMBOSS: garnier40" name="garnier" version="5.0.0">
+  <description>Predicts protein secondary structure</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>garnier -sequence $input1 -outfile $out_file1 -idc $idc -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="idc" type="select">
+      <label>In their paper, Garnier, Osguthorpe and Robson (GOR) note that if you know something about the secondary structure content of the protein you are analyzing, you can improve the
+      prediction. 'idc' is an index into a set of arrays, dharr[] and dsarr[], which provide 'decision constants' (dch, dcs): offsets applied to the weights for the helix and sheet (extend) terms.
+      idc=0 means the decision constant offsets are not used; idc=1 to 6 selects various combinations of dch and dcs offsets</label>
+      <option value="0">idc 0</option>
+      <option value="1">idc 1</option>
+      <option value="2">idc 2</option>
+      <option value="3">idc 3</option>
+      <option value="4">idc 4</option>
+      <option value="5">idc 5</option>
+      <option value="6">idc 6</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="tagseq">TagSeq</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="table">Table</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="garnier" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="idc" value="0"/>
+      <param name="out_format1" value="excel"/>
+      <output name="out_file1" file="emboss_garnier_out.tabular"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/4.0/emboss/apps/garnier.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_geecee.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_geecee.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,24 @@
+<tool id="EMBOSS: geecee41" name="geecee" version="5.0.0">
+  <description>Calculates fractional GC content of nucleic acid sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>geecee -sequence $input1 -outfile $out_file1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="geecee" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <output name="out_file1" file="emboss_geecee_out.geecee"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/geecee.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_getorf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_getorf.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,129 @@
+<tool id="EMBOSS: getorf42" name="getorf" version="5.0.0">
+  <description>Finds and extracts open reading frames (ORFs)</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>getorf -sequence $input1 -outseq $out_file1 -table $table -minsize $minsize -maxsize $maxsize -find $find -methionine $methionine -circular $circular -reverse $reverse -flanking $flanking
+  -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="table" type="select">
+      <label>Code to use</label>
+      <option value="0">Standard</option>
+      <option value="1">Standard (with alternative initiation codons)</option>
+      <option value="2">Vertebrate Mitochondrial</option>
+      <option value="3">Yeast Mitochondrial</option>
+      <option value="4">Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option>
+      <option value="5">Invertebrate Mitochondrial</option>
+      <option value="6">Ciliate Macronuclear and Dasycladacean</option>
+      <option value="9">Echinoderm Mitochondrial</option>
+      <option value="10">Euplotid Nuclear</option>
+      <option value="11">Bacterial</option>
+      <option value="12">Alternative Yeast Nuclear</option>
+      <option value="13">Ascidian Mitochondrial</option>
+      <option value="14">Flatworm Mitochondrial</option>
+      <option value="15">Blepharisma Macronuclear</option>
+      <option value="16">Chlorophycean Mitochondrial</option>
+      <option value="21">Trematode Mitochondrial</option>
+      <option value="22">Scenedesmus obliquus</option>
+      <option value="23">Thraustochytrium Mitochondrial</option>
+    </param>
+    <param name="minsize" size="10" type="text" value="30">
+      <label>Minimum nucleotide size of ORF to report</label>
+    </param>
+    <param name="maxsize" size="10" type="text" value="1000000">
+      <label>Maximum nucleotide size of ORF to report</label>
+    </param>
+    <param name="find" type="select">
+      <label>What to output</label>
+      <option value="0">Translation of regions between STOP codons</option>
+      <option value="1">Translation of regions between START and STOP codons</option>
+      <option value="2">Nucleic sequences between STOP codons</option>
+      <option value="3">Nucleic sequences between START and STOP codons</option>
+      <option value="4">Nucleotides flanking START codons</option>
+      <option value="5">Nucleotides flanking initial STOP codons</option>
+      <option value="6">Nucleotides flanking ending STOP codons</option>
+    </param>
+    <param name="methionine" type="select">
+      <label>All START codons to code for Methionine</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="circular" type="select">
+      <label>Circular sequence</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="reverse" type="select">
+      <label>Find ORFs in the reverse complement</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="flanking" size="10" type="text" value="100">
+      <label>Number of flanking nucleotides to output</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <!-- <option value="gff">GFF (m)</option> -->
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="minsize" value="30"/>
+      <param name="maxsize" value="1000000"/>
+      <param name="find" value="0"/>
+      <param name="methionine" value="yes"/>
+      <param name="circular" value="no"/>
+      <param name="reverse" value="yes"/>
+      <param name="table" value="0"/>
+      <param name="flanking" value="100"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_getorf_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset must contain sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/getorf.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_helixturnhelix.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_helixturnhelix.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,63 @@
+<tool id="EMBOSS: helixturnhelix43" name="helixturnhelix" version="5.0.0">
+  <description>Report nucleic acid binding motifs</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>helixturnhelix -sequence $input1 -outfile $out_file1 -mean $mean -sd $sd -minsd $minsd -eightyseven $eightyseven -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="mean" size="10" type="text" value="238.71">
+      <label>Mean value</label>
+    </param>
+    <param name="sd" size="10" type="text" value="293.61">
+      <label>Standard Deviation value</label>
+    </param>
+    <param name="minsd" size="10" type="text" value="2.5">
+      <label>Minimum SD</label>
+    </param>
+    <param name="eightyseven" type="select">
+      <label>Use the old (1987) weight data</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="motif">Motif</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="table">Table</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="motif" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="mean" value="238.71"/>
+      <param name="sd" value="293.61"/>
+      <param name="minsd" value="2.5"/>
+      <param name="eightyseven" value="no"/>
+      <param name="out_format1" value="excel"/>
+      <output name="out_file1" file="emboss_helixturnhelix_out.tabular"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/helixturnhelix.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_hmoment.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_hmoment.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,32 @@
+<tool id="EMBOSS: hmoment44" name="hmoment" version="5.0.0">
+  <description>Hydrophobic moment calculation</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>hmoment -seqall $input1 -outfile $out_file1 -window $window -aangle $aangle -graph png -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="window" size="5" type="text" value="10">
+      <label>Window</label>
+    </param>
+    <param name="aangle" size="5" type="text" value="100">
+      <label>Alpha helix angle (degrees)</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="hmoment" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="window" value="10"/>
+      <param name="aangle" value="100"/>
+      <output name="out_file1" file="emboss_hmoment_out.hmoment"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/hmoment.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_iep.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_iep.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,38 @@
+<tool id="EMBOSS: iep45" name="iep" version="5.0.0">
+  <description>Calculates the isoelectric point of a protein</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>iep -sequence $input1 -outfile $out_file1 -step $step -amino $amino -graph png -termini $termini -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="step" size="5" type="text" value=".5">
+      <label>Step value for pH</label>
+    </param>
+    <param name="amino" size="5" type="text" value="1">
+      <label>Number of N-termini</label>
+    </param>
+    <param name="termini" type="select">
+      <label>Include charge at N and C terminus</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="iep" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="step" value="0.5"/>
+      <param name="amino" value="1"/>
+      <param name="termini" value="yes"/>
+      <output name="out_file1" file="emboss_iep_out.iep"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/iep.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_infoseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_infoseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,76 @@
+<tool id="EMBOSS: infoseq46" name="infoseq" version="5.0.0">
+  <!-- info contains file information always -->
+  <description>Displays some simple information about sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>infoseq -sequence $input1 -outfile $out_file1 -html $html_out1 -heading $heading -usa $usa -name $disname -accession $accession -gi $gi -version $version -type $type -length $length -pgc
+  $pgc -description $description -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="html_out1" type="select">
+      <label>Format output as an HTML table</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="heading" type="select">
+      <label>Display column headings</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="usa" type="select">
+      <label>Display the USA of the sequence</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="disname" type="select">
+      <label>Display 'name' column</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="accession" type="select">
+      <label>Display 'accession' column</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="gi" type="select">
+      <label>Display 'GI' column</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="version" type="select">
+      <label>Display 'version' column</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="type" type="select">
+      <label>Display 'type' column</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="length" type="select">
+      <label>Display 'length' column</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="pgc" type="select">
+      <label>Display 'percent GC content' column</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="description" type="select">
+      <label>Display 'description' column</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="txt" name="out_file1" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/infoseq.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_infoseq_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_infoseq_wrapper.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,9 @@
+#! /usr/bin/perl -w
+use strict;
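+
+# Galaxy wrapper for infoseq: run the command line passed in @ARGV and,
+# when the seventh argument (the tool's HTML option) matches "yes", print
+# an "Extension: html" hint on stdout so the result can be handled as HTML.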
+
+my $cmd_string = join (" ",@ARGV);
+my $results = `$cmd_string`;
+if ($ARGV[6]=~/yes/)
+{
+ print "Extension: html\n";
+}
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_isochore.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_isochore.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,82 @@
+<tool id="EMBOSS: isochore47" name="isochore" version="5.0.0">
+  <description>Plots isochores in large DNA sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl isochore -sequence $input1 -outfile $ofile2 -goutfile $ofile1 -graph png -window $window -shift $shift -auto</command>
+  <!--  <command interpreter="perl">emboss_single_outputfile_wrapper.pl isochore -sequence $input1 -goutfile $ofile1 -graph png -window $window -shift $shift -auto</command>-->
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="window" size="4" type="text" value="1000">
+      <label>Window size</label>
+    </param>
+    <param name="shift" size="4" type="text" value="100">
+      <label>Shift increment</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="ofile1" />
+    <data format="isochore" name="ofile2" />
+  </outputs>
+  <!-- <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="window" value="1000"/>
+      <param name="shift" value="100"/>
+      <output name="ofile1" file="emboss_isochore_out.isochore"/> 
+      <output name="ofile2" file="emboss_isochore_out.isochore"/>
+    </test>
+         <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="window" value="1000"/>
+      <param name="shift" value="100"/>
+      <output name="ofile2" file="emboss_isochore_out.isochore"/>
+    </test> 
+  </tests>-->
+  <help>
+
+.. class:: warningmark
+
+The input dataset must contain sequences.
+
+-----
+
+**Syntax**
+
+This application plots GC content over a sequence. It is intended for large sequences such as complete chromosomes or large genomic contigs, although interesting results can also be obtained from shorter sequences. You can view the original documentation here_.    
+
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/isochore.html
+
+- Both **Window size** and **Shift increment** are integers.
+
+-----
+
+**Example**
+
+- Input sequences::
+
+    >hg18_dna range=chrX:151073054-151073376 5'pad=0 3'pad=0 revComp=FALSE strand=? repeatMasking=none
+    TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA
+    GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATT
+    TGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCT
+    GCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGA
+    CAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACT
+    CAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCA
+    GCGGTTTCCAAGAGGATACAGTA
+
+- Output data file::
+
+    Position Percent G+C 1 .. 323
+    80 0.422
+    112 0.460
+    144 0.509
+    176 0.534
+    208 0.553
+    240 0.553
+
+- Output graphics file:
+
+.. image:: ./static/emboss_icons/isochore.png
+
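+-----
+
+**Note**
+
+The windowed values above can be approximated outside of EMBOSS. The
+following is a minimal Python sketch of the same sliding-window idea
+(illustrative only, not part of this tool; isochore adjusts the window for
+short sequences, so exact positions may differ)::
+
+    def gc_windows(seq, window=1000, shift=100):
+        """Yield (window midpoint, G+C fraction) for each full window."""
+        seq = seq.upper()
+        for start in range(0, len(seq) - window + 1, shift):
+            chunk = seq[start:start + window]
+            gc = chunk.count("G") + chunk.count("C")
+            yield start + window // 2, gc / float(window)
+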
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_lindna.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_lindna.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,99 @@
+<tool id="EMBOSS: lindna48" name="lindna" version="5.0.0">
+  <!-- tool produces memory error in ajmem.c -->
+  <description>Draws linear maps of DNA constructs</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>lindna -infile $input1 -graphout png -goutfile $out_file1 -ruler $ruler -blocktype $blocktype -maxgroups $maxgroups -maxlabels $maxlabels -intersymbol $intersymbol -intercolour $intercolour
+  -interticks $interticks -gapsize $gapsize -ticklines $ticklines -textheight $textheight -textlength $textlength -margin $margin -tickheight $tickheight -blockheight $blockheight -rangeheight
+  $rangeheight -gapgroup $gapgroup -postext $postext -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="ruler" type="select">
+      <label>Draw a ruler</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="blocktype" type="select">
+      <label>Type of blocks</label>
+      <option value="Filled">Filled blocks</option>
+      <option value="Open">Open blocks</option>
+      <option value="Outline">Black border</option>
+    </param>
+    <param name="maxgroups" size="4" type="text" value="20">
+      <label>Maximum number of groups</label>
+    </param>
+    <param name="maxlabels" size="6" type="text" value="10000">
+      <label>Maximum number of labels</label>
+    </param>
+    <param name="intersymbol" type="select">
+      <label>Type of junctions between blocks</label>
+      <option value="Straight">Straight</option>
+    </param>
+    <param name="intercolour" type="select">
+      <label>Colour of junctions between blocks</label>
+      <option value="1">Red</option>
+      <option value="0">Black</option>
+      <option value="2">Yellow</option>
+      <option value="3">Green</option>
+      <option value="4">Aquamarine</option>
+      <option value="5">Pink</option>
+      <option value="6">Wheat</option>
+      <option value="7">Grey</option>
+      <option value="8">Brown</option>
+      <option value="9">Blue</option>
+      <option value="10">Blue-violet</option>
+      <option value="11">Cyan</option>
+      <option value="12">Turquoise</option>
+      <option value="13">Magenta</option>
+      <option value="14">Salmon</option>
+      <option value="15">White</option>
+    </param>
+    <param name="interticks" type="select">
+      <label>Horizontal junctions between ticks</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="gapsize" size="6" type="text" value="500">
+      <label>Interval between ticks in the ruler</label>
+    </param>
+    <param name="ticklines" type="select">
+      <label>Vertical lines at the ruler's ticks</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="textheight" size="4" type="text" value="1.0">
+      <label>Height of text. Enter a number &lt;1.0 or &gt;1.0 to decrease or increase the size, respectively</label>
+    </param>
+    <param name="textlength" size="4" type="text" value="1.0">
+      <label>Length of text. Enter a number &lt;1.0 or &gt;1.0 to decrease or increase the size, respectively</label>
+    </param>
+    <param name="margin" size="4" type="text" value="1.0">
+      <label>Width of left margin. This is the region to the left of the groups, where the names of the groups are displayed. Enter a number &lt;1.0 or &gt;1.0 to decrease or increase the size,
+      respectively</label>
+    </param>
+    <param name="tickheight" size="4" type="text" value="1.0">
+      <label>Height of ticks. Enter a number &lt;1.0 or &gt;1.0 to decrease or increase the size, respectively</label>
+    </param>
+    <param name="blockheight" size="4" type="text" value="1.0">
+      <label>Height of blocks. Enter a number &lt;1.0 or &gt;1.0 to decrease or increase the size, respectively</label>
+    </param>
+    <param name="rangeheight" size="4" type="text" value="1.0">
+      <label>Height of range ends. Enter a number &lt;1.0 or &gt;1.0 to decrease or increase the size, respectively</label>
+    </param>
+    <param name="gapgroup" size="4" type="text" value="1.0">
+      <label>Space between groups. Enter a number &lt;1.0 or &gt;1.0 to decrease or increase the size, respectively</label>
+    </param>
+    <param name="postext" size="4" type="text" value="1.0">
+      <label>Space between text and ticks, blocks, and ranges. Enter a number &lt;1.0 or &gt;1.0 to decrease or increase the size, respectively</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/lindna.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_marscan.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_marscan.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,45 @@
+<tool id="EMBOSS: marscan49" name="marscan" version="5.0.0">
+  <description>Finds MAR/SAR sites in nucleic sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>marscan -sequence $input1 -outfile $out_file1 -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="gff">GFF</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="table">Table</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="gff" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="1.fasta"/>
+      <param name="out_format1" value="excel"/>
+      <output name="out_file1" file="emboss_marscan_out.tabular"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/marscan.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_maskfeat.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_maskfeat.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,72 @@
+<tool id="EMBOSS: maskfeat50" name="maskfeat" version="5.0.0">
+  <description>Mask off features of a sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>maskfeat -sequence $input1 -outseq $out_file1 -type "$type" -tolower $tolower -maskchar "$maskchar" -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="type" size="50" type="text" value="repeat*">
+      <label>Feature to mask</label>
+    </param>
+    <param name="tolower" type="select">
+      <label>Mask features by converting to lowercase</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="maskchar" size="1" type="text" value="N">
+      <label>Character to mask with</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="type" value="repeat*"/>
+      <param name="tolower" value="no"/>
+      <param name="maskchar" value="N"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_maskfeat_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/maskfeat.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_maskseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_maskseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,72 @@
+<tool id="EMBOSS: maskseq51" name="maskseq" version="5.0.0">
+  <description>Mask off regions of a sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>maskseq -sequence $input1 -outseq $out_file1 -regions "$regions" -tolower $tolower -maskchar "$maskchar" -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="regions" size="50" type="text" value="">
+      <label>Regions to mask (Example: 1-99)</label>
+    </param>
+    <param name="tolower" type="select">
+      <label>Mask regions by converting to lowercase</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="maskchar" size="1" type="text" value="N">
+      <label>Character to use when masking</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="regions" value="1-3"/>
+      <param name="tolower" value="no"/>
+      <param name="maskchar" value="N"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_maskseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/maskseq.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_matcher.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_matcher.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,57 @@
+<tool id="EMBOSS: matcher52" name="matcher" version="5.0.0">
+  <description>Finds the best local alignments between two sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>matcher -asequence $input1 -bsequence $input2 -outfile $out_file1 -alternatives $alternatives -gapopen $gapopen -gapextend $gapextend -aformat3 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Sequence 2</label>
+    </param>
+    <param name="alternatives" size="4" type="text" value="1">
+      <label>Number of alternative matches</label>
+    </param>
+    <param name="gapopen" size="4" type="text" value="16">
+      <label>Gap penalty</label>
+    </param>
+    <param name="gapextend" size="4" type="text" value="4">
+      <label>Gap length (extension) penalty</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Alignment File Format</label>
+      <option value="markx0">Markx0 (p)</option>
+      <option value="simple">Simple (m)</option>
+      <option value="fasta">FASTA (m)</option>
+      <option value="msf">MSF (m)</option>
+      <option value="srs">SRS (m)</option>
+      <option value="pair">Pair (p)</option>
+      <option value="markx1">Markx1 (p)</option>
+      <option value="markx2">Markx2 (p)</option>
+      <option value="markx3">Markx3 (p)</option>
+      <option value="markx10">Markx10 (p)</option>
+      <option value="srspair">SRS pair (p)</option>
+      <option value="score">Score (p)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="markx0" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="input2" value="1.fasta"/>
+      <param name="alternatives" value="1"/>
+      <param name="gapopen" value="16"/>
+      <param name="gapextend" value="4"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_matcher_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/matcher.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_megamerger.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_megamerger.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,63 @@
+<tool id="EMBOSS: megamerger53" name="megamerger" version="5.0.0">
+  <description>Merge two large overlapping nucleic acid sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>megamerger -asequence $input1 -bsequence $input2 -outseq $out_file1 -outfile $out_file2 -wordsize $wordsize -prefer $prefer -osformat3 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Sequence 2</label>
+    </param>
+    <param name="wordsize" size="4" type="text" value="20">
+      <label>Word size</label>
+    </param>
+    <param name="prefer" type="select">
+      <label>Prefer first sequence when mismatches occur</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+    <data format="txt" name="out_file2" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/megamerger.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_merger.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_merger.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,76 @@
+<tool id="EMBOSS: merger54" name="merger" version="5.0.0">
+  <description>Merge two overlapping nucleic acid sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>merger -asequence $input1 -bsequence $input2 -outseq $out_file1 -outfile $out_file2 -gapopen $gapopen -gapextend $gapextend -osformat4 $out_format1 -aformat3 $out_format2 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Sequence 2</label>
+    </param>
+    <param name="gapopen" size="4" type="text" value="50.0">
+      <label>Gap opening penalty</label>
+    </param>
+    <param name="gapextend" size="4" type="text" value="5.0">
+      <label>Gap extension penalty</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+    <param name="out_format2" type="select">
+      <label>Output Alignment File Format</label>
+      <option value="simple">Simple (m)</option>
+      <option value="fasta">FASTA (m)</option>
+      <option value="msf">MSF (m)</option>
+      <option value="srs">SRS (m)</option>
+      <option value="pair">Pair (p)</option>
+      <option value="markx0">Markx0 (p)</option>
+      <option value="markx1">Markx1 (p)</option>
+      <option value="markx2">Markx2 (p)</option>
+      <option value="markx3">Markx3 (p)</option>
+      <option value="markx10">Markx10 (p)</option>
+      <option value="srspair">SRS pair (p)</option>
+      <option value="score">Score (p)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+    <data format="simple" name="out_file2" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/merger.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_msbar.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_msbar.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,117 @@
+<tool id="EMBOSS: msbar55" name="msbar" version="5.0.0">
+  <description>Mutate sequence beyond all recognition</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>msbar -sequence $input1 -outseq $out_file1 -count $count -point $point -block $block -codon $codon -inframe $inframe -minimum $minimum -maximum $maximum -osformat2 $out_format1
+  -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param name="count" size="4" type="text" value="1">
+      <label>Number of times to perform the mutation operations</label>
+    </param>
+    <param name="point" type="select">
+      <label>Types of point mutations to perform</label>
+      <option value="0">None</option>
+      <option value="1">Any of the following</option>
+      <option value="2">Insertions</option>
+      <option value="3">Deletions</option>
+      <option value="4">Changes</option>
+      <option value="5">Duplications</option>
+      <option value="6">Moves</option>
+    </param>
+    <param name="block" type="select">
+      <label>Types of block mutations to perform</label>
+      <option value="0">None</option>
+      <option value="1">Any of the following</option>
+      <option value="2">Insertions</option>
+      <option value="3">Deletions</option>
+      <option value="4">Changes</option>
+      <option value="5">Duplications</option>
+      <option value="6">Moves</option>
+    </param>
+    <param name="codon" type="select">
+      <label>Types of codon mutations to perform. These are only done if the sequence is nucleic</label>
+      <option value="0">None</option>
+      <option value="1">Any of the following</option>
+      <option value="2">Insertions</option>
+      <option value="3">Deletions</option>
+      <option value="4">Changes</option>
+      <option value="5">Duplications</option>
+      <option value="6">Moves</option>
+    </param>
+    <param name="inframe" type="select">
+      <label>Do 'codon' and 'block' operations in frame</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="minimum" size="4" type="text" value="1">
+      <label>Minimum size for a block mutation</label>
+    </param>
+    <param name="maximum" size="4" type="text" value="10">
+      <label>Maximum size for a block mutation</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="count" value="1"/>
+      <param name="point" value="0"/>
+      <param name="block" value="0"/>
+      <param name="codon" value="0"/>
+      <param name="inframe" value="no"/>
+      <param name="minimum" value="1"/>
+      <param name="maximum" value="10"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_msbar_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/msbar.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_multiple_outputfile_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_multiple_outputfile_wrapper.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+#! /usr/bin/perl -w
+use strict;
+
+# Galaxy wrapper for EMBOSS programs that announce the files they create on
+# stdout: run the wrapped command, echo its output, and rewrite each
+# "Created <file>" line so that Galaxy can locate the output file.
+my $cmd_string = join(" ", @ARGV);
+my $results    = `$cmd_string`;
+my @lines      = split("\n", $results);
+foreach my $thisLine (@lines)
+{
+    if ($thisLine =~ /Created /)
+    {
+        # Keep only the trailing file name (word characters and dots).
+        $thisLine =~ /[\w.]+$/;
+        $thisLine = $&;
+        print "outfile: $thisLine\n";
+    }
+    else
+    {
+        print $thisLine, "\n";
+    }
+}
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_needle.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_needle.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,139 @@
+<tool id="EMBOSS: needle56" name="needle" version="5.0.0">
+  <description>Needleman-Wunsch global alignment</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>needle -asequence $input1 -bsequence $input2 -outfile $out_file1 -gapopen $gapopen -gapextend $gapextend -brief $brief -aformat3 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param format="fasta" name="input2" type="data">
+      <label>Sequence 2</label>
+    </param>
+    <param name="gapopen" size="4" type="text" value="10.0">
+      <label>Gap open penalty</label>
+    </param>
+    <param name="gapextend" size="4" type="text" value="0.5">
+      <label>Gap extension penalty</label>
+    </param>
+    <param name="brief" type="select">
+      <label>Brief identity and similarity</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Alignment File Format</label>
+      <option value="srspair">SRS pair (p)</option>
+      <option value="simple">Simple (m)</option>
+      <option value="fasta">FASTA (m)</option>
+      <option value="msf">MSF (m)</option>
+      <option value="srs">SRS (m)</option>
+      <option value="pair">Pair (p)</option>
+      <option value="markx0">Markx0 (p)</option>
+      <option value="markx1">Markx1 (p)</option>
+      <option value="markx2">Markx2 (p)</option>
+      <option value="markx3">Markx3 (p)</option>
+      <option value="markx10">Markx10 (p)</option>
+      <option value="score">Score (p)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="needle" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="input2" value="1.fasta"/>
+      <param name="gapopen" value="10"/>
+      <param name="gapextend" value="0.5"/>
+      <param name="brief" value="yes"/>
+      <param name="out_format1" value="score"/>
+      <output name="out_file1" file="emboss_needle_out.score"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+needle reads any two sequences of the same type (DNA or protein).
+
+-----
+
+**Syntax**
+
+This tool uses the Needleman-Wunsch global alignment algorithm to find the optimum alignment (including gaps) of two sequences when considering their entire length. 
+
+- **Optimal alignment:** Dynamic programming methods ensure the optimal global alignment by exploring all possible alignments and choosing the best.
+
+- **The Needleman-Wunsch algorithm** is a member of the class of algorithms that can calculate the best score and alignment in the order of mn steps (where 'n' and 'm' are the lengths of the two sequences).
+
+- **Gap open penalty:** [10.0 for any sequence] The gap open penalty is the score taken away when a gap is created. The best value depends on the choice of comparison matrix. The default value assumes you are using the EBLOSUM62 matrix for protein sequences, and the EDNAFULL matrix for nucleotide sequences. (Floating point number from 1.0 to 100.0)
+
+- **Gap extension penalty:** [0.5 for any sequence] The gap extension penalty is added to the standard gap penalty for each base or residue in the gap. This is how long gaps are penalized. Usually you will expect a few long gaps rather than many short gaps, so the gap extension penalty should be lower than the gap penalty. An exception is where one or both sequences are single reads with possible sequencing errors in which case you would expect many single base gaps. You can get this result by setting the gap open penalty to zero (or very low) and using the gap extension penalty to control gap scoring. (Floating point number from 0.0 to 10.0)
+
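+- **Worked sketch:** For orientation, a minimal Python sketch of the Needleman-Wunsch score recurrence is shown below (illustrative only: it uses a simple linear gap penalty and EDNAFULL-like +5/-4 match/mismatch scores, not the affine gap open/extend scheme needle actually applies)::
+
+    def nw_score(a, b, match=5, mismatch=-4, gap=-10):
+        # Global alignment score, filling the DP matrix one row at a time.
+        prev = [j * gap for j in range(len(b) + 1)]  # row 0: all-gap prefixes
+        for i, ca in enumerate(a, 1):
+            cur = [i * gap]  # column 0: all-gap prefix of the first sequence
+            for j, cb in enumerate(b, 1):
+                diag = prev[j - 1] + (match if ca == cb else mismatch)
+                cur.append(max(diag, prev[j] + gap, cur[j - 1] + gap))
+            prev = cur
+        return prev[-1]
+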
+You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/needle.html
+
+-----
+
+**Example**
+
+- Input File::
+
+    >hg18_dna range=chrX:151073054-151073136 5'pad=0 3'pad=0 revComp=FALSE strand=? repeatMasking=none
+    TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA
+    GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTG
+
+- If both Sequence1 and Sequence2 take the above file as input, Gap open penalty equals 10.0, Gap extension penalty equals 0.5, Brief identity and similarity is set to Yes, and Output Alignment File Format is set to SRS pair, the output file is::
+
+    ########################################
+    # Program: needle
+    # Rundate: Mon Apr 02 2007 14:23:16
+    # Align_format: srspair
+    # Report_file: ./database/files/dataset_7.dat
+    ########################################
+     
+    #=======================================
+    #
+    # Aligned_sequences: 2
+    # 1: hg18_dna
+    # 2: hg18_dna
+    # Matrix: EDNAFULL
+    # Gap_penalty: 10.0
+    # Extend_penalty: 0.5
+    #
+    # Length: 83
+    # Identity:      83/83 (100.0%)
+    # Similarity:    83/83 (100.0%)
+    # Gaps:           0/83 ( 0.0%)
+    # Score: 415.0
+    #
+    #=======================================
+
+    hg18_dna           1 TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA     50
+                       ||||||||||||||||||||||||||||||||||||||||||||||||||
+    hg18_dna           1 TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA     50
+        
+    hg18_dna          51 GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTG     83
+                       |||||||||||||||||||||||||||||||||
+    hg18_dna          51 GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTG     83
+        
+    #---------------------------------------
+    #---------------------------------------
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_newcpgreport.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_newcpgreport.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,54 @@
+<tool id="EMBOSS: newcpgreport57" name="newcpgreport" version="5.0.0">
+  <description>Report CpG rich areas</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>newcpgreport -sequence $input1 -window $window -shift $shift -minlen $minlen -minpc $minpc -outfile $out_file1 -minoe $minoe -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="window" size="4" type="text" value="100">
+      <label>Window Size</label>
+    </param>
+    <param name="shift" size="4" type="text" value="1">
+      <label>Step size (shift)</label>
+    </param>
+    <param name="minlen" size="4" type="text" value="200">
+      <label>Minimum length</label>
+    </param>
+    <param name="minoe" size="4" type="text" value="0.6">
+      <label>Minimum average observed to expected ratio</label>
+    </param>
+    <param name="minpc" size="4" type="text" value="50.0">
+      <label>Minimum average percentage of G plus C</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="newcpgreport" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="window" value="100"/>
+      <param name="shift" value="1"/>
+      <param name="minlen" value="200"/>
+      <param name="minoe" value="0.6"/>
+      <param name="minpc" value="50.0"/>
+      <output name="out_file1" file="emboss_newcpgreport_out.newcpgreport"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/newcpgreport.html
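+
+    For orientation, the "Minimum average observed to expected ratio" is conventionally computed per window as (CpG count × window length) / (C count × G count); a minimal, illustrative Python sketch of that calculation (not part of this tool) is::
+
+        def obs_exp_cpg(window):
+            # Observed/expected CpG ratio for one sequence window.
+            n_c, n_g = window.count("C"), window.count("G")
+            n_cg = window.count("CG")
+            if n_c == 0 or n_g == 0:
+                return 0.0
+            return (n_cg * len(window)) / (n_c * n_g)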
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_newcpgseek.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_newcpgseek.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,35 @@
+<tool id="EMBOSS: newcpgseek58" name="newcpgseek" version="5.0.0">
+  <description>Reports CpG rich regions</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>newcpgseek -sequence $input1 -outfile $out_file1 -score $score -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="score" size="4" type="text" value="17">
+      <label>CpG score</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="newcpgseek" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="score" value="17"/>
+      <output name="out_file1" file="emboss_newcpgseek_out.newcpgseek"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/newcpgseek.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_newseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_newseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,72 @@
+<tool id="EMBOSS: newseq59" name="newseq" version="5.0.0">
+  <description>Type in a short new sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>newseq -outseq $out_file1 -name "$seqname" -description "$description" -type $type -sequence "$sequence" -osformat5 $out_format1 -auto</command>
+  <inputs>
+    <param name="seqname" size="10" type="text" value="">
+      <label>Name of the sequence</label>
+    </param>
+    <param name="description" size="10" type="text" value="">
+      <label>Description of the sequence</label>
+    </param>
+    <param name="type" type="select">
+      <label>Type of sequence</label>
+      <option value="N">Nucleic</option>
+      <option value="P">Protein</option>
+    </param>
+    <param name="sequence" size="50" type="text" value="">
+      <label>The sequence itself</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="seqname" value="cytoc"/>
+      <param name="description" value="fragment of cytochrome c"/>
+      <param name="type" value="N"/>
+      <param name="sequence" value="KKKEERADLIAY"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_newseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/newseq.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_noreturn.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_noreturn.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,31 @@
+<tool id="EMBOSS: noreturn60" name="noreturn" version="5.0.0">
+  <description>Removes carriage return from ASCII files</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>noreturn -infile $input1 -outfile $out_file1 -system $system -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="system" type="select">
+      <label>Target operating system for end-of-line format</label>
+      <option value="unix">Unix/Linux systems</option>
+      <option value="pc">Windows/DOS</option>
+      <option value="mac">Apple Macintosh</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="noreturn" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="system" value="unix"/>
+      <output name="out_file1" file="emboss_noreturn_out.noreturn"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/noreturn.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_notseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_notseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,69 @@
+<tool id="EMBOSS: notseq61" name="notseq" version="5.0.0">
+  <description>Exclude a set of sequences and write out the remaining ones</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>notseq -sequence $input1 -outseq $out_file1 -exclude "$exclude" -osformat3 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>On query</label>
+    </param>
+    <param name="exclude" size="50" type="text" value="">
+      <label>Exclude list</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="exclude" value="AA"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_notseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/notseq.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_nthseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_nthseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,69 @@
+<tool id="EMBOSS: nthseq62" name="nthseq" version="5.0.0">
+  <description>Writes one sequence from a multiple set of sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>nthseq -sequence $input1 -outseq $out_file1 -number $number -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="number" size="4" type="text" value="1">
+      <label>Number of the sequence to output</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="number" value="1"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_nthseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/nthseq.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_octanol.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_octanol.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,38 @@
+<tool id="EMBOSS: octanol63" name="octanol" version="5.0.0">
+  <!-- graphics output with filename, no functional tests added -->  
+  <description>Displays protein hydropathy</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl octanol -sequence $input1 -graph png -goutfile $out_file1 -width $width -octanolplot $octanolplot -interfaceplot $interfaceplot
+  -differenceplot $differenceplot -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="width" size="4" type="text" value="19">
+      <label>Window size</label>
+    </param>
+    <param name="octanolplot" type="select">
+      <label>Display the octanol plot</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="interfaceplot" type="select">
+      <label>Display the interface plot</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="differenceplot" type="select">
+      <label>Display the difference plot</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/octanol.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_oddcomp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_oddcomp.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,50 @@
+<tool id="EMBOSS: oddcomp64" name="oddcomp" version="5.0.0">
+  <!-- output contains file location info, commented out functional tests -->
+  <description>Find protein sequence regions with a biased composition</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>oddcomp -sequence $input1 -infile $input2 -outfile $out_file1 -window $window -ignorebz $ignorebz -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>This is a file in the format of the output produced by 'compseq' that is used to set the minimum frequencies of words in this analysis</label>
+    </param>
+    <param name="window" size="4" type="text" value="30">
+      <label>This is the size of window in which to count. Thus if you want to count frequencies in a 40 aa stretch you should enter 40 here</label>
+    </param>
+    <param name="ignorebz" type="select">
+      <label>The amino acid code B represents Asparagine or Aspartic acid and the code Z represents Glutamine or Glutamic acid. These are not commonly used codes and you may wish not to count words
+      containing them, just noting them in the count of 'Other' words</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="oddcomp" name="out_file1" />
+  </outputs>
+<!--    <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="input2" value="emboss_compseq_out.compseq"/>
+      <param name="window" value="30"/>
+      <param name="ignorebz" value="yes"/>
+      <output name="out_file1" file="emboss_oddcomp_out.oddcomp"/>
+    </test>
+  </tests> -->
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/oddcomp.html
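+
+    As a rough illustration of the windowed word counting involved (an assumed sketch, not oddcomp's exact algorithm), overlapping words of a fixed size can be tallied in each window of a protein sequence like so::
+
+        from collections import Counter
+
+        def window_word_counts(seq, window=30, word=2):
+            # Tally overlapping words of length `word` in each window of `seq`.
+            for start in range(len(seq) - window + 1):
+                win = seq[start:start + window]
+                yield start, Counter(win[i:i + word] for i in range(window - word + 1))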
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_palindrome.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_palindrome.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,53 @@
+<tool id="EMBOSS: palindrome65" name="palindrome" version="5.0.0">
+  <description>Looks for inverted repeats in a nucleotide sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>palindrome -sequence $input1 -outfile $out_file1 -minpallen $minpallen -maxpallen $maxpallen -gaplimit $gaplimit -nummismatches $nummismatches -overlap $overlap -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="minpallen" size="4" type="text" value="10">
+      <label>Minimum length of palindrome</label>
+    </param>
+    <param name="maxpallen" size="4" type="text" value="100">
+      <label>Maximum length of palindrome</label>
+    </param>
+    <param name="gaplimit" size="4" type="text" value="100">
+      <label>Maximum gap between repeated regions</label>
+    </param>
+    <param name="nummismatches" size="4" type="text" value="0">
+      <label>Number of mismatches allowed</label>
+    </param>
+    <param name="overlap" type="select">
+      <label>Report overlapping matches</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="palindrome" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="minpallen" value="10"/>
+      <param name="maxpallen" value="100"/>
+      <param name="gaplimit" value="100"/>
+      <param name="nummismatches" value="0"/>
+      <param name="overlap" value="yes"/>
+      <output name="out_file1" file="emboss_palindrome_out.palindrome"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/palindrome.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_pasteseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_pasteseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,73 @@
+<tool id="EMBOSS: pasteseq66" name="pasteseq" version="5.0.0">
+  <description>Insert one sequence into another</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>pasteseq -asequence $input2 -bsequence $input1 -outseq $out_file1 -pos $pos -osformat3 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Main sequence</label>
+    </param>
+    <param format="fasta" name="input2" type="data">
+      <label>Sequence to insert</label>
+    </param>
+    <param name="pos" size="4" type="text" value="0">
+      <label>Position to insert after</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="1.fasta"/>
+      <param name="input2" value="2.fasta"/>
+      <param name="pos" value="0"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_pasteseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input datasets need to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pasteseq.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_patmatdb.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_patmatdb.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,49 @@
+<tool id="EMBOSS: patmatdb67" name="patmatdb" version="5.0.0">
+  <description>Search a protein sequence with a motif</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>patmatdb -sequence $input1 -outfile $out_file1 -motif "$motif" -rformat3 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Main sequence</label>
+    </param>
+    <param name="motif" size="4" type="text" value="">
+      <label>Motif to search for</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="dbmotif">DbMotif</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="table">Table</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="dbmotif" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="motif" value="aa"/>
+      <param name="out_format1" value="excel"/>
+      <output name="out_file1" file="emboss_patmatdb_out.tabular"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/patmatdb.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_pepcoil.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_pepcoil.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,46 @@
+<tool id="EMBOSS: pepcoil68" name="pepcoil" version="5.0.0">
+  <description>Predicts coiled coil regions</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>pepcoil -sequence $input1 -outfile $out_file1 -window $window -coil $coil -frame $frame -other $other -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="window" size="4" type="text" value="28">
+      <label>Window size</label>
+    </param>
+    <param name="coil" type="select">
+      <label>Report coiled coil regions</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="frame" type="select">
+      <label>Show coil frameshifts</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="other" type="select">
+      <label>Report non coiled coil regions</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="pepcoil" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="window" value="28"/>
+      <param name="coil" value="yes"/>
+      <param name="frame" value="yes"/>
+      <param name="other" value="yes"/>
+      <output name="out_file1" file="emboss_pepcoil_out.pepcoil"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepcoil.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_pepinfo.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_pepinfo.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,28 @@
+<tool id="EMBOSS: pepinfo69" name="pepinfo" version="5.0.0">
+  <!-- puts file info in output files -->
+  <description>Plots simple amino acid properties in parallel</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl pepinfo -sequence $input1 -outfile $out_file1 -goutfile $out_file2 -graph png -hwindow $hwindow $plot_type -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="hwindow" size="4" type="text" value="9">
+      <label>Window size for hydropathy averaging</label>
+    </param>
+    <param name="plot_type" type="select">
+      <label>Choose a plot type</label>
+      <option value="-generalplot yes -hydropathyplot no">Histogram of general properties</option>
+      <option value="-generalplot no -hydropathyplot yes">Graphs of hydropathy</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="pepinfo" name="out_file1" />
+    <data format="png" name="out_file2" />
+  </outputs>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepinfo.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_pepnet.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_pepnet.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,33 @@
+<tool id="EMBOSS: pepnet70" name="pepnet" version="5.0.0">
+  <!-- graphical output file with path information -->
+  <description>Displays proteins as a helical net</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>pepnet -sequence $input1 -graph png -goutfile $out_file1 -squares $squares -diamonds $diamonds -octags $octags -amphipathic $amphipathic -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="squares" size="10" type="text" value="ILVM">
+      <label>Residues to mark with squares</label>
+    </param>
+    <param name="diamonds" size="10" type="text" value="DENQST">
+      <label>Residues to mark with diamonds</label>
+    </param>
+    <param name="octags" size="10" type="text" value="HKR ">
+      <label>Residues to mark with octagons</label>
+    </param>
+    <param name="amphipathic" type="select">
+      <label>If this is true then the residues ACFGILMVWY are marked as squares and all other residues are unmarked. This overrides any other markup that you may have specified</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepnet.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_pepstats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_pepstats.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,30 @@
+<tool id="EMBOSS: pepstats71" name="pepstats" version="5.0.0">
+  <description>Protein statistics</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>pepstats -sequence $input1 -outfile $out_file1 -termini $termini -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="termini" type="select">
+      <label>Include charge at N and C terminus</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="pepstats" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="termini" value="yes"/>
+      <output name="out_file1" file="emboss_pepstats_out.pepstats"/>
+    </test>
+  </tests>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepstats.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_pepwheel.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_pepwheel.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,45 @@
+<tool id="EMBOSS: pepwheel72" name="pepwheel" version="5.0.0">
+  <!-- produces png file -->
+  <description>Shows protein sequences as helices</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl pepwheel -sequence $input1 -graph png -goutfile $out_file1 -squares $squares -diamonds $diamonds -octags $octags -amphipathic
+  $amphipathic -steps $steps -turns $turns -wheel $wheel -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="steps" size="10" type="text" value="18">
+      <label>Steps, the number of residues plotted per turn is this value divided by the 'turns' value</label>
+    </param>
+    <param name="turns" size="10" type="text" value="5">
+      <label>Turns, the number of residues plotted per turn is the 'steps' value divided by this value</label>
+    </param>
+    <param name="squares" size="10" type="text" value="ILVM">
+      <label>Residues to mark with squares</label>
+    </param>
+    <param name="diamonds" size="10" type="text" value="DENQST">
+      <label>Residues to mark with diamonds</label>
+    </param>
+    <param name="octags" size="10" type="text" value="HKR">
+      <label>Residues to mark with octagons</label>
+    </param>
+    <param name="wheel" type="select">
+      <label>Plot the wheel</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="amphipathic" type="select">
+      <label>If this is true then the residues ACFGILMVWY are marked as squares and all other residues are unmarked. This overrides any other markup that you may have specified</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepwheel.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_pepwindow.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_pepwindow.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,22 @@
+<tool id="EMBOSS: pepwindow73" name="pepwindow" version="5.0.0">
+  <!-- produces png file -->
+  <description>Displays protein hydropathy</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl pepwindow -sequence $input1 -graph png -goutfile $out_file1 -length $length -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="length" size="10" type="text" value="7">
+      <label>Window size</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepwindow.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_pepwindowall.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_pepwindowall.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,22 @@
+<tool id="EMBOSS: pepwindowall74" name="pepwindowall" version="5.0.0">
+  <!-- produces png file -->
+  <description>Displays protein hydropathy of a set of sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl pepwindowall -sequence $input1 -graph png -goutfile $out_file1 -length $length -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="length" size="10" type="text" value="7">
+      <label>Window size</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepwindowall.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_plotcon.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_plotcon.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,22 @@
+<tool id="EMBOSS: plotcon75" name="plotcon" version="5.0.0">
+  <!-- produces png file -->
+  <description>Plot quality of conservation of a sequence alignment</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl plotcon -sequences $input1 -graph png -goutfile $out_file1 -winsize $winsize -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="winsize" size="10" type="text" value="4">
+      <label>Number of columns to average alignment quality over</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/plotcon.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_plotorf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_plotorf.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,40 @@
+<tool id="EMBOSS: plotorf76" name="plotorf" version="5.0.0">
+  <!-- produces png file output -->
+  <description>Plot potential open reading frames</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl plotorf -sequence $input1 -graph png -goutfile $out_file1 -start $start -stop $stop -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="start" size="15" type="text" value="ATG">
+      <label>Start codons</label>
+    </param>
+    <param name="stop" size="15" type="text" value="TAA">
+      <label>Stop codons</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <!--  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="start" value="ATG"/>
+      <param name="stop" value="TAA"/>
+      <output name="out_file1" file="emboss_plotorf_out.png"/>
+    </test>
+  </tests> -->
+  <help>
+
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/plotorf.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_polydot.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_polydot.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,48 @@
+<tool id="EMBOSS: polydot77" name="polydot" version="5.0.0">
+  <!-- produces png file, not added functional tests -->
+  <description>Displays all-against-all dotplots of a set of sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl polydot -sequence $input1 -graph png -goutfile $output2 -outfeat $output1 -wordsize $wordsize -boxit $boxit -dumpfeat yes -gap
+  $gap -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="wordsize" size="10" type="text" value="6">
+      <label>Word size</label>
+    </param>
+    <param name="boxit" type="select">
+      <label>Draw a box around each dotplot</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="gap" size="10" type="text" value="10">
+      <label>Size of gap</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="polydot" name="output1" />
+    <data format="png" name="output2" />
+  </outputs>
+<!--    <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="wordsize" value="6"/>
+      <param name="boxit" value="yes"/>
+      <param name="gap" value="10"/>
+      <output name="output1" file="emboss_polydot_out.png"/>
+    </test>
+  </tests>-->
+  <help>
+
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/polydot.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_preg.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_preg.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,21 @@
+<tool id="EMBOSS: preg78" name="preg" version="5.0.0">
+  <description>Regular expression search of a protein sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>preg -sequence $input1 -outfile $out_file1 -pattern "$pattern" -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param name="pattern" size="50" type="text" value="(ACD)">
+      <label>Regular expression pattern</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="preg" name="out_file1" />
+  </outputs>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/preg.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_prettyplot.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_prettyplot.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,113 @@
+<tool id="EMBOSS: prettyplot79" name="prettyplot" version="5.0.0">
+  <!-- produces png output with file name -->
+  <description>Displays aligned sequences, with colouring and boxing</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>prettyplot -sequences $input1 -graph png -goutfile $out_file1 -residuesperline $residuesperline -resbreak $resbreak -ccolours $ccolours -cidentity $cidentity -csimilarity $csimilarity
+  -cother $cother -docolour $docolour -gtitle $title -pair $pair -identity $identity -box $box -boxcol $boxcol -boxcolval $boxcolval -name $name -maxnamelen $maxnamelen -number $number -listoptions
+  $listoptions -consensus $consensus -collision $collision -alternative $alternative -showscore $showscore -portrait $portrait -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="residuesperline" size="10" type="text" value="50">
+      <label>The number of residues to be displayed on each line</label>
+    </param>
+    <param name="resbreak" size="10" type="text" value="50">
+      <label>Residues before a space</label>
+    </param>
+    <param name="ccolours" type="select">
+      <label>Colour residues by their consensus value</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="cidentity" size="10" type="text" value="RED">
+      <label>Colour to display identical residues</label>
+    </param>
+    <param name="csimilarity" size="10" type="text" value="GREEN">
+      <label>Colour to display similar residues</label>
+    </param>
+    <param name="cother" size="10" type="text" value="BLACK">
+      <label>Colour to display other residues</label>
+    </param>
+    <param name="docolour" type="select">
+      <label>Colour residues by table (oily, amide, etc.)</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="title" type="select">
+      <label>Display the title</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="pair" size="10" type="text" value="1.5,1.0,0.5">
+      <label>Values to represent identical, similar, related</label>
+    </param>
+    <param name="identity" size="10" type="text" value="0">
+      <label>Only match those which are identical in all sequences</label>
+    </param>
+    <param name="box" type="select">
+      <label>Display prettyboxes</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="boxcol" type="select">
+      <label>Colour the background in the boxes</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="boxcolval" size="10" type="text" value="GREY">
+      <label>Colour to be used for background</label>
+    </param>
+    <param name="name" type="select">
+      <label>Display the sequence names</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="maxnamelen" size="10" type="text" value="10">
+      <label>Margin size for the sequence name</label>
+    </param>
+    <param name="number" type="select">
+      <label>Display the residue number</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="listoptions" type="select">
+      <label>Display the date and options used</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="consensus" type="select">
+      <label>Display the consensus</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="collision" type="select">
+      <label>Allow collisions in calculating consensus</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="alternative" type="select">
+      <label>Use alternative collisions routine</label>
+      <option value="0">Normal collision check</option>
+      <option value="1">Checks identical scores with the max score found. So if any other residue matches the identical score then a collision has occurred</option>
+      <option value="2">If another residue has a greater than or equal to matching score and these do not match then a collision has occurred</option>
+      <option value="3">Checks all those not in the current consensus.If any of these give a top score for matching or identical scores then a collision has occured</option>
+    </param>
+    <param name="showscore" size="10" type="text" value="-1">
+      <label>Print residue scores</label>
+    </param>
+    <param name="portrait" type="select">
+      <label>Set page to Portrait</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/prettyplot.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_prettyseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_prettyseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,53 @@
+<tool id="EMBOSS: prettyseq80" name="prettyseq" version="5.0.0">
+  <description>Output sequence with translated ranges</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>prettyseq -sequence $input1 -outfile $out_file1 -ruler $ruler -plabel $plabel -nlabel $nlabel -width $width -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="ruler" type="select">
+      <label>Add a ruler</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="plabel" type="select">
+      <label>Number translations</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="nlabel" type="select">
+      <label>Number DNA sequence</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="width" size="4" type="text" value="60">
+      <label>Width of screen</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="prettyseq" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="ruler" value="yes"/>
+      <param name="plabel" value="yes"/>
+      <param name="nlabel" value="yes"/>
+      <param name="width" value="60"/>
+      <output name="out_file1" file="emboss_prettyseq_out.prettyseq"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+The input dataset must contain sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/prettyseq.html
+  </help>
+</tool>
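
Note on the <command> template above: at run time Galaxy substitutes every $variable with the user's parameter choices and its managed dataset paths (the templates are Cheetah). A minimal sketch of the rendered prettyseq call, with hypothetical paths:

    prettyseq -sequence /galaxy/database/files/dataset_1.dat -outfile /galaxy/database/files/dataset_2.dat -ruler yes -plabel yes -nlabel yes -width 60 -auto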
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_primersearch.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_primersearch.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,33 @@
+<tool id="EMBOSS: primersearch81" name="primersearch" version="5.0.0">
+  <description>Searches DNA sequences for matches with primer pairs</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>primersearch -seqall $input1 -infile $input2 -outfile $out_file1 -mismatchpercent $mismatchpercent -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Main sequences</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Primer file</label>
+    </param>
+    <param name="mismatchpercent" size="4" type="text" value="0">
+      <label>Allowed percent mismatch</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="primersearch" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="input2" value="emboss_primersearch.fasta"/>
+      <param name="mismatchpercent" value="0"/>
+      <output name="out_file1" file="emboss_primersearch_out.primersearch"/>
+    </test>
+  </tests>
+  <help>
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/primersearch.html
+  </help>
+</tool>
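
Note on the primer input: primersearch's -infile expects one primer pair per line, an identifier followed by the forward and reverse primer sequences, whitespace-separated. A small illustrative file (names and sequences hypothetical):

    PCRpair1    CTGGTCATCGAAATCACAGG    GGTGTTCTTGCCACTGTAGC
    PCRpair2    GGGATCCGGCTCAAGTCGAA    CCTTGGCATCGGTCATCGTA

The allowed percent mismatch above controls how imperfectly each primer may match, as a percentage of its length.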
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_revseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_revseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,77 @@
+<tool id="EMBOSS: revseq82" name="revseq" version="5.0.0">
+  <description>Reverse and complement a sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>revseq -sequence $input1 -outseq $out_file1 -reverse $reverse -complement $complement -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="reverse" type="select">
+      <label>Reverse the sequence</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="complement" type="select">
+      <label>Complement the sequence</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="reverse" value="yes"/>
+      <param name="complement" value="yes"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_revseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset must contain sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/revseq.html
+  </help>
+</tool>
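
Note on the <code file="emboss_format_corrector.py" /> hook used by this and several later wrappers: it lets the tool relabel its output datatype to match the user's -osformat2 choice instead of the fixed fasta declared in <outputs>. A minimal sketch of what such a hook might look like, assuming Galaxy's exec_before_job code-file convention; the body and format map are illustrative, not the changeset's actual file:

    def exec_before_job(app, inp_data, out_data, param_dict, tool):
        """Hypothetical sketch; the real emboss_format_corrector.py may differ."""
        # EMBOSS format names that Galaxy's datatype registry knows by another name
        format_map = {"ncbi": "fasta", "text": "txt"}
        chosen = param_dict.get("out_format1", "fasta")
        ext = format_map.get(chosen, chosen)
        for name, data in out_data.items():
            # Relabel each output so its datatype matches the chosen format.
            out_data[name] = app.datatypes_registry.change_datatype(data, ext)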
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_seqmatchall.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_seqmatchall.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,54 @@
+<tool id="EMBOSS: seqmatchall83" name="seqmatchall" version="5.0.0">
+  <description>All-against-all comparison of a set of sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>seqmatchall -sequence $input1 -outfile $out_file1 -wordsize $wordsize -aformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="wordsize" size="4" type="text" value="4">
+      <label>Word size</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Alignment File Format</label>
+      <option value="match">Match (m)</option>
+      <option value="simple">Simple (m)</option>
+      <option value="fasta">FASTA (m)</option>
+      <option value="msf">MSF (m)</option>
+      <option value="srs">SRS (m)</option>
+      <option value="pair">Pair (p)</option>
+      <option value="markx0">Markx0 (p)</option>
+      <option value="markx1">Markx1 (p)</option>
+      <option value="markx2">Markx2 (p)</option>
+      <option value="markx3">Markx3 (p)</option>
+      <option value="markx10">Markx10 (p)</option>
+      <option value="srspair">SRS pair (p)</option>
+      <option value="score">Score (p)</option>
+      <option value="seqmatchall">Seqmatchall Output File</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="seqmatchall" name="out_file1" />.
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="wordsize" value="2"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_seqmatchall_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset must contain sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/seqmatchall.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_seqret.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_seqret.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,70 @@
+<tool id="EMBOSS: seqret84" name="seqret" version="5.0.0">
+  <description>Reads and writes sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>seqret -sequence $input1 -outseq $out_file1 -feature $feature -firstonly $firstonly -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="feature" type="select">
+      <label>Use feature information</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="firstonly" type="select">
+      <label>Read one sequence and stop</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="feature" value="no"/>
+      <param name="firstonly" value="no"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_seqret_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/seqret.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_showfeat.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_showfeat.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,123 @@
+<tool id="EMBOSS: showfeat85" name="showfeat" version="5.0.0">
+  <!-- tool gives memory errors -->
+  <description>Show features of a sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>showfeat -sequence $input1 -outfile $out_file1 -matchsource "$matchsource" -matchtype "$matchtype" -matchtag "$matchtag" -matchvalue "$matchvalue" -sort $sort -annotation "$annotation" -id
+  $id -description "$description" -scale "$scale" -width "$width" -collapse $collapse -forward $forward -reverse $reverse -unknown $unknown -strand $strand -source $source -position $position -type
+  $type -tags $tags -values $values -stricttags $stricttags -html $html_out1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="html_out1" type="select">
+      <label>Format output as an HTML table</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="matchsource" size="50" type="text" value="*">
+      <label>Feature source to match</label>
+    </param>
+    <param name="matchtype" size="50" type="text" value="*">
+      <label>Feature type to match</label>
+    </param>
+    <param name="matchtag" size="50" type="text" value="*">
+      <label>Feature tags to match</label>
+    </param>
+    <param name="matchvalue" size="50" type="text" value="*">
+      <label>Tag values to match</label>
+    </param>
+    <param name="sort" type="select">
+      <label>Sort by</label>
+      <option value="start">Start position</option>
+      <option value="source">Source</option>
+      <option value="type">Type</option>
+      <option value="nosort">No sorting done</option>
+      <option value="join">Join coding regions together</option>
+    </param>
+    <param name="annotation" size="50" type="text" value="">
+      <label>Regions to annotate by marking</label>
+    </param>
+    <param name="id" type="select">
+      <label>Display the ID name of the sequence</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="description" type="select">
+      <label>Display the description of the sequence</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="scale" type="select">
+      <label>Display the scale line</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="width" size="50" type="text" value="60">
+      <label>Screen width</label>
+    </param>
+    <param name="collapse" type="select">
+      <label>Collapse features</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="forward" type="select">
+      <label>Display forward sense features</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="reverse" type="select">
+      <label>Display reverse sense features</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="unknown" type="select">
+      <label>Display unknown sense features</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="strand" type="select">
+      <label>Display the strand of the features</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="source" type="select">
+      <label>Display the source of the features</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="position" type="select">
+      <label>Display the start and end position of the features</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="type" type="select">
+      <label>Display the type of the features</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="tags" type="select">
+      <label>Display the tags and values of the features</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="values" type="select">
+      <label>Display the tag values of the features</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="stricttags" type="select">
+      <label>Display only those tag/value pairs in a feature that match the specified tag and value</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="showfeat" name="out_file1" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/showfeat.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_shuffleseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_shuffleseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,62 @@
+<tool id="EMBOSS: shuffleseq87" name="shuffleseq" version="5.0.0">
+  <!-- produces random outputs each time -->
+  <description>Shuffles a set of sequences maintaining composition</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>shuffleseq -sequence $input1 -outseq $out_file1 -shuffle "$shuffle" -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="shuffle" size="4" type="text" value="1">
+      <label>Number of shuffles</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset must contain sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/shuffleseq.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_sigcleave.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_sigcleave.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,55 @@
+<tool id="EMBOSS: sigcleave88" name="sigcleave" version="5.0.0">
+  <description>Reports protein signal cleavage sites</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>sigcleave -sequence $input1 -outfile $out_file1 -minweight "$minweight" -prokaryote $prokaryote -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="minweight" size="4" type="text" value="3.5">
+      <label>Minimum scoring weight value for the predicted cleavage site</label>
+    </param>
+    <param name="prokaryote" type="select">
+      <label>Specifies the sequence is prokaryotic and changes the default scoring data file</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="motif">Motif</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="table">Table</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="motif" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="minweight" value="3.5"/>
+      <param name="prokaryote" value="no"/>
+      <param name="out_format1" value="excel"/>
+      <output name="out_file1" file="emboss_sigcleave_out.tabular"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/sigcleave.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_single_outputfile_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_single_outputfile_wrapper.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,27 @@
+#! /usr/bin/perl -w
+use strict;
+use File::Copy;
+use File::Spec; # needed for File::Spec->splitpath below
+
+my $cmd_string = join (" ",@ARGV);
+my $results = `$cmd_string`;
+my @files = split("\n",$results);
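+# By the calling convention of the <command> lines that invoke this wrapper
+# (e.g. the syco tool below), the Galaxy output path given to -goutfile is the
+# seventh word of the wrapped command, hence $ARGV[6].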
+my $fileNameOut = $ARGV[6];
+my ($drive, $outputDir, $file) = File::Spec->splitpath( $fileNameOut );
+my $destination = $fileNameOut;
+
+foreach my $thisLine (@files)
+{
+ if ($thisLine =~ /Created /)
+ {
+	$thisLine =~ /[\w.]+$/; # capture the trailing file name ('|' inside a character class matches a literal pipe)
+	$thisLine = $&;
+ #print "outfile: $thisLine\n";
+ #there is only one file to move, so we can quit after finding it
+ move($drive.$outputDir.$thisLine,$fileNameOut);
+	exit(0); # done; exiting non-zero here would wrongly signal failure
+ }
+ else
+ {
+ print $thisLine,"\n";
+ }
+}
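
Note on the wrapper above: EMBOSS graphics drivers announce each file they write on stdout, and the wrapper scans for that "Created " line, recovers the file name, and moves the file onto the path Galaxy expects, echoing all other output through. A hypothetical session (file names illustrative):

    $ perl emboss_single_outputfile_wrapper.pl syco -sequence in.fasta -graph png -goutfile /galaxy/database/files/dataset_42.dat -outfile out.syco -cfile Ehum.cut -window 30 -uncommon no -minimum 0.15 -auto
    Created syco.1.png

Here syco.1.png would be moved to dataset_42.dat.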
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_sirna.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_sirna.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,118 @@
+<tool id="EMBOSS: sirna89" name="sirna" version="5.0.0">
+  <description>Finds siRNA duplexes in mRNA</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>sirna -sequence $input1 -outfile $ofile1 -outseq $ofile2 -poliii $poliii -aa $aa -tt $tt -polybase $polybase -context $context -rformat2 $out_format1 -osformat3 $out_format2
+  -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="poliii" type="select">
+      <label>Select only the 21 base probes that start with a purine (Pol III expression vectors)</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="aa" type="select">
+      <label>Select only those 23 base regions that start with AA</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="tt" type="select">
+      <label>Select only those 23 base regions that end with TT</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="polybase" type="select">
+      <label>Also report 23 base regions that contain a run of 4 or more of the same base in a row</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="context" type="select">
+      <label>Display the whole 23 bases of the region with the first two bases in brackets</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="table">Table</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+    <param name="out_format2" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="table" name="ofile1" />
+    <data format="fasta" name="ofile2" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="poliii" value="no"/>
+      <param name="aa" value="no"/>
+      <param name="tt" value="no"/>
+      <param name="polybase" value="yes"/>
+      <param name="context" value="no"/>
+      <param name="mismatchpercent" value="0"/>
+      <param name="out_format1" value="gff"/>
+      <param name="out_format2" value="fasta"/>
+      <output name="ofile2" file="emboss_sirna_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset must contain sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/sirna.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_sixpack.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_sixpack.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,162 @@
+<tool id="EMBOSS: sixpack90" name="sixpack" version="5.0.0">
+  <!-- tool adds file description and timestamp to output data -->
+  <description>Display a DNA sequence with 6-frame translation and ORFs</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>sixpack -sequence $input1 -outfile $ofile1 -outseq $ofile2 -table $table -firstorf $firstorf -lastorf $lastorf -mstart $mstart -reverse $reverse -orfminsize $orfminsize -uppercase
+  "$uppercase" -number $number -width "$width" -length "$length" -margin "$margin" -name $disp_name -description $description -offset "$offset" -html $html_out1 -osformat $out_format2 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="table" type="select">
+      <label>Code to use</label>
+      <option value="0">Standard</option>
+      <option value="1">Standard (with alternative initiation codons)</option>
+      <option value="2">Vertebrate Mitochondrial</option>
+      <option value="3">Yeast Mitochondrial</option>
+      <option value="4">Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option>
+      <option value="5">Invertebrate Mitochondrial</option>
+      <option value="6">Ciliate Macronuclear and Dasycladacean</option>
+      <option value="9">Echinoderm Mitochondrial</option>
+      <option value="10">Euplotid Nuclear</option>
+      <option value="11">Bacterial</option>
+      <option value="12">Alternative Yeast Nuclear</option>
+      <option value="13">Ascidian Mitochondrial</option>
+      <option value="14">Flatworm Mitochondrial</option>
+      <option value="15">Blepharisma Macronuclear</option>
+      <option value="16">Chlorophycean Mitochondrial</option>
+      <option value="21">Trematode Mitochondrial</option>
+      <option value="22">Scenedesmus obliquus</option>
+      <option value="23">Thraustochytrium Mitochondrial</option>
+    </param>
+    <param name="firstorf" type="select">
+      <label>Count the beginning of a sequence as a possible ORF</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="lastorf" type="select">
+      <label>Count the end of a sequence as a possible ORF</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="mstart" type="select">
+      <label>Display only ORFs starting with an M</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="reverse" type="select">
+      <label>Display the translation of the DNA sequence in the 3 reverse frames</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="orfminsize" size="4" type="text" value="1">
+      <label>Minimum size of Open Reading Frames (ORFs) to display in the translations</label>
+    </param>
+    <param name="uppercase" size="50" type="text" value="">
+      <label>Regions to put in uppercase</label>
+    </param>
+    <param name="number" type="select">
+      <label>Number the sequence at the beginning and the end of each line</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="width" size="4" type="text" value="60">
+      <label>Number of nucleotides displayed on each line</label>
+    </param>
+    <param name="length" size="4" type="text" value="0">
+      <label>Line length of page</label>
+    </param>
+    <param name="margin" size="4" type="text" value="10">
+      <label>Margin around sequence for numbering</label>
+    </param>
+    <param name="disp_name" type="select">
+      <label>Display the ID name of the sequence</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="description" type="select">
+      <label>Display the description of the sequence</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="offset" size="4" type="text" value="1">
+      <label>Number from which you want the DNA sequence to be numbered</label>
+    </param>
+    <param name="html_out1" type="select">
+      <label>Format output as an HTML table</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format2" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="sixpack" name="ofile1" />
+    <data format="fasta" name="ofile2" />
+  </outputs>
+<!--    <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="table" value="0"/>
+      <param name="firstorf" value="no"/>
+      <param name="lastorf" value="no"/>
+      <param name="mstart" value="no"/>
+      <param name="reverse" value="no"/>
+      <param name="orfminsize" value="1"/>
+      <param name="uppercase" value=""/>
+      <param name="number" value="no"/>
+      <param name="width" value="60"/>
+      <param name="length" value="0"/>
+      <param name="margin" value="10"/>
+      <param name="disp_name" value="no"/>
+      <param name="description" value="no"/>
+      <param name="offset" value="1"/>
+      <param name="html_out1" value="no"/>
+      <param name="out_format2" value="fasta"/>
+      <output name="ofile2" file="emboss_sixpack_out.fasta"/>
+    </test>
+  </tests> -->
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset must contain sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/sixpack.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_skipseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_skipseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+<tool id="EMBOSS: skipseq91" name="skipseq" version="5.0.0">
+  <description>Reads and writes sequences, skipping first few</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>skipseq -sequence $input1 -outseq $out_file1 -skip "$skip" -feature $feature -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="skip" size="4" type="text" value="0">
+      <label>Number of sequences to skip at start</label>
+    </param>
+    <param name="feature" type="select">
+      <label>Use feature information</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/skipseq.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_splitter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_splitter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,79 @@
+<tool id="EMBOSS: splitter92" name="splitter" version="5.0.0">
+  <description>Split a sequence into (overlapping) smaller sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>splitter -sequence $input1 -outseq $out_file1 -size "$size" -overlap "$overlap" -addoverlap $addoverlap -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="size" size="10" type="text" value="10000">
+      <label>Size to split at</label>
+    </param>
+    <param name="overlap" size="4" type="text" value="0">
+      <label>Overlap between split sequences</label>
+    </param>
+    <param name="addoverlap" type="select">
+      <label>Add overlap to size</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="size" value="10000"/>
+      <param name="overlap" value="0"/>
+      <param name="addoverlap" value="no"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_splitter_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset must contain sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/splitter.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_supermatcher.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_supermatcher.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+<tool id="EMBOSS: supermatcher95" name="supermatcher" version="5.0.0">
+  <!-- puts file information in output report -->
+  <description>Match large sequences against one or more other sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>supermatcher -asequence $input1 -bsequence $input2 -gapopen "$gapopen" -gapextend "$gapextend" -width "$width" -wordlen "$wordlen" -outfile $ofile1 -errorfile $ofile2 -aformat3
+  $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Large sequences</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Sequences to match</label>
+    </param>
+    <param name="gapopen" size="4" type="text" value="10.0">
+      <label>Gap opening penalty</label>
+    </param>
+    <param name="gapextend" size="4" type="text" value="0.5">
+      <label>Gap extension penalty</label>
+    </param>
+    <param name="width" size="4" type="text" value="16">
+      <label>Alignment width</label>
+    </param>
+    <param name="wordlen" size="4" type="text" value="6">
+      <label>Word length for initial matching</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Alignment File Format</label>
+      <option value="simple">Simple (m)</option>
+      <option value="fasta">FASTA (m)</option>
+      <option value="msf">MSF (m)</option>
+      <option value="srs">SRS (m)</option>
+      <option value="pair">Pair (p)</option>
+      <option value="markx0">Markx0 (p)</option>
+      <option value="markx1">Markx1 (p)</option>
+      <option value="markx2">Markx2 (p)</option>
+      <option value="markx3">Markx3 (p)</option>
+      <option value="markx10">Markx10 (p)</option>
+      <option value="srspair">SRS pair (p)</option>
+      <option value="score">Score (p)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="simple" name="ofile1" />
+    <data format="supermatcher" name="ofile2" />
+  </outputs>
+<!--    <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="input2" value="1.fasta"/>
+      <param name="gapopen" value="10.0"/>
+      <param name="gapextend" value="0.5"/>
+      <param name="width" value="16"/>
+      <param name="wordlen" value="6"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="ofile1" file="emboss_supermatcher_out.fasta"/>
+    </test>
+  </tests> -->
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/supermatcher.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_syco.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_syco.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,197 @@
+<tool id="EMBOSS: syco96" name="syco" version="5.0.0">
+  <!-- graphics output -->
+  <description>Synonymous codon usage Gribskov statistic plot</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl syco -sequence $input1 -graph png -goutfile $ofile1 -outfile $ofile2 -cfile $cfile -window "$window" -uncommon $uncommon -minimum "$minimum"
+  -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="cfile" type="select">
+      <label>Codon Usage File</label>
+      <option value="Ehum.cut">Ehum.cut</option>
+      <option value="Eyeastcai.cut">Eyeastcai.cut</option>
+      <option value="Eacc.cut">Eacc.cut</option>
+      <option value="Eadenovirus5.cut">Eadenovirus5.cut</option>
+      <option value="Eadenovirus7.cut">Eadenovirus7.cut</option>
+      <option value="Eaidlav.cut">Eaidlav.cut</option>
+      <option value="Eanasp.cut">Eanasp.cut</option>
+      <option value="Eani.cut">Eani.cut</option>
+      <option value="Eani_h.cut">Eani_h.cut</option>
+      <option value="Eanidmit.cut">Eanidmit.cut</option>
+      <option value="Easn.cut">Easn.cut</option>
+      <option value="Eath.cut">Eath.cut</option>
+      <option value="Eatu.cut">Eatu.cut</option>
+      <option value="Eavi.cut">Eavi.cut</option>
+      <option value="Ebja.cut">Ebja.cut</option>
+      <option value="Ebly.cut">Ebly.cut</option>
+      <option value="Ebme.cut">Ebme.cut</option>
+      <option value="Ebmo.cut">Ebmo.cut</option>
+      <option value="Ebna.cut">Ebna.cut</option>
+      <option value="Ebov.cut">Ebov.cut</option>
+      <option value="Ebovsp.cut">Ebovsp.cut</option>
+      <option value="Ebst.cut">Ebst.cut</option>
+      <option value="Ebsu.cut">Ebsu.cut</option>
+      <option value="Ebsu_h.cut">Ebsu_h.cut</option>
+      <option value="Ecac.cut">Ecac.cut</option>
+      <option value="Ecal.cut">Ecal.cut</option>
+      <option value="Eccr.cut">Eccr.cut</option>
+      <option value="Ecel.cut">Ecel.cut</option>
+      <option value="Echi.cut">Echi.cut</option>
+      <option value="Echicken.cut">Echicken.cut</option>
+      <option value="Echisp.cut">Echisp.cut</option>
+      <option value="Echk.cut">Echk.cut</option>
+      <option value="Echmp.cut">Echmp.cut</option>
+      <option value="Echnt.cut">Echnt.cut</option>
+      <option value="Echos.cut">Echos.cut</option>
+      <option value="Echzm.cut">Echzm.cut</option>
+      <option value="Echzmrubp.cut">Echzmrubp.cut</option>
+      <option value="Ecpx.cut">Ecpx.cut</option>
+      <option value="Ecre.cut">Ecre.cut</option>
+      <option value="Ecrisp.cut">Ecrisp.cut</option>
+      <option value="Ectr.cut">Ectr.cut</option>
+      <option value="Edayhoff.cut">Edayhoff.cut</option>
+      <option value="Eddi.cut">Eddi.cut</option>
+      <option value="Eddi_h.cut">Eddi_h.cut</option>
+      <option value="Edog.cut">Edog.cut</option>
+      <option value="Edro.cut">Edro.cut</option>
+      <option value="Edro_h.cut">Edro_h.cut</option>
+      <option value="Edrosophila.cut">Edrosophila.cut</option>
+      <option value="Eeca.cut">Eeca.cut</option>
+      <option value="Eeco.cut">Eeco.cut</option>
+      <option value="Eeco_h.cut">Eeco_h.cut</option>
+      <option value="Eecoli.cut">Eecoli.cut</option>
+      <option value="Ef1.cut">Ef1.cut</option>
+      <option value="Efish.cut">Efish.cut</option>
+      <option value="Efmdvpolyp.cut">Efmdvpolyp.cut</option>
+      <option value="Eham.cut">Eham.cut</option>
+      <option value="Ehha.cut">Ehha.cut</option>
+      <option value="Ehin.cut">Ehin.cut</option>
+      <option value="Ehma.cut">Ehma.cut</option>
+      <option value="Ehuman.cut">Ehuman.cut</option>
+      <option value="Ekla.cut">Ekla.cut</option>
+      <!-- ... further codon usage table options elided in the changeset rendering ... -->
+      <option value="Epsy.cut">Epsy.cut</option>
+      <option value="Epvu.cut">Epvu.cut</option>
+      <option value="Erab.cut">Erab.cut</option>
+      <option value="Erabbit.cut">Erabbit.cut</option>
+      <option value="Erabsp.cut">Erabsp.cut</option>
+      <option value="Erat.cut">Erat.cut</option>
+      <option value="Eratsp.cut">Eratsp.cut</option>
+      <option value="Erca.cut">Erca.cut</option>
+      <option value="Erhm.cut">Erhm.cut</option>
+      <option value="Eric.cut">Eric.cut</option>
+      <option value="Erle.cut">Erle.cut</option>
+      <option value="Erme.cut">Erme.cut</option>
+      <option value="Ersp.cut">Ersp.cut</option>
+      <option value="Esalsp.cut">Esalsp.cut</option>
+      <option value="Esau.cut">Esau.cut</option>
+      <option value="Esco.cut">Esco.cut</option>
+      <option value="Esgi.cut">Esgi.cut</option>
+      <option value="Eshp.cut">Eshp.cut</option>
+      <option value="Eshpsp.cut">Eshpsp.cut</option>
+      <option value="Esli.cut">Esli.cut</option>
+      <option value="Eslm.cut">Eslm.cut</option>
+      <option value="Esma.cut">Esma.cut</option>
+      <option value="Esmi.cut">Esmi.cut</option>
+      <option value="Esmu.cut">Esmu.cut</option>
+      <option value="Esoy.cut">Esoy.cut</option>
+      <option value="Espi.cut">Espi.cut</option>
+      <option value="Espn.cut">Espn.cut</option>
+      <option value="Espo.cut">Espo.cut</option>
+      <option value="Espo_h.cut">Espo_h.cut</option>
+      <option value="Espu.cut">Espu.cut</option>
+      <option value="Esta.cut">Esta.cut</option>
+      <option value="Esty.cut">Esty.cut</option>
+      <option value="Esus.cut">Esus.cut</option>
+      <option value="Esv40.cut">Esv40.cut</option>
+      <option value="Esyhsp.cut">Esyhsp.cut</option>
+      <option value="Esynsp.cut">Esynsp.cut</option>
+      <option value="Etbr.cut">Etbr.cut</option>
+      <option value="Etcr.cut">Etcr.cut</option>
+      <option value="Eter.cut">Eter.cut</option>
+      <option value="Etetsp.cut">Etetsp.cut</option>
+      <option value="Etob.cut">Etob.cut</option>
+      <option value="Etobcp.cut">Etobcp.cut</option>
+      <option value="Etom.cut">Etom.cut</option>
+      <option value="Etrb.cut">Etrb.cut</option>
+      <option value="Evco.cut">Evco.cut</option>
+      <option value="Ewht.cut">Ewht.cut</option>
+      <option value="Exel.cut">Exel.cut</option>
+      <option value="Exenopus.cut">Exenopus.cut</option>
+      <option value="Eyeast.cut">Eyeast.cut</option>
+      <option value="Eyen.cut">Eyen.cut</option>
+      <option value="Eysc.cut">Eysc.cut</option>
+      <option value="Eysc_h.cut">Eysc_h.cut</option>
+      <option value="Eyscmt.cut">Eyscmt.cut</option>
+      <option value="Eysp.cut">Eysp.cut</option>
+      <option value="Ezebrafish.cut">Ezebrafish.cut</option>
+      <option value="Ezma.cut">Ezma.cut</option>
+    </param>
+    <param name="window" size="4" type="text" value="30">
+      <label>Averaging window</label>
+    </param>
+    <param name="uncommon" type="select">
+      <label>Show common codon usage</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="minimum" size="4" type="text" value="0.15">
+      <label>Minimum value for a common codon</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="ofile1" />
+    <data format="syco" name="ofile2" />
+  </outputs>
+ <!--   <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="cfile" value="Ehum.cut"/>
+      <param name="window" value="30"/>
+      <param name="uncommon" value="no"/>
+      <param name="minimum" value="0.15"/>
+      <output name="ofile2" file="emboss_syco_out.syco"/>
+    </test>
+  </tests> -->
+  <help>
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/syco.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_tcode.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_tcode.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,44 @@
+<tool id="EMBOSS: tcode97" name="tcode" version="5.0.0">
+  <description>Fickett TESTCODE statistic to identify protein-coding DNA</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>tcode -sequence $input1 -outfile $out_file1 -window "$window" -step "$step" -rformat $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="window" size="5" type="text" value="200">
+      <label>Window size</label>
+    </param>
+    <param name="step" size="5" type="text" value="3">
+      <label>Step size</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="table">Table</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="table" name="out_file1" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/tcode.html
+  </help>
+</tool>
\ No newline at end of file
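Each <command> above is a template rather than a literal shell line: before running the EMBOSS binary, Galaxy substitutes the $-prefixed placeholders with dataset paths and the values chosen for each <param>. Galaxy renders these templates with Cheetah; the sketch below uses Python's string.Template instead, with made-up dataset paths, purely to show what the rendered tcode call looks like::

    from string import Template

    # The tcode <command> template from the wrapper above.
    command = Template(
        'tcode -sequence $input1 -outfile $out_file1 '
        '-window "$window" -step "$step" -rformat $out_format1 -auto'
    )

    # Hypothetical values Galaxy would supply at job-run time.
    print(command.substitute(
        input1="/galaxy/datasets/dataset_1.dat",     # selected history item
        out_file1="/galaxy/datasets/dataset_2.dat",  # new output dataset
        window="200", step="3", out_format1="table",
    ))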
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_textsearch.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_textsearch.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,58 @@
+<tool id="EMBOSS: textsearch98" name="textsearch" version="5.0.0">
+  <description>Search sequence documentation. Slow, use SRS and Entrez!</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>textsearch -sequence $input1 -outfile $out_file1 -pattern "$pattern" -casesensitive $casesensitive -heading $heading -usa $usa -accession $accession -name $search_name -description $description -html
+  $html_out1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="pattern" size="50" type="text" value="">
+      <label>Pattern to search for</label>
+    </param>
+    <param name="casesensitive" type="select">
+      <label>Do a case-sensitive search</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="heading" type="select">
+      <label>Display column headings</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="usa" type="select">
+      <label>Display the USA of the sequence</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="accession" type="select">
+      <label>Display accession column</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="search_name" type="select">
+      <label>Display name column</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="description" type="select">
+      <label>Display description column</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="html_out1" type="select">
+      <label>Format output as an HTML table</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="textsearch" name="out_file1" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/textsearch.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_tmap.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_tmap.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,39 @@
+<tool id="EMBOSS: tmap99" name="tmap" version="5.0.0">
+  <description>Displays membrane spanning regions</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl tmap -sequences $input1 -outfile $out_file1 -goutfile $out_file2 -graph png -rformat $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="seqtable ">SeqTable</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="table">Table</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="seqtable" name="out_file1" />
+    <data format="png" name="out_file2" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/tmap.html
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_tranalign.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_tranalign.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,84 @@
+<tool id="EMBOSS: tranalign100" name="tranalign" version="5.0.0">
+  <description>Align nucleic coding regions given the aligned proteins</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>tranalign -asequence $input1 -bsequence $input2 -outseq $out_file1 -table $table -osformat3 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Nucleic Sequences</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Protein Sequences</label>
+    </param>
+    <param name="table" type="select">
+      <label>Code to use</label>
+      <option value="0">Standard</option>
+      <option value="1">Standard (with alternative initiation codons)</option>
+      <option value="2">Vertebrate Mitochondrial</option>
+      <option value="3">Yeast Mitochondrial</option>
+      <option value="4">Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option>
+      <option value="5">Invertebrate Mitochondrial</option>
+      <option value="6">Ciliate Macronuclear and Dasycladacean</option>
+      <option value="9">Echinoderm Mitochondrial</option>
+      <option value="10">Euplotid Nuclear</option>
+      <option value="11">Bacterial</option>
+      <option value="12">Alternative Yeast Nuclear</option>
+      <option value="13">Ascidian Mitochondrial</option>
+      <option value="14">Flatworm Mitochondrial</option>
+      <option value="15">Blepharisma Macronuclear</option>
+      <option value="16">Chlorophycean Mitochondrial</option>
+      <option value="21">Trematode Mitochondrial</option>
+      <option value="22">Scenedesmus obliquus</option>
+      <option value="23">Thraustochytrium Mitochondrial</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="3.fasta"/>
+      <param name="input2" value="2.pep"/>
+      <param name="table" value="0"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_tranalign_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/tranalign.html
+  </help>
+</tool>
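tranalign never aligns nucleotides itself: it threads the original codons through the gapped protein alignment, emitting one codon per aligned residue and a three-base gap per protein gap. A minimal sketch of that back-threading, assuming an ungapped in-frame CDS (my own illustration, not the EMBOSS implementation)::

    def back_align(cds, aligned_protein):
        """Map a gapped protein back onto its coding sequence.

        cds: ungapped nucleotide coding sequence (no stop codon)
        aligned_protein: the same protein as it appears in the
        alignment, possibly containing '-' gap characters.
        """
        codons = [cds[i:i + 3] for i in range(0, len(cds), 3)]
        out, k = [], 0
        for aa in aligned_protein:
            if aa == "-":
                out.append("---")      # a protein gap becomes a codon gap
            else:
                out.append(codons[k])  # copy the codon for this residue
                k += 1
        return "".join(out)

    # 'MK-V' aligned against a 9 nt CDS coding for MKV:
    print(back_align("ATGAAAGTT", "MK-V"))  # -> ATGAAA---GTT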
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_transeq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_transeq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,122 @@
+<tool id="EMBOSS: transeq101" name="transeq" version="5.0.0">
+  <description>Translate nucleic acid sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>transeq -sequence $input1 -outseq $out_file1 -frame $frame -table $table -regions "$regions" -trim $trim -clean $clean -alternative $alternative -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="frame" type="select">
+      <label>Frame(s) to translate</label>
+      <option value="1">Frame 1</option>
+      <option value="2">Frame 2</option>
+      <option value="3">Frame 3</option>
+      <option value="F">Forward three frames</option>
+      <option value="-1">Frame -1</option>
+      <option value="-2">Frame -2</option>
+      <option value="-3">Frame -3</option>
+      <option value="R">Reverse three frames</option>
+      <option value="6">All six frames</option>
+    </param>
+    <param name="table" type="select">
+      <label>Code to use</label>
+      <option value="0">Standard</option>
+      <option value="1">Standard (with alternative initiation codons)</option>
+      <option value="2">Vertebrate Mitochondrial</option>
+      <option value="3">Yeast Mitochondrial</option>
+      <option value="4">Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option>
+      <option value="5">Invertebrate Mitochondrial</option>
+      <option value="6">Ciliate Macronuclear and Dasycladacean</option>
+      <option value="9">Echinoderm Mitochondrial</option>
+      <option value="10">Euplotid Nuclear</option>
+      <option value="11">Bacterial</option>
+      <option value="12">Alternative Yeast Nuclear</option>
+      <option value="13">Ascidian Mitochondrial</option>
+      <option value="14">Flatworm Mitochondrial</option>
+      <option value="15">Blepharisma Macronuclear</option>
+      <option value="16">Chlorophycean Mitochondrial</option>
+      <option value="21">Trematode Mitochondrial</option>
+      <option value="22">Scenedesmus obliquus</option>
+      <option value="23">Thraustochytrium Mitochondrial</option>
+    </param>
+    <param name="regions" size="10" type="text" value="">
+      <label>Regions to translate</label>
+    </param>
+    <param name="trim" type="select">
+      <label>Remove all 'X' and '*' characters from the right end of the translation</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="clean" type="select">
+      <label>Change all STOP codon positions from the '*' character to 'X'</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="alternative" type="select">
+      <label>Define frame '-1' as using the set of codons starting with the last codon of the sequence</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="frame" value="1"/>
+      <param name="table" value="0"/>
+      <param name="regions" value=""/>
+      <param name="trim" value="no"/>
+      <param name="clean" value="no"/>
+      <param name="alternative" value="no"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_transeq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/transeq.html
+  </help>
+</tool>
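The frame and table options correspond to standard genetic-code translation: frames 1 to 3 start at offsets 0 to 2, frames -1 to -3 translate the reverse complement, and -table selects an NCBI translation table. A short sketch using Biopython, which is an assumption made for illustration only (the wrapper itself just shells out to transeq)::

    from Bio.Seq import Seq

    dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")

    # Frames 1-3 translate from offsets 0-2; negative frames use the
    # reverse complement. table=1 is the standard genetic code.
    for offset in range(3):
        frame = dna[offset:]
        frame = frame[: len(frame) - len(frame) % 3]  # whole codons only
        print("frame", offset + 1, frame.translate(table=1))
    print("frame -1", dna.reverse_complement().translate(table=1))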
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_trimest.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_trimest.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,92 @@
+<tool id="EMBOSS: trimest102" name="trimest" version="5.0.0">
+  <description>Trim poly-A tails off EST sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>trimest -sequence $input1 -outseq $out_file1 -minlength "$minlength" -mismatches "$mismatches" -reverse $reverse -tolower $tolower -fiveprime $fiveprime -osformat2 $out_format1
+  -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="minlength" size="4" type="text" value="4">
+      <label>Minimum length that a poly-A (or poly-T) tail must have before it is removed</label>
+    </param>
+    <param name="mismatches" size="4" type="text" value="1">
+      <label>Number of contiguous mismatched non-A bases allowed in a poly-A tail</label>
+    </param>
+    <param name="reverse" type="select">
+      <label>Change the sequence to the forward sense when it is written out</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="tolower" type="select">
+      <label>Mask poly-A by converting to lowercase</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="fiveprime" type="select">
+      <label>Inspect 5' end of the sequence for poly-T tails</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="minlength" value="4"/>
+      <param name="mismatches" value="1"/>
+      <param name="reverse" value="yes"/>
+      <param name="tolower" value="no"/>
+      <param name="fiveprime" value="yes"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_trimest_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark
+
+The input dataset needs to be sequences.
+
+-----
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/trimest.html
+  </help>
+</tool>
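trimest's core logic is a mismatch-tolerant scan from the 3' end: the tail may run through occasional non-A bases until the mismatch budget is spent, and it is removed only if it reaches the minimum length. A simplified sketch of that idea (the helper name and exact rules are mine; the real tool also handles 5' poly-T tails, lowercase masking, and re-orientation)::

    def trim_polya(seq, minlength=4, mismatches=1):
        """Strip a 3' poly-A tail, tolerating a few non-A bases."""
        miss = 0
        cut = len(seq)  # index where the tail starts
        for i in range(len(seq) - 1, -1, -1):
            if seq[i] in "Aa":
                cut = i  # tail extends at least to here
            else:
                miss += 1
                if miss > mismatches:
                    break
        if len(seq) - cut >= minlength:
            return seq[:cut]
        return seq  # tail too short to trim

    print(trim_polya("GGCTTCAAGAAAAAAA"))  # -> GGCTTC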
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_trimseq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_trimseq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,97 @@
+<tool id="EMBOSS: trimseq103" name="trimseq" version="5.0.0">
+  <description>Trim ambiguous bits off the ends of sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>trimseq -sequence $input1 -outseq $out_file1 -window "$window" -percent "$percent" -strict $strict -star $star -left $left -right $right -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="window" size="4" type="text" value="1">
+      <label>Window size</label>
+    </param>
+    <param name="percent" size="5" type="text" value="100.0">
+      <label>Threshold of the percentage ambiguity</label>
+    </param>
+    <param name="strict" type="select">
+      <label>Trim all ambiguity codes</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="star" type="select">
+      <label>In protein sequences, trim off not only X's, but also the *'s</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="left" type="select">
+      <label>Trim at the start</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="right" type="select">
+      <label>Trim at the end</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="window" value="1"/>
+      <param name="percent" value="100.0"/>
+      <param name="strict" value="no"/>
+      <param name="star" value="no"/>
+      <param name="left" value="yes"/>
+      <param name="right" value="yes"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_trimseq_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark 
+
+The input dataset needs to be sequences. 
+
+----- 
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/trimseq.html
+  </help>
+</tool>
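trimseq decides where to cut by sliding a window in from each end and trimming while the windowed percentage of ambiguous bases exceeds the threshold. A one-sided sketch under that reading, with ambiguity reduced to N/X (the real tool knows the full IUPAC alphabet, trims both ends, and honours -strict and -star)::

    AMBIGUOUS = set("NXnx")  # simplified ambiguity alphabet

    def trim_left(seq, window=4, percent=20.0):
        """Trim the 5' end while the windowed ambiguity is above the
        threshold; run on the reversed sequence for the 3' end."""
        start = 0
        while start + window <= len(seq):
            win = seq[start:start + window]
            ambig = sum(base in AMBIGUOUS for base in win)
            if 100.0 * ambig / window <= percent:
                break
            start += 1
        return seq[start:]

    print(trim_left("NNNNNNNNATGCGTACGT"))  # -> ATGCGTACGT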
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_twofeat.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_twofeat.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,130 @@
+<tool id="EMBOSS: twofeat104" name="twofeat" version="5.0.0">
+  <description>Finds neighbouring pairs of features in sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>twofeat -sequence $input1 -outfile $out_file1 -atype "$atype" -btype "$btype" -minrange "$minrange" -maxrange "$maxrange" -asource "$asource" -asense $asense -aminscore "$aminscore"
+  -amaxscore "$amaxscore" -atag "$atag" -avalue "$avalue" -bsource "$bsource" -bsense "$bsense" -bminscore "$bminscore" -bmaxscore "$bmaxscore" -btag "$btag" -bvalue "$bvalue" -overlap $overlap
+  -rangetype $rangetype -sense $sense -order $order -twoout $twoout -typeout "$typeout" -rformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="data" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="atype" size="50" type="text" value="*">
+      <label>Feature 1 type to allow</label>
+    </param>
+    <param name="btype" size="50" type="text" value="*">
+      <label>Feature 2 type to allow</label>
+    </param>
+    <param name="minrange" size="5" type="text" value="0">
+      <label>Minimum range</label>
+    </param>
+    <param name="maxrange" size="5" type="text" value="0">
+      <label>Maximum range</label>
+    </param>
+    <param name="asource" size="50" type="text" value="*">
+      <label>Feature source 1</label>
+    </param>
+    <param name="asense" type="select">
+      <label>Feature sense 1</label>
+      <option value="0">Any sense</option>
+      <option value="+">Forward sense</option>
+      <option value="-">Reverse sense</option>
+    </param>
+    <param name="aminscore" size="5" type="text" value="0.0">
+      <label>Feature 1 minimum score</label>
+    </param>
+    <param name="amaxscore" size="5" type="text" value="0.0">
+      <label>Feature 1 maximum score</label>
+    </param>
+    <param name="atag" size="50" type="text" value="*">
+      <label>Feature 1 tag</label>
+    </param>
+    <param name="avalue" size="50" type="text" value="*">
+      <label>Tag 1 value</label>
+    </param>
+    <param name="bsource" size="50" type="text" value="*">
+      <label>Feature 2 source</label>
+    </param>
+    <param name="bsense" type="select">
+      <label>Feature 2 sense</label>
+      <option value="0">Any sense</option>
+      <option value="+">Forward sense</option>
+      <option value="-">Reverse sense</option>
+    </param>
+    <param name="bminscore" size="5" type="text" value="0.0">
+      <label>Feature 2 minimum score</label>
+    </param>
+    <param name="bmaxscore" size="5" type="text" value="0.0">
+      <label>Feature 2 maximum score</label>
+    </param>
+    <param name="btag" size="50" type="text" value="*">
+      <label>Feature 2 tag</label>
+    </param>
+    <param name="bvalue" size="50" type="text" value="*">
+      <label>Feature 2 tag value</label>
+    </param>
+    <param name="overlap" type="select">
+      <label>Overlaps allowed</label>
+      <option value="A">Any</option>
+      <option value="O">Overlap required but not within</option>
+      <option value="NO">No overlaps are allowed</option>
+      <option value="NW:">Overlap required but not within</option>
+      <option value="AW">A must be all within B</option>
+      <option value="BW">B must be all within A</option>
+    </param>
+    <param name="rangetype" type="select">
+      <label>How to determine range</label>
+      <option value="N">From nearest ends</option>
+      <option value="L">From left ends</option>
+      <option value="R">From right ends</option>
+      <option value="F">From furthest ends</option>
+    </param>
+    <param name="sense" type="select">
+      <label>Required sense</label>
+      <option value="A">Any sense</option>
+      <option value="S">Same sense</option>
+      <option value="O">Opposite sense</option>
+    </param>
+    <param name="order" type="select">
+      <label>Required order of the two features</label>
+      <option value="A">Any</option>
+      <option value="AB">Feature A then feature B</option>
+      <option value="BA">Feature B then feature A</option>
+    </param>
+    <param name="twoout" type="select">
+      <label>Write out the two features themselves</label>
+      <option value="no">No</option>
+      <option value="yes">Yes</option>
+    </param>
+    <param name="typeout" size="50" type="text" value="misc_feature">
+      <label>New feature type</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Report File Format</label>
+      <option value="table">Table</option>
+      <option value="embl">EMBL</option>
+      <option value="genbank">GENBANK</option>
+      <option value="gff">GFF</option>
+      <option value="pir">PIR</option>
+      <option value="swiss">SwissProt</option>
+      <option value="dbmotif">DbMotif</option>
+      <option value="diffseq">Diffseq</option>
+      <option value="excel">Excel (tab delimited)</option>
+      <option value="feattable">FeatTable</option>
+      <option value="motif">Motif</option>
+      <option value="regions">Regions</option>
+      <option value="seqtable">SeqTable</option>
+      <option value="simple">SRS Simple</option>
+      <option value="srs">SRS</option>
+      <option value="tagseq">TagSeq</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="table" name="out_file1" />
+  </outputs>
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/twofeat.html
+  </help>
+</tool>
\ No newline at end of file
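Stripped of its many filters, twofeat is a pairwise distance test: each feature of set A is compared with each feature of set B, and a pair is reported when the gap between them falls within [minrange, maxrange]. The sketch below shows only that distance test for the rangetype=N (nearest ends) case; the source, type, sense, and overlap handling are omitted::

    def neighbouring_pairs(features_a, features_b, minrange=0, maxrange=100):
        """Yield (a, b) pairs whose nearest ends lie within range.

        Features are (start, end) tuples, end-exclusive; overlapping
        features are treated as distance 0.
        """
        for a_start, a_end in features_a:
            for b_start, b_end in features_b:
                if a_end <= b_start:        # a lies left of b
                    dist = b_start - a_end
                elif b_end <= a_start:      # b lies left of a
                    dist = a_start - b_end
                else:
                    dist = 0                # overlapping features
                if minrange <= dist <= maxrange:
                    yield (a_start, a_end), (b_start, b_end)

    print(list(neighbouring_pairs([(0, 10)], [(15, 20), (500, 600)], 0, 50)))
    # [((0, 10), (15, 20))]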
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_union.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_union.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,65 @@
+<tool id="EMBOSS: union105" name="union" version="5.0.0">
+  <description>Reads sequence fragments and builds one sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>union -sequence $input1 -outseq $out_file1 -osformat2 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="out_format1" value="fasta"/>
+      <output name="out_file1" file="emboss_union_out.fasta"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark 
+
+The input dataset needs to be sequences. 
+
+----- 
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/union.html
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_vectorstrip.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_vectorstrip.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,82 @@
+<tool id="EMBOSS: vectorstrip106" name="vectorstrip" version="5.0.0">
+  <description>Strips out DNA between a pair of vector sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>vectorstrip -sequence $input1 -vectorsfile $input2 -outseq $ofile1 -outfile $ofile2 -vectorfile yes -mismatch "$mismatch" -besthits $besthits -linkera "$linkera" -linkerb
+  "$linkerb" -osformat4 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequences</label>
+    </param>
+    <param format="data" name="input2" type="data">
+      <label>Vector file</label>
+    </param>
+    <param name="mismatch" size="4" type="text" value="10">
+      <label>Max allowed percent mismatch</label>
+    </param>
+    <param name="besthits" type="select">
+      <label>Show only the best hits (minimize mismatches)</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="linkera" size="50" type="text" value="">
+      <label>The 5' sequence</label>
+    </param>
+    <param name="linkerb" size="50" type="text" value="">
+      <label>The 3' sequence</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Sequence File Format</label>
+      <option value="fasta">FASTA (m)</option>
+      <option value="acedb">ACeDB (m)</option>
+      <option value="asn1">ASN.1 (m)</option>
+      <option value="clustal">Clustal (m)</option>
+      <option value="codata">CODATA (m)</option>
+      <option value="embl">EMBL (m)</option>
+      <option value="fitch">Fitch (m)</option>
+      <option value="gcg">Wisconsin Package GCG 9.x and 10.x (s)</option>
+      <option value="genbank">GENBANK (m)</option>
+      <option value="gff">GFF (m)</option>
+      <option value="hennig86">Hennig86 (m)</option>
+      <option value="ig">Intelligenetics (m)</option>
+      <option value="jackknifer">Jackknifer (m)</option>
+      <option value="jackknifernon">Jackknifernon (m)</option>
+      <option value="mega">Mega (m)</option>
+      <option value="meganon">Meganon (m)</option>
+      <option value="msf">Wisconsin Package GCG's MSF (m)</option>
+      <option value="pir">NBRF (PIR) (m)</option>
+      <option value="ncbi">NCBI style FASTA (m)</option>
+      <option value="nexus">Nexus/PAUP (m)</option>
+      <option value="nexusnon">Nexusnon/PAUPnon (m)</option>
+      <option value="phylip">PHYLIP interleaved (m)</option>
+      <option value="phylipnon">PHYLIP non-interleaved (m)</option>
+      <option value="selex">SELEX (m)</option>
+      <option value="staden">Staden (s)</option>
+      <option value="strider">DNA strider (m)</option>
+      <option value="swiss">SwisProt entry (m)</option>
+      <option value="text">Plain sequence (s)</option>
+      <option value="treecon">Treecon (m)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="ofile1" />
+    <data format="vectorstrip" name="ofile2" />
+  </outputs>
+  <!--  <tests>
+    <test>
+      <param name="input1" value="1.fasta"/>
+      <param name="input2" value="2.fasta"/>
+      <param name="mismatch" value="10"/>
+      <param name="besthits" value="yes"/>
+      <param name="linkera" value=""/>
+      <param name="linkerb" value=""/>
+      <param name="out_format1" value="fasta"/>
+      <output name="ofile1" file="emboss_vectorstrip_out.fasta"/>
+    </test>
+  </tests> -->
+  <code file="emboss_format_corrector.py" />
+  <help>
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/vectorstrip.html
+  </help>
+</tool>
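vectorstrip first locates the 5' and 3' vector (linker) sequences in each read, allowing mismatches up to a percentage of the vector length, then clips the read around the hits. A naive sliding Hamming-distance scan captures the matching step (illustration only; -besthits additionally keeps only minimal-mismatch hits)::

    def find_vector(read, vector, max_mismatch_pct=10.0):
        """Return the start of the first acceptable vector match, or -1.

        A match is acceptable when its mismatch count does not exceed
        the given percentage of the vector length.
        """
        allowed = int(len(vector) * max_mismatch_pct / 100.0)
        for i in range(len(read) - len(vector) + 1):
            window = read[i:i + len(vector)]
            mismatches = sum(r != v for r, v in zip(window, vector))
            if mismatches <= allowed:
                return i
        return -1

    read = "AACCGGTTACGTACGTGGCC"
    print(find_vector(read, "ACGTACGT"))  # 8; the clipped insert is read[:8]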
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_water.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_water.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,66 @@
+<tool id="EMBOSS: water107" name="water" version="5.0.0">
+  <description>Smith-Waterman local alignment</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>water -asequence $input1 -bsequence $input2 -outfile $out_file1 -gapopen "$gapopen" -gapextend "$gapextend" -brief $brief -aformat3 $out_format1 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param format="fasta" name="input2" type="data">
+      <label>Sequence 2</label>
+    </param>
+    <param name="gapopen" size="6" type="text" value="10.0">
+      <label>Gap open penalty</label>
+    </param>
+    <param name="gapextend" size="6" type="text" value="0.5">
+      <label>Gap extension penalty</label>
+    </param>
+    <param name="brief" type="select">
+      <label>Brief identity and similarity</label>
+      <option value="yes">Yes</option>
+      <option value="no">No</option>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Alignment File Format</label>
+      <option value="srs">SRS (m)</option>
+      <option value="simple">Simple (m)</option>
+      <option value="fasta">FASTA (m)</option>
+      <option value="msf">MSF (m)</option>
+      <option value="pair">Pair (p)</option>
+      <option value="markx0">Markx0 (p)</option>
+      <option value="markx1">Markx1 (p)</option>
+      <option value="markx2">Markx2 (p)</option>
+      <option value="markx3">Markx3 (p)</option>
+      <option value="markx10">Markx10 (p)</option>
+      <option value="srspair">SRS pair (p)</option>
+      <option value="score">Score (p)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="srs" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="input2" value="1.fasta"/>
+      <param name="gapopen" value="10.0"/>
+      <param name="gapextend" value="0.5"/>
+      <param name="brief" value="no"/>
+      <param name="out_format1" value="score"/>
+      <output name="out_file1" file="emboss_water_out.score"/>
+    </test>
+  </tests>
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark 
+
+The input datasets need to be sequences. 
+
+----- 
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/water.html
+  </help>
+</tool>
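water is the classic Smith-Waterman recurrence: a dynamic-programming score matrix clamped at zero, so the optimal local alignment may start and end anywhere. For brevity the sketch below uses unit match scores and a single linear gap penalty; water itself uses affine gaps (-gapopen to open, -gapextend to extend) and a substitution matrix::

    def smith_waterman(a, b, match=1, mismatch=-1, gap=-1):
        """Best local alignment score between strings a and b."""
        rows, cols = len(a) + 1, len(b) + 1
        H = [[0] * cols for _ in range(rows)]
        best = 0
        for i in range(1, rows):
            for j in range(1, cols):
                diag = H[i - 1][j - 1] + (match if a[i - 1] == b[j - 1] else mismatch)
                # Clamp at zero: a local alignment may restart anywhere.
                H[i][j] = max(0, diag, H[i - 1][j] + gap, H[i][j - 1] + gap)
                best = max(best, H[i][j])
        return best

    print(smith_waterman("ACACACTA", "AGCACACA"))  # 5 with this scoring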
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_wobble.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_wobble.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,40 @@
+<tool id="EMBOSS: wobble108" name="wobble" version="5.0.0">
+  <description>Wobble base plot</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command interpreter="perl">emboss_single_outputfile_wrapper.pl wobble -sequence $input1 -graph png -goutfile $ofile1 -outfile $ofile2 -window "$window" -bases "$bases" -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="window" size="5" type="text" value="30">
+      <label>Window size, in codons</label>
+    </param>
+    <param name="bases" size="6" type="text" value="GC">
+      <label>Bases used</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="ofile1" />
+    <data format="wobble" name="ofile2" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="window" value="30"/>
+      <param name="bases" value="GC"/>
+      <output name="ofile2" file="emboss_wobble_out.wobble"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark 
+
+The input dataset needs to be sequences. 
+
+----- 
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/wobble.html
+  </help>
+</tool>
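A wobble plot tracks the base composition of third codon positions (the wobble base) in sliding windows, which is what the -window and -bases parameters control. Roughly the plotted quantity for one reading frame, as a sketch::

    def wobble_gc(seq, bases="GC", window=30):
        """Fraction of codons whose third base is in `bases`, over a
        sliding window of `window` codons (single reading frame)."""
        thirds = [seq[i + 2] for i in range(0, len(seq) - 2, 3)]
        return [sum(b in bases for b in thirds[i:i + window]) / window
                for i in range(len(thirds) - window + 1)]

    print(wobble_gc("ATGGCCGCGTTATCGGCG" * 10, window=10)[:3])
    # [0.8, 0.8, 0.8]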
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_wordcount.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_wordcount.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,35 @@
+<tool id="EMBOSS: wordcount109" name="wordcount" version="5.0.0">
+  <description>Counts words of a specified size in a DNA sequence</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>wordcount -sequence $input1 -outfile $out_file1 -wordsize "$wordsize" -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence</label>
+    </param>
+    <param name="wordsize" size="5" type="text" value="4">
+      <label>Word size</label>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="wordcount" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="wordsize" value="4"/>
+      <output name="out_file1" file="emboss_wordcount_out.wordcount"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark 
+
+The input dataset needs to be sequences. 
+
+----- 
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/wordcount.html
+  </help>
+</tool>
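wordcount tallies every overlapping word of the requested size, which reduces to a hash-map count::

    from collections import Counter

    def word_count(seq, wordsize=4):
        """Count every overlapping word of length `wordsize`."""
        return Counter(seq[i:i + wordsize]
                       for i in range(len(seq) - wordsize + 1))

    print(word_count("ACGTACGTAC").most_common(3))
    # [('ACGT', 2), ('CGTA', 2), ('GTAC', 2)]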
diff -r 000000000000 -r 9071e359b9a3 tools/emboss_5/emboss_wordmatch.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/emboss_5/emboss_wordmatch.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,74 @@
+<tool id="EMBOSS: wordmatch110" name="wordmatch" version="5.0.0">
+  <description>Finds all exact matches of a given size between 2 sequences</description>
+  <requirements><requirement type="package" version="5.0.0">emboss</requirement></requirements>
+  <command>wordmatch -asequence $input1 -bsequence $input2 -outfile $out_file1 -aoutfeat $out_file2 -boutfeat $out_file3 -wordsize "$wordsize" -aformat3 $out_format1 -offormat4 $out_format2
+  -offormat5 $out_format3 -auto</command>
+  <inputs>
+    <param format="fasta" name="input1" type="data">
+      <label>Sequence 1</label>
+    </param>
+    <param format="fasta" name="input2" type="data">
+      <label>Sequence 2</label>
+    </param>
+    <param name="wordsize" size="5" type="text" value="4">
+      <label>Word size</label>
+    </param>
+    <param name="out_format1" type="select">
+      <label>Output Alignment File Format</label>
+      <option value="match">Match (m)</option>
+      <option value="simple">Simple (m)</option>
+      <option value="fasta">FASTA (m)</option>
+      <option value="msf">MSF (m)</option>
+      <option value="srs">SRS (m)</option>
+      <option value="pair">Pair (p)</option>
+      <option value="markx0">Markx0 (p)</option>
+      <option value="markx1">Markx1 (p)</option>
+      <option value="markx2">Markx2 (p)</option>
+      <option value="markx3">Markx3 (p)</option>
+      <option value="markx10">Markx10 (p)</option>
+      <option value="srspair">SRS pair (p)</option>
+      <option value="score">Score (p)</option>
+    </param>
+    <param name="out_format2" type="select">
+      <label>Output Feature 1 File Format</label>
+      <option value="gff">GFF</option>
+      <option value="embl">EMBL</option>
+      <option value="swiss">SwissProt</option>
+    </param>
+    <param name="out_format3" type="select">
+      <label>Output Feature 2 File Format</label>
+      <option value="gff">GFF</option>
+      <option value="embl">EMBL</option>
+      <option value="swiss">SwissProt</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="match" name="out_file1" />
+    <data format="gff" name="out_file2" />
+    <data format="gff" name="out_file3" />
+  </outputs>
+  <!--   <tests>
+    <test>
+      <param name="input1" value="2.fasta"/>
+      <param name="input2" value="1.fasta"/>
+      <param name="wordsize" value="4"/>
+      <param name="out_format1" value="fasta"/>
+      <param name="out_format2" value="gff"/>
+      <param name="out_format3" value="gff"/>
+      <output name="ofile2" file="emboss_wordmatch_out.embl"/>
+    </test> 
+  </tests> test takes a long time to run-->
+  <code file="emboss_format_corrector.py" />
+  <help>
+
+.. class:: warningmark 
+
+The input datasets need to be sequences. 
+
+----- 
+
+    You can view the original documentation here_.
+    
+    .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/wordmatch.html
+  </help>
+</tool>
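Finding all exact matches of a given size between two sequences is the classic k-mer index pattern: index the words of one sequence, then probe the index with every word of the other. A sketch of the idea (not EMBOSS's actual implementation)::

    def exact_matches(a, b, wordsize=4):
        """All length-`wordsize` exact matches as (pos_in_a, pos_in_b)."""
        index = {}
        for i in range(len(a) - wordsize + 1):
            index.setdefault(a[i:i + wordsize], []).append(i)
        hits = []
        for j in range(len(b) - wordsize + 1):
            for i in index.get(b[j:j + wordsize], []):
                hits.append((i, j))
        return hits

    print(exact_matches("GGACGTGG", "TTACGTTT", wordsize=4))  # [(2, 2)]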
diff -r 000000000000 -r 9071e359b9a3 tools/encode/gencode_partition.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/encode/gencode_partition.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,39 @@
+<tool id="gencode_partition1" name="Gencode Partition">
+  <description>an interval file</description>
+  <command interpreter="python">split_by_partitions.py ${GALAXY_DATA_INDEX_DIR} $input1 $out_file1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol}</command>
+  <inputs>
+    <param name="input1" type="data" format="interval" label="File to Partition"/>
+  </inputs>
+  <outputs>
+    <data name="out_file1" format="bed"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="encode_1.bed"/>
+      <output name="out_file1" file="gencode_partition_out.bed"/>
+    </test>
+  </tests>
+  <help>
+For detailed information about partitioning, click here_.
+
+.. _here: http://genome.imim.es/gencode/wiki/index.php/Collecting_Feature_Sets_from_All_Analysis_Groups
+
+Datasets are partitioned according to the protocol below:
+
+A partition scheme has been defined that is similar to what has previously been done with TARs/TRANSFRAGs, such that any feature can be classified as falling into one of the following 7 categories:
+  1. **Coding** -- coding exons defined from the GENCODE experimentally verified coding set (coding in any transcript)
+  2. **5UTR** -- 5' UTR exons defined from the GENCODE experimentally verified coding set (5' UTR in some transcript but never coding in any other)
+  3. **3UTR** -- 3' UTR exons defined from the GENCODE experimentally verified coding set (3' UTR in some transcript but never coding in any other)
+  4. **Intronic Proximal** -- intronic and no more than 5kb away from an exon.
+  5. **Intergenic Proximal** -- between genes and no more than 5kb away from an exon.
+  6. **Intronic Distal** -- intronic and greater than 5kb away from an exon.
+  7. **Intergenic Distal** -- between genes and greater than 5kb away from an exon.
+
+-----
+
+.. class:: infomark
+
+**Note:** Features overlapping more than one partition will take the identity of the lower-numbered partition. 
+
+  </help>
+</tool>
\ No newline at end of file
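The note above means the partitions are tested in priority order and the first overlap wins. A sketch of that precedence rule over plain interval lists (split_by_partitions.py performs the same test with binned bitsets)::

    def classify(feature, partitions):
        """Assign a feature to the lowest-numbered overlapping partition.

        `partitions` is an ordered list of (name, intervals) in priority
        order (Coding first); `feature` and intervals are (start, end),
        end-exclusive. Toy data below is illustrative only.
        """
        f_start, f_end = feature
        for name, intervals in partitions:
            if any(f_start < end and start < f_end for start, end in intervals):
                return name
        return None

    partitions = [("Coding", [(100, 200)]), ("5UTR", [(80, 100)]),
                  ("Intronic Proximal", [(0, 5000)])]
    print(classify((90, 120), partitions))  # Coding: lower-numbered wins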
diff -r 000000000000 -r 9071e359b9a3 tools/encode/random_intervals.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/encode/random_intervals.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,58 @@
+<tool id="random_intervals1" name="Random Intervals">
+<description>create a random set of intervals</description>
+  <command interpreter="python">random_intervals_no_bits.py $regions $input2 $input1 $out_file1 ${input2.metadata.chromCol} ${input2.metadata.startCol} ${input2.metadata.endCol} ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} $use_mask $strand_overlaps ${GALAXY_DATA_INDEX_DIR}</command>
+  <inputs>
+    <param name="input1" type="data" format="interval" label="File to Mimick">
+      <validator type="unspecified_build" message="Unspecified build, this tool works with data from genome builds hg16 or hg17. Click the pencil icon in your history item to set the genome build."/>
+    </param>
+    <param name="input2" type="data" format="interval" label="Intervals to Mask"/>
+    <param name="use_mask" type="select" label="Use mask">
+      <option value="no_mask">No</option>
+      <option value="use_mask">Yes</option>
+    </param>
+    <param name="strand_overlaps" type="select" label="Allow overlaps">
+      <option value="all">Any</option>
+      <option value="strand">Across Strands</option>
+      <option value="none">None</option>
+    </param>
+    <param name="regions" type="select" label="Regions to use">
+      <options from_file="regions.loc">
+        <column name="name" index="2"/>
+        <column name="value" index="1"/>
+        <column name="dbkey" index="0"/>
+        <filter type="data_meta" ref="input1" key="dbkey" column="0" />
+        <validator type="no_options" message="This tool currently only works with ENCODE data from genome builds hg16 or hg17."/>
+      </options>
+    </param> 
+  </inputs>
+  <outputs>
+    <data name="out_file1" format="input"/>
+  </outputs>
+  <help>
+
+.. class:: warningmark
+
+This tool currently only works with ENCODE data from genome builds hg16 or hg17.
+
+-----
+
+.. class:: infomark
+
+**Note:** If you do not wish to mask a set of intervals, set the Use Mask option to No; this overrides any mask file selected.
+
+-----
+
+**Syntax**
+
+This tool will attempt to create a random set of intervals that mimic those found within your source file.  You may also specify a set of intervals to mask.
+
+**Allow overlaps** options
+  * **Across Strands** - random regions are allowed to overlap only if they are on different strands.
+  * **Any** - all overlaps are allowed.
+  * **None** - no overlapping regions are allowed.
+
+**Regions to use** options
+  * Bounding region of interest based on the dataset build.
+
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/encode/random_intervals_no_bits.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/encode/random_intervals_no_bits.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,253 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+#%prog bounding_region_file mask_intervals_file intervals_to_mimic_file out_file mask_chr mask_start mask_end interval_chr interval_start interval_end interval_strand use_mask allow_strand_overlaps
+import sys, random
+from copy import deepcopy
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import bx.intervals.io
+import bx.intervals.intersection
+import psyco_full
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+max_iters = 5
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+#Try to add a random region
+def add_random_region( mimic_region, bound, exist_regions, plus_mask, minus_mask, overlaps ):
+    region_length, region_strand = mimic_region
+    plus_count = plus_mask.count_range()
+    minus_count = minus_mask.count_range()
+    gaps = []
+
+    if region_strand == "-":
+        gaps = minus_mask.get_gaps( region_length )
+    else:
+        gaps = plus_mask.get_gaps( region_length )
+    
+    while True:
+        try:
+            gap_length, gap_start, gap_end = gaps.pop( random.randint( 0, len( gaps ) - 1 ) )
+        except:
+            break
+        try:
+            start = random.randint( bound.start + gap_start, bound.start + gap_end - region_length - 1 )
+        except ValueError, ve:
+            stop_err( "Exception thrown generating random start value: %s" %str( ve ) )
+
+        end = start + region_length
+        try_plus_mask = plus_mask.copy()
+        try_minus_mask = minus_mask.copy()
+        
+        if region_strand == "-":
+            try_minus_mask.set_range( start - bound.start, end - bound.start )
+        else:
+            try_plus_mask.set_range( start - bound.start, end - bound.start )
+        
+        rand_region = bx.intervals.io.GenomicInterval( None, [bound.chrom, start, end, region_strand], 0, 1, 2, 3, "+", fix_strand=True )
+        
+        if try_plus_mask.count_range() == plus_count + region_length or try_minus_mask.count_range() == minus_count + region_length:
+            if overlaps in ["strand", "all"]: #overlaps allowed across strands
+                exist_regions.append( rand_region )
+                if overlaps == "strand":
+                    return exist_regions, True, try_plus_mask, try_minus_mask
+                else: #overlaps allowed everywhere
+                    return exist_regions, True, plus_mask, minus_mask
+            else: #no overlapping anywhere
+                exist_regions.append( rand_region )
+                if region_strand == "-":
+                    return exist_regions, True, try_minus_mask.copy(), try_minus_mask
+                else: 
+                    return exist_regions, True, try_plus_mask, try_plus_mask.copy()
+    return exist_regions, False, plus_mask, minus_mask
+
+def main():
+    includes_strand = False
+    region_uid = sys.argv[1]
+    mask_fname = sys.argv[2]
+    intervals_fname = sys.argv[3]
+    out_fname = sys.argv[4]
+    try:
+        mask_chr = int( sys.argv[5] ) - 1
+    except:
+        stop_err( "'%s' is an invalid chrom column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[5] ) )
+    try:
+        mask_start = int( sys.argv[6] ) - 1
+    except:
+        stop_err( "'%s' is an invalid start column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[6] ) )
+    try:
+        mask_end = int( sys.argv[7] ) - 1
+    except:
+        stop_err( "'%s' is an invalid end column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[7] ) )
+    try:
+        interval_chr = int( sys.argv[8] ) - 1
+    except:
+        stop_err( "'%s' is an invalid chrom column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[8] ) )
[... truncated in source ...]
+    for region in regions:
+        region.sort()
+        region.reverse()
+    
+    #read mask file
+    mask = []
+    if use_mask != "no_mask":
+        for region in bx.intervals.io.NiceReaderWrapper( open( mask_fname, 'r' ), chrom_col=mask_chr, start_col=mask_start, end_col=mask_end, fix_strand=True, return_header=False, return_comments=False ):
+            mask.append( region )
+
+    try:
+        out_file = open ( out_fname, "w" )
+    except:
+        stop_err( "Error opening output file '%s'." % out_fname )
+
+    i = 0
+    i_iters = 0
+    region_count = 0
+    best_regions = []
+    num_fail = 0
+    while i < len( bounds ):
+        i_iters += 1
+        #order regions to mimic
+        regions_to_mimic = regions[i][0:]
+        if len( regions_to_mimic ) < 1: #if no regions to mimic, skip
+            i += 1
+            i_iters = 0
+            continue 
+        #set up region mask
+        plus_mask = Region( bounds[i].end - bounds[i].start )
+        for region in mask:
+            if region.chrom != bounds[i].chrom: continue
+            mask_start = region.start - bounds[i].start
+            mask_end = region.end - bounds[i].start
+            if mask_start >= 0 and mask_end > 0:
+                plus_mask.set_range( mask_start, mask_end )
+        minus_mask = plus_mask.copy()
+        random_regions = []
+        num_added = 0
+        for j in range( len( regions[i] ) ):
+            random_regions, added, plus_mask, minus_mask = add_random_region( regions_to_mimic[j], bounds[i], random_regions, plus_mask, minus_mask, overlaps )
+            if added: 
+                num_added += 1
+        if num_added == len( regions_to_mimic ) or i_iters >= max_iters:
+            if len( best_regions ) > len( random_regions ):
+                random_regions = best_regions[:]
+            num_fail += ( len( regions_to_mimic ) - len( random_regions ) )
+            i_iters = 0
+            best_regions = []
+            for region in random_regions:
+                print >>out_file, "%s\t%d\t%d\t%s\t%s\t%s" % ( region.chrom, region.start, region.end, "region_" + str( region_count ), "0", region.strand )
+                region_count += 1
+        else:
+            i -= 1
+            if len( best_regions ) < len( random_regions ):
+                best_regions = random_regions[:]
+        i+=1
+    
+    out_file.close()
+    if num_fail:
+        print "After %i iterations, %i regions could not be added." % (max_iters, num_fail)
+        if use_mask == "use_mask":
+            print "The mask you have provided may be too restrictive."
+
+class Region( list ):
+    """
+    A list for on/off regions
+    """
+    def __init__( self, size=0 ):
+        for i in range( size ):
+            self.append( False )
+    def copy( self ):
+        return deepcopy( self )
+    def set_range( self, start=0, end=None ):
+        if start < 0:
+            start = 0
+        if ( not end and end != 0 ) or end > len( self ):
+            end = len( self )
+        for i in range( start, end ):
+            self[i]=True
+    def count_range( self, start=0, end=None ):
+        if start < 0:
+            start = 0
+        if ( not end and end != 0 ) or end > len( self ):
+            end = len( self )
+        return self[start:end].count( True )
+    def get_gaps( self, min_size = 0 ):
+        gaps = []
+        start = end = 0
+        while True:
+            try: 
+                start = self[end:].index( False ) + end
+            except: 
+                break
+            try: 
+                end = self[start:].index( True ) + start
+            except:
+                end = len( self )
+            if end > start and end - start >= min_size:
+                gaps.append( ( end - start, start, end ) )
+        gaps.sort()
+        gaps.reverse()
+        return gaps
+
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/encode/split_by_partitions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/encode/split_by_partitions.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+#Original script from /home/james/work/encode/feature_partitions/split_by_partitions.py
+
+#Usage: python(2.4) split_by_partitions.py GALAXY_DATA_INDEX_DIR in_file out_file chrCol startCol endCol strandCol
+
+from __future__ import division
+
+import sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.bitset import *
+from bx.bitset_builders import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():
+    GALAXY_DATA_INDEX_DIR = sys.argv[1]
+    partition_index = '%s/encode_feature_partitions/partition_list.txt' % GALAXY_DATA_INDEX_DIR
+    partition_offset = "%s/encode_feature_partitions/" % GALAXY_DATA_INDEX_DIR
+    
+    warnings = []
+    
+    # Load up the partitions
+    partitions = list()
+    try: 
+        for line in open( partition_index ):
+            name, score, filename = line.split()
+            partitions.append( ( name, score, binned_bitsets_from_file( open( partition_offset+filename ) ) ) )
+    except:
+        stop_err( "Error loading partitioning dataset." )
+    
+    try:
+        in_file = open( sys.argv[2] )
+    except:
+        stop_err( "Bad input data." )
+        
+    try:
+        out_file = open( sys.argv[3], "w" )
+    except:
+        stop_err( "Bad output file." )
+    
+    try:
+        chrCol = int( sys.argv[4] ) - 1
+    except:
+        stop_err( "Bad chr column: %s" % ( str( sys.argv[4] ) ) )
+    try:
+        startCol = int( sys.argv[5] ) - 1
+    except:
+        stop_err( "Bad start column: %s" % ( str( sys.argv[5] ) ) )
+    try:
+        endCol = int( sys.argv[6] ) - 1
+    except:
+        stop_err( "Bad end column: %s" % ( str( sys.argv[6] ) ) )
+    try:
+        strandCol = int( sys.argv[7] )-1
+    except:
+        strandCol = -1
+    
+    line_count = 0
+    skipped_lines = 0
+    first_invalid_line = None
+    invalid_line = ''
+    try:
+        for line in in_file:
+            line_count += 1
+            line = line.rstrip( '\r\n' )
+            if line and not line.startswith( '#' ):
+                fields = line.split( '\t' )
+                try:
+                    chr, start, end = fields[chrCol], int( fields[startCol] ), int( fields[endCol] )
+                except:
+                    skipped_lines += 1
+                    if first_invalid_line is None:
+                        first_invalid_line = line_count
+                        invalid_line = line
+                    continue
+                label = "input_line_" + str( line_count ) #if input file type was known to be bed, then could guess at label column
+                
+                if strandCol < 0:
+                    strand = "+"
+                else:
+                    try:
+                        strand = fields[strandCol]
+                    except:
+                        strand = "+"
+                
+                # Find which partition it overlaps
+                overlap = 0
+                for name, score, bb in partitions:
+                    # Is there at least 1bp overlap?
+                    if chr in bb:
+                        overlap = bb[chr].count_range( start, end-start )
+                        if overlap > 0:
+                            break
+                else:
+                    # This else belongs to the for loop: it runs only when no
+                    # partition produced an overlap (no break).  Since the
+                    # partitions tile the ENCODE regions completely, this means
+                    # the interval does not even overlap an ENCODE region.
+                    warning = "warning: Interval (%s, %d, %d) does not overlap any partition" % ( chr, start, end ) + ", line[" + str( line_count ) + "]. "
+                    warnings.append( warning )
+                    name = "no_overlap"
+                    score = 0
+                # Annotate with the name of the partition
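+                # fraction of the interval covered by the chosen partition; true
+                # division is in effect via the __future__ import at the top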
+                frac_overlap = overlap / ( end-start )
+                # BED6 plus?
+                print >>out_file, "%s\t%d\t%d\t%s\t%s\t%s\t%s\t%0.4f" % ( chr, start, end, label, score, strand, name, frac_overlap )
+    except:
+        out_file.close()
+        in_file.close()
+        stop_err( "Unknown error while processing line # %d: %s" % ( line_count, line ) )
+    out_file.close()
+    in_file.close()
+
+    if warnings:
+        warn_msg = "This tool is useful on ENCODE regions only, %d warnings, 1st is: " % len( warnings )
+        warn_msg += warnings[0]
+        print warn_msg
+    if skipped_lines:
+        print "Skipped %d invalid lines starting at line # %d: %s" % ( skipped_lines, first_invalid_line, invalid_line )
+
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/evolution/add_scores.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/evolution/add_scores.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,106 @@
+<tool id="hgv_add_scores" name="phyloP" version="1.0.0">
+  <description>interspecies conservation scores</description>
+
+  <command>
+    add_scores $input1 ${input1.metadata.dbkey} ${input1.metadata.chromCol} ${input1.metadata.startCol} ${GALAXY_DATA_INDEX_DIR}/add_scores.loc $out_file1
+  </command>
+
+  <inputs>
+    <param format="interval" name="input1" type="data" label="Dataset">
+      <validator type="unspecified_build"/>
+      <validator type="dataset_metadata_in_file" filename="add_scores.loc" metadata_name="dbkey" metadata_column="0" message="Data is currently not available for the specified build."/>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="input" name="out_file1" />
+  </outputs>
+
+  <requirements>
+    <requirement type="package">add_scores</requirement>
+  </requirements>
+
+  <tests>
+    <test>
+      <param name="input1" value="add_scores_input1.interval" ftype="interval" dbkey="hg18" />
+      <output name="output" file="add_scores_output1.interval" />
+    </test>
+    <test>
+      <param name="input1" value="add_scores_input2.bed" ftype="interval" dbkey="hg18" />
+      <output name="output" file="add_scores_output2.interval" />
+    </test>
+  </tests>
+
+  <help>
+.. class:: warningmark
+
+This currently works only for build hg18.
+
+-----
+
+**Dataset formats**
+
+The input can be any interval_ format dataset.  The output is also in interval format.
+(`Dataset missing?`_)
+
+.. _interval: ./static/formatHelp.html#interval
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+This tool adds a column that measures interspecies conservation at each SNP 
+position, using conservation scores for primates pre-computed by the 
+phyloP program.  PhyloP performs an exact P-value computation under a 
+continuous Markov substitution model. 
+
+The chromosome and start position
+are used to look up the scores, so if a larger interval is in the input,
+only the score for the first nucleotide is returned.
+
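+For illustration, a minimal sketch of the lookup rule described above, using a
+hypothetical in-memory score table (the real tool reads indexed score files
+listed in add_scores.loc)::
+
+    scores = { ( "chr22", 14440426 ): 0.509 }  # (chrom, start) -> phyloP score
+
+    def score_interval( chrom, start, end ):
+        # only the first nucleotide of the interval is scored
+        return scores.get( ( chrom, start ), "NA" )
+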
+-----
+
+**Example**
+
+- input file, with SNPs::
+
+    chr22  14440426  14440427  C/T
+    chr22  14494851  14494852  A/G
+    chr22  14494911  14494912  A/T
+    chr22  14550435  14550436  A/G
+    chr22  14611956  14611957  G/T
+    chr22  14612076  14612077  A/G
+    chr22  14668537  14668538  C
+    chr22  14668703  14668704  A/T
+    chr22  14668775  14668776  G
+    chr22  14680074  14680075  A/T
+    etc.
+
+- output file, showing conservation scores for primates::
+
+    chr22  14440426  14440427  C/T  0.509
+    chr22  14494851  14494852  A/G  0.427
+    chr22  14494911  14494912  A/T  NA
+    chr22  14550435  14550436  A/G  NA
+    chr22  14611956  14611957  G/T  -2.142
+    chr22  14612076  14612077  A/G  0.369
+    chr22  14668537  14668538  C    0.419
+    chr22  14668703  14668704  A/T  -1.462
+    chr22  14668775  14668776  G    0.470
+    chr22  14680074  14680075  A/T  0.303
+    etc.
+
+  "NA" means that the phyloP score was not available.
+
+-----
+
+**Reference**
+
+Siepel A, Pollard KS, Haussler D. (2006)
+New methods for detecting lineage-specific selection.
+In Proceedings of the 10th International Conference on Research in Computational
+Molecular Biology (RECOMB 2006), pp. 190-205.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/evolution/codingSnps.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/evolution/codingSnps.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,528 @@
+#!/usr/bin/perl -w
+use strict;
+
+#########################################################################
+#   codingSnps.pl
+#   This takes a bed file with the names being / separated nts
+#   and a gene bed file with cds start and stop.
+#   It then checks for changes in coding regions, reporting
+#   those that cause a frameshift or substitution in the amino acid.
+#########################################################################
+
+my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib
+if (!@ARGV or scalar @ARGV < 3) {
+   print "Usage: codingSnps.pl snps.bed genes.bed (/dir/*$seqFlag|Galaxy build= loc=) [chr=# start=# end=# snp=# keepColumns=1] > codingSnps.txt\n";
+   exit;
+}
+my $uniq = 0; #flag for whether want uniq positions
+my $syn = 0;  #flag for if want synonymous changes rather than non-syn
+my $keep = 0; #keep old columns and append new ones
+my $snpFile = shift @ARGV;
+my $geneFile = shift @ARGV;
+my $nibDir = shift @ARGV;  #2bit or nib, depending on flag above
+if ($nibDir eq 'Galaxy') { getGalaxyInfo(); }
+my $col0 = 0; #bed like columns in default positions
+my $col1 = 1;
+my $col2 = 2;
+my $col3 = 3;
+#column positions 1 based coming in (for Galaxy)
+foreach (@ARGV) {
+   if (/chr=(\d+)/) { $col0 = $1 -1; }
+   elsif (/start=(\d+)/) { $col1 = $1 -1; }
+   elsif (/end=(\d+)/) { $col2 = $1 -1; }
+   elsif (/snp=(\d+)/) { $col3 = $1 -1; }
+   elsif (/keepColumns=1/) { $keep = 1; }
+}
+if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) {
+   print STDERR "ERROR column numbers are given with origin 1\n";
+   exit 1;
+}
+my @genes; #bed lines for genes, sorted by chrom and start
+my %chrSt; #index in array where each chrom starts
+my %codon; #hash of codon amino acid conversions
+my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom
+my $ignoreN = 1; #skip N
+
+my %amb = (
+"R" => "A/G",
+"Y" => "C/T",
+"S" => "C/G",
+"W" => "A/T",
+"K" => "G/T",
+"M" => "A/C",
+"B" => "C/G/T",
+"D" => "A/G/T",
+"H" => "A/C/T",
+"V" => "A/C/G",
+"N" => "A/C/G/T"
+);
+fill_codon();
+open(FH, "cat $geneFile | sort -k1,1 -k2,2n |")
+   or die "Couldn't open and sort $geneFile, $!\n";
+my $i = 0;
+while(<FH>) {
+   chomp;
+   if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; }
+   push(@genes, "$_");
+   my @f = split(/\t/);
+   if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; }
+   $i++;
+}
+close FH or die "Couldn't close $geneFile, $!\n";
+
+if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; }
+
+#open snps sorted as well
+my $s1 = $col0 + 1; #sort order is origin 1
+my $s2 = $col1 + 1;
+open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |")
+   or die "Couldn't open and sort $snpFile, $!\n";
+$i = 0;
+my @g; #one gene's fields, reused across SNPs
+my %done;
+while(<FH>) {
+   chomp;
+   if (/^\s*#/) { next; } #comment
+   my @s = split(/\t/); #SNP fields
+   if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; }
+   my $size = $#s;
+   if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) {
+      print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n";
+      exit 1;
+   }
+   if ($s[$col1] =~ /\D/) {
+      print STDERR "ERROR the start point must be an integer not $s[$col1]\n";
+      exit 1;
+   }
+   if ($s[$col2] =~ /\D/) {
+      print STDERR "ERROR the end point must be an integer not $s[$col2]\n";
+      exit 1;
+   }
+   if ($s[$col3] eq 'N' && $ignoreN) { next; }
+   if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; }
+   if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row
+      $i = $chrSt{$s[$col0]};
+      @g = split(/\t/, $genes[$i]);
+      if (scalar @g < 12) {
+         print STDERR "ERROR the gene file must be the whole genes in BED format\n";
+         exit 1;
+      }
+   }elsif (!@g) {
+      next; #no gene for this chrom
+   }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new c
[... diff truncated in source ...]
+            $seq .= fetchSeqNib($chr, $s, $pos[0]);
+         }
+      }
+   }
+}
+
+sub fetchSeq2bit {
+   my $chr = shift;
+   my $st = shift;
+   my $end = shift;
+   my $strand = '+';
+   $st--; #change to UCSC numbering
+   open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or
+      die "Couldn't run twoBitToFa, $!\n";
+   my $seq = '';
+   while (<BIT>) {
+      chomp;
+      if (/^>/) { next; } #header
+      $seq .= uc($_);
+   }
+   close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n";
+   return $seq;
+}
+
+sub fetchSeqNib {
+   my $chr = shift;
+   my $st = shift;
+   my $end = shift;
+   my $strand = '+';
+   $st--; #change to UCSC numbering
+   open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n";
+   my $seq = '';
+   while (<NIB>) {
+      chomp;
+      if (/^>/) { next; } #header
+      $seq .= $_;
+   }
+   close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n";
+   return $seq;
+}
+
+sub compl {
+   my $nts = shift;
+   my $comp = '';
+   if (!$nts) { die "ERROR called compl with nts undefined"; }
+   foreach my $n (split(/ */, $nts)) {
+      if ($n eq 'A') { $comp .= 'T'; }
+      elsif ($n eq 'T') { $comp .= 'A'; }
+      elsif ($n eq 'C') { $comp .= 'G'; }
+      elsif ($n eq 'G') { $comp .= 'C'; }
+      elsif ($n eq 'N') { $comp .= 'N'; }
+      elsif ($n eq '-') { $comp .= '-'; } #deletion
+      else { $comp = undef; }
+   }
+   return $comp;
+}
+
+sub getaa {
+   my $nts = shift;  #in multiples of 3
+   my $aa = '';
+   my @n = split(/ */, $nts);
+   while (@n) {
+      my @t = splice(@n, 0, 3);
+      my $n = uc(join("", @t));
+      if (!exists $codon{$n}) { $aa .= 'N'; next; }
+      $aa .= $codon{$n};
+   }
+   return $aa;
+}
+
+sub fill_codon {
+$codon{GCA} = 'Ala';
+$codon{GCC} = 'Ala';
+$codon{GCG} = 'Ala';
+$codon{GCT} = 'Ala';
+$codon{CGG} = 'Arg';
+$codon{CGT} = 'Arg';
+$codon{CGC} = 'Arg';
+$codon{AGA} = 'Arg';
+$codon{AGG} = 'Arg';
+$codon{CGA} = 'Arg';
+$codon{AAC} = 'Asn';
+$codon{AAT} = 'Asn';
+$codon{GAC} = 'Asp';
+$codon{GAT} = 'Asp';
+$codon{TGC} = 'Cys';
+$codon{TGT} = 'Cys';
+$codon{CAG} = 'Gln';
+$codon{CAA} = 'Gln';
+$codon{GAA} = 'Glu';
+$codon{GAG} = 'Glu';
+$codon{GGG} = 'Gly';
+$codon{GGA} = 'Gly';
+$codon{GGC} = 'Gly';
+$codon{GGT} = 'Gly';
+$codon{CAC} = 'His';
+$codon{CAT} = 'His';
+$codon{ATA} = 'Ile';
+$codon{ATT} = 'Ile';
+$codon{ATC} = 'Ile';
+$codon{CTA} = 'Leu';
+$codon{CTC} = 'Leu';
+$codon{CTG} = 'Leu';
+$codon{CTT} = 'Leu';
+$codon{TTG} = 'Leu';
+$codon{TTA} = 'Leu';
+$codon{AAA} = 'Lys';
+$codon{AAG} = 'Lys';
+$codon{ATG} = 'Met';
+$codon{TTC} = 'Phe';
+$codon{TTT} = 'Phe';
+$codon{CCT} = 'Pro';
+$codon{CCA} = 'Pro';
+$codon{CCC} = 'Pro';
+$codon{CCG} = 'Pro';
+$codon{TCA} = 'Ser';
+$codon{AGC} = 'Ser';
+$codon{AGT} = 'Ser';
+$codon{TCC} = 'Ser';
+$codon{TCT} = 'Ser';
+$codon{TCG} = 'Ser';
+$codon{TGA} = 'Stop';
+$codon{TAG} = 'Stop';
+$codon{TAA} = 'Stop';
+$codon{ACT} = 'Thr';
+$codon{ACA} = 'Thr';
+$codon{ACC} = 'Thr';
+$codon{ACG} = 'Thr';
+$codon{TGG} = 'Trp';
+$codon{TAT} = 'Tyr';
+$codon{TAC} = 'Tyr';
+$codon{GTC} = 'Val';
+$codon{GTA} = 'Val';
+$codon{GTG} = 'Val';
+$codon{GTT} = 'Val';
+}
+
+sub getGalaxyInfo {
+   my $build;
+   my $locFile;
+   foreach (@ARGV) {
+      if (/build=(.*)/) { $build = $1; }
+      elsif (/loc=(.*)/) { $locFile = $1; }
+   }
+   if (!$build or !$locFile) {
+      print STDERR "ERROR missing build or locfile for Galaxy input\n";
+      exit 1;
+   }
+   # read $locFile to get $nibDir (ignoring comments)
+   open(LF, "< $locFile") || die "open($locFile): $!\n";
+   while(<LF>) {
+      s/#.*$//;
+      s/(?:^\s+|\s+$)//g;
+      next if (/^$/);
+
+      my @t = split(/\t/);
+      if ($t[0] eq $build) { $nibDir = $t[1]; }
+   }
+   close(LF);
+   if ($nibDir eq 'Galaxy') {
+      print STDERR "Failed to find sequence directory in locfile $locFile\n";
+   }
+   $nibDir .= "/$build.2bit";  #we want full path and filename
+}
diff -r 000000000000 -r 9071e359b9a3 tools/evolution/codingSnps.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/evolution/codingSnps.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,139 @@
+<tool id="hgv_codingSnps" name="aaChanges" version="1.0.0">
+  <description>amino-acid changes caused by a set of SNPs</description>
+
+  <command interpreter="perl">
+    codingSnps.pl $input1 $input2 Galaxy build=${input1.metadata.dbkey} loc=${GALAXY_DATA_INDEX_DIR}/codingSnps.loc chr=${input1.metadata.chromCol} start=${input1.metadata.startCol} end=${input1.metadata.endCol} snp=$col1 > $out_file1
+  </command>
+
+  <inputs>
+    <param format="interval" name="input1" type="data" label="SNP dataset">
+      <validator type="dataset_metadata_in_file" filename="codingSnps.loc" metadata_name="dbkey" metadata_column="0" message="Sequences are not currently available for the specified build." split="\t" />
+    </param>
+    <param name="col1" type="data_column" data_ref="input1" label="Column with SNPs" />
+    <param format="interval" name="input2" type="data" label="Gene dataset">
+      <validator type="dataset_metadata_in_file" filename="codingSnps.loc" metadata_name="dbkey" metadata_column="0" message="Sequences are not currently available for the specified build." split="\t" />
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+
+  <code file="codingSnps_filter.py"></code>
+
+  <requirements>
+    <requirement type="binary">cat</requirement>
+    <requirement type="binary">sort</requirement>
+    <requirement type="package">ucsc_tools</requirement>
+  </requirements>
+
+  <tests>
+    <test>
+      <param name="input1" ftype="interval" value="codingSnps_input1.interval" dbkey="hg18" />
+      <param name="col1" value="6" />
+      <param name="input2" ftype="interval" value="codingSnps_inputGenes1.bed" dbkey="hg18" />
+      <output name="output" file="codingSnps_output1.interval" />
+    </test>
+    <test>
+      <param name="input1" ftype="interval" value="codingSnps_input2.interval" dbkey="hg18" />
+      <param name="input2" ftype="interval" value="codingSnps_inputGenes2.bed" dbkey="hg18" />
+      <param name="col1" value="4" />
+      <output name="output" file="codingSnps_output2.interval" />
+    </test>
+  </tests>
+
+  <help>
+.. class:: infomark
+
+The build must be defined for the input files and must be the same for both files.
+Use the pencil icon to add the build to the files if necessary.
+
+-----
+
+**Dataset formats**
+
+The SNP dataset is in interval_ format, with a column of SNPs as described below.
+The gene dataset is in BED_ format with 12 columns.  The output dataset is also interval.
+(`Dataset missing?`_)
+
+.. _interval: ./static/formatHelp.html#interval
+.. _BED: ./static/formatHelp.html#bed
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+This tool identifies which SNPs create amino-acid changes in the specified
+coding regions.  The first input file contains the SNPs and must be an
+interval file.  It needs the chromosome, start, and end positions as well as
+the SNP, which can be given using ambiguous-nucleotide symbols or a list of
+two to four alleles separated by '/'.  Any other columns in the first input
+file are not used but are kept for the output.  The second input file
+contains the genes to be used for defining the coding regions; it must be a
+BED file whose first 12 columns are the standard BED columns.  The output is
+the same as the first input file with several columns added: the name field
+from the matching line of the gene input file, the amino acids, the codon
+number, and the reference nucleotide that changed the amino acid.  The amino
+acids are listed with the reference amino acid first, then a colon, and then
+the amino acids for the alleles.  A SNP that is not in a coding region, or is
+synonymous, is not included in the output file.
+
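+The ambiguous-nucleotide symbols are expanded to allele lists using the %amb
+table in codingSnps.pl above; the same mapping, rendered in Python for
+reference::
+
+    IUPAC = { "R": "A/G",   "Y": "C/T",   "S": "C/G",   "W": "A/T",
+              "K": "G/T",   "M": "A/C",   "B": "C/G/T", "D": "A/G/T",
+              "H": "A/C/T", "V": "A/C/G", "N": "A/C/G/T" }
+
+    def expand( snp ):
+        # explicit allele lists such as "A/G" pass through unchanged
+        return IUPAC.get( snp, snp )
+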
+-----
+
+**Example**
+
+- first input file, with SNPs::
+
+    chr22  15660821  15660822  A/G
+    chr22  15825725  15825726  G/T
+    chr22  15827035  15827036  G
+    chr22  15827135  15827136  C/G
+    chr22  15830928  15830929  A/G
+    chr22  15830951  15830952  G
+    chr22  15830955  15830956  C/T
+    chr22  15848885  15848886  C/T
+    chr22  15849048  15849049  A/C
+    chr22  15919711  15919712  A/G
+    etc.
+
+  or, indicating polymorphisms using ambiguous-nucleotide symbols::
+
+    chr22  15660821  15660822  R
+    chr22  15825725  15825726  K
+    chr22  15827035  15827036  G
+    chr22  15827135  15827136  S
+    chr22  15830928  15830929  R
+    chr22  15830951  15830952  G
+    chr22  15830955  15830956  Y
+    chr22  15848885  15848886  Y
+    chr22  15849048  15849049  M
+    chr22  15919711  15919712  R
+    etc.
+
+- second input file, with UCSC annotations for human genes::
+
+    chr22  15688363  15690225  uc010gqr.1  0  +  15688363  15688363  0  2   587,794,  0,1068,
+    chr22  15822826  15869112  uc002zlw.1  0  -  15823622  15869004  0  10  940,105,97,91,265,86,251,208,304,282,  0,1788,2829,3241,4163,6361,8006,26023,29936,46004,
+    chr22  15826991  15869112  uc010gqs.1  0  -  15829218  15869004  0  5   1380,86,157,304,282,  0,2196,21858,25771,41839,
+    chr22  15897459  15919682  uc002zlx.1  0  +  15897459  15897459  0  4   775,128,103,1720,  0,8303,10754,20503,
+    chr22  15945848  15971389  uc002zly.1  0  +  15945981  15970710  0  13  271,25,147,113,127,48,164,84,85,12,102,42,2193,  0,12103,12838,13816,15396,17037,17180,18535,19767,20632,20894,22768,23348,
+    etc.
+
+- output file, showing non-synonymous substitutions in coding regions::
+
+    chr22  15825725  15825726  G/T  uc002zlw.1  Gln:Pro/Gln   469  T
+    chr22  15827035  15827036  G    uc002zlw.1  Glu:Asp       414  C
+    chr22  15827135  15827136  C/G  uc002zlw.1  Gly:Gly/Ala   381  C
+    chr22  15830928  15830929  A/G  uc002zlw.1  Ala:Ser/Pro   281  C
+    chr22  15830951  15830952  G    uc002zlw.1  Leu:Pro       273  A
+    chr22  15830955  15830956  C/T  uc002zlw.1  Ser:Gly/Ser   272  T
+    chr22  15848885  15848886  C/T  uc002zlw.1  Ser:Trp/Stop  217  G
+    chr22  15848885  15848886  C/T  uc010gqs.1  Ser:Trp/Stop  200  G
+    chr22  15849048  15849049  A/C  uc002zlw.1  Gly:Stop/Gly  163  C
+    etc.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/evolution/codingSnps_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/evolution/codingSnps_filter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+# runs after the job (and after the default post-filter)
+import os
+from galaxy import eggs
+from galaxy import jobs
+from galaxy.tools.parameters import DataToolParameter
+# Older py compatibility
+try:
+    set()
+except:
+    from sets import Set as set
+
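+# This module is attached to the aaChanges tool through the <code> tag in
+# codingSnps.xml; its validate_input hook checks the metadata and builds of
+# the selected datasets.
+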
+def validate_input( trans, error_map, param_values, page_param_map ):
+    dbkeys = set()
+    data_param_names = set()
+    data_params = 0
+    for name, param in page_param_map.iteritems():
+        if isinstance( param, DataToolParameter ):
+            # for each dataset parameter
+            if param_values.get(name, None) != None:
+                dbkeys.add( param_values[name].dbkey )
+                data_params += 1
+                # check meta data
+                try:
+                    param = param_values[name]
+                    startCol = int( param.metadata.startCol )
+                    endCol = int( param.metadata.endCol )
+                    chromCol = int( param.metadata.chromCol )
+                    if param.metadata.strandCol is not None:
+                        strandCol = int ( param.metadata.strandCol )
+                    else:
+                        strandCol = 0
+                except:
+                    error_msg = "The attributes of this dataset are not properly set. " + \
+                    "Click the pencil icon in the history item to set the chrom, start, end and strand columns."
+                    error_map[name] = error_msg
+            data_param_names.add( name )
+    if len( dbkeys ) > 1:
+        for name in data_param_names:
+            error_map[name] = "All datasets must belong to same genomic build, " \
+                "this dataset is linked to build '%s'" % param_values[name].dbkey
+    if data_params != len(data_param_names):
+        for name in data_param_names:
+            error_map[name] = "A dataset of the appropriate type is required"
diff -r 000000000000 -r 9071e359b9a3 tools/evolution/mutate_snp_codon.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/evolution/mutate_snp_codon.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+"""
+Script to mutate SNP codons.
+Dan Blankenberg
+"""
+
+import sys, string
+
+def strandify( fields, column ):
+    strand = '+'
+    if column >= 0 and column < len( fields ):
+        strand = fields[ column ]
+        if strand not in [ '+', '-' ]:
+            strand = '+'
+    return strand
+
+def main():
+    # parse command line
+    input_file = sys.argv[1]
+    out = open( sys.argv[2], 'wb+' )
+    codon_chrom_col = int( sys.argv[3] ) - 1
+    codon_start_col = int( sys.argv[4] ) - 1
+    codon_end_col = int( sys.argv[5] ) - 1
+    codon_strand_col = int( sys.argv[6] ) - 1
+    codon_seq_col = int( sys.argv[7] ) - 1
+    
+    snp_chrom_col = int( sys.argv[8] ) - 1
+    snp_start_col = int( sys.argv[9] ) - 1
+    snp_end_col = int( sys.argv[10] ) - 1
+    snp_strand_col = int( sys.argv[11] ) - 1
+    snp_observed_col = int( sys.argv[12] ) - 1
+    
+    max_field_index = max( codon_chrom_col, codon_start_col, codon_end_col, codon_strand_col, codon_seq_col, snp_chrom_col, snp_start_col, snp_end_col, snp_strand_col, snp_observed_col )
+    
+    DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
+    skipped_lines = 0
+    errors = {}
+    for name, message in [ ('max_field_index','not enough fields'), ( 'codon_len', 'codon length must be 3' ), ( 'codon_seq', 'codon sequence must have length 3' ), ( 'snp_len', 'SNP length must be 1' ), ( 'snp_observed', 'SNP observed values must have length 1' ), ( 'empty_comment', 'empty or comment'), ( 'no_overlap', 'codon and SNP do not overlap' ) ]:
+        errors[ name ] = { 'count':0, 'message':message }
+    line_count = 0
+    for line_count, line in enumerate( open( input_file ) ):
+        line = line.rstrip( '\n\r' )
+        if line and not line.startswith( '#' ):
+            fields = line.split( '\t' )
+            if max_field_index >= len( fields ):
+                skipped_lines += 1
+                errors[ 'max_field_index' ]['count'] += 1
+                continue
+            
+            #read codon info
+            codon_chrom = fields[codon_chrom_col]
+            codon_start = int( fields[codon_start_col] )
+            codon_end = int( fields[codon_end_col] )
+            if codon_end - codon_start != 3:
+                #codons must be length 3
+                skipped_lines += 1
+                errors[ 'codon_len' ]['count'] += 1
+                continue
+            codon_strand = strandify( fields, codon_strand_col )
+            codon_seq = fields[codon_seq_col].upper()
+            if len( codon_seq ) != 3:
+                #codon sequence must have length 3
+                skipped_lines += 1
+                errors[ 'codon_seq' ]['count'] += 1
+                continue
+            
+            #read snp info
+            snp_chrom = fields[snp_chrom_col]
+            snp_start = int( fields[snp_start_col] )
+            snp_end = int( fields[snp_end_col] )
+            if snp_end - snp_start != 1:
+                #snps must be length 1
+                skipped_lines += 1
+                errors[ 'snp_len' ]['count'] += 1
+                continue
+            snp_strand = strandify( fields, snp_strand_col )
+            snp_observed = fields[snp_observed_col].split( '/' )
+            snp_observed = [ observed for observed in snp_observed if len( observed ) == 1 ]
+            if not snp_observed:
+                #sequence replacements must be length 1
+                skipped_lines += 1
+                errors[ 'snp_observed' ]['count'] += 1
+                continue
+            
+            #Determine index of replacement for observed values into codon
+            offset = snp_start - codon_start
+            #DNA extracted from minus-strand codons has its positions reversed relative to interval positions; i.e. position 0 == position 2
+            if codon_strand == '-':
+                offset = 2 - offset
+            if offset < 0 or offset > 2: #assert offset >= 0 and offset <= 2, ValueError( 'Impossible offset determined: %s' % offset )
+                #codon and snp do not overlap
+                skipped_lines += 1
+                errors[ 'no_overlap' ]['count'] += 1
+                continue
+            
+            for observed in snp_observed:
+                if codon_strand != snp_strand:
+                    #if our SNP is on a different strand than our codon, take complement of provided observed SNP base
+                    observed = observed.translate( DNA_COMP )
+                snp_codon = [ char for char in codon_seq ]
+                snp_codon[offset] = observed.upper()
+                snp_codon = ''.join( snp_codon )
+                
+                if codon_seq != snp_codon: #only output when we actually have a different codon
+                    out.write( "%s\t%s\n" % ( line, snp_codon )  )
+        else:
+            skipped_lines += 1
+            errors[ 'empty_comment' ]['count'] += 1
+    if skipped_lines:
+        print "Skipped %i (%4.2f%%) of %i lines; reasons: %s" % ( skipped_lines, ( float( skipped_lines )/float( line_count ) ) * 100, line_count, ', '.join( [ "%s (%i)" % ( error['message'], error['count'] ) for error in errors.itervalues() if error['count'] ] ) )
+    
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/evolution/mutate_snp_codon.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/evolution/mutate_snp_codon.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,60 @@
+<tool id="mutate_snp_codon_1" name="Mutate Codons" version="1.0.0">
+  <description>with SNPs</description>
+  <command interpreter="python">mutate_snp_codon.py $input1 $output1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} $codon_seq_col $snp_chrom_col $snp_start_col $snp_end_col $snp_strand_col $snp_observed_col</command>
+  <inputs>
+    <param name="input1" type="data" format="interval" label="Interval file with joined SNPs" optional="False" help="The interval metadata for this file should be set for the codon positions."/>
+    <param name="codon_seq_col" label="Codon Sequence column" type="data_column" data_ref="input1" />
+    <param name="snp_chrom_col" label="SNP chromosome column" type="data_column" data_ref="input1" />
+    <param name="snp_start_col" label="SNP start column" type="data_column" data_ref="input1" />
+    <param name="snp_end_col" label="SNP end column" type="data_column" data_ref="input1" />
+    <param name="snp_strand_col" label="SNP strand column" type="data_column" data_ref="input1" />
+    <param name="snp_observed_col" label="SNP observed column" type="data_column" data_ref="input1" />
+  </inputs>
+  <outputs>
+    <data name="output1" format="interval" metadata_source="input1"/>
+  </outputs>
+   <tests>
+     <test>
+       <param name="input1" value="mutate_snp_codon_in.interval"/>
+       <param name="codon_seq_col" value="8"/>
+       <param name="snp_chrom_col" value="17"/>
+       <param name="snp_start_col" value="18"/>
+       <param name="snp_end_col" value="19"/>
+       <param name="snp_strand_col" value="22"/>
+       <param name="snp_observed_col" value="25"/>
+       <output name="output1" file="mutate_snp_codon_out.interval" />
+     </test>
+   </tests>
+  <help>
+This tool takes an interval file as input.  This input should contain a set of codon locations and the corresponding DNA sequence (such as from the *Extract Genomic DNA* tool) joined to SNP locations with observed values (such as *all fields from selected table* from the snp130 table of hg18 at the UCSC Table Browser).  The interval file should have the metadata (chromosome, start, end, strand) set to the columns containing the locations of the codons.  As tool input, the user specifies the columns containing the codon sequence as well as the genomic positions and observed values (values split by '/') of the SNP data; SNP positions and substitute sequences must have a length of exactly 1.  Only genomic intervals that yield a different sequence string are output.  All sequence characters are converted to uppercase during processing.
+  
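+  The heart of the tool is the replacement rule implemented in
+  mutate_snp_codon.py above; in outline::
+
+    offset = snp_start - codon_start           # SNP index within the codon
+    if codon_strand == '-':
+        offset = 2 - offset                    # minus-strand codons read reversed
+    if codon_strand != snp_strand:
+        observed = observed.translate( DNA_COMP )  # complement across strands
+    snp_codon = codon_seq[:offset] + observed.upper() + codon_seq[offset + 1:]
+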
+  For example, using these settings:
+  
+  * **metadata** **chromosome**, **start**, **end** and **strand** set to **1**, **2**, **3** and **6**, respectively
+  * **Codon Sequence column** set to **c8**
+  * **SNP chromosome column** set to **c17**
+  * **SNP start column** set to **c18**
+  * **SNP end column** set to **c19**
+  * **SNP strand column** set to **c22**
+  * **SNP observed column** set to **c25**
+  
+  with the following input::
+  
+    chr1 58995 58998 NM_001005484 0 + GAA GAA Glu GAA 1177632 28.96 0 2787607 0.422452662804 585 chr1 58996 58997 rs1638318 0 + A A A/G genomic single by-submitter 0 0 unknown exact 3
+    chr1 59289 59292 NM_001005484 0 + TTT TTT Phe TTT 714298 17.57 0 1538990 0.464134269878 585 chr1 59290 59291 rs71245814 0 + T T G/T genomic single unknown 0 0 unknown exact 3
+    chr1 59313 59316 NM_001005484 0 + AAG AAG Lys AAG 1295568 31.86 0 2289189 0.565950648898 585 chr1 59315 59316 rs2854682 0 - G G C/T genomic single by-submitter 0 0 unknown exact 3
+    chr1 59373 59376 NM_001005484 0 + ACA ACA Thr ACA 614523 15.11 0 2162384 0.284187729839 585 chr1 59373 59374 rs2691305 0 - A A C/T genomic single unknown 0 0 unknown exact 3
+    chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs2531266 0 + G G C/G genomic single by-submitter 0 0 unknown exact 3
+    chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs55874132 0 + G G C/G genomic single unknown 0 0 coding-synon exact 1
+  
+  
+  will produce::
+  
+    chr1 58995 58998 NM_001005484 0 + GAA GAA Glu GAA 1177632 28.96 0 2787607 0.422452662804 585 chr1 58996 58997 rs1638318 0 + A A A/G genomic single by-submitter 0 0 unknown exact 3 GGA
+    chr1 59289 59292 NM_001005484 0 + TTT TTT Phe TTT 714298 17.57 0 1538990 0.464134269878 585 chr1 59290 59291 rs71245814 0 + T T G/T genomic single unknown 0 0 unknown exact 3 TGT
+    chr1 59313 59316 NM_001005484 0 + AAG AAG Lys AAG 1295568 31.86 0 2289189 0.565950648898 585 chr1 59315 59316 rs2854682 0 - G G C/T genomic single by-submitter 0 0 unknown exact 3 AAA
+    chr1 59373 59376 NM_001005484 0 + ACA ACA Thr ACA 614523 15.11 0 2162384 0.284187729839 585 chr1 59373 59374 rs2691305 0 - A A C/T genomic single unknown 0 0 unknown exact 3 GCA
+    chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs2531266 0 + G G C/G genomic single by-submitter 0 0 unknown exact 3 GCC
+    chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs55874132 0 + G G C/G genomic single unknown 0 0 coding-synon exact 1 GCC
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/extract/extract_genomic_dna.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/extract/extract_genomic_dna.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,283 @@
+#!/usr/bin/env python
+"""
+usage: %prog $input $out_file1
+    -1, --cols=N,N,N,N: Columns for start, end, strand in input file
+    -d, --dbkey=N: Genome build of input file
+    -o, --output_format=N: the data type of the output file
+    -g, --GALAXY_DATA_INDEX_DIR=N: the directory containing alignseq.loc
+    -I, --interpret_features: if true, complete features are interpreted when input is GFF
+    -F, --fasta=<genomic_sequences>: genomic sequences to use for extraction
+    -G, --gff: input and output file, when it is interval, coordinates are treated as GFF format (1-based, half-open) rather than 'traditional' 0-based, closed format.
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, string, os, re, tempfile, subprocess
+from bx.cookbook import doc_optparse
+from bx.intervals.io import Header, Comment
+import bx.seq.nib
+import bx.seq.twobit
+from galaxy.tools.util.galaxyops import *
+from galaxy.datatypes.util import gff_util
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def reverse_complement( s ):
+    complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" }
+    reversed_s = []
+    for i in s:
+        reversed_s.append( complement_dna[i] )
+    reversed_s.reverse()
+    return "".join( reversed_s )
+
+def check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR ):
+    seq_file = "%s/alignseq.loc" % GALAXY_DATA_INDEX_DIR
+    seq_path = ''
+    for line in open( seq_file ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( "#" ) and line.startswith( 'seq' ):
+            fields = line.split( '\t' )
+            if len( fields ) < 3:
+                continue
+            if fields[1] == dbkey:
+                seq_path = fields[2].strip()
+                break
+    return seq_path
+
+def __main__():
+    #
+    # Parse options, args.
+    #
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
+        dbkey = options.dbkey
+        output_format = options.output_format
+        gff_format = options.gff
+        interpret_features = options.interpret_features
+        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
+        fasta_file = options.fasta
+        input_filename, output_filename = args
+    except:
+        doc_optparse.exception()
+
+    includes_strand_col = strand_col >= 0
+    strand = None
+    nibs = {}
+    twobits = {}
+
+    #
+    # Set path to sequence data.
+    #
+    if fasta_file:
+        # Need to create 2bit file from fasta file.
+        try:
+            seq_path = tempfile.NamedTemporaryFile( dir="." ).name
+            cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path )
+
+            tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
+            tmp_stderr = open( tmp_name, 'wb' )
+            proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
+            returncode = proc.wait()
+            tmp_stderr.close()
+
+            # Get stderr, allowing for case where it's very large.
+            tmp_stderr = open( tmp_name, 'rb' )
+            stderr = ''
+            buffsize = 1048576
+            try:
+                while True:
+                    stderr += tmp_stderr.read( buffsize )
+                    if not stderr or len( stderr ) % buffsize != 0:
+                        break
+            except OverflowError:
+                pass
+            tmp_stderr.close()
+
+            # Error checking.
+            if returncode != 0:
+                raise Exception, stderr
+        except Exception, e:
+            stop_err( 'Error running faToTwoBit. ' + str( e ) )
+    else:
+        seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR )
+        if not os.path.exists( seq_path ):
+            # If this occ
[... diff truncated in source ...]
+                    first_invalid_line = line_count
+                skipped_lines += len( invalid_lines )
+                continue
+        elif seq_path and os.path.isfile( seq_path ):
+            if not(twobitfile):
+                twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )
+            try:
+                if options.gff and interpret_features:
+                    # Create sequence from intervals within a feature.
+                    sequence = ''
+                    for interval in feature.intervals:
+                        sequence += twobitfile[interval.chrom][interval.start:interval.end]
+                else:
+                    sequence = twobitfile[chrom][start:end]
+            except:
+                warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " %( start, end-start, chrom )
+                warnings.append( warning )
+                if not invalid_lines:
+                    invalid_lines = get_lines( feature )
+                    first_invalid_line = line_count
+                skipped_lines += len( invalid_lines )
+                continue
+        else:
+            warning = "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey )
+            warnings.append( warning )
+            if not invalid_lines:
+                invalid_lines = get_lines( feature )
+                first_invalid_line = line_count
+            skipped_lines += len( invalid_lines )
+            continue
+        if sequence == '':
+            warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " \
+                        % ( chrom, start, end, dbkey )
+            warnings.append( warning )
+            if not invalid_lines:
+                invalid_lines = get_lines( feature )
+                first_invalid_line = line_count
+            skipped_lines += len( invalid_lines )
+            continue
+        if includes_strand_col and strand == "-":
+            sequence = reverse_complement( sequence )
+
+        if output_format == "fasta" :
+            l = len( sequence )
+            c = 0
+            if gff_format:
+                start, end = gff_util.convert_bed_coords_to_gff( [ start, end ] )
+            fields = [dbkey, str( chrom ), str( start ), str( end ), strand]
+            meta_data = "_".join( fields )
+            fout.write( ">%s\n" % meta_data )
+            while c < l:
+                b = min( c + 50, l )
+                fout.write( "%s\n" % str( sequence[c:b] ) )
+                c = b
+        else: # output_format == "interval"
+            if gff_format and interpret_features:
+                # TODO: need better GFF Reader to capture all information needed
+                # to produce this line.
+                meta_data = "\t".join(
+                                [feature.chrom, "galaxy_extract_genomic_dna", "interval", \
+                                 str( feature.start ), str( feature.end ), feature.score, feature.strand,
+                                 ".", gff_util.gff_attributes_to_str( feature.attributes, "GTF" ) ] )
+            else:
+                meta_data = "\t".join( fields )
+            if gff_format:
+                format_str = "%s seq \"%s\";\n"
+            else:
+                format_str = "%s\t%s\n"
+            fout.write( format_str % ( meta_data, str( sequence ) ) )
+
+        # Update line count.
+        if isinstance( feature, gff_util.GFFFeature ):
+            line_count += len( feature.intervals )
+        else:
+            line_count += 1
+
+    fout.close()
+
+    if warnings:
+        warn_msg = "%d warnings, 1st is: " % len( warnings )
+        warn_msg += warnings[0]
+        print warn_msg
+    if skipped_lines:
+        # Error message includes up to the first 10 skipped lines.
+        print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, '\n'.join( invalid_lines[:10] ) )
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/extract/extract_genomic_dna.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/extract/extract_genomic_dna.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,174 @@
+<tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="2.2.2">
+  <description>using coordinates from assembled/unassembled genomes</description>
+  <command interpreter="python">
+      extract_genomic_dna.py $input $out_file1 -o $out_format -d $dbkey 
+      
+      #if str( $interpret_features ) == "yes":
+        -I
+      #end if
+      
+      ## Columns to use in input file.
+      #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+        -1 1,4,5,7 --gff
+      #else:
+        -1 ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol}
+      #end if
+            
+      #if $seq_source.index_source == "cached":
+        ## Genomic data from cache.
+        -g ${GALAXY_DATA_INDEX_DIR}
+      #else:
+        ## Genomic data from history.
+        -F $seq_source.ref_file
+      #end if
+  </command>
+  <inputs>
+      <param format="interval,gff" name="input" type="data" label="Fetch sequences for intervals in"/>
+      <param name="interpret_features" type="select" label="Interpret features when possible" help="Only meaningful for GFF, GTF datasets.">
+          <option value="yes">Yes</option>
+          <option value="no">No</option>
+      </param>
+      <conditional name="seq_source">
+          <param name="index_source" type="select" label="Source for Genomic Data">
+              <option value="cached">Locally cached</option>
+              <option value="history">History</option>
+          </param>
+          <when value="cached">
+          </when>
+          <when value="history">
+              <param name="ref_file" type="data" format="fasta" label="Using reference file" />
+          </when>
+      </conditional>
+   <param name="out_format" type="select" label="Output data type">
+       <option value="fasta">FASTA</option>
+       <option value="interval">Interval</option>
+   </param>
+  </inputs>
+  <outputs>
+      <data format="input" name="out_file1" metadata_source="input">
+          <change_format>
+              <when input="out_format" value="fasta" format="fasta" />
+          </change_format>
+      </data>
+  </outputs>
+  <requirements>
+      <requirement type="binary">faToTwoBit</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
+      <param name="interpret_features" value="yes"/>
+      <param name="index_source" value="cached"/>
+      <param name="out_format" value="fasta"/>   
+      <output name="out_file1" file="extract_genomic_dna_out1.fasta" />
+    </test>
+    <test>
+      <param name="input" value="droPer1.bed" dbkey="droPer1" ftype="bed" />
+      <param name="interpret_features" value="yes"/>
+      <param name="index_source" value="cached"/>
+      <param name="out_format" value="fasta"/>
+      <output name="out_file1" file="extract_genomic_dna_out2.fasta" />
+    </test>
+    <test>
+      <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
+      <param name="interpret_features" value="yes"/>
+      <param name="index_source" value="cached"/>
+      <param name="out_format" value="interval"/>
+      <output name="out_file1" file="extract_genomic_dna_out3.interval" />
+    </test>
+    <!-- Test GFF file support. -->
+    <test>
+      <param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
+      <param name="interpret_features" value="no"/>
+      <param name="index_source" value="cached"/>
+      <param name="out_format" value="interval"/>
+      <output name="out_file1" file="extract_genomic_dna_out4.gff" />
+    </test>
+    <test>
+      <param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
+      <param name="interpret_features" value="no"/>
+      <param name="out_format" value="fasta"/>
+      <param name="index_source" value="cached"/>
+      <output name="out_file1" file="extract_genomic_dna_out5.fasta" />
+    </test>
+    <!-- Test custom sequences support and GFF feature interpretation. -->
+    <test>
+      <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
+      <param name="interpret_features" value="no"/>
+      <param name="index_source" value="history"/>
+      <param name="ref_file" value="tophat_in1.fasta"/>
+      <param name="out_format" value="fasta"/>
+      <output name="out_file1" file="extract_genomic_dna_out6.fasta" />
+    </test>
+    <test>
+      <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
+      <param name="interpret_features" value="yes"/>
+      <param name="index_source" value="history"/>
+      <param name="ref_file" value="tophat_in1.fasta"/>
+      <param name="out_format" value="fasta"/>
+      <output name="out_file1" file="extract_genomic_dna_out7.fasta" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+This tool requires interval or GFF data (special tab-delimited formats).  If your data is not TAB delimited, first use *Text Manipulation-&gt;Convert*.
+
+.. class:: warningmark
+
+Make sure that the genome build is specified for the dataset from which you are extracting sequences (click the pencil icon in the history item if it is not specified). 
+
+.. class:: warningmark
+
+All of the following will cause a line from the input dataset to be skipped and a warning generated.  The number of warnings and skipped lines is documented in the resulting history item.
+ - Any line that does not contain at least 3 columns: a chromosome and numerical start and end coordinates.
+ - Sequences that fall outside the range of a line's start and end coordinates.
+ - Chromosome, start, or end coordinates that are invalid for the specified build.
+ - Any line whose data columns are not separated by a **TAB** character (other whitespace characters are invalid).
+
+.. class:: infomark
+
+ **Extracting genomic DNA using coordinates from ASSEMBLED and UNassembled genomes** was previously handled by two separate tools.
+
+-----
+
+**What it does**
+
+This tool uses coordinate, strand, and build information to fetch genomic DNAs in FASTA or interval format.
+
+If strand is not defined, the default value is "+".
+
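+Under the hood, sequence is sliced out of 2bit (or nib) files with bx-python,
+as in extract_genomic_dna.py above; a minimal sketch of the 2bit path
+(hypothetical file name)::
+
+    import bx.seq.twobit
+
+    twobit = bx.seq.twobit.TwoBitFile( open( "hg17.2bit", "rb" ) )
+    sequence = twobit["chr7"][127475281:127475310]  # 0-based, half-open
+    # minus-strand intervals are reverse-complemented before output
+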
+-----
+
+**Example**
+
+If the input dataset is::
+
+    chr7  127475281  127475310  NM_000230  0  +
+    chr7  127485994  127486166  NM_000230  0  +
+    chr7  127486011  127486166  D49487     0  +
+
+Extracting sequences with **FASTA** output data type returns::
+
+    &gt;hg17_chr7_127475281_127475310_+
+    GTAGGAATCGCAGCGCCAGCGGTTGCAAG
+    &gt;hg17_chr7_127485994_127486166_+
+    GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCG
+    GATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATC
+    CAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAG
+    GATCAATGACATTTCACACACG
+    &gt;hg17_chr7_127486011_127486166_+
+    TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGG
+    CCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGA
+    CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC
+    ACACG
+
+Extracting sequences with **Interval** output data type returns::
+
+    chr7    127475281       127475310       NM_000230       0       +       GTAGGAATCGCAGCGCCAGCGGTTGCAAG
+    chr7    127485994       127486166       NM_000230       0       +       GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
+    chr7    127486011       127486166       D49487  0       +       TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/extract/liftOver_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/extract/liftOver_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+Converts coordinates from one build/assembly to another using liftOver binary and mapping files downloaded from UCSC.
+"""
+
+import os, string, subprocess, sys
+import tempfile
+import re
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def safe_bed_file(infile):
+    """Make a BED file with track and browser lines ready for liftOver.
+
+    liftOver will fail with track or browser lines. We can make it happy
+    by converting these to comments. See:
+
+    https://lists.soe.ucsc.edu/pipermail/genome/2007-May/013561.html
+    """
+    fix_pat = re.compile("^(track|browser)")
+    (fd, fname) = tempfile.mkstemp()
+    in_handle = open(infile)
+    out_handle = open(fname, "w")
+    for line in in_handle:
+        if fix_pat.match(line):
+            line = "#" + line
+        out_handle.write(line)
+    in_handle.close()
+    out_handle.close()
+    return fname
+    
+if len( sys.argv ) < 9:
+    stop_err( "USAGE: prog input out_file1 out_file2 input_dbkey output_dbkey infile_type minMatch multiple <minChainT> <minChainQ> <minSizeQ>" )
+
+infile = sys.argv[1]
+outfile1 = sys.argv[2]
+outfile2 = sys.argv[3]
+in_dbkey = sys.argv[4]
+mapfilepath = sys.argv[5]
+infile_type = sys.argv[6]
+gff_option = ""
+if infile_type == "gff":
+    gff_option = "-gff "
+minMatch = sys.argv[7]
+multiple = int(sys.argv[8])
+multiple_option = ""
+if multiple:
+    minChainT = sys.argv[9]
+    minChainQ = sys.argv[10]
+    minSizeQ = sys.argv[11]
+    multiple_option = " -multiple -minChainT=%s -minChainQ=%s -minSizeQ=%s " %(minChainT,minChainQ,minSizeQ)
+
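+# fall back to minMatch=0.1 when the supplied value is missing, zero, or not a number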
+try:
+    assert float(minMatch)
+except:
+    minMatch = 0.1
+#ensure dbkey is set
+if in_dbkey == "?": 
+    stop_err( "Input dataset genome build unspecified, click the pencil icon in the history item to specify it." )
+
+if not os.path.isfile( mapfilepath ):
+    stop_err( "%s mapping is not currently available."  % ( mapfilepath.split('/')[-1].split('.')[0] ) )
+
+safe_infile = safe_bed_file(infile)
+cmd_line = "liftOver " + gff_option + "-minMatch=" + str(minMatch) + multiple_option + " "  + safe_infile + " " + mapfilepath + " " + outfile1 + " " + outfile2 + "  > /dev/null"
+
+try:
+    # have to nest try-except in try-finally to handle 2.4
+    try:
+        proc = subprocess.Popen( args=cmd_line, shell=True, stderr=subprocess.PIPE )
+        returncode = proc.wait()
+        stderr = proc.stderr.read()
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        raise Exception, 'Exception caught attempting conversion: ' + str( e )
+finally:
+    os.remove(safe_infile)
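
A quick demonstration of the safe_bed_file() helper above; a minimal sketch, assuming the function is available in the session and using an invented file name::

    # Write a BED file whose first line would normally make liftOver fail.
    handle = open( "demo.bed", "w" )
    handle.write( "track name=demo\nchr1\t100\t200\tfeature1\n" )
    handle.close()
    # safe_bed_file() copies it to a temp file with that line commented out.
    fixed = safe_bed_file( "demo.bed" )
    print open( fixed ).read()
    # #track name=demo
    # chr1	100	200	feature1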
diff -r 000000000000 -r 9071e359b9a3 tools/extract/liftOver_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/extract/liftOver_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,145 @@
+<tool id="liftOver1" name="Convert genome coordinates" version="1.0.3">
+  <description> between assemblies and genomes</description>
+  <command interpreter="python">
+  liftOver_wrapper.py 
+  $input 
+  "$out_file1" 
+  "$out_file2" 
+  $dbkey 
+  $to_dbkey 
+  #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__) or isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gtf').__class__):
+        "gff"
+  #else:
+        "interval"
+  #end if
+  $minMatch ${multiple.choice} ${multiple.minChainT} ${multiple.minChainQ} ${multiple.minSizeQ}
+  </command>
+  <inputs>
+    <param format="interval,gff,gtf" name="input" type="data" label="Convert coordinates of">
+      <validator type="unspecified_build" />
+      <validator type="dataset_metadata_in_file" filename="liftOver.loc" metadata_name="dbkey" metadata_column="0" message="Liftover mappings are currently not available for the specified build." />
+    </param>
+    <param name="to_dbkey" type="select" label="To">
+      <options from_file="liftOver.loc">
+        <column name="name" index="1"/>
+        <column name="value" index="2"/>
+        <column name="dbkey" index="0"/>
+        <filter type="data_meta" ref="input" key="dbkey" column="0" />
+      </options>
+    </param> 
+    <param name="minMatch" size="10" type="float" value="0.95" label="Minimum ratio of bases that must remap" help="Recommended values: same species = 0.95, different species = 0.10" />
+    <conditional name="multiple">
+     <param name="choice" type="select" label="Allow multiple output regions?" help="Recommended values: same species = No, different species = Yes">
+      <option value="0" selected="true">No</option>
+      <option value="1">Yes</option>
+ </param>
+ <when value="0">
+     <param name="minSizeQ" type="hidden" value="0" />
+     <param name="minChainQ" type="hidden" value="0" />
+     <param name="minChainT" type="hidden" value="0" />
+     </when>
+     <when value="1">
+         <param name="minSizeQ" size="10" type="integer" value="0" label="Minimum matching region size in query" help="Recommended value: set to >= 300 bases for complete transcripts"/>
+     <param name="minChainQ" size="10" type="integer" value="500" label="Minimum chain size in query"/>
+     <param name="minChainT" size="10" type="integer" value="500" label="Minimum chain size in target"/>
+     </when>
+ </conditional>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" label="${tool.name} on ${on_string} [ MAPPED COORDINATES ]">
+      <actions>
+        <action type="metadata" name="dbkey">
+          <option type="from_file" name="liftOver.loc" column="1" offset="0">
+            <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
+            <filter type="param_value" ref="to_dbkey" column="2"/>
+          </option>
+        </action>
+      </actions>
+    </data>
+    <data format="input" name="out_file2" label="${tool.name} on ${on_string} [ UNMAPPED COORDINATES ]" />
+  </outputs>
+  <requirements>
+    <requirement type="package">ucsc_tools</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input" value="5.bed" dbkey="hg18" ftype="bed" />
+      <param name="to_dbkey" value="panTro2" />
+      <param name="minMatch" value="0.95" />
+      <param name="choice" value="0" />
+      <output name="out_file1" file="5_liftover_mapped.bed"/>
+      <output name="out_file2" file="5_liftover_unmapped.bed"/>
+    </test>
+    <test>
+      <param name="input" value="5.bed" dbkey="hg18" ftype="bed" />
+      <param name="to_dbkey" value="panTro2" />
+      <param name="minMatch" value="0.10" />
+      <param name="choice" value="1" />
+      <param name="minSizeQ" value="0" />
+      <param name="minChainQ" value="500" />
+      <param name="minChainT" value="500" />
+      <output name="out_file1" file="5_mult_liftover_mapped.bed"/>
+      <output name="out_file2" file="5_mult_liftover_unmapped.bed"/>
+    </test>
+    <test>
+      <param name="input" value="cuffcompare_in1.gtf" dbkey="hg18" ftype="gtf" />
+      <param name="to_dbkey" value="panTro2" />
+      <param name="minMatch" value="0.95" />
+      <param name="choice" value="0" />
+      <output name="out_file1" file="cuffcompare_in1_liftover_mapped.bed"/>
+      <output name="out_file2" file="cuffcompare_in1_liftover_unmapped.bed"/>
+    </test>
+    <test>
+      <param name="input" value="cuffcompare_in1.gtf" dbkey="hg18" ftype="gtf" />
+      <param name="to_dbkey" value="panTro2" />
+      <param name="minMatch" value="0.10" />
+      <param name="choice" value="1" />
+      <param name="minSizeQ" value="0" />
+      <param name="minChainQ" value="500" />
+      <param name="minChainT" value="500" />
+      <output name="out_file1" file="cuffcompare_in1_mult_liftover_mapped.bed"/>
+      <output name="out_file2" file="cuffcompare_in1_mult_liftover_unmapped.bed"/>
+    </test>
+  </tests>
+  <help>
+.. class:: warningmark
+
+Make sure that the genome build of the input dataset is specified (click the pencil icon in the history item to set it if necessary).
+
+.. class:: warningmark
+
+This tool works with interval, GFF, and GTF datasets. Interval datasets must have the chromosome in column 1,
+the start coordinate in column 2, and the end coordinate in column 3. BED comments
+and track and browser lines are ignored, but if any other non-interval lines
+are present the tool returns empty output datasets.
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool is based on the LiftOver utility and Chain track from `the UC Santa Cruz Genome Browser`__.
+
+It converts coordinates and annotations between assemblies and genomes, producing two files: one containing all of the coordinates that could be mapped, and one containing the unmapped coordinates, if any.
+
+ .. __: http://genome.ucsc.edu/
+
+-----
+
+**Example**
+
+Converting the following hg16 intervals to hg18 intervals::
+
+    chrX  85170   112199  AK002185  0  +
+    chrX  110458  112199  AK097346  0  +
+    chrX  112203  121212  AK074528  0  -
+
+will produce the following hg18 intervals::
+
+    chrX  132991  160020  AK002185  0  +
+    chrX  158279  160020  AK097346  0  +
+    chrX  160024  169033  AK074528  0  -
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/extract/phastOdds/get_scores_galaxy.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/extract/phastOdds/get_scores_galaxy.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+
+"""
+usage: %prog data_file.h5 region_mapping.bed in_file out_file chrom_col start_col end_col [options]
+   -p, --perCol: standardize to lod per column
+"""
+
+from __future__ import division
+
+import sys
+from galaxy import eggs
+from numpy import *
+from tables import *
+
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+from bx import intervals
+
+# ignore warnings about NumArray flavor
+from warnings import filterwarnings
+from tables.exceptions import FlavorWarning
+filterwarnings("ignore", category=FlavorWarning)
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def main():
+    # Parse command line
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        h5_fname = args[0]
+        mapping_fname = args[1]
+        in_fname = args[2]
+        out_fname = args[3]
+        chrom_col, start_col, end_col = map( lambda x: int( x ) - 1, args[4:7] )
+        per_col = bool( options.perCol )
+    except Exception, e:
+        doc_optparse.exception()
+        
+    if h5_fname == 'None.h5':
+        stop_err( 'Invalid genome build, this tool currently only works with data from build hg17.  Click the pencil icon in your history item to correct the build if appropriate.' )
+        
+    # Open the h5 file
+    h5 = openFile( h5_fname, mode = "r" )
+    # Load intervals and names for the subregions
+    intersecters = {}
+    for i, line in enumerate( file( mapping_fname ) ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ):
+            chr, start, end, name = line.split()[0:4]
+            if not intersecters.has_key( chr ): 
+                intersecters[ chr ] = intervals.Intersecter()
+            intersecters[ chr ].add_interval( intervals.Interval( int( start ), int( end ), name ) )
+
+    # Find the subregion containing each input interval
+    skipped_lines = 0
+    first_invalid_line = 0
+    invalid_line = ''
+    out_file = open( out_fname, "w" )
+    warnings = []
+    warning = ''
+    for i, line in enumerate( file( in_fname ) ):
+        line = line.rstrip( '\r\n' )
+        if line.startswith( '#' ):
+            if i == 0:
+                out_file.write( "%s\tscore\n" % line )
+            else:
+                out_file.write( "%s\n" % line )
+        fields = line.split( "\t" )
+        try:
+            chr = fields[ chrom_col ]
+            start = int( fields[ start_col ] )
+            end = int( fields[ end_col ] )
+        except:
+            warning = "Invalid value for chrom, start or end column."
+            warnings.append( warning )
+            skipped_lines += 1
+            if not invalid_line:
+                first_invalid_line = i + 1
+                invalid_line = line
+            continue
+        # Find matching interval
+        try:
+            matches = intersecters[ chr ].find( start, end )
+        except:
+            warning = "'%s' is not a valid chrom value for the region. " %chr
+            warnings.append( warning )
+            skipped_lines += 1
+            if not invalid_line:
+                first_invalid_line = i + 1
+                invalid_line = line
+            continue
+        if not len( matches ) == 1:
+            warning = "Interval must match exactly one target region. "
+            warnings.append( warning )
+            skipped_lines += 1
+            if not invalid_line:
+                first_invalid_line = i + 1
+                invalid_line = line
+            continue
+        region = matches[0]
+        if not ( start >= region.start and end <= region.end ):
+            warning = "Interval must fall entirely within region. "
+            warnings.append( warning )
+            skipped_lines += 1
+            if not invalid_line:
+                first_invalid_line = i + 1
+                invalid_line = line
+            continue
+        region_name = region.value
+        rel_start = start - region.start
+        rel_end = end - region.start
+        if not rel_start < rel_end:
+            warning = "Region %s is empty, relative start:%d, relative end:%d. " % ( region_name, rel_start, rel_end )
+            warnings.append( warning )
+            skipped_lines += 1
+            if not invalid_line:
+                first_invalid_line = i + 1
+                invalid_line = line
+            continue
+        s = h5.getNode( h5.root, "scores_" + region_name )
+        c = h5.getNode( h5.root, "counts_" + region_name )
+        score = s[rel_end-1]
+        count = c[rel_end-1]
+        if rel_start > 0:
+            score -= s[rel_start-1]
+            count -= c[rel_start-1]
+        if per_col: 
+            score /= count
+        fields.append( str( score ) )
+        out_file.write( "%s\n" % "\t".join( fields ) )
+    # Close the file handle
+    h5.close()
+    out_file.close()
+
+    if warnings:
+        warn_msg = "PhastOdds scores are only available for ENCODE regions. %d warnings, 1st is: " % len( warnings )
+        warn_msg += warnings[0]
+        print warn_msg
+    if skipped_lines:
+        print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/extract/phastOdds/phastOdds_tool.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/extract/phastOdds/phastOdds_tool.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,67 @@
+<tool id="phastOdds_for_intervals" name="Compute phastOdds score" version="1.0.0">
+  <description>for each interval</description>
+  <command interpreter="python">get_scores_galaxy.py $per_col ${score_file}.h5 ${score_file}.mapping.bed $input $output ${input.metadata.chromCol} ${input.metadata.startCol} ${input.metadata.endCol}</command>
+  <inputs>
+    <param format="interval" name="input" type="data" label="Interval file">
+      <validator type="unspecified_build" message="Unspecified build, this tool works with data from genome builds hg17. Click the pencil icon in your history item to set the genome build."/>
+      <validator type="dataset_metadata_in_file" filename="phastOdds.loc" metadata_name="dbkey" metadata_column="0" message="Sequences are currently unavailable for the specified build." />
+    </param>
+    <param name="score_file" type="select" label="Available datasets">
+      <options from_file="phastOdds.loc">
+        <column name="name" index="1"/>
+        <column name="value" index="2"/>
+        <column name="dbkey" index="0"/>
+        <filter type="data_meta" ref="input" key="dbkey" column="0" />
+      </options>
+    </param>
+ <param name="per_col" type="boolean" label="Standardize" help="Standardizes the score to be per alignment column" checked="yes" truevalue="-p" falsevalue=""/>
+  </inputs>
+  <outputs>
+    <data format="interval" name="output" metadata_source="input"/>
+  </outputs>
+  <requirements>
+    <requirement type="python-module">numpy</requirement>
+    <requirement type="python-module">tables</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input" value="4.bed" dbkey="hg17" ftype="bed"/>
+      <param name="score_file" value="/galaxy/data/phastOdds_precomputed/encode_SEP-2005_tba.v2_phastOdds" />
+      <param name="per_col" value="true" />
+      <output name="output" file="phastOdds_tool_out.interval" />
+    </test>
+  </tests>
+  <help>
+    
+.. class:: warningmark
+
+This tool currently only works with interval data from genome build hg17.
+
+.. class:: warningmark
+
+This tool assumes that the input dataset is in interval format and contains at least a chrom column, a start column and an end column.  These 3 columns can be dispersed throughout any number of other data columns. 
+
+-----
+
+**Syntax**
+
+Appends to each line of an interval file a column containing the phastOdds score for that interval.
+
+-----
+
+**Example**
+
+If your original data has the following format:
+
++-----+-----+---+
+|chrom|start|end|
++-----+-----+---+
+
+and you choose to compute phastOdds scores, your output will look like this:
+
++-----+-----+---+-----+
+|chrom|start|end|score|
++-----+-----+---+-----+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fasta_tools/fasta_compute_length.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fasta_tools/fasta_compute_length.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+"""
+Input: fasta, int
+Output: tabular
+Return titles with lengths of corresponding seq
+"""
+
+import sys, os
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+    
+    infile = sys.argv[1]
+    out = open( sys.argv[2], 'w')
+    keep_first_char = int( sys.argv[3] )
+
+    fasta_title = ''
+    seq_len = 0
+
+    # number of char to keep in the title
+    if keep_first_char == 0:
+        keep_first_char = None
+    else:
+        keep_first_char += 1
+
+    first_entry = True
+
+    for line in open( infile ):
+        line = line.strip()
+        if not line or line.startswith( '#' ):
+            continue
+        if line[0] == '>':
+            if first_entry == False:
+                out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first_char ], seq_len ) )
+            else:
+                first_entry = False
+            fasta_title = line
+            seq_len = 0
+        else:
+            seq_len += len(line)
+
+    # last fasta-entry
+    out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first_char ], seq_len ) )
+    out.close()
+
+if __name__ == "__main__" : __main__()
\ No newline at end of file
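
One subtlety in the slicing above: the stored title still carries the leading ">", which is why keep_first_char is bumped by one when it is non-zero. A small sketch with an invented title::

    fasta_title = ">EYKX4VC02EQLO5 length=108"
    keep_first_char = 14 + 1                 # one extra to compensate for the ">"
    print fasta_title[ 1:keep_first_char ]   # EYKX4VC02EQLO5
    print fasta_title[ 1:None ]              # the whole title when the user enters 0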
diff -r 000000000000 -r 9071e359b9a3 tools/fasta_tools/fasta_compute_length.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fasta_tools/fasta_compute_length.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,51 @@
+<tool id="fasta_compute_length" name="Compute sequence length">
+ <description></description>
+ <command interpreter="python">fasta_compute_length.py $input $output $keep_first</command>
+ <inputs>
+ <param name="input" type="data" format="fasta" label="Compute length for these sequences"/>
+ <param name="keep_first" type="integer" size="5" value="0" label="How many title characters to keep?" help="'0' = keep the whole thing"/>
+ </inputs>
+ <outputs>
+ <data name="output" format="tabular"/>
+ </outputs>
+ <tests>
+ <test>
+ <param name="input" value="454.fasta" />
+ <param name="keep_first" value="0"/>
+ <output name="output" file="fasta_tool_compute_length_1.out" />
+ </test>
+
+ <test>
+ <param name="input" value="extract_genomic_dna_out1.fasta" />
+ <param name="keep_first" value="0"/>
+ <output name="output" file="fasta_tool_compute_length_2.out" />
+ </test>
+
+ <test>
+ <param name="input" value="454.fasta" />
+ <param name="keep_first" value="14"/>
+ <output name="output" file="fasta_tool_compute_length_3.out" />
+ </test>
+ </tests>
+ <help>
+
+**What it does**
+
+This tool counts the length of each FASTA sequence in the file. The output file has two tab-separated columns per line: the FASTA title and the length of the sequence. The option *How many title characters to keep?* lets you keep only a specified number of characters from the beginning of each FASTA title.
+
+-----
+
+**Example**
+
+Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run::
+
+    &gt;EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_
+    TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG
+    TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG
+    &gt;EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_
+    AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAA
+
+Running this tool while setting **How many characters to keep?** to **14** will produce this::
+
+ EYKX4VC02EQLO5  108
+ EYKX4VC02D4GS2  60
+
+
+ </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/fasta_tools/fasta_concatenate_by_species.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fasta_tools/fasta_concatenate_by_species.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+"""
+Takes a Multiple Alignment FASTA file and concatenates 
+sequences for each species, resulting in one aligned
+sequence per species.
+"""
+
+import sys, tempfile
+from galaxy import eggs
+from galaxy.tools.util.maf_utilities import iter_fasta_alignment
+from galaxy.util.odict import odict
+
+def __main__():
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    species = odict()
+    cur_size = 0
+    for components in iter_fasta_alignment( input_filename ):
+        species_not_written = species.keys()
+        for component in components:
+            if component.species not in species:
+                species[component.species] = tempfile.TemporaryFile()
+                species[component.species].write( "-" * cur_size )
+            species[component.species].write( component.text )
+            try:
+                species_not_written.remove( component.species )
+            except ValueError:
+                #this is a new species
+                pass
+        for spec in species_not_written:
+            species[spec].write( "-" * len( components[0].text ) )
+        cur_size += len( components[0].text )
+    out = open( output_filename, 'wb' )
+    for spec, f in species.iteritems():
+        f.seek( 0 )
+        out.write( ">%s\n%s\n" % ( spec, f.read() ) )
+    out.close()
+
+if __name__ == "__main__" : __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/fasta_tools/fasta_concatenate_by_species.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fasta_tools/fasta_concatenate_by_species.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,72 @@
+<tool id="fasta_concatenate0" name="Concatenate" version="0.0.0">
+  <description>FASTA alignment by species</description>
+  <command interpreter="python">fasta_concatenate_by_species.py $input1 $out_file1</command>
+  <inputs>
+    <param name="input1" type="data" format="fasta" label="FASTA alignment"/>
+  </inputs>
+  <outputs>
+    <data name="out_file1" format="fasta"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="cf_maf2fasta.dat" />
+      <output name="out_file1" file="fasta_concatenate_out.fasta" />
+    </test>
+  </tests>
+  <help>
+  
+**What it does**
+  
+This tool attempts to parse FASTA headers to determine the species for each sequence in a multiple FASTA alignment.
+It then linearly concatenates the sequences for each species in the file, creating one sequence per identified species.
+
+-------
+
+**Example**
+
+Starting FASTA::
+  
+  >hg18.chr1(+):10016339-10016341|hg18_0
+  GT
+  >panTro2.chr1(+):10195380-10195382|panTro2_0
+  GT
+  >rheMac2.chr1(+):13119747-13119749|rheMac2_0
+  GT
+  >mm8.chr4(-):148269679-148269681|mm8_0
+  GT
+  >canFam2.chr5(+):66213635-66213637|canFam2_0
+  GT
+  
+  >hg18.chr1(-):100323677-100323679|hg18_1
+  GT
+  >panTro2.chr1(-):101678671-101678673|panTro2_1
+  GT
+  >rheMac2.chr1(-):103154011-103154013|rheMac2_1
+  GT
+  >mm8.chr3(+):116620616-116620618|mm8_1
+  GT
+  >canFam2.chr6(+):52954092-52954094|canFam2_1
+  GT
+  
+
+
+becomes::
+  
+  >hg18
+  GTGT
+  >panTro2
+  GTGT
+  >rheMac2
+  GTGT
+  >mm8
+  GTGT
+  >canFam2
+  GTGT
+
+
+.. class:: warningmark 
+
+This tool will only work properly on files with Galaxy-style FASTA headers.
+
+</help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/fasta_tools/fasta_filter_by_length.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fasta_tools/fasta_filter_by_length.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+"""
+Input: fasta, minimal length, maximal length
+Output: fasta
+Return sequences whose lengths are within the range.
+"""
+
+import sys, os
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def __main__():
+    input_filename = sys.argv[1]
+    try:
+        min_length = int( sys.argv[2] )
+    except:
+        stop_err( "Minimal length of the return sequence requires a numerical value." )
+    try:
+        max_length = int( sys.argv[3] )
+    except:
+        stop_err( "Maximum length of the return sequence requires a numerical value." )
+    output_filename = sys.argv[4]
+    output_handle = open( output_filename, 'w' )
+    tmp_size = 0 #-1
+    tmp_buf = ''
+    at_least_one = 0
+    for line in file(input_filename):
+        if not line or line.startswith('#'):
+            continue
+        if line[0] == '>':
+            if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0):
+                output_handle.write(tmp_buf)
+                at_least_one = 1
+            tmp_buf = line
+            tmp_size = 0                                                       
+        else:
+            if max_length == 0 or tmp_size < max_length:
+                tmp_size += len(line.rstrip('\r\n'))
+                tmp_buf += line
+    # final flush of buffer
+    if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0):
+        output_handle.write(tmp_buf.rstrip('\r\n'))
+        at_least_one = 1
+    output_handle.close()
+    if at_least_one == 0:
+        print "There is no sequence that falls within your range."
+
+if __name__ == "__main__" : __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/fasta_tools/fasta_filter_by_length.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fasta_tools/fasta_filter_by_length.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,56 @@
+<tool id="fasta_filter_by_length" name="Filter sequences by length" version="1.1">
+ <description></description>
+ <command interpreter="python">fasta_filter_by_length.py $input $min_length $max_length $output </command>
+ <inputs>
+ <param name="input" type="data" format="fasta" label="Fasta file"/>
+ <param name="min_length" type="integer" size="15" value="0" label="Minimal length" />
+ <param name="max_length" type="integer" size="15" value="0" label="Maximum length" help="Setting to '0' will return all sequences longer than the 'Minimal length'"/> 
+ </inputs>
+ <outputs>
+ <data name="output" format="fasta"/>
+ </outputs>
+ <tests>
+ <test>
+ <param name="input" value="454.fasta" />
+ <param name="min_length" value="10" />
+ <param name="max_length" value="0" />
+ <output name="output" file="fasta_tool_filter_length_1.out" />
+ </test>
+ </tests>
+ <help>
+
+.. class:: infomark
+
+**TIP**. To return sequences longer than a certain length, set *Minimal length* to the desired value and leave *Maximum length* set to '0'.
+
+-----
+
+**What it does**
+
+Outputs sequences between *Minimal length* and *Maximum length*.
+
+-----
+
+**Example**
+
+Suppose you have the following FASTA formatted sequences::
+
+ &gt;seq1
+ TCATTTAATGAC
+ &gt;seq2
+ ATGGC
+ &gt;seq3
+ TCACATGATGCCG
+ &gt;seq4
+ ATGGAAGC
+
+Setting **Minimal length** to **10** and **Maximum length** to **0** will return all sequences longer than 10 bp::
+
+ &gt;seq1
+ TCATTTAATGAC
+ &gt;seq3
+ TCACATGATGCCG
+
+
+ </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/fasta_tools/fasta_to_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fasta_tools/fasta_to_tabular.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# This code exists in 2 places: ~/datatypes/converters and ~/tools/fasta_tools
+"""
+Input: fasta (input file), tabular (output file), int (truncation of id), int (columns from description)
+Output: tabular
+format convert: fasta to tabular
+"""
+
+import sys, os
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def __main__():
+    if len(sys.argv) != 5:
+        stop_err("Wrong number of argument. Expect four (fasta, tabular, truncation, columns)")
+    infile = sys.argv[1]
+    outfile = sys.argv[2]
+    keep_first = int( sys.argv[3] )
+    descr_split = int( sys.argv[4] )
+    fasta_title = fasta_seq = ''
+    if keep_first == 0:
+        keep_first = None
+    elif descr_split == 1:
+        #Added one for the ">" character
+        #(which is removed if using descr_split > 1)
+        keep_first += 1
+    if descr_split < 1:
+        stop_err("Bad description split value (should be 1 or more)")
+    out = open( outfile, 'w' )
+    i = 0 #guard against an empty input file
+    for i, line in enumerate( open( infile ) ):
+        line = line.rstrip( '\r\n' )
+        if not line or line.startswith( '#' ):
+            continue
+        if line.startswith( '>' ):
+            #Don't want any existing tabs to trigger extra columns:
+            line = line.replace('\t', ' ')
+            if i > 0:
+                out.write('\n')
+            if descr_split == 1:
+                out.write(line[1:keep_first])
+            else:
+                words = line[1:].split(None, descr_split-1)
+                #apply any truncation to first word (the id)
+                words[0] = words[0][0:keep_first]
+                #pad with empty columns if required
+                words += [""]*(descr_split-len(words))
+                out.write("\t".join(words))
+            out.write('\t')
+        else:
+            out.write(line)
+    if i > 0:
+        out.write('\n')
+    out.close()
+
+if __name__ == "__main__" : __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/fasta_tools/fasta_to_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fasta_tools/fasta_to_tabular.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,128 @@
+<tool id="fasta2tab" name="FASTA-to-Tabular" version="1.1.0">
+ <description>converter</description>
+ <command interpreter="python">fasta_to_tabular.py $input $output $keep_first $descr_columns</command>
+ <inputs>
+ <param name="input" type="data" format="fasta" label="Convert these sequences"/>
+ <param name="descr_columns" type="integer" size="2" value="1" label="How many columns to divide title string into?" help="Typically 2 to take the ID (first word) and decription (rest) as two columns, or 1 to give a single column">
+ <validator type="in_range" min="1" />
+ </param>
+ <param name="keep_first" type="integer" size="5" value="0" label="How many title characters to keep?" help="Applies only to the first column taken from the title string ('0' = keep the whole thing), useful when your sequence identifiers are all the same length.">
+ <validator type="in_range" min="0" />
+ </param>
+ </inputs>
+ <outputs>
+ <data name="output" format="tabular"/>
+ </outputs>
+ <tests>
+ <test>
+ <param name="input" value="454.fasta" />
+ <param name="descr_columns" value="1"/>
+ <param name="keep_first" value="0"/>
+ <output name="output" file="fasta_to_tabular_out1.tabular" />
+ </test>
+
+ <test>
+ <param name="input" value="4.fasta" />
+ <param name="descr_columns" value="1"/>
+ <param name="keep_first" value="0"/>
+ <output name="output" file="fasta_to_tabular_out2.tabular" />
+ </test>
+
+ <test>
+ <param name="input" value="454.fasta" />
+ <param name="descr_columns" value="1"/>
+ <param name="keep_first" value="14"/>
+ <output name="output" file="fasta_to_tabular_out3.tabular" />
+ </test>
+
+ <test>
+ <param name="input" value="454.fasta" />
+ <param name="descr_columns" value="2"/>
+ <param name="keep_first" value="0"/>
+ <output name="output" file="fasta_to_tabular_out4.tabular" />
+ </test>
+
+ <test>
+ <param name="input" value="454.fasta" />
+ <param name="descr_columns" value="5"/>
+ <param name="keep_first" value="0"/>
+ <output name="output" file="fasta_to_tabular_out5.tabular" />
+ </test>
+
+ <test>
+ <param name="input" value="454.fasta" />
+ <param name="descr_columns" value="5"/>
+ <param name="keep_first" value="10"/>
+ <output name="output" file="fasta_to_tabular_out6.tabular" />
+ </test>
+
+ </tests>
+ <help>
+
+**What it does**
+
+This tool converts FASTA formatted sequences to TAB-delimited format.
+
+Many tools consider the first word of the FASTA "&gt;" title line to be an identifier, and any remaining text to be a free form description.
+It is therefore useful to split this text into two columns in Galaxy (identifier and any description) by setting **How many columns to divide title string into?** to **2**.
+In some cases the description can be usefully broken up into more columns -- see the examples below.
+
+The option *How many characters to keep?* lets you keep only a specified number of characters from the beginning of each FASTA title.
+With the introduction of the **How many columns to divide title string into?** option this setting is of limited use, but it still allows you to truncate the identifier.
+
+-----
+
+**Example**
+
+Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run::
+
+    &gt;EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_
+    TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG
+    TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG
+    &gt;EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_
+    AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAA
+
+Running this tool with the default settings will produce this (2 column output):
+
+========================================================================== =======================================
+EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG
+EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_  AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA
+========================================================================== =======================================
+
+Having the full title line (the FASTA "&gt;" line text) as a column is not always ideal.
+
+The **How many characters to keep?** option is useful if your identifiers are all the same length.
+In this example the identifier is 14 characters, so setting **How many characters to keep?** to **14** (and leaving **How many columns to divide title string into?** as the default, **1**) will produce this (2 column output):
+
+============== =======================================
+EYKX4VC02EQLO5 TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG
+EYKX4VC02D4GS2 AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA
+============== =======================================
+
+If however your FASTA file has identifiers of variable length, it is better to split the text into at least two columns.
+Running this tool with **How many columns to divide title string into?** set to **2** will produce this (3 column output):
+
+============== =========================================================== =======================================
+EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG
+EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_  AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA
+============== =========================================================== =======================================
+
+Running this tool with **How many columns to divide title string into?** set to **5** will produce this (5 column output):
+
+============== ========== ============ ======== ========================== =======================================
+EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG
+EYKX4VC02D4GS2 length=60  xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA
+============== ========== ============ ======== ========================== =======================================
+
+Running this tool with **How many columns to divide title string into?** set to **5** and **How many characters to keep?** set to **10** will produce this (5 column output).
+Notice that only the first column is truncated to 10 characters -- and be careful not to trim your sequence names too much (generally they should be unique):
+
+========== ========== ============ ======== ========================== =======================================
+EYKX4VC02E length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG
+EYKX4VC02D length=60  xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA
+========== ========== ============ ======== ========================== =======================================
+
+Note the sequences have been truncated for display purposes in the above tables.
+
+ </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fasta_tools/tabular_to_fasta.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fasta_tools/tabular_to_fasta.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+"""
+Input: tabular, title column(s), sequence column
+Output: fasta
+Converts tab-delimited data to FASTA formatted sequences.
+"""
+import sys, os
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def __main__():
+    infile = sys.argv[1]
+    title_col = sys.argv[2]
+    seq_col = sys.argv[3]
+    outfile = sys.argv[4]        
+
+    if title_col == None or title_col == 'None' or seq_col == None or seq_col == 'None':
+        stop_err( "Columns not specified." )
+    try:
+        seq_col = int( seq_col ) - 1
+    except:
+        stop_err( "Invalid Sequence Column: %s." %str( seq_col ) )
+
+    title_col_list = title_col.split( ',' )
+    out = open( outfile, 'w' )
+    skipped_lines = 0
+    first_invalid_line = 0
+    invalid_line = ""
+    i = 0
+    
+    for i, line in enumerate( open( infile ) ):
+        error = False
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ):
+            fields = line.split( '\t' )
+            fasta_title = []
+            for j in title_col_list:
+                try:
+                    j = int( j ) - 1
+                    fasta_title.append( fields[j] )
+                except:
+                    skipped_lines += 1
+                    if not invalid_line:
+                        first_invalid_line = i + 1
+                        invalid_line = line
+                    error = True
+                    break
+            if not error:
+                try:
+                    fasta_seq = fields[seq_col]
+                    if fasta_title[0].startswith( ">" ):
+                        fasta_title[0] = fasta_title[0][1:]
+                    print >> out, ">%s\n%s" % ( "_".join( fasta_title ), fasta_seq )
+                except:
+                    skipped_lines += 1
+                    if not invalid_line:
+                        first_invalid_line = i + 1
+                        invalid_line = line
+    out.close()    
+
+    if skipped_lines > 0:
+        print 'Data issue: skipped %d blank or invalid lines starting at #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+
+if __name__ == "__main__" : __main__()
\ No newline at end of file
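
Traced by hand, the row-to-record mapping above does the following for one line of the Solexa example from the accompanying XML help (columns 3 and 4 as the title, column 5 as the sequence)::

    fields = "5\t300\t902\t419\tGACTCATGATTTCTTACC".split( "\t" )
    title_col_list, seq_col = [ 3, 4 ], 5    # one-based, as the tool receives them
    fasta_title = [ fields[ int( j ) - 1 ] for j in title_col_list ]
    print ">%s\n%s" % ( "_".join( fasta_title ), fields[ seq_col - 1 ] )
    # >902_419
    # GACTCATGATTTCTTACC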
diff -r 000000000000 -r 9071e359b9a3 tools/fasta_tools/tabular_to_fasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fasta_tools/tabular_to_fasta.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,43 @@
+<tool id="tab2fasta" name="Tabular-to-FASTA" version="1.1.0">
+ <description>converts tabular file to FASTA format</description>
+ <command interpreter="python">tabular_to_fasta.py $input $title_col $seq_col $output </command>
+ <inputs>
+ <param name="input" type="data" format="tabular" label="Tab-delimited file"/>
+ <param name="title_col" type="data_column" data_ref="input" multiple="True" numerical="False" label="Title column(s)" help="Multi-select list - hold the appropriate key while clicking to select multiple columns"/>
+ <param name="seq_col" type="data_column" data_ref="input" numerical="False" label="Sequence column" />
+ </inputs>
+ <outputs>
+ <data name="output" format="fasta"/>
+ </outputs>
+ <tests>
+ <test>
+ <param name="input" value="solexa.tabular" />
+ <param name="title_col" value="1,2,3,4" />
+ <param name="seq_col" value="5" />
+ <output name="output" file="tabular_to_fasta_out1.fasta" />
+ </test>
+ </tests>
+ <help>
+
+**What it does**
+
+Converts tab-delimited data into FASTA formatted sequences.
+
+-----------
+
+**Example**
+
+Suppose this is a sequence file produced by an Illumina (Solexa) sequencer::
+
+ 5 300 902 419 GACTCATGATTTCTTACCTATTAGTGGTTGAACATC
+ 5 300 880 431 GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT
+
+Selecting **c3** and **c4** as the **Title column(s)** and **c5** as the **Sequence column** will result in::
+
+ &gt;902_419
+ GACTCATGATTTCTTACCTATTAGTGGTTGAACATC
+ &gt;880_431
+ GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT
+
+ </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_combiner.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_combiner.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,49 @@
+#Dan Blankenberg
+import sys, os, shutil
+from galaxy_utils.sequence.fastq import fastqWriter, fastqSequencingRead, fastqCombiner, fastqFakeFastaScoreReader
+from galaxy_utils.sequence.fasta import fastaReader, fastaNamedReader
+
+def main():
+    #Read command line arguments
+    fasta_filename = sys.argv[1]
+    fasta_type = sys.argv[2] or 'fasta' #should always be fasta or csfasta? what if txt?
+    qual_filename = sys.argv[3]
+    qual_type = sys.argv[4] or 'qualsanger' #qual454 qualsolid
+    output_filename = sys.argv[5]
+    force_quality_encoding = sys.argv[6]
+    if force_quality_encoding == 'None':
+        force_quality_encoding = None
+    
+    format = 'sanger'
+    if fasta_type == 'csfasta' or qual_type == 'qualsolid':
+        format = 'cssanger'
+    elif qual_type == 'qualsolexa':
+        format = 'solexa'
+    elif qual_type == 'qualillumina':
+        format = 'illumina'
+    
+    out = fastqWriter( open( output_filename, 'wb' ), format = format, force_quality_encoding = force_quality_encoding )
+    if qual_filename == 'None':
+        qual_input = fastqFakeFastaScoreReader( format, quality_encoding = force_quality_encoding )
+    else:
+        qual_input = fastaNamedReader( open( qual_filename, 'rb' )  )
+    
+    fastq_combiner = fastqCombiner( format )
+    i = None
+    skip_count = 0
+    for i, sequence in enumerate( fastaReader( open( fasta_filename, 'rb' ) ) ):
+        quality = qual_input.get( sequence )
+        if quality:
+            fastq_read = fastq_combiner.combine( sequence, quality )
+            out.write( fastq_read )
+        else:
+            skip_count += 1
+    out.close()
+    if i is None:
+        print "Your file contains no valid FASTA sequences."
+    else:
+        print qual_input.has_data()
+        print 'Combined %s of %s sequences with quality scores (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 )
+
+if __name__ == "__main__":
+    main()
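
The if/elif chain above simply maps the quality file's datatype to the output FASTQ variant (a csfasta input likewise forces cssanger). The same decision expressed as a lookup table::

    format_by_qual_type = { "qualsolid": "cssanger",
                            "qualsolexa": "solexa",
                            "qualillumina": "illumina" }
    print format_by_qual_type.get( "qual454", "sanger" )   # sanger - the default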
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_combiner.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_combiner.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,77 @@
+<tool id="fastq_combiner" name="Combine FASTA and QUAL" version="1.0.1">
+  <description>into FASTQ</description>
+  <command interpreter="python">fastq_combiner.py '$fasta_file' '${fasta_file.extension}' '$qual_file' '${qual_file.extension}' '$output_file' '$force_quality_encoding'</command>
+  <inputs>
+    <param name="fasta_file" type="data" format="fasta,csfasta" label="FASTA File" />
+    <param name="qual_file" type="data" format="qual" label="Quality Score File" optional="True" />
+    <param name="force_quality_encoding" type="select" label="Force Quality Score encoding">
+      <option value="None">Use Source Encoding</option>
+      <option value="ascii" selected="True">ASCII</option>
+      <option value="decimal">Decimal</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data name="output_file" format="fastqsanger">
+      <change_format>
+        <when input_dataset="fasta_file" attribute="extension" value="csfasta" format="fastqcssanger" />
+        <when input_dataset="qual_file" attribute="extension" value="qualsolid" format="fastqcssanger" />
+        <when input_dataset="qual_file" attribute="extension" value="qualsolexa" format="fastqsolexa" />
+        <when input_dataset="qual_file" attribute="extension" value="qualillumina" format="fastqillumina" />
+      </change_format>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="fasta_file" value="s2fq_phiX.csfasta" ftype="csfasta" />
+      <param name="qual_file" value="s2fq_phiX.qualsolid" ftype="qualsolid" />
+      <param name="force_quality_encoding" value="None" />
+      <output name="output_file" file="combine_phiX_out_1.fastqcssanger" />
+    </test>
+    <test>
+      <param name="fasta_file" value="s2fq_phiX.csfasta" ftype="csfasta" />
+      <param name="qual_file" value="s2fq_phiX.qualsolid" ftype="qualsolid" />
+      <param name="force_quality_encoding" value="ascii" />
+      <output name="output_file" file="combine_phiX_out_2.fastqcssanger" />
+    </test>
+    <test>
+      <param name="fasta_file" value="fastq_combiner_in_1.fasta" ftype="fasta" />
+      <param name="qual_file" value="fastq_combiner_in_1.qual454" ftype="qual454" />
+      <param name="force_quality_encoding" value="None" />
+      <output name="output_file" file="wrapping_as_sanger.fastqsanger" />
+    </test>
+    <test>
+      <param name="fasta_file" value="fastq_combiner_in_1.fasta" ftype="fasta" />
+      <param name="qual_file" value="fastq_combiner_in_1.qual454" ftype="qual454" />
+      <param name="force_quality_encoding" value="decimal" />
+      <output name="output_file" file="wrapping_as_sanger_decimal.fastqsanger" />
+    </test>
+    <test>
+      <param name="fasta_file" value="fastq_combiner_in_1.fasta" ftype="fasta" />
+      <param name="qual_file" />
+      <param name="force_quality_encoding" value="decimal" />
+      <output name="output_file" file="fastq_combiner_no_qual_decimal_out_1.fastqsanger" />
+    </test>
+    <test>
+      <param name="fasta_file" value="s2fq_phiX.csfasta" ftype="csfasta" />
+      <param name="qual_file" />
+      <param name="force_quality_encoding" value="ascii" />
+      <output name="output_file" file="fastq_combiner_no_qual_ascii_out_1.fastqcssanger" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool joins a FASTA file to a Quality Score file, creating a single FASTQ block for each read.
+
+Specifying a set of quality scores is optional; when not provided, the output will be fastqsanger or fastqcssanger (when a csfasta is provided) with each quality score being the maximal allowed value (93).
+
+Use this tool, for example, to convert 454-type output to FASTQ.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_filter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,34 @@
+#Dan Blankenberg
+import sys, os, shutil
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
+
+def main():
+    #Read command line arguments
+    input_filename = sys.argv[1]
+    script_filename = sys.argv[2]
+    output_filename = sys.argv[3]
+    additional_files_path = sys.argv[4]
+    input_type = sys.argv[5] or 'sanger'
+    
+    #Save script file for debugging/verification info later
+    os.mkdir( additional_files_path )
+    shutil.copy( script_filename, os.path.join( additional_files_path, 'debug.txt' ) )
+    
+    out = fastqWriter( open( output_filename, 'wb' ), format = input_type )
+    
+    i = None
+    reads_kept = 0
+    for i, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
+        local = {'fastq_read':fastq_read, 'ret_val':False}
+        execfile( script_filename, {}, local )
+        if local['ret_val']:
+            out.write( fastq_read )
+            reads_kept += 1
+    out.close()
+    if i is None:
+        print "Your file contains no valid fastq reads."
+    else:
+        print 'Kept %s of %s reads (%.2f%%).' % ( reads_kept, i + 1, float( reads_kept ) / float( i + 1 ) * 100.0 )
+
+if __name__ == "__main__":
+    main()
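
The execfile() call above defines the contract for $fastq_filter_file, which Galaxy generates from the tool form (its contents are not shown in this changeset): the script sees a fastq_read variable and must set ret_val. A hypothetical hand-written filter script, assuming the read object exposes a sequence attribute and a get_decimal_quality_scores() method as in galaxy_utils.sequence.fastq::

    # Keep reads at least 20 bases long whose worst base quality is >= 10.
    ret_val = ( len( fastq_read.sequence ) >= 20
                and min( fastq_read.get_decimal_quality_scores() ) >= 10 )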
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_filter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,318 @@
+<tool id="fastq_filter" name="Filter FASTQ" version="1.0.0">
+  <description>reads by quality score and length</description>
+  <command interpreter="python">fastq_filter.py $input_file $fastq_filter_file $output_file $output_file.files_path '${input_file.extension[len( 'fastq' ):]}'</command>
+  <inputs>
+    <page>
+      <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ File" help="Requires groomed data: if your data does not appear here try using the FASTQ groomer."/>
+      <param name="min_size" label="Minimum Size" value="0" type="integer">
+        <validator type="in_range" message="Minimum size must be positive" min="0"/>
+      </param>
+      <param name="max_size" label="Maximum Size" value="0" type="integer" help="A maximum size less than 1 indicates no limit."/>
+      <param name="min_quality" label="Minimum Quality" value="0" type="float"/>
+      <param name="max_quality" label="Maximum Quality" value="0" type="float" help="A maximum quality less than 1 indicates no limit."/>
+      <param name="max_num_deviants" label="Maximum number of bases allowed outside of quality range" value="0" type="integer">
+        <validator type="in_range" message="Maximum number of deviant bases must be positive" min="0"/>
+      </param>
+      <param name="paired_end" label="This is paired end data" type="boolean" truevalue="paired_end" falsevalue="single_end" checked="False"/>
+      <repeat name="fastq_filters" title="Quality Filter on a Range of Bases" help="The above settings do not apply to these filters.">
+        <conditional name="offset_type">
+          <param name="base_offset_type" type="select" label="Define Base Offsets as" help="Use Absolute for fixed length reads (Illumina, SOLiD)&lt;br&gt;Use Percentage for variable length reads (Roche/454)">
+            <option value="offsets_absolute" selected="true">Absolute Values</option>
+            <option value="offsets_percent">Percentage of Read Length</option>
+          </param>
+          <when value="offsets_absolute">
+            <param name="left_column_offset" label="Offset from 5' end" value="0" type="integer" help="Values start at 0, increasing from the left">
+              <validator type="in_range" message="Base Offsets must be positive" min="0" max="inf"/>
+              <validator type="expression" message="An integer is required.">int( float( value ) ) == float( value )</validator>
+            </param>
+            <param name="right_column_offset" label="Offset from 3' end" value="0" type="integer" help="Values start at 0, increasing from the right">
+              <validator type="in_range" message="Base Offsets must be positive" min="0" max="inf"/>
+              <validator type="expression" message="An integer is required.">int( float( value ) ) == float( value )</validator>
+            </param>
+          </when>
+          <when value="offsets_percent">
+            <param name="left_column_offset" label="Offset from 5' end" value="0" type="float">
+              <validator type="in_range" message="Base Offsets must be between 0 and 100" min="0" max="100"/>
+            </param>
+            <param name="right_column_offset" label="Offset from 3' end" value="0" type="float">
+              <validator type="in_range" message="Base Offsets must be between 0 and 100" min="0" max="100"/>
+            </param>
+          </when>
+        </conditional>
+        <param name="score_operation" type="select" label="Aggregate read score for specified range">
+          <option value="min" selected="True">min score</option>
+          <option value="max">max score</option>
+          <option value="sum">sum of scores</option>
+          <option value="mean">mean of scores</option>
+        </param>
+        <param name="score_comparison" type="select" label="Keep read when aggregate score is">
+          <option value="&gt;">&gt;</option>
+          <option value="&gt;=" selected="true">&gt;=</o
[...]
+ame="right_column_offset" value="1"/>
+      <param name="score_operation" value="max"/>
+      <param name="score_comparison" value="&lt;="/>
+      <param name="score" value="92"/>
+      <output name="out_file1" file="sanger_full_range_original_sanger.fastqsanger"/>
+    </test>
+    <!-- percent based offsets -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger"/>
+      <param name="min_size" value="0"/>
+      <param name="max_size" value="0"/>
+      <param name="min_quality" value="0"/>
+      <param name="max_quality" value="0"/>
+      <param name="max_num_deviants" value="0"/>
+      <param name="paired_end" value="single_end"/>
+      <param name="base_offset_type" value="offsets_percent"/>
+      <param name="left_column_offset" value="1.075"/>
+      <param name="right_column_offset" value="1.075"/>
+      <param name="score_operation" value="min"/>
+      <param name="score_comparison" value="&gt;="/>
+      <param name="score" value="1"/>
+      <output name="out_file1" file="sanger_full_range_original_sanger.fastqsanger"/>
+    </test>
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger"/>
+      <param name="min_size" value="0"/>
+      <param name="max_size" value="0"/>
+      <param name="min_quality" value="0"/>
+      <param name="max_quality" value="0"/>
+      <param name="max_num_deviants" value="0"/>
+      <param name="paired_end" value="single_end"/>
+      <param name="base_offset_type" value="offsets_percent"/>
+      <param name="left_column_offset" value="1"/>
+      <param name="right_column_offset" value="1"/>
+      <param name="score_operation" value="min"/>
+      <param name="score_comparison" value="&gt;="/>
+      <param name="score" value="1"/>
+      <output name="out_file1" file="empty_file.dat"/>
+    </test>
+  </tests>
+<help>
+This tool allows you to build complex filters to be applied to each read in a FASTQ file.
+
+**Basic Options:**
+    * You can specify minimum and maximum read lengths.
+    * You can specify minimum and maximum per-base quality scores, optionally specifying the number of bases that are allowed to deviate from this range (default of 0 deviant bases).
+    * If your data is paired-end, select the proper checkbox; this will cause each read to be internally split down the middle and filters applied to each half using the offsets specified.
+
+**Advanced Options:**
+    * You can specify any number of advanced filters.
+    * 5' and 3' offsets are defined, starting at zero, increasing from the respective end of the reads. For example, a quality string of "ABCDEFG", with 5' and 3' offsets of 1 and 1, respectively, will yield "BCDEF".
+    * You can specify either absolute or percentage offset values. *Absolute Values* based offsets are useful for fixed length reads (e.g. Illumina or SOLiD data). *Percentage of Read Length* based offsets are useful for variable length reads (e.g. 454 data). When using the percent-based method, offsets are rounded to the nearest integer.
+    * You specify the aggregating action (min, max, sum, mean) to perform on the quality score values found between the specified offsets, to be used with your comparison operation and comparison value.
+    * If a set of offsets is specified that causes the remaining quality score list to be of length zero, then the read will **pass** the quality filter unless the size range filter is used to remove these reads.
+
+-----
+
+.. class:: warningmark
+
+Adapter bases in color space reads are excluded from filtering.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_groomer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_groomer.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+#Dan Blankenberg
+import sys
+from galaxy_utils.sequence.fastq import fastqReader, fastqVerboseErrorReader, fastqAggregator, fastqWriter
+
+def main():
+    input_filename = sys.argv[1]
+    input_type = sys.argv[2]
+    output_filename = sys.argv[3]
+    output_type = sys.argv[4]
+    force_quality_encoding = sys.argv[5]
+    summarize_input = sys.argv[6] == 'summarize_input'
+    if force_quality_encoding == 'None':
+        force_quality_encoding = None
+    
+    aggregator = fastqAggregator()
+    out = fastqWriter( open( output_filename, 'wb' ), format = output_type, force_quality_encoding = force_quality_encoding )
+    read_count = None
+    if summarize_input:
+        reader = fastqVerboseErrorReader
+    else:
+        reader = fastqReader
+    for read_count, fastq_read in enumerate( reader( open( input_filename ), format = input_type, apply_galaxy_conventions = True ) ):
+        if summarize_input:
+            aggregator.consume_read( fastq_read )
+        out.write( fastq_read )
+    out.close()
+    
+    if read_count is not None:
+        print "Groomed %i %s reads into %s reads." % ( read_count + 1, input_type, output_type )
+        if input_type != output_type and 'solexa' in [ input_type, output_type ]:
+            print "Converted between Solexa and PHRED scores."
+        if summarize_input:
+            print "Based upon quality and sequence, the input data is valid for: %s" % ( ", ".join( aggregator.get_valid_formats() )  or "None" )
+            ascii_range = aggregator.get_ascii_range()
+            decimal_range =  aggregator.get_decimal_range()
+            print "Input ASCII range: %s(%i) - %s(%i)" % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) ) #print using repr, since \x00 (null) causes info truncation in galaxy when printed
+            print "Input decimal range: %i - %i" % ( decimal_range[0], decimal_range[1] )        
+    else:
+        print "No valid FASTQ reads were provided."
+
+
+if __name__ == "__main__": main()
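A hypothetical invocation of the script above, matching the six positional arguments it parses (file names are examples only)::

    python fastq_groomer.py input.fastqsolexa solexa output.fastqsanger sanger None summarize_input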
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_groomer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_groomer.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,371 @@
+<tool id="fastq_groomer" name="FASTQ Groomer" version="1.0.4">
+  <description>convert between various FASTQ quality formats</description>
+  <command interpreter="python">fastq_groomer.py '$input_file' '$input_type' '$output_file'
+#if str( $options_type['options_type_selector'] ) == 'basic':
+#if str( $input_type ) == 'cssanger':
+'cssanger'
+#else:
+'sanger'
+#end if
+'ascii' 'summarize_input'
+#else:
+'${options_type.output_type}' '${options_type.force_quality_encoding}' '${options_type.summarize_input}'
+#end if
+</command>
+  <inputs>
+    <param name="input_file" type="data" format="fastq" label="File to groom" />
+    <param name="input_type" type="select" label="Input FASTQ quality scores type">
+      <option value="solexa">Solexa</option>
+      <option value="illumina">Illumina 1.3+</option>
+      <option value="sanger" selected="True">Sanger</option>
+      <option value="cssanger">Color Space Sanger</option>
+    </param>
+    <conditional name="options_type">
+    <param name="options_type_selector" type="select" label="Advanced Options">
+      <option value="basic" selected="True">Hide Advanced Options</option>
+      <option value="advanced">Show Advanced Options</option>
+    </param>
+    <when value="basic">
+      <!-- no options -->
+    </when>
+    <when value="advanced">
+      <param name="output_type" type="select" label="Output FASTQ quality scores type" help="Galaxy tools are designed to work with the Sanger Quality score format.">
+        <option value="solexa">Solexa</option>
+        <option value="illumina">Illumina 1.3+</option>
+        <option value="sanger" selected="True">Sanger (recommended)</option>
+        <option value="cssanger">Color Space Sanger</option>
+      </param>
+      <param name="force_quality_encoding" type="select" label="Force Quality Score encoding">
+        <option value="None">Use Source Encoding</option>
+        <option value="ascii" selected="True">ASCII</option>
+        <option value="decimal">Decimal</option>
+      </param>
+      <param name="summarize_input" type="select" label="Summarize input data">
+        <option value="summarize_input" selected="True">Summarize Input</option>
+        <option value="dont_summarize_input">Do not Summarize Input (faster)</option>
+      </param>
+    </when>
+  </conditional>
+  </inputs>
+  <outputs>
+    <data name="output_file" format="fastqsanger">
+      <change_format>
+        <when input="input_type" value="cssanger" format="fastqcssanger" />
+        <when input="options_type.output_type" value="solexa" format="fastqsolexa" />
+        <when input="options_type.output_type" value="illumina" format="fastqillumina" />
+        <when input="options_type.output_type" value="sanger" format="fastqsanger" />
+        <when input="options_type.output_type" value="cssanger" format="fastqcssanger" />
+      </change_format>
+    </data>
+  </outputs>
+  <tests>
+    <!-- These tests include test files adapted from supplemental material in Cock PJ, Fields CJ, Goto N, Heuer ML, Rice PM. The Sanger FASTQ file format for sequences with quality scores, and the Solexa/Illumina FASTQ variants. Nucleic Acids Res. 2009 Dec 16. -->
+    <!-- Unfortunately, cannot test for expected failures -->
+    <!-- Test basic options -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastq" />
+      <param name="input_type" value="sanger" />
+      <param name="options_type_selector" value="basic" />
+      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
+    </test>
+    <test>
+      <param name="input_file" value="sanger_full_range_as_cssanger.fastqcssanger" ftype="fastq" />
+      <param name="input_type" value="cssanger" />
+      <param name="options_type_selector" value="basic" />
+      <output name="output_file" file="sanger_full_range_as_cssanger.fastqcssanger" />
+    </test>
+    <test>
+      <param name="input_fi
[...]
put_type" value="sanger" />
+      <param name="force_quality_encoding" value="ascii" />
+      <param name="summarize_input" value="summarize_input" />
+      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
+    </test>
+    <!-- Solexa, range -5 - 62 -->
+    <test>
+      <param name="input_file" value="solexa_full_range_as_decimal_solexa.fastqsolexa" ftype="fastq" />
+      <param name="input_type" value="solexa" />
+      <param name="options_type_selector" value="advanced" />
+      <param name="output_type" value="solexa" />
+      <param name="force_quality_encoding" value="ascii" />
+      <param name="summarize_input" value="summarize_input" />
+      <output name="output_file" file="solexa_full_range_original_solexa.fastqsolexa" />
+    </test>
+    <test>
+      <param name="input_file" value="solexa_full_range_original_solexa.fastqsolexa" ftype="fastq" />
+      <param name="input_type" value="solexa" />
+      <param name="options_type_selector" value="advanced" />
+      <param name="output_type" value="solexa" />
+      <param name="force_quality_encoding" value="decimal" />
+      <param name="summarize_input" value="summarize_input" />
+      <output name="output_file" file="solexa_full_range_as_decimal_solexa.fastqsolexa" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool offers several conversion options relating to the FASTQ format.
+
+When using *Basic* options, the output will be *sanger* formatted or *cssanger* formatted (when the input is Color Space Sanger).
+
+When converting, if a quality score falls outside of the target score range, it will be coerced to the closest available value (i.e. the minimum or maximum).
+
+When converting between Solexa and the other formats, quality scores are mapped between the Solexa and PHRED scales using the equations found in `Cock PJ, Fields CJ, Goto N, Heuer ML, Rice PM. The Sanger FASTQ file format for sequences with quality scores, and the Solexa/Illumina FASTQ variants. Nucleic Acids Res. 2009 Dec 16.`_
+
+When converting between color space (csSanger) and base/sequence space (Sanger, Illumina, Solexa) formats, adapter bases are lost or gained; if gained, the base 'G' is used as the adapter. You cannot convert a color space read to base space if there is no adapter present in the color space sequence. Any masked or ambiguous nucleotides in base space will be converted to 'N's when determining color space encoding.
+
+-----
+
+**Quality Score Comparison**
+
+::
+
+    SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
+    ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+    ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+    !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+    |                         |    |        |                              |                     |
+   33                        59   64       73                            104                   126
+
+   S - Sanger       Phred+33,  93 values  (0, 93) (0 to 60 expected in raw reads)
+   I - Illumina 1.3 Phred+64,  62 values  (0, 62) (0 to 40 expected in raw reads)
+   X - Solexa       Solexa+64, 67 values (-5, 62) (-5 to 40 expected in raw reads)
+
+Diagram adapted from http://en.wikipedia.org/wiki/FASTQ_format
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+.. _Cock PJ, Fields CJ, Goto N, Heuer ML, Rice PM. The Sanger FASTQ file format for sequences with quality scores, and the Solexa/Illumina FASTQ variants. Nucleic Acids Res. 2009 Dec 16.: http://www.ncbi.nlm.nih.gov/pubmed/20015970
+
+  </help>
+</tool>
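The Solexa/PHRED interconversion referenced in the groomer help above follows the equations of Cock et al. (2009); a minimal sketch (function names are ours; the tool itself relies on galaxy_utils)::

    import math

    def solexa_to_phred( q_solexa ):
        # Q_PHRED = 10 * log10( 10**( Q_Solexa / 10.0 ) + 1 )
        return 10.0 * math.log10( 10 ** ( q_solexa / 10.0 ) + 1 )

    def phred_to_solexa( q_phred ):
        # inverse mapping; note it is undefined at Q_PHRED == 0
        return 10.0 * math.log10( 10 ** ( q_phred / 10.0 ) - 1 )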
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_manipulation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_manipulation.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,37 @@
+#Dan Blankenberg
+import sys, os, shutil
+import imp
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
+
+def main():
+    #Read command line arguments
+    input_filename = sys.argv[1]
+    script_filename = sys.argv[2]
+    output_filename = sys.argv[3]
+    additional_files_path = sys.argv[4]
+    input_type = sys.argv[5] or 'sanger'
+    
+    #Save script file for debuging/verification info later
+    os.mkdir( additional_files_path )
+    shutil.copy( script_filename, os.path.join( additional_files_path, 'debug.txt' ) )
+    
+    fastq_manipulator = imp.load_module( 'fastq_manipulator', open( script_filename ), script_filename, ( '', 'r', imp.PY_SOURCE ) )
+    
+    out = fastqWriter( open( output_filename, 'wb' ), format = input_type )
+    
+    i = None
+    reads_manipulated = 0
+    for i, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
+        new_read = fastq_manipulator.match_and_manipulate_read( fastq_read )
+        if new_read:
+            out.write( new_read )
+        if new_read != fastq_read:
+            reads_manipulated += 1
+    out.close()
+    if i is None:
+        print "Your file contains no valid FASTQ reads."
+    else:
+        print 'Manipulated %s of %s reads (%.2f%%).' % ( reads_manipulated, i + 1, float( reads_manipulated ) / float( i + 1 ) * 100.0 )
+
+if __name__ == "__main__":
+    main()
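The dynamically generated module loaded above only has to expose a match_and_manipulate_read( read ) callable; a hypothetical minimal example (the real module is emitted by the companion Manipulate FASTQ XML wrapper)::

    import re

    def match_and_manipulate_read( fastq_read ):
        # return the read (possibly modified); a read returned unmodified is
        # written through as-is, and a false return value drops the read
        if re.search( '^@FAKE', fastq_read.identifier ):
            fastq_read.identifier = fastq_read.identifier.replace( 'FAKE', 'REAL' )
        return fastq_read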
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_manipulation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_manipulation.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,429 @@
+<tool id="fastq_manipulation" name="Manipulate FASTQ" version="1.0.1">
+  <options sanitize="False" /> <!-- This tool uses a file to relay all parameter information (actually a dynamically generated python module), so we can safely not sanitize any parameters -->
+  <description>reads on various attributes</description>
+  <command interpreter="python">fastq_manipulation.py $input_file $fastq_manipulation_file $output_file $output_file.files_path '${input_file.extension[len( 'fastq' ):]}'</command>
+  <inputs>
+    <!-- This tool is purposely over-engineered (e.g. single option conditionals) to allow easy enhancement with workflow/rerun compatibility -->
+    <page>
+      <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ File" help="Requires groomed data: if your data does not appear here try using the FASTQ groomer."/>
+      <!-- Match Reads -->
+      <repeat name="match_blocks" title="Match Reads">
+        <conditional name="match_type">
+          <param name="match_type_selector" type="select" label="Match Reads by">
+            <option value="identifier">Name/Identifier</option>
+            <option value="sequence">Sequence Content</option>
+            <option value="quality">Quality Score Content</option>
+          </param>
+          <when value="identifier">
+            <conditional name="match">
+              <param name="match_selector" type="select" label="Identifier Match Type">
+                <option value="regex">Regular Expression</option>
+              </param>
+              <when value="regex">
+                <param type="text" name="match_by" label="Match by" value=".*" />
+              </when>
+            </conditional>
+          </when>
+          <when value="sequence">
+            <conditional name="match">
+              <param name="match_selector" type="select" label="Sequence Match Type">
+                <option value="regex">Regular Expression</option>
+              </param>
+              <when value="regex">
+                <param type="text" name="match_by" label="Match by" value=".*" />
+              </when>
+            </conditional>
+          </when>
+          <when value="quality">
+            <conditional name="match">
+              <param name="match_selector" type="select" label="Quality Match Type">
+                <option value="regex">Regular Expression</option>
+              </param>
+              <when value="regex">
+                <param type="text" name="match_by" label="Match by" value=".*" />
+              </when>
+            </conditional>
+          </when>
+        </conditional>
+      </repeat>
+      <!-- Manipulate Matched Reads -->
+      <repeat name="manipulate_blocks" title="Manipulate Reads">
+        <conditional name="manipulation_type">
+          <param name="manipulation_type_selector" type="select" label="Manipulate Reads on">
+            <option value="identifier">Name/Identifier</option>
+            <option value="sequence">Sequence Content</option>
+            <option value="quality">Quality Score Content</option>
+            <option value="miscellaneous">Miscellaneous Actions</option>
+          </param>
+          <when value="identifier">
+            <conditional name="manipulation">
+              <param name="manipulation_selector" type="select" label="Identifier Manipulation Type">
+                <option value="translate">String Translate</option>
+              </param>
+              <when value="translate">
+                <param name="from" type="text" label="From" value="" />
+                <param name="to" type="text" label="To" value="" />
+              </when>
+            </conditional>
+          </when>
+          <when value="sequence">
+            <conditional name="manipulation">
+              <param name="manipulation_selector" type="select" label="Sequence Manipulation Type">
+                <option value="rev_comp">Reverse Complement</opti
[...]
="fastqsanger" />
+      <param name="match_type_selector" value="identifier" />
+      <param name="match_selector" value="regex" />
+      <param name="match_by" value="FAKE0001" />
+      <param name="manipulation_type_selector" value="sequence" />
+      <param name="manipulation_selector" value="rev_comp" />
+      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
+    </test>
+    <!-- match all and DNA to RNA -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="match_type_selector" value="identifier" />
+      <param name="match_selector" value="regex" />
+      <param name="match_by" value=".*" />
+      <param name="manipulation_type_selector" value="sequence" />
+      <param name="manipulation_selector" value="dna_to_rna" />
+      <output name="output_file" file="sanger_full_range_as_rna.fastqsanger" />
+    </test>
+    <!-- match all and RNA to DNA -->
+    <test>
+      <param name="input_file" value="sanger_full_range_as_rna.fastqsanger" ftype="fastqsanger" />
+      <param name="match_type_selector" value="identifier" />
+      <param name="match_selector" value="regex" />
+      <param name="match_by" value=".*" />
+      <param name="manipulation_type_selector" value="sequence" />
+      <param name="manipulation_selector" value="rna_to_dna" />
+      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
+    </test>
+  </tests>
+<help>
+This tool allows you to build complex manipulations to be applied to each matching read in a FASTQ file. A read must match all matching directives in order for it to be manipulated; if a read does not match, it is output unmodified. All matching reads will have each of the specified manipulations performed upon them, in the order specified.
+
+Regular expression matches are made using re.search; see http://docs.python.org/library/re.html for more information. All matching is performed on a single-line string, regardless of whether e.g. the sequence or quality score spans multiple lines in the original file.
+
+String translations are performed using string.translate; see http://docs.python.org/library/string.html#string.translate and http://docs.python.org/library/string.html#string.maketrans for more information.
+
+.. class:: warningmark
+
+Only color space reads can have adapter bases substituted.
+
+
+-----
+
+**Example**
+
+Suppose you have a color space Sanger formatted sequence (fastqcssanger) and you want to double-encode the color space into pseudo-nucleotide space (this is different from converting) to allow these reads to be used in tools which do not natively support it (using specially designed indexes). This tool can handle this manipulation; however, this is generally not recommended, as results tend to be poorer than those produced by tools which are specially designed to handle color space data.
+
+Steps:
+
+1. Click **Add new Match Reads** and leave the matching options set to the default (matching by sequence name/identifier using the regular expression ".*", thereby matching all reads).
+2. Click **Add new Manipulate Reads**, change **Manipulate Reads on** to "Sequence Content", set **Sequence Manipulation Type** to "Change Adapter Base" and set **New Adapter** to "" (an empty text field).
+3. Click **Add new Manipulate Reads**, change **Manipulate Reads on** to "Sequence Content", set **Sequence Manipulation Type** to "String Translate" and set **From** to "0123." and **To** to "ACGTN".
+4. Click Execute. The new history item will contain double-encoded pseudo-nucleotide space reads.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+</help>
+</tool>
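Step 3 of the example above relies on Python's string.translate; the double-encoding translation it configures behaves like this (standalone Python 2 sketch)::

    import string

    trans_table = string.maketrans( '0123.', 'ACGTN' )
    print '0132.20'.translate( trans_table ) # prints ACTGNGA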
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_masker_by_quality.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_masker_by_quality.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,83 @@
+#Dan Blankenberg
+import string
+from optparse import OptionParser
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
+
+
+def get_score_comparer( operator ):
+    if operator == 'gt':
+        return compare_gt
+    elif operator == 'ge':
+        return compare_ge
+    elif operator == 'eq':
+        return compare_eq
+    elif operator == 'lt':
+        return compare_lt
+    elif operator == 'le':
+        return compare_le
+    elif operator == 'ne':
+        return compare_ne
+    raise Exception( 'Invalid operator provided: %s' % operator )
+
+def compare_gt( quality_score, threshold_value ):
+    return quality_score > threshold_value
+
+def compare_ge( quality_score, threshold_value ):
+    return quality_score >= threshold_value
+
+def compare_eq( quality_score, threshold_value ):
+    return quality_score == threshold_value
+
+def compare_ne( quality_score, threshold_value ):
+    return quality_score != threshold_value
+
+def compare_lt( quality_score, threshold_value ):
+    return quality_score < threshold_value
+
+def compare_le( quality_score, threshold_value ):
+    return quality_score <= threshold_value
+
+class BaseReplacer( object ):
+    def __init__( self, replace_character ):
+        self.replace_character = replace_character
+    def __call__( self, base_character ):
+        return self.replace_character
+
+def main():
+    usage = "usage: %prog [options] input_file output_file"
+    parser = OptionParser( usage=usage )
+    parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'solexa', 'illumina' ), help='FASTQ variant type' )
+    parser.add_option( '-m', '--mask_character', dest='mask_character', default='N', help='Mask Character to use' )
+    parser.add_option( '-c', '--score_comparison', type="choice", dest='score_comparison', default='le', choices=('gt','ge','eq','lt', 'le', 'ne' ), help='Mask base when score is' )
+    parser.add_option( '-s', '--quality_score', type="float", dest='quality_score', default=0.0, help='Quality Score' )
+    parser.add_option( "-l", "--lowercase", action="store_true", dest="lowercase", default=False, help="Use lowercase masking")
+    ( options, args ) = parser.parse_args()
+    
+    if len ( args ) != 2:
+        parser.error( "Need to specify an input file and an output file" )
+    
+    score_comparer = get_score_comparer( options.score_comparison )
+    
+    if options.lowercase:
+        base_masker = string.lower
+    else:
+        base_masker = BaseReplacer( options.mask_character )
+    
+    out = fastqWriter( open( args[1], 'wb' ), format = options.format )
+    
+    num_reads = None
+    num_reads_excluded = 0
+    for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ):
+        sequence_list = list( fastq_read.sequence )
+        for i, quality_score in enumerate( fastq_read.get_decimal_quality_scores() ):
+            if score_comparer( quality_score, options.quality_score ):
+                sequence_list[ i ] = base_masker( sequence_list[ i ] )
+        fastq_read.sequence = "".join( sequence_list )
+        out.write( fastq_read )
+    
+    if num_reads is not None:
+        print "Processed %i %s reads." % ( num_reads + 1, options.format )
+    else:
+        print "No valid FASTQ reads were provided."
+
+if __name__ == "__main__": main()
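A hypothetical invocation of the script above, masking every base whose quality score is at most 20 with 'N' (file names are examples only)::

    python fastq_masker_by_quality.py -f sanger -m N -c le -s 20 input.fastqsanger output.fastqsanger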
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_masker_by_quality.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_masker_by_quality.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,60 @@
+<tool id="fastq_masker_by_quality" name="FASTQ Masker" version="1.0.0">
+  <description>by quality score</description>
+  <command interpreter="python">fastq_masker_by_quality.py '$input_file' '$output_file' -f '${input_file.extension[len( 'fastq' ):]}' -s '${quality_score}' -c '${score_comparison}'
+      #if $mask_type.value == 'lowercase'
+      --lowercase
+      #else
+      -m '${mask_type}'
+      #end if
+  </command>
+  <inputs>
+    <param name="input_file" type="data" format="fastqsanger" label="File to mask" />
+    <param name="mask_type" type="select" label="Mask input with">
+      <option value="N">N's</option>
+      <option value="lowercase">Lowercase</option>
+    </param>
+    <param name="score_comparison" type="select" label="When score is">
+      <option value="le" selected="True">Less than or equal</option>
+      <option value="lt">Less than</option>
+      <option value="eq">Equal to</option>
+      <option value="ne">Not Equal to</option>
+      <option value="ge">Greater than</option>
+      <option value="gt">Greater than or equal</option>
+    </param>
+    <param name="quality_score" type="integer" value="0" label="Quality score"/>
+  </inputs>
+  <outputs>
+    <data name="output_file" format="fastqsanger" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="mask_type" value="N" />
+      <param name="score_comparison" value="le" />
+      <param name="quality_score" value="20" />
+      <output name="output_file" file="sanger_full_range_masked_N.fastqsanger" />
+    </test>
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="mask_type" value="lowercase" />
+      <param name="score_comparison" value="le" />
+      <param name="quality_score" value="20" />
+      <output name="output_file" file="sanger_full_range_masked_lowercase.fastqsanger" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool masks base characters in a FASTQ file based on a user-specified quality score value and comparison method.
+
+This tool is not available for use on color space (csSanger) formats.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_paired_end_deinterlacer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_paired_end_deinterlacer.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+#Florent Angly
+import sys
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner
+
+def main():
+    input_filename   = sys.argv[1]
+    input_type       = sys.argv[2] or 'sanger'
+    mate1_filename   = sys.argv[3]
+    mate2_filename   = sys.argv[4]
+    single1_filename = sys.argv[5]
+    single2_filename = sys.argv[6]
+
+    type        = input_type
+    input       = fastqNamedReader( open( input_filename, 'rb' ), format = type  )
+    mate1_out   = fastqWriter( open( mate1_filename, 'wb' ), format = type )
+    mate2_out   = fastqWriter( open( mate2_filename, 'wb' ), format = type )
+    single1_out = fastqWriter( open( single1_filename, 'wb' ), format = type )
+    single2_out = fastqWriter( open( single2_filename, 'wb' ), format = type )
+    joiner      = fastqJoiner( type )
+
+    i = None
+    skip_count = 0
+    found = {}
+    for i, mate1 in enumerate( fastqReader( open( input_filename, 'rb' ), format = type ) ):
+     
+        if mate1.identifier in found:
+            del found[mate1.identifier]
+            continue
+
+        mate2 = input.get( joiner.get_paired_identifier( mate1 ) )
+
+        if mate2:
+            # This is a mate pair
+            found[mate2.identifier] = None
+            if joiner.is_first_mate( mate1 ):
+                mate1_out.write( mate1 )
+                mate2_out.write( mate2 )
+            else:
+                mate1_out.write( mate2 )
+                mate2_out.write( mate1 )
+        else:
+            # This is a single
+            skip_count += 1
+            if joiner.is_first_mate( mate1 ):
+                single1_out.write( mate1 )
+            else:
+                single2_out.write( mate1 )
+
+    if i is None:
+        print "Your input file contained no valid FASTQ sequences."
+    else:
+        if skip_count:
+            print 'There were %i reads with no mate.' % skip_count
+        print 'De-interlaced %s pairs of sequences.' % ( (i - skip_count + 1)/2 )
+
+    input.close()
+    mate1_out.close()
+    mate2_out.close()
+    single1_out.close()
+    single2_out.close()
+
+
+if __name__ == "__main__":
+    main()
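A hypothetical invocation matching the six positional arguments parsed above (file names are examples only)::

    python fastq_paired_end_deinterlacer.py interlaced.fastqsanger sanger mate1.fastq mate2.fastq single1.fastq single2.fastq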
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_paired_end_deinterlacer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_paired_end_deinterlacer.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,70 @@
+<tool id="fastq_paired_end_deinterlacer" name="FASTQ de-interlacer" version="1.1">
+  <description>on paired end reads</description>
+  <command interpreter="python">fastq_paired_end_deinterlacer.py '$input_file' '${input_file.extension[len( 'fastq' ):]}' '$output1_pairs_file' '$output2_pairs_file' '$output1_singles_file' '$output2_singles_file'</command>
+  <inputs>
+    <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ reads" />
+  </inputs>
+  <outputs>
+    <data name="output1_pairs_file" format="input" label="FASTQ de-interlacer left mates from data ${input_file.hid}" />
+    <data name="output2_pairs_file" format="input" label="FASTQ de-interlacer right mates from data ${input_file.hid}"/>
+    <data name="output1_singles_file" format="input" label="FASTQ de-interlacer left singles from data ${input_file.hid}"/>
+    <data name="output2_singles_file" format="input" label="FASTQ de-interlacer right singles from data ${input_file.hid}"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_file" value="paired_end_merged.fastqsanger" ftype="fastqsanger" />
+      <output name="output1_pairs_file" file="paired_end_1.fastqsanger" />
+      <output name="output2_pairs_file" file="paired_end_2.fastqsanger" />
+      <output name="output1_singles_file" file="paired_end_1_singles.fastqsanger" />
+      <output name="output2_singles_file" file="paired_end_2_singles.fastqsanger" />
+    </test>
+    <test>
+      <param name="input_file" value="paired_end_merged_errors.fastqsanger" ftype="fastqsanger" />
+      <output name="output1_pairs_file" file="paired_end_1_cleaned.fastqsanger" />
+      <output name="output2_pairs_file" file="paired_end_2_cleaned.fastqsanger" />
+      <output name="output1_singles_file" file="paired_end_1_cleaned_singles.fastqsanger" />
+      <output name="output2_singles_file" file="paired_end_2_cleaned_singles.fastqsanger" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+De-interlaces a single fastq dataset representing a paired-end run into two fastq datasets, one containing the first mates and the other the second mates. Reads without a mate are saved in separate output files.
+
+Sequence identifiers for paired-end reads must follow the /1 and /2 convention.
+
+-----
+
+**Input**
+
+A multiple-fastq file containing paired-end reads, for example::
+
+    @1539:931/1
+    ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG
+    +1539:931/1
+    BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+    @1539:931/2
+    CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT
+    +1539:931/2
+    WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+-----
+
+**Output**
+
+Multi-fastq file with left-hand mate only::
+
+    @1539:931/1
+    ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG
+    +1539:931/1
+    BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+Multi-fastq file with right-hand mate only::
+
+    @1539:931/2
+    CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT
+    +1539:931/2
+    WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_paired_end_interlacer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_paired_end_interlacer.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+#Florent Angly
+import sys
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner
+
+def main():
+    mate1_filename   = sys.argv[1]
+    mate1_type       = sys.argv[2] or 'sanger'
+    mate2_filename   = sys.argv[3]
+    mate2_type       = sys.argv[4] or 'sanger'
+    outfile_pairs    = sys.argv[5]
+    outfile_singles = sys.argv[6]
+
+    if mate1_type != mate2_type:
+        print "WARNING: You are trying to interlace files of two different types: %s and %s." % ( mate1_type, mate2_type )
+        return
+
+    type = mate1_type
+    joiner = fastqJoiner( type )
+    out_pairs = fastqWriter( open( outfile_pairs, 'wb' ), format = type )
+    out_singles = fastqWriter( open( outfile_singles, 'wb' ), format = type )
+
+    # Pairs + singles present in mate1
+    nof_singles = 0
+    nof_pairs   = 0
+    mate2_input = fastqNamedReader( open( mate2_filename, 'rb' ), format = type )
+    i = None
+    for i, mate1 in enumerate( fastqReader( open( mate1_filename, 'rb' ), format = type ) ):
+        mate2 = mate2_input.get( joiner.get_paired_identifier( mate1 ) )
+        if mate2:
+            out_pairs.write( mate1 )
+            out_pairs.write( mate2 )
+            nof_pairs += 1
+        else:
+            out_singles.write( mate1 )
+            nof_singles += 1
+
+    # Singles present in mate2
+    mate1_input = fastqNamedReader( open( mate1_filename, 'rb' ), format = type )
+    j = None
+    for j, mate2 in enumerate( fastqReader( open( mate2_filename, 'rb' ), format = type ) ):
+        mate1 = mate1_input.get( joiner.get_paired_identifier( mate2 ) )
+        if not mate1:
+            out_singles.write( mate2 )
+            nof_singles += 1
+
+    if (i is None) and (j is None):
+        print "Your input files contained no valid FASTQ sequences."
+    else:
+        print 'There were %s single reads.' % ( nof_singles )
+        print 'Interlaced %s pairs of sequences.' % ( nof_pairs )
+
+    mate1_input.close()
+    mate2_input.close()
+    out_pairs.close()
+    out_singles.close()
+
+
+if __name__ == "__main__":
+    main()
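A hypothetical invocation matching the positional arguments parsed above (file names are examples only)::

    python fastq_paired_end_interlacer.py left_mates.fastqsanger sanger right_mates.fastqsanger sanger pairs.fastq singles.fastq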
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_paired_end_interlacer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_paired_end_interlacer.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,72 @@
+<tool id="fastq_paired_end_interlacer" name="FASTQ interlacer" version="1.1">
+  <description>on paired end reads</description>
+  <command interpreter="python">fastq_paired_end_interlacer.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$outfile_pairs' '$outfile_singles'</command>
+  <inputs>
+    <param name="input1_file" type="data" format="fastqsanger,fastqcssanger" label="Left-hand mates" />
+    <param name="input2_file" type="data" format="fastqsanger,fastqcssanger" label="Right-hand mates" />
+  </inputs>
+  <outputs>
+    <!-- $input1_file.name = filename  , e.g. paired_end_2_errors.fastqsanger -->
+    <!-- $input1_file.id   = ID        , e.g. 10 -->
+    <!-- $input1_file.hid  = history ID, e.g. 5  -->
+    <data name="outfile_pairs"   format="input" label="FASTQ interlacer pairs from data ${input1_file.hid} and data ${input2_file.hid}"/>
+    <data name="outfile_singles" format="input" label="FASTQ interlacer singles from data ${input1_file.hid} and data ${input2_file.hid}"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1_file" value="paired_end_1.fastqsanger" ftype="fastqsanger" />
+      <param name="input2_file" value="paired_end_2.fastqsanger" ftype="fastqsanger" />
+      <output name="outfile_pairs" file="paired_end_merged.fastqsanger" />
+      <output name="outfile_singles" file="paired_end_merged_singles.fastqsanger" />
+    </test>
+    <test>
+      <param name="input1_file" value="paired_end_1_errors.fastqsanger" ftype="fastqsanger" />
+      <param name="input2_file" value="paired_end_2_errors.fastqsanger" ftype="fastqsanger" />
+      <output name="outfile_pairs" file="paired_end_merged_cleaned.fastqsanger" />
+      <output name="outfile_singles" file="paired_end_merged_cleaned_singles.fastqsanger" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool joins paired end FASTQ reads from two separate files, one with the left mates and one with the right mates, into a single file where left mates alternate with their right mates. The join is performed using sequence identifiers, allowing the two files to contain differing ordering. If a sequence identifier does not appear in both files, it is included in a separate file.
+
+Sequence identifiers with /1 and /2 appended override the left-hand and right-hand designation; i.e. if the reads end with /1 and /2, the read containing /1 will be used as the left-hand read and the read containing /2 will be used as the right-hand read. Sequences without this designation follow the left-hand and right-hand assignment of the input files.
+
+-----
+
+**Input**
+
+Left-hand mates, for example::
+
+    @1539:931/1
+    ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG
+    +1539:931/1
+    BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+Right-hand mates, for example::
+
+    @1539:931/2
+    CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT
+    +1539:931/2
+    WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+-----
+
+**Output**
+
+A multiple-fastq file containing interlaced left and right paired reads::
+
+    @1539:931/1
+    ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG
+    +1539:931/1
+    BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+    @1539:931/2
+    CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT
+    +1539:931/2
+    WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A multiple-fastq file containing reads that have no mate is also produced.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_paired_end_joiner.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_paired_end_joiner.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,38 @@
+#Dan Blankenberg
+import sys, os, shutil
+from galaxy_utils.sequence.fastq import fastqReader, fastqNamedReader, fastqWriter, fastqJoiner
+
+def main():
+    #Read command line arguments
+    input1_filename = sys.argv[1]
+    input1_type = sys.argv[2] or 'sanger'
+    input2_filename = sys.argv[3]
+    input2_type = sys.argv[4] or 'sanger'
+    output_filename = sys.argv[5]
+    
+    if input1_type != input2_type:
+        print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type )
+    
+    input2 = fastqNamedReader( open( input2_filename, 'rb' ), input2_type )
+    joiner = fastqJoiner( input1_type )
+    out = fastqWriter( open( output_filename, 'wb' ), format = input1_type )
+    
+    i = None
+    skip_count = 0
+    for i, fastq_read in enumerate( fastqReader( open( input1_filename, 'rb' ), format = input1_type ) ):
+        identifier = joiner.get_paired_identifier( fastq_read )
+        fastq_paired = input2.get( identifier )
+        if fastq_paired is None:
+            skip_count += 1
+        else:
+            out.write( joiner.join( fastq_read, fastq_paired ) )
+    out.close()
+    
+    if i is None:
+        print "Your file contains no valid FASTQ reads."
+    else:
+        print input2.has_data()
+        print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 )
+
+if __name__ == "__main__":
+    main()
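The get_paired_identifier() lookup used above pairs reads by rewriting their names; the /1 and /2 convention described in the joiner help below can be sketched as (illustrative only; the real logic lives in galaxy_utils' fastqJoiner)::

    def get_paired_identifier( identifier ):
        # swap a trailing /1 for /2, and vice versa, to derive the mate's name
        if identifier.endswith( '/1' ):
            return identifier[ :-2 ] + '/2'
        if identifier.endswith( '/2' ):
            return identifier[ :-2 ] + '/1'
        return identifier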
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_paired_end_joiner.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_paired_end_joiner.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,62 @@
+<tool id="fastq_paired_end_joiner" name="FASTQ joiner" version="1.0.0">
+  <description>on paired end reads</description>
+  <command interpreter="python">fastq_paired_end_joiner.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$output_file'</command>
+  <inputs>
+    <param name="input1_file" type="data" format="fastqsanger,fastqcssanger" label="Left-hand Reads" />
+    <param name="input2_file" type="data" format="fastqsanger,fastqcssanger" label="Right-hand Reads" />
+  </inputs>
+  <outputs>
+    <data name="output_file" format="input" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1_file" value="split_pair_reads_1.fastqsanger" ftype="fastqsanger" />
+      <param name="input2_file" value="split_pair_reads_2.fastqsanger" ftype="fastqsanger" />
+      <output name="output_file" file="3.fastqsanger" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool joins paired end FASTQ reads from two separate files into a single read in one file. The join is performed using sequence identifiers, allowing the two files to contain differing ordering. If a sequence identifier does not appear in both files, it is excluded from the output.
+
+Sequence identifiers with /1 and /2 appended override the left-hand and right-hand designation; i.e. if the reads end with /1 and /2, the read containing /1 will be used as the left-hand read and the read containing /2 will be used as the right-hand read. Sequences without this designation follow the left-hand and right-hand assignment of the input files.
+
+-----
+
+**Input formats**
+
+Left-hand Read::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+    GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+    hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+
+Right-hand Read::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+    GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+    hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+-----
+
+**Output**
+
+A multiple-fastq file, for example::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758
+    GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758
+    hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_paired_end_splitter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_paired_end_splitter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,33 @@
+#Dan Blankenberg
+import sys, os, shutil
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqSplitter
+
+def main():
+    #Read command line arguments
+    input_filename = sys.argv[1]
+    input_type = sys.argv[2] or 'sanger'
+    output1_filename = sys.argv[3]
+    output2_filename = sys.argv[4]
+    
+    splitter = fastqSplitter()
+    out1 = fastqWriter( open( output1_filename, 'wb' ), format = input_type )
+    out2 = fastqWriter( open( output2_filename, 'wb' ), format = input_type )
+    
+    i = None
+    skip_count = 0
+    for i, fastq_read in enumerate( fastqReader( open( input_filename, 'rb' ), format = input_type ) ):
+        read1, read2 = splitter.split( fastq_read )
+        if read1 and read2:
+            out1.write( read1 )
+            out2.write( read2 )
+        else:
+            skip_count += 1
+    out1.close()
+    out2.close()
+    if i is None:
+        print "Your file contains no valid FASTQ reads."
+    else:
+        print 'Split %s of %s reads (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 )
+
+if __name__ == "__main__":
+    main()
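The splitter.split() call above divides each joined read at its midpoint, which is why both ends must be the same length; schematically (illustrative only, not the galaxy_utils implementation)::

    def split_joined_read( sequence, quality ):
        # each half keeps its own sequence and quality slice
        half = len( sequence ) // 2
        return ( sequence[ :half ], quality[ :half ] ), ( sequence[ half: ], quality[ half: ] )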
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_paired_end_splitter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_paired_end_splitter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,63 @@
+<tool id="fastq_paired_end_splitter" name="FASTQ splitter" version="1.0.0">
+  <description>on joined paired end reads</description>
+  <command interpreter="python">fastq_paired_end_splitter.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$output1_file' '$output2_file'</command>
+  <inputs>
+    <param name="input1_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ reads" />
+  </inputs>
+  <outputs>
+    <data name="output1_file" format="input" />
+    <data name="output2_file" format="input" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1_file" value="3.fastqsanger" ftype="fastqsanger" />
+      <output name="output1_file" file="split_pair_reads_1.fastqsanger" />
+      <output name="output2_file" file="split_pair_reads_2.fastqsanger" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+Splits a single fastq dataset representing a paired-end run into two datasets (one for each end). This tool works only for datasets where both ends have **the same** length.
+
+Sequence identifiers will have /1 or /2 appended for the split left-hand and right-hand reads, respectively.
+
+-----
+
+**Input format**
+
+A multiple-fastq file, for example::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758
+    GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758
+    hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+
+-----
+
+**Outputs**
+
+Left-hand Read::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+    GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+    hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+
+Right-hand Read::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+    GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+    hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_stats.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_stats.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,48 @@
+#Dan Blankenberg
+import sys
+from galaxy_utils.sequence.fastq import fastqReader, fastqAggregator
+
+VALID_NUCLEOTIDES = [ 'A', 'C', 'G', 'T', 'N' ]
+VALID_COLOR_SPACE = map( str, range( 7 ) ) + [ '.' ]
+SUMMARY_STAT_ORDER = ['read_count', 'min_score', 'max_score', 'sum_score', 'mean_score', 'q1', 'med_score', 'q3', 'iqr', 'left_whisker', 'right_whisker' ]
+
+def main():
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    input_type = sys.argv[3] or 'sanger'
+    
+    aggregator = fastqAggregator()
+    num_reads = None
+    fastq_read = None
+    for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
+        aggregator.consume_read( fastq_read )
+    out = open( output_filename, 'wb' )
+    valid_nucleotides = VALID_NUCLEOTIDES
+    if fastq_read:
+        if fastq_read.sequence_space == 'base':
+            out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n' )
+        else:
+            out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n' )
+            valid_nucleotides = VALID_COLOR_SPACE
+    for i in range( aggregator.get_max_read_length() ):
+        column_stats = aggregator.get_summary_statistics_for_column( i )
+        out.write( '%i\t' % ( i + 1 ) )
+        out.write( '%s\t' * len( SUMMARY_STAT_ORDER ) % tuple( [ column_stats[ key ] for key in SUMMARY_STAT_ORDER ] ) )
+        out.write( '%s\t' % ','.join( map( str, column_stats['outliers'] ) ) )
+        base_counts = aggregator.get_base_counts_for_column( i )
+        for nuc in valid_nucleotides:
+            out.write( "%s\t" % base_counts.get( nuc, 0 ) )
+        extra_nucs = sorted( [ nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides ] )
+        out.write( "%s\t%s\n" % ( ','.join( extra_nucs ), ','.join( str( base_counts[nuc] ) for nuc in extra_nucs ) ) )
+    out.close()
+    if num_reads is None:
+        print "No valid fastq reads could be processed."
+    else:
+        print "%i fastq reads were processed." % ( num_reads + 1 )
+        print "Based upon quality values and sequence characters, the input data is valid for: %s" % ( ", ".join( aggregator.get_valid_formats() ) or "None" )
+        ascii_range = aggregator.get_ascii_range()
+        decimal_range =  aggregator.get_decimal_range()
+        print "Input ASCII range: %s(%i) - %s(%i)" % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) ) #print using repr, since \x00 (null) causes info truncation in galaxy when printed
+        print "Input decimal range: %i - %i" % ( decimal_range[0], decimal_range[1] )
+
+if __name__ == "__main__": main()
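The Q1/med/Q3/IQR columns written above are standard five-number summary values; for reference, one conventional way to compute them (illustrative definitions only; the tool itself uses fastqAggregator)::

    def quartiles( scores ):
        # assumes at least two scores per column
        s = sorted( scores )
        def median( v ):
            m = len( v ) // 2
            return v[ m ] if len( v ) % 2 else ( v[ m - 1 ] + v[ m ] ) / 2.0
        q1 = median( s[ :len( s ) // 2 ] )
        q3 = median( s[ ( len( s ) + 1 ) // 2: ] )
        return q1, median( s ), q3, q3 - q1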
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_stats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_stats.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,71 @@
+<tool id="fastq_stats" name="FASTQ Summary Statistics" version="1.0.0">
+  <description>by column</description>
+  <command interpreter="python">fastq_stats.py '$input_file' '$output_file' '${input_file.extension[len( 'fastq' ):]}'</command>
+  <inputs>
+    <param name="input_file" type="data" format="fastqsanger,fastqillumina,fastqsolexa,fastqcssanger" label="FASTQ File"/>
+  </inputs>
+  <outputs>
+    <data name="output_file" format="tabular" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_file" value="fastq_stats1.fastq" ftype="fastqsanger" />
+      <output name="output_file" file="fastq_stats_1_out.tabular" />
+    </test>
+  </tests>
+  <help>
+This tool creates summary statistics on a FASTQ file. 
+
+.. class:: infomark
+
+**TIP:** This statistics report can be used as input for the **Boxplot** and **Nucleotides Distribution** tools.
+
+-----
+
+**The output file will contain the following fields:**
+
+* column      = column number (1 to 36 for a 36-cycle Solexa read file)
+* count       = number of bases found in this column.
+* min         = Lowest quality score value found in this column.
+* max         = Highest quality score value found in this column.
+* sum         = Sum of quality score values for this column.
+* mean        = Mean quality score value for this column.
+* Q1          = 1st quartile quality score.
+* med         = Median quality score.
+* Q3          = 3rd quartile quality score.
+* IQR         = Inter-Quartile range (Q3-Q1).
+* lW          = 'Left-Whisker' value (for boxplotting).
+* rW          = 'Right-Whisker' value (for boxplotting).
+* outliers    = Scores falling beyond the left and right whiskers (comma separated list).
+* A_Count     = Count of 'A' nucleotides found in this column.
+* C_Count     = Count of 'C' nucleotides found in this column.
+* G_Count     = Count of 'G' nucleotides found in this column.
+* T_Count     = Count of 'T' nucleotides found in this column.
+* N_Count     = Count of 'N' nucleotides found in this column.
+* Other_Nucs  = Comma separated list of other nucleotides found in this column.
+* Other_Count = Comma separated count of other nucleotides found in this column.
+
+For example::
+
+  #column   count   min max sum mean    Q1  med Q3  IQR lW  rW  outliers    A_Count C_Count G_Count T_Count N_Count other_bases other_base_count
+  1   14336356    2   33  450600675   31.4306281875   32.0    33.0    33.0    1.0 31  33  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30    4482314 2199633 4425957 3208745 19707       
+  2   14336356    2   34  441135033   30.7703737965   30.0    33.0    33.0    3.0 26  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25   4419184 2170537 4627987 3118567 81      
+  3   14336356    2   34  433659182   30.2489127642   29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4310988 2941988 3437467 3645784 129     
+  4   14336356    2   34  433635331   30.2472490917   29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4110637 3007028 3671749 3546839 103     
+  5   14336356    2   34  432498583   30.167957813    29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4348275 2935903 3293025 3759029 124     
+
+-----
+
+.. class:: warningmark
+
+Adapter bases in color space reads are excluded from statistics.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_to_fasta.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_to_fasta.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,22 @@
+#Dan Blankenberg
+import sys
+from galaxy_utils.sequence.fastq import fastqReader
+from galaxy_utils.sequence.fasta import fastaWriter
+
+def main():
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    input_type = sys.argv[3] or 'sanger' #input type should ordinarily be unnecessary
+    
+    num_reads = None
+    fastq_read = None
+    out = fastaWriter( open( output_filename, 'wb' ) )
+    for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
+        out.write( fastq_read )
+    out.close()
+    if num_reads is None:
+        print "No valid FASTQ reads could be processed."
+    else:
+        print "%i FASTQ reads were converted to FASTA." % ( num_reads + 1 )
+    
+if __name__ == "__main__": main()
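The conversion above simply drops the quality information; a FASTQ record such as (hypothetical read name)::

    @FAKE0001 description
    ACGTACGT
    +
    IIIIIIII

becomes the FASTA record::

    >FAKE0001 description
    ACGTACGT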
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_to_fasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_to_fasta.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,40 @@
+<tool id="fastq_to_fasta_python" name="FASTQ to FASTA" version="1.0.0">
+  <description>converter</description>
+  <command interpreter="python">fastq_to_fasta.py '$input_file' '$output_file' '${input_file.extension[len( 'fastq' ):]}'</command>
+  <inputs>
+    <param name="input_file" type="data" format="fastq" label="FASTQ file to convert" />
+  </inputs>
+  <outputs>
+    <data name="output_file" format="fasta" />
+  </outputs>
+  <tests>
+    <!-- basic test -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <output name="output_file" file="fastq_to_fasta_python_1.out" />
+    </test>
+    <!-- color space test -->
+    <test>
+      <param name="input_file" value="sanger_full_range_as_cssanger.fastqcssanger" ftype="fastqcssanger" />
+      <output name="output_file" file="fastq_to_fasta_python_2.out" />
+    </test>
+    <!-- test of ignoring invalid score values: this input has ascii characters falling outside of illumina range, but they should not matter -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqillumina" />
+      <output name="output_file" file="fastq_to_fasta_python_1.out" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool converts FASTQ sequencing reads to FASTA sequences.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_to_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_to_tabular.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,38 @@
+#Dan Blankenberg
+import sys
+from galaxy_utils.sequence.fastq import fastqReader
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():
+    if len(sys.argv) != 5:
+        stop_err("Wrong number of arguments. Expect: fasta tabular desrc_split [type]")
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    descr_split = int( sys.argv[3] ) - 1
+    if descr_split < 0:
+        stop_err("Bad description split value (should be 1 or more)")
+    input_type = sys.argv[4] or 'sanger' #input type should ordinarily be unnecessary
+    
+    num_reads = None
+    fastq_read = None
+    out = open( output_filename, 'wb' )
+    if descr_split == 0:
+        #Don't divide the description into multiple columns
+        for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
+            out.write( "%s\t%s\t%s\n" % ( fastq_read.identifier[1:].replace( '\t', ' ' ), fastq_read.sequence.replace( '\t', ' ' ), fastq_read.quality.replace( '\t', ' ' ) ) )
+    else:
+        for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
+            words = fastq_read.identifier[1:].replace( '\t', ' ' ).split(None, descr_split)
+            #pad with empty columns if required, giving descr_split + 1 title columns in every row
+            words += [""]*(descr_split + 1 - len(words))
+            out.write( "%s\t%s\t%s\n" % ("\t".join(words), fastq_read.sequence.replace( '\t', ' ' ), fastq_read.quality.replace( '\t', ' ' ) ) )
+    out.close()
+    if num_reads is None:
+        print "No valid FASTQ reads could be processed."
+    else:
+        print "%i FASTQ reads were converted to Tabular." % ( num_reads + 1 )
+    
+if __name__ == "__main__": main()
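The split-and-pad step is easiest to follow on a concrete title line. A standalone sketch that pads every row to the requested number of title columns::

    # Split a FASTQ title into descr_columns columns, padding short titles.
    def split_title(title, descr_columns):
        descr_split = descr_columns - 1  # maximum number of split points
        words = title.replace("\t", " ").split(None, descr_split)
        words += [""] * (descr_columns - len(words))  # keep the width fixed
        return words

    print(split_title("FSRRS4401BE7HA [length=395] [gc=36.46]", 2))
    # ['FSRRS4401BE7HA', '[length=395] [gc=36.46]']
    print(split_title("READ1", 2))
    # ['READ1', ''] - padding keeps the column count stable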
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_to_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_to_tabular.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,101 @@
+<tool id="fastq_to_tabular" name="FASTQ to Tabular" version="1.1.0">
+  <description>converter</description>
+  <command interpreter="python">fastq_to_tabular.py '$input_file' '$output_file' $descr_columns '${input_file.extension[len( 'fastq' ):]}'</command>
+  <inputs>
+    <param name="input_file" type="data" format="fastqsanger,fastqcssanger,fastqillumina,fastqsolexa" label="FASTQ file to convert" />
+    <param name="descr_columns" type="integer" size="2" value="1" label="How many columns to divide title string into?" help="Typically 2 to take the ID (first word) and decription (rest) as two columns, or 1 to give a single column">
+      <validator type="in_range" min="1" />
+    </param>
+  </inputs>
+  <outputs>
+    <data name="output_file" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- basic test -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="descr_columns" value="1"/>
+      <output name="output_file" file="fastq_to_tabular_out_1.tabular" />
+    </test>
+    <!-- color space test -->
+    <test>
+      <param name="input_file" value="sanger_full_range_as_cssanger.fastqcssanger" ftype="fastqcssanger" />
+      <param name="descr_columns" value="1"/>
+      <output name="output_file" file="fastq_to_tabular_out_2.tabular" />
+    </test>
+    <!-- split title into columns -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="descr_columns" value="2"/>
+      <output name="output_file" file="fastq_to_tabular_out_3.tabular" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool converts FASTQ sequencing reads to a Tabular file.
+
+It is conventional to take the first word of the FASTQ "@" title line as the identifier, and any remaining text as a free-form description.
+It is therefore often useful to split this text into two columns in Galaxy (identifier and any description) by setting **How many columns to divide title string into?** to **2**.
+In some cases the description can usefully be broken up into more columns -- see the examples below.
+
+Tab characters, if present in the source FASTQ title, will be converted to spaces.
+
+-----
+
+**Example**
+
+Consider the following two 454 reads in Sanger FASTQ format (line wrapped here for display, but note that not all tools accept line-wrapped FASTQ files)::
+
+ @FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95]
+ tcagTTAAGATGGGATAATATCCTCAGATTGCGTGATGAACTTTGTTCTGGTGGAGGAGAAGGAAGTGCATTCGACGTAT
+ GCCCGTTTGTCGATATTTGtatttaaagtaatccgtcacaaatcagtgacataaatattatttagatttcgggagcaact
+ ttatttattccacaagcaggtttaaattttaaatttaaattattgcagaagactttaaattaacctcgttgtcggagtca
+ tttgttcggttattggtcgaaagtaaccncgggaagtgccgaaaactaacaaacaaaagaagatagtgaaattttaatta
+ aaanaaatagccaaacgtaactaactaaaacggacccgtcgaggaactgccaacggacgacacagggagtagnnn
+ +FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95]
+ FFFDDDDDDDA666?688FFHGGIIIIIIIIIIIIIIIIIIHHHIIIIIIIIIGHGFFFFF====DFFFFFFFFFFFFFF
+ D???:3104/76=:5...4.3,,,366////4&lt;ABBAAA=CCFDDDDDDDD:666CDFFFF=&lt;ABA=;:333111&lt;===9
+ 9;B889FFFFFFDDBDBDDD=8844231..,,,-,,,,,,,,1133..---17111,,,,,22555131121.--.,333
+ 11,.,,3--,,.,,--,3511123..--!,,,,--,----9,,,,8=,,-,,,-,,,,---26:9:5-..1,,,,11//,
+ ,,,!,,1917--,,,,-3.,--,,17,,,,---+11113.030000,,,044400036;96662.//;7&gt;&lt;;!!!
+ @FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74]
+ tcagCCAGCAATTCCGACTTAATTGTTCTTCTTCCATCATTCATCTCGACTAACAGTTCTACGATTAATGAGTTTGGCtt
+ taatttgttgttcattattgtcacaattacactactgagactgccaaggcacncagggataggnn
+ +FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74]
+ FFFFFFFFFDDDDFFFFGFDDDDBAAAAA=&lt;4444@@B=555:BBBBB@@?8:8&lt;?&lt;89898&lt;84442;==3,,,514,,
+ ,11,,,.,,21777555513,..--1115758.//34488&gt;&lt;&lt;;;;;9944/!/4,,,57855!!
+
+By default this is converted into a 3 column tabular file, with the full FASTQ title used as column 1:
+
+=================================================================================================== ============== ==============
+FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] tcagTTAA...nnn FFFDDDDD...!!!
+FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] tcagCCAG...gnn FFFFFFFF...5!!
+=================================================================================================== ============== ==============
+
+If you specify that the title should be turned into 2 columns, you get 4 columns in total:
+
+============== ==================================================================================== ============== ==============
+FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] tcagTTAA...nnn FFFDDDDD...!!!
+FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] tcagCCAG...gnn FFFFFFFF...5!!
+============== ==================================================================================== ============== ==============
+
+Similarly, for this example treating the title string as 7 columns makes sense:
+
+============== ============ ========== =========== ============= ============== =================== ============== ==============
+FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] tcagTTAA...nnn FFFDDDDD...!!!
+FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] tcagCCAG...gnn FFFFFFFF...5!!
+============== ============ ========== =========== ============= ============== =================== ============== ==============
+
+Note the sequences and quality strings have been truncated for display purposes in the above tables.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_trimmer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_trimmer.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,41 @@
+#Dan Blankenberg
+import sys
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
+
+def main():
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    left_offset = sys.argv[3]
+    right_offset = sys.argv[4]
+    percent_offsets = sys.argv[5] == 'offsets_percent'
+    input_type = sys.argv[6] or 'sanger'
+    keep_zero_length = sys.argv[7] == 'keep_zero_length'
+    
+    out = fastqWriter( open( output_filename, 'wb' ), format = input_type )
+    num_reads_excluded = 0
+    num_reads = None
+    for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
+        if percent_offsets:
+            left_column_offset = int( round( float( left_offset ) / 100.0 * float( len( fastq_read ) ) ) )
+            right_column_offset = int( round( float( right_offset ) / 100.0 * float( len( fastq_read ) ) ) )
+        else:
+            left_column_offset = int( left_offset )
+            right_column_offset = int( right_offset )
+        if right_column_offset > 0:
+            right_column_offset = -right_column_offset
+        else:
+            right_column_offset = None
+        fastq_read = fastq_read.slice( left_column_offset, right_column_offset )
+        if keep_zero_length or len( fastq_read ):
+            out.write( fastq_read )
+        else:
+            num_reads_excluded += 1
+    out.close()
+    if num_reads is None:
+        print "No valid fastq reads could be processed."
+    else:
+        print "%i fastq reads were processed." % ( num_reads + 1 )
+    if num_reads_excluded:
+        print "%i reads of zero length were excluded from the output." % num_reads_excluded
+
+if __name__ == "__main__": main()
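The core of the trimmer is turning two non-negative end offsets into Python slice bounds; the right offset must map to `None` when it is zero, because `seq[left:-0]` is `seq[left:0]`, an empty slice. A sketch of just that computation on a plain string, using the same rounding as the percent mode above::

    # Offset-to-slice logic on a bare string (sketch, not the tool itself).
    def trim(seq, left, right, percent=False):
        if percent:  # offsets given as percentages of the read length
            left = int(round(left / 100.0 * len(seq)))
            right = int(round(right / 100.0 * len(seq)))
        # A right offset of 0 must become None; seq[left:-0] would be empty.
        return seq[left:-right or None]

    read = "CAATATGTNCTCACTGATAAGTGGATATNAGCNCCA"  # length 36, from the help
    print(trim(read, 2, 9))                 # absolute offsets 2 and 9
    print(trim(read, 6, 20, percent=True))  # 6% and 20% -> offsets 2 and 7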
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_trimmer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_trimmer.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,120 @@
+<tool id="fastq_trimmer" name="FASTQ Trimmer" version="1.0.0">
+  <description>by column</description>
+  <command interpreter="python">fastq_trimmer.py '$input_file' '$output_file' '${offset_type['left_column_offset']}' '${offset_type['right_column_offset']}' '${offset_type['base_offset_type']}' '${input_file.extension[len( 'fastq' ):]}' '$keep_zero_length'</command>
+  <inputs>
+    <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ File"/>
+    <conditional name="offset_type">
+      <param name="base_offset_type" type="select" label="Define Base Offsets as" help="Use Absolute for fixed length reads (Illumina, SOLiD)&lt;br&gt;Use Percentage for variable length reads (Roche/454)">
+        <option value="offsets_absolute" selected="true">Absolute Values</option>
+        <option value="offsets_percent">Percentage of Read Length</option>
+      </param>
+      <when value="offsets_absolute">
+        <param name="left_column_offset" label="Offset from 5' end" value="0" type="integer" help="Values start at 0, increasing from the left">
+          <validator type="in_range" message="Base Offsets must be positive" min="0" max="inf"/>
+          <validator type="expression" message="An integer is required.">int( float( value ) ) == float( value )</validator>
+        </param>
+        <param name="right_column_offset" label="Offset from 3' end" value="0" type="integer" help="Values start at 0, increasing from the right">
+          <validator type="in_range" message="Base Offsets must be positive" min="0" max="inf"/>
+          <validator type="expression" message="An integer is required.">int( float( value ) ) == float( value )</validator>
+        </param>
+      </when>
+      <when value="offsets_percent">
+        <param name="left_column_offset" label="Offset from 5' end" value="0" type="float">
+          <validator type="in_range" message="Base Offsets must be between 0 and 100" min="0" max="100"/>
+        </param>
+        <param name="right_column_offset" label="Offset from 3' end" value="0" type="float">
+          <validator type="in_range" message="Base Offsets must be between 0 and 100" min="0" max="100"/>
+        </param>
+      </when>
+    </conditional>
+  <param name="keep_zero_length" label="Keep reads with zero length" type="boolean" truevalue="keep_zero_length" falsevalue="exclude_zero_length" selected="False"/>
+  </inputs>
+  <outputs>
+    <data name="output_file" format="input" />
+  </outputs>
+  <tests>
+    <test>
+      <!-- Do nothing trim -->
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="base_offset_type" value="offsets_absolute"/>
+      <param name="left_column_offset" value="0"/>
+      <param name="right_column_offset" value="0"/>
+      <param name="keep_zero_length" value="keep_zero_length" />
+      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
+    </test>
+    <!-- Trim to empty File -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="base_offset_type" value="offsets_absolute"/>
+      <param name="left_column_offset" value="30"/>
+      <param name="right_column_offset" value="64"/>
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <output name="output_file" file="empty_file.dat" />
+    </test>
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="base_offset_type" value="offsets_percent"/>
+      <param name="left_column_offset" value="50"/>
+      <param name="right_column_offset" value="50"/>
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <output name="output_file" file="empty_file.dat" />
+    </test>
+    <!-- Trim to 4 inner-most bases -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="base_offset_type" value="offsets_absolute"/>
+      <param name="left_column_offset" value="45"/>
+      <param name="right_column_offset" value="45"/>
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <output name="output_file" file="fastq_trimmer_out1.fastqsanger" />
+    </test>
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="base_offset_type" value="offsets_percent"/>
+      <param name="left_column_offset" value="47.87"/>
+      <param name="right_column_offset" value="47.87"/>
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <output name="output_file" file="fastq_trimmer_out1.fastqsanger" />
+    </test>
+  </tests>
+  <help>
+This tool allows you to trim the ends of reads.
+
+You can specify either absolute or percent-based offsets. Offsets are calculated, starting at 0, from the respective end to be trimmed. When using the percent-based method, offsets are rounded to the nearest integer. 
+
+For example, if you have a read of length 36::
+  
+  @Some FASTQ Sanger Read
+  CAATATGTNCTCACTGATAAGTGGATATNAGCNCCA
+  +
+  =@@.@;B-%?8&gt;CBA@&gt;7@7BBCA4-48%&lt;;;%&lt;B@
+  
+And you set absolute offsets of 2 and 9::
+  
+  @Some FASTQ Sanger Read
+  ATATGTNCTCACTGATAAGTGGATA
+  +
+  @.@;B-%?8&gt;CBA@&gt;7@7BBCA4-4
+  
+Or you set percent offsets of 6% and 20% (corresponds to absolute offsets of 2,7 for a read length of 36)::
+  
+  @Some FASTQ Sanger Read
+  ATATGTNCTCACTGATAAGTGGATATN
+  +
+  @.@;B-%?8&gt;CBA@&gt;7@7BBCA4-48%
+  
+-----
+
+.. class:: warningmark
+
+Trimming a color space read will cause any adapter base to be lost.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_trimmer_by_quality.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_trimmer_by_quality.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,126 @@
+#Dan Blankenberg
+from optparse import OptionParser
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
+
+def mean( score_list ):
+    return float( sum( score_list ) ) / float( len( score_list ) )
+
+ACTION_METHODS = { 'min':min, 'max':max, 'sum':sum, 'mean':mean }
+
+def compare( aggregated_value, operator, threshold_value ):
+    if operator == '>':
+        return aggregated_value > threshold_value
+    elif operator == '>=':
+        return aggregated_value >= threshold_value
+    elif operator == '==':
+        return aggregated_value == threshold_value
+    elif operator == '<':
+        return aggregated_value < threshold_value
+    elif operator == '<=':
+        return aggregated_value <= threshold_value
+    elif operator == '!=':
+        return aggregated_value != threshold_value
+
+def exclude( value_list, exclude_indexes ):
+    rval = []
+    for i, val in enumerate( value_list ):
+        if i not in exclude_indexes:
+            rval.append( val )
+    return rval
+
+def exclude_and_compare( aggregate_action, aggregate_list, operator, threshold_value, exclude_indexes = None ):
+    if not aggregate_list or compare( aggregate_action( aggregate_list ), operator, threshold_value ):
+        return True
+    if exclude_indexes:
+        for exclude_index in exclude_indexes:
+            excluded_list = exclude( aggregate_list, exclude_index )
+            if not excluded_list or compare( aggregate_action( excluded_list ), operator, threshold_value ):
+                return True
+    return False
+
+def main():
+    usage = "usage: %prog [options] input_file output_file"
+    parser = OptionParser( usage=usage )
+    parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'cssanger', 'solexa', 'illumina' ), help='FASTQ variant type' )
+    # Numeric defaults below: optparse does not run defaults through type
+    # conversion, so string defaults such as '1' would break the arithmetic later.
+    parser.add_option( '-s', '--window_size', type="int", dest='window_size', default=1, help='Window size' )
+    parser.add_option( '-t', '--window_step', type="int", dest='window_step', default=1, help='Window step' )
+    parser.add_option( '-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5','3','53','35' ), help='Ends to Trim' )
+    parser.add_option( '-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min','max','sum','mean' ), help='Aggregate action for window' )
+    parser.add_option( '-x', '--exclude_count', type="int", dest='exclude_count', default=0, help='Maximum number of bases to exclude from the window during aggregation' )
+    parser.add_option( '-c', '--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>','>=','==','<', '<=', '!=' ), help='Keep read when aggregate score is' )
+    parser.add_option( '-q', '--quality_score', type="float", dest='quality_score', default=0.0, help='Quality Score' )
+    parser.add_option( "-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length")
+    ( options, args ) = parser.parse_args()
+    
+    if len ( args ) != 2:
+        parser.error( "Need to specify an input file and an output file" )
+    
+    if options.window_size < 1:
+        parser.error( 'You must specify a strictly positive window size' )
+    
+    if options.window_step < 1:
+        parser.error( 'You must specify a strictly positive step size' )
+    
+    #determine an exhaustive list of window indexes that can be excluded from aggregation
+    exclude_window_indexes = []
+    last_exclude_indexes = []
+    for exclude_count in range( min( options.exclude_count, options.window_size ) ):
+        if last_exclude_indexes:
+            new_exclude_indexes = []
+            for exclude_list in last_exclude_indexes:
+                for window_index in range( options.window_size ):
+                    if window_index not in exclude_list:
+                        new_exclude = sorted( exclude_list + [ window_index ] )
+                        if new_exclude not in exclude_window_indexes + new_exclude_indexes:
+                            new_exclude_indexes.append( new_exclude )
+            exclude_window_indexes += new_exclude_indexes
+            last_exclude_indexes = new_exclude_indexes
+        else:
+            for window_index in range( options.window_size ):
+                last_exclude_indexes.append( [ window_index ] )
+            exclude_window_indexes = list( last_exclude_indexes )
+    
+    out = fastqWriter( open( args[1], 'wb' ), format = options.format )
+    action = ACTION_METHODS[ options.aggregation_action ]
+    
+    num_reads = None
+    num_reads_excluded = 0
+    for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ):
+        for trim_end in options.trim_ends:
+            quality_list = fastq_read.get_decimal_quality_scores()
+            if trim_end == '5':
+                lwindow_position = 0 #left position of window
+                while True:
+                    if lwindow_position >= len( quality_list ):
+                        fastq_read.sequence = ''
+                        fastq_read.quality = ''
+                        break
+                    if exclude_and_compare( action, quality_list[ lwindow_position:lwindow_position + options.window_size ], options.score_comparison, options.quality_score, exclude_window_indexes ):
+                        fastq_read = fastq_read.slice( lwindow_position, None )
+                        break
+                    lwindow_position += options.window_step
+            else:
+                rwindow_position = len( quality_list ) #right position of window
+                while True:
+                    lwindow_position = rwindow_position - options.window_size #left position of window
+                    if rwindow_position <= 0 or lwindow_position < 0:
+                        fastq_read.sequence = ''
+                        fastq_read.quality = ''
+                        break
+                    if exclude_and_compare( action, quality_list[ lwindow_position:rwindow_position ], options.score_comparison, options.quality_score, exclude_window_indexes ):
+                        fastq_read = fastq_read.slice( None, rwindow_position )
+                        break
+                    rwindow_position -= options.window_step
+        if options.keep_zero_length or len( fastq_read ):
+            out.write( fastq_read )
+        else:
+            num_reads_excluded += 1
+    out.close()
+    if num_reads is None:
+        print "No valid FASTQ reads could be processed."
+    else:
+        print "%i FASTQ reads were processed." % ( num_reads + 1 )
+    if num_reads_excluded:
+        print "%i reads of zero length were excluded from the output." % num_reads_excluded
+
+if __name__ == "__main__": main()
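The exclusion machinery above enumerates every way of dropping up to `exclude_count` positions from a window, which is effectively `itertools.combinations` over the window indexes. The windowing itself is easier to see in isolation; a simplified 3'-end sketch with min-aggregation and no exclusions::

    # Simplified 3'-end quality trim: slide a window leftwards and cut at
    # the first window whose minimum reaches the threshold (sketch only).
    def trim_3prime(quals, window_size, step, threshold):
        right = len(quals)
        while right - window_size >= 0:
            if min(quals[right - window_size:right]) >= threshold:
                return quals[:right]  # keep everything up to this window
            right -= step
        return []  # every window failed; the whole read is trimmed away

    print(trim_3prime([30, 32, 31, 28, 15, 9, 4], 2, 1, 20))
    # [30, 32, 31, 28] - first right-to-left window with min >= 20 is (31, 28)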
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/fastq_trimmer_by_quality.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_trimmer_by_quality.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,145 @@
+<tool id="fastq_quality_trimmer" name="FASTQ Quality Trimmer" version="1.0.0">
+  <description>by sliding window</description>
+  <command interpreter="python">fastq_trimmer_by_quality.py '$input_file' '$output_file' -f '${input_file.extension[len( 'fastq' ):]}' -s '$window_size' 
+    -t '$step_size' -e '$trim_ends' -a '$aggregation_action' -x '$exclude_count' -c '$score_comparison' -q '$quality_score' 
+    #if $keep_zero_length.value:
+        -k
+    #end if
+  </command>
+  <inputs>
+    <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ File"/>
+    <param name="keep_zero_length" label="Keep reads with zero length" type="boolean" truevalue="keep_zero_length" falsevalue="exclude_zero_length" selected="False"/>
+    <param name="trim_ends" type="select" label="Trim ends">
+      <option value="53" selected="True">5' and 3'</option>
+      <option value="5">5' only</option>
+      <option value="3">3' only</option>
+    </param>
+    <param name="window_size" type="integer" value="1" label="Window size"/>
+    <param name="step_size" type="integer" value="1" label="Step Size" />
+    <param name="exclude_count" label="Maximum number of bases to exclude from the window during aggregation" value="0" type="integer" />
+    <param name="aggregation_action" type="select" label="Aggregate action for window">
+      <option value="min" selected="True">min score</option>
+      <option value="max">max score</option>
+      <option value="sum">sum of scores</option>
+      <option value="mean">mean of scores</option>
+    </param>
+    <param name="score_comparison" type="select" label="Trim until aggregate score is">
+      <sanitizer>
+        <valid initial="none">
+            <add value="&lt;&gt;=!"/> <!-- only allow lt, gt, e, le, ge, ne for this parameter; will be single-quote escaped on commandline -->
+        </valid>
+      </sanitizer>
+      <option value="&gt;">&gt;</option>
+      <option value="&gt;=" selected="true">&gt;=</option>
+      <option value="==">==</option>
+      <option value="!=">!=</option>
+      <option value="&lt;">&lt;</option>
+      <option value="&lt;=">&lt;=</option>
+    </param>
+    <param name="quality_score" label="Quality Score" value="0" type="float" />
+  </inputs>
+  <outputs>
+    <data name="output_file" format="input" />
+  </outputs>
+  <tests>
+    <test>
+      <!-- Trim until window size 1 >= 20;both ends -->
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <param name="trim_ends" value="53"/>
+      <param name="window_size" value="1"/>
+      <param name="step_size" value="1"/>
+      <param name="exclude_count" value="0"/>
+      <param name="aggregation_action" value="min"/>
+      <param name="score_comparison" value="&gt;="/>
+      <param name="quality_score" value="20"/>
+      <output name="output_file" file="sanger_full_range_quality_trimmed_out_1.fastqsanger" />
+    </test>
+    <test>
+      <!-- Trim until window size 1 >= 20; 5' end only -->
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <param name="trim_ends" value="5"/>
+      <param name="window_size" value="1"/>
+      <param name="step_size" value="1"/>
+      <param name="exclude_count" value="0"/>
+      <param name="aggregation_action" value="min"/>
+      <param name="score_comparison" value="&gt;="/>
+      <param name="quality_score" value="20"/>
+      <output name="output_file" file="sanger_full_range_quality_trimmed_out_2.fastqsanger" />
+    </test>
+    <test>
+      <!-- Trim until window size 1 >= 20; 3' end only -->
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <param name="trim_ends" value="3"/>
+      <param name="window_size" value="1"/>
+      <param name="step_size" value="1"/>
+      <param name="exclude_count" value="0"/>
+      <param name="aggregation_action" value="min"/>
+      <param name="score_comparison" value="&gt;="/>
+      <param name="quality_score" value="20"/>
+      <output name="output_file" file="sanger_full_range_quality_trimmed_out_3.fastqsanger" />
+    </test>
+    <test>
+      <!-- Trim until window size 2 >= 1;both ends, 1 deviant score -->
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <param name="trim_ends" value="53"/>
+      <param name="window_size" value="2"/>
+      <param name="step_size" value="1"/>
+      <param name="exclude_count" value="1"/>
+      <param name="aggregation_action" value="min"/>
+      <param name="score_comparison" value="&gt;="/>
+      <param name="quality_score" value="1"/>
+      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
+    </test>
+    <test>
+      <!-- Trim entire sequences; keep empty reads -->
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="keep_zero_length" value="true" />
+      <param name="trim_ends" value="53"/>
+      <param name="window_size" value="1"/>
+      <param name="step_size" value="1"/>
+      <param name="exclude_count" value="0"/>
+      <param name="aggregation_action" value="min"/>
+      <param name="score_comparison" value="&gt;="/>
+      <param name="quality_score" value="999"/>
+      <output name="output_file" file="sanger_full_range_empty_reads.fastqsanger" />
+    </test>
+    <test>
+      <!-- Trim entire sequences; discard empty reads -->
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="keep_zero_length"/>
+      <param name="trim_ends" value="53"/>
+      <param name="window_size" value="1"/>
+      <param name="step_size" value="1"/>
+      <param name="exclude_count" value="0"/>
+      <param name="aggregation_action" value="min"/>
+      <param name="score_comparison" value="&gt;="/>
+      <param name="quality_score" value="999"/>
+      <output name="output_file" file="empty_file.dat" />
+    </test>
+  </tests>
+  <help>
+This tool allows you to trim the ends of reads based upon the aggregate value of quality scores found within a sliding window; a sliding window of size 1 is equivalent to 'simple' trimming of the ends. 
+
+The user specifies the aggregating action (min, max, sum, mean) to perform on the quality score values within the sliding window, together with the comparison operator and the comparison value.
+
+The user can provide a maximum count of bases that can be excluded from the aggregation within the window. When set, this tool will first check the aggregation of the entire window, then after removing 1 value, then after removing 2 values, up to the number declared. Setting this value to be equal to or greater than the window size will cause no trimming to occur.
+
+-----
+
+.. class:: warningmark
+
+Trimming a color space read will cause any adapter base to be lost.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/tabular_to_fastq.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/tabular_to_fastq.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,29 @@
+#Dan Blankenberg
+import sys
+
+def main():
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    identifier_col = int( sys.argv[3] ) - 1
+    sequence_col = int( sys.argv[4] ) - 1
+    quality_col = int( sys.argv[5] ) - 1
+    
+    max_col = max( identifier_col, sequence_col, quality_col )
+    num_reads = None
+    fastq_read = None
+    skipped_lines = 0
+    out = open( output_filename, 'wb' )
+    for num_reads, line in enumerate( open( input_filename ) ):
+        fields = line.rstrip( '\n\r' ).split( '\t' )
+        if len( fields ) > max_col:
+            out.write( "@%s\n%s\n+\n%s\n" % ( fields[identifier_col], fields[sequence_col], fields[quality_col] ) )
+        else:
+            skipped_lines += 1
+    
+    out.close()
+    if num_reads is None:
+        print "Input was empty."
+    else:
+        print "%i tabular lines were written as FASTQ reads. Be sure to use the FASTQ Groomer tool on this output before further analysis." % ( num_reads + 1 - skipped_lines )
+    
+if __name__ == "__main__": main()
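The reassembly itself is a single format string; one tabular row becomes one four-line FASTQ record::

    # One tabular row (1-based columns 1, 2, 3) back to a FASTQ record.
    import sys

    line = "FSRRS4401BE7HA\ttcagTTAAGATG\tFFFDDDDDDDA6"
    fields = line.rstrip("\r\n").split("\t")
    sys.stdout.write("@%s\n%s\n+\n%s\n" % (fields[0], fields[1], fields[2]))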
diff -r 000000000000 -r 9071e359b9a3 tools/fastq/tabular_to_fastq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/tabular_to_fastq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,44 @@
+<tool id="tabular_to_fastq" name="Tabular to FASTQ" version="1.0.0">
+  <description>converter</description>
+  <command interpreter="python">tabular_to_fastq.py '$input_file' '$output_file' '$identifier' '$sequence' '$quality'</command>
+  <inputs>
+    <param name="input_file" type="data" format="tabular" label="Tabular file to convert" />
+    <param name="identifier" label="Identifier column" type="data_column" data_ref="input_file" />
+    <param name="sequence" label="Sequence column" type="data_column" data_ref="input_file" />
+    <param name="quality" label="Quality column" type="data_column" data_ref="input_file" />
+  </inputs>
+  <outputs>
+    <data name="output_file" format="fastq" />
+  </outputs>
+  <tests>
+    <!-- basic test -->
+    <test>
+      <param name="input_file" value="fastq_to_tabular_out_1.tabular" ftype="tabular" />
+      <param name="identifier" value="1" />
+      <param name="sequence" value="2" />
+      <param name="quality" value="3" />
+      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
+    </test>
+    <!-- color space test -->
+    <test>
+      <param name="input_file" value="fastq_to_tabular_out_2.tabular" ftype="tabular" />
+      <param name="identifier" value="1" />
+      <param name="sequence" value="2" />
+      <param name="quality" value="3" />
+      <output name="output_file" file="sanger_full_range_as_cssanger.fastqcssanger" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool attempts to convert a tabular file containing sequencing read data to a FASTQ formatted file. The FASTQ Groomer tool should always be used on the output of this tool. 
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fasta_clipping_histogram.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fasta_clipping_histogram.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,104 @@
+<tool id="cshl_fasta_clipping_histogram" name="Length Distribution">
+ <description>chart</description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command>fasta_clipping_histogram.pl $input $outfile</command>
+
+ <inputs>
+ <param format="fasta" name="input" type="data" label="Library to analyze" />
+ </inputs>
+
+ <outputs>
+ <data format="png" name="outfile" metadata_source="input" />
+ </outputs>
+<help>
+
+**What it does**
+
+This tool creates a histogram image of the sequence length distribution in a given FASTA dataset file.
+
+**TIP:** Use this tool after clipping your library (with **FASTX Clipper tool**), to visualize the clipping results.
+
+-----
+
+**Output Examples**
+
+In the following library, most sequences are 24-mers to 27-mers.
+This could indicate an abundance of endo-siRNAs (depending, of course, on what you tried to sequence in the first place).
+
+.. image:: ./static/fastx_icons/fasta_clipping_histogram_1.png
+
+
+In the following library, most sequences are 19-, 22- or 23-mers.
+This could indicate an abundance of miRNAs (depending, of course, on what you tried to sequence in the first place).
+
+.. image:: ./static/fastx_icons/fasta_clipping_histogram_2.png
+
+
+-----
+
+
+**Input Formats**
+
+This tool accepts short-read FASTA files. The reads don't have to be short, but each sequence does have to be on a single line, like so::
+
+   >sequence1
+   AGTAGTAGGTGATGTAGAGAGAGAGAGAGTAG
+   >sequence2
+   GTGTGTGTGGGAAGTTGACACAGTA
+   >sequence3
+   CCTTGAGATTAACGCTAATCAAGTAAAC
+
+
+If the sequences span multiple lines::
+
+   >sequence1
+   CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAG
+   TCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAG
+   aactggtctttacctTTAAGTTG
+
+Use the **FASTA Width Formatter** tool to re-format the FASTA into single-line sequences::
+
+   >sequence1
+   CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAGTCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAGaactggtctttacctTTAAGTTG
+
+
+-----
+
+
+
+**Multiplicity counts (a.k.a. read counts)**
+
+If the sequence identifier (the text after the '>') contains a dash followed by a number, the number is treated as a multiplicity count value (i.e. how many times that individual sequence was repeated in the original FASTA file, before collapsing).
+
+Example 1 - The following FASTA file *does not* have multiplicity counts::
+
+    >seq1
+    GGATCC
+    >seq2
+    GGTCATGGGTTTAAA
+    >seq3
+    GGGATATATCCCCACACACACACAC
+
+Each sequence counts as one, to produce the following chart:
+
+.. image:: ./static/fastx_icons/fasta_clipping_histogram_3.png
+
+
+Example 2 - The following FASTA file *has* multiplicity counts::
+
+    >seq1-2
+    GGATCC
+    >seq2-10
+    GGTCATGGGTTTAAA
+    >seq3-3
+    GGGATATATCCCCACACACACACAC
+
+The first sequence counts as 2, the second as 10, the third as 3, to produce the following chart:
+
+.. image:: ./static/fastx_icons/fasta_clipping_histogram_4.png
+
+Use the **FASTA Collapser** tool to create FASTA files with multiplicity counts.
+
+</help>
+</tool>
+<!-- FASTA-Clipping-Histogram is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
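The multiplicity convention (identifier, dash, count) is straightforward to replicate. A sketch of how such a collapsed identifier could be parsed, assuming the count is whatever digits follow the last dash (the Perl tool's exact parsing may differ)::

    # Parse a collapsed-FASTA identifier of the form 'name-count'.
    # Assumption: the count is the digits after the last dash; default 1.
    def read_count(identifier):
        name, _, suffix = identifier.rpartition("-")
        return int(suffix) if name and suffix.isdigit() else 1

    for ident in ("seq1-2", "seq2-10", "seq3"):
        print("%s counts as %d" % (ident, read_count(ident)))
    # seq1-2 counts as 2, seq2-10 counts as 10, seq3 counts as 1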
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fasta_formatter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fasta_formatter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,80 @@
+<tool id="cshl_fasta_formatter" name="FASTA Width">
+ <description>formatter</description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <!--
+ Note:
+ fasta_formatter also has a tabular output mode (-t),
+ but Galaxy already contains such a tool, so no need
+ to offer the user a duplicated tool.
+
+ So this XML tool only changes the width (line-wrapping) of a
+ FASTA file.
+ -->
+ <command>zcat -f '$input' | fasta_formatter -w $width -o $output</command>
+ <inputs>
+ <param format="fasta" name="input" type="data" label="Library to re-format" />
+
+ <param name="width" type="integer" value="0" label="New width for nucleotides strings" help="Use 0 for single line out." />
+ </inputs>
+
+ <tests>
+ <test>
+ <!-- Re-format a FASTA file into a single line -->
+ <param name="input" value="fasta_formatter1.fasta" /> 
+ <param name="width" value="0" />
+ <output name="output" file="fasta_formatter1.out" />
+ </test>
+ <test>
+ <!-- Re-format a FASTA file into multiple lines wrapping at 60 characters -->
+ <param name="input" value="fasta_formatter1.fasta" />
+ <param name="width" value="60" />
+ <output name="output" file="fasta_formatter2.out" />
+ </test>
+ </tests>
+
+ <outputs>
+ <data format="input" name="output" metadata_source="input" />
+ </outputs>
+
+<help>
+**What it does**
+
+This tool re-formats a FASTA file, changing the width of the nucleotides lines.
+  
+**TIP:** Outputting a single line (with **width = 0**) can be useful for scripting (with **grep**, **awk**, and **perl**). Every odd line is a sequence identifier, and every even line is a nucleotide line.
+
+--------
+
+**Example**
+
+Input FASTA file (each nucleotides line is 50 characters long)::
+
+    >Scaffold3648
+    AGGAATGATGACTACAATGATCAACTTAACCTATCTATTTAATTTAGTTC
+    CCTAATGTCAGGGACCTACCTGTTTTTGTTATGTTTGGGTTTTGTTGTTG
+    TTGTTTTTTTAATCTGAAGGTATTGTGCATTATATGACCTGTAATACACA
+    ATTAAAGTCAATTTTAATGAACATGTAGTAAAAACT
+    >Scaffold9299
+    CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAG
+    TCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAG
+    aactggtctttacctTTAAGTTG
+
+
+Output FASTA file (with width=80)::
+
+    >Scaffold3648
+    AGGAATGATGACTACAATGATCAACTTAACCTATCTATTTAATTTAGTTCCCTAATGTCAGGGACCTACCTGTTTTTGTT
+    ATGTTTGGGTTTTGTTGTTGTTGTTTTTTTAATCTGAAGGTATTGTGCATTATATGACCTGTAATACACAATTAAAGTCA
+    ATTTTAATGAACATGTAGTAAAAACT
+    >Scaffold9299
+    CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAGTCTTCGGTCATAACACAAACCCAGACCTAC
+    GTATATGACAAAGCTAATAGaactggtctttacctTTAAGTTG
+
+Output FASTA file (with width=0 => single line)::
+
+    >Scaffold3648
+    AGGAATGATGACTACAATGATCAACTTAACCTATCTATTTAATTTAGTTCCCTAATGTCAGGGACCTACCTGTTTTTGTTATGTTTGGGTTTTGTTGTTGTTGTTTTTTTAATCTGAAGGTATTGTGCATTATATGACCTGTAATACACAATTAAAGTCAATTTTAATGAACATGTAGTAAAAACT
+    >Scaffold9299
+    CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAGTCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAGaactggtctttacctTTAAGTTG
+</help>
+</tool>
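Width re-formatting is just slicing each sequence into fixed-size chunks, with 0 meaning one unwrapped line. A standalone sketch of the width logic (not the fastx implementation itself)::

    # Re-wrap one nucleotide string to a given width; width 0 = one line.
    def wrap_sequence(seq, width):
        if width <= 0:
            return [seq]
        return [seq[i:i + width] for i in range(0, len(seq), width)]

    seq = "AGGAATGATGACTACAATGATCAACTTAACC"
    print("\n".join(wrap_sequence(seq, 10)))  # lines of at most 10 bases
    print("\n".join(wrap_sequence(seq, 0)))   # the whole sequence, one line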
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fasta_nucleotide_changer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fasta_nucleotide_changer.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,66 @@
+<tool id="cshl_fasta_nucleotides_changer" name="RNA/DNA" >
+ <description>converter</description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command>zcat -f '$input' | fasta_nucleotide_changer $mode -v -o $output</command>
+ <inputs>
+ <param format="fasta" name="input" type="data" label="Library to convert" />
+
+ <param name="mode" type="select" label="Convert">
+ <option value="-d">RNA to DNA (U to T)</option>
+ <option value="-r">DNA to RNA (T to U)</option>
+ </param>
+ </inputs>
+
+    <!-- 
+    Functional tests with param value starting with - fail.
+ <tests>
+ <test>
+ <param name="input" value="fasta_nuc_changer1.fasta" /> 
+ <param name="mode" value="-r" /> 
+ <output name="output" file="fasta_nuc_change1.out" />
+ </test>
+ <test>
+ <param name="input" value="fasta_nuc_changer2.fasta" /> 
+ <param name="mode" value="-d" /> 
+ <output name="output" file="fasta_nuc_change2.out" />
+ </test>
+ </tests>
+     -->
+
+ <outputs>
+ <data format="input" name="output" metadata_source="input" />
+ </outputs>
+
+<help>
+**What it does**
+
+This tool converts RNA FASTA files to DNA (and vice-versa).
+
+In **RNA-to-DNA** mode, U's are changed into T's.
+
+In **DNA-to-RNA** mode, T's are changed into U's.
+
+--------
+
+**Example**
+
+Input RNA FASTA file (from Sanger's miRBase)::
+
+    >cel-let-7 MIMAT0000001 Caenorhabditis elegans let-7
+    UGAGGUAGUAGGUUGUAUAGUU
+    >cel-lin-4 MIMAT0000002 Caenorhabditis elegans lin-4
+    UCCCUGAGACCUCAAGUGUGA
+    >cel-miR-1 MIMAT0000003 Caenorhabditis elegans miR-1
+    UGGAAUGUAAAGAAGUAUGUA
+
+Output DNA FASTA file (with RNA-to-DNA mode)::
+
+    >cel-let-7 MIMAT0000001 Caenorhabditis elegans let-7
+    TGAGGTAGTAGGTTGTATAGTT
+    >cel-lin-4 MIMAT0000002 Caenorhabditis elegans lin-4
+    TCCCTGAGACCTCAAGTGTGA
+    >cel-miR-1 MIMAT0000003 Caenorhabditis elegans miR-1
+    TGGAATGTAAAGAAGTATGTA
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastq_quality_boxplot.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastq_quality_boxplot.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,56 @@
+<tool id="cshl_fastq_quality_boxplot" name="Draw quality score boxplot">
+ <description></description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+
+ <command>fastq_quality_boxplot_graph.sh -t '$input.name' -i $input -o $output</command>
+
+ <inputs>
+ <param format="txt" name="input" type="data" label="Statistics report file"  help="output of 'FASTQ Statistics' tool" />
+ </inputs>
+
+ <outputs>
+ <data format="png" name="output" metadata_source="input" />
+ </outputs>
+<help>
+
+**What it does**
+
+Creates a boxplot graph for the quality scores in the library.
+
+.. class:: infomark
+
+**TIP:** Use the **FASTQ Statistics** tool to generate the report file needed for this tool.
+
+-----
+
+**Output Examples**
+
+* Black horizontal lines are medians
+* Rectangular red boxes show the Inter-quartile Range (IQR) (top value is Q3, bottom value is Q1)
+* Whiskers show outliers, extending at most 1.5*IQR beyond the box
+
+
+An excellent quality library (median quality is 40 for almost all 36 cycles):
+
+.. image:: ./static/fastx_icons/fastq_quality_boxplot_1.png
+
+
+A relatively good quality library (median quality degrades towards later cycles):
+
+.. image:: ./static/fastx_icons/fastq_quality_boxplot_2.png
+
+A low quality library (median drops quickly):
+
+.. image:: ./static/fastx_icons/fastq_quality_boxplot_3.png
+
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+
+
+
+</help>
+</tool>
+<!-- FASTQ-Quality-Boxplot is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastq_quality_converter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastq_quality_converter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,97 @@
+<tool id="cshl_fastq_quality_converter" name="Quality format converter">
+ <description>(ASCII-Numeric)</description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command>zcat -f $input | fastq_quality_converter $QUAL_FORMAT -o $output -Q $offset</command>
+ <inputs>
+ <param format="fastq" name="input" type="data" label="Library to convert" />
+
+ <param name="QUAL_FORMAT" type="select" label="Desired output format">
+ <option value="-a">ASCII (letters) quality scores</option>
+ <option value="-n">Numeric quality scores</option>
+ </param>
+
+        <param name="offset" type="select" label="FASTQ ASCII offset">
+            <option value="33">33</option>
+            <option selected="true" value="64">64</option>
+        </param>
+    </inputs>
+
+ <tests>
+ <test>
+ <!-- ASCII to NUMERIC -->
+ <param name="input" value="fastq_qual_conv1.fastq" />
+ <param name="QUAL_FORMAT" value="Numeric quality scores" />
+ <param name="offset" value="64" />
+ <output name="output" file="fastq_qual_conv1.out" />
+ </test>
+ <test>
+ <!-- ASCII to ASCII (basically, a no-op, but it should still produce a valid output -->
+ <param name="input" value="fastq_qual_conv1.fastq" />
+ <param name="QUAL_FORMAT" value="ASCII (letters) quality scores" />
+ <param name="offset" value="64" />
+ <output name="output" file="fastq_qual_conv1a.out" />
+ </test>
+ <test>
+ <!-- NUMERIC to ASCII -->
+ <param name="input" value="fastq_qual_conv2.fastq" />
+ <param name="QUAL_FORMAT" value="ASCII (letters) quality scores" />
+ <param name="offset" value="64" />
+ <output name="output" file="fastq_qual_conv2.out" />
+ </test>
+ <test>
+ <!-- NUMERIC to NUMERIC (basically, a no-op, but it should still produce a valid output -->
+ <param name="input" value="fastq_qual_conv2.fastq" />
+ <param name="QUAL_FORMAT" value="Numeric quality scores" />
+ <param name="offset" value="64" />
+ <output name="output" file="fastq_qual_conv2n.out" />
+ </test>
+ </tests>
+
+ <outputs>
+ <data format="fastq" name="output" metadata_source="input" />
+ </outputs>
+<help>
+
+**What it does**
+
+Converts a Solexa FASTQ file to/from numeric or ASCII quality format.
+
+.. class:: warningmark 
+
+Re-scaling (e.g. conversion from the Phred scale to the Solexa scale) is **not** performed.
+
+
+-----
+
+FASTQ with Numeric quality scores::
+
+    @CSHL__2_FC042AGWWWXX:8:1:120:202
+    ACGATAGATCGGAAGAGCTAGTATGCCGTTTTCTGC
+    +CSHL__2_FC042AGWWWXX:8:1:120:202
+    40 40 40 40 20 40 40 40 40 6 40 40 28 40 40 25 40 20 40 -1 30 40 14 27 40 8 1 3 7 -1 11 10 -1 21 10 8
+    @CSHL__2_FC042AGWWWXX:8:1:103:1185
+    ATCACGATAGATCGGCAGAGCTCGTTTACCGTCTTC
+    +CSHL__2_FC042AGWWWXX:8:1:103:1185
+    40 40 40 40 40 35 33 31 40 40 40 32 30 22 40 -0 9 22 17 14 8 36 15 34 22 12 23 3 10 -0 8 2 4 25 30 2
+
+
+FASTQ with ASCII quality scores::
+
+    @CSHL__2_FC042AGWWWXX:8:1:120:202
+    ACGATAGATCGGAAGAGCTAGTATGCCGTTTTCTGC
+    +CSHL__2_FC042AGWWWXX:8:1:120:202
+    hhhhThhhhFhh\hhYhTh?^hN[hHACG?KJ?UJH
+    @CSHL__2_FC042AGWWWXX:8:1:103:1185
+    ATCACGATAGATCGGCAGAGCTCGTTTACCGTCTTC
+    +CSHL__2_FC042AGWWWXX:8:1:103:1185
+    hhhhhca_hhh`^Vh@IVQNHdObVLWCJ@HBDY^B
+
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+
+</help>
+</tool>
+<!-- FASTQ-Quality-Converter is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
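The conversion underneath is a fixed offset between score and character code: an ASCII quality character `c` encodes the score `ord(c) - offset`, with offset 64 for Solexa/Illumina-style files and 33 for Sanger-scaled files. A sketch of both directions::

    # ASCII <-> numeric FASTQ quality strings; no re-scaling, offset only.
    def ascii_to_numeric(qual, offset=64):
        return " ".join(str(ord(c) - offset) for c in qual)

    def numeric_to_ascii(qual, offset=64):
        return "".join(chr(int(v) + offset) for v in qual.split())

    print(ascii_to_numeric("hhhT"))         # 40 40 40 20
    print(numeric_to_ascii("40 40 40 20"))  # hhhT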
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastq_quality_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastq_quality_filter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,82 @@
+<tool id="cshl_fastq_quality_filter" name="Filter by quality">
+ <description></description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+
+ <command>zcat -f '$input' | fastq_quality_filter -q $quality -p $percent -v -o $output
+#if $input.ext == "fastqsanger":
+-Q 33
+#end if
+ </command>
+
+ <inputs>
+ <param format="fastqsolexa,fastqsanger" name="input" type="data" label="Library to filter" />
+
+ <param name="quality" size="4" type="integer" value="20">
+ <label>Quality cut-off value</label>
+ </param>
+
+ <param name="percent" size="4" type="integer" value="90">
+ <label>Percent of bases in sequence that must have quality equal to / higher than cut-off value</label>
+ </param>
+ </inputs>
+
+ <tests>
+ <test>
+ <!-- Test1: 100% of bases with quality 33 or higher (pretty steep requirement...) -->
+ <param name="input" value="fastq_qual_filter1.fastq" ftype="fastqsolexa" />
+ <param name="quality" value="33"/>
+ <param name="percent" value="100"/>
+ <output name="output" file="fastq_qual_filter1a.out" />
+ </test>
+ <test>
+ <!-- Test2: 80% of bases with quality 20 or higher -->
+ <param name="input" value="fastq_qual_filter1.fastq" ftype="fastqsolexa"/>
+ <param name="quality" value="20"/>
+ <param name="percent" value="80"/>
+ <output name="output" file="fastq_qual_filter1b.out" />
+ </test>
+ </tests>
+
+ <outputs>
+ <data format="input" name="output" metadata_source="input" />
+ </outputs>
+
+ <help>
+**What it does**
+
+This tool filters reads based on quality scores.
+
+.. class:: infomark
+
+Using **percent = 100** requires all cycles of all reads to be at least the quality cut-off value.
+
+.. class:: infomark
+
+Using **percent = 50** requires the median quality of the cycles (in each read) to be at least the quality cut-off value.
+
+--------
+
+For each read, the percentage of cycles whose quality is equal to or higher than the cut-off value is calculated. If that percentage is lower than the chosen percent threshold, the read is discarded.
+
+
+**Example**::
+
+    @CSHL_4_FC042AGOOII:1:2:214:584
+    GACAATAAAC
+    +CSHL_4_FC042AGOOII:1:2:214:584
+    30 30 30 30 30 30 30 30 20 10
+
+Using **percent = 50** and **cut-off = 30** - This read will not be discarded (at least half the cycles have quality equal to / higher than 30).
+
+Using **percent = 90** and **cut-off = 30** - This read will be discarded (fewer than 90% of the cycles have quality equal to / higher than 30).
+
+Using **percent = 100** and **cut-off = 20** - This read will be discarded (not all cycles have quality equal to / higher than 20).
+
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+ </help>
+</tool>
+<!-- FASTQ-Quality-Filter is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
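Stated as code, a read survives when the share of bases at or above the cut-off reaches the chosen percentage; a sketch of that predicate, checked against the worked example above::

    # Keep a read iff at least `percent`% of its bases score >= cutoff.
    def passes_filter(quals, cutoff, percent):
        good = sum(1 for q in quals if q >= cutoff)
        return 100.0 * good / len(quals) >= percent

    quals = [30, 30, 30, 30, 30, 30, 30, 30, 20, 10]  # from the example
    print(passes_filter(quals, 30, 50))   # True  (80% of bases >= 30)
    print(passes_filter(quals, 30, 90))   # False (only 80% >= 30)
    print(passes_filter(quals, 20, 100))  # False (the final 10 fails)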
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastq_to_fasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastq_to_fasta.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,80 @@
+<tool id="cshl_fastq_to_fasta" name="FASTQ to FASTA">
+ <description>converter</description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command>gunzip -cf $input | fastq_to_fasta $SKIPN $RENAMESEQ -o $output -v 
+#if $input.ext == "fastqsanger":
+-Q 33
+#end if
+ </command>
+
+ <inputs>
+ <param format="fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="FASTQ Library to convert" />
+
+ <param name="SKIPN" type="select" label="Discard sequences with unknown (N) bases ">
+ <option value="">yes</option>
+ <option value="-n">no</option>
+ </param>
+
+ <param name="RENAMESEQ" type="select" label="Rename sequence names in output file (reduces file size)">
+ <option value="-r">yes</option>
+ <option value="">no</option>
+ </param>
+
+ </inputs>
+
+ <tests>
+ <test>
+ <!-- FASTQ-To-FASTA, keep N, don't rename -->
+ <param name="input" value="fastq_to_fasta1.fastq" ftype="fastqsolexa" />
+ <param name="SKIPN" value=""/>
+ <param name="RENAMESEQ" value=""/>
+ <output name="output" file="fastq_to_fasta1a.out" />
+ </test>
+ <test>
+ <!-- FASTQ-To-FASTA, discard N, rename -->
+ <param name="input" value="fastq_to_fasta1.fastq" ftype="fastqsolexa" />
+ <param name="SKIPN" value="no"/>
+ <param name="RENAMESEQ" value="yes"/>
+ <output name="output" file="fastq_to_fasta1b.out" />
+ </test>
+ </tests>
+
+ <outputs>
+ <data format="fasta" name="output" metadata_source="input" />
+ </outputs>
+
+<help>
+
+**What it does**
+
+This tool converts data from Solexa format to FASTA format (scroll down for format description).
+
+--------
+
+**Example**
+
+The following data in Solexa-FASTQ format::
+
+    @CSHL_4_FC042GAMMII_2_1_517_596
+    GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT
+    +CSHL_4_FC042GAMMII_2_1_517_596
+    40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40
+
+Will be converted to FASTA (with 'rename sequence names' = NO)::
+
+    >CSHL_4_FC042GAMMII_2_1_517_596
+    GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT
+
+Will be converted to FASTA (with 'rename sequence names' = YES)::
+
+    >1
+    GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT
+
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+</help>
+</tool>
+<!-- FASTQ-to-FASTA is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastx_artifacts_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastx_artifacts_filter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,90 @@
+<tool id="cshl_fastx_artifacts_filter" name="Remove sequencing artifacts">
+ <description></description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command>zcat -f '$input' | fastx_artifacts_filter -v -o "$output"
+#if $input.ext == "fastqsanger":
+-Q 33
+#end if
+ </command>
+
+ <inputs>
+ <param format="fasta,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to filter" />
+
+ </inputs>
+
+ <tests>
+ <test>
+ <!-- Filter FASTA file -->
+ <param name="input" value="fastx_artifacts1.fasta" /> 
+ <output name="output" file="fastx_artifacts1.out" />
+ </test>
+ <test>
+ <!-- Filter FASTQ file -->
+ <param name="input" value="fastx_artifacts2.fastq" ftype="fastqsanger" />
+ <output name="output" file="fastx_artifacts2.out" />
+ </test>
+ </tests>
+
+ <outputs>
+ <data format="input" name="output" metadata_source="input" />
+ </outputs>
+<help>
+**What it does**
+
+This tool filters out sequencing artifacts (reads in which all but at most 3 bases are identical).
+
+--------
+
+**The following is an example of sequences which will be filtered out**::
+
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAACACAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+    AAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAA
+    AAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAA
+    AAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAA
+    AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAA
+    
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+</help>
+</tool>
+<!-- FASTX-Artifacts-filter is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
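
The artifact test itself can be sketched in Python. This is an approximation of the rule described above (a read is rejected when, apart from at most three bases, every base is the same nucleotide) and is not part of the changeset; the fastx_artifacts_filter binary is the actual implementation::

    from collections import Counter

    def is_artifact(seq, max_other_bases=3):
        # Reject reads where all but at most `max_other_bases` bases
        # are the same nucleotide.
        counts = Counter(seq.upper())
        dominant = counts.most_common(1)[0][1]
        return len(seq) - dominant <= max_other_bases

    assert is_artifact("AAAAAAAAAACAAAAAAAAACA")       # near-homopolymer
    assert not is_artifact("GGTCAATGATGAGTTGGCACTG")   # ordinary read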
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastx_barcode_splitter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastx_barcode_splitter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,69 @@
+<tool id="cshl_fastx_barcode_splitter" name="Barcode Splitter">
+ <description></description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command interpreter="bash">fastx_barcode_splitter_galaxy_wrapper.sh $BARCODE $input "$input.name" "$output.files_path" --mismatches $mismatches --partial $partial $EOL > $output </command>
+
+ <inputs>
+ <param format="txt" name="BARCODE" type="data" label="Barcodes to use" />
+ <param format="fasta,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to split" />
+
+ <param name="EOL" type="select" label="Barcodes found at">
+ <option value="--bol">Start of sequence (5' end)</option>
+ <option value="--eol">End of sequence (3' end)</option>
+ </param>
+
+ <param name="mismatches" type="integer" size="3" value="2" label="Number of allowed mismatches" />
+
+ <param name="partial" type="integer" size="3" value="0" label="Number of allowed barcode nucleotide deletions" />
+
+ </inputs>
+
+ <tests>
+ <test>
+ <!-- Split a FASTQ file -->
+ <param name="BARCODE" value="fastx_barcode_splitter1.txt" />
+ <param name="input" value="fastx_barcode_splitter1.fastq" ftype="fastqsolexa" />
+ <param name="EOL" value="Start of sequence (5' end)" />
+ <param name="mismatches" value="2" />
+ <param name="partial" value="0" />
+ <output name="output" file="fastx_barcode_splitter1.out" />
+ </test>
+ </tests>
+
+ <outputs>
+ <data format="html" name="output" />
+ </outputs>
+<help>
+
+**What it does**
+
+This tool splits a Solexa library (FASTQ file) or a regular FASTA file into several files, using barcodes as the split criteria.
+
+--------
+
+**Barcode file Format**
+
+Barcode files are simple text files.
+Each line should contain an identifier (descriptive name for the barcode), and the barcode itself (A/C/G/T), separated by a TAB character.
+Example::
+
+    #This line is a comment (starts with a 'number' sign)
+    BC1 GATCT
+    BC2 ATCGT
+    BC3 GTGAT
+    BC4 TGTCT
+    
+For each barcode, a new FASTQ file will be created (with the barcode's identifier as part of the file name).
+Sequences matching the barcode will be stored in the appropriate file.
+
+One additional FASTQ file will be created (the 'unmatched' file), where sequences not matching any barcode will be stored.
+
+The output of this tool is an HTML file, displaying the split counts and the file locations.
+
+**Output Example**
+
+.. image:: ./static/fastx_icons/barcode_splitter_output_example.png
+
+</help>
+</tool>
+<!-- FASTX-barcode-splitter is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
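
The barcode-matching logic can be sketched as follows. This is a simplified illustration (5'-end matching only, no partial-overlap handling), not part of the changeset; fastx_barcode_splitter.pl is the actual implementation::

    def load_barcodes(path):
        # Barcode files: "<identifier><TAB><barcode>"; '#' starts a comment.
        barcodes = {}
        for line in open(path):
            line = line.strip()
            if line and not line.startswith("#"):
                ident, barcode = line.split("\t")[:2]
                barcodes[ident] = barcode.upper()
        return barcodes

    def assign(read, barcodes, max_mismatches=2):
        # Compare the read's 5' prefix against each barcode and return
        # the identifier of the best match, or 'unmatched'.
        best, best_mm = "unmatched", max_mismatches + 1
        for ident, barcode in barcodes.items():
            prefix = read[:len(barcode)].upper()
            mm = sum(a != b for a, b in zip(prefix, barcode))
            if mm < best_mm:
                best, best_mm = ident, mm
        return best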
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastx_barcode_splitter_galaxy_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastx_barcode_splitter_galaxy_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,80 @@
+#!/bin/bash
+
+#    FASTX-toolkit - FASTA/FASTQ preprocessing tools.
+#    Copyright (C) 2009  A. Gordon (gordon@cshl.edu)
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as
+#   published by the Free Software Foundation, either version 3 of the
+#   License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#    You should have received a copy of the GNU Affero General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#
+#This is a shell script wrapper for 'fastx_barcode_splitter.pl'
+#
+# 1. Output files are saved at the dataset's files_path directory.
+#    
+# 2. 'fastx_barcode_splitter.pl' outputs a textual table.
+#    This script turns it into pretty HTML with working URL
+#    (so lazy users can just click on the URLs and get their files)
+
+BARCODE_FILE="$1"
+FASTQ_FILE="$2"
+LIBNAME="$3"
+OUTPUT_PATH="$4"
+shift 4
+# The rest of the parameters are passed to the split program
+
+if [ "$OUTPUT_PATH" == "" ]; then
+ echo "Usage: $0 [BARCODE FILE] [FASTQ FILE] [LIBRARY_NAME] [OUTPUT_PATH]" >&2
+ exit 1
+fi
+
+#Sanitize library name, make sure we can create a file with this name
+LIBNAME=${LIBNAME//\.gz/}
+LIBNAME=${LIBNAME//\.txt/}
+LIBNAME=${LIBNAME//[^[:alnum:]]/_}
+
+if [ ! -r "$FASTQ_FILE" ]; then
+ echo "Error: Input file ($FASTQ_FILE) not found!" >&2
+ exit 1
+fi
+if [ ! -r "$BARCODE_FILE" ]; then
+ echo "Error: barcode file ($BARCODE_FILE) not found!" >&2
+ exit 1
+fi
+mkdir -p "$OUTPUT_PATH"
+if [ ! -d "$OUTPUT_PATH" ]; then
+ echo "Error: failed to create output path '$OUTPUT_PATH'" >&2
+ exit 1
+fi
+
+PUBLICURL=""
+BASEPATH="$OUTPUT_PATH/"
+#PREFIX="$BASEPATH"`date "+%Y-%m-%d_%H%M__"`"${LIBNAME}__"
+PREFIX="$BASEPATH""${LIBNAME}__"
+SUFFIX=".txt"
+
+RESULTS=`zcat -f "$FASTQ_FILE" | fastx_barcode_splitter.pl --bcfile "$BARCODE_FILE" --prefix "$PREFIX" --suffix "$SUFFIX" "$@"`
+if [ $? != 0 ]; then
+ echo "error"
+fi
+
+#
+# Convert the textual tab-separated table into simple HTML table,
+# with the local paths replaced by valid URLs
+echo "<html><body><table border=1>"
+echo "$RESULTS" | sed -r "s|$BASEPATH(.*)|<a href=\"\\1\">\\1</a>|" | sed '
+i<tr><td>
+s|\t|</td><td>|g
+a<\/td><\/tr>
+'
+echo "<p>"
+echo "</table></body></html>"
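
The sed pipeline above can be mirrored in Python, which may be easier to follow. This is a sketch under the same assumptions (tab-separated summary lines, output files all under one base directory), not a replacement for the wrapper::

    try:
        from html import escape          # Python 3
    except ImportError:
        from cgi import escape           # Python 2

    def results_to_html(tsv_text, base_path):
        # One <tr> per summary line, one <td> per field; any field that
        # is a path under base_path becomes a relative link.
        rows = []
        for line in tsv_text.splitlines():
            cells = []
            for field in line.split("\t"):
                if field.startswith(base_path):
                    name = escape(field[len(base_path):])
                    cells.append('<td><a href="%s">%s</a></td>' % (name, name))
                else:
                    cells.append("<td>%s</td>" % escape(field))
            rows.append("<tr>%s</tr>" % "".join(cells))
        return "<html><body><table border=1>%s</table></body></html>" % "".join(rows)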
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastx_clipper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastx_clipper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,109 @@
+<tool id="cshl_fastx_clipper" name="Clip" version="1.0.1" >
+  <description>adapter sequences</description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+  <command>
+    zcat -f $input | fastx_clipper -l $minlength -a $clip_source.clip_sequence -d $keepdelta -o $output -v $KEEP_N $DISCARD_OPTIONS
+#if $input.ext == "fastqsanger":
+ -Q 33
+#end if
+  </command>
+  
+  <inputs>
+    <param format="fasta,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to clip" />
+  
+    <param name="minlength" size="4" type="integer" value="15">
+      <label>Minimum sequence length (after clipping, sequences shorter than this length will be discarded)</label>
+    </param>
+
+ <conditional name="clip_source">
+ <param name="clip_source_list" type="select" label="Source">
+ <option value="prebuilt" selected="true">Standard (select from the list below)</option>
+ <option value="user">Enter custom sequence</option>
+ </param>
+
+ <when value="user">
+ <param name="clip_sequence" size="30" label="Enter custom clipping sequence" type="text" value="AATTGGCC" />
+ </when>
+
+ <when value="prebuilt">
+ <param name="clip_sequence" type="select" label="Choose Adapter">
+ <options from_file="fastx_clipper_sequences.txt">
+ <column name="name" index="1"/>
+ <column name="value" index="0"/>
+ </options>
+ </param> 
+ </when>
+ </conditional>
+
+ <param name="keepdelta" size="2" type="integer" value="0">
+ <label>Enter a non-zero value to keep the adapter sequence and the x bases that follow it</label>
+ <help>Use this for hairpin barcoding. Keep at 0 unless you know what you're doing.</help>
+ </param>
+
+ <param name="KEEP_N" type="select" label="Discard sequences with unknown (N) bases">
+ <option value="">Yes</option>
+ <option value="-n">No</option>
+ </param>
+
+ <param name="DISCARD_OPTIONS" type="select" label="Output options">
+ <option value="-c">Output only clipped sequences (i.e. sequences which contained the adapter)</option>
+ <option value="-C">Output only non-clipped sequences (i.e. sequences which did not contain the adapter)</option>
+ <option value="">Output both clipped and non-clipped sequences</option>
+ </param>
+
+  </inputs>
+ <!--
+ #functional test with param value starting with - fails.
+ <tests>
+ <test>
+ <param name="input" value="fastx_clipper1.fastq" ftype="fastqsolexa"/>
+ <param name="maxmismatches" value="2" />
+ <param name="minlength" value="15" />
+ <param name="clip_source_list" value="user" />
+ <param name="clip_sequence" value="CAATTGGTTAATCCCCCTATATA" />
+ <param name="keepdelta" value="0" />
+ <param name="KEEP_N" value="-n" />
+ <param name="DISCARD_OPTIONS" value="-c" />
+ <output name="output" file="fastx_clipper1a.out" />
+ </test>
+ </tests>
+ -->
+  <outputs>
+    <data format="input" name="output" metadata_source="input" />
+  </outputs>
+  
+<help>
+**What it does**
+
+This tool clips adapters from the 3'-end of the sequences in a FASTA/FASTQ file.
+
+--------
+
+
+**Clipping Illustration:**
+
+.. image:: ./static/fastx_icons/fastx_clipper_illustration.png 
+
+**Clipping Example:**
+
+.. image:: ./static/fastx_icons/fastx_clipper_example.png 
+
+
+    
+**In the above example:**
+
+* Sequence no. 1 was discarded since it wasn't clipped (i.e. it didn't contain the adapter sequence; see the **Output** parameter).
+* Sequence no. 5 was discarded because its length (after clipping) was shorter than 15 nt (**Minimum Sequence Length** parameter).
+
+
+
+    
+</help>
+</tool>
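
The clipping rule can be sketched in Python. This is a minimal, exact-match illustration only (the real fastx_clipper also tolerates mismatches and handles N bases) and is not part of the changeset::

    def clip_adapter(seq, adapter, min_length=15, keep_delta=0):
        # Cut the read at the first adapter occurrence; with a non-zero
        # keep_delta, keep the adapter plus that many following bases.
        pos = seq.find(adapter)
        if pos == -1:
            return None    # not clipped; kept or dropped per the Output option
        end = pos if keep_delta == 0 else pos + len(adapter) + keep_delta
        clipped = seq[:end]
        return clipped if len(clipped) >= min_length else None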
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastx_collapser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastx_collapser.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,81 @@
+<tool id="cshl_fastx_collapser" name="Collapse">
+ <description>sequences</description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command>zcat -f '$input' | fastx_collapser -v -o '$output' 
+#if $input.ext == "fastqsanger":
+-Q 33
+#end if
+ </command>
+
+ <inputs>
+ <param format="fasta,fastqsanger,fastqsolexa" name="input" type="data" label="Library to collapse" />
+ </inputs>
+
+    <!-- The order of sequences in the test output differ between 32 bit and 64 bit machines. 
+ <tests>
+ <test>
+ <param name="input" value="fasta_collapser1.fasta" />
+ <output name="output" file="fasta_collapser1.out" />
+ </test>
+ </tests>
+    -->
+ <outputs>
+ <data format="fasta" name="output" metadata_source="input" />
+ </outputs>
+  <help>
+
+**What it does**
+
+This tool collapses identical sequences in a FASTA file into a single sequence.
+
+--------
+
+**Example**
+
+Example Input File (Sequence "ATAT" appears multiple times):: 
+
+    >CSHL_2_FC0042AGLLOO_1_1_605_414
+    TGCG
+    >CSHL_2_FC0042AGLLOO_1_1_537_759
+    ATAT
+    >CSHL_2_FC0042AGLLOO_1_1_774_520
+    TGGC
+    >CSHL_2_FC0042AGLLOO_1_1_742_502
+    ATAT
+    >CSHL_2_FC0042AGLLOO_1_1_781_514
+    TGAG
+    >CSHL_2_FC0042AGLLOO_1_1_757_487
+    TTCA
+    >CSHL_2_FC0042AGLLOO_1_1_903_769
+    ATAT
+    >CSHL_2_FC0042AGLLOO_1_1_724_499
+    ATAT
+
+Example Output file::
+
+    >1-1
+    TGCG
+    >2-4
+    ATAT
+    >3-1
+    TGGC
+    >4-1
+    TGAG
+    >5-1
+    TTCA
+    
+.. class:: infomark
+
+Original Sequence Names / Lane descriptions (e.g. "CSHL_2_FC0042AGLLOO_1_1_742_502") are discarded. 
+
+The output sequence name is composed of two numbers: the first is the sequence's number, the second is the multiplicity value.
+
+The following output::
+
+    >2-4
+    ATAT
+
+means that the sequence "ATAT" is the second sequence in the file, and it appeared 4 times in the input FASTA file.
+
+</help>
+</tool>
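
The collapsing scheme is easy to sketch. This illustration numbers sequences by first appearance; as the XML comment above notes, the binary's output order may differ between platforms. It is not part of the changeset::

    def collapse(seqs):
        # Count identical sequences and emit ">rank-count" FASTA records.
        counts, order = {}, []
        for seq in seqs:
            if seq not in counts:
                order.append(seq)
            counts[seq] = counts.get(seq, 0) + 1
        for rank, seq in enumerate(order, 1):
            yield ">%d-%d\n%s" % (rank, counts[seq], seq)

    records = list(collapse(["TGCG", "ATAT", "TGGC", "ATAT"]))
    assert records[1] == ">2-2\nATAT"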
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastx_nucleotides_distribution.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastx_nucleotides_distribution.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,51 @@
+<tool id="cshl_fastx_nucleotides_distribution" name="Draw nucleotides distribution chart">
+ <description></description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command>fastx_nucleotide_distribution_graph.sh -t '$input.name' -i $input -o $output</command>
+
+ <inputs>
+ <param format="txt" name="input" type="data" label="Statistics Text File" help="output of the 'Compute quality statistics' tool" />
+ </inputs>
+
+ <outputs>
+ <data format="png" name="output" metadata_source="input" />
+ </outputs>
+<help>
+
+**What it does**
+
+Creates a stacked-histogram graph for the nucleotide distribution in the Solexa library.
+
+.. class:: infomark
+
+**TIP:** Use the **Compute quality statistics** tool to generate the report file needed for this tool.
+
+-----
+
+**Output Examples**
+
+The following chart clearly shows the barcode used at the 5'-end of the library: **GATCT**
+
+.. image:: ./static/fastx_icons/fastq_nucleotides_distribution_1.png
+In the following chart, one can almost 'read' the most abundant sequence by looking at the dominant values: **TGATA TCGTA TTGAT GACTG AA...**
+
+.. image:: ./static/fastx_icons/fastq_nucleotides_distribution_2.png
+
+The following chart shows a growing number of unknown (N) nucleotides towards later cycles (which might indicate a sequencing problem):
+
+.. image:: ./static/fastx_icons/fastq_nucleotides_distribution_3.png
+
+But most of the time, the chart will look rather random:
+
+.. image:: ./static/fastx_icons/fastq_nucleotides_distribution_4.png
+
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+
+</help>
+</tool>
+<!-- FASTQ-Nucleotides-Distribution is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastx_quality_statistics.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastx_quality_statistics.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,70 @@
+<tool id="cshl_fastx_quality_statistics" name="Compute quality statistics">
+ <description></description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command>zcat -f $input | fastx_quality_stats -o $output -Q 33</command>
+
+ <inputs>
+ <param format="fastqsanger" name="input" type="data" label="Library to analyse" />
+ </inputs>
+
+ <tests>
+ <test>
+ <param name="input" value="fastq_stats1.fastq" ftype="fastqsanger"/>
+ <output name="output" file="fastq_stats1.out" />
+ </test>
+ </tests>
+
+ <outputs>
+ <data format="txt" name="output" metadata_source="input" />
+ </outputs>
+
+<help>
+
+**What it does**
+
+Creates a quality statistics report for the given Solexa/FASTQ library.
+
+.. class:: infomark
+
+**TIP:** This statistics report can be used as input for **Quality Score** and **Nucleotides Distribution** tools.
+
+-----
+
+**The output file will contain the following fields:**
+
+* column = column number (1 to 36 for a 36-cycle Solexa read)
+* count   = number of bases found in this column.
+* min     = Lowest quality score value found in this column.
+* max     = Highest quality score value found in this column.
+* sum     = Sum of quality score values for this column.
+* mean    = Mean quality score value for this column.
+* Q1 = 1st quartile quality score.
+* med = Median quality score.
+* Q3 = 3rd quartile quality score.
+* IQR = Inter-Quartile range (Q3-Q1).
+* lW = 'Left-Whisker' value (for boxplotting).
+* rW = 'Right-Whisker' value (for boxplotting).
+* A_Count = Count of 'A' nucleotides found in this column.
+* C_Count = Count of 'C' nucleotides found in this column.
+* G_Count = Count of 'G' nucleotides found in this column.
+* T_Count = Count of 'T' nucleotides found in this column.
+* N_Count = Count of 'N' nucleotides found in this column.  
+
+
+For example::
+
+     1  6362991 -4 40 250734117 39.41 40 40 40  0 40 40 1396976 1329101  678730 2958184   0
+     2  6362991 -5 40 250531036 39.37 40 40 40  0 40 40 1786786 1055766 1738025 1782414   0
+     3  6362991 -5 40 248722469 39.09 40 40 40  0 40 40 2296384  984875 1443989 1637743   0
+     4  6362991 -4 40 248214827 39.01 40 40 40  0 40 40 2536861 1167423 1248968 1409739   0
+    36  6362991 -5 40 117158566 18.41  7 15 30 23 -5 40 4074444 1402980   63287  822035 245
+    
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+
+</help>
+ </tool>
+<!-- FASTQ-Statistics is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
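
Several of the per-column fields can be sketched directly. A minimal illustration, assuming equal-length reads and quality scores already decoded from the FASTQ encoding; the fastx_quality_stats binary is the actual implementation::

    def column_stats(quals):
        # quals: list of per-read quality-score lists, one score per cycle.
        stats = []
        for column in zip(*quals):
            scores = sorted(column)
            n = len(scores)
            stats.append({
                "count": n,
                "min": scores[0],
                "max": scores[-1],
                "sum": sum(scores),
                "mean": float(sum(scores)) / n,
                "med": scores[n // 2],
            })
        return stats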
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastx_renamer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastx_renamer.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,65 @@
+<tool id="cshl_fastx_renamer" name="Rename sequences" version="0.0.11" >
+ <description></description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command>zcat -f $input | fastx_renamer -n $TYPE -o $output -v 
+#if $input.ext == "fastqsanger":
+-Q 33
+#end if
+ </command>
+
+ <inputs>
+ <param format="fastqsolexa,fasta,fastqsanger" name="input" type="data" label="FASTQ/A Library to rename" />
+
+ <param name="TYPE" type="select" label="Rename sequence identifiers to">
+ <option value="SEQ">Nucleotides sequence</option>
+ <option value="COUNT">Numeric Counter</option>
+ </param>
+ </inputs>
+
+ <outputs>
+ <data format="input" name="output" metadata_source="input" />
+ </outputs>
+
+<help>
+
+**What it does**
+
+This tool renames the sequence identifiers in a FASTQ/A file.
+
+.. class:: infomark
+
+Use this tool at the beginning of your workflow, as a way to keep the original sequence (before trimming, clipping, barcode removal, etc.).
+
+--------
+
+**Example**
+
+The following Solexa-FASTQ file::
+
+    @CSHL_4_FC042GAMMII_2_1_517_596
+    GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT
+    +CSHL_4_FC042GAMMII_2_1_517_596
+    40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40
+  
+Renamed to **nucleotides sequence**::
+
+    @GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT
+    GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT
+    +GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT
+    40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40
+
+Renamed to **numeric counter**::
+
+    @1
+    GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT
+    +1
+    40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40
+
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/   
+</help>
+</tool>
+<!-- FASTQ-to-FASTA is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
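
Both renaming modes are straightforward to sketch. A minimal illustration, assuming records already parsed into (name, sequence, qualities) tuples; the fastx_renamer binary is the actual implementation::

    def rename_records(records, mode="COUNT"):
        # mode="COUNT" numbers the reads; mode="SEQ" uses the read itself
        # as its identifier, preserving the original (pre-trimming) sequence.
        for n, (name, seq, quals) in enumerate(records, 1):
            new_id = str(n) if mode == "COUNT" else seq
            yield "@%s\n%s\n+%s\n%s" % (new_id, seq, new_id, quals)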
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastx_reverse_complement.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastx_reverse_complement.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,63 @@
+<tool id="cshl_fastx_reverse_complement" name="Reverse-Complement">
+ <description></description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command>zcat -f '$input' | fastx_reverse_complement -v -o $output
+#if $input.ext == "fastqsanger":
+-Q 33
+#end if
+ </command>
+ <inputs>
+ <param format="fasta,fastqsolexa,fastqsanger" name="input" type="data" label="Library to reverse-complement" />
+ </inputs>
+
+ <tests>
+ <test>
+ <!-- Reverse-complement a FASTA file -->
+ <param name="input" value="fastx_rev_comp1.fasta" /> 
+ <output name="output" file="fastx_reverse_complement1.out" />
+ </test>
+ <test>
+ <!-- Reverse-complement a FASTQ file -->
+ <param name="input" value="fastx_rev_comp2.fastq" ftype="fastqsolexa"/>
+ <output name="output" file="fastx_reverse_complement2.out" />
+ </test>
+ </tests>
+
+  
+ <outputs>
+ <data format="input" name="output" metadata_source="input" />
+ </outputs>
+
+<help>
+**What it does**
+
+This tool reverse-complements each sequence in a library.
+If the library is a FASTQ, the quality-scores are also reversed.
+  
+--------
+
+**Example**
+
+Input FASTQ file::
+
+    @CSHL_1_FC42AGWWWXX:8:1:3:740
+    TGTCTGTAGCCTCNTCCTTGTAATTCAAAGNNGGTA
+    +CSHL_1_FC42AGWWWXX:8:1:3:740
+    33 33 33 34 33 33 33 33 33 33 33 33 27 5 27 33 33 33 33 33 33 27 21 27 33 32 31 29 26 24 5 5 15 17 27 26
+
+
+Output FASTQ file::
+
+    @CSHL_1_FC42AGWWWXX:8:1:3:740
+    TACCNNCTTTGAATTACAAGGANGAGGCTACAGACA
+    +CSHL_1_FC42AGWWWXX:8:1:3:740
+    26 27 17 15 5 5 24 26 29 31 32 33 27 21 27 33 33 33 33 33 33 27 5 27 33 33 33 33 33 33 33 33 34 33 33 33
+
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+</help>
+</tool>
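
The operation is small enough to sketch directly. A minimal illustration (upper-case A/C/G/T/N only), not part of the changeset; fastx_reverse_complement is the actual implementation::

    def reverse_complement(seq, quals=None):
        # Complement each base and reverse the read; per-base quality
        # scores, if given, are reversed too so they stay aligned.
        complement = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}
        rc = "".join(complement[b] for b in reversed(seq.upper()))
        if quals is None:
            return rc
        return rc, list(reversed(quals))

    assert reverse_complement("TGTC") == "GACA"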
diff -r 000000000000 -r 9071e359b9a3 tools/fastx_toolkit/fastx_trimmer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastx_toolkit/fastx_trimmer.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,81 @@
+<tool id="cshl_fastx_trimmer" name="Trim sequences">
+ <description></description>
+ <requirements><requirement type="package">fastx_toolkit</requirement></requirements>
+ <command>zcat -f '$input' | fastx_trimmer -v -f $first -l $last -o $output
+#if $input.ext == "fastqsanger":
+-Q 33
+#end if
+ </command>
+
+ <inputs>
+ <param format="fasta,fastqsolexa,fastqsanger" name="input" type="data" label="Library to clip" />
+
+ <param name="first" size="4" type="integer" value="1">
+ <label>First base to keep</label>
+ </param>
+
+ <param name="last" size="4" type="integer" value="21">
+ <label>Last base to keep</label>
+ </param>
+ </inputs>
+
+ <tests>
+ <test>
+ <!-- Trim a FASTA file - remove first four bases (e.g. a barcode) -->
+ <param name="input" value="fastx_trimmer1.fasta" />
+ <param name="first" value="5"/>
+ <param name="last" value="36"/>
+ <output name="output" file="fastx_trimmer1.out" />
+ </test>
+ <test>
+ <!-- Trim a FASTQ file - remove last 9 bases (e.g. keep only miRNA length sequences) -->
+ <param name="input" value="fastx_trimmer2.fastq" ftype="fastqsolexa"/>
+ <param name="first" value="1"/>
+ <param name="last" value="27"/>
+ <output name="output" file="fastx_trimmer2.out" />
+ </test>
+ </tests>
+
+ <outputs>
+ <data format="input" name="output" metadata_source="input" />
+ </outputs>
+ <help>
+**What it does**
+
+This tool trims (cuts bases from) sequences in a FASTA/Q file.
+  
+--------
+
+**Example**
+
+Input FASTA file (with 36 bases in each sequence)::
+
+    >1-1
+    TATGGTCAGAAACCATATGCAGAGCCTGTAGGCACC
+    >2-1
+    CAGCGAGGCTTTAATGCCATTTGGCTGTAGGCACCA
+    
+
+Trimming with First=1 and Last=21 gives a FASTA file with 21 bases in each sequence (starting from the first base)::
+
+    >1-1
+    TATGGTCAGAAACCATATGCA
+    >2-1
+    CAGCGAGGCTTTAATGCCATT
+
+Trimming with First=6 and Last=10 will generate a FASTA file with 5 bases (bases 6,7,8,9,10) in each sequence::
+
+    >1-1
+    TCAGA
+    >2-1
+    AGGCT
+    
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+    
+</help>
+</tool>
+<!-- FASTX-Trimmer is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
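
The trimming rule maps directly onto Python slicing. A one-line sketch using the tool's 1-based First/Last parameters (not part of the changeset; fastx_trimmer is the actual implementation)::

    def trim(seq, first, last):
        # Keep bases `first` through `last`, counted from 1.
        return seq[first - 1:last]

    assert trim("TATGGTCAGAAACCATATGCAGAGCCTGTAGGCACC", 6, 10) == "TCAGA"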
diff -r 000000000000 -r 9071e359b9a3 tools/filters/CreateInterval.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/CreateInterval.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+#! /usr/bin/perl -w
+
+# Accepts chrom, start, end, name, and strand
+# If strand is void sets it to plus
+# CreateInterval.pl $chrom $start $end $name $strand $output
+
+my $strand = "+";
+
+die "Not enough arguments\n" unless @ARGV == 6;
+
+open OUT, ">$ARGV[5]" or die "Cannot open $ARGV[5]:$!\n";
+
+$strand = "-" if $ARGV[4] eq "minus";
+$ARGV[3] =~ s/\s+/_/g;
+$ARGV[3] =~ s/\t+/_/g;
+
+print OUT "$ARGV[0]\t$ARGV[1]\t$ARGV[2]\t$ARGV[3]\t0\t$strand\n";
+close OUT;
+
diff -r 000000000000 -r 9071e359b9a3 tools/filters/CreateInterval.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/CreateInterval.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,56 @@
+<tool id="createInterval" name="Create single interval">
+  <description>as a new dataset</description>
+  <command interpreter="perl">CreateInterval.pl $chrom $start $end "$name" $strand $out_file1</command>
+  <inputs>
+    <param name="chrom" size="20" type="text" value="chr7" label="Chromosome"/>
+    <param name="start" size="20" type="integer" value="100" label="Start position"/>
+    <param name="end"   size="20" type="integer" value="1000" label="End position"/>
+    <param name="name" size="20" type="text" value="myInterval" label="Name"/>
+    <param name="strand" type="select" label="Strand" help="If your interval is strandless set strand to plus" >
+      <option value="plus">plus</option>
+      <option value="minus">minus</option>
+    </param>    
+  </inputs>
+  <outputs>
+    <data format="bed" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="chrom" value="chr7"/>
+      <param name="start" value="100"/>
+      <param name="end" value="1000"/>
+      <param name="name" value="myinterval"/>
+      <param name="strand" value="plus"/>
+      <output name="out_file1" file="eq-createinterval.dat"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**TIP**. Once your interval appears in history, you must tell Galaxy which genome it belongs to by clicking pencil icon or the "?" link in the history item.
+
+-----
+
+**What it does**
+
+This tool allows you to create a single genomic interval. The resulting history item will be in the BED format.
+
+-----
+
+**Example**
+
+Typing the following values in the form::
+
+    Chromosome: chrX
+    Start position: 151087187
+    End position: 151370486
+    Name: NM_000808
+    Strand: minus
+
+will create a single interval::
+
+    chrX  151087187  151370486  NM_000808  0  -
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/axt_to_concat_fasta.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/axt_to_concat_fasta.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+"""
+Adapted from bx/scripts/axt_to_concat_fasta.py
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+
+import sys
+import bx.align.axt
+
+def usage(s=None):
+ message = """
+axt_to_concat_fasta species1 species2 < axt_file > fasta_file
+"""
+ if (s == None): sys.exit (message)
+ else:           sys.exit ("%s\n%s" % (s,message))
+
+
+def main():
+
+ # check the command line
+ species1 = sys.argv[1]
+ species2 = sys.argv[2]
+
+ # convert the alignment blocks
+
+ reader = bx.align.axt.Reader(sys.stdin,support_ids=True,\
+                              species1=species1,species2=species2)
+ sp1text = list()
+ sp2text = list()
+ for a in reader:
+ sp1text.append(a.components[0].text)
+ sp2text.append(a.components[1].text)
+ sp1seq = "".join(sp1text)
+ sp2seq = "".join(sp2text)
+ print_component_as_fasta(sp1seq,species1)
+ print_component_as_fasta(sp2seq,species2)
+
+
+
+# $$$ this should be moved to a bx.align.fasta module
+
+def print_component_as_fasta(text,src):
+ header = ">" + src
+ print header
+ print text
+
+
+if __name__ == "__main__": main()
+
diff -r 000000000000 -r 9071e359b9a3 tools/filters/axt_to_concat_fasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/axt_to_concat_fasta.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,63 @@
+<tool id="axt_to_concat_fasta" name="AXT to concatenated FASTA">
+  <description>Converts an AXT formatted file to a concatenated FASTA alignment</description>
+  <command interpreter="python">axt_to_concat_fasta.py $dbkey_1 $dbkey_2 &lt; $axt_input &gt; $out_file1</command>
+  <inputs>
+    <param format="axt" name="axt_input" type="data" label="AXT file"/>
+    <param name="dbkey_1" type="genomebuild" label="Genome"/>
+    <param name="dbkey_2" type="genomebuild" label="Genome"/>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="axt_input" value="1.axt" ftype="axt" />
+      <param name="dbkey_1" value='hg17' />
+      <param name="dbkey_2" value="panTro1" />
+      <output name="out_file1" file="axt_to_concat_fasta.dat" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use "Extract Pairwise MAF blocks" tool in *Fetch Sequences and Alignments* section.
+
+--------
+
+**Syntax**
+
+This tool converts an AXT formatted file to FASTA format and concatenates all alignment blocks for each genome build into a single sequence.
+
+- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. 
+
+- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code.
+
+  - This format contains a one-line header. It starts with a ">" symbol. The first word on this line is the name of the sequence; the rest of the line is a description of the sequence.
+  - The remaining lines contain the sequence itself.
+  - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence.
+  - FASTA files containing multiple sequences follow the same layout, with one sequence listed right after another. This format is accepted by many multiple sequence alignment programs.
+
+-----
+
+**Example**
+
+- AXT format::
+
+    0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
+
+    1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900
+    CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+    CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+
+- Convert the above file to concatenated FASTA format::
+
+    &gt;hg16
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGACACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+    &gt;mm5
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGACACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/axt_to_fasta.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/axt_to_fasta.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+"""
+Adapted from bx/scripts/axt_to_fasta.py
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+
+import sys
+import bx.align.axt
+
+def usage(s=None):
+ message = """
+axt_to_fasta species1 species2 < axt_file > fasta_file
+"""
+ if (s == None): sys.exit (message)
+ else:           sys.exit ("%s\n%s" % (s,message))
+
+
+def main():
+
+ # check the command line
+ species1 = sys.argv[1]
+ species2 = sys.argv[2]
+
+ # convert the alignment blocks
+
+ reader = bx.align.axt.Reader(sys.stdin,support_ids=True,\
+                              species1=species1,species2=species2)
+
+ for a in reader:
+ if ("id" in a.attributes): id = a.attributes["id"]
+ else:                      id = None
+ print_component_as_fasta(a.components[0],id)
+ print_component_as_fasta(a.components[1],id)
+ print
+
+
+# $$$ this should be moved to a bx.align.fasta module
+
+def print_component_as_fasta(c,id=None):
+ header = ">%s_%s_%s" % (c.src,c.start,c.start+c.size)
+ if (id != None): header += " " + id
+ print header
+ print c.text
+
+
+if __name__ == "__main__": main()
+
diff -r 000000000000 -r 9071e359b9a3 tools/filters/axt_to_fasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/axt_to_fasta.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,69 @@
+<tool id="axt_to_fasta" name="AXT to FASTA">
+  <description>Converts an AXT formatted file to FASTA format</description>
+  <command interpreter="python">axt_to_fasta.py $dbkey_1 $dbkey_2 &lt; $axt_input &gt; $out_file1</command>
+  <inputs>
+    <param format="axt" name="axt_input" type="data" label="AXT file"/>
+    <param name="dbkey_1" type="genomebuild" label="Genome"/>
+    <param name="dbkey_2" type="genomebuild" label="Genome"/>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="axt_input" value="1.axt" ftype="axt" />
+      <param name="dbkey_1" value="hg17" />
+      <param name="dbkey_2" value="panTro1" />
+      <output name="out_file1" file="axt_to_fasta.dat" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use "Extract Pairwise MAF blocks" tool in *Fetch Sequences and Alignments* section.
+
+--------
+
+
+**Syntax**
+
+This tool converts an AXT formatted file to the FASTA format.
+
+- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines.
+
+- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code.
+
+  - This format contains a one-line header. It starts with a ">" symbol. The first word on this line is the name of the sequence; the rest of the line is a description of the sequence.
+  - The remaining lines contain the sequence itself.
+  - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence.
+  - FASTA files containing multiple sequences follow the same layout, with one sequence listed right after another. This format is accepted by many multiple sequence alignment programs.
+
+-----
+
+**Example**
+
+- AXT format::
+
+    0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
+
+    1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900
+    CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+    CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+
+- Convert the above file to FASTA format::
+
+    &gt;hg16.chr19(+):3001012-3001075|hg16_0
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
+    &gt;mm5.chr11(-):70568380-70568443|mm5_0
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
+
+    &gt;hg16.chr19(+):3008279-3008357|hg16_1
+    CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+    &gt;mm5.chr11(-):70573976-70574054|mm5_1
+    CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/axt_to_lav.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/axt_to_lav.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+"""
+Application to convert AXT file to LAV file
+-------------------------------------------
+
+:Author: Bob Harris (rsharris@bx.psu.edu)
+:Version: $Revision: $
+
+The application reads an AXT file from standard input and writes a LAV file to
+standard out;  some statistics are written to standard error.
+"""
+
+import sys, copy
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import bx.align.axt
+import bx.align.lav
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def usage(s=None):
+    message = """
+axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file
+  Each spec is of the form seq_file[:species_name]:lengths_file.
+
+  seq_file should be a format string for the file names for the individual
+  sequences, with %s to be replaced by the alignment's src field.  For example,
+  "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib",
+  etc.
+
+  species_name is optional.  If present, it is prepended to the alignment's src
+  field.
+
+  Lengths files provide the length of each chromosome (lav format needs this
+  information but axt file does not contain it).  The format is a series of
+  lines of the form
+    <chromosome name> <length>
+  The chromosome field in each axt block must match some <chromosome name> in
+  the lengths file.
+"""
+    if (s == None): sys.exit (message)
+    else:           sys.exit ("%s\n%s" % (s,message))
+
+
+def main():
+    global debug
+
+    # parse the command line
+
+    primary   = None
+    secondary = None
+    silent    = False
+
+    # pick off options
+
+    args = sys.argv[1:]
+    seq_file2 = open(args.pop(-1),'w')
+    seq_file1 = open(args.pop(-1),'w')
+    lav_out = args.pop(-1)
+    axt_in = args.pop(-1)
+    while (len(args) > 0):
+        arg = args.pop(0)
+        val = None
+        fields = arg.split("=",1)
+        if (len(fields) == 2):
+            arg = fields[0]
+            val = fields[1]
+            if (val == ""):
+                usage("missing a value in %s=" % arg)
+
+        if (arg == "--silent") and (val == None):
+            silent = True
+        elif (primary == None) and (val == None):
+            primary = arg
+        elif (secondary == None) and (val == None):
+            secondary = arg
+        else:
+            usage("unknown argument: %s" % arg)
+
+    if (primary == None):
+        usage("missing primary file name and length")
+
+    if (secondary == None):
+        usage("missing secondary file name and length")
+
+    try:
+        (primaryFile,primary,primaryLengths) = parse_spec(primary)
+    except:
+        usage("bad primary spec (must be seq_file[:species_name]:lengths_file)")
+
+    try:
+        (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary)
+    except:
+        usage("bad secondary spec (must be seq_file[:species_name]:lengths_file)")
+
+    # read the lengths
+
+    speciesToLengths = {}
+    speciesToLengths[primary]   = read_lengths (primaryLengths)
+    speciesToLengths[secondary] = read_lengths (secondaryLengths)
+
+    # read the alignments
+
+    out = bx.align.lav.Writer(open(lav_out,'w'), \
+            attributes = { "name_format_1" : primaryFile,
+                           "name_format_2" : secondaryFile })
+
+    axtsRead = 0
+    axtsWritten = 0
+    for axtBlock in bx.align.axt.Reader(open(axt_in), \
+            species_to_lengths = speciesToLengths,
+            species1           = primary,
+            species2           = secondary,
+            support_ids        = True):
+        axtsRead += 1
+        out.write (axtBlock)
+        primary_c = axtBlock.get_component_by_src_start(primary)
+        secondary_c = axtBlock.get_component_by_src_start(secondary)
+        
+        print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size)
+        print >>seq_file1,primary_c.text
+        print >>seq_file1
+        
+        print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size)
+        print >>seq_file2,secondary_c.text
+        print >>seq_file2
+        axtsWritten += 1
+
+    out.close()
+    seq_file1.close()
+    seq_file2.close()
+
+    if (not silent):
+        sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten))
+
+def parse_spec(spec): # returns (seq_file,species_name,lengths_file)
+    fields = spec.split(":")
+    if   (len(fields) == 2): return (fields[0],"",fields[1])
+    elif (len(fields) == 3): return (fields[0],fields[1],fields[2])
+    else:                    raise ValueError
+
+def read_lengths (fileName):
+
+    chromToLength = {}
+
+    f = file (fileName, "r")
+
+    for lineNumber,line in enumerate(f):
+        line = line.strip()
+        if (line == ""): continue
+        if (line.startswith("#")): continue
+
+        fields = line.split ()
+        if (len(fields) != 2):
+            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
+
+        chrom = fields[0]
+        try:
+            length = int(fields[1])
+        except:
+            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
+
+        if (chrom in chromToLength):
+            raise "%s appears more than once (%s:%d)" \
+                % (chrom,fileName,lineNumber)
+
+        chromToLength[chrom] = length
+
+    f.close ()
+
+    return chromToLength
+
+
+if __name__ == "__main__": main()
+
diff -r 000000000000 -r 9071e359b9a3 tools/filters/axt_to_lav.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/axt_to_lav.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,94 @@
+<tool id="axt_to_lav_1" name="AXT to LAV">
+  <description>Converts an AXT formatted file to LAV format</description>
+  <command interpreter="python">axt_to_lav.py /galaxy/data/$dbkey_1/seq/%s.nib:$dbkey_1:${GALAXY_DATA_INDEX_DIR}/shared/ucsc/chrom/${dbkey_1}.len /galaxy/data/$dbkey_2/seq/%s.nib:$dbkey_2:${GALAXY_DATA_INDEX_DIR}/shared/ucsc/chrom/${dbkey_2}.len $align_input $lav_file $seq_file1 $seq_file2</command>
+  <inputs>
+    <param name="align_input" type="data" format="axt" label="Alignment File" optional="False"/>
+    <param name="dbkey_1" type="genomebuild" label="Genome"/>
+    <param name="dbkey_2" type="genomebuild" label="Genome"/>
+  </inputs>
+  <outputs>
+    <data name="lav_file" format="lav"/>
+    <data name="seq_file1" format="fasta" parent="lav_file"/>
+    <data name="seq_file2" format="fasta" parent="lav_file"/>
+  </outputs>
+  <help>
+
+.. class:: warningmark
+
+**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use "Extract Pairwise MAF blocks" tool in *Fetch Sequences and Alignments* section.
+
+--------
+
+
+**Syntax**
+
+This tool converts an AXT formatted file to the LAV format.
+
+- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines.
+
+- **LAV format** LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
+
+- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code.
+
+  - This format contains a one-line header. It starts with a ">" symbol. The first word on this line is the name of the sequence; the rest of the line is a description of the sequence.
+  - The remaining lines contain the sequence itself.
+  - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence.
+  - FASTA files containing multiple sequences follow the same layout, with one sequence listed right after another. This format is accepted by many multiple sequence alignment programs.
+
+-----
+
+**Example**
+
+- AXT format::
+
+    0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
+
+    1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900
+    CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+    CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+
+- Convert the above file to LAV format::
+
+    #:lav
+    s {
+      &quot;/galaxy/data/hg16/seq/chr19.nib&quot; 1 63811651 0 1
+      &quot;/galaxy/data/mm5/seq/chr11.nib-&quot; 1 121648857 0 1
+    }
+    h {
+      &quot;> hg16.chr19&quot;
+      &quot;> mm5.chr11 (reverse complement)&quot;
+    }
+    a {
+      s 3500
+      b 3001012 70568380
+      e 3001075 70568443
+      l 3001012 70568380 3001075 70568443 81
+    }
+    a {
+      s 3900
+      b 3008279 70573976
+      e 3008357 70574054
+      l 3008279 70573976 3008357 70574054 78
+    }
+    #:eof
+
+- With two files in the FASTA format::
+
+    &gt;hg16.chr19_-_3001011_3001075
+    TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
+    
+    &gt;hg16.chr19_-_3008278_3008357
+    CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA
+    
+ **and**::
+    
+    &gt;mm5.chr11_-_70568379_70568443
+    TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
+    
+    &gt;mm5.chr11_-_70573975_70574054
+    CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA
+  </help>
+  <code file="axt_to_lav_code.py"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/axt_to_lav_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/axt_to_lav_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,8 @@
+
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+    for name,data in out_data.items():
+        if name == "seq_file2":
+            data.dbkey = param_dict['dbkey_2']
+            app.model.context.add( data )
+            app.model.context.flush()
+            break
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/filters/bed2gff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/bed2gff.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,89 @@
+<tool id="bed2gff1" name="BED-to-GFF" version="2.0.0">
+  <description>converter</description>
+  <command interpreter="python">bed_to_gff_converter.py $input $out_file1</command>
+  <inputs>
+    <param format="bed" name="input" type="data" label="Convert this query"/>
+  </inputs>
+  <outputs>
+    <data format="gff" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="9.bed"/>
+      <output name="out_file1" file="bed2gff_out.gff"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool converts data from BED format to GFF format (scroll down for format description).
+
+--------
+
+**Example**
+
+The following data in BED format::
+
+ chr28 346187 388197 BC114771 0 + 346187 388197 0 9 144,81,115,63,155,96,134,105,112, 0,24095,26190,31006,32131,33534,36994,41793,41898,
+
+Will be converted to GFF (**note** that the start coordinate is incremented by 1)::
+
+ ##gff-version 2
+ ##bed_to_gff_converter.py
+
+ chr28 bed2gff mRNA 346188 388197 0 + . mRNA BC114771;
+ chr28 bed2gff exon 346188 346331 0 + . exon BC114771;
+ chr28 bed2gff exon 370283 370363 0 + . exon BC114771;
+ chr28 bed2gff exon 372378 372492 0 + . exon BC114771;
+ chr28 bed2gff exon 377194 377256 0 + . exon BC114771;
+ chr28 bed2gff exon 378319 378473 0 + . exon BC114771;
+ chr28 bed2gff exon 379722 379817 0 + . exon BC114771;
+ chr28 bed2gff exon 383182 383315 0 + . exon BC114771;
+ chr28 bed2gff exon 387981 388085 0 + . exon BC114771;
+ chr28 bed2gff exon 388086 388197 0 + . exon BC114771;
+
+
+------
+
+.. class:: infomark
+
+**About formats**
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
+
+The first three BED fields (required) are::
+
+    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+The additional BED fields (optional) are::
+
+    4. name - The name of the BED line.
+    5. score - A score between 0 and 1000.
+    6. strand - Defines the strand - either '+' or '-'.
+    7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+    8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+    9. reserved - This should always be set to zero.
+   10. blockCount - The number of blocks (exons) in the BED line.
+   11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+   12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+   13. expCount - The number of experiments.
+   14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount.
+   15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount.
+
+**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields::
+
+    1. seqname - Must be a chromosome or scaffold.
+    2. source - The program that generated this feature.
+    3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. end - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. group - All lines with the same group are linked together into a single item.
+
+</help>
+</tool>
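
The coordinate shift noted in the example above follows from the two formats' conventions: BED is 0-based and half-open, GFF is 1-based and inclusive, so only the start coordinate moves. A one-line sketch::

    def bed_to_gff_coords(chrom_start, chrom_end):
        # BED (0-based, half-open) -> GFF (1-based, inclusive).
        return chrom_start + 1, chrom_end

    assert bed_to_gff_coords(346187, 388197) == (346188, 388197)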
diff -r 000000000000 -r 9071e359b9a3 tools/filters/bed_to_bigbed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/bed_to_bigbed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,55 @@
+<tool id="bed_to_bigBed" name="BED-to-bigBed" version="1.0.0">
+  <description>converter</description>
+  <command>bedToBigBed $input1 $chromInfo $out_file1 
+    #if $settings.settingsType == "full":
+      -blockSize=${settings.blockSize} -itemsPerSlot=${settings.itemsPerSlot} ${settings.unc}
+    #end if
+    2&gt;&amp;1 || echo "Error running bedToBigBed." >&amp;2
+  </command>
+  <requirements>
+    <requirement type="package">ucsc_tools</requirement>
+  </requirements>
+  <inputs>
+    <param format="bed" name="input1" type="data" label="Convert">
+      <validator type="unspecified_build" />
+    </param>
+    <conditional name="settings">
+      <param name="settingsType" type="select" label="Converter settings to use" help="Default settings should usually be used.">
+        <option value="preset">Default</option>
+        <option value="full">Full parameter list</option>
+      </param>
+      <when value="preset" />
+      <when value="full">
+        <param name="blockSize" size="4" type="integer" value="256" label="Items to bundle in r-tree" help="Default is 256 (blockSize)" />
+        <param name="itemsPerSlot" size="4" type="integer" value="512" label="Data points bundled at lowest level" help="Default is 512 (itemsPerSlot)" />
+        <param name="unc" type="boolean" truevalue="-unc" falsevalue="" checked="False" label="Do not use compression" help="(unc)"/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="bigbed" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="7.bed" dbkey="hg17" />
+      <param name="settingsType" value="full" />
+      <param name="blockSize" value="256" />
+      <param name="itemsPerSlot" value="512" />
+      <param name="unc" value="False" />
+      <output name="out_file1" file="7.bigbed"/>
+    </test>
+    <test>
+      <param name="input1" value="7.bed" dbkey="hg17" />
+      <param name="settingsType" value="preset" />
+      <output name="out_file1" file="7.bigbed"/>
+    </test>
+  </tests>
+  <help>
+
+This tool converts a **sorted** BED file into a bigBed file.
+
+Currently, the bedFields option for specifying the number of non-standard fields is not supported, because it requires an AutoSQL file, a format that Galaxy does not yet support.
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/bed_to_gff_converter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/bed_to_gff_converter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+# This code exists in 2 places: ~/datatypes/converters and ~/tools/filters
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+    input_name = sys.argv[1]
+    output_name = sys.argv[2]
+    skipped_lines = 0
+    first_skipped_line = 0
+    out = open( output_name, 'w' )
+    out.write( "##gff-version 2\n" )
+    out.write( "##bed_to_gff_converter.py\n\n" )
+    i = 0
+    for i, line in enumerate( file( input_name ) ):
+        complete_bed = False
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ) and not line.startswith( 'track' ) and not line.startswith( 'browser' ):
+            try:
+                elems = line.split( '\t' )
+                if len( elems ) == 12:
+                    complete_bed = True
+                chrom = elems[0]
+                if complete_bed:
+                    feature = "mRNA"
+                else:
+                    try:
+                        feature = elems[3]
+                    except:
+                        feature = 'feature%d' % ( i + 1 )
+                start = int( elems[1] ) + 1
+                end = int( elems[2] )
+                try:
+                    score = elems[4]
+                except:
+                    score = '0'
+                try:
+                    strand = elems[5]
+                except:
+                    strand = '+'
+                try:
+                    group = elems[3]
+                except:
+                    group = 'group%d' % ( i + 1 )
+                if complete_bed:
+                    out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s %s;\n' % ( chrom, feature, start, end, score, strand, feature, group  ) )
+                else:
+                    out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s;\n' % ( chrom, feature, start, end, score, strand, group  ) )
+                if complete_bed:
+                    # We have all the info necessary to annotate exons for genes and mRNAs
+                    block_count = int( elems[9] )
+                    block_sizes = elems[10].split( ',' )
+                    block_starts = elems[11].split( ',' )
+                    for j in range( block_count ):
+                        exon_start = int( start ) + int( block_starts[j] )
+                        exon_end = exon_start + int( block_sizes[j] ) - 1
+                        out.write( '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\texon %s;\n' % ( chrom, exon_start, exon_end, score, strand, group ) )
+            except:
+                skipped_lines += 1
+                if not first_skipped_line:
+                    first_skipped_line = i + 1
+        else:
+            skipped_lines += 1
+            if not first_skipped_line:
+                first_skipped_line = i + 1
+    out.close()
+    info_msg = "%i lines converted to GFF version 2.  " % ( i + 1 - skipped_lines )
+    if skipped_lines > 0:
+        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
+    print info_msg
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/catWrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/catWrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+#By, Guruprasad Ananda.
+
+from galaxy import eggs
+import sys, os
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+    
+def main():
+    outfile = sys.argv[1]
+    infile = sys.argv[2]
+    
+    try:
+        fout = open(sys.argv[1],'w')
+    except:
+        stop_err("Output file cannot be opened for writing.")
+        
+    try:
+        fin = open(sys.argv[2],'r')
+    except:
+        stop_err("Input file cannot be opened for reading.")
+    
+    if len(sys.argv) < 4:
+        os.system("cp %s %s" %(infile,outfile))
+        sys.exit()
+    
+    cmdline = "cat %s " %(infile)
+    for inp in sys.argv[3:]:
+        cmdline = cmdline + inp + " "
+    cmdline = cmdline + ">" + outfile
+    try:
+        os.system(cmdline)
+    except:
+        stop_err("Error encountered with cat.")
+        
+if __name__ == "__main__": main()
\ No newline at end of file
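
Building the command line by string concatenation and ``os.system`` breaks on file names containing spaces or shell metacharacters, and the exit status is never checked. A minimal shell-free alternative sketch (not part of this changeset)::

    import shutil
    import sys

    def cat(outfile, infiles):
        with open(outfile, 'wb') as fout:
            for name in infiles:
                with open(name, 'rb') as fin:
                    shutil.copyfileobj(fin, fout)   # stream each input in order

    if __name__ == "__main__":
        cat(sys.argv[1], sys.argv[2:])
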
diff -r 000000000000 -r 9071e359b9a3 tools/filters/catWrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/catWrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,79 @@
+<tool id="cat1" name="Concatenate datasets">
+    <description>tail-to-head</description>
+    <command interpreter="python">
+        catWrapper.py 
+        $out_file1 
+        $input1
+        #for $q in $queries
+            ${q.input2}
+        #end for
+    </command>
+    <inputs>
+        <param name="input1" type="data" label="Concatenate Dataset"/>
+        <repeat name="queries" title="Dataset">
+            <param name="input2" type="data" label="Select" />
+        </repeat>
+    </inputs>
+    <outputs>
+        <data name="out_file1" format="input" metadata_source="input1"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input1" value="1.bed"/>
+            <param name="input2" value="2.bed"/>
+            <output name="out_file1" file="cat_wrapper_out1.bed"/>
+        </test>
+        <!--TODO: if possible, enhance the underlying test code to handle this test
+            the problem is multiple params with the same name "input2"
+        <test>
+            <param name="input1" value="1.bed"/>
+            <param name="input2" value="2.bed"/>
+            <param name="input2" value="3.bed"/>
+            <output name="out_file1" file="cat_wrapper_out2.bed"/>
+        </test>
+        -->
+    </tests>
+    <help>
+
+.. class:: warningmark
+
+**WARNING:** Be careful not to concatenate datasets of different kinds (e.g., sequences with intervals). This tool does not check if the datasets being concatenated are in the same format. 
+
+-----
+
+**What it does**
+
+Concatenates datasets
+
+-----
+
+**Example**
+
+Concatenating Dataset::
+
+    chrX  151087187  151087355  A  0  -
+    chrX  151572400  151572481  B  0  +
+
+with Dataset1::
+
+    chr1  151242630  151242955  X  0  +
+    chr1  151271715  151271999  Y  0  +
+    chr1  151278832  151279227  Z  0  -
+    
+and with Dataset2::
+
+    chr2  100000030  200000955  P  0  +
+    chr2  100000015  200000999  Q  0  +
+
+will result in the following::
+
+    chrX  151087187  151087355  A  0  -
+    chrX  151572400  151572481  B  0  +
+    chr1  151242630  151242955  X  0  +
+    chr1  151271715  151271999  Y  0  +
+    chr1  151278832  151279227  Z  0  -
+    chr2  100000030  200000955  P  0  +
+    chr2  100000015  200000999  Q  0  +
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/changeCase.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/changeCase.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,58 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+
+my $columns = {};
+my $del = "";
+my @in = ();
+my @out = ();
+my $command = "";
+my $field = 0;
+
+# a wrapper for changing the case of columns from within galaxy
+# isaChangeCase.pl [filename] [columns] [delim] [casing] [output]
+
+die "Check arguments: $0 [filename] [columns] [delim] [casing] [output]\n" unless @ARGV == 5;
+
+# process column input
+$ARGV[1] =~ s/\s+//g;
+foreach ( split /,/, $ARGV[1] ) {
+  if (m/^c\d{1,}$/i) {
+    s/c//ig;
+    $columns->{$_} = --$_;
+  }
+}
+
+die "No columns specified, columns are not preceeded with 'c', or commas are not used to separate column numbers: $ARGV[1]\n" if keys %$columns == 0;
+
+my $column_delimiters_href = {
+ 'TAB' => q{\t},
+ 'COMMA' => ",",
+ 'DASH' => "-",
+ 'UNDERSCORE' => "_",
+ 'PIPE' => q{\|},
+ 'DOT' => q{\.},
+ 'SPACE' => q{\s+}
+};
+
+$del = $column_delimiters_href->{$ARGV[2]};
+
+open (OUT, ">$ARGV[4]") or die "Cannot create $ARGV[4]:$!\n";
+open (IN,  "<$ARGV[0]") or die "Cannot open $ARGV[0]:$!\n";
+while (<IN>) {
+  chop;
+  @in = split /$del/; 
+  for ( my $i = 0; $i <= $#in; ++$i) {
+ if (exists $columns->{$i}) {
+ push(@out, $ARGV[3] eq 'up' ? uc($in[$i]) : lc($in[$i]));
+ } else {
+ push(@out, $in[$i]);
+ }
+  }
+  print OUT join("\t",@out), "\n";
+  @out = ();
+}
+close IN;
+
+close OUT;
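
The core of the wrapper (split on the delimiter, re-case only the requested columns, emit tab-separated output) is roughly the following Python sketch; the file names are placeholders::

    # Illustrative sketch, assuming columns "c1,c2" were parsed to {0, 1}.
    cols, casing = {0, 1}, "up"
    with open("in.txt") as fin, open("out.txt", "w") as fout:
        for line in fin:
            fields = line.rstrip("\n").split("\t")
            fields = [(f.upper() if casing == "up" else f.lower())
                      if i in cols else f
                      for i, f in enumerate(fields)]
            fout.write("\t".join(fields) + "\n")
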
diff -r 000000000000 -r 9071e359b9a3 tools/filters/changeCase.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/changeCase.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,74 @@
+<tool id="ChangeCase" name="Change Case">
+  <description> of selected columns</description>
+  <command interpreter="perl">changeCase.pl $input "$cols" $delimiter $casing $out_file1</command>
+  <inputs>
+    <param name="input" format="txt" type="data" label="From"/>
+    <param name="cols" size="10" type="text" value="c1,c2" label="Change case of columns"/>
+    <param name="delimiter" type="select" label="Delimited by">
+      <option value="TAB">Tab</option>
+      <option value="SPACE">Whitespace</option>
+      <option value="DOT">Dot</option>
+      <option value="COMMA">Comma</option>
+      <option value="DASH">Dash</option>
+      <option value="UNDERSCORE">Underscore</option>
+      <option value="PIPE">Pipe</option>
+    </param>
+    <param name="casing" type="select" label="To">
+      <option value="up">Upper case</option>
+      <option value="lo">Lower case</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="1.txt" ftype="txt"/>
+      <param name="cols" value="c1"/>
+      <param name="delimiter" value="SPACE"/>
+      <param name="casing" value="up"/>
+      <output name="out_file1" file="changeCase_out1.tabular"/>
+    </test>
+    <test>
+      <param name="input" value="1.bed" ftype="bed"/>
+      <param name="cols" value="c1"/>
+      <param name="delimiter" value="TAB"/>
+      <param name="casing" value="up"/>
+      <output name="out_file1" file="changeCase_out2.tabular"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**This tool breaks column assignments.** To re-establish column assignments run the tool and click on the pencil icon in the resulting history item.
+
+.. class:: warningmark
+
+The format of the resulting dataset from this tool is always tabular.
+
+-----
+
+**What it does**
+
+This tool selects specified columns from a dataset and converts the values of those columns to upper or lower case.
+
+- Columns are specified as **c1**, **c2**, and so on.
+- Columns can be specified in any order (e.g., **c2,c1,c6**)
+
+-----
+
+**Example**
+
+Changing columns 1 and 3 (delimited by comma) to upper case in::
+
+  apple,is,good
+  windows,is,bad
+
+will result in::
+
+  APPLE is GOOD
+  WINDOWS is BAD
+
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/filters/commWrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/commWrapper.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+use File::Temp "tempfile";
+#use POSIX qw(tmpnam);
+
+my ($input1, $input2, $mode, $out_file1) = @ARGV;
+
+my ($fh, $file1) = tempfile();
+my ($fh1,$file2) = tempfile(); 
+
+`sort $input1 > $file1`;
+`sort $input2 > $file2`;
+`comm $mode $file1 $file2 > $out_file1`;
+`rm $file1 ; rm $file2`;
+
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/filters/commWrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/commWrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,38 @@
+<tool id="Comm1" name="Find Similarities and Differences">
+  <description>between two datasets</description>
+  <command interpreter="perl">commWrapper.pl $input1 $input2 $mode $out_file1</command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Compare Dataset1"/>
+    <param format="tabular" name="input2" type="data" label="with Dataset2"/>
+    <param name="mode" type="select" label="And find">
+      <option value="-23">Lines unique to Dataset1</option>
+      <option value="-12">Lines shared between Dataset1 and Dataset2</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+  </outputs>
+  <help>
+This tool is based on the UNIX shell command comm. It compares two datasets and returns their similarities or differences. For example, if you have two datasets::
+  
+ a  1
+ b  2
+ c  3
+
+and::
+
+ a  1
+ f  6
+ h  8
+
+Using this tool with the **Lines unique to Dataset1** option will return::
+
+ b  2
+ c  3
+
+If you use the **Lines shared between Dataset1 and Dataset2** option, the output will look like this::
+
+ a  1
+
+</help>
+</tool>
\ No newline at end of file
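
The two modes map directly onto set operations: ``comm -23`` keeps lines that appear only in the first sorted file, ``comm -12`` keeps lines common to both. An order-insensitive Python sketch of the same semantics (note that sets also drop duplicate lines, which comm does not)::

    a = {"a 1", "b 2", "c 3"}
    b = {"a 1", "f 6", "h 8"}
    unique_to_a = a - b    # comm -23: lines unique to Dataset1
    shared = a & b         # comm -12: lines shared by both datasets
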
diff -r 000000000000 -r 9071e359b9a3 tools/filters/compare.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/compare.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,79 @@
+<tool id="comp1" name="Compare two Datasets" version="1.0.2">
+  <description>to find common or distinct rows</description>
+  <command interpreter="python">joinWrapper.py $input1 $input2 $field1 $field2 $mode $out_file1</command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Compare"/>
+    <param name="field1" label="Using column" type="data_column" data_ref="input1">
+        <validator type="no_options" message="Invalid column choice. Please try again after editing metadata of your input dataset by clicking on the pencil icon next to it."/>
+    </param>
+    <param format="tabular" name="input2" type="data" label="against" />
+    <param name="field2" label="and column" type="data_column" data_ref="input2">
+            <validator type="no_options" message="Invalid column choice. Please try again after editing metadata of your input dataset by clicking on the pencil icon next to it."/>
+    </param>
+    <param name="mode" type="select" label="To find" help="See examples below for explanation of these options">
+      <option value="N">Matching rows of 1st dataset</option>
+      <option value="V">Non Matching rows of 1st dataset</option>
+    </param>
+  </inputs>
+  <outputs>
+     <data format="input" name="out_file1" metadata_source="input1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="input2" value="2.bed"/>
+      <param name="field1" value="2"/>
+      <param name="field2" value="2"/>
+      <param name="mode" value="N"/>
+      <output name="out_file1" file="fs-compare.dat"/>
+    </test>
+    <!--test case with duplicated key values-->
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="input2" value="3.bed"/>
+      <param name="field1" value="1"/>
+      <param name="field2" value="1"/>
+      <param name="mode" value="V"/>
+      <output name="out_file1" file="fs-compare-2.dat"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+This tool finds lines in one dataset that HAVE or DO NOT HAVE a common field with another dataset.
+
+-----
+
+**Example**
+
+If this is **First dataset**::
+
+  chr1 10 20 geneA 
+  chr1 50 80 geneB
+  chr5 10 40 geneL
+
+and this is **Second dataset**::
+
+  geneA tumor-suppressor
+  geneB Foxp2
+  geneC Gnas1
+  geneE INK4a
+
+Finding lines of the **First dataset** whose 4th column matches the 1st column of the **Second dataset** yields::
+
+  chr1 10 20 geneA 
+  chr1 50 80 geneB
+
+Conversely, using option **Non Matching rows of First dataset** on the same fields will yield::
+
+  chr5 10 40 geneL
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/condense_characters.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/condense_characters.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,105 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+
+# condenses all consecutive characters of one type
+# convert_characters.pl [input] [character] [output]
+
+die "Check arguments" unless @ARGV == 3;
+
+my $inputfile = $ARGV[0];
+my $character = $ARGV[1];
+my $outputfile = $ARGV[2];
+
+
+my $convert_from;
+my $convert_to;
+
+
+if ($character eq "s")
+{
+    $convert_from = '\s';
+}
+elsif ($character eq "T")
+{
+    $convert_from = '\t';
+}
+elsif ($character eq "Sp")
+{
+    $convert_from = " ";
+}
+elsif ($character eq "Dt")
+{
+    $convert_from = '\.';
+}
+elsif ($character eq "C")
+{
+    $convert_from = ",";
+}
+elsif ($character eq "D")
+{
+    $convert_from = "-";
+}
+elsif ($character eq "U")
+{
+    $convert_from = "_";
+}
+elsif ($character eq "P")
+{
+    $convert_from = '\|';
+}
+else
+{
+    die "Invalid value specified for convert from\n";
+}
+
+
+if ($character eq "T")
+{
+    $convert_to = "\t";
+}
+elsif ($character eq "Sp")
+{
+    $convert_to = " ";
+}
+elsif ($character eq "Dt")
+{
+    $convert_to = "\.";
+}
+elsif ($character eq "C")
+{
+    $convert_to = ",";
+}
+elsif ($character eq "D")
+{
+    $convert_to = "-";
+}
+elsif ($character eq "U")
+{
+    $convert_to = "_";
+}
+elsif ($character eq "P")
+{
+    $convert_to = "|";
+}
+else
+{
+    die "Invalid value specified for Convert to\n";
+}
+
+my $fhIn;
+open ($fhIn, "< $inputfile") or die "Cannot open source file";
+
+my $fhOut;
+open ($fhOut, "> $outputfile");
+
+while (<$fhIn>)
+{
+    my $thisLine = $_;
+    chomp $thisLine;
+    $thisLine =~ s/${convert_from}+/$convert_to/g;
+    print $fhOut $thisLine,"\n";    
+}
+close ($fhIn) or die "Cannot close source file";
+close ($fhOut) or die "Cannot close output file";
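
The whole transformation is the single substitution ``s/${convert_from}+/$convert_to/g`` applied per line; in Python terms, condensing consecutive commas looks like this small illustration::

    import re
    print(re.sub(r",+", ",", "geneX,,,10,,,,,20"))   # -> geneX,10,20
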
diff -r 000000000000 -r 9071e359b9a3 tools/filters/condense_characters.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/condense_characters.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,48 @@
+<tool id="Condense characters1" name="Condense">
+  <description>consecutive characters</description>
+  <command interpreter="perl">condense_characters.pl $input $character $out_file1</command>
+  <inputs>
+<!--    <display>condense all consecutive $character from $input</display> -->
+    <param name="character" type="select" label="Condense all consecutive">
+      <option value="T">Tabs</option>
+      <option value="Sp">Spaces</option>
+      <option value="Dt">Dots</option>
+      <option value="C">Commas</option>
+      <option value="D">Dashes</option>
+      <option value="U">Underscores</option>
+      <option value="P">Pipes</option>
+    </param>
+    <param format="txt" name="input" type="data" label="in this Query"/>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="character" value="T"/>
+      <param name="input" value="1.bed"/>
+      <output name="out_file1" file="eq-condense.dat"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool condenses all consecutive characters of a specified type.
+
+-----
+
+**Example**
+
+- Input file::
+
+    geneX,,,10,,,,,20
+    geneY,,5,,,,,12,15,9,
+
+- Condense all consecutive commas. The above file will be converted into::
+
+    geneX,10,20
+    geneY,5,12,15,9
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/convert_characters.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/convert_characters.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,101 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+
+# converts all characters of one type into another 
+# convert_characters.pl [input] [convert_from] [convert_to] [output]
+
+die "Check argument\n" unless @ARGV == 4;
+
+my $inputfile = $ARGV[0];
+my $convert_from = $ARGV[1];
+my $convert_to = $ARGV[2];
+my $outputfile = $ARGV[3];
+
+if ($convert_from eq "s")
+{
+    $convert_from = '\s';
+}
+elsif ($convert_from eq "T")
+{
+    $convert_from = '\t';
+}
+elsif ($convert_from eq "Sp")
+{
+    $convert_from = '\s';
+}
+elsif ($convert_from eq "Dt")
+{
+    $convert_from = '\.';
+}
+elsif ($convert_from eq "C")
+{
+    $convert_from = ",";
+}
+elsif ($convert_from eq "D")
+{
+    $convert_from = "-";
+}
+elsif ($convert_from eq "U")
+{
+    $convert_from = "_";
+}
+elsif ($convert_from eq "P")
+{
+    $convert_from = '\|';
+}
+else
+{
+    die "Invalid value specified for convert from\n";
+}
+
+
+if ($convert_to eq "T")
+{
+    $convert_to = "\t";
+}
+elsif ($convert_to eq "Sp")
+{
+    $convert_to = '\s';
+}
+elsif ($convert_to eq "Dt")
+{
+    $convert_to = "\.";
+}
+elsif ($convert_to eq "C")
+{
+    $convert_to = ",";
+}
+elsif ($convert_to eq "D")
+{
+    $convert_to = "-";
+}
+elsif ($convert_to eq "U")
+{
+    $convert_to = "_";
+}
+elsif ($convert_to eq "P")
+{
+    $convert_to = "|";
+}
+else
+{
+    die "Invalid value specified for convert to\n";
+}
+
+my $fhIn;
+open ($fhIn, "< $inputfile") or die "Cannot open source file";
+
+my $fhOut;
+open ($fhOut, "> $outputfile");
+
+while (<$fhIn>)
+{
+    my $thisLine = $_;
+    chomp $thisLine;
+    $thisLine =~ s/$convert_from{1,}/$convert_to/g;
+    print $fhOut $thisLine,"\n";    
+}
+close ($fhIn) or die "Cannot close source file\n";
+close ($fhOut) or die "Cannot close output file\n";
diff -r 000000000000 -r 9071e359b9a3 tools/filters/convert_characters.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/convert_characters.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+#By, Guruprasad Ananda.
+
+from galaxy import eggs
+import sys, re
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+    
+def main():
+    if len(sys.argv) != 4:
+        stop_err("usage: convert_characters infile from_char outfile")
+
+    try:
+        fin = open(sys.argv[1],'r')
+    except:
+        stop_err("Input file cannot be opened for reading.")
+    
+    from_char = sys.argv[2]
+    
+    try:
+        fout = open(sys.argv[3],'w')
+    except:
+        stop_err("Output file cannot be opened for writing.")
+    
+    char_dict = {'T':'\t','s':'\s','Dt':'\.','C':',','D':'-','U':'_','P':'\|','Co':':'}
+    from_ch = char_dict[from_char] + '+'    #make an RE that matches 1 or more occurrences.
+    skipped = 0
+    
+    for line in fin:
+        line = line.strip()
+        try:
+            fout.write("%s\n" %(re.sub(from_ch,'\t',line)))     
+        except:
+            skipped += 1
+            
+    if skipped:
+        print "Skipped %d lines as invalid." %skipped
+    
+if __name__ == "__main__": 
+    main()
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/filters/convert_characters.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/convert_characters.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,58 @@
+<tool id="Convert characters1" name="Convert">
+  <description>delimiters to TAB</description>
+  <command interpreter="python">convert_characters.py $input $convert_from $out_file1</command>
+  <inputs>
+    <param name="convert_from" type="select" label="Convert all">
+      <option value="s">Whitespaces</option>
+      <option value="T">Tabs</option>
+      <!--<option value="Sp">Spaces</option>-->
+      <option value="Dt">Dots</option>
+      <option value="C">Commas</option>
+      <option value="D">Dashes</option>
+      <option value="U">Underscores</option>
+      <option value="P">Pipes</option>
+      <option value="Co">Colons</option>
+    </param>
+    <param format="txt" name="input" type="data" label="in Query"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="convert_from" value="s"/>
+      <param name="input" value="1.bed"/>
+      <output name="out_file1" file="eq-convert.dat"/>
+    </test>
+    <test>
+      <param name="convert_from" value="s"/>
+      <param name="input" value="a.txt"/>
+      <output name="out_file1" file="a.tab"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+Converts all delimiters of a specified type into TABs. Consecutive delimiters are condensed. For example, if columns are separated by 5 spaces, they will be converted into 1 tab.
+
+-----
+
+**Example**
+
+- Input file::
+
+    chrX||151283558|151283724|NM_000808_exon_8_0_chrX_151283559_r|0|-
+    chrX|151370273|151370486|NM_000808_exon_9_0_chrX_151370274_r|0|-
+    chrX|151559494|151559583|NM_018558_exon_1_0_chrX_151559495_f|0|+
+    chrX|151564643|151564711|NM_018558_exon_2_0_chrX_151564644_f||||0|+
+
+- Converting all pipe delimiters of the above file to TABs will get::
+
+    chrX  151283558  151283724  NM_000808_exon_8_0_chrX_151283559_r  0  -
+    chrX  151370273  151370486  NM_000808_exon_9_0_chrX_151370274_r  0  -
+    chrX  151559494  151559583  NM_018558_exon_1_0_chrX_151559495_f  0  +
+    chrX  151564643  151564711  NM_018558_exon_2_0_chrX_151564644_f  0  +
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/cutWrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/cutWrapper.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,77 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+my @columns = ();
+my $del = "";
+my @in = ();
+my @out = ();
+my $command = "";
+my $field = 0;
+
+# a wrapper for cut for use in galaxy
+# cutWrapper.pl [filename] [columns] [delim] [output]
+
+die "Check arguments\n" unless @ARGV == 4;
+
+$ARGV[1] =~ s/\s+//g;
+foreach ( split /,/, $ARGV[1] ) {
+  if (m/^c\d{1,}$/i) {
+    push (@columns, $_);
+    $columns[@columns-1] =~s/c//ig;
+  }
+}
+
+die "No columns specified, columns are not preceded with 'c', or commas are not used to separate column numbers: $ARGV[1]\n" if @columns == 0;
+
+my $column_delimiters_href = {
+  'T' => q{\t},
+  'C' => ",",
+  'D' => "-",
+  'U' => "_",
+  'P' => q{\|},
+  'Dt' => q{\.},
+  'Sp' => q{\s+}
+};
+
+$del = $column_delimiters_href->{$ARGV[2]};
+
+open (OUT, ">$ARGV[3]") or die "Cannot create $ARGV[2]:$!\n";
+open (IN,  "<$ARGV[0]") or die "Cannot open $ARGV[0]:$!\n";
+
+while (my $line=<IN>) {
+   if ($line =~ /^#/) {
+     #Ignore comment lines
+   } else {
+     chop($line);
+     @in = split(/$del/, $line);
+     foreach $field (@columns) {
+       if (defined($in[$field-1])) {
+         push(@out, $in[$field-1]);
+       } else {
+         push(@out, ".");
+       }
+     }    
+     print OUT join("\t",@out), "\n";
+     @out = ();
+   }
+}
+
+#while (<IN>) {
+#  chop;
+#  @in = split /$del/; 
+#  foreach $field (@columns) {
+#    if (defined($in[$field-1])) {
+#      push(@out, $in[$field-1]);
+#    } else {
+#      push(@out, ".");
+#    }
+#  }
+#  print OUT join("\t",@out), "\n";
+#  @out = ();
+#}
+close IN;
+
+close OUT;
+    
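
The padding behaviour, where requested columns past the end of a row become dots, is the one piece plain ``cut`` cannot do. An illustrative Python rendering of that inner loop::

    def cut_fields(fields, wanted):            # wanted holds 1-based column numbers
        return [fields[i - 1] if i - 1 < len(fields) else "." for i in wanted]

    print(cut_fields(["chr1", "10", "1000", "gene1"], [1, 4, 6]))
    # -> ['chr1', 'gene1', '.']  (missing columns are padded with dots)
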
diff -r 000000000000 -r 9071e359b9a3 tools/filters/cutWrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/cutWrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,202 @@
+<tool id="Cut1" name="Cut" version="1.0.1">
+  <description>columns from a table</description>
+  <command interpreter="perl">cutWrapper.pl $input "$columnList" $delimiter $out_file1</command>
+  <inputs>
+    <param name="columnList" size="10" type="text" value="c1,c2" label="Cut columns"/>
+    <param name="delimiter" type="select" label="Delimited by">
+      <option value="T">Tab</option>
+      <option value="Sp">Whitespace</option>
+      <option value="Dt">Dot</option>
+      <option value="C">Comma</option>
+      <option value="D">Dash</option>
+      <option value="U">Underscore</option>
+      <option value="P">Pipe</option>
+    </param>
+    <param format="txt" name="input" type="data" label="From"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" >
+      <actions>
+        <conditional name="delimiter">
+          <when value="T">
+            <conditional name="input">
+              <when datatype_isinstance="interval">
+                <action type="format" default="tabular">
+                  <option type="from_param" name="columnList" column="0" offset="0"> <!-- chromCol is 1-->
+
+                    <filter type="insert_column" column="0" value="interval"/>
+
+                    <filter type="insert_column" ref="columnList" /> <!-- startCol -->
+
+                    <filter type="insert_column" ref="columnList" /> <!-- endCol -->
+
+                    <filter type="multiple_splitter" column="1" separator=","/>
+                    <filter type="column_strip" column="1"/> <!-- get rid of all external whitespace -->
+                    <filter type="string_function" column="1" name="lower" />
+                    <filter type="param_value" column="1" value="^c\d{1,}$" compare="re_search" keep="True"/>
+                    <filter type="column_strip" column="1" strip="c"/> <!-- get rid of c's -->
+                    <filter type="boolean" column="1" cast="int" />
+
+                    <filter type="multiple_splitter" column="2" separator=","/>
+                    <filter type="column_strip" column="2"/> <!-- get rid of all external whitespace -->
+                    <filter type="string_function" column="2" name="lower" />
+                    <filter type="param_value" column="2" value="^c\d{1,}$" compare="re_search" keep="True"/>
+                    <filter type="column_strip" column="2" strip="c"/> <!-- get rid of c's -->
+                    <filter type="boolean" column="2" cast="int" />
+
+                    <filter type="multiple_splitter" column="3" separator=","/>
+                    <filter type="column_strip" column="3"/> <!-- get rid of all external whitespace -->
+                    <filter type="string_function" column="3" name="lower" />
+                    <filter type="param_value" column="3" value="^c\d{1,}$" compare="re_search" keep="True"/>
+                    <filter type="column_strip" column="3" strip="c"/> <!-- get rid of c's -->
+                    <filter type="boolean" column="3" cast="int" />
+
+                    <filter type="metadata_value" ref="input" name="chromCol" column="1" />
+                    <filter type="metadata_value" ref="input" name="startCol" column="2" />
+                    <filter type="metadata_value" ref="input" name="endCol" column="3" />
+
+                  </option>
+                </action>
+
+                <conditional name="out_file1">
+                  <when datatype_isinstance="interval">
+                    <action type="metadata" name="chromCol">
+                      <option type="from_param" name="columnList" column="0" offset="0"> <!-- chromCol is 0-->
+                        <filter type="multiple_splitter" column="0" separator=","/>
[...]
+" ref="input" name="endCol" column="1" />
+                      </option>
+                    </action>
+
+                    <action type="metadata" name="nameCol" default="0">
+                      <option type="from_param" name="columnList" column="0" offset="0"> <!-- nameCol is 0-->
+                        <filter type="multiple_splitter" column="0" separator=","/>
+                        <filter type="column_strip" column="0"/> <!-- get rid of all external whitespace -->
+                        <filter type="string_function" column="0" name="lower" />
+                        <filter type="param_value" column="0" value="^c\d{1,}$" compare="re_search" keep="True"/>
+                        <filter type="column_strip" column="0" strip="c"/> <!-- get rid of c's -->
+                        <filter type="insert_column" value="1" iterate="True" column="0"/>
+                        <filter type="boolean" column="1" cast="int" />
+                        <filter type="metadata_value" ref="input" name="nameCol" column="1" />
+                      </option>
+                    </action>
+
+                    <action type="metadata" name="strandCol" default="0">
+                      <option type="from_param" name="columnList" column="0" offset="0"> <!-- strandCol is 0-->
+                        <filter type="multiple_splitter" column="0" separator=","/>
+                        <filter type="column_strip" column="0"/> <!-- get rid of all external whitespace -->
+                        <filter type="string_function" column="0" name="lower" />
+                        <filter type="param_value" column="0" value="^c\d{1,}$" compare="re_search" keep="True"/>
+                        <filter type="column_strip" column="0" strip="c"/> <!-- get rid of c's -->
+                        <filter type="insert_column" value="1" iterate="True" column="0"/>
+                        <filter type="boolean" column="1" cast="int" />
+                        <filter type="metadata_value" ref="input" name="strandCol" column="1" />
+                      </option>
+                    </action>
+                  </when>
+                </conditional>
+
+              </when>
+            </conditional>
+          </when>
+        </conditional>
+      </actions>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="columnList" value="c1,c4,c2,c3"/>
+      <param name="delimiter" value="T"/>
+      <param name="input" value="1.bed"/>
+      <output name="out_file1" file="eq-cut.dat"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**WARNING: This tool breaks column assignments.** To re-establish column assignments run the tool and click on the pencil icon in the latest history item.
+
+.. class:: infomark
+
+The output of this tool is always in tabular format (e.g., if your original delimiters are commas, they will be replaced with tabs). For example:
+
+  Cutting columns 1 and 3 from::
+
+     apple,is,good
+     windows,is,bad
+
+  will give::
+
+    apple   good
+    windows bad
+
+-----
+
+**What it does**
+
+This tool selects (cuts out) specified columns from the dataset.
+
+- Columns are specified as **c1**, **c2**, and so on. Column count begins with **1**
+- Columns can be specified in any order (e.g., **c2,c1,c6**)
+- If you specify more columns than are actually present, empty spaces will be filled with dots
+
+-----
+
+**Example**
+
+Input dataset (six columns: c1, c2, c3, c4, c5, and c6)::
+
+   chr1 10   1000  gene1 0 +
+   chr2 100  1500  gene2 0 +
+
+**cut** on columns "**c1,c4,c6**" will return::
+
+   chr1 gene1 +
+   chr2 gene2 +
+
+**cut** on columns "**c6,c5,c4,c1**" will return::
+
+   + 0 gene1 chr1
+   + 0 gene2 chr2
+
+**cut** on columns "**c8,c7,c4**" will return::
+
+   . . gene1
+   . . gene2
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/fileGrep.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/fileGrep.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<tool id="fileGrep1" name="Match">
+  <description>a column from one Query against another Query</description>
+  <command>cut -f $col $input1 | grep -f - $match $input2 > $out_file1</command>
+  <inputs>
+    <param name="col" size="2" type="text" value="1" label="Match content of column"/>
+    <param format="tabular" name="input1" type="data" label="From Query1"/>
+    <param format="tabular" name="input2" type="data" label="Against Query2"/>
+    <param name="match" type="select" label="and return rows that">
+      <option value="">Match</option>
+      <option value="-v">Do not match</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input2" />
+  </outputs>
+  <help>
+This tool is based on the UNIX command grep with the -f option. It matches the content of one query against another. For example, assume you have two queries: one that contains EST accession numbers and some other information::
+
+  AA001229 12 12
+  A001501 7 7
+  AA001641 6 6
+  AA001842 6 6
+  AA002047 6 6
+  AA004638 3 3
+
+and another that is a typical BED file describing genomic location of some ESTs::
+
+  chr7 115443235 115443809 CA947954_exon_0_0_chr7_115443236_f 0 +
+  chr7 115443236 115443347 DB338189_exon_0_0_chr7_115443237_f 0 +
+  chr7 115443347 115443768 DB338189_exon_1_0_chr7_115443348_f 0 +
+  chr7 115443239 115443802 AA001842_exon_0_0_chr7_115443240_f 0 +
+  chr7 115443243 115443347 DB331869_exon_0_0_chr7_115443244_f 0 +
+  chr7 115443347 115443373 DB331869_exon_1_0_chr7_115443348_f 0 +
+
+Using this tool you will be able to tell how many ESTs in Query1 are also present in Query2; the output will be::
+
+  chr7 115443239 115443802 AA001842_exon_0_0_chr7_115443240_f 0
+
+if the **Match** option is chosen.
+
+</help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/filters/fixedValueColumn.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/fixedValueColumn.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,34 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+
+# fixedValueColumn.pl $input $out_file1 "expression" "iterate [yes|no]"
+
+my ($input, $out_file1, $expression, $iterate) = @ARGV;
+my $i = 0;
+my $numeric = 0;
+
+die "Check arguments\n" unless @ARGV == 4;
+
+open (DATA, "<$input") or die "Cannot open $input:$!\n";
+open (OUT,  ">$out_file1") or die "Cannot create $out_file1:$!\n";
+
+if ($expression =~ m/^\d+$/) {
+  $numeric = 1;
+  $i = $expression;
+}
+
+while (<DATA>) {
+  chop;
+  if ($iterate eq "no") {
+    print OUT "$_\t$expression\n";
+  } else {
+    print OUT "$_\t$i\n" if $numeric == 1;
+    print OUT "$_\t$expression-$i\n" if $numeric == 0;
+    ++$i;
+  }
+}
+
+close DATA;
+close OUT;
diff -r 000000000000 -r 9071e359b9a3 tools/filters/fixedValueColumn.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/fixedValueColumn.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,61 @@
+<tool id="addValue" name="Add column">
+  <description>to an existing dataset</description>
+  <command interpreter="perl">fixedValueColumn.pl $input $out_file1 "$exp" $iterate</command>
+  <inputs>
+     <param name="exp" size="20" type="text" value="1" label="Add this value"/>
+    <param format="tabular" name="input" type="data" label="to Dataset" help="Dataset missing? See TIP below" />
+    <param name="iterate" type="select" label="Iterate?">
+      <option value="no">NO</option>
+      <option value="yes">YES</option>
+    </param>    
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="exp" value="1"/>
+      <param name="input" value="1.bed"/>
+      <param name="iterate" value="no"/>
+      <output name="out_file1" file="eq-addvalue.dat"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**What it does**
+
+You can enter any value and it will be added as a new column to your dataset
+
+-----
+
+**Example**
+
+If your original data looks like this::
+
+    chr1 10  100 geneA
+    chr2 200 300 geneB
+    chr2 400 500 geneC
+
+Typing **+** in the text box will generate::
+
+    chr1 10  100 geneA +
+    chr2 200 300 geneB +
+    chr2 400 500 geneC +
+     
+
+You can also add line numbers by selecting **Iterate: YES**. In this case if you enter **1** in the text box you will get::
+
+    chr1 10  100 geneA 1
+    chr2 200 300 geneB 2
+    chr2 400 500 geneC 3
+
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gff/extract_GFF_Features.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gff/extract_GFF_Features.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+Extract features from GFF file.
+
+usage: %prog input1 out_file1 column features
+"""
+
+import sys, os
+
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():   
+    # Parsing Command Line here
+    options, args = doc_optparse.parse( __doc__ )
+    
+    try:
+        inp_file, out_file, column, features = args
+    except:
+        stop_err( "One or more arguments is missing or invalid.\nUsage: prog input output column features" )
+    try:
+        column = int( column )
+    except:
+        stop_err( "Column %s is an invalid column." % column )
+    
+    if features == None:
+        stop_err( "Column %d has no features to display, select another column." %( column + 1 ) )
+
+    fo=open( out_file, 'w' )
+    for i, line in enumerate( file( inp_file ) ):
+        line = line.rstrip( '\r\n' )
+        if line and line.startswith( '#' ):
+            # Keep valid comment lines in the output
+            fo.write( "%s\n" % line )
+        else:
+            try:
+                if line.split( '\t' )[column] in features.split( ',' ):
+                    fo.write( "%s\n" % line )
+            except:
+                pass
+    fo.close()
+            
+    print 'Column %d features: %s' %( column + 1, features )
+
+if __name__ == "__main__":
+    main()       
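
Stripped of argument handling, the filter above is a membership test on one tab-separated column. A self-contained sketch of that core step (the file name and feature set are placeholders)::

    column, features = 2, {"promoter"}
    kept = []
    with open("in.gff") as fin:
        for line in fin:
            fields = line.rstrip("\r\n").split("\t")
            if line.startswith("#"):          # comment lines are kept, as above
                kept.append(line)
            elif len(fields) > column and fields[column] in features:
                kept.append(line)
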
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gff/extract_GFF_Features.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gff/extract_GFF_Features.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,114 @@
+<tool id="Extract_features1" name="Extract features">
+  <description> from GFF data</description>
+  <command interpreter="python">extract_GFF_Features.py $input1 $out_file1 ${column_choice.col} ${column_choice.feature}</command>
+  <inputs>
+    <param format="gff" name="input1" type="data" label="Select GFF data"/>
+    <conditional name="column_choice">
+      <param name="col" type="select" label="From">
+        <option value="0" selected="true">Column 1 / Sequence name</option>
+        <option value="1">Column 2 / Source</option>
+        <option value="2">Column 3 / Feature</option>
+        <option value="6">Column 7 / Strand</option>
+        <option value="7">Column 8 / Frame</option>
+      </param>
+      <when value="0">
+        <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple columns">
+          <options from_dataset="input1">
+            <column name="name" index="0"/>
+            <column name="value" index="0"/>
+            <filter type="unique_value" name="unique" column="0"/>
+          </options>
+        </param> 
+      </when>
+      <when value="1">
+        <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple columns">
+          <options from_dataset="input1">
+            <column name="name" index="1"/>
+            <column name="value" index="1"/>
+            <filter type="unique_value" name="unique" column="1"/>
+          </options>
+        </param>        
+      </when>
+      <when value="2">
+        <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple columns">
+          <options from_dataset="input1">
+            <column name="name" index="2"/>
+            <column name="value" index="2"/>
+            <filter type="unique_value" name="unique" column="2"/>
+          </options>
+        </param> 
+      </when>
+      <when value="6">
+        <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple columns">
+          <options from_dataset="input1">
+            <column name="name" index="6"/>
+            <column name="value" index="6"/>
+            <filter type="unique_value" name="unique" column="6"/>
+          </options>
+        </param>         
+      </when>
+      <when value="7">
+        <param name="feature" type="select" multiple="true" label="Extract features" help="Multi-select list - hold the appropriate key while clicking to select multiple columns">
+          <options from_dataset="input1">
+            <column name="name" index="7"/>
+            <column name="value" index="7"/>
+            <filter type="unique_value" name="unique" column="7"/>
+          </options>
+        </param>            
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="5.gff"/>
+      <param name="col" value="0" />
+      <param name="feature" value="chr5,chr6,chr7,chr8" />
+      <output name="out_file1" file="Extract_features1_out.gff"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool extracts selected features from GFF data.
+
+-----
+
+**Example**
+
+Selecting **promoter** from the following GFF data::
+
+    chr22  GeneA  enhancer  10000000  10001000  500  +  .  TGA
+    chr22  GeneA  promoter  10010000  10010100  900  +  .  TGA
+    chr22  GeneB  promoter  10020000  10025000  400  -  .  TGB
+    chr22  GeneB  CCDS2220  10030000  10065000  800  -  .  TGB
+    
+will produce the following output::
+
+    chr22  GeneA  promoter  10010000  10010100  900  +  .  TGA
+    chr22  GeneB  promoter  10020000  10025000  400  -  .  TGB
+
+----
+
+.. class:: infomark
+
+**About formats**
+
+**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields::
+
+    1. seqname - Must be a chromosome or scaffold.
+    2. source - The program that generated this feature.
+    3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. end - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. group - All lines with the same group are linked together into a single item.
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gff/gff_filter_by_attribute.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gff/gff_filter_by_attribute.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+# This tool takes a gff file as input and creates filters on attributes based on certain properties.
+# The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
+# TODO: much of this code is copied from the Filter1 tool (filtering.py in tools/stats/). The commonalities should be 
+# abstracted and leveraged in each filtering tool.
+
+from __future__ import division
+import sys
+from galaxy import eggs
+from galaxy.util.json import to_json_string, from_json_string
+
+# Older py compatibility
+try:
+    set()
+except:
+    from sets import Set as set
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+#
+# Helper functions.
+#
+
+def get_operands( filter_condition ):
+    # Note that the order of all_operators is important
+    items_to_strip = ['+', '-', '**', '*', '//', '/', '%', '<<', '>>', '&', '|', '^', '~', '<=', '<', '>=', '>', '==', '!=', '<>', ' and ', ' or ', ' not ', ' is ', ' is not ', ' in ', ' not in ']
+    for item in items_to_strip:
+        if filter_condition.find( item ) >= 0:
+            filter_condition = filter_condition.replace( item, ' ' )
+    operands = set( filter_condition.split( ' ' ) )
+    return operands
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def check_for_executable( text, description='' ):
+    # Attempt to determine if the condition includes executable stuff and, if so, exit.
+    secured = dir()
+    operands = get_operands( text )
+    for operand in operands:
+        try:
+            check = int( operand )
+        except:
+            if operand in secured:
+                stop_err( "Illegal value '%s' in %s '%s'" % ( operand, description, text ) )
+                
+#
+# Process inputs.
+#
+
+in_fname = sys.argv[1]
+out_fname = sys.argv[2]
+cond_text = sys.argv[3]
+attribute_types = from_json_string( sys.argv[4] )
+
+# Convert types from str to type objects.
+for name, a_type in attribute_types.items():
+    check_for_executable(a_type)
+    attribute_types[ name ] = eval( a_type )
+    
+# Unescape if input has been escaped
+mapped_str = {
+    '__lt__': '<',
+    '__le__': '<=',
+    '__eq__': '==',
+    '__ne__': '!=',
+    '__gt__': '>',
+    '__ge__': '>=',
+    '__sq__': '\'',
+    '__dq__': '"',
+}
+for key, value in mapped_str.items():
+    cond_text = cond_text.replace( key, value )
+        
+# Attempt to determine if the condition includes executable stuff and, if so, exit.
+check_for_executable( cond_text, 'condition')
+
+# Prepare the column variable names and wrappers for column data types. Only 
+# prepare columns up to largest column in condition.
+attrs, type_casts = [], []
+for name, attr_type in attribute_types.items():
+    attrs.append( name )
+    type_cast = "get_value('%(name)s', attribute_types['%(name)s'], attribute_values)" % ( {'name': name} )
+    type_casts.append( type_cast )
+    
+attr_str = ', '.join( attrs )    # 'c1, c2, c3, c4'
+type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
+wrap = "%s = %s" % ( attr_str, type_cast_str )
+    
+# Stats 
+skipped_lines = 0
+first_invalid_line = 0
+invalid_line = None
+lines_kept = 0
+total_lines = 0
+out = open( out_fname, 'wt' )
+
+# Helper function to safely get and type cast a value in a dict.
+def get_value(name, a_type, values_dict):
+    if name in values_dict:
+        return (a_type)(values_dict[ name ])
+    else:
+        return None
+    
+# Read and filter input file, skipping invalid lines
+code = '''
+for i, line in enumerate( file( in_fname ) ):
+    total_lines += 1
+    line = line.rstrip( '\\r\\n' )
+    if not line or line.startswith( '#' ):
+        skipped_lines += 1
+        if not invalid_line:
+            first_invalid_line = i + 1
+            invalid_line = line
+        continue
+    try:
+        # Place attribute values into variables with attribute
+        # name; type casting is done as well.
+        elems = line.split( '\t' )
+        attribute_values = {}
+        for name_value_pair in elems[8].split(";"):
+            pair = name_value_pair.strip().split(" ")
+            if pair == '':
+                continue
+            name = pair[0].strip()
+            if name == '':
+                continue
+            # Need to strip double quote from value and typecast.
+            attribute_values[name] = pair[1].strip(" \\"")
+        %s
+        if %s:
+            lines_kept += 1
+            print >> out, line
+    except Exception, e:
+        print e
+        skipped_lines += 1
+        if not invalid_line:
+            first_invalid_line = i + 1
+            invalid_line = line
+''' % ( wrap, cond_text )
+
+valid_filter = True
+try:
+    exec code
+except Exception, e:
+    out.close()
+    if str( e ).startswith( 'invalid syntax' ):
+        valid_filter = False
+        stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text )
+    else:
+        stop_err( str( e ) )
+
+if valid_filter:
+    out.close()
+    valid_lines = total_lines - skipped_lines
+    print 'Filtering with %s, ' % ( cond_text )
+    if valid_lines > 0:
+        print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
+    else:
+        print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
+    if skipped_lines > 0:
+        print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
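
The trickiest part of the generated code is turning the ninth GFF column into named variables. In isolation, the attribute parsing it performs looks like this sketch (the example attribute string is illustrative)::

    attr_str = 'gene_id "uc002loc.1"; conf_lo 0.5'
    attribute_values = {}
    for name_value_pair in attr_str.split(";"):
        pair = name_value_pair.strip().split(" ")
        if len(pair) >= 2:
            # strip surrounding double quotes from the value, as the tool does
            attribute_values[pair[0]] = pair[1].strip(' "')
    print(attribute_values)   # {'gene_id': 'uc002loc.1', 'conf_lo': '0.5'}
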
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gff/gff_filter_by_attribute.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gff/gff_filter_by_attribute.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,53 @@
+<tool id="gff_filter_by_attribute" name="Filter GFF data by attribute" version="0.1">
+  <description>using simple expressions</description>
+  <command interpreter="python">
+    gff_filter_by_attribute.py $input $out_file1 "$cond" '${input.metadata.attribute_types}'
+  </command>
+  <inputs>
+    <param format="gff" name="input" type="data" label="Filter" help="Dataset missing? See TIP below."/>
+    <param name="cond" size="40" type="text" value="gene_id=='uc002loc.1'" label="With following condition" help="Double equal signs, ==, must be used as shown above. To filter for an arbitrary string, use the Select tool.">
+      <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+        <param name="input" value="gff_filter_attr_in1.gff"/>
+        <param name="cond" value="conf_lo>0"/>
+        <output name="out_file1" file="gff_filter_by_attribute_out1.gff"/>
+    </test>
+    <test>
+        <param name="input" value="gff_filter_attr_in1.gff"/>
+        <param name="cond" value="conf_lo==0 or conf_hi>125"/>
+        <output name="out_file1" file="gff_filter_by_attribute_out2.gff"/>
+    </test>
+  </tests>
+
+  <help>
+
+.. class:: warningmark
+
+Double equal signs, ==, must be used as *"equal to"* (e.g., **c1 == 'chr22'**)
+
+.. class:: infomark
+
+**TIP:** Attempting to apply a filtering condition may throw exceptions if the data type (e.g., string, integer) in every line of the attribute being filtered is not appropriate for the condition (e.g., attempting certain numerical calculations on strings).  If an exception is thrown when applying the condition to a line, that line is skipped as invalid for the filter condition.  The number of invalid skipped lines is documented in the resulting history item as a "Condition/data issue".
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+The filter tool allows you to restrict the dataset using simple conditional statements.
+
+- Make sure that multi-character operators contain no white space ( e.g., **&lt;=** is valid while **&lt; =** is not valid )
+- When using 'equal-to' operator **double equal sign '==' must be used** ( e.g., **attribute_name=='chr1'** )
+- Non-numerical values must be included in single or double quotes ( e.g., **attribute_name=='XX22'** )
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gff/gff_filter_by_feature_count.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gff/gff_filter_by_feature_count.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+"""
+Filter a gff file using a criterion based on feature counts for a transcript.
+
+Usage:
+%prog input_name output_name feature_name condition
+"""
+import sys
+from galaxy import eggs
+from galaxy.datatypes.util.gff_util import GFFReaderWrapper
+from bx.intervals.io import GenomicInterval
+
+# Valid operators, ordered so that complex operators (e.g. '>=') are
+# recognized before simple operators (e.g. '>')
+ops = [
+    '>=',
+    '<=',
+    '<',
+    '>',
+    '==',
+    '!='
+]
+
+# Escape sequences for valid operators.
+mapped_ops = {
+    '__ge__': ops[0],
+    '__le__': ops[1],
+    '__lt__': ops[2],
+    '__gt__': ops[3],
+    '__eq__': ops[4],
+    '__ne__': ops[5],
+}
+
+
+def __main__():
+    # Get args.
+    input_name = sys.argv[1]
+    output_name = sys.argv[2]
+    feature_name = sys.argv[3]
+    condition = sys.argv[4]
+    
+    # Unescape operations in condition str.
+    for key, value in mapped_ops.items():
+        condition = condition.replace( key, value )
+    
+    # Error checking: condition should be of the form <operator><number>
+    for op in ops:
+        if op in condition:
+            empty, number_str = condition.split( op )
+            try:
+                number = float( number_str )
+            except:
+                number = None
+            if empty != "" or not number:
+                print >> sys.stderr, "Invalid condition: %s, cannot filter." % condition
+                return
+            break
+
+    # Do filtering.
+    kept_features = 0
+    skipped_lines = 0
+    first_skipped_line = 0
+    out = open( output_name, 'w' )
+    for i, feature in enumerate( GFFReaderWrapper( open( input_name ) ) ):
+        if not isinstance( feature, GenomicInterval ):
+            continue
+        count = 0
+        for interval in feature.intervals:
+            if interval.feature == feature_name:
+                count += 1
+        if eval( '%s %s' % ( count, condition ) ):
+            # Keep feature.
+            for interval in feature.intervals:
+                out.write( "\t".join(interval.fields) + '\n' )
+            kept_features += 1
+
+    # Needed because i is 0-based but want to display stats using 1-based.
+    i += 1
+
+    # Clean up.
+    out.close()
+    info_msg = "%i of %i features kept (%.2f%%) using condition %s.  " % \
+        ( kept_features, i, float(kept_features)/i * 100.0, feature_name + condition )
+    if skipped_lines > 0:
+        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
+    print info_msg
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gff/gff_filter_by_feature_count.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gff/gff_filter_by_feature_count.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,53 @@
+<tool id="gff_filter_by_feature_count" name="Filter GFF data by feature count" version="0.1">
+  <description>using simple expressions</description>
+  <command interpreter="python">
+    gff_filter_by_feature_count.py $input_file1 $out_file1 "$feature_name" "$cond"
+  </command>
+  <inputs>
+    <param format="gff" name="input_file1" type="data" label="Filter"/>
+    <param name="feature_name" type="select" label="Using feature name">
+        <options from_dataset="input_file1">
+            <column name="name" index="2"/>
+            <column name="value" index="2"/>
+            <filter type="unique_value" name="unique" column="2"/>
+        </options>
+    </param>
+    <param name="cond" size="40" type="text" value=">0" label="With following condition">
+      <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input_file1"/>
+  </outputs>
+  <tests>
+      <!-- Test GTF filtering. -->
+      <test>
+          <param name="input_file1" value="gops_subtract_in1.gff"/>
+          <param name="feature_name" value="exon"/>
+          <param name="cond" value=">1"/>
+          <output name="out_file1" file="gff_filter_by_feature_count_out1.gff"/>
+      </test>
+      <!-- Test GFF3 filtering. -->
+      <test>
+          <param name="input_file1" value="5.gff3"/>
+          <param name="feature_name" value="HSP"/>
+          <param name="cond" value=">=5"/>
+          <output name="out_file1" file="gff_filter_by_feature_count_out2.gff"/>
+      </test>
+  </tests>
+
+  <help>
+
+
+.. class:: infomark
+
+Valid comparison operators are: &gt;, &lt;, &gt;=, &lt;=, !=, and ==
+
+-----
+
+**Syntax**
+
+The filter tool allows you to restrict the dataset based on transcripts' feature counts.
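+
+For example, filtering on feature **exon** with the condition **&gt;1** keeps only transcripts that have more than one exon.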
+
+</help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gff/gtf_filter_by_attribute_values_list.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gff/gtf_filter_by_attribute_values_list.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,67 @@
+#
+# Filters a GTF file using a list of attribute values. Attribute values must 
+# be in the first column of the values file; subsequent columns are ignored.
+# Usage:
+# python gtf_filter_by_attribute_values_list.py <gtf_file> <attribute_name> <ids_file> <output_file>
+#
+
+import sys
+
+def parse_gff_attributes( attr_str ):
+    """
+    Parses a GFF/GTF attribute string and returns a dictionary of name-value 
+    pairs. The general format for a GFF3 attributes string is 
+        name1=value1;name2=value2
+    The general format for a GTF attribute string is 
+        name1 "value1" ; name2 "value2"
+    The general format for a GFF attribute string is a single string that
+    denotes the interval's group; in this case, the method returns a dictionary 
+    with a single key-value pair whose key is 'group'
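+
+    For example ( illustrative values ), the GTF attribute string
+        gene_id "uc002loc.1"; FPKM "100"
+    yields { 'gene_id': 'uc002loc.1', 'FPKM': '100' }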
+    """    
+    attributes_list = attr_str.split(";")
+    attributes = {}
+    for name_value_pair in attributes_list:
+        # Try splitting by space and, if necessary, by '=' sign.
+        pair = name_value_pair.strip().split(" ")
+        if len( pair ) == 1:
+            pair = name_value_pair.strip().split("=")
+        if len( pair ) < 2:
+            # Could not split into a name-value pair -- skip.
+            continue
+        name = pair[0].strip()
+        if name == '':
+            continue
+        # Need to strip double quote from values
+        value = pair[1].strip(" \"")
+        attributes[ name ] = value
+        
+    if len( attributes ) == 0:
+        # Could not split attributes string, so entire string must be 
+        # 'group' attribute. This is the case for strictly GFF files.
+        attributes['group'] = attr_str
+    return attributes
+
+def filter( gff_file, attribute_name, ids_file, output_file ):
+    # Put ids in dict for quick lookup.
+    ids_dict = {}
+    for line in open( ids_file ):
+        ids_dict[ line.split('\t')[0].strip() ] = True
+
+    # Filter GFF file using ids.
+    output = open( output_file, 'w' )
+    for line in open( gff_file ):
+        fields = line.split( '\t' )
+        attributes = parse_gff_attributes( fields[8] )
+        if ( attribute_name in attributes ) and ( attributes[ attribute_name ] in ids_dict ):
+            output.write( line )
+    output.close()
+        
+if __name__ == "__main__":
+    # Handle args.
+    if len( sys.argv ) != 5:
+        print >> sys.stderr, "usage: python %s <gtf_file> <attribute_name> <ids_file> <output_file>"  % sys.argv[0]
+        sys.exit( -1 )
+    gff_file, attribute_name, ids_file, output_file = sys.argv[1:]
+    filter( gff_file, attribute_name, ids_file, output_file )
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gff/gtf_filter_by_attribute_values_list.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gff/gtf_filter_by_attribute_values_list.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<tool id="gtf_filter_by_attribute_values_list" name="Filter GTF data by attribute values_list" version="0.1">
+    <description></description>
+    <command interpreter="python">
+      gtf_filter_by_attribute_values_list.py $input $attribute_name $ids $output
+    </command>
+    <inputs>
+        <param format="gtf" name="input" type="data" label="Filter"/>
+        <param name="attribute_name" type="select" label="Using attribute name">
+            <option value="gene_id">gene_id</option>
+            <option value="transcript_id">transcript_id</option>
+            <option value="p_id">p_id</option>
+            <option value="tss_id">tss_id</option>
+        </param>
+        <param format="tabular,txt" name="ids" type="data" label="And attribute values"/>
+    </inputs>
+    <outputs>
+        <data format="input" name="output" metadata_source="input"/>
+    </outputs>
+    <tests>
+        <!-- Test filtering with a simple list of values. -->
+        <test>
+            <param name="input" value="gops_subtract_in1.gff"/>
+            <param name="attribute_name" value="gene_id"/>
+            <param name="ids" value="gtf_filter_by_attribute_values_list_in1.txt"/>
+            <output name="output" file="gtf_filter_by_attribute_values_list_out1.gtf"/>
+        </test>
+        <!-- Test filtering with a more complex tabular file. -->
+        <test>
+            <param name="input" value="gtf_filter_by_attribute_values_list_in2.gtf"/>
+            <param name="attribute_name" value="transcript_id"/>
+            <param name="ids" value="gtf_filter_by_attribute_values_list_in3.tabular"/>
+            <output name="output" file="gtf_filter_by_attribute_values_list_out2.gtf"/>
+        </test>
+    </tests>
+    <help>
+
+This tool filters a GTF file using a list of attribute values. The attribute values are
+taken from the first column of the values file; any additional columns in that file are ignored. An example
+use of this tool is to filter a GTF file using a list of transcript_ids or gene_ids obtained from Cuffdiff.
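+
+For example ( illustrative identifiers ), filtering on transcript_id with this values file::
+
+    uc002loc.1
+    uc010wbt.1    sample_annotation
+
+keeps only GTF lines whose transcript_id is uc002loc.1 or uc010wbt.1; the second column is ignored.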
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gff2bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gff2bed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,87 @@
+<tool id="gff2bed1" name="GFF-to-BED" version="1.0.1">
+  <description>converter</description>
+  <command interpreter="python">gff_to_bed_converter.py $input $out_file1</command>
+  <inputs>
+    <param format="gff" name="input" type="data" label="Convert this query"/>
+  </inputs>
+  <outputs>
+    <data format="bed" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="5.gff" ftype="gff"/>
+      <output name="out_file1" file="gff2bed_out.bed"/>
+    </test>
+    <test>
+      <param name="input" value="gff2bed_in2.gff" ftype="gff"/>
+      <output name="out_file1" file="gff2bed_out2.bed"/>
+    </test>
+    <test>
+      <!-- Test conversion of gff3 file. -->
+      <param name="input" value="5.gff3" ftype="gff"/>
+      <output name="out_file1" file="gff2bed_out3.bed"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool converts data from GFF format to BED format (scroll down for format description).
+
+--------
+
+**Example**
+
+The following data in GFF format::
+
+    chr22  GeneA  enhancer  10000000  10001000  500  +   .  TGA
+    chr22  GeneA  promoter  10010000  10010100  900  +   .  TGA
+
+Will be converted to BED (**note** that 1 is subtracted from the start coordinate)::
+
+    chr22   9999999  10001000   enhancer   0   +
+    chr22  10009999  10010100   promoter   0   +
+
+------
+
+.. class:: infomark
+
+**About formats**
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
+
+The first three BED fields (required) are::
+
+    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+The additional BED fields (optional) are::
+
+    4. name - The name of the BED line.
+    5. score - A score between 0 and 1000.
+    6. strand - Defines the strand - either '+' or '-'.
+    7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+    8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+    9. reserved - This should always be set to zero.
+   10. blockCount - The number of blocks (exons) in the BED line.
+   11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+   12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+   13. expCount - The number of experiments.
+   14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount.
+   15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount.
+
+**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields::
+
+    1. seqname - Must be a chromosome or scaffold.
+    2. source - The program that generated this feature.
+    3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. end - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. group - All lines with the same group are linked together into a single item.
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gff_to_bed_converter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gff_to_bed_converter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+import sys
+from galaxy import eggs
+from galaxy.datatypes.util.gff_util import parse_gff_attributes
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def get_bed_line( chrom, name, strand, blocks ):
+    """ Returns a BED line for given data. """
+
+    if len( blocks ) == 1:
+        # Use simple BED format if there is only a single block:
+        #   chrom, chromStart, chromEnd, name, score, strand
+        #
+        start, end = blocks[0]
+        return "%s\t%i\t%i\t%s\t0\t%s\n" % ( chrom, start, end, name, strand )
+
+    #
+    # Build lists for transcript blocks' starts, sizes.
+    #
+    
+    # Get transcript start, end.
+    t_start = sys.maxint
+    t_end = -1
+    for block_start, block_end in blocks:
+        if block_start < t_start:
+            t_start = block_start
+        if block_end > t_end:
+            t_end = block_end
+            
+    # Get block starts, sizes.
+    block_starts = []
+    block_sizes = []
+    for block_start, block_end in blocks:
+        block_starts.append( str( block_start - t_start ) )
+        block_sizes.append( str( block_end - block_start ) )
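+    # E.g. blocks [(100, 200), (300, 400)] give block_starts ['0', '200'] and block_sizes ['100', '100'].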
+    
+    #
+    # Create BED entry.
+    # Bed format: chrom, chromStart, chromEnd, name, score, strand, \
+    #               thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts
+    #
+    # Render complete feature with thick blocks. There's no clear way to do this unless
+    # we analyze the block names, but making everything thick makes more sense than
+    # making everything thin.
+    #
+    return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % \
+            ( chrom, t_start, t_end, name, strand, t_start, t_end, len( block_starts ), 
+                ",".join( block_sizes ), ",".join( block_starts ) )
+
+def __main__():
+    input_name = sys.argv[1]
+    output_name = sys.argv[2]
+    skipped_lines = 0
+    first_skipped_line = 0
+    out = open( output_name, 'w' )
+    i = 0
+    cur_transcript_chrome = None
+    cur_transcript_id = None
+    cur_transcript_strand = None
+    cur_transcripts_blocks = [] # (start, end) for each block.
+    for i, line in enumerate( file( input_name ) ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ):
+            try:
+                # GFF format: chrom, source, name, chromStart, chromEnd, score, strand, frame, attributes
+                elems = line.split( '\t' )
+                start = str( long( elems[3] ) - 1 )
+                coords = [ long( start ), long( elems[4] ) ]
+                strand = elems[6]
+                if strand not in ['+', '-']:
+                    strand = '+'
+                attributes = parse_gff_attributes( elems[8] )
+                t_id = attributes.get( "transcript_id", None )
+                    
+                if not t_id:
+                    #
+                    # No transcript ID, so write last transcript and write current line as its own line.
+                    #
+                    
+                    # Write previous transcript.
+                    if cur_transcript_id:
+                        # Write BED entry.
+                        out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
+                    
+                    # Replace any spaces in the name with underscores so UCSC will not complain.
+                    name = elems[2].replace(" ", "_")
+                    out.write( get_bed_line( elems[0], name, strand, [ coords ] ) )
+                    continue
+                
+                # There is a transcript ID, so process line at transcript level.
+                if t_id == cur_transcript_id:
+                    # Line is element of transcript and will be a block in the BED entry.
+                    cur_transcripts_blocks.append( coords )
+                    continue
+                    
+                #
+                # Line is part of new transcript; write previous transcript and start
+                # new transcript.
+                #
+                
+                # Write previous transcript.
+                if cur_transcript_id:
+                    # Write BED entry.
+                    out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
+
+                # Start new transcript.
+                cur_transcript_chrome = elems[0]
+                cur_transcript_id = t_id
+                cur_transcript_strand = strand
+                cur_transcripts_blocks = []
+                cur_transcripts_blocks.append( coords )    
+            except:
+                skipped_lines += 1
+                if not first_skipped_line:
+                    first_skipped_line = i + 1
+        else:
+            skipped_lines += 1
+            if not first_skipped_line:
+                first_skipped_line = i + 1
+    
+    # Write last transcript.
+    if cur_transcript_id:
+        # Write BED entry.
+        out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
+    out.close()
+    info_msg = "%i lines converted to BED.  " % ( i + 1 - skipped_lines )
+    if skipped_lines > 0:
+        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
+    print info_msg
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/grep.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/grep.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,123 @@
+# Filename: grep.py
+# Author: Ian N. Schenck
+# Version: 8/23/2005
+#
+# This script accepts regular expressions, as well as an "invert"
+# option, and applies the regular expression using grep.  This wrapper
+# provides security and pipelining.
+#
+# Grep is launched based on these inputs:
+# -i Input file
+# -o Output file
+# -pattern RegEx pattern
+# -v         true or false (output NON-matching lines)
+
+import sys
+import os
+import re
+import string
+import commands
+from tempfile import NamedTemporaryFile
+
+# This function is exceedingly useful, perhaps package for reuse?
+def getopts(argv):
+    opts = {}
+    while argv:
+        if argv[0][0] == '-':
+            opts[argv[0]] = argv[1]
+            argv = argv[2:]
+        else:
+            argv = argv[1:]
+    return opts
+
+def main():
+    args = sys.argv[1:]
+
+    try:
+        opts = getopts(args)
+    except IndexError:
+        print "Usage:"
+        print " -i Input file"
+        print " -o Output file"
+        print " -pattern RegEx pattern"
+        print " -v true or false (Invert match)"
+        return 0
+
+    outputfile = opts.get("-o")
+    if outputfile == None:
+        print "No output file specified."
+        return -1
+
+    inputfile = opts.get("-i")
+    if inputfile == None:
+        print "No input file specified."
+        return -2
+
+    invert = opts.get("-v")
+    if invert == None:
+        print "Match style (Invert or normal) not specified."
+        return -3
+
+    pattern = opts.get("-pattern")
+    if pattern == None:
+        print "RegEx pattern not specified."
+        return -4
+
+    # All inputs have been specified at this point, now validate.
+
+    # replace if input has been escaped, remove sq
+    # characters that are allowed but need to be escaped
+    mapped_chars = { '>' :'__gt__', 
+                     '<' :'__lt__', 
+                     '\'' :'__sq__',
+                     '"' :'__dq__',
+                     '[' :'__ob__',
+                     ']' :'__cb__',
+                     '{' :'__oc__',
+                     '}' :'__cc__'
+                     }
+    
+    #with new sanitizing we only need to replace for single quote, but this needs to remain for backwards compatibility
+    for key, value in mapped_chars.items():
+        pattern = pattern.replace(value, key)
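+    # e.g. a sanitized pattern '__ob__0-9__cb__+' is restored here to '[0-9]+' before use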
+    
+    fileRegEx = re.compile("^[A-Za-z0-9./\-_]+$") #why?
+    invertRegEx = re.compile("(true)|(false)") #why?
+
+    if not fileRegEx.match(outputfile):
+        print "Illegal output filename."
+        return -5
+    if not fileRegEx.match(inputfile):
+        print "Illegal input filename."
+        return -6
+    if not invertRegEx.match(invert):
+        print "Illegal invert option."
+        return -7
+
+    # invert grep search?
+    if invert == "true":
+        invertflag = " -v"
+        print "Not matching pattern: %s" % pattern
+    else:
+        invertflag = ""
+        print "Matching pattern: %s" % pattern
+    
+    #Create temp file holding pattern
+    #By using a file to hold the pattern, we don't have worry about sanitizing grep commandline and can include single quotes in pattern
+    pattern_file_name = NamedTemporaryFile().name
+    open( pattern_file_name, 'w' ).write( pattern )
+    
+    #generate grep command
+    commandline = "grep -E %s -f %s %s > %s" % ( invertflag, pattern_file_name, inputfile, outputfile )
+    
+    #run grep
+    errorcode, stdout = commands.getstatusoutput(commandline)
+    
+    #remove temp pattern file
+    os.unlink( pattern_file_name )
+    
+    #return error code
+    return errorcode
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/grep.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/grep.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,82 @@
+<tool id="Grep1" name="Select" version="1.0.1">
+  <description>lines that match an expression</description>
+  <command interpreter="python">grep.py -i $input -o $out_file1 -pattern '$pattern' -v $invert</command>
+  <inputs>
+    <param format="txt" name="input" type="data" label="Select lines from"/>
+    <param name="invert" type="select" label="that">
+      <option value="false">Matching</option>
+      <option value="true">NOT Matching</option>
+    </param>
+    <param name="pattern" size="40" type="text" value="^chr([0-9A-Za-z])+" label="the pattern" help="here you can enter text or regular expression (for syntax check lower part of this frame)">
+      <sanitizer>
+        <valid initial="string.printable">
+         <remove value="&apos;"/>
+        </valid>
+        <mapping initial="none">
+          <add source="&apos;" target="__sq__"/>
+        </mapping>
+      </sanitizer>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="1.bed"/>
+      <param name="invert" value="false"/>
+      <param name="pattern" value="^chr[0-9]*"/>
+      <output name="out_file1" file="fs-grep.dat"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+The select tool searches the data for lines containing or not containing a match to the given pattern. This tool supports regular expressions. A regular expression is a pattern that describes a set of matching strings.
+
+- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for.
+- **\\A** matches the beginning of a string (but not an internal line).
+- **\\d** matches a digit, same as [0-9].
+- **\\D** matches a non-digit.
+- **\\s** matches a whitespace character.
+- **\\S** matches anything BUT a whitespace.
+- **\\t** matches a tab.
+- **\\w** matches an alphanumeric character.
+- **\\W** matches anything but an alphanumeric character.
+- **(** .. **)** groups a particular pattern.
+- **\\Z** matches the end of a string (but not an internal line).
+- **{n}**, **{n,}**, and **{n,m}** specify an expected number of repetitions of the preceding pattern.
+
+  - **{n}** The preceding item is matched exactly n times.
+  - **{n,}** The preceding item is matched n or more times. 
+  - **{n,m}** The preceding item is matched at least n times but not more than m times. 
+
+- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**.
+- **.** Matches any single character except a newline.
+- ***** The preceding item will be matched zero or more times.
+- **?** The preceding item is optional and matched at most once.
+- **+** The preceding item will be matched one or more times.
+- **^** has two meanings:
+
+  - matches the beginning of a line or string. 
+  - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets.
+- **$** matches the end of a line or string.
+- **\|** Separates alternate possibilities. 
+
+-----
+
+**Example**
+
+- **^chr([0-9A-Za-z])+** would match lines that begin with chromosomes, such as lines in a BED format file.
+- **(ACGT){1,5}** would match at least 1 "ACGT" and at most 5 "ACGT" consecutively.
+- **([^,][0-9]{1,3})(,[0-9]{3})\*** would match a large integer that is properly separated with commas such as 23,078,651.
+- **(abc)|(def)** would match either "abc" or "def".
+- **^\\s*#** would match any line that is a comment ( optional leading whitespace followed by # ).
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gtf2bedgraph.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gtf2bedgraph.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,79 @@
+<tool id="gtf2bedgraph" name="GTF-to-BEDGraph">
+  <description>converter</description>
+  <command interpreter="python">gtf_to_bedgraph_converter.py $input $out_file1 $attribute_name</command>
+  <inputs>
+    <param format="gtf" name="input" type="data" label="Convert this query"/>
+    <param name="attribute_name" type="text" label="Attribute to Use for Value"/>
+  </inputs>
+  <outputs>
+    <data format="bedgraph" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="gtf2bedgraph_in.gtf" ftype="gtf"/>
+      <param name="attribute_name" value="FPKM"/>
+      <output name="out_file1" file="gtf2bedgraph_out.bedgraph" ftype="bedgraph"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool converts data from GTF format to BEDGraph format (scroll down for format description).
+
+--------
+
+**Example**
+
+The following data in GTF format::
+
+    chr22  GeneA  enhancer  10000000  10001000  500  +   .  gene_id "GeneA"; transcript_id "TranscriptAlpha"; FPKM "2.75"; frac "1.000000";
+    chr22  GeneA  promoter  10010000  10010100  900  +   .  gene_id "GeneA"; transcript_id "TranscriptAlpha"; FPKM "2.25"; frac "1.000000";
+
+using the attribute name 'FPKM' will be converted to BEDGraph (**note** that 1 is subtracted from the start coordinate)::
+
+    chr22   9999999  10001000   2.75
+    chr22  10009999  10010100   2.25   
+
+------
+
+.. class:: infomark
+
+**About formats**
+
+**GTF format** Gene Transfer Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GTF lines have nine tab-separated fields::
+
+    1. seqname - Must be a chromosome or scaffold.
+    2. source - The program that generated this feature.
+    3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon".
+    4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+    5. end - The ending position of the feature (inclusive).
+    6. score - A score between 0 and 1000. If there is no score value, enter ".".
+    7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+    8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+    9. group - The group field is a list of attributes. Each attribute consists of a type/value pair. Attributes must end in a semi-colon, and be separated from any following attribute by exactly one space. The attribute list must begin with the two mandatory attributes: (i) gene_id value - A globally unique identifier for the genomic source of the sequence and (ii) transcript_id value - A globally unique identifier for the predicted transcript.
+    
+**BEDGraph format**
+
+The bedGraph format is line-oriented. BedGraph data are preceded by a track definition line, which adds a number of options for controlling the default display of this track.
+
+For the track definition line, all options are placed in a single line separated by spaces:
+  track type=bedGraph name=track_label description=center_label
+        visibility=display_mode color=r,g,b altColor=r,g,b
+        priority=priority autoScale=on|off alwaysZero=on|off
+        gridDefault=on|off maxHeightPixels=max:default:min
+        graphType=bar|points viewLimits=lower:upper
+        yLineMark=real-value yLineOnOff=on|off
+        windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16
+
+The track type is REQUIRED, and must be bedGraph:
+  type=bedGraph
+
+Following the track definition line are the track data in four column BED format::
+
+  chromA  chromStartA  chromEndA  dataValueA
+  chromB  chromStartB  chromEndB  dataValueB
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/gtf_to_bedgraph_converter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gtf_to_bedgraph_converter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+import os, sys, tempfile
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+    # Read parms.
+    input_name = sys.argv[1]
+    output_name = sys.argv[2]
+    attribute_name = sys.argv[3]
+    
+    # Create temp files.
+    tmp_name1 = tempfile.NamedTemporaryFile().name
+    tmp_name2 = tempfile.NamedTemporaryFile().name
+    
+    # Do conversion.
+    skipped_lines = 0
+    first_skipped_line = 0
+    out = open( tmp_name1, 'w' )
+    
+    # Write track data to temporary file.
+    i = 0
+    for i, line in enumerate( file( input_name ) ):
+        line = line.rstrip( '\r\n' )
+        
+        if line and not line.startswith( '#' ):
+            try:
+                elems = line.split( '\t' )
+                start = str( int( elems[3] ) - 1 ) # GTF coordinates are 1-based, BedGraph are 0-based.
+                strand = elems[6]
+                if strand not in ['+', '-']:
+                    strand = '+'
+                attributes_list = elems[8].split(";")
+                attributes = {}
+                for name_value_pair in attributes_list:
+                    pair = name_value_pair.strip().split(" ")
+                    name = pair[0].strip()
+                    if name == '':
+                        continue
+                    # Need to strip double quote from values
+                    value = pair[1].strip(" \"")
+                    attributes[name] = value
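+                # e.g. the attribute pair FPKM "2.75" ends up as attributes['FPKM'] = '2.75'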
+                value = attributes[ attribute_name ]
+                # GTF format: chrom, source, name, chromStart, chromEnd, score, strand, frame, attributes.
+                # BedGraph format: chrom, chromStart, chromEnd, value
+                out.write( "%s\t%s\t%s\t%s\n" %( elems[0], start, elems[4], value ) )
+            except:
+                skipped_lines += 1
+                if not first_skipped_line:
+                    first_skipped_line = i + 1
+        else:
+            skipped_lines += 1
+            if not first_skipped_line:
+                first_skipped_line = i + 1
+    out.close()
+    
+    # Sort tmp file by chromosome name and chromosome start to create ordered track data.
+    cmd = "sort -k1,1 -k2,2n < %s > %s" % ( tmp_name1, tmp_name2 )
+    try:
+        os.system(cmd)
+        os.remove(tmp_name1)
+    except Exception, ex:
+        sys.stderr.write( "%s\n" % ex )
+        sys.exit(1)
+        
+    # Create bedgraph file by combining track definition with ordered track data.
+    cmd = "echo 'track type=bedGraph' | cat - %s > %s " % ( tmp_name2, output_name )
+    try:
+        os.system(cmd)
+        os.remove(tmp_name2)
+    except Exception, ex:
+        sys.stderr.write( "%s\n" % ex )
+        sys.exit(1)
+    
+    info_msg = "%i lines converted to BEDGraph.  " % ( i + 1 - skipped_lines )
+    if skipped_lines > 0:
+        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
+    print info_msg
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/headWrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/headWrapper.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+
+# a wrapper for head for use in galaxy
+# headWrapper.pl [filename] [# lines to show] [output]
+
+die "Check arguments" unless @ARGV == 3;
+die "Line number must be an integer\n" unless $ARGV[1]=~ m/^\d+$/;
+
+open (OUT, ">$ARGV[2]") or die "Cannot create $ARGV[2]:$!\n";
+open (HEAD, "head -n $ARGV[1] $ARGV[0]|") or die "Cannot run head:$!\n";
+while (<HEAD>) {
+    print OUT;
+}
+close OUT;
+close HEAD;
+    
diff -r 000000000000 -r 9071e359b9a3 tools/filters/headWrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/headWrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<tool id="Show beginning1" name="Select first">
+  <description>lines from a dataset</description>
+  <command interpreter="perl">headWrapper.pl $input $lineNum $out_file1</command>
+  <inputs>
+    <param name="lineNum" size="5" type="integer" value="10" label="Select first" help="lines"/>
+    <param format="txt" name="input" type="data" label="from"/>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="lineNum" value="10"/>
+      <param name="input" value="1.bed"/>
+      <output name="out_file1" file="eq-showbeginning.dat"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool outputs a specified number of lines from the **beginning** of a dataset.
+
+-----
+
+**Example**
+
+Selecting 2 lines from this::
+
+    chr7  56632  56652  D17003_CTCF_R6  310  +
+    chr7  56736  56756  D17003_CTCF_R7  354  +
+    chr7  56761  56781  D17003_CTCF_R4  220  +
+    chr7  56772  56792  D17003_CTCF_R7  372  +
+    chr7  56775  56795  D17003_CTCF_R4  207  +
+
+will produce::
+
+    chr7  56632  56652  D17003_CTCF_R6  310  +
+    chr7  56736  56756  D17003_CTCF_R7  354  +
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/join.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/join.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,370 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+"""
+Script to Join Two Files on specified columns.
+
+Takes two tab delimited files, two column numbers (base 1) and outputs a new tab delimited file with lines joined by tabs.
+User can also opt to have non-joining rows of file1 echoed.
+
+"""
+
+import optparse, os, sys, tempfile, struct
+import psyco_full
+
+try:
+    simplejson_exception = None
+    from galaxy import eggs
+    from galaxy.util.bunch import Bunch
+    from galaxy.util import stringify_dictionary_keys
+    import pkg_resources
+    pkg_resources.require("simplejson")
+    import simplejson
+except Exception, e:
+    simplejson_exception = e
+    simplejson = None
+
+
+class OffsetList:
+    def __init__( self, filesize = 0, fmt = None ):
+        self.file = tempfile.NamedTemporaryFile( 'w+b' )
+        if fmt:
+            self.fmt = fmt
+        elif filesize and filesize <= sys.maxint * 2:
+            self.fmt = 'I'
+        else:
+            self.fmt = 'Q'
+        self.fmt_size = struct.calcsize( self.fmt )
+    @property
+    def size( self ):
+        self.file.flush()
+        return self.file_size / self.fmt_size
+    @property
+    def file_size( self ):
+        self.file.flush()
+        return os.stat( self.file.name ).st_size
+    def add_offset( self, offset ):
+        if not isinstance( offset, list ):
+            offset = [offset]
+        self.file.seek( self.file_size )
+        for off in offset:
+            self.file.write( struct.pack( self.fmt, off ) )
+    def get_offsets( self, start = 0 ):
+        self.file.seek( start * self.fmt_size )
+        while True:
+            packed = self.file.read( self.fmt_size )
+            if not packed: break
+            yield struct.unpack( self.fmt, packed )[0]
+    def get_offset_by_index( self, index ):
+        self.file.seek( index * self.fmt_size )
+        return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
+    def set_offset_at_index( self, index, offset ):
+        if not isinstance( offset, list ):
+            offset = [offset]
+        if index >= self.size:
+            self.add_offset( offset )
+        else:
+            temp_file = tempfile.NamedTemporaryFile( 'w+b' )
+            self.file.seek( 0 )
+            temp_file.write( self.file.read( ( index ) * self.fmt_size ) )
+            for off in offset:
+                temp_file.write( struct.pack( self.fmt, off ) )
+            temp_file.write( self.file.read() )
+            self.file = temp_file
+
+class SortedOffsets( OffsetList ):
+    def __init__( self, indexed_filename, column, split = None ):
+        OffsetList.__init__( self, os.stat( indexed_filename ).st_size )
+        self.indexed_filename = indexed_filename
+        self.indexed_file = open( indexed_filename, 'rb' )
+        self.column = column
+        self.split = split
+        self.last_identifier = None
+        self.last_identifier_merged = None
+        self.last_offset_merged = 0
+    def merge_with_dict( self, new_offset_dict ):
+        if not new_offset_dict: return #no items to merge in
+        keys = new_offset_dict.keys()
+        keys.sort()
+        identifier2 = keys.pop( 0 )
+
+        result_offsets = OffsetList( fmt = self.fmt )
+        offsets1 = enumerate( self.get_offsets() )
+        try:
+            index1, offset1 = offsets1.next()
+            identifier1 = self.get_identifier_by_offset( offset1 )
+        except StopIteration:
+            offset1 = None
+            identifier1 = None
+            index1 = 0
+
+        while True:
+            if identifier1 is None and identifier2 is None:
+                self.file = result_offsets.file #self is now merged results
+                return
+            elif identifier1 is None or ( identifier2 and identifier2 < identifier1 ):
+                result_offsets.add_offset( new_offset_dict[identifier2] )
+                if keys:
+                    identifier2 = keys.pop( 0 
[...]
rb' ):
+        identifier = get_identifier_by_line( line1, column1, split )
+        if identifier:
+            written = False
+            for line2 in index.get_lines_by_identifier( identifier ):
+                if not fill_options.fill_unjoined_only:
+                    out.write( "%s%s%s\n" % ( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ), split, fill_empty_columns( line2.rstrip( '\r\n' ), split, fill_options.file2_columns ) ) )
+                else:
+                    out.write( "%s%s%s\n" % ( line1.rstrip( '\r\n' ), split, line2.rstrip( '\r\n' ) ) )
+                written = True
+            if not written and keep_unmatched:
+                out.write( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ) )
+                if fill_options:
+                    if fill_options.file2_columns:
+                        out.write( "%s%s" % ( split,  fill_empty_columns( "", split, fill_options.file2_columns ) ) )
+                out.write( "\n" )
+        elif keep_partial:
+            out.write( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ) )
+            if fill_options:
+                if fill_options.file2_columns:
+                    out.write( "%s%s" % ( split,  fill_empty_columns( "", split, fill_options.file2_columns ) ) )
+            out.write( "\n" )
+    out.close()
+
+def main():
+    parser = optparse.OptionParser()
+    parser.add_option(
+        '-b','--buffer',
+        dest='buffer',
+        type='int',default=1000000,
+        help='Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.'
+    )
+    parser.add_option(
+        '-d','--index_depth',
+        dest='index_depth',
+        type='int',default=3,
+        help='Depth to use on filebased offset indexing. Default: 3.'
+    )
+    parser.add_option(
+        '-p','--keep_partial',
+        action='store_true',
+        dest='keep_partial',
+        default=False,
+        help='Keep rows in first input which are missing identifiers.')
+    parser.add_option(
+        '-u','--keep_unmatched',
+        action='store_true',
+        dest='keep_unmatched',
+        default=False,
+        help='Keep rows in first input which are not joined with the second input.')
+    parser.add_option(
+        '-f','--fill_options_file',
+        dest='fill_options_file',
+        type='str',default=None,
+        help='Fill empty columns with values from a JSONified file.')
+
+
+    options, args = parser.parse_args()
+
+    fill_options = None
+    if options.fill_options_file is not None:
+        try:
+            if simplejson is None:
+                raise simplejson_exception
+            fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( open( options.fill_options_file ) ) ) ) #simplejson.load( open( options.fill_options_file ) )
+        except Exception, e:
+            print "Warning: Ignoring fill options due to simplejson error (%s)." % e
+    if fill_options is None:
+        fill_options = Bunch()
+    if 'fill_unjoined_only' not in fill_options:
+        fill_options.fill_unjoined_only = True
+    if 'file1_columns' not in fill_options:
+        fill_options.file1_columns = None
+    if 'file2_columns' not in fill_options:
+        fill_options.file2_columns = None
+
+
+    try:
+        filename1 = args[0]
+        filename2 = args[1]
+        column1 = int( args[2] ) - 1
+        column2 = int( args[3] ) - 1
+        out_filename = args[4]
+    except:
+        print >> sys.stderr, "Error parsing command line."
+        sys.exit()
+
+    #Character for splitting fields and joining lines
+    split = "\t"
+
+    return join_files( filename1, column1, filename2, column2, out_filename, split, options.buffer, options.keep_unmatched, options.keep_partial, options.index_depth, fill_options = fill_options )
+
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/joinWrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/joinWrapper.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,51 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+use File::Temp "tempfile";
+
+my ($input1, $input2, $field1, $field2, $mode, $OOption, $out_file1) = @ARGV;
+
+die "No arguments\n" unless @ARGV == 7;
+
+my ($fh1, $file1) = tempfile();
+my ($fh2, $file2) = tempfile(); 
+
+`sort -k $field1 $input1 > $file1`;
+`sort -k $field2 $input2 > $file2`;
+
+my $option = "";
+my @fields = ();
+my $line = "";
+
+if ($OOption eq "Y") {
+  if (defined($fh1)) {
+    $line = <$fh1>;
+  } else {
+    die "Failed to create file $file1\n";
+  }
+  @fields = split /\t/, $line;
+  die "The field you selected does not exist in the input file" if (@fields < $field1);
+  my @optionO = ();
+  my $i = 0;
+  foreach (@fields) {
+    ++$i;
+    push(@optionO, "1.$i");
+  }
+  $option = "-o " . join(",", @optionO);
+} else {
+  $option = "";
+}
+
+$ENV{'LC_ALL'} = 'POSIX';
+
+if ($mode eq "V") {
+  `join -v 1 $option -1 $field1 -2 $field2 $file1 $file2 | tr " " "\t" > $out_file1`;
+} else {
+  `join $option -1 $field1 -2 $field2 $file1 $file2 | tr " " "\t" > $out_file1`;
+}
+
+`rm $file1 ; rm $file2`;
+
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/filters/joinWrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/joinWrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+This tool provides the UNIX "join" functionality.
+"""
+import sys, os, tempfile, subprocess
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def main():
+    infile1 = sys.argv[1]
+    infile2 = sys.argv[2]
+    field1 = int(sys.argv[3])
+    field2 = int(sys.argv[4])
+    mode =sys.argv[5]
+    outfile = sys.argv[6]
+    
+    tmpfile1 = tempfile.NamedTemporaryFile()
+    tmpfile2 = tempfile.NamedTemporaryFile()
+    
+    try:
+        #Sort the two files based on specified fields
+        os.system("sort -t ' ' -k %d,%d -o %s %s" %(field1, field1, tmpfile1.name, infile1))
+        os.system("sort -t ' ' -k %d,%d -o %s %s" %(field2, field2, tmpfile2.name, infile2))
+    except Exception, exc:
+        stop_err( 'Initialization error -> %s' %str(exc) )
+        
+    option = ""
+    for line in file(tmpfile1.name):
+        line = line.strip()
+        if line:
+            elems = line.split('\t')
+            for j in range(1,len(elems)+1):
+                if j == 1:
+                    option = "1.1"
+                else:
+                    option = option + ",1." + str(j) 
+            break
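+    # e.g. a three-column file1 yields option "1.1,1.2,1.3", echoing every field of file1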
+    
+    #check if join has the --version option. BSD join doesn't have this option, while GNU join does. 
+    #The return value will be 0 in the latter case, and non-zero in the former.
+    ret = subprocess.call('join --version 2>/dev/null', shell=True) 
+    # Check if join is version 7 or later. If so, skip checking the order,
+    # since join would otherwise raise an error on duplicated items in
+    # the two files being joined.
+    if ret == 0: 
+        cl = subprocess.Popen(["join", "--version"], stdout=subprocess.PIPE)
+        (stdout, _) = cl.communicate()
+        version_line = stdout.split("\n")[0]
+        version = version_line.split()[-1].split(".")[0] # major version only; minor versions may have several components
+        if int(version) >= 7:
+            flags = "--nocheck-order"
+        else:
+            flags = ""
+    else:
+        flags = ""
+
+    if mode == "V":
+        cmdline = "join %s -t ' ' -v 1 -o %s -1 %d -2 %d %s %s > %s" %(flags, option, field1, field2, tmpfile1.name, tmpfile2.name, outfile)
+    else:
+        cmdline = "join %s -t ' ' -o %s -1 %d -2 %d %s %s > %s" %(flags, option, field1, field2, tmpfile1.name, tmpfile2.name, outfile)
+    
+    try:
+        os.system(cmdline) 
+    except Exception, exj:
+        stop_err('Error joining the two datasets -> %s' %str(exj))
+       
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/joiner.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/joiner.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,180 @@
+<tool id="join1" name="Join two Datasets" version="2.0.2">
+  <description>side by side on a specified field</description>
+  <command interpreter="python">join.py $input1 $input2 $field1 $field2 $out_file1 $unmatched $partial --index_depth=3 --buffer=50000000 --fill_options_file=$fill_options_file</command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Join"/>
+    <param name="field1" label="using column" type="data_column" data_ref="input1" />
+    <param format="tabular" name="input2" type="data" label="with" />
+    <param name="field2" label="and column" type="data_column" data_ref="input2" />
+    <param name="unmatched" type="select" label="Keep lines of first input that do not join with second input">
+      <option value="-u">Yes</option>
+      <option value="" selected="true">No</option>
+    </param>
+    <param name="partial" type="select" label="Keep lines of first input that are incomplete">
+      <option value="-p">Yes</option>
+      <option value="" selected="true">No</option>
+    </param>
+    <conditional name="fill_empty_columns">
+      <param name="fill_empty_columns_switch" type="select" label="Fill empty columns">
+        <option value="no_fill" selected="True">No</option>
+        <option value="fill_empty">Yes</option>
+      </param>
+     <when value="no_fill">
+        <!-- do nothing -->
+     </when>
+     <when value="fill_empty">
+       <param type="select" name="fill_columns_by" label="Only fill unjoined rows">
+         <option value="fill_unjoined_only" selected="True">Yes</option>
+         <option value="fill_all">No</option>
+       </param>
+       <conditional name="do_fill_empty_columns">
+         <param name="column_fill_type" type="select" label="Fill Columns by">
+           <option value="single_fill_value" selected="True">Single fill value</option>
+           <option value="fill_value_by_column">Values by column</option>
+         </param>
+         <when value="single_fill_value">
+           <param type="text" name="fill_value" label="Fill value" value="."/>
+         </when>
+         <when value="fill_value_by_column">
+           <repeat name="column_fill1" title="Fill Column for Input 1">
+             <param name="column_number1" label="Column" type="data_column" data_ref="input1" />
+             <param type="text" name="fill_value1" value="."/>
+           </repeat>
+           <repeat name="column_fill2" title="Fill Column for Input 2">
+             <param name="column_number2" label="Column" type="data_column" data_ref="input2" />
+             <param type="text" name="fill_value2" value="."/>
+           </repeat>
+         </when>
+       </conditional>
+     </when>
+   </conditional>
+  </inputs>
+  <configfiles>
+    <configfile name="fill_options_file">&lt;%
+import simplejson
+%&gt;
+#set $__fill_options = {}
+#if $fill_empty_columns['fill_empty_columns_switch'] == 'fill_empty':
+    #set $__fill_options['fill_unjoined_only'] = $fill_empty_columns['fill_columns_by'].value == 'fill_unjoined_only'
+    #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'single_fill_value':
+        #set $__start_fill = $fill_empty_columns['do_fill_empty_columns']['fill_value'].value
+    #else:
+        #set $__start_fill = ""
+    #end if
+    #set $__fill_options['file1_columns'] = [ __start_fill for i in range( int( $input1.metadata.columns ) ) ]
+    #set $__fill_options['file2_columns'] = [ __start_fill for i in range( int( $input2.metadata.columns ) ) ]
+    #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'fill_value_by_column':
+        #for column_fill1 in $fill_empty_columns['do_fill_empty_columns']['column_fill1']:
+            #set $__fill_options['file1_columns'][ int( column_fill1['column_number1'].value ) - 1 ] = column_fill1['fill_value1'].value
+        #end for
+        #for column_fill2 in $fill_empty_columns['do_fill_empty_columns']['column_fill2']:
+            #set $__fill_options['file2_columns'][ int( column_fill2['column_number2'].value ) - 1 ] = column_fill2['fill_value2'].value
+        #end for
+    #end if
+#end if
+${simplejson.dumps( __fill_options )}
+    </configfile>
+  </configfiles>
+  <outputs>
+     <data format="input" name="out_file1" metadata_source="input1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="input2" value="2.bed"/>
+      <param name="field1" value="2"/>
+      <param name="field2" value="2"/>
+      <param name="unmatched" value=""/>
+      <param name="partial" value=""/>
+      <param name="fill_empty_columns_switch" value="no_fill"/>
+      <output name="out_file1" file="joiner_out1.bed"/>
+    </test>
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="input2" value="2.bed"/>
+      <param name="field1" value="2"/>
+      <param name="field2" value="2"/>
+      <param name="unmatched" value="Yes"/>
+      <param name="partial" value="Yes"/>
+      <param name="fill_empty_columns_switch" value="no_fill"/>
+      <output name="out_file1" file="joiner_out2.bed"/>
+    </test>
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="input2" value="2.bed"/>
+      <param name="field1" value="2"/>
+      <param name="field2" value="2"/>
+      <param name="unmatched" value="Yes"/>
+      <param name="partial" value="Yes"/>
+      <param name="fill_empty_columns_switch" value="fill_empty"/>
+      <param name="fill_columns_by" value="fill_all"/>
+      <param name="column_fill_type" value="single_fill_value"/>
+      <param name="fill_value" value="~"/>
+      <output name="out_file1" file="joiner_out3.bed"/>
+    </test>
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="input2" value="2.bed"/>
+      <param name="field1" value="2"/>
+      <param name="field2" value="2"/>
+      <param name="unmatched" value="Yes"/>
+      <param name="partial" value="Yes"/>
+      <param name="fill_empty_columns_switch" value="fill_empty"/>
+      <param name="fill_columns_by" value="fill_all"/>
+      <param name="column_fill_type" value="fill_value_by_column"/>
+      <param name="column_number1" value="6"/>
+      <param name="fill_value1" value="+"/>
+      <param name="column_number2" value="1"/>
+      <param name="fill_value2" value="NoChrom"/>
+      <output name="out_file1" file="joiner_out4.bed"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**This tool will attempt to reuse the metadata from your first input.** To change metadata assignments click on the "edit attributes" link of the history item generated by this tool.
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+This tool joins lines of two datasets on a common field. An empty string ("") is not a valid identifier.
+You may choose to include lines of your first input that do not join with your second input.
+
+- Columns are referenced with a **number**. For example, **3** refers to the 3rd column of a tab-delimited file.
+
+-----
+
+**Example**
+
+Dataset1::
+
+  chr1 10 20 geneA 
+  chr1 50 80 geneB
+  chr5 10 40 geneL
+
+Dataset2::
+
+  geneA tumor-suppressor
+  geneB Foxp2
+  geneC Gnas1
+  geneE INK4a
+
+Joining the 4th column of Dataset1 with the 1st column of Dataset2 will yield::
+
+  chr1 10 20 geneA geneA tumor-suppressor
+  chr1 50 80 geneB geneB Foxp2
+
+Joining the 4th column of Dataset1 with the 1st column of Dataset2, while keeping all lines from Dataset1, will yield::
+
+  chr1 10 20 geneA geneA tumor-suppressor
+  chr1 50 80 geneB geneB Foxp2
+  chr5 10 40 geneL
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/joiner2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/joiner2.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,13 @@
+<tool id="joiner2" name="Relational join 2">
+  <description>two datasets whose specified columns share the same value</description>
+  <command>sort -k $col1 $input1 > $input1.tmp; sort -k $col2 $input2 > $input2.tmp; join -1 $col1 -2 $col2 $input1.tmp $input2.tmp | tr " " "\t" > $out_file1; rm -rf $input1.tmp $input2.tmp </command>
+  <inputs>
+    <param name="input1" label="Combine dataset" format="tabular" type="data" />
+    <param name="col1" label="using column" type="data_column" data_ref="input1" />
+    <param name="input2" label="with dataset" format="tabular" type="data"/>
+    <param name="col2" label="and column" type="data_column" data_ref="input2" />
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+  </outputs>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/lav_to_bed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/lav_to_bed.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+#Reads a LAV file and writes two BED files.
+import sys
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import bx.align.lav
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():
+    try:
+        lav_file = open(sys.argv[1],'r')
+        bed_file1 = open(sys.argv[2],'w')
+        bed_file2 = open(sys.argv[3],'w')
+    except Exception, e:
+        stop_err( str( e ) )
+        
+    lavsRead = 0
+    bedsWritten = 0
+    species = {}
+    # TODO: this is really bad since everything is read into memory.  Can we eliminate this tool?
+    for lavBlock in bx.align.lav.Reader( lav_file ):
+        lavsRead += 1
+        for c in lavBlock.components:
+            spec, chrom = bx.align.lav.src_split( c.src )
+            if bedsWritten < 1:
+                if len( species )==0:
+                    species[spec]=bed_file1
+                elif len( species )==1:
+                    species[spec]=bed_file2
+                else:
+                    continue #this is a pairwise alignment...
+            if spec in species:
+                species[spec].write( "%s\t%i\t%i\t%s_%s\t%i\t%s\n" % ( chrom, c.start, c.end, spec, str( bedsWritten ), 0, c.strand ) )
+        bedsWritten += 1
+        
+
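+    # Report the species-to-file mapping on stdout; these "#FILE" lines are
+    # parsed by lav_to_bed_code.py (exec_after_process) to set each output
+    # dataset's name and dbkey.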
+    for spec,file in species.items():
+        print "#FILE\t%s\t%s" % (file.name, spec)
+    
+    lav_file.close()
+    bed_file1.close()
+    bed_file2.close()
+    
+    print "%d lav blocks read, %d regions written\n" % (lavsRead,bedsWritten)
+
+
+
+if __name__ == "__main__": main()
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/filters/lav_to_bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/lav_to_bed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,68 @@
+<tool id="lav_to_bed1" name="LAV to BED">
+  <description>Converts a LAV formatted file to BED format</description>
+  <command interpreter="python">lav_to_bed.py $lav_file $bed_file1 $bed_file2</command>
+  <inputs>
+    <param name="lav_file" type="data" format="lav" label="LAV File" optional="False"/>
+  </inputs>
+  <outputs>
+    <data name="bed_file1" format="bed"/>
+    <data name="bed_file2" format="bed"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="lav_file" value="2.lav" ftype="lav" />
+      <output name="bed_file2" file="lav_to_bed_out_1.bed" />
+      <output name="bed_file2" file="lav_to_bed_out_2.bed" />
+    </test>
+  </tests>
+  <help>
+
+**Syntax**
+
+This tool converts a LAV formatted file to the BED format.
+
+- **LAV format** LAV is an alignment format developed by Webb Miller's group at Penn State University. It is the primary output format for BLASTZ.
+
+- **BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser.
+
+-----
+
+**Example**
+
+- Convert LAV format::
+
+    #:lav
+    s {
+      &quot;/galaxy/data/hg16/seq/chr19.nib&quot; 1 63811651 0 1
+      &quot;/galaxy/data/mm5/seq/chr11.nib&quot; 1 121648857 0 1
+    }
+    h {
+      &quot;> hg16.chr19&quot;
+      &quot;> mm5.chr11 (reverse complement)&quot;
+    }
+    a {
+      s 3500
+      b 3001012 70568380
+      e 3001075 70568443
+      l 3001012 70568380 3001075 70568443 81
+    }
+    a {
+      s 3900
+      b 3008279 70573976
+      e 3008357 70574054
+      l 3008279 70573976 3008357 70574054 78
+    }
+    #:eof
+
+- Into two BED formatted files::
+
+    chr19 3001011 3001075 hg16_0 0 +
+    chr19 3008278 3008357 hg16_1 0 +
+    
+ **and**::
+    
+    chr11 70568379 70568443 mm5_0 0 +
+    chr11 70573975 70574054 mm5_1 0 +
+  </help>
+  <code file="lav_to_bed_code.py"/>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/lav_to_bed_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/lav_to_bed_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+#Set build, name, and info for each output BED file
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
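+    # lav_to_bed.py prints "#FILE<tab>filename<tab>species" lines on stdout;
+    # use them to map each output dataset back to the build it contains.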
+    new_stdout = ""
+    filename_to_build = {}
+    for line in stdout.split("\n"):
+        if line.startswith("#FILE"):
+            fields = line.split("\t")
+            filename_to_build[fields[1]]=fields[2].strip()
+        else:
+            new_stdout = "%s%s" % ( new_stdout, line )
+    for name,data in out_data.items():
+        try:
+            data.info = "%s\n%s" % ( new_stdout, stderr )
+            data.dbkey = filename_to_build[data.file_name]
+            data.name = "%s (%s)" % ( data.name, data.dbkey )
+            app.model.context.add( data )
+            app.model.context.flush()
+        except:
+            continue
diff -r 000000000000 -r 9071e359b9a3 tools/filters/mergeCols.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/mergeCols.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,37 @@
+import sys, re
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def __main__():
+    try:
+        infile =  open ( sys.argv[1], 'r')
+        outfile = open ( sys.argv[2], 'w')
+    except:
+        stop_err( 'Cannot open or create a file\n' )
+        
+    if len( sys.argv ) < 4:
+        stop_err( 'No columns to merge' )
+    else:
+        cols = sys.argv[3:]        
+
+    skipped_lines = 0
+
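+    # append each requested column (1-based index) to the end of the line;
+    # a column index beyond the end of a line counts as invalid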
+    for line in infile:
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ):
+            fields = line.split( '\t' )
+            line += '\t'
+            for col in cols:
+                try:
+                    line += fields[ int( col ) -1 ]
+                except:
+                    skipped_lines += 1
+                    
+            print >>outfile, line
+            
+    if skipped_lines > 0:
+        print 'Skipped %d invalid lines' % skipped_lines
+            
+if __name__ == "__main__" : __main__()
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/filters/mergeCols.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/mergeCols.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,63 @@
+<tool id="mergeCols1" name="Merge Columns" version="1.0.1">
+  <description>together</description>
+  <command interpreter="python">
+   mergeCols.py 
+      $input1
+      $out_file1
+      $col1
+      $col2
+      #for $col in $columns
+        ${col.datacol}
+      #end for
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
+    <param name="col1" label="Merge column" type="data_column" data_ref="input1" />
+    <param name="col2" label="with column" type="data_column" data_ref="input1" help="Need to add more columns? Use controls below."/>
+    <repeat name="columns" title="Columns">
+      <param name="datacol" label="Add column" type="data_column" data_ref="input1" />
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="col1" value="4" />
+      <param name="col2" value="1" />
+      <param name="datacol" value="6" />
+      <output name="out_file1" file="mergeCols.dat"/>
+    </test>
+  </tests>
+<help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**What it does**
+
+This tool merges columns together. Any number of valid columns can be merged in any order.
+
+-----
+
+**Example**
+
+Input dataset (five columns: c1, c2, c3, c4, and c5)::
+
+   1 10   1000  gene1 chr
+   2 100  1500  gene2 chr
+
+merging columns "**c5,c1**" will return::
+
+   1 10   1000  gene1 chr chr1
+   2 100  1500  gene2 chr chr2
+
+.. class:: warningmark
+   
+Note that all original columns are preserved and the result of the merge is added as the rightmost column.
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/pasteWrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/pasteWrapper.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,35 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+my $command = "";
+# a wrapper for paste for use in galaxy
+# pasteWrapper.pl [filename1] [filename2] [delimiter] [output]
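+# the third argument selects the delimiter:
+#   T=tab, C=comma, D=dash, U=underscore, P=pipe, Dt=dot, Sp=space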
+
+die "Check arguments" unless @ARGV == 4;
+
+if ($ARGV[2] eq 'T') {
+    $command = "paste $ARGV[0] $ARGV[1]";
+} elsif ($ARGV[2] eq 'C') {
+    $command = "paste -d \",\" $ARGV[0] $ARGV[1]";
+} elsif ($ARGV[2] eq 'D') {
+    $command = "paste -d \"-\" $ARGV[0] $ARGV[1]";
+} elsif ($ARGV[2] eq 'U') {
+    $command = "paste -d \"_\" $ARGV[0] $ARGV[1]";
+} elsif ($ARGV[2] eq 'P') {
+    $command = "paste -d \"|\" $ARGV[0] $ARGV[1]";
+} elsif ($ARGV[2] eq 'Dt') {
+    $command = "paste -d \".\" $ARGV[0] $ARGV[1]";
+} elsif ($ARGV[2] eq 'Sp') {
+    $command = "paste -d \" \" $ARGV[0] $ARGV[1]";
+} else {
+    die "Invalid delimiter code: $ARGV[2]\n";
+}
+
+open (OUT, ">$ARGV[3]") or die "Cannot create $ARGV[2]:$!\n";
+open (PASTE, "$command |") or die "Cannot run paste:$!\n";
+
+while (<PASTE>) {
+    print OUT;
+}
+close OUT;
+close PASTE;
+    
diff -r 000000000000 -r 9071e359b9a3 tools/filters/pasteWrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/pasteWrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,68 @@
+<tool id="Paste1" name="Paste">
+  <description>two files side by side</description>
+  <command interpreter="perl">pasteWrapper.pl $input1 $input2 $delimiter $out_file1</command>
+  <inputs>
+<!--    <display>paste $input1 and $input2 using $delimiter as delimiter</display> -->
+    <param format="txt" name="input1" type="data" label="Paste"/>
+    <param format="txt" name="input2" type="data" label="and"/>
+    <param name="delimiter" type="select" label="Delimit by">
+      <option value="T">Tab</option>
+      <option value="Dt">Dot</option>
+      <option value="C">Comma</option>
+      <option value="D">Dash</option>
+      <option value="U">Underscore</option>
+      <option value="P">Pipe</option>
+      <option value="Sp">Space</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1">
+      <change_format>
+        <when input_dataset="input1" attribute="ext" value="bed" format="interval"/>
+      </change_format>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="input2" value="2.bed"/>
+      <param name="delimiter" value="T"/>
+      <output name="out_file1" file="eq-paste.dat"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+Paste preserves column assignments of the first dataset.
+
+-----
+
+**What it does**
+
+This tool merges two datasets side by side. If the first (left) dataset contains column assignments such as chromosome, start, end and strand, these will be preserved. However, if you would like to change column assignments, click the pencil icon in the history item.
+
+-----
+
+**Example**
+
+First dataset::
+  
+    a 1
+    a 2
+    a 3
+
+Second dataset::
+
+    20
+    30
+    40
+
+Pasting them together will produce::
+
+    a 1 20
+    a 2 30
+    a 3 40
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/randomlines.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/randomlines.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# Kanwei Li, 2010
+# Selects N random lines from a file and outputs to another file
+
+import random, sys
+
+def main():
+    infile = open(sys.argv[1], 'r')
+    total_lines = int(sys.argv[2])
+    
+    if total_lines < 1:
+        sys.stderr.write( "Must select at least one line." )
+        sys.exit()
+    
+    kept = []
+    n = 0
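+    # Reservoir sampling: keep the first N lines; thereafter the n-th line
+    # replaces a random kept line with probability N/n. New lines are always
+    # appended, so the reservoir stays in file order.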
+    for line in infile:
+        line = line.rstrip("\n")
+        n += 1
+        if (n <= total_lines):
+            kept.append(line)
+        elif random.randint(1, n) <= total_lines:
+            kept.pop(random.randint(0, total_lines-1))
+            kept.append(line)
+    
+    if n < total_lines:
+        sys.stderr.write( "Error: asked to select more lines than there were in the file." )
+        sys.exit()
+        
+    open(sys.argv[3], 'w').write( "\n".join(kept) )
+    
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/randomlines.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/randomlines.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<tool id="random_lines1" name="Select random lines">
+  <description>from a file</description>
+  <command interpreter="python">randomlines.py $input $num_lines $out_file1</command>
+  <inputs>
+    <param name="num_lines" size="5" type="integer" value="1" label="Randomly select" help="lines"/>
+    <param format="txt" name="input" type="data" label="from"/>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="num_lines" value="65"/>
+      <param name="input" value="1.bed"/>
+      <output name="out_file1" file="1.bed"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool selects N random lines from a file, without repeats, and preserves their original order.
+
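+Internally the tool makes a single pass over the file, keeping a reservoir of
+N lines, so the whole dataset never needs to fit in memory (see randomlines.py).
+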
+-----
+
+**Example**
+
+Input File::
+
+    chr7  56632  56652   D17003_CTCF_R6  310  +
+    chr7  56736  56756   D17003_CTCF_R7  354  +
+    chr7  56761  56781   D17003_CTCF_R4  220  +
+    chr7  56772  56792   D17003_CTCF_R7  372  +
+    chr7  56775  56795   D17003_CTCF_R4  207  +
+
+Selecting 2 random lines might return this::
+
+    chr7  56736  56756   D17003_CTCF_R7  354  +
+    chr7  56775  56795   D17003_CTCF_R4  207  +
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/remove_beginning.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/remove_beginning.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,33 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+
+# Removes the specified number of lines from the beginning of the file.
+# remove_beginning.pl [input] [num_lines] [output]
+
+die "Check arguments" unless @ARGV == 3;
+
+my $inputfile = $ARGV[0];
+my $num_lines = $ARGV[1];
+my $outputfile = $ARGV[2];
+
+my $curCount=0;
+
+my $fhIn;
+open ($fhIn, "< $inputfile") or die "Cannot open source file";
+
+my $fhOut;
+open ($fhOut, "> $outputfile") or die "Cannot open output file";
+
+while (<$fhIn>)
+{
+    $curCount++;
+    if ($curCount<=$num_lines)
+    {
+        next;
+    }
+    print $fhOut $_;
+}
+close ($fhIn) or die "Cannot close source file";
+close ($fhOut) or die "Cannot close output file";
diff -r 000000000000 -r 9071e359b9a3 tools/filters/remove_beginning.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/remove_beginning.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<tool id="Remove beginning1" name="Remove beginning">
+  <description>of a file</description>
+  <command interpreter="perl">remove_beginning.pl $input $num_lines $out_file1</command>
+  <inputs>
+    <param name="num_lines" size="5" type="integer" value="1" label="Remove first" help="lines"/>
+    <param format="txt" name="input" type="data" label="from"/>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="num_lines" value="5"/>
+      <param name="input" value="1.bed"/>
+      <output name="out_file1" file="eq-removebeginning.dat"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool removes a specified number of lines from the beginning of a dataset.
+
+-----
+
+**Example**
+
+Input File::
+
+    chr7  56632  56652   D17003_CTCF_R6  310  +
+    chr7  56736  56756   D17003_CTCF_R7  354  +
+    chr7  56761  56781   D17003_CTCF_R4  220  +
+    chr7  56772  56792   D17003_CTCF_R7  372  +
+    chr7  56775  56795   D17003_CTCF_R4  207  +
+
+After removing the first 3 lines the dataset will look like this::
+
+    chr7  56772  56792   D17003_CTCF_R7  372  +
+    chr7  56775  56795   D17003_CTCF_R4  207  +
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/sff_extract.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/sff_extract.py Fri Mar 09 19:37:19 2012 -0500
b"@@ -0,0 +1,1505 @@\n+#!/usr/bin/python\n+'''This software extracts the seq, qual and ancillary information from an sff\n+file, like the ones used by the 454 sequencer.\n+\n+Optionally, it can also split paired-end reads if given the linker sequence.\n+The splitting is done with maximum match, i.e., every occurence of the linker\n+sequence will be removed, even if occuring multiple times.'''\n+\n+#copyright Jose Blanca and Bastien Chevreux\n+#COMAV institute, Universidad Politecnica de Valencia (UPV)\n+#Valencia, Spain\n+\n+# additions to handle paired end reads by Bastien Chevreux\n+# bugfixes for linker specific lengths: Lionel Guy\n+\n+#This program is free software: you can redistribute it and/or modify\n+#it under the terms of the GNU General Public License as published by\n+#the Free Software Foundation, either version 3 of the License, or\n+#(at your option) any later version.\n+#This program is distributed in the hope that it will be useful,\n+#but WITHOUT ANY WARRANTY; without even the implied warranty of\n+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n+#GNU General Public License for more details.\n+#You should have received a copy of the GNU General Public License\n+#along with this program.  If not, see <http://www.gnu.org/licenses/>.\n+\n+__author__ = 'Jose Blanca and Bastien Chevreux'\n+__copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux'\n+__license__ = 'GPLv3 or later'\n+__version__ = '0.2.8'\n+__email__ = 'jblanca@btc.upv.es'\n+__status__ = 'beta'\n+\n+import struct\n+import sys\n+import os\n+import subprocess\n+import tempfile\n+\n+\n+fake_sff_name = 'fake_sff_name'\n+\n+\n+# readname as key: lines with matches from SSAHA, one best match\n+ssahapematches = {}\n+# linker readname as key: length of linker sequence\n+linkerlengths = {}\n+\n+# set to true if something really fishy is going on with the sequences\n+stern_warning = True\n+\n+def read_bin_fragment(struct_def, fileh, offset=0, data=None,\n+                                                             byte_padding=None):\n+    '''It reads a chunk of a binary file.\n+\n+    You have to provide the struct, a file object, the offset (where to start\n+    reading).\n+    Also you can provide an optional dict that will be populated with the\n+    extracted data.\n+    If a byte_padding is given the number of bytes read will be a multiple of\n+    that number, adding the required pad at the end.\n+    It returns the number of bytes reads and the data dict.\n+    '''\n+    if data is None:\n+        data = {}\n+\n+    #we read each item\n+    bytes_read = 0\n+    for item in struct_def:\n+        #we go to the place and read\n+        fileh.seek(offset + bytes_read)\n+        n_bytes = struct.calcsize(item[1])\n+        buffer = fileh.read(n_bytes)\n+        read = struct.unpack('>' + item[1], buffer)\n+        if len(read) == 1:\n+            read = read[0]\n+        data[item[0]] = read\n+        bytes_read += n_bytes\n+\n+    #if there is byte_padding the bytes_to_read should be a multiple of the\n+    #byte_padding\n+    if byte_padding is not None:\n+        pad = byte_padding\n+        bytes_read = ((bytes_read + pad - 1) // pad) * pad\n+\n+    return (bytes_read, data)\n+\n+\n+def check_magic(magic):\n+    '''It checks that the magic number of the file matches the sff magic.'''\n+    if magic != 779314790:\n+        raise RuntimeError('This file does not seems to be an sff file.')\n+\n+def check_version(version):\n+    '''It checks that the version is supported, otherwise it 
raises an error.'''\n+    supported = ('\\x00', '\\x00', '\\x00', '\\x01')\n+    i = 0\n+    for item in version:\n+        if version[i] != supported[i]:\n+            raise RuntimeError('SFF version not supported. Please contact the author of the software.')\n+        i += 1\n+\n+def read_header(fileh):\n+    '''It reads the header from the sff file and returns a dict with the\n+    information'''\n+    #first we read the first part of the header\n+    head_struct = [\n+        ('magic_number', 'I'),\n+        ('version', 'cccc'),\n+        ('index_offs"..b'      help="base name for all output files")\n+    group.add_option("-s", "--seq_file", dest="seq_fname",\n+            help="output sequence file name", metavar="FILE")\n+    group.add_option("-q", "--qual_file", dest="qual_fname",\n+            help="output quality file name", metavar="FILE")\n+    group.add_option("-x", "--xml_file", dest="xml_fname",\n+            help="output ancillary xml file name", metavar="FILE")\n+    parser.add_option_group(group)\n+\n+    #default fnames\n+    #is there an sff file?\n+    basename = \'reads\'\n+    if sys.argv[-1][-4:].lower() == \'.sff\':\n+        basename = sys.argv[-1][:-4]\n+    def_seq_fname = basename + \'.fasta\'\n+    def_qual_fname = basename + \'.fasta.qual\'\n+    def_xml_fname = basename + \'.xml\'\n+    def_pelinker_fname = \'\'\n+    parser.set_defaults(seq_fname = def_seq_fname)\n+    parser.set_defaults(qual_fname = def_qual_fname)\n+    parser.set_defaults(xml_fname = def_xml_fname)\n+    parser.set_defaults(pelinker_fname = def_pelinker_fname)\n+\n+    #we parse the cmd line\n+    (options, args) = parser.parse_args()\n+\n+    #we put the result in a dict\n+    global config\n+    config = {}\n+    for property in dir(options):\n+        if property[0] == \'_\' or property in (\'ensure_value\', \'read_file\', \n+                                                                \'read_module\'):\n+            continue\n+        config[property] = getattr(options, property)\n+\n+    if config[\'basename\'] is None:\n+        config[\'basename\']=basename\n+\n+    #if we have not set a file name with -s, -q or -x we set the basename\n+    #based file name\n+    if config[\'want_fastq\']:\n+        config[\'qual_fname\'] = \'\'\n+        if config[\'seq_fname\'] == def_seq_fname:\n+            config[\'seq_fname\'] = config[\'basename\'] + \'.fastq\'\n+    else:\n+        if config[\'seq_fname\'] == def_seq_fname:\n+            config[\'seq_fname\'] = config[\'basename\'] + \'.fasta\'\n+        if config[\'qual_fname\'] == def_qual_fname:\n+            config[\'qual_fname\'] = config[\'basename\'] + \'.fasta.qual\'\n+\n+    if config[\'xml_fname\'] == def_xml_fname:\n+        config[\'xml_fname\'] = config[\'basename\'] + \'.xml\'\n+\n+    #we parse the extra info for the xml file\n+    config[\'xml_info\'] = parse_extra_info(config[\'xml_info\'])\n+    return config, args\n+\n+\n+\n+##########################################################################\n+\n+\n+def testsome():\n+    sys.exit()\n+    return\n+\n+\n+def debug():\n+    try:\n+        dummy = 1\n+        #debug()\n+        #testsome()\n+\n+        config, args = read_config()\n+        load_linker_sequences(config[\'pelinker_fname\'])\n+\n+        #pid = os.getpid()\n+        pid = 15603\n+\n+        #tmpfasta_fname = \'sffe.tmp.\'+ str(pid)+\'.fasta\'\n+        #tmpfasta_fh = open(tmpfasta_fname, \'w\')\n+        tmpfasta_fname = \'FLVI58L05.fa\'\n+        tmpfasta_fh = open(tmpfasta_fname, \'r\')\n+\n+       
 tmpssaha_fname = \'sffe.tmp.\'+str(pid)+\'.ssaha2\'\n+        tmpssaha_fh = open(tmpssaha_fname, \'w\')\n+\n+        launch_ssaha(config[\'pelinker_fname\'], tmpfasta_fh.name, tmpssaha_fh)\n+\n+        tmpssaha_fh = open("sffe.tmp.15603.ssaha2", \'r\')    \n+        read_ssaha_data(tmpssaha_fh)\n+\n+        sys.exit()\n+\n+        extract_reads_from_sff(config, args)\n+\n+    except (OSError, IOError, RuntimeError), errval:\n+        print errval\n+        sys.exit()\n+\n+    sys.exit()\n+\n+\n+def main():\n+\n+    argv = sys.argv\n+    if len(argv) == 1:\n+        sys.argv.append(\'-h\')\n+        read_config()\n+        sys.exit()\n+    try:\n+        #debug();\n+\n+        config, args = read_config()\n+\n+        if config[\'pelinker_fname\']:\n+            #tests_for_ssaha(config[\'pelinker_fname\'])\n+            load_linker_sequences(config[\'pelinker_fname\'])\n+        if len(args) == 0:\n+            raise RuntimeError("No SFF file given?")\n+        extract_reads_from_sff(config, args)\n+    except (OSError, IOError, RuntimeError), errval:\n+        print errval\n+        return 1\n+\n+    if stern_warning:\n+        return 1\n+\n+    return 0\n+\n+\n+\n+if __name__ == "__main__":\n+        sys.exit(main())\n'
diff -r 000000000000 -r 9071e359b9a3 tools/filters/sff_extractor.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/sff_extractor.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,58 @@
+<tool id="Sff_extractor" name="SFF converter" version="1.0.0">
+    <description></description>
+    <command interpreter="python">
+        #if   str($fastq_output) == "fastq_false"  #sff_extract.py $clip --seq_file=$out_file3 --qual_file=$out_file4 --xml_file=$out_file2 $input
+        #elif str($fastq_output) == "fastq_true"   #sff_extract.py $clip --fastq --seq_file=$out_file1 --xml_file=$out_file2 $input
+        #end if#
+    </command>
+    <inputs>
+        <param format="sff" name="input" type="data" label="Extract from this dataset"/>
+        <param name="clip" type="select" label="Completely remove ends with low qual and/or adaptor sequence">
+            <option value="">No</option>
+            <option value="--clip">Yes</option>
+        </param>
+        <param name="fastq_output" type="boolean" truevalue="fastq_true" falsevalue="fastq_false" checked="False" label="Do you want FASTQ file instead of FASTA + FASTA quality file?" />
+    </inputs>
+    <outputs>
+        <data format="fastqsanger" name="out_file1" >
+            <filter>fastq_output is True</filter>
+        </data>
+        <data format="xml" name="out_file2">
+        </data>  
+        <data format="fasta" name="out_file3">
+            <filter>fastq_output is False</filter>
+        </data>
+        <data format="qual" name="out_file4">
+            <filter>fastq_output is False</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="2.sff"/>
+            <param name="clip" value=""/>
+            <param name="fastq_output" value="false"/>
+            <output name="out_file2" file="sff_converter_xml_1.dat"/>
+            <output name="out_file3" file="sff_converter_fasta.dat"/>
+            <output name="out_file4" file="sff_converter_qual.dat"/>
+        </test>
+        <test>
+            <param name="input" value="2.sff"/>
+            <param name="clip" value=""/>
+            <param name="fastq_output" value="true"/>
+            <output name="out_file1" file="sff_converter_fastq.dat"/>
+            <output name="out_file2" file="sff_converter_xml_2.dat"/>
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
+This tool extracts data from the 454 Sequencer SFF format and creates three files:
+Sequences (FASTA),
+Qualities (QUAL) and
+Clippings (XML).
+
+If FASTQ output is selected instead, the sequences and qualities are written to a single FASTQ file alongside the clippings XML.
+
+    </help>
+</tool>
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/filters/sorter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/sorter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,49 @@
+"""
+Sorts tabular data on one or more columns.
+
+usage: %prog [options]
+   -i, --input=i: Tabular file to be sorted
+   -o, --out_file1=o: Sorted output file
+   -c, --column=c: First column to sort on
+   -s, --style=s: Sort style (numerical or alphabetical)
+   -r, --order=r: Order (ASC or DESC)
+
+usage: %prog input out_file1 column style order [column style ...]
+"""
+
+import os, re, string, sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def main():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        inputfile = options.input
+        outputfile = '-o %s' % options.out_file1
+        columns = [options.column]
+        styles = [('','n')[options.style == 'num']]
+        orders = [('','r')[options.order == 'DESC']]
+        col_style_orders = sys.argv[6:]
+        if len(col_style_orders) > 1:
+            columns.extend([col_style_orders[i] for i in range(0,len(col_style_orders),3)])
+            styles.extend([('','n')[col_style_orders[i] == 'num'] for i in range(1,len(col_style_orders),3)])
+            orders.extend([('','r')[col_style_orders[i] == 'DESC'] for i in range(2,len(col_style_orders),3)])
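+        # build one sort key per column, e.g. '-k2,2nr' = field 2 only,
+        # numeric ('n') and descending ('r') as requested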
+        cols = [ '-k%s,%s%s%s'%(columns[i], columns[i], styles[i], orders[i]) for i in range(len(columns)) ]
+    except Exception, ex:
+        stop_err('Error parsing input parameters\n' + str(ex))
+
+    # Launch sort.
+    cmd = "sort -f -t ' ' %s %s %s" % (' '.join(cols), outputfile, inputfile)
+    try:
+        os.system(cmd)
+    except Exception, ex:
+        stop_err('Error running sort command\n' + str(ex))
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/sorter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/sorter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,130 @@
+<tool id="sort1" name="Sort" version="1.0.1">
+  <description>data in ascending or descending order</description>
+  <command interpreter="python">
+    sorter.py 
+      --input=$input 
+      --out_file1=$out_file1 
+      --column=$column
+      --style=$style
+      --order=$order 
+      #for $col in $column_set:
+        ${col.other_column}
+        ${col.other_style}
+        ${col.other_order}
+      #end for
+  </command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Sort Query" />
+    <param name="column" label="on column" type="data_column" data_ref="input" accept_default="true" />
+    <param name="style" type="select" label="with flavor">
+      <option value="num">Numerical sort</option>
+      <option value="alpha">Alphabetical sort</option>
+    </param>
+    <param name="order" type="select" label="everything in">
+      <option value="DESC">Descending order</option>
+      <option value="ASC">Ascending order</option>
+    </param>
+    <repeat name="column_set" title="Column selection">
+      <param name="other_column" label="on column" type="data_column" data_ref="input" accept_default="true" />
+      <param name="other_style" type="select" label="with flavor">
+        <option value="num">Numerical sort</option>
+        <option value="alpha">Alphabetical sort</option>
+      </param>
+      <param name="other_order" type="select" label="everything in">
+        <option value="DESC">Descending order</option>
+        <option value="ASC">Ascending order</option>
+      </param>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="sort_in1.bed"/>
+      <param name="column" value="1"/>
+      <param name="style" value="num"/>
+      <param name="order" value="ASC"/>
+      <param name="other_column" value="3"/>
+      <param name="other_style" value="num"/>
+      <param name="other_order" value="ASC"/>
+      <output name="out_file1" file="sort_out1.bed"/>
+    </test>
+    <test>
+      <param name="input" value="sort_in1.bed"/>
+      <param name="column" value="3"/>
+      <param name="style" value="alpha"/>
+      <param name="order" value="ASC"/>
+      <param name="other_column" value="1"/>
+      <param name="other_style" value="alpha"/>
+      <param name="other_order" value="ASC"/>
+      <output name="out_file1" file="sort_out2.bed"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+This tool sorts the dataset on any number of columns in either ascending or descending order.
+
+* Numerical sort orders numbers by their magnitude, ignores all characters besides numbers, and evaluates a string of numbers to the value they signify.  
+* Alphabetical sort is a phonebook type sort based on the conventional order of letters in an alphabet. Each nth letter is compared with the nth letter of other words in the list, starting at the first letter of each word and advancing to the second, third, fourth, and so on, until the order is established. Therefore, in an alphabetical sort, 2 comes after 100 (1 &lt; 2).
+
+-----
+
+**Examples**
+
+The list of numbers 4,17,3,5 collates to 3,4,5,17 by numerical sorting, while it collates to 17,3,4,5 by alphabetical sorting.
+
+Sorting the following::
+
+  Q     d    7   II    jhu  45
+  A     kk   4   I     h    111
+  Pd    p    1   ktY   WS   113
+  A     g    10  H     ZZ   856
+  A     edf  4   tw    b    234
+  BBB   rt   10  H     ZZ   100
+  A     rew  10  d     b    1111
+  C     sd   19  YH    aa   10
+  Hah   c    23  ver   bb   467
+  MN    gtr  1   a     X    32
+  N     j    9   a     T    205
+  BBB   rrf  10  b     Z    134
+  odfr  ws   6   Weg   dew  201
+  C     f    3   WW    SW   34
+  A     jhg  4   I     b    345
+  Pd    gf   7   Gthe  de   567
+  rS    hty  90  YY    LOp  89
+  A     g    10  H     h    43
+  A     g    4   I     h    500
+
+on columns 1 (alpha), 3 (num), and 6 (num) in ascending order will yield::
+
+  A     kk   4   I     h    111
+  A     edf  4   tw    b    234
+  A     jhg  4   I     b    345
+  A     g    4   I     h    500
+  A     g    10  H     h    43
+  A     g    10  H     ZZ   856
+  A     rew  10  d     b    1111
+  BBB   rt   10  H     ZZ   100
+  BBB   rrf  10  b     Z    134
+  C     f    3   WW    SW   34
+  C     sd   19  YH    aa   10
+  Hah   c    23  ver   bb   467
+  MN    gtr  1   a     X    32
+  N     j    9   a     T    205
+  odfr  ws   6   Weg   dew  201
+  Pd    p    1   ktY   WS   113
+  Pd    gf   7   Gthe  de   567
+  Q     d    7   II    jhu  45
+  rS    hty  90  YY    LOp  89
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/tailWrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/tailWrapper.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+
+# a wrapper for tail for use in galaxy
+# tailWrapper.pl [filename] [# lines to show] [output]
+
+die "Check arguments" unless @ARGV == 3;
+die "Line number should be an integer\n" unless $ARGV[1]=~ m/^\d+$/;
+
+open (OUT, ">$ARGV[2]") or die "Cannot create $ARGV[2]:$!\n";
+open (TAIL, "tail -n $ARGV[1] $ARGV[0]|") or die "Cannot run tail:$!\n";
+while (<TAIL>) {
+    print OUT;
+}
+close OUT;
+close TAIL;
+    
diff -r 000000000000 -r 9071e359b9a3 tools/filters/tailWrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/tailWrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<tool id="Show tail1" name="Select last">
+  <description>lines from a dataset</description>
+  <command interpreter="perl">tailWrapper.pl $input $lineNum $out_file1</command>
+  <inputs>
+    <param name="lineNum" size="5" type="integer" value="10" label="Select last" help="lines"/>
+    <param format="txt" name="input" type="data" label="from"/>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="lineNum" value="10"/>
+      <param name="input" value="1.bed"/>
+      <output name="out_file1" file="eq-showtail.dat"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool outputs a specified number of lines from the **end** of a dataset.
+
+-----
+
+**Example**
+
+- Input File::
+
+    chr7    57134   57154   D17003_CTCF_R7  356     -
+    chr7    57247   57267   D17003_CTCF_R4  207     +
+    chr7    57314   57334   D17003_CTCF_R5  269     +
+    chr7    57341   57361   D17003_CTCF_R7  375     +
+    chr7    57457   57477   D17003_CTCF_R3  188     +
+
+- Show last two lines of above file. The result is::
+
+    chr7    57341   57361   D17003_CTCF_R7  375     +
+    chr7    57457   57477   D17003_CTCF_R3  188     +
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/trimmer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/trimmer.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+import sys
+import optparse
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():
+    usage = """%prog [options]
+    
+options (listed below) default to 'None' if omitted
+    """
+    parser = optparse.OptionParser(usage=usage)
+    
+    parser.add_option(
+        '-a','--ascii',
+        dest='ascii',
+        action='store_true',
+        default = False,
+        help='Use ascii codes to define ignored beginnings instead of raw characters')
+        
+    parser.add_option(
+        '-q','--fastq',
+        dest='fastq',
+        action='store_true',
+        default = False,
+        help='The input data is in fastq format. If selected, the script skips every even line since they contain sequence ids')
+
+    parser.add_option(
+        '-i','--ignore',
+        dest='ignore',
+        help='A comma separated list of ignored beginnings (e.g., ">,@"), or their ascii codes (e.g., "62,64") if option -a is enabled')
+
+    parser.add_option(
+        '-s','--start',
+        dest='start',
+        default = '0',
+        help='Trim from beginning to here (1-based)')
+
+    parser.add_option(
+        '-e','--end',
+        dest='end',
+        default = '0',
+        help='Trim from here to the end (1-based)')
+
+    parser.add_option(
+        '-f','--file',
+        dest='input_txt',
+        default = False,
+        help='Name of file to be chopped. STDIN is default')
+            
+    parser.add_option(
+        '-c','--column',
+        dest='col',
+        default = '0',
+        help='Column to chop. If 0 = chop the whole line')
+       
+
+    options, args = parser.parse_args()
+    invalid_starts = []
+
+    if options.input_txt:
+        infile = open ( options.input_txt, 'r')
+    else:
+        infile = sys.stdin
+    
+    if options.ignore and options.ignore != "None":
+        invalid_starts = options.ignore.split(',')
+        
+    if options.ascii and options.ignore and options.ignore != "None":
+        for i, item in enumerate( invalid_starts ):
+            invalid_starts[i] = chr( int( item ) )
+
+    col = int( options.col )
+
+    for i, line in enumerate( infile ):
+        line = line.rstrip( '\r\n' )
+        if line:
+            
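+            # fastq mode: even (0-based) lines hold the '@' and '+' headers;
+            # pass them through untouched and only trim sequence/quality lines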
+            if options.fastq and i % 2 == 0:
+                print line
+                continue
+                
+
+            if line[0] not in invalid_starts:
+                if col == 0:
+                    if int( options.end ) > 0:
+                        line = line[ int( options.start )-1 : int( options.end ) ]
+                    else:
+                        line = line[ int( options.start )-1 : ]
+                else:
+                    fields = line.split( '\t' )
+                    if col > len( fields ):
+                        stop_err('Column %d does not exist. Check input parameters\n' % col)
+                        
+                    if int( options.end ) > 0:
+                        fields[col - 1] = fields[col - 1][ int( options.start )-1 : int( options.end ) ]
+                    else:
+                        fields[col - 1] = fields[col - 1][ int( options.start )-1 : ]
+                    line = '\t'.join(fields)
+            print line   
+
+if __name__ == "__main__": main()
+
diff -r 000000000000 -r 9071e359b9a3 tools/filters/trimmer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/trimmer.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,120 @@
+<tool id="trimmer" name="Trim" version="0.0.1">
+    <description>leading or trailing characters</description>
+    <command interpreter="python">
+    trimmer.py -a -f $input1 -c $col -s $start -e $end -i $ignore $fastq > $out_file1
+    </command>
+    <inputs>
+        <param format="tabular,txt" name="input1" type="data" label="this dataset"/>
+        <param name="col" type="integer" value="0" label="Trim this column only" help="0 = process entire line" />
+        <param name="start" type="integer" size="10" value="1" label="Trim from the beginning to this position" help="1 = do not trim the beginning"/>
+        <param name="end" type="integer" size="10" value="0" label="Remove everything from this position to the end" help="0 = do not trim the end"/>
+        <param name="fastq" type="select" label="Is input dataset in fastq format?" help="If set to YES, the tool will not trim evenly numbered lines (0, 2, 4, etc...)">
+            <option selected="true" value="">No</option>
+            <option value="-q">Yes</option>
+        </param>
+        <param name="ignore" type="select" display="checkboxes" multiple="True" label="Ignore lines beginning with these characters" help="lines beginning with these are not trimmed">
+            <option value="62">&gt;</option>
+            <option value="64">@</option>
+            <option value="43">+</option>
+            <option value="60">&lt;</option>
+            <option value="42">*</option>
+            <option value="45">-</option>
+            <option value="61">=</option>
+            <option value="124">|</option>
+            <option value="63">?</option>
+            <option value="36">$</option>
+            <option value="46">.</option>
+            <option value="58">:</option>
+            <option value="38">&amp;</option>
+            <option value="37">%</option>
+            <option value="94">^</option>
+            <option value="35">&#35;</option>
+         </param>   
+    </inputs>
+    <outputs>
+        <data name="out_file1" format="input" metadata_source="input1"/>
+    </outputs>
+    <tests>
+        <test>
+           <param name="input1" value="trimmer_tab_delimited.dat"/>
+           <param name="col" value="0"/>
+           <param name="start" value="1"/>
+           <param name="end" value="13"/>
+           <param name="ignore" value="62"/>
+           <param name="fastq" value="No"/>
+           <output name="out_file1" file="trimmer_a_f_c0_s1_e13_i62.dat"/>
+        </test>
+        <test>
+           <param name="input1" value="trimmer_tab_delimited.dat"/>
+           <param name="col" value="2"/>
+           <param name="start" value="1"/>
+           <param name="end" value="2"/>
+           <param name="ignore" value="62"/>
+           <param name="fastq" value="No"/>
+           <output name="out_file1" file="trimmer_a_f_c2_s1_e2_i62.dat"/>
+        </test>
+
+    </tests>
+
+    <help>
+
+
+**What it does**
+
+Trims specified number of characters from a dataset or its field (if dataset is tab-delimited).
+
+-----
+
+**Example 1**
+
+Trimming this dataset::
+
+  1234567890
+  abcdefghijk
+
+by setting **Trim from the beginning to this position** to *2* and **Remove everything from this position to the end** to *6* will produce::
+
+  23456
+  bcdef
+
+-----
+
+**Example 2**
+
+Trimming column 2 of this dataset::
+
+  abcde 12345 fghij 67890
+  fghij 67890 abcde 12345
+
+by setting **Trim this column only** to *2*, **Trim from the beginning to this position** to *2*, and **Remove everything from this position to the end** to *4* will produce::
+
+  abcde  234 fghij 67890
+  fghij  789 abcde 12345
+
+-----
+
+**Trimming FASTQ datasets**
+
+This tool can be used to trim sequences and quality strings in fastq datasets. This is done by selecting *Yes* from the **Is input dataset in fastq format?** dropdown. If set to *Yes*, the tool will skip all even-numbered lines (see warning below). For example, trimming the last 5 bases of this dataset::
+
+  @081017-and-081020:1:1:1715:1759
+  GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC
+  +
+  II#IIIIIII$5+.(9IIIIIII$%*$G$A31I&amp;&amp;B
+  
+can be done by setting **Remove everything from this position to the end** to 31::
+
+  @081017-and-081020:1:1:1715:1759
+  GGACTCAGATAGTAATCCACGCTCCTTTAAA
+  +
+  II#IIIIIII$5+.(9IIIIIII$%*$G$A3 
+  
+**Note** that headers are skipped.
+
+.. class:: warningmark
+
+**WARNING:** This tool will only work on properly formatted fastq datasets where (1) each read and quality string occupies one line and (2) '@' (read header) and '+' (quality header) lines are even-numbered (0, 2, 4, ...) as in the above example.
+
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/ucsc_gene_bed_to_exon_bed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/ucsc_gene_bed_to_exon_bed.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+
+"""
+Read a table dump in the UCSC gene table format and print a tab separated
+list of intervals corresponding to requested features of each gene.
+
+usage: ucsc_gene_table_to_intervals.py [options]
+
+options:
+  -h, --help                  show this help message and exit
+  -rREGION, --region=REGION
+                              Limit to region: one of coding, utr3, utr5, codon, intron, transcribed [default]
+  -e, --exons                 Only print intervals overlapping an exon
+  -i, --input=inputfile       input file
+  -o, --output=outputfile     output file
+"""
+
+import optparse, string, sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+
+    # Parse command line    
+    parser = optparse.OptionParser( usage="%prog [options] " )
+    parser.add_option( "-r", "--region", dest="region", default="transcribed",
+                       help="Limit to region: one of coding, utr3, utr5, transcribed [default]" )
+    parser.add_option( "-e", "--exons",  action="store_true", dest="exons",
+                       help="Only print intervals overlapping an exon" )
+    parser.add_option( "-s", "--strand",  action="store_true", dest="strand",
+                       help="Print strand after interval" )
+    parser.add_option( "-i", "--input",  dest="input",  default=None,
+                       help="Input file" )
+    parser.add_option( "-o", "--output", dest="output", default=None,
+                       help="Output file" )
+    options, args = parser.parse_args()
+    assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed', 'intron', 'codon' ), "Invalid region argument"
+    
+    try:
+        out_file = open (options.output,"w")
+    except:
+        print >> sys.stderr, "Bad output file."
+        sys.exit(0)
+    
+    try:
+        in_file = open (options.input)
+    except:
+        print >> sys.stderr, "Bad input file."
+        sys.exit(0)
+    
+    print "Region:", options.region+";"
+    """print "Only overlap with Exons:",
+    if options.exons:
+        print "Yes"
+    else:
+        print "No"
+    """
+    
+    # Read table and handle each gene
+    for line in in_file:
+        try:
+            if line[0:1] == "#":
+                continue
+            # Parse fields from gene table
+            fields = line.split( '\t' )
+            chrom     = fields[0]
+            tx_start  = int( fields[1] )
+            tx_end    = int( fields[2] )
+            name      = fields[3]
+            strand    = fields[5].replace(" ","_")
+            cds_start = int( fields[6] )
+            cds_end   = int( fields[7] )
+
+            # Determine the subset of the transcribed region we are interested in
+            if options.region == 'utr3':
+                if strand == '-': region_start, region_end = tx_start, cds_start
+                else: region_start, region_end = cds_end, tx_end 
+            elif options.region == 'utr5':
+                if strand == '-': region_start, region_end = cds_end, tx_end
+                else: region_start, region_end = tx_start, cds_start
+            elif options.region == 'coding' or options.region == 'codon':
+                region_start, region_end = cds_start, cds_end
+            else:
+                region_start, region_end = tx_start, tx_end
+
+            # If only interested in exons, print the portion of each exon overlapping
+            # the region of interest, otherwise print the span of the region
+        # options.exons is always TRUE
+            if options.exons:
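+                # blockStarts (field 12) are offsets from chromStart and
+                # blockSizes (field 11) are lengths; convert both into
+                # absolute start and end coordinates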
+                exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) )
+                exon_starts = map((lambda x: x + tx_start ), exon_starts)
+                exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) )
+                exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends)
+
+        #for Intron regions:
+            if options.region == 'intron':
+                i=0
+                while i < len(exon_starts)-1:
+                    intron_starts = exon_ends[i]
+                    intron_ends = exon_starts[i+1]
+                    if strand: print_tab_sep(out_file, chrom, intron_starts, intron_ends, name, "0", strand )
+                    else: print_tab_sep(out_file, chrom, intron_starts, intron_ends )
+                    i+=1
+        #for non-intron regions:
+            else:
+                for start, end in zip( exon_starts, exon_ends ):
+                    start = max( start, region_start )
+                    end = min( end, region_end )
+                    if start < end:
+                        if options.region == 'codon':
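+                            # advance start to the next codon boundary
+                            # relative to the CDS start, then emit one
+                            # interval per complete codon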
+                            start += (3 - ((start-region_start)%3))%3
+                            c_start = start 
+                            while c_start+3 <= end:
+                                if strand:
+                                    print_tab_sep(out_file, chrom, c_start, c_start+3, name, "0", strand )
+                                else:
+                                    print_tab_sep(out_file, chrom, c_start, c_start+3)
+                                c_start += 3
+                        else:
+                            if strand:
+                                print_tab_sep(out_file, chrom, start, end, name, "0", strand )
+                            else: 
+                                print_tab_sep(out_file, chrom, start, end )
+                    """
+                    else:
+                        if options.region == 'codon':
+                            c_start = start
+                            c_end = end
+                            if c_start > c_end:
+                                t = c_start
+                                c_start = c_end
+                                c_end = t
+                            while c_start+3 <= c_end:
+                                if strand:
+                                    print_tab_sep(out_file, chrom, c_start, c_start+3, name, "0", strand )
+                                else:
+                                    print_tab_sep(out_file, chrom, c_start, c_start+3)
+                                c_start += 3
+                        else:
+                            if strand:
+                                print_tab_sep(out_file, chrom, region_start, region_end, name, "0", strand )
+                            else: 
+                                print_tab_sep(out_file, chrom, region_start, region_end )
+                    """
+        except:
+            continue
+
+def print_tab_sep(out_file, *args ):
+    """Print items in `l` to stdout separated by tabs"""
+    print >>out_file, string.join( [ str( f ) for f in args ], '\t' )
+
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/ucsc_gene_bed_to_exon_bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/ucsc_gene_bed_to_exon_bed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,78 @@
+<tool id="gene2exon1" name="Gene BED To Exon/Intron/Codon BED">
+<description>expander</description>
+  <command interpreter="python">ucsc_gene_bed_to_exon_bed.py --input=$input1 --output=$out_file1 --region=$region "--exons"</command>
+  <inputs>
+    <param name="region" type="select">
+      <label>Extract</label>
+      <option value="transcribed">Coding Exons + UTR Exons</option>
+      <option value="coding">Coding Exons only</option>
+      <option value="utr5">5'-UTR Exons</option>
+      <option value="utr3">3'-UTR Exons</option>
+      <option value="intron">Introns</option>
+      <option value="codon">Codons</option>
+    </param>
+    <param name="input1" type="data" format="bed" label="from" help="this history item must contain a 12 field BED (see below)"/>
+  </inputs>
+  <outputs>
+    <data name="out_file1" format="bed"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="3.bed" /> 
+      <param name="region" value="transcribed" />
+      <output name="out_file1" file="cf-gene2exon.dat"/>
+    </test>
+  </tests>
+<help>
+
+.. class:: warningmark
+
+This tool works only on a BED file that contains at least 12 fields (see **Example** and **About formats** below).  The output will be empty if applied to a BED file with 3 or 6 fields.
+
+------
+
+**What it does**
+
+BED format can be used to represent a single gene in just one line, which contains the information about exons, coding sequence location (CDS), and positions of untranslated regions (UTRs).  This tool *unpacks* this information by converting a single line describing a gene into a collection of lines representing individual exons, introns, UTRs, etc. 
+
+-------
+
+**Example**
+
+Extracting **Coding Exons + UTR Exons** from the following two BED lines::
+
+    chr7 127475281 127491632 NM_000230 0 + 127486022 127488767 0 3 29,172,3225,    0,10713,13126
+    chr7 127486011 127488900 D49487    0 + 127486022 127488767 0 2 155,490,        0,2399
+
+will return::
+
+    chr7 127475281 127475310 NM_000230 0 +
+    chr7 127485994 127486166 NM_000230 0 +
+    chr7 127488407 127491632 NM_000230 0 +
+    chr7 127486011 127486166 D49487    0 +
+    chr7 127488410 127488900 D49487    0 +
+
+------
+
+.. class:: infomark
+
+**About formats**
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and additional optional ones. In the specific case of this tool the following fields must be present::
+
+    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+    4. name - The name of the BED line.
+    5. score - A score between 0 and 1000.
+    6. strand - Defines the strand - either '+' or '-'.
+    7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+    8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+    9. reserved - This should always be set to zero.
+   10. blockCount - The number of blocks (exons) in the BED line.
+   11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+   12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/ucsc_gene_bed_to_intron_bed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/ucsc_gene_bed_to_intron_bed.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+"""
+Read a table dump in the UCSC gene table format and print a tab separated
+list of intervals corresponding to requested features of each gene.
+
+usage: ucsc_gene_table_to_intervals.py [options]
+
+options:
+  -h, --help                  show this help message and exit
+  -rREGION, --region=REGION
+                              Limit to region: one of coding, utr3, utr5, transcribed [default]
+  -e, --exons                 Only print intervals overlapping an exon
+  -i, --input=inputfile       input file
+  -o, --output=outputfile     output file
+"""
+
+import optparse, string, sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+
+    # Parse command line    
+    parser = optparse.OptionParser( usage="%prog [options] " )
+    #parser.add_option( "-r", "--region", dest="region", default="transcribed",
+    #                   help="Limit to region: one of coding, utr3, utr5, transcribed [default]" )
+    #parser.add_option( "-e", "--exons",  action="store_true", dest="exons",
+    #                   help="Only print intervals overlapping an exon" )
+    parser.add_option( "-s", "--strand",  action="store_true", dest="strand",
+                       help="Print strand after interval" )
+    parser.add_option( "-i", "--input",  dest="input",  default=None,
+                       help="Input file" )
+    parser.add_option( "-o", "--output", dest="output", default=None,
+                       help="Output file" )
+    options, args = parser.parse_args()
+    #assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument"
+    
+    try:
+        out_file = open (options.output,"w")
+    except:
+        print >> sys.stderr, "Bad output file."
+        sys.exit(0)
+    
+    try:
+        in_file = open (options.input)
+    except:
+        print >> sys.stderr, "Bad input file."
+        sys.exit(0)
+    
+    #print "Region:", options.region+";"
+    #print "Only overlap with Exons:",
+    #if options.exons:
+    #    print "Yes"
+    #else:
+    #    print "No"
+    
+    # Read table and handle each gene
+    
+    for line in in_file:
+        try:
+            #print ("len: %d", len(line))
+            if line[0:1] == "#":
+                continue
+    
+            # Parse fields from gene tabls
+            fields = line.split( '\t' )
+            chrom     = fields[0]
+            tx_start  = int( fields[1] )
+            tx_end    = int( fields[2] )
+            name      = fields[3]
+            strand    = fields[5].replace(" ","_")
+            cds_start = int( fields[6] )
+            cds_end   = int( fields[7] )
+     
+     exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) )
+            exon_starts = map((lambda x: x + tx_start ), exon_starts)
+            exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) )
+            exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends);
+     
+     i=0
+     while i < len(exon_starts)-1:
+             intron_starts = exon_ends[i] + 1
+ intron_ends = exon_starts[i+1] - 1
+ if strand: print_tab_sep(out_file, chrom, intron_starts, intron_ends, name, "0", strand )
+                else: print_tab_sep(out_file, chrom, intron_starts, intron_ends )
+ i+=1
+            # If only interested in exons, print the portion of each exon overlapping
+            # the region of interest, otherwise print the span of the region
+            
+        except:
+            continue
+
+def print_tab_sep(out_file, *args ):
+    """Print items in `l` to stdout separated by tabs"""
+    print >>out_file, string.join( [ str( f ) for f in args ], '\t' )
+
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/ucsc_gene_bed_to_intron_bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/ucsc_gene_bed_to_intron_bed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,60 @@
+<tool id="gene2intron1" name="Gene BED To Intron BED">
+<description>expander</description>
+  <command interpreter="python">ucsc_gene_bed_to_intron_bed.py --input=$input1 --output=$out_file1</command>
+  <inputs>
+    <param name="input1" type="data" format="interval" label="UCSC Gene Table"/>
+    
+  </inputs>
+  <outputs>
+    <data name="out_file1" format="bed"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="3.bed" /> 
+      <output name="out_file1" file="cf-gene2intron.dat"/>
+    </test>
+  </tests>
+<help>
+
+**Syntax**
+
+This tool converts a UCSC gene BED format file to a list of BED format lines corresponding to the introns of each gene.
+
+- **BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and twelve additional optional ones::
+
+    The first three BED fields (required) are:
+    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+    The twelve additional BED fields (optional) are:
+    4. name - The name of the BED line.
+    5. score - A score between 0 and 1000.
+    6. strand - Defines the strand - either '+' or '-'.
+    7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+    8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+    9. reserved - This should always be set to zero.
+   10. blockCount - The number of blocks (exons) in the BED line.
+   11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+   12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+   13. expCount - The number of experiments.
+   14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount.
+   15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount.
+
+-----
+
+**Example**
+
+- A UCSC gene bed format file::
+
+    chr7 127475281 127491632 NM_000230 0 + 127486022 127488767 0 3 29,172,3225,    0,10713,13126
+    chr7 127486011 127488900 D49487    0 + 127486022 127488767 0 2 155,490,        0,2399
+
+- Converts the above file to a list of bed lines, which has the introns::
+
+    chr7 127475311 127485993 NM_000230 0 +
+    chr7 127486167 127488406 NM_000230 0 +
+    chr7 127486167 127488409 D49487    0 +
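+
+- For instance, the first intron of NM_000230 runs from one base past the end of exon 1 (127475281 + 29 = 127475310, so it starts at 127475311) to one base before the start of exon 2 (127475281 + 10713 - 1 = 127485993).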
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/ucsc_gene_table_to_intervals.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/ucsc_gene_table_to_intervals.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+"""
+Read a table dump in the UCSC gene table format and print a tab separated
+list of intervals corresponding to requested features of each gene.
+
+usage: ucsc_gene_table_to_intervals.py [options]
+
+options:
+  -h, --help                  show this help message and exit
+  -rREGION, --region=REGION
+                              Limit to region: one of coding, utr3, utr5, transcribed [default]
+  -e, --exons                 Only print intervals overlapping an exon
+  -i, --input=inputfile       input file
+  -o, --output=outputfile     output file
+"""
+
+import optparse, string, sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+
+    # Parse command line    
+    parser = optparse.OptionParser( usage="%prog [options] " )
+    parser.add_option( "-r", "--region", dest="region", default="transcribed",
+                       help="Limit to region: one of coding, utr3, utr5, transcribed [default]" )
+    parser.add_option( "-e", "--exons",  action="store_true", dest="exons",
+                       help="Only print intervals overlapping an exon" )
+    parser.add_option( "-s", "--strand",  action="store_true", dest="strand",
+                       help="Print strand after interval" )
+    parser.add_option( "-i", "--input",  dest="input",  default=None,
+                       help="Input file" )
+    parser.add_option( "-o", "--output", dest="output", default=None,
+                       help="Output file" )
+    options, args = parser.parse_args()
+    assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument"
+    
+    try:
+        out_file = open (options.output,"w")
+    except:
+        print >> sys.stderr, "Bad output file."
+        sys.exit(0)
+    
+    try:
+        in_file = open (options.input)
+    except:
+        print >> sys.stderr, "Bad input file."
+        sys.exit(0)
+    
+    print "Region:", options.region+";"
+    print "Only overlap with Exons:",
+    if options.exons:
+        print "Yes"
+    else:
+        print "No"
+    
+    # Read table and handle each gene
+    for line in in_file:
+        try:
+            if line[0:1] == "#":
+                continue
+            # Parse fields from the gene table
+            fields = line.split( '\t' )
+            name = fields[0]
+            chrom = fields[1]
+            strand = fields[2].replace(" ","_")
+            tx_start = int( fields[3] )
+            tx_end = int( fields[4] )
+            cds_start = int( fields[5] )
+            cds_end = int( fields[6] )
+
+            # Determine the subset of the transcribed region we are interested in
+            if options.region == 'utr3':
+                if strand == '-': region_start, region_end = tx_start, cds_start
+                else: region_start, region_end = cds_end, tx_end 
+            elif options.region == 'utr5':
+                if strand == '-': region_start, region_end = cds_end, tx_end
+                else: region_start, region_end = tx_start, cds_start
+            elif options.region == 'coding':
+                region_start, region_end = cds_start, cds_end
+            else:
+                region_start, region_end = tx_start, tx_end
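+            # Worked example (hypothetical coordinates): a '+' strand gene with
+            # tx_start=100, cds_start=150, cds_end=400, tx_end=450 gives
+            # utr5=(100,150), coding=(150,400), utr3=(400,450); on the '-'
+            # strand the utr5 and utr3 intervals are swapped.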
+
+            # If only interested in exons, print the portion of each exon overlapping
+            # the region of interest, otherwise print the span of the region
+            if options.exons:
+                exon_starts = map( int, fields[8].rstrip( ',\n' ).split( ',' ) )
+                exon_ends = map( int, fields[9].rstrip( ',\n' ).split( ',' ) )
+                for start, end in zip( exon_starts, exon_ends ):
+                    start = max( start, region_start )
+                    end = min( end, region_end )
+                    if start < end:
+                        if strand: print_tab_sep(out_file, chrom, start, end, name, "0", strand )
+                        else: print_tab_sep(out_file, chrom, start, end )
+            else:
+                if strand: print_tab_sep(out_file, chrom, region_start, region_end, name, "0", strand )
+                else: print_tab_sep(out_file, chrom, region_start, region_end )
+        except:
+            continue
+
+def print_tab_sep(out_file, *args ):
+    """Print items in `l` to stdout separated by tabs"""
+    print >>out_file, string.join( [ str( f ) for f in args ], '\t' )
+
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/ucsc_gene_table_to_intervals.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/ucsc_gene_table_to_intervals.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,25 @@
+<tool id="ucsc_gene_table_to_intervals1" name="Gene Table To BED">
+<description>Parse a UCSC Gene Table dump</description>
+  <command interpreter="python">ucsc_gene_table_to_intervals.py --input=$input1 --output=$out_file1 --region=$region $exon</command>
+  <inputs>
+    <param name="input1" type="data" format="inverval" label="UCSC Gene Table"/>
+    <param name="region" type="select">
+      <label>Feature Type</label>
+      <option value="transcribed">Transcribed</option>
+      <option value="coding">Coding</option>
+      <option value="utr3">3' UTR</option>
+      <option value="utr5">5' UTR</option>
+    </param>
+    <param name="exon" type="select">
+      <label>Only print intervals overlapping an exon</label>
+      <option value="">False</option>
+      <option value="--exons">True</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data name="out_file1" format="bed"/>
+  </outputs>
+<help>
+Read a table dump in the UCSC gene table format and create a BED file corresponding to the requested feature of each gene.
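+
+The column layout this tool expects (as read by the script above) is::
+
+  name  chrom  strand  txStart  txEnd  cdsStart  cdsEnd  exonCount  exonStarts  exonEnds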
+</help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/filters/uniq.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/uniq.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,117 @@
+# Filename: uniq.py
+# Author: Ian N. Schenck
+# Version: 19/12/2005
+#
+# This script accepts an input file, an output file, a column
+# delimiter, and a list of columns.  The script then grabs unique
+# lines based on the columns, and returns those records with a count
+# of occurrences of each unique column, inserted before the columns.
+#
+# This executes the command pipeline:
+#       cut -f $fields | sort | uniq -c
+#
+# -i            Input file
+# -o            Output file
+# -d            Delimiter
+# -c            Column list (Comma Separated)
+
+import sys
+import re
+import string
+import commands
+
+# This function is exceedingly useful, perhaps package for reuse?
+def getopts(argv):
+    opts = {}
+    while argv:
+        if argv[0][0] == '-':
+            opts[argv[0]] = argv[1]
+            argv = argv[2:]
+        else:
+            argv = argv[1:]
+    return opts
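+
+# For example (hypothetical argv):
+#   getopts( ['-i', 'in.tab', '-o', 'out.tab'] ) -> {'-i': 'in.tab', '-o': 'out.tab'}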
+
+def main():
+    args = sys.argv[1:]
+
+    try:
+        opts = getopts(args)
+    except IndexError:
+        print "Usage:"
+        print " -i        Input file"
+        print " -o        Output file"
+        print " -c        Column list (comma seperated)"
+        print " -d        Delimiter:"
+        print "                     T   Tab"
+        print "                     C   Comma"
+        print "                     D   Dash"
+        print "                     U   Underscore"
+        print "                     P   Pipe"
+        print "                     Dt  Dot"
+        print "                     Sp  Space"
+        return 0
+
+    outputfile = opts.get("-o")
+    if outputfile == None:
+        print "No output file specified."
+        return -1
+    
+    inputfile = opts.get("-i")
+    if inputfile == None:
+        print "No input file specified."
+        return -2
+
+    delim = opts.get("-d")
+    if delim == None:
+        print "Field delimiter not specified."
+        return -3
+
+    columns = opts.get("-c")
+    if columns == None or columns == 'None':
+        print "Columns not specified."
+        return -4
+
+    # All inputs have been specified at this point, now validate.
+    fileRegEx = re.compile("^[A-Za-z0-9./\-_]+$")
+    columnRegEx = re.compile("([0-9]{1,},?)+")
+
+    if not columnRegEx.match(columns):
+        print "Illegal column specification."
+        return -4
+    if not fileRegEx.match(outputfile):
+        print "Illegal output filename."
+        return -5
+    if not fileRegEx.match(inputfile):
+        print "Illegal input filename."
+        return -6
+
+    column_list = re.split(",",columns)
+    columns_for_display = ""
+    for col in column_list:
+        columns_for_display += "c"+col+", "
+
+    commandline = "cut "
+    # Set delimiter
+    if delim=='C':
+        commandline += "-d \",\" "
+    if delim=='D':
+        commandline += "-d \"-\" "
+    if delim=='U':
+        commandline += "-d \"_\" "
+    if delim=='P':
+        commandline += "-d \"|\" "
+    if delim=='Dt':
+        commandline += "-d \".\" "
+    if delim=='Sp':
+        commandline += "-d \" \" "
+
+    # set columns
+    commandline += "-f " + columns
+    commandline += " " + inputfile + " | sed s/\ //g | sort | uniq -c | sed s/^\ *// | tr \" \" \"\t\" > " + outputfile
+    errorcode, stdout = commands.getstatusoutput(commandline)
+    
+    print "Count of unique values in " + columns_for_display
+    return errorcode
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/uniq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/uniq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,75 @@
+<tool id="Count1" name="Count">
+  <description>occurrences of each record</description>
+  <command interpreter="python">uniq.py -i $input -o $out_file1 -c "$column" -d $delim</command>
+  <inputs>
+    <param name="input" type="data" format="tabular" label="from dataset" help="Dataset missing? See TIP below"/>
+    <param name="column" type="data_column" data_ref="input" multiple="True" numerical="False" label="Count occurrences of values in column(s)" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" />
+    <param name="delim" type="select" label="Delimited by">
+      <option value="T">Tab</option>
+      <option value="Sp">Whitespace</option>
+      <option value="Dt">Dot</option>
+      <option value="C">Comma</option>
+      <option value="D">Dash</option>
+      <option value="U">Underscore</option>
+      <option value="P">Pipe</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="1.bed"/>
+      <output name="out_file1" file="uniq_out.dat"/>
+      <param name="column" value="1"/>
+      <param name="delim" value="T"/>
+    </test>
+  </tests>
+  <help>
+  
+ .. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+This tool counts occurrences of unique values in selected column(s).
+
+- If multiple columns are selected, counting is performed on each unique group of all values in the selected columns.
+- The first column of the resulting dataset will be the count of unique values in the selected column(s) and will be followed by each value.
+
+-----
+
+**Example**
+
+- Input file::
+     
+       chr1   10  100  gene1
+       chr1  105  200  gene2
+       chr1  205  300  gene3
+       chr2   10  100  gene4
+       chr2 1000 1900  gene5
+       chr3   15 1656  gene6
+       chr4   10 1765  gene7
+       chr4   10 1765  gene8
+
+- Counting unique values in column c1 will result in::
+
+       3 chr1
+       2 chr2
+       1 chr3
+       2 chr4   
+
+- Counting unique values in the grouping of columns c2 and c3 will result in::
+
+       2    10    100
+       2    10    1765
+       1    1000  1900
+       1    105   200
+       1    15    1656
+       1    205   300
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/wc_gnu.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/wc_gnu.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,66 @@
+<tool id="wc_gnu" name="Line/Word/Character count">
+    <description>of a dataset</description>
+    <command>
+        #set $word_to_arg = { 'characters':'m', 'words':'w', 'lines':'l' }
+        #set $arg_order = [ 'lines', 'words', 'characters' ]
+        #if not isinstance( $options.value, list ):
+            #set $args = [ $options.value ]
+        #else:
+            #set $args = $options.value
+        #end if
+        #if $include_header.value:
+            echo "#${ "\t".join( [ i for i in $arg_order if i in $args ] ) }" &gt; $out_file1
+            &amp;&amp;
+        #end if
+        wc
+        #for $option in $args:
+           -${ word_to_arg[ str(option) ] }
+        #end for
+        $input1 | awk '{ print ${ '"\\t"'.join( [ "$%i" % ( i+1 ) for i in range( len( $args ) ) ] ) } }'
+        &gt;&gt; $out_file1
+    </command>
+    <inputs>
+        <param format="txt" name="input1" type="data" label="Text file"/>
+        <param name="options" type="select" multiple="True" display="checkboxes" label="Desired values">
+            <!-- <option value="bytes" selected="True">Byte count</option> -->
+            <option value="lines" selected="True">Line count</option>
+            <option value="words" selected="True">Word count</option>
+            <option value="characters" selected="True">Character count</option>
+            <validator type="no_options" message="You must pick at least one attribute to count." />
+        </param>
+        <param name="include_header" type="boolean" label="Include Output header" checked="True"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="out_file1"/>
+    </outputs>
+    <tests>
+        <test>
+          <param name="input1" value="1.bed"/>
+          <param name="options" value="lines,words,characters"/>
+          <param name="include_header" value="True"/>
+          <output name="out_file1" file="wc_gnu_out_1.tabular"/>
+        </test>
+        <test>
+          <param name="input1" value="1.bed"/>
+          <param name="options" value="lines,words,characters"/>
+          <param name="include_header" value="False"/>
+          <output name="out_file1" file="wc_gnu_out_2.tabular"/>
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
+This tool outputs counts of specified attributes (lines, words, characters) of a dataset. 
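+
+With all three attributes selected, the command assembled above is essentially (hypothetical filename)::
+
+  wc -l -w -m input.txt | awk '{ print $1"\t"$2"\t"$3 }'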
+
+-----
+
+**Example Output**
+
+::
+
+  #lines  words  characters
+  7499   41376  624971
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/wig_to_bigwig.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/wig_to_bigwig.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,75 @@
+<tool id="wig_to_bigWig" name="Wig-to-bigWig" version="1.1.0">
+  <description>converter</description>
+  <command>grep -v "^track" $input1 | wigToBigWig stdin $chromInfo $out_file1 
+    #if $settings.settingsType == "full":
+      -blockSize=${settings.blockSize} -itemsPerSlot=${settings.itemsPerSlot} ${settings.clip} ${settings.unc}
+    #else:
+      -clip
+    #end if
+    2&gt;&amp;1 || echo "Error running wigToBigWig." >&amp;2</command>
+  <requirements>
+    <requirement type="package">ucsc_tools</requirement>
+  </requirements>
+  <inputs>
+    <param format="wig" name="input1" type="data" label="Convert">
+      <validator type="unspecified_build" />
+    </param>
+    <conditional name="settings">
+      <param name="settingsType" type="select" label="Converter settings to use" help="Default settings should usually be used.">
+        <option value="preset">Default</option>
+        <option value="full">Full parameter list</option>
+      </param>
+      <when value="preset" />
+      <when value="full">
+        <param name="blockSize" size="4" type="integer" value="256" label="Items to bundle in r-tree" help="Default is 256 (blockSize)" />
+        <param name="itemsPerSlot" size="4" type="integer" value="1024" label="Data points bundled at lowest level" help="Default is 1024 (itemsPerSlot)" />
+        <param name="clip" type="boolean" truevalue="-clip" falsevalue="" checked="True" label="Clip chromosome positions" help="Issue warning messages rather than dying if wig file contains items off end of chromosome. (clip)"/>
+        <param name="unc" type="boolean" truevalue="-unc" falsevalue="" checked="False" label="Do not use compression" help="(unc)"/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="bigwig" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="2.wig" dbkey="hg17" />
+      <param name="settingsType" value="full" />
+      <param name="blockSize" value="256" />
+      <param name="itemsPerSlot" value="1024" />
+      <param name="clip" value="True" />
+      <param name="unc" value="False" />
+      <output name="out_file1" file="2.bigwig"/>
+    </test>
+    <test>
+      <param name="input1" value="2.wig" dbkey="hg17" />
+      <param name="settingsType" value="preset" />
+      <output name="out_file1" file="2.bigwig"/>
+    </test>
+  </tests>
+  <help>
+**Syntax**
+
+This tool converts wiggle data into bigWig type.
+
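+Under the default settings, the underlying command is essentially (with chrom.sizes standing in for the build's chromosome lengths file)::
+
+  grep -v "^track" input.wig | wigToBigWig stdin chrom.sizes output.bigwig -clip
+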
+- **Wiggle format**: The .wig format is line-oriented. Wiggle data is preceded by a UCSC track definition line.  Following the track definition line is the track data, which can be entered in three different formats described below.
+
+  - **BED format** with no declaration line and four columns of data::
+
+      chromA  chromStartA  chromEndA  dataValueA
+      chromB  chromStartB  chromEndB  dataValueB
+
+  - **variableStep** two column data; started by a declaration line and followed with chromosome positions and data values::
+
+      variableStep  chrom=chrN  [span=windowSize]
+      chromStartA  dataValueA
+      chromStartB  dataValueB
+
+  - **fixedStep** single column data; started by a declaration line and followed with data values::
+
+      fixedStep  chrom=chrN  start=position  step=stepInterval  [span=windowSize]
+      dataValue1
+      dataValue2
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/filters/wiggle_to_simple.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/wiggle_to_simple.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+"""
+Read a wiggle track and print out a series of tab separated lines containing
+chrom, start, end, strand and value. Ignores track lines; handles bed,
+variableStep and fixedStep wiggle lines.
+"""
+import sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.wiggle
+from galaxy.tools.exception_handling import *
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():
+    if len( sys.argv ) > 1:
+        in_file = open( sys.argv[1] )
+    else:
+        # sys.stdin is already an open file object; wrapping it in open() would fail
+        in_file = sys.stdin
+    
+    if len( sys.argv ) > 2:
+        out_file = open( sys.argv[2], "w" )
+    else:
+        out_file = sys.stdout
+    
+    try:
+        for fields in bx.wiggle.IntervalReader( UCSCOutWrapper( in_file ) ):
+            out_file.write( "%s\n" % "\t".join( map( str, fields ) ) )
+    except UCSCLimitException:
+        # Wiggle data was truncated, at the very least need to warn the user.
+        print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.'
+    except ValueError, e:
+        in_file.close()
+        out_file.close()
+        stop_err( str( e ) )
+
+    in_file.close()
+    out_file.close()
+
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/filters/wiggle_to_simple.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/wiggle_to_simple.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,88 @@
+<tool id="wiggle2simple1" name="Wiggle-to-Interval">
+  <description>converter</description>
+  <command interpreter="python">wiggle_to_simple.py $input $out_file1 </command>
+  <inputs>
+    <param format="wig" name="input" type="data" label="Convert"/>
+  </inputs>
+  <outputs>
+    <data format="interval" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="2.wig" />
+      <output name="out_file1" file="2.interval"/>
+    </test>
+    <test>
+      <param name="input" value="3.wig" />
+      <output name="out_file1" file="3_wig.bed"/>
+    </test>
+  </tests>
+  <help>
+**Syntax**
+
+This tool converts wiggle data into interval type.
+
+- **Wiggle format**: The .wig format is line-oriented. Wiggle data is preceded by a UCSC track definition line.  Following the track definition line is the track data, which can be entered in three different formats described below.
+
+  - **BED format** with no declaration line and four columns of data::
+
+      chromA  chromStartA  chromEndA  dataValueA
+      chromB  chromStartB  chromEndB  dataValueB
+
+  - **variableStep** two column data; started by a declaration line and followed with chromosome positions and data values::
+
+      variableStep  chrom=chrN  [span=windowSize]
+      chromStartA  dataValueA
+      chromStartB  dataValueB
+
+  - **fixedStep** single column data; started by a declaration line and followed with data values::
+
+      fixedStep  chrom=chrN  start=position  step=stepInterval  [span=windowSize]
+      dataValue1
+      dataValue2
+
+-----
+
+**Example**
+
+- input wiggle format file::
+
+    #track type=wiggle_0 name="Bed Format" description="BED format"
+    chr19 59302000 59302300 -1.0
+    chr19 59302300 59302600 -0.75
+    chr19 59302600 59302900 -0.50
+    chr19 59302900 59303200 -0.25
+    chr19 59303200 59303500 0.0
+    #track type=wiggle_0 name="variableStep" description="variableStep format"
+    variableStep chrom=chr19 span=150
+    59304701 10.0
+    59304901 12.5
+    59305401 15.0
+    59305601 17.5
+    #track type=wiggle_0 name="fixedStep" description="fixed step" visibility=full
+    fixedStep chrom=chr19 start=59307401 step=300 span=200
+    1000
+    900
+    800
+    700
+    600
+
+- convert the above file to interval file::
+
+    chr19 59302000 59302300 + -1.0
+    chr19 59302300 59302600 + -0.75
+    chr19 59302600 59302900 + -0.5
+    chr19 59302900 59303200 + -0.25
+    chr19 59303200 59303500 + 0.0
+    chr19 59304701 59304851 + 10.0
+    chr19 59304901 59305051 + 12.5
+    chr19 59305401 59305551 + 15.0
+    chr19 59305601 59305751 + 17.5
+    chr19 59307701 59307901 + 1000.0
+    chr19 59308001 59308201 + 900.0
+    chr19 59308301 59308501 + 800.0
+    chr19 59308601 59308801 + 700.0
+    chr19 59308901 59309101 + 600.0
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/galaxy-loc.tar.gz
Binary file tools/galaxy-loc.tar.gz has changed
diff -r 000000000000 -r 9071e359b9a3 tools/gatk/analyze_covariates.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/gatk/analyze_covariates.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,92 @@
+<tool id="gatk_analyze_covariates" name="Analyze Covariates" version="0.0.1">
+  <description>- plot residual error versus covariates</description>
+<command interpreter="python">gatk_wrapper.py
+   --stdout "${output_log}"
+   --html_report_from_directory "${output_html}" "${output_html.files_path}"
+   -p 'java 
+    -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/AnalyzeCovariates.jar"
+    -recalFile "${input_recal}"
+    -outputDir "${output_html.files_path}"
+    ##-log "${output_log}"
+    ##-Rscript,--path_to_Rscript path_to_Rscript; on path is good enough
+    -resources "${GALAXY_DATA_INDEX_DIR}/gatk/R"         
+    #if $analysis_param_type.analysis_param_type_selector == "advanced":
+        --ignoreQ "${analysis_param_type.ignore_q}"
+        --numRG "${analysis_param_type.num_read_groups}"
+        --max_quality_score "${analysis_param_type.max_quality_score}"
+        --max_histogram_value "${analysis_param_type.max_histogram_value}"
+         ${analysis_param_type.do_indel_quality}
+    #end if
+   '
+  </command>
+  <inputs>
+    <param name="input_recal" type="data" format="csv" label="Covariates table recalibration file" />
+    <conditional name="analysis_param_type">
+      <param name="analysis_param_type_selector" type="select" label="Basic or Advanced options">
+        <option value="basic" selected="True">Basic</option>
+        <option value="advanced">Advanced</option>
+      </param>
+      <when value="basic">
+        <!-- Do nothing here -->
+      </when>
+      <when value="advanced">
+        <param name="ignore_q" type="integer" value="5" label="Ignore bases with reported quality less than this number."/>
+        <param name="num_read_groups" type="integer" value="-1" label="Only process N read groups."/>
+        <param name="max_quality_score" type="integer" value="50" label="Max quality score"/>
+        <param name="max_histogram_value" type="integer" value="0" label="Max quality score"/>
+        <param name="do_indel_quality" type="boolean" truevalue="--do_indel_quality" falsevalue="" label="Max quality score"/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="html" name="output_html" label="${tool.name} on ${on_string} (HTML)" />
+    <data format="txt" name="output_log" label="${tool.name} on ${on_string} (log)" />
+  </outputs>
+  <tests>
+      <test>
+          <param name="input_recal" value="gatk/gatk_count_covariates/gatk_count_covariates_out_1.csv" ftype="csv" /> 
+          <param name="analysis_param_type_selector" value="basic" />
+          <output name="output_html" file="gatk/gatk_analyze_covariates/gatk_analyze_covariates_out_1.html" />
+          <output name="output_log" file="gatk/gatk_analyze_covariates/gatk_analyze_covariates_out_1.log.contains" compare="contains" />
+      </test>
+  </tests>
+  <help>
+**What it does**
+
+     Create collapsed versions of the recal csv file and call R scripts to plot residual error versus the various covariates.
+
+
+------
+
+Please cite the website "http://addlink.here" as well as:
+
+Add citation here 2011.
+
+------
+
+**Input formats**
+
+GenomeAnalysisTK: AnalyzeCovariates accepts a recal CSV file.
+
+------
+
+**Outputs**
+
+The output is an HTML file with links to PDF graphs and data files, see http://addlink.here for more details.
+
+-------
+
+**Settings**::
+
+ recal_file            The input recal csv file to analyze
+ output_dir            The directory in which to output all the plots and intermediate data files
+ path_to_Rscript       The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript
+ path_to_resources     Path to resources folder holding the Sting R scripts.
+ ignoreQ               Ignore bases with reported quality less than this number.
+ numRG                 Only process N read groups. Default value: -1 (process all read groups)
+ max_quality_score     The integer value at which to cap the quality scores, default is 50
+ max_histogram_value   If supplied, this value will be the max value of the histogram plots
+ do_indel_quality      If supplied, include indel quality scores in the analysis
+
+  </help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/gatk/count_covariates.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/gatk/count_covariates.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,431 @@
+<tool id="gatk_count_covariates" name="Count Covariates" version="0.0.1">
+  <description>on BAM files</description>
+  <command interpreter="python">gatk_wrapper.py
+   --stdout "${output_log}"
+   -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input"
+   -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index
+   -p 'java
+    -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar"
+    -T "CountCovariates"
+    --num_threads 4 ##hard coded, for now
+    -et "NO_ET" ##ET no phone home
+    ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout
+    #if $reference_source.reference_source_selector != "history":
+        -R "${reference_source.ref_file.fields.path}"
+    #end if
+    --recal_file "${output_recal}"
+    ${standard_covs}
+    #if $covariates.value:
+        #for $cov in $covariates.value:
+            -cov "${cov}"
+        #end for
+    #end if
+   '
+
+    #set $snp_dataset_provided = False
+    #if str( $input_dbsnp_rod ) != "None":
+        -d "-D" "${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod"
+        #set $snp_dataset_provided = True
+    #end if
+    #set $rod_binding_names = dict()
+    #for $rod_binding in $rod_bind:
+        #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom':
+            #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name
+        #else
+            #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector
+        #end if
+        #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'snps':
+            #set $snp_dataset_provided = True
+        #end if
+        #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1
+        -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+        #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ):
+            -p '--rodToIntervalTrackName "${rod_bind_name}"'
+        #end if
+    #end for
+
+    ##start standard gatk options
+    #if $gatk_param_type.gatk_param_type_selector == "advanced":
+        #for $sample_metadata in $gatk_param_type.sample_metadata:
+            -p '--sample_metadata "${sample_metadata.sample_metadata_file}"'
+        #end for
+        #for $read_filter in $gatk_param_type.read_filter:
+            -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}"
+            ###raise Exception( str( dir( $read_filter ) ) )
+            #for $name, $param in $read_filter.read_filter_type.iteritems():
+                #if $name not in [ "__current_case__", "read_filter_type_selector" ]:
+                    --${name} "${param}"
+                #end if
+            #end for
+            '
+        #end for
+        #if str( $gatk_param_type.input_intervals ) != "None":
+            -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals"
+        #end if
+        #if str( $gatk_param_type.input_exclude_intervals ) != "None":
+            -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals"
+        #end if
+
+        -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"'
+        -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"'
+        #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE":
+            -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"'
+        #end if
+        -p '
+        --baq "${gatk_param_type.baq}"
+        --baqGapOpenPenalty "${gatk_param_type.baq_gap_ope
[...]
over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation."
+However, if you do not provide this file, the '--run_without_dbsnp_potentially_ruining_quality' flag will be automatically used, and the command will be allowed to run.
+
+**What it does**
+
+     This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal
+     operating only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors
+     and indicative of poor base quality. This walker generates tables based on various user-specified covariates (such
+     as read group, reported quality score, cycle, and dinucleotide) Since there is a large amount of data one can then
+     calculate an empirical probability of error given the particular covariates seen at this site, where p(error) = num
+     mismatches / num observations The output file is a CSV list of (the several covariate values, num observations, num
+     mismatches, empirical quality score) The first non-comment line of the output file gives the name of the covariates
+     that were used for this calculation.  Note: ReadGroupCovariate and QualityScoreCovariate are required covariates
+     and will be added for the user regardless of whether or not they were specified Note: This walker is designed to be
+     used in conjunction with TableRecalibrationWalker.
+
+------
+
+Please cite the website "http://addlink.here" as well as:
+
+Add citation here 2011.
+
+------
+
+**Input formats**
+
+GenomeAnalysisTK: CountCovariates accepts an aligned BAM input file.
+
+------
+
+**Outputs**
+
+The output is in CSV format, see http://addlink.here for more details.
+
+-------
+
+**Settings**::
+
+ default_read_group      If a read has no read group then default to the provided String.
+ default_platform        If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.
+ force_read_group        If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.
+ force_platform          If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.
+ window_size_nqs         The window size used by MinimumNQSCovariate for its calculation
+ homopolymer_nback       The number of previous bases to look at in HomopolymerCovariate
+ exception_if_no_tile    If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1
+ solid_recal_mode        How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS
+ solid_nocall_strategy   Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ
+ recal_file              Filename for the input covariates table recalibration .csv file
+ out                     The output CSV file
+ recal_file              Filename for the outputted covariates table recalibration file
+ standard_covs           Use the standard set of covariates in addition to the ones listed using the -cov argument
+ run_without_dbsnp_potentially_ruining_quality   If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/gatk/gatk_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/gatk/gatk_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+
+"""
+A wrapper script for running the GenomeAnalysisTK.jar commands.
+"""
+
+import sys, optparse, os, tempfile, subprocess, shutil
+from string import Template
+
+GALAXY_EXT_TO_GATK_EXT = { 'gatk_interval':'intervals', 'bam_index':'bam.bai', 'gatk_dbsnp':'dbsnp', 'picard_interval_list':'interval_list' } #items not listed here, will use the galaxy extension as-is
+GALAXY_EXT_TO_GATK_FILE_TYPE = GALAXY_EXT_TO_GATK_EXT #for now, these are the same, but could be different if needed
+DEFAULT_GATK_PREFIX = "gatk_file"
+CHUNK_SIZE = 2**20 #1mb
+
+
+def cleanup_before_exit( tmp_dir ):
+    if tmp_dir and os.path.exists( tmp_dir ):
+        shutil.rmtree( tmp_dir )
+
+def gatk_filename_from_galaxy( galaxy_filename, galaxy_ext, target_dir = None, prefix = None ):
+    suffix = GALAXY_EXT_TO_GATK_EXT.get( galaxy_ext, galaxy_ext )
+    if prefix is None:
+        prefix = DEFAULT_GATK_PREFIX
+    if target_dir is None:
+        target_dir = os.getcwd()
+    gatk_filename = os.path.join( target_dir, "%s.%s" % ( prefix, suffix ) )
+    os.symlink( galaxy_filename, gatk_filename )
+    return gatk_filename
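+
+# For example (hypothetical paths): gatk_filename_from_galaxy(
+#     '/galaxy/files/dataset_1.dat', 'bam_index', '/tmp/job', 'gatk_input' )
+# symlinks the dataset to /tmp/job/gatk_input.bam.bai, the extension GATK expects.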
+
+def gatk_filetype_argument_substitution( argument, galaxy_ext ):
+    return argument % dict( file_type = GALAXY_EXT_TO_GATK_FILE_TYPE.get( galaxy_ext, galaxy_ext ) )
+
+def open_file_from_option( filename, mode = 'rb' ):
+    if filename:
+        return open( filename, mode = mode )
+    return None
+
+def html_report_from_directory( html_out, dir ):
+    html_out.write( '<html>\n<head>\n<title>Galaxy - GATK Output</title>\n</head>\n<body>\n<p/>\n<ul>\n' )
+    for fname in sorted( os.listdir( dir ) ):
+        html_out.write(  '<li><a href="%s">%s</a></li>\n' % ( fname, fname ) )
+    html_out.write( '</ul>\n</body>\n</html>\n' )
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-p', '--pass_through', dest='pass_through_options', action='append', type="string", help='These options are passed through directly to GATK, without any modification.' )
+    parser.add_option( '-d', '--dataset', dest='datasets', action='append', type="string", nargs=4, help='"-argument" "original_filename" "galaxy_filetype" "name_prefix"' )
+    parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' )
+    parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' )
+    parser.add_option( '', '--html_report_from_directory', dest='html_report_from_directory', action='append', type="string", nargs=2, help='"Target HTML File" "Directory"')
+    (options, args) = parser.parse_args()
+    
+    tmp_dir = tempfile.mkdtemp()
+    if options.pass_through_options:
+        cmd = ' '.join( options.pass_through_options )
+    else:
+        cmd = ''
+    if options.datasets:
+        for ( dataset_arg, filename, galaxy_ext, prefix ) in options.datasets:
+            gatk_filename = gatk_filename_from_galaxy( filename, galaxy_ext, target_dir = tmp_dir, prefix = prefix )
+            if dataset_arg:
+                cmd = '%s %s "%s"' % ( cmd, gatk_filetype_argument_substitution( dataset_arg, galaxy_ext ), gatk_filename )
+    #set up stdout and stderr output options
+    stdout = open_file_from_option( options.stdout, mode = 'wb' )
+    stderr = open_file_from_option( options.stderr, mode = 'wb' )
+    #if no stderr file is specified, we'll use our own
+    if stderr is None:
+        stderr = tempfile.NamedTemporaryFile( dir=tmp_dir )
+        stderr.close()
+        stderr = open( stderr.name, 'w+b' )
+    
+    proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir )
+    return_code = proc.wait()
+    
+    if return_code:
+        stderr_target = sys.stderr
+    else:
+        stderr_target = sys.stdout
+    stderr.flush()
+    stderr.seek(0)
+    while True:
+        chunk = stderr.read( CHUNK_SIZE )
+        if chunk:
+            stderr_target.write( chunk )
+        else:
+            break
+    stderr.close()
+    #generate html reports
+    if options.html_report_from_directory:
+        for ( html_filename, html_dir ) in options.html_report_from_directory:
+            html_report_from_directory( open( html_filename, 'wb' ), html_dir )
+    
+    cleanup_before_exit( tmp_dir )
+
+if __name__=="__main__": __main__()
b
diff -r 000000000000 -r 9071e359b9a3 tools/gatk/indel_realigner.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/gatk/indel_realigner.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,332 @@
+<tool id="gatk_indel_realigner" name="Indel Realigner" version="0.0.1">
+  <description>- perform local realignment</description>
+  <command interpreter="python">gatk_wrapper.py
+   --stdout "${output_log}"
+   -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input"
+   -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index
+   -p 'java
+    -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar"
+    -T "IndelRealigner"
+    ##-quiet ##this appears to have no effect...confirmed by gatk programmers
+    -o "${output_bam}"
+    -et "NO_ET" ##ET no phone home
+    ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout
+    #if $reference_source.reference_source_selector != "history":
+        -R "${reference_source.ref_file.fields.path}"
+    #end if
+   -LOD "${lod_threshold}"
+    ${knowns_only}
+   '
+
+    #set $rod_binding_names = dict()
+    #if str( $input_dbsnp_rod ) != "None":
+        -d "-D" "${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod"
+    #end if
+    #for $rod_binding in $rod_bind:
+        #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom':
+            #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name
+        #else
+            #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector
+        #end if
+        #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1
+        -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+        #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ):
+            -p '--rodToIntervalTrackName "${rod_bind_name}"'
+        #end if
+    #end for
+
+    ##start standard gatk options
+    #if $gatk_param_type.gatk_param_type_selector == "advanced":
+        #for $sample_metadata in $gatk_param_type.sample_metadata:
+            -p '--sample_metadata "${sample_metadata.sample_metadata_file}"'
+        #end for
+        #for $read_filter in $gatk_param_type.read_filter:
+            -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}"
+            ###raise Exception( str( dir( $read_filter ) ) )
+            #for $name, $param in $read_filter.read_filter_type.iteritems():
+                #if $name not in [ "__current_case__", "read_filter_type_selector" ]:
+                    --${name} "${param}"
+                #end if
+            #end for
+            '
+        #end for
+        #if str( $gatk_param_type.input_intervals ) != "None":
+            -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals"
+        #end if
+        #if str( $gatk_param_type.input_exclude_intervals ) != "None":
+            -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals"
+        #end if
+        -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"'
+        -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"'
+        #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE":
+            -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"'
+        #end if
+        -p '
+        --baq "${gatk_param_type.baq}"
+        --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}"
+        ${gatk_param_type.use_original_qualities}
+        --defaultBaseQualities "${gatk_param_type.default_base_qualities}"
+        --validation_strictness "${gatk_param_type.validation_strictness}"
+        --interval_merging "${gatk_param_type.interval_merging}"
+        '
+        #if
[...]
/>
+          <param name="rod_bind_type_selector" value="snps" />
+          <param name="rodToIntervalTrackName" />
+          <param name="input_rod" value="gatk/fake_phiX_variant_locations.bed" ftype="bed" />
+          <param name="lod_threshold" value="5.0" />
+          <param name="knowns_only" />
+          <param name="gatk_param_type_selector" value="basic" />
+          <param name="analysis_param_type_selector" value="basic" />
+          <output name="output_bam" file="gatk/gatk_indel_realigner/gatk_indel_realigner_out_1.bam" ftype="bam" lines_diff="2" />
+          <output name="output_log" file="gatk/gatk_indel_realigner/gatk_indel_realigner_out_1.log.contains" compare="contains" />
+      </test>
+  </tests>
+  <help>
+**What it does**
+
+     Performs local realignment of reads based on misalignments due to the presence of indels. Unlike most mappers, this
+     walker uses the full alignment context to determine whether an appropriate alternate reference (i.e. indel) exists
+     and updates SAMRecords accordingly.
+
+------
+
+Please cite the website "http://addlink.here" as well as:
+
+Add citation here 2011.
+
+------
+
+**Input formats**
+
+GenomeAnalysisTK: IndelRealigner accepts an aligned BAM and a list of intervals to realign as input files.
+
+------
+
+**Outputs**
+
+The output is in the BAM format, see http://addlink.here for more details.
+
+-------
+
+**Settings**::
+
+ targetIntervals              intervals file output from RealignerTargetCreator
+ LODThresholdForCleaning      LOD threshold above which the cleaner will clean
+ entropyThreshold             percentage of mismatches at a locus to be considered having high entropy
+ out                          Output bam
+ bam_compression              Compression level to use for writing BAM files
+ disable_bam_indexing         Turn off on-the-fly creation of indices for output BAM files.
+ simplifyBAM                  If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well stripping all extended tags from the kept reads except the read group identifier
+ useOnlyKnownIndels           Don't run 'Smith-Waterman' to generate alternate consenses; use only known indels provided as RODs for constructing the alternate references.
+ maxReadsInMemory             max reads allowed to be kept in memory at a time by the SAMFileWriter. Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage.  If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory).
+ maxIsizeForMovement          maximum insert size of read pairs that we attempt to realign
+ maxPositionalMoveAllowed     maximum positional move in basepairs that a read can be adjusted during realignment
+ maxConsensuses               max alternate consensuses to try (necessary to improve performance in deep coverage)
+ maxReadsForConsensuses       max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)
+ maxReadsForRealignment       max reads allowed at an interval for realignment; if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is
+ noOriginalAlignmentTags      Don't output the original cigar or alignment start tags for each realigned read in the output bam.
+ targetIntervalsAreNotSorted  This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception.  Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/gatk/realigner_target_creator.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/gatk/realigner_target_creator.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,296 @@
+<tool id="gatk_realigner_target_creator" name="Realigner Target Creator" version="0.0.1">
+  <description>for use in local realignment</description>
+  <command interpreter="python">gatk_wrapper.py
+   --stdout "${output_log}"
+   -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input"
+   -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index
+   -p 'java
+    -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar"
+    -T "RealignerTargetCreator"
+    -o "${output_interval}"
+    -et "NO_ET" ##ET no phone home
+    ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout
+    #if $reference_source.reference_source_selector != "history":
+        -R "${reference_source.ref_file.fields.path}"
+    #end if
+   '
+    #set $rod_binding_names = dict()
+    #if str( $input_dbsnp_rod ) != "None":
+        -d "-D" "${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod"
+    #end if
+    #for $rod_binding in $rod_bind:
+        #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom':
+            #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name
+        #else
+            #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector
+        #end if
+        #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1
+        -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+        #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ):
+            -p '--rodToIntervalTrackName "${rod_bind_name}"'
+        #end if
+    #end for
+
+    ##start standard gatk options
+    #if $gatk_param_type.gatk_param_type_selector == "advanced":
+        #for $sample_metadata in $gatk_param_type.sample_metadata:
+            -p '--sample_metadata "${sample_metadata.sample_metadata_file}"'
+        #end for
+        #for $read_filter in $gatk_param_type.read_filter:
+            -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}"
+            ###raise Exception( str( dir( $read_filter ) ) )
+            #for $name, $param in $read_filter.read_filter_type.iteritems():
+                #if $name not in [ "__current_case__", "read_filter_type_selector" ]:
+                    --${name} "${param}"
+                #end if
+            #end for
+            '
+        #end for
+        #if str( $gatk_param_type.input_intervals ) != "None":
+            -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals"
+        #end if
+        #if str( $gatk_param_type.input_exclude_intervals ) != "None":
+            -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals"
+        #end if
+
+        -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"'
+
+        -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"'
+        #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE":
+            -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"'
+        #end if
+        -p '
+        --baq "${gatk_param_type.baq}"
+        --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}"
+        ${gatk_param_type.use_original_qualities}
+        --defaultBaseQualities "${gatk_param_type.default_base_qualities}"
+        --validation_strictness "${gatk_param_type.validation_strictness}"
+        --interval_merging "${gatk_param_type.interval_merging}"
+        '
+        #if str( $gatk_param_type.read_group_black_list ) != "None":
+            -d "-read_group_bl
[...]
tag" />
+        <param name="default_base_qualities" type="integer" label="Value to be used for all base quality scores, when some are missing" value="-1"/>
+        <param name="validation_strictness" type="select" label="How strict should we be with validation">
+          <option value="STRICT" selected="True">STRICT</option>
+          <option value="LENIENT">LENIENT</option>
+          <option value="SILENT">SILENT</option>
+        </param>
+        <param name="interval_merging" type="select" label="Interval merging rule">
+          <option value="ALL" selected="True">ALL</option>
+          <option value="OVERLAPPING_ONLY">OVERLAPPING_ONLY</option>
+        </param>
+        <param name="read_group_black_list" type="data" format="txt" optional="True" label="Read group black list" />
+      </when>
+    </conditional>
+
+    <conditional name="analysis_param_type">
+      <param name="analysis_param_type_selector" type="select" label="Basic or Advanced Analysis options">
+        <option value="basic" selected="True">Basic</option>
+        <option value="advanced">Advanced</option>
+      </param>
+      <when value="basic">
+        <!-- Do nothing here -->
+      </when>
+      <when value="advanced">
+        <param name="windowSize" type="integer" value="10" label="Window size for calculating entropy or SNP clusters (windowSize)" />
+        <param name="mismatchFraction" type="float" value="0.15" label="Fraction of base qualities needing to mismatch for a position to have high entropy (mismatchFraction)" help="to disable set to &lt;= 0 or &gt; 1"/>
+        <param name="minReadsAtLocus" type="integer" value="4" label="Minimum reads at a locus to enable using the entropy calculation (minReadsAtLocus)" />
+        <param name="maxIntervalSize" type="integer" value="500" label="Maximum interval size" />
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="gatk_interval" name="output_interval" label="${tool.name} on ${on_string} (GATK intervals)" />
+    <data format="txt" name="output_log" label="${tool.name} on ${on_string} (log)" />
+  </outputs>
+  <tests>
+      <test>
+          <param name="reference_source_selector" value="history" />
+          <param name="ref_file" value="phiX.fasta" ftype="fasta" />
+          <param name="input_bam" value="gatk/fake_phiX_reads_1.bam" ftype="bam" />
+          <param name="input_dbsnp_rod"  />
+          <param name="rod_bind_type_selector" value="snps" />
+          <param name="rodToIntervalTrackName" />
+          <param name="input_rod" value="gatk/fake_phiX_variant_locations.bed" ftype="bed" />
+          <param name="gatk_param_type_selector" value="basic" />
+          <param name="analysis_param_type_selector" value="basic" />
+          <output name="output_interval" file="gatk/gatk_realigner_target_creator/gatk_realigner_target_creator_out_1.gatk_interval" />
+          <output name="output_log" file="gatk/gatk_realigner_target_creator/gatk_realigner_target_creator_out_1.log.contains" compare="contains"/>
+      </test>
+  </tests>
+  <help>
+**What it does**
+
+Emits intervals for the Local Indel Realigner to target for cleaning.  Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string.
+
+------
+
+Please cite the website "http://addlink.here" as well as:
+
+Add citation here 2011.
+
+------
+
+**Input formats**
+
+GenomeAnalysisTK: RealignerTargetCreator accepts an aligned BAM input file.
+
+------
+
+**Outputs**
+
+The output is in GATK Interval format, see http://addlink.here for more details.
+
+-------
+
+**Settings**::
+
+ windowSize          window size for calculating entropy or SNP clusters
+ mismatchFraction    fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to &lt;= 0 or &gt; 1
+ minReadsAtLocus     minimum reads at a locus to enable using the entropy calculation
+ maxIntervalSize     maximum interval size
+
+  </help>
+</tool>
b
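Judging from the template above, the wrapper's `-d` arguments stage Galaxy datasets under predictable names (file, extension, link name) while the `-p` fragments are concatenated into the command proper. For a basic run against a built-in reference, the template should reduce to a command of roughly this shape (paths are illustrative, not from this commit)::

    java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator \
        -I gatk_input.bam -R /path/to/ref.fa -et NO_ET -o output.gatk_interval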
diff -r 000000000000 -r 9071e359b9a3 tools/gatk/table_recalibration.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/gatk/table_recalibration.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,400 @@
+<tool id="gatk_table_recalibration" name="Table Recalibration" version="0.0.1">
+  <description>on BAM files</description>
+  <command interpreter="python">gatk_wrapper.py
+   --stdout "${output_log}"
+   -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input"
+   -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index
+   -p 'java 
+    -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar"
+    -T "TableRecalibration"
+    -o "${output_bam}"
+    -et "NO_ET" ##ET no phone home
+    ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout
+    #if $reference_source.reference_source_selector != "history":
+        -R "${reference_source.ref_file.fields.path}"
+    #end if
+    --recal_file "${input_recal}"
+    --disable_bam_indexing
+   '
+    ##start standard gatk options
+    #if $gatk_param_type.gatk_param_type_selector == "advanced":
+        #for $sample_metadata in $gatk_param_type.sample_metadata:
+            -p '--sample_metadata "${sample_metadata.sample_metadata_file}"'
+        #end for
+        #for $read_filter in $gatk_param_type.read_filter:
+            -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}"
+            ###raise Exception( str( dir( $read_filter ) ) )
+            #for $name, $param in $read_filter.read_filter_type.iteritems():
+                #if $name not in [ "__current_case__", "read_filter_type_selector" ]:
+                    --${name} "${param}"
+                #end if
+            #end for
+            '
+        #end for
+        #if str( $gatk_param_type.input_intervals ) != "None":
+            -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals"
+        #end if
+        #if str( $gatk_param_type.input_exclude_intervals ) != "None":
+            -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals"
+        #end if
+        #set $rod_binding_names = dict()
+        #for $rod_binding in $gatk_param_type.rod_bind:
+            #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom':
+                #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name
+            #else
+                #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector
+            #end if
+            #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1
+            -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+            #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ):
+                -p '--rodToIntervalTrackName "${rod_bind_name}"'
+            #end if
+        #end for
+        -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"'
+        #if str( $gatk_param_type.input_dbsnp_rod ) != "None":
+            -d "-D" "${gatk_param_type.input_dbsnp_rod}" "${gatk_param_type.input_dbsnp_rod.ext}" "dbsnp_rod"
+        #end if
+        -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"'
+        #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE":
+            -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"'
+        #end if
+        -p '
+        --baq "${gatk_param_type.baq}"
+        --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}"
+        ${gatk_param_type.use_original_qualities}
+        --defaultBaseQualities "${gatk_param_type.default_base_qualities}"
+        --validation_strictness "${gatk_param_type.validation_strictness}"
+        --interval_merging "${gatk_par
[...]
 two-pass processing step, doing a by-read traversal.  For 
+     each base in each read this walker calculates various user-specified covariates (such as read group, reported 
+     quality score, cycle, and dinuc). Using these values as a key in a large hashmap the walker calculates an empirical 
+     base quality score and overwrites the quality score currently in the read. This walker then outputs a new bam file 
+     with these updated (recalibrated) reads.  Note: This walker expects as input the recalibration table file generated 
+     previously by CovariateCounterWalker. Note: This walker is designed to be used in conjunction with 
+     CovariateCounterWalker.
+
+------
+
+Please cite the website "http://addlink.here" as well as:
+
+Add citation here 2011.
+
+------
+
+**Input formats**
+
+GenomeAnalysisTK: TableRecalibration accepts aligned BAM and recalibration CSV input files.
+
+------
+
+**Outputs**
+
+The output is in BAM format, see http://addlink.here for more details.
+
+-------
+
+**Settings**::
+
+ default_read_group             If a read has no read group then default to the provided String.
+ default_platform               If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.
+ force_read_group               If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.
+ force_platform                 If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.
+ window_size_nqs                The window size used by MinimumNQSCovariate for its calculation
+ homopolymer_nback              The number of previous bases to look at in HomopolymerCovariate
+ exception_if_no_tile           If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1
+ solid_recal_mode               How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS (DO_NOTHING|SET_Q_ZERO|SET_Q_ZERO_BASE_N|REMOVE_REF_BIAS)
+ solid_nocall_strategy          Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ (THROW_EXCEPTION|LEAVE_READ_UNRECALIBRATED|PURGE_READ)
+ recal_file                     Filename for the input covariates table recalibration .csv file
+ out                            The output BAM file
+ bam_compression                Compression level to use for writing BAM files
+ disable_bam_indexing           Turn off on-the-fly creation of indices for output BAM files.
+ simplifyBAM                    If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well as stripping all extended tags from the kept reads except the read group identifier
+ preserve_qscores_less_than     Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change quality scores below 5, since base callers use these values to indicate random or bad bases
+ smoothing                      Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1
+ max_quality_score              The integer value at which to cap the quality scores, default=50
+ doNotWriteOriginalQuals        If true, we will not write the original quality (OQ) tag for each read
+
+  </help>
+</tool>
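Table Recalibration is the write-back half of base quality score recalibration: it consumes the covariates .csv produced by the counting step (CovariateCounterWalker, per the help text) and emits a recalibrated BAM. Under the same reading of the wrapper template, a basic invocation reduces to roughly (paths illustrative)::

    java -jar GenomeAnalysisTK.jar -T TableRecalibration \
        -I gatk_input.bam -R /path/to/ref.fa -et NO_ET \
        --recal_file recal_data.csv --disable_bam_indexing -o output.bam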
diff -r 000000000000 -r 9071e359b9a3 tools/gatk/unified_genotyper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/gatk/unified_genotyper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,414 @@
+<tool id="gatk_unified_genotyper" name="Unified Genotyper" version="0.0.1">
+  <description>SNP and indel caller</description>
+  <command interpreter="python">gatk_wrapper.py
+   --stdout "${output_log}"
+   #for $i, $input_bam in enumerate( $reference_source.input_bams ):
+       -d "-I" "${input_bam.input_bam}" "${input_bam.input_bam.ext}" "gatk_input_${i}"
+       -d "" "${input_bam.input_bam.metadata.bam_index}" "bam_index" "gatk_input_${i}" ##hardcode galaxy ext type as bam_index
+   #end for
+   -p 'java 
+    -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar"
+    -T "UnifiedGenotyper"
+    -o "${output_vcf}"
+    -et "NO_ET" ##ET no phone home
+    ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout
+    #if $reference_source.reference_source_selector != "history":
+        -R "${reference_source.ref_file.fields.path}"
+    #end if
+    --standard_min_confidence_threshold_for_calling "${standard_min_confidence_threshold_for_calling}"
+    --standard_min_confidence_threshold_for_emitting "${standard_min_confidence_threshold_for_emitting}"
+   '
+    #set $rod_binding_names = dict()
+    #if str( $input_dbsnp_rod ) != "None":
+        -d "-D" "${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod"
+    #end if
+    #for $rod_binding in $rod_bind:
+        #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom':
+            #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name
+        #else
+            #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector
+        #end if
+        #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1
+        -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+        #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ):
+            -p '--rodToIntervalTrackName "${rod_bind_name}"'
+        #end if
+    #end for
+
+    ##start standard gatk options
+    #if $gatk_param_type.gatk_param_type_selector == "advanced":
+        #for $sample_metadata in $gatk_param_type.sample_metadata:
+            -p '--sample_metadata "${sample_metadata.sample_metadata_file}"'
+        #end for
+        #for $read_filter in $gatk_param_type.read_filter:
+            -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}"
+            ###raise Exception( str( dir( $read_filter ) ) )
+            #for $name, $param in $read_filter.read_filter_type.iteritems():
+                #if $name not in [ "__current_case__", "read_filter_type_selector" ]:
+                    --${name} "${param}"
+                #end if
+            #end for
+            '
+        #end for
+        #if str( $gatk_param_type.input_intervals ) != "None":
+            -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals"
+        #end if
+        #if str( $gatk_param_type.input_exclude_intervals ) != "None":
+            -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals"
+        #end if
+
+        -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"'
+
+        -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"'
+        #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE":
+            -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"'
+        #end if
+        -p '
+        --baq "${gatk_param_type.baq}"
+        --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}"
+        ${gatk_param_type.use_original_qualities}
+        --defaultBaseQualities "${gatk_param_type.default_base_qualities
[...]
 "gatk/gatk_unified_genotyper/gatk_unified_genotyper_out_1.log.contains" compare="contains"/>
+      </test>
+  </tests>
+  <help>
+**What it does**
+
+     A variant caller which unifies the approaches of several disparate callers.  Works for single-sample and 
+     multi-sample data.  The user can choose from several different incorporated calculation models.
+
+------
+
+Please cite the website "http://addlink.here" as well as:
+
+Add citation here 2011.
+
+------
+
+**Input formats**
+
+GenomeAnalysisTK: UnifiedGenotyper accepts an aligned BAM input file.
+
+------
+
+**Outputs**
+
+The output is in VCF format, see http://addlink.here for more details.
+
+-------
+
+**Settings**::
+
+ genotype_likelihoods_model                       Genotype likelihoods calculation model to employ -- BOTH is the default option, while INDEL is also available for calling indels and SNP is available for calling SNPs only (SNP|INDEL|BOTH)
+ p_nonref_model                                   Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available. (EXACT|GRID_SEARCH)
+ heterozygosity                                   Heterozygosity value used to compute prior likelihoods for any locus
+ pcr_error_rate                                   The PCR error rate to be used for computing fragment-based likelihoods
+ genotyping_mode                                  Specifies how to determine the alternate alleles to use for genotyping (DISCOVERY|GENOTYPE_GIVEN_ALLELES)
+ output_mode                                      Should we output confident genotypes (i.e. including ref calls) or just the variants? (EMIT_VARIANTS_ONLY|EMIT_ALL_CONFIDENT_SITES|EMIT_ALL_SITES)
+ standard_min_confidence_threshold_for_calling    The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called
+ standard_min_confidence_threshold_for_emitting   The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)
+ noSLOD                                           If provided, we will not calculate the SLOD
+ min_base_quality_score                           Minimum base quality required to consider a base for calling
+ min_mapping_quality_score                        Minimum read mapping quality required to consider a read for calling
+ max_deletion_fraction                            Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to &lt; 0 or &gt; 1; default:0.05]
+ min_indel_count_for_genotyping                   Minimum number of consensus indels required to trigger genotyping run
+ indel_heterozygosity                             Heterozygosity for indel calling
+ indelGapContinuationPenalty                      Indel gap continuation penalty
+ indelGapOpenPenalty                              Indel gap open penalty
+ indelHaplotypeSize                               Indel haplotype size
+ doContextDependentGapPenalties                   Vary gap penalties by context
+ indel_recal_file                                 Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NOT USE
+ indelDebug                                       Output indel debug info
+ out                                              File to which variants should be written
+ annotation                                       One or more specific annotations to apply to variant calls
+ group                                            One or more classes/groups of annotations to apply to variant calls
+
+  </help>
+</tool>
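The two confidence thresholds are the only calling parameters this wrapper passes unconditionally; everything else is optional. A basic invocation should therefore reduce to roughly the following (paths and threshold values are illustrative, not defaults taken from this commit)::

    java -jar GenomeAnalysisTK.jar -T UnifiedGenotyper \
        -I gatk_input_0.bam -R /path/to/ref.fa -et NO_ET \
        --standard_min_confidence_threshold_for_calling 30.0 \
        --standard_min_confidence_threshold_for_emitting 10.0 -o output.vcf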
diff -r 000000000000 -r 9071e359b9a3 tools/genetrack/genetrack_indexer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genetrack/genetrack_indexer.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+
+"""
+Wraps genetrack.scripts.tabs2genetrack so the tool can be executed from Galaxy.
+
+usage: %prog input output shift
+"""
+
+import sys, shutil, os
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "GeneTrack" )
+
+from genetrack.scripts import tabs2genetrack
+from genetrack import logger
+
+if __name__ == "__main__":
+    import os
+    os.environ[ 'LC_ALL' ] = 'C' 
+    #os.system( 'export' )
+    
+    parser = tabs2genetrack.option_parser()
+
+    options, args = parser.parse_args()
+
+    # uppercase the format
+    options.format = options.format.upper()
+
+    if options.format not in ('BED', 'GFF'):
+        sys.stdout = sys.stderr
+        parser.print_help()
+        sys.exit(-1)
+
+    logger.disable(options.verbosity)
+
+    # missing file names
+    if not (options.inpname and options.outname and options.format):
+        parser.print_help()
+        sys.exit(-1)
+    else:
+        tabs2genetrack.transform(inpname=options.inpname, outname=options.outname,\
+            format=options.format, shift=options.shift, index=options.index, options=options)
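The Galaxy wrapper below invokes this script with a fixed format and verbosity, so a typical call looks like this (file names are illustrative; -s 73 matches the nucleosome example in the wrapper's help)::

    python genetrack_indexer.py -i reads.bed -o reads.genetrack -s 73 -v 0 -f BED -x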
diff -r 000000000000 -r 9071e359b9a3 tools/genetrack/genetrack_indexer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genetrack/genetrack_indexer.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,54 @@
+<tool id="bed2genetrack" name="GeneTrack indexer" version="1.0.1">
+  
+  <description>on a BED file</description>
+
+  <command interpreter="python">
+    genetrack_indexer.py -i $input -o $output -s $shift -v 0 -f BED -x
+  </command>
+    
+  <inputs>
+    
+    <param format="bed6" name="input" type="data" help="Input data">
+      <label>Select input bed file</label>
+    </param>
+    
+    <param name="shift" size="4" type="integer" value="0" help="distance in basepairs">
+        <label>Shift at 5' end</label>
+    </param>
+
+    <!-- this parameter is currently not used, may not be feasible to use it
+    <param name="coverage" type="select" label="Full coverage">
+      <option value="no">NO</option>
+      <option value="yes">YES</option>
+    </param>
+    -->
+  
+  </inputs>
+
+  <outputs>  
+    <data format="genetrack" name="output" />
+  </outputs>
+   
+<help>
+**Help**
+
+This tool creates a GeneTrack index from the selected BED file, for use in GeneTrack visualization and peak prediction.
+
+**Parameters**
+
+- **Shift at 5' end** should be used when the location of interest lies at a fixed distance from
+  the 5' end of **all sequenced fragments**.
+
+  For example, if the sequenced sample consists of
+  mono-nucleosomal DNA (146 bp), we expect
+  each nucleosome midpoint to lie 73 bp from the 5' end of the fragment.
+  We would therefore enter 73 as the shift parameter. Once corrected, the reads
+  on each strand coincide and indicate the actual midpoints
+  of the nucleosomes.
+
+  With shifting applied, the averaging process in GeneTrack can correct for longer- or
+  shorter-than-expected fragment sizes, as long as the errors are reasonably random.
+
+</help>
+
+</tool>
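The shift correction described in the help above is easy to check by hand. The sketch below is illustrative only (it is not GeneTrack code, and the read positions are made up): shifting forward-strand 5' ends by +73 and reverse-strand 5' ends by -73 makes both strands report the same nucleosome midpoint::

    # Illustrative only -- positions are made-up examples.
    SHIFT = 73  # half of a 146 bp mono-nucleosomal fragment

    reads = [('+', 1000),   # forward read: 5' end at one end of the fragment
             ('-', 1146)]   # reverse read: 5' end at the other end

    midpoints = [pos + SHIFT if strand == '+' else pos - SHIFT
                 for strand, pos in reads]
    print midpoints         # [1073, 1073] -- the two strands now coincide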
diff -r 000000000000 -r 9071e359b9a3 tools/genetrack/genetrack_peak_prediction.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genetrack/genetrack_peak_prediction.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+
+"""
+Wraps genetrack.scripts.peakpred so the tool can be executed from Galaxy.
+
+usage: %prog input output level sigma mode exclusion strand
+"""
+
+import sys
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "GeneTrack" )
+
+from genetrack.scripts import peakpred
+from genetrack import logger
+
+if __name__ == "__main__":
+
+    parser = peakpred.option_parser()
+
+    options, args = parser.parse_args()
+
+    logger.disable(options.verbosity)
+
+    from genetrack import conf
+
+    # trigger test mode
+    if options.test:
+        options.inpname = conf.testdata('test-hdflib-input.gtrack')
+        options.outname = conf.testdata('predictions.bed')
+
+    # missing file names
+    if not (options.inpname and options.outname):
+        parser.print_help()
+    else:
+        print 'Sigma = %s' % options.sigma
+        print 'Minimum peak = %s' % options.level
+        print 'Peak-to-peak = %s' % options.exclude
+
+        peakpred.predict(options.inpname, options.outname, options)
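Matching the wrapper below, a typical command line for this script is (file names are illustrative; the option values shown are the wrapper's defaults)::

    python genetrack_peak_prediction.py -i reads.genetrack -o peaks.bed \
        --level=1 --sigma=10 --mode=nolap --exclusion=0 --strand=all -v 0 -x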
diff -r 000000000000 -r 9071e359b9a3 tools/genetrack/genetrack_peak_prediction.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genetrack/genetrack_peak_prediction.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+<tool id="predict2genetrack" name="Peak predictor">
+  
+  <description>on GeneTrack index</description>
+
+  <command interpreter="python">
+      genetrack_peak_prediction.py -i $input -o $output --level=$level --sigma=$sigma --mode=$mode --exclusion=$exclusion --strand=$strand -v 0 -x
+  </command>
+    
+  <inputs>
+    
+    <param format="genetrack" name="input" type="data" help="Input data" label="Select input data"/>
+
+    <param name="method" type="select" label="Smoothing method" help="The function used to average nearby read values">
+      <option value="gauss">Gaussian kernel</option>
+      <!-- <option value="yes">Moving averages</option> -->
+    </param>
+  
+    <param name="sigma" size="4" type="integer" value="10" label="Smoothing factor" help="The interval over which each read is averaged" />
+        
+
+    <param name="mode" type="select" label="Peak prediction" help="Peak prediction method"> 
+      <option value="nolap">Maximal non-overlapping</option>
+      <!-- <option value="above">Above a threshold</option> -->
+      <option value="all">All peaks</option>
+    </param>
+  
+    <param name="exclusion" type="integer" size="4" value="0" help="The minimal distance between peaks"  label="Peak-to-peak distance">
+    </param>
+
+    <param name="level" size="4" type="float" value="1" label="Threshold" help="Return only peaks above this value" />
+    
+    <param name="strand" type="select" label="Strands" help="Combine strand data or predict on each strand separately">
+      <option value="all">Merge strands</option>
+      <!-- <option value="yes1">Above a threshold</option> -->
+      <option value="two">Separate strands</option>
+    </param>
+
+  </inputs>
+
+  <outputs>  
+    <data format="bed" name="output" />
+  </outputs>
+   
+<help>
+**Help**
+
+This tool generates genome-wide peak predictions from a GeneTrack index file.
+
+**Parameters**
+
+- **Smoothing method** the function used to average nearby read values
+
+- **Smoothing factor** the interval over which each read is averaged
+
+- **Peak prediction** the method used to call peaks on the smoothed data
+
+</help>
+
+</tool>
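To make the parameters above concrete, here is a minimal sketch of the general technique (Gaussian-kernel smoothing followed by greedy selection of maximal non-overlapping peaks). It is an illustration under stated assumptions, not the genetrack.scripts.peakpred implementation::

    # Illustrative only -- not GeneTrack's implementation.
    import math

    def smooth(counts, sigma=10):
        # Spread each per-base read count over a +/- 3*sigma window,
        # weighted by a Gaussian kernel.
        w = 3 * sigma
        kernel = [math.exp(-(d * d) / (2.0 * sigma * sigma)) for d in range(-w, w + 1)]
        out = [0.0] * len(counts)
        for i, c in enumerate(counts):
            if c:
                for k in range(len(kernel)):
                    j = i + k - w
                    if 0 <= j < len(out):
                        out[j] += c * kernel[k]
        return out

    def call_peaks(signal, level=1.0, exclusion=0):
        # Take the highest position above the threshold, mask out the
        # exclusion zone around it, and repeat ("maximal non-overlapping").
        order = sorted(range(len(signal)), key=lambda i: signal[i], reverse=True)
        peaks = []
        for i in order:
            if signal[i] < level:
                break
            if all(abs(i - p) > exclusion for p in peaks):
                peaks.append(i)
        return sorted(peaks)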
diff -r 000000000000 -r 9071e359b9a3 tools/genome_diversity/cdblib.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genome_diversity/cdblib.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,230 @@
+#!/usr/bin/env python2.5
+
+'''
+Manipulate DJB's Constant Databases. These are 2 level disk-based hash tables
+that efficiently handle many keys, while remaining space-efficient.
+
+    http://cr.yp.to/cdb.html
+
+When generated databases are only used with Python code, consider using hash()
+rather than djb_hash() for a tidy speedup.
+'''
+
+from _struct import Struct
+from itertools import chain
+
+
+def py_djb_hash(s):
+    '''Return the value of DJB's hash function for the given 8-bit string.'''
+    h = 5381
+    for c in s:
+        h = (((h << 5) + h) ^ ord(c)) & 0xffffffff
+    return h
+
+try:
+    from _cdblib import djb_hash
+except ImportError:
+    djb_hash = py_djb_hash
+
+read_2_le4 = Struct('<LL').unpack
+write_2_le4 = Struct('<LL').pack
+
+
+class Reader(object):
+    '''A dictionary-like object for reading a Constant Database accessed
+    through a string or string-like sequence, such as mmap.mmap().'''
+
+    def __init__(self, data, hashfn=djb_hash):
+        '''Create an instance reading from a sequence and using hashfn to hash
+        keys.'''
+        if len(data) < 2048:
+            raise IOError('CDB too small')
+
+        self.data = data
+        self.hashfn = hashfn
+
+        self.index = [read_2_le4(data[i:i+8]) for i in xrange(0, 2048, 8)]
+        self.table_start = min(p[0] for p in self.index)
+        # Assume load factor is 0.5 like official CDB.
+        self.length = sum(p[1] >> 1 for p in self.index)
+
+    def iteritems(self):
+        '''Like dict.iteritems(). Items are returned in insertion order.'''
+        pos = 2048
+        while pos < self.table_start:
+            klen, dlen = read_2_le4(self.data[pos:pos+8])
+            pos += 8
+
+            key = self.data[pos:pos+klen]
+            pos += klen
+
+            data = self.data[pos:pos+dlen]
+            pos += dlen
+
+            yield key, data
+
+    def items(self):
+        '''Like dict.items().'''
+        return list(self.iteritems())
+
+    def iterkeys(self):
+        '''Like dict.iterkeys().'''
+        return (p[0] for p in self.iteritems())
+    __iter__ = iterkeys
+
+    def itervalues(self):
+        '''Like dict.itervalues().'''
+        return (p[1] for p in self.iteritems())
+
+    def keys(self):
+        '''Like dict.keys().'''
+        return [p[0] for p in self.iteritems()]
+
+    def values(self):
+        '''Like dict.values().'''
+        return [p[1] for p in self.iteritems()]
+
+    def __getitem__(self, key):
+        '''Like dict.__getitem__().'''
+        value = self.get(key)
+        if value is None:
+            raise KeyError(key)
+        return value
+
+    def has_key(self, key):
+        '''Return True if key exists in the database.'''
+        return self.get(key) is not None
+    __contains__ = has_key
+
+    def __len__(self):
+        '''Return the number of records in the database.'''
+        return self.length
+
+    def gets(self, key):
+        '''Yield values for key in insertion order.'''
+        # Truncate to 32 bits and remove sign.
+        h = self.hashfn(key) & 0xffffffff
+        start, nslots = self.index[h & 0xff]
+
+        if nslots:
+            end = start + (nslots << 3)
+            slot_off = start + (((h >> 8) % nslots) << 3)
+
+            for pos in chain(xrange(slot_off, end, 8),
+                             xrange(start, slot_off, 8)):
+                rec_h, rec_pos = read_2_le4(self.data[pos:pos+8])
+
+                if not rec_h:
+                    break
+                elif rec_h == h:
+                    klen, dlen = read_2_le4(self.data[rec_pos:rec_pos+8])
+                    rec_pos += 8
+
+                    if self.data[rec_pos:rec_pos+klen] == key:
+                        rec_pos += klen
+                        yield self.data[rec_pos:rec_pos+dlen]
+
+    def get(self, key, default=None):
+        '''Get the first value for key, returning default if missing.'''
+        # Avoid exception catch when handling default case; much faster.
+        return chain(self.gets(key), (default,)).next()
+
+    def getint(self, key, default=None, base=0):
+        '''Get the first value for key, converted to an int, returning
+        default if missing.'''
+        value = self.get(key, default)
+        if value is not default:
+            return int(value, base)
+        return value
+
+    def getints(self, key, base=0):
+        '''Yield values for key in insertion order after converting to int.'''
+        return (int(v, base) for v in self.gets(key))
+
+    def getstring(self, key, default=None, encoding='utf-8'):
+        '''Get the first value for key decoded as unicode, returning default if
+        not found.'''
+        value = self.get(key, default)
+        if value is not default:
+            return value.decode(encoding)
+        return value
+
+    def getstrings(self, key, encoding='utf-8'):
+        '''Yield values for key in insertion order after decoding as
+        unicode.'''
+        return (v.decode(encoding) for v in self.gets(key))
+
+
+class Writer(object):
+    '''Object for building new Constant Databases, and writing them to a
+    seekable file-like object.'''
+
+    def __init__(self, fp, hashfn=djb_hash):
+        '''Create an instance writing to a file-like object, using hashfn to
+        hash keys.'''
+        self.fp = fp
+        self.hashfn = hashfn
+
+        fp.write('\x00' * 2048)
+        self._unordered = [[] for i in xrange(256)]
+
+    def put(self, key, value=''):
+        '''Write a string key/value pair to the output file.'''
+        assert type(key) is str and type(value) is str
+
+        pos = self.fp.tell()
+        self.fp.write(write_2_le4(len(key), len(value)))
+        self.fp.write(key)
+        self.fp.write(value)
+
+        h = self.hashfn(key) & 0xffffffff
+        self._unordered[h & 0xff].append((h, pos))
+
+    def puts(self, key, values):
+        '''Write more than one value for the same key to the output file.
+        Equivalent to calling put() in a loop.'''
+        for value in values:
+            self.put(key, value)
+
+    def putint(self, key, value):
+        '''Write an integer as a base-10 string associated with the given key
+        to the output file.'''
+        self.put(key, str(value))
+
+    def putints(self, key, values):
+        '''Write zero or more integers for the same key to the output file.
+        Equivalent to calling putint() in a loop.'''
+        self.puts(key, (str(value) for value in values))
+
+    def putstring(self, key, value, encoding='utf-8'):
+        '''Write a unicode string associated with the given key to the output
+        file after encoding it as UTF-8 or the given encoding.'''
+        self.put(key, unicode.encode(value, encoding))
+
+    def putstrings(self, key, values, encoding='utf-8'):
+        '''Write zero or more unicode strings to the output file. Equivalent to
+        calling putstring() in a loop.'''
+        self.puts(key, (unicode.encode(value, encoding) for value in values))
+
+    def finalize(self):
+        '''Write the final hash tables to the output file, and write out its
+        index. The output file remains open upon return.'''
+        index = []
+        for tbl in self._unordered:
+            length = len(tbl) << 1
+            ordered = [(0, 0)] * length
+            for pair in tbl:
+                where = (pair[0] >> 8) % length
+                for i in chain(xrange(where, length), xrange(0, where)):
+                    if not ordered[i][0]:
+                        ordered[i] = pair
+                        break
+
+            index.append((self.fp.tell(), length))
+            for pair in ordered:
+                self.fp.write(write_2_le4(*pair))
+
+        self.fp.seek(0)
+        for pair in index:
+            self.fp.write(write_2_le4(*pair))
+        self.fp = None # prevent double finalize()
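A round trip through the two classes above, using StringIO in place of a real file (Python 2, matching the module)::

    from StringIO import StringIO
    import cdblib

    fp = StringIO()
    writer = cdblib.Writer(fp)             # reserves the 2048-byte index up front
    writer.put('species', 'phiX')
    writer.puts('color', ['red', 'blue'])  # two values under one key
    writer.finalize()                      # writes the slots and the final index

    reader = cdblib.Reader(fp.getvalue())
    print reader.get('species')            # 'phiX'
    print list(reader.gets('color'))       # ['red', 'blue']
    print len(reader)                      # 3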
diff -r 000000000000 -r 9071e359b9a3 tools/genome_diversity/extract_flanking_dna.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genome_diversity/extract_flanking_dna.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,89 @@
+#!/usr/bin/env python2.5
+
+import os
+import sys
+import traceback
+from optparse import OptionParser
+import genome_diversity as gd
+
+def main_function( parse_arguments=None ):
+    if parse_arguments is None:
+        parse_arguments = lambda arguments: ( None, arguments )
+    def main_decorator( to_decorate ):
+        def decorated_main( arguments=None ):
+            if arguments is None:
+                arguments = sys.argv
+            options, arguments = parse_arguments( arguments )
+            rc = 1
+            try:
+                rc = to_decorate( options, arguments )
+            except Exception, err:
+                sys.stderr.write( 'ERROR: %s\n' % str( err ) )
+                traceback.print_exc()
+            finally:
+                sys.exit( rc )
+        return decorated_main
+    return main_decorator
+
+def parse_arguments( arguments ):
+    parser = OptionParser()
+    parser.add_option('--input',
+                        type='string', dest='input',
+                        help='file of selected SNPs')
+    parser.add_option('--output',
+                        type='string', dest='output',
+                        help='output file')
+    parser.add_option('--snps_loc',
+                        type='string', dest='snps_loc',
+                        help='snps .loc file')
+    parser.add_option('--scaffold_col',
+                        type="int", dest='scaffold_col',
+                        help='scaffold column in the input file')
+    parser.add_option('--pos_col',
+                        type="int", dest='pos_col',
+                        help='position column in the input file')
+    parser.add_option('--output_format',
+                        type="string", dest='output_format',
+                        help='output format, fasta or primer3')
+    parser.add_option('--species',
+                        type="string", dest='species',
+                        help='species')
+    return parser.parse_args( arguments[1:] )
+
+
+@main_function( parse_arguments )
+def main( options, arguments ):
+    if not options.input:
+        raise RuntimeError( 'missing --input option' )
+    if not options.output:
+        raise RuntimeError( 'missing --output option' )
+    if not options.snps_loc:
+        raise RuntimeError( 'missing --snps_loc option' )
+    if not options.scaffold_col:
+        raise RuntimeError( 'missing --scaffold_col option' )
+    if not options.pos_col:
+        raise RuntimeError( 'missing --pos_col option' )
+    if not options.output_format:
+        raise RuntimeError( 'missing --output_format option' )
+    if not options.species:
+        raise RuntimeError( 'missing --species option' )
+    
+    snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) )
+
+    out_fh = gd._openfile( options.output, 'w' )
+
+    snpcalls_file = gd.get_filename_from_loc( options.species, options.snps_loc )
+    file_root, file_ext = os.path.splitext( snpcalls_file )
+    snpcalls_index_file = file_root + ".cdb"
+    snpcalls = gd.SnpcallsFile( data_file=snpcalls_file, index_file=snpcalls_index_file )
+
+    while snps.next():
+        seq, pos = snps.get_seq_pos()
+        flanking_dna = snpcalls.get_flanking_dna( sequence=seq, position=pos, format=options.output_format )
+        if flanking_dna:
+            out_fh.write( flanking_dna )
+
+    out_fh.close()
+
+if __name__ == "__main__":
+    main()
+
diff -r 000000000000 -r 9071e359b9a3 tools/genome_diversity/extract_flanking_dna.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genome_diversity/extract_flanking_dna.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,93 @@
+<tool id="gd_extract_flanking_dna" name="Extract" version="1.0.0">
+  <description>DNA flanking chosen SNPs</description>
+
+  <command interpreter="python2.5">
+    extract_flanking_dna.py "--input=$input" "--output=$output" "--snps_loc=${GALAXY_DATA_INDEX_DIR}/gd.snps.loc"
+    #if $override_metadata.choice == "0":
+      "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}"
+    #else
+      "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species"
+    #end if
+    "--output_format=$output_format"
+  </command>
+
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Selected SNPS dataset"/>
+    <param name="output_format" type="select" format="integer" label="output format">
+        <option value="fasta" selected="true">FastA format</option>
+        <option value="primer3">Primer3 input</option>
+    </param>
+    <conditional name="override_metadata">
+      <param name="choice" type="select" format="integer" label="choose columns">
+        <option value="0" selected="true">No, get columns from metadata</option>
+        <option value="1" >Yes, choose columns</option>
+      </param>
+      <when value="0">
+        <!-- no options -->
+      </when>
+      <when value="1">
+        <param name="scaf_col" type="data_column" data_ref="input" numerical="false" label="Column with scaffold"/>
+        <param name="pos_col" type="data_column" data_ref="input" numerical="true" label="Column with position"/>
+        <param name="species" type="select" label="Choose species">
+          <options from_file="gd.species.txt">
+            <column name="name" index="1"/>
+            <column name="value" index="0"/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="txt" name="output"/>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="gd.sample.wsf" ftype="wsf"/>
+      <param name="output_format" value="primer3"/>
+      <param name="choice" value="0"/>
+      <output name="output" file="gd.extract_flanking_dna.txt"/>
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+  This tool reports a DNA segment containing each SNP, with up to 200 nucleotides on
+  either side of the SNP position, which is indicated by "n". Fewer nucleotides
+  are reported if the SNP is near an end of the assembled genome fragment.
+
+-----
+
+**Example**
+
+- input file::
+
+    chr2_75111355_75112576    314  A  C  L  F  chr2   75111676  C  F  15  4  53   2   9  48   Y  96   0.369  0.355  0.396  0
+    chr8_93901796_93905612   2471  A  C  A  A  chr8   93904264  A  A  8   0  51   10  2  14   Y  961  0.016  0.534  0.114  2
+    chr10_7434473_7435447    524   T  C  S  S  chr10  7435005   T  S  11  5  90   14  0  69   Y  626  0.066  0.406  0.727  0
+    chr14_80021455_80022064  138   G  A  H  H  chr14  80021593  G  H  14  0  69   9   6  124  Y  377  0.118  0.997  0.195  1
+    chr15_64470252_64471048  89    G  A  Y  Y  chr15  64470341  G  Y  5   6  109  14  0  69   Y  312  0.247  0.998  0.393  0
+    chr18_48070585_48071386  514   C  T  E  K  chr18  48071100  T  K  7   7  46   14  0  69   Y  2    0.200  0.032  0.163  0
+    chr18_50154905_50155664  304   A  G  Y  C  chr18  50155208  A  Y  4   2  17   5   1  22   Y  8    0.022  0.996  0.128  0
+    chr18_57379354_57380496  315   C  T  V  V  chr18  57379669  G  V  11  0  60   9   6  62   Y  726  0.118  0.048  0.014  1
+    chr19_14240610_14242055  232   C  T  A  V  chr19  14240840  C  A  18  8  56   15  5  42   Y  73   0.003  0.153  0.835  0
+    chr19_39866997_39874915  3117  C  T  P  P  chr19  39870110  C  P  3   7  65   14  2  32   Y  6    0.321  0.911  0.462  4
+    etc.
+
+- output file::
+
+    > chr2_75111355_75112576 314 A C
+    TATCTTCATTTTTATTATAGACTCTCTGAACCAATTTGCCCTGAGGCAGACTTTTTAAAGTACTGTGTAATGTATGAAGTCCTTCTGCTCAAGCAAATCATTGGCATGAAAACAGTTGCAAACTTATTGTGAGAGAAGAGTCCAAGAGTTTTAACAGTCTGTAAGTATATAGCCTGTGAGTTTGATTTCCTTCTTGTTTTTnTTCCAGAAACATGATCAGGGGCAAGTTCTATTGGATATAGTCTTCAAGCATCTTGATTTGACTGAGCGTGACTATTTTGGTTTGCAGTTGACTGACGATTCCACTGATAACCCAGTAAGTTTAAGCTGTTGTCTTTCATTGTCATTGCAATTTTTCTGTCTTTATACTAGGTCCTTTCTGATTTACATTGTTCACTGATT
+    > chr8_93901796_93905612 2471 A C
+    GCTGCCGCTGGATTTACTTCTGCTTGGGTCGAGAGCGGGCTGGATGGGTGAAGAGTGGGCTCCCCGGCCCCTGACCAGGCAGGTGCAGACAAGTCGGAAGAAGGCCCGCCGCATCTCCTTGCTGGCCAGCGTGTAGATGACGGGGTTCATGGCAGAGTTGAGCACGGCCAGCACGATGAACCACTGGGCCTTGAACAGGATnGCGCACTCCTTCACCTTGCAGGCCACATCCACAAGGAAAAGGATGAAGAGTGGGGACCAGCAGGCGATGAACACGCTCACCACGATCACCACGGTCCGCAGCAGGGCCATGGACCGCTCTGAGTTGTGCGGGCTGGCCACCCTGCGGCTGCTGGACTTCACCAGGAAGTAGATGCGTGCGTACAGGATCACGATGGTCAC
+    > chr10_7434473_7435447 524 T C
+    ATTATTAACAGAAACATTTCTTTTTCATTACCCAGGGGTTACACTGGTCGTTGATGTTAATCAGTTTTTGGAGAAGGAGAAGCAAAGTGATATTTTGTCTGTTCTGAAGCCTGCCGTTGGTAATACAAATGACGTAATCCCTGAATGTGCTGACAGGTACCATGACGCCCTGGCAAAAGCAAAAGAGCAAAAATCTAGAAGnGGTAAGCATCTTCACTGTTTAGCACAAATTAAATAGCACTTTGAATATGATGATTTCTGTGGTATTGTGTTATCTTACTTTTGAGACAAATAATCGCTTTCAAATGAATATTTCTGAATGTTTGTCATCTCTGGCAAGGAAATTTTTTAGTGTTTCTTTTCCTTTTTTGTCTTTTGGAAATCTGTGATTAACTTGGTGGC
+    > chr14_80021455_80022064 138 G A
+    ACCCAGGGATCAAACCCAGGTCTCCCGCATTGCAGGCGGATTCTTTACTGTCTGAGCCTCCAGGGAAGCCCTCGGGGCTGAAGGGATGGTTATGAAGGTGAGAAACAGGGGCCACCTGTCCCCAAGGTACCTTGCGACnTGCCATCTGCGCTCCACCAGTAAATGGACGTCTTCGATCCTTCTGTTGTTGGCGTAGTGCAAACGTTTGGGAAGGTGCTGTTTCAAGTAAGGCTTAAAGTGCTGGTCTGGTTTTTTACACTGAAATATAAATGGACATTGGATTTTGCAATGGAGAGTCTTCTAGAAGAGTCCAAGACATTCTCTCCAGAAAGCTGAAGG
+    > chr15_64470252_64471048 89 G A
+    TGTGTGTGTGTGTGTGTGTGTGTGCCTGTGTCTGTACATGCACACCACGTGGCCTCACCCAGTGCCCTCAGCTCCATGGTGATGTCCACnTAGCCGTGCTCCGCGCTGTAGTACATGGCCTCCTGGAGGGCCTTGGTGCGCGTCCGGCTCAGGCGCATGGGCCCCTCGCTGCCGCTGCCCTGGCTGGATGCATCGCTCTCTTCCACGCCCTCAGCCAGGATCTCCTCCAGGGACAGCACATCTGCTTTGGCCTGCTGTGGCTGAGTCAGGAGCTTCCTCAGGACGTTCCT
+    etc.
+  </help>
+</tool>
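The underlying snpcalls data evidently stores each SNP in bracket notation (for example ...TTCA[A/C]CTCT...), which genome_diversity.py (below) rewrites into the output shown above. A minimal re-creation of the fasta branch, assuming that bracket format; the input string here is a made-up fragment::

    # Made-up example fragment; real flanks run up to 200 nt per side.
    seq = 'TATCTTCA[A/C]CTCTGAACC'
    sequence, position = 'chr2_75111355_75112576', 314

    p = seq.find('[')             # start of the [A/C] SNP marker
    q = seq.find(']', p + 1) + 1  # one past the closing bracket

    print '> %s %s %s %s' % (sequence, position, seq[p + 1], seq[p + 3])
    print '%sn%s' % (seq[:p], seq[q:])   # SNP position replaced by "n"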
diff -r 000000000000 -r 9071e359b9a3 tools/genome_diversity/extract_primers.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genome_diversity/extract_primers.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,84 @@
+#!/usr/bin/env python2.5
+
+import os
+import sys
+import traceback
+from optparse import OptionParser
+import genome_diversity as gd
+
+def main_function( parse_arguments=None ):
+    if parse_arguments is None:
+        parse_arguments = lambda arguments: ( None, arguments )
+    def main_decorator( to_decorate ):
+        def decorated_main( arguments=None ):
+            if arguments is None:
+                arguments = sys.argv
+            options, arguments = parse_arguments( arguments )
+            rc = 1
+            try:
+                rc = to_decorate( options, arguments )
+            except Exception, err:
+                sys.stderr.write( 'ERROR: %s\n' % str( err ) )
+                traceback.print_exc()
+            finally:
+                sys.exit( rc )
+        return decorated_main
+    return main_decorator
+
+def parse_arguments( arguments ):
+    parser = OptionParser()
+    parser.add_option('--input',
+                        type='string', dest='input',
+                        help='file of selected SNPs')
+    parser.add_option('--output',
+                        type='string', dest='output',
+                        help='output file')
+    parser.add_option('--primers_loc',
+                        type='string', dest='primers_loc',
+                        help='primers .loc file')
+    parser.add_option('--scaffold_col',
+                        type="int", dest='scaffold_col',
+                        help='scaffold column in the input file')
+    parser.add_option('--pos_col',
+                        type="int", dest='pos_col',
+                        help='position column in the input file')
+    parser.add_option('--species',
+                        type="string", dest='species',
+                        help='species')
+    return parser.parse_args( arguments[1:] )
+
+
+@main_function( parse_arguments )
+def main( options, arguments ):
+    if not options.input:
+        raise RuntimeError( 'missing --input option' )
+    if not options.output:
+        raise RuntimeError( 'missing --output option' )
+    if not options.primers_loc:
+        raise RuntimeError( 'missing --primers_loc option' )
+    if not options.scaffold_col:
+        raise RuntimeError( 'missing --scaffold_col option' )
+    if not options.pos_col:
+        raise RuntimeError( 'missing --pos_col option' )
+    if not options.species:
+        raise RuntimeError( 'missing --species option' )
+    
+    snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) )
+
+    out_fh = gd._openfile( options.output, 'w' )
+
+    primer_data_file = gd.get_filename_from_loc( options.species, options.primers_loc )
+    file_root, file_ext = os.path.splitext( primer_data_file )
+    primer_index_file = file_root + ".cdb"
+    primers = gd.PrimersFile( data_file=primer_data_file, index_file=primer_index_file )
+
+    while snps.next():
+        seq, pos = snps.get_seq_pos()
+        primer = primers.get_entry( seq, pos )
+        if primer:
+            out_fh.write( primer )
+
+    out_fh.close()
+
+if __name__ == "__main__":
+    main()
+
diff -r 000000000000 -r 9071e359b9a3 tools/genome_diversity/extract_primers.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genome_diversity/extract_primers.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,90 @@
+<tool id="gd_extract_primers" name="Extract primers" version="1.0.0">
+  <description>for selected SNPs</description>
+
+  <command interpreter="python2.5">
+    extract_primers.py "--input=$input" "--output=$output" "--primers_loc=${GALAXY_DATA_INDEX_DIR}/gd.primers.loc"
+    #if $override_metadata.choice == "0":
+      "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}"
+    #else
+      "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species"
+    #end if
+  </command>
+
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Selected SNPS dataset"/>
+    <conditional name="override_metadata">
+      <param name="choice" type="select" format="integer" label="choose columns">
+        <option value="0" selected="true">No, get columns from metadata</option>
+        <option value="1" >Yes, choose columns</option>
+      </param>
+      <when value="0">
+        <!-- no options -->
+      </when>
+      <when value="1">
+        <param name="scaf_col" type="data_column" data_ref="input" numerical="false" label="Column with scaffold"/>
+        <param name="pos_col" type="data_column" data_ref="input" numerical="true" label="Column with position"/>
+        <param name="species" type="select" label="Choose species">
+          <options from_file="gd.species.txt">
+            <column name="name" index="1"/>
+            <column name="value" index="0"/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="txt" name="output"/>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="gd.sample.wsf" ftype="wsf"/>
+      <param name="choice" value="0"/>
+      <output name="output" file="gd.extract_primers.txt"/>
+    </test>
+  </tests>
+
+
+  <help>
+**What it does**
+
+  This tool extracts primers for SNPs in the dataset using the Primer3 program.
+  The first line of output for a given SNP reports the name of the assembled
+  contig, the SNP's position in the contig, the two variant nucleotides, and
+  Primer3's "pair penalty".  The next line, if not blank, names restriction
+  enzymes (from the user-adjustable list) that differentially cut at that
+  site, but do not cut at any other position between and including the
+  primer positions.  The next lines show the SNP's flanking regions, with
+  the SNP position indicated by "n", including the primer positions and an
+  additional 3 nucleotides.
+
+-----
+
+**Example**
+
+- input file::
+
+    chr5_30800874_30802049    734   G  A  chr5   30801606   A  24  0  99   4  11  97   Y  496  0.502  0.033  0.215  6
+    chr8_55117827_55119487    994   A  G  chr8   55118815   G  25  0  102  4  11  96   Y  22   0.502  0.025  2.365  1
+    chr9_100484836_100485311  355   C  T  chr9   100485200  T  27  0  108  6  17  100  Y  190  0.512  0.880  2.733  4
+    chr12_3635530_3637738     2101  T  C  chr12  3637630    T  25  0  102  4  13  93   Y  169  0.554  0.024  0.366  4
+
+- output file::
+
+    chr5_30800874_30802049 734 G A 0.352964
+     BglII,MboI,Sau3AI,Tru9I,XhoII
+      1 CTGAAGGTGAGCAGGATTCAGGAGACAGAAAACAAAGCCCAGGCCTGCCCAAGGTGGAAA
+           >>>>>>>>>>>>>>>>>>>>
+     
+     61 AGTCTAACAACTCGCCCTCTGCTTAnATCTGAGACTCACAGGGATAATAACACACTTGGT
+     
+     
+     21 CAAGGAATAAACTAGATATTATTCACTCCTCTAGAAGGCTGCCAGGAAAATTGCCTGACT
+                                                             &lt;&lt;&lt;&lt;&lt;&lt;&lt;
+     
+    181 TGAACCTTGGCTCTGA
+        &lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;
+    etc.
+  </help>
+</tool>
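The .cdb files consumed here are read by IndexedFile in genome_diversity.py (below): keys are "scaffold position" strings hashed with Python's built-in hash(), and values are the byte offsets of the matching '>' header lines in the data file. A plausible builder for such an index, offered only as an assumed counterpart to that reader (the file names are examples)::

    # Assumed counterpart to IndexedFile's read side; not part of this commit.
    import cdblib

    data = open('gd.primers.txt', 'r')
    writer = cdblib.Writer(open('gd.primers.cdb', 'wb'), hashfn=hash)

    offset = data.tell()
    for line in iter(data.readline, ''):
        if line.startswith('>'):                 # '> scaffold position ...'
            elems = line.split()
            writer.putint('%s %s' % (elems[1], elems[2]), offset)
        offset = data.tell()
    writer.finalize()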
diff -r 000000000000 -r 9071e359b9a3 tools/genome_diversity/genome_diversity.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genome_diversity/genome_diversity.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,266 @@
+#!/usr/bin/env python2.5
+
+import sys
+import cdblib
+
+def _openfile( filename=None, mode='r' ):
+    try:
+        fh = open( filename, mode )
+    except IOError, err:
+        raise RuntimeError( "can't open file: %s\n" % str( err ) )
+    return fh
+
+def get_filename_from_loc( species=None, filename=None ):
+    fh = _openfile( filename )
+    for line in fh:
+        if line and not line.startswith( '#' ):
+            line = line.rstrip( '\r\n' )
+            if line:
+                elems = line.split( '\t' )
+                if len( elems ) >= 2 and elems[0] == species:
+                    return elems[1]
+    return None
+
+
+class SnpFile( object ):
+    def __init__( self, filename=None, seq_col=1, pos_col=2, ref_seq_col=7, ref_pos_col=8 ):
+        self.filename = filename
+        self.fh = _openfile( filename )
+        self.seq_col = seq_col
+        self.pos_col = pos_col
+        self.ref_seq_col = ref_seq_col
+        self.ref_pos_col = ref_pos_col
+        self.elems = None
+        self.line = None
+        self.comments = []
+
+    def next( self ):
+        while self.fh:
+            try:
+                self.line = self.fh.next()
+            except StopIteration:
+                self.line = None
+                self.elems = None
+                return None
+            if self.line:
+                self.line = self.line.rstrip( '\r\n' )
+                if self.line:
+                    if self.line.startswith( '#' ):
+                        self.comments.append( self.line )
+                    else:
+                        self.elems = self.line.split( '\t' )
+                        return 1
+
+    def get_seq_pos( self ):
+        if self.elems:
+            return self.elems[ self.seq_col - 1 ], self.elems[ self.pos_col - 1 ]
+        else:
+            return None, None
+
+    def get_ref_seq_pos( self ):
+        if self.elems:
+            return self.elems[ self.ref_seq_col - 1 ], self.elems[ self.ref_pos_col - 1 ]
+        else:
+            return None, None
+
+
+class IndexedFile( object ):
+
+    def __init__( self, data_file=None, index_file=None ):
+        self.data_file = data_file
+        self.index_file = index_file
+        self.data_fh = _openfile( data_file )
+        self.index_fh = _openfile( index_file )
+        self._reader = cdblib.Reader( self.index_fh.read(), hash )
+
+    def get_indexed_line( self, key=None ):
+        line = None
+        if key in self._reader:
+            offset = self._reader.getint( key )
+            self.data_fh.seek( offset )
+            try:
+                line = self.data_fh.next()
+            except StopIteration:
+                raise RuntimeError( 'index file out of sync for %s' % key )
+        return line
+
+class PrimersFile( IndexedFile ):
+    def get_primer_header( self, sequence=None, position=None ):
+        key = "%s %s" % ( str( sequence ), str( position ) )
+        header = self.get_indexed_line( key )
+        if header:
+            if header.startswith( '>' ):
+                elems = header.split()
+                if len( elems ) < 3:
+                    raise RuntimeError( 'short primers header for %s' % key )
+                if sequence != elems[1] or str( position ) != elems[2]:
+                    raise RuntimeError( 'primers index for %s finds %s %s' % ( key, elems[1], elems[2] ) )
+            else:
+                raise RuntimeError( 'primers index out of sync for %s' % key )
+        return header
+
+    def get_entry( self, sequence=None, position=None ):
+        entry = self.get_primer_header( sequence, position )
+        if entry:
+            while self.data_fh:
+                try:
+                    line = self.data_fh.next()
+                except StopIteration:
+                    break
+                if line.startswith( '>' ):
+                    break
+                entry += line
+        return entry
+
+    def get_enzymes( self, sequence=None, posit
[...]
 =None, format='fasta' ):
+        if format != 'fasta' and format != 'primer3':
+            raise RuntimeError( 'invalid format for flanking dna: %s' % str( format ) )
+        seq = self.get_snp_seq( sequence, position )
+        if seq:
+            p = seq.find('[')
+            if p == -1:
+                raise RuntimeError( 'snpcalls entry for %s %s missing left bracket: %s' % ( str( sequence ), str( position ), seq ) )
+            q = seq.find(']', p + 1)
+            if q == -1:
+                raise RuntimeError( 'snpcalls entry for %s %s missing right bracket: %s' % ( str( sequence ), str( position ), seq ) )
+            q += 1
+
+            if format == 'fasta':
+                flanking_seq = '> '
+            else:
+                flanking_seq = 'SEQUENCE_ID='
+
+            flanking_seq += "%s %s %s %s\n" % ( str( sequence ), str( position ), seq[p+1], seq[p+3] )
+
+            if format == 'primer3':
+                flanking_seq += 'SEQUENCE_TEMPLATE='
+
+            flanking_seq += "%sn%s\n" % ( seq[0:p], seq[q:] )
+
+            if format == 'primer3':
+                flanking_seq += "SEQUENCE_TARGET=%d,11\n=\n" % ( p - 5 )
+
+            return flanking_seq
+        else:
+            return None
+
+
+
+class LocationFile( object ):
+    def __init__(self, filename):
+        self.build_map(filename)
+
+    def build_map(self, filename):
+        self.map = {}
+        self.open_file(filename)
+        for line in self.read_lines():
+            elems = line.split('\t', 1)
+            if len(elems) == 2:
+                self.map[ elems[0].strip() ] = elems[1].strip()
+        self.close_file()
+
+    def read_lines(self):
+        for line in self.fh:
+            if not line.startswith('#'):
+                line = line.rstrip('\r\n')
+                yield line
+
+    def open_file(self, filename):
+        self.filename = filename
+        try:
+            self.fh = open(filename, 'r')
+        except IOError, err:
+            print >> sys.stderr, "Error opening location file '%s': %s" % (filename, str(err))
+            sys.exit(1)
+
+    def close_file(self):
+        self.fh.close()
+
+    def loc_file( self, key ):
+        if key in self.map:
+            return self.map[key]
+        else:
+            print >> sys.stderr, "'%s' does not appear in location file '%s'" % (key, self.filename)
+            sys.exit(1)
+
+class ChrLens( object ):
+    def __init__( self, location_file, species ):
+        self.chrlen_loc = LocationFile( location_file )
+        self.chrlen_filename = self.chrlen_loc.loc_file( species )
+        self.build_map()
+
+    def build_map(self):
+        self.map = {}
+        self.open_file(self.chrlen_filename)
+        for line in self.read_lines():
+            elems = line.split('\t', 1)
   if len(elems) == 2:\n+                chrom = elems[0].strip()\n+                chrom_len_text = elems[1].strip()\n+                try:\n+                    chrom_len = int( chrom_len_text )\n+                except ValueError:\n+                    print >> sys.stderr, "Bad length \'%s\' for chromosome \'%s\' in \'%s\'" % (chrom_len_text, chrom, self.chrlen_filename)\n+                self.map[ chrom ] = chrom_len\n+        self.close_file()\n+\n+    def read_lines(self):\n+        for line in self.fh:\n+            if not line.startswith(\'#\'):\n+                line = line.rstrip(\'\\r\\n\')\n+                yield line\n+\n+    def open_file(self, filename):\n+        self.filename = filename\n+        try:\n+            self.fh = open(filename, \'r\')\n+        except IOError, err:\n+            print >> sys.stderr, "Error opening chromosome length file \'%s\': %s" % (filename, str(err))\n+            sys.exit(1)\n+\n+    def close_file(self):\n+        self.fh.close()\n+\n+    def length( self, key ):\n+        if key in self.map:\n+            return self.map[key]\n+        else:\n+            return None\n+\n+    def __iter__( self ):\n+        for chrom in self.map:\n+            yield chrom\n+\n'
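A minimal sketch (not part of the changeset) of the bracketed SNP notation that
get_flanking_dna() parses above: each snpcalls entry embeds the SNP as "[X/Y]"
in the sequence, the two alleles sit at offsets p+1 and p+3, and the SNP site
is masked as "n" in the emitted flanking DNA.  The sample sequence and header
fields here are hypothetical::

    seq = "ACGTACGT[A/C]GGATCCA"   # hypothetical snpcalls entry

    p = seq.find('[')              # left bracket marks the SNP
    q = seq.find(']', p + 1) + 1   # one past the right bracket
    allele1, allele2 = seq[p + 1], seq[p + 3]

    # FASTA-style header plus the flanking DNA with the SNP masked as "n",
    # mirroring the "%sn%s" join in get_flanking_dna()
    print('> chr1 12345 %s %s' % (allele1, allele2))
    print('%sn%s' % (seq[:p], seq[q:]))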
diff -r 000000000000 -r 9071e359b9a3 tools/genome_diversity/select_restriction_enzymes.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genome_diversity/select_restriction_enzymes.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,104 @@
+#!/usr/bin/env python2.5
+
+import os
+import sys
+import traceback
+from optparse import OptionParser
+import genome_diversity as gd
+
+def main_function( parse_arguments=None ):
+    if parse_arguments is None:
+        parse_arguments = lambda arguments: ( None, arguments )
+    def main_decorator( to_decorate ):
+        def decorated_main( arguments=None ):
+            if arguments is None:
+                arguments = sys.argv
+            options, arguments = parse_arguments( arguments )
+            rc = 1
+            try:
+                rc = to_decorate( options, arguments )
+            except Exception, err:
+                sys.stderr.write( 'ERROR: %s\n' % str( err ) )
+                traceback.print_exc()
+            finally:
+                sys.exit( rc )
+        return decorated_main
+    return main_decorator
+
+def parse_arguments( arguments ):
+    parser = OptionParser()
+    parser.add_option('--input',
+                        type='string', dest='input',
+                        help='file of selected SNPs')
+    parser.add_option('--output',
+                        type='string', dest='output',
+                        help='output file')
+    parser.add_option('--primers_loc',
+                        type='string', dest='primers_loc',
+                        help='primers .loc file')
+    parser.add_option('--scaffold_col',
+                        type="int", dest='scaffold_col',
+                        help='scaffold column in the input file')
+    parser.add_option('--pos_col',
+                        type="int", dest='pos_col',
+                        help='position column in the input file')
+    parser.add_option('--enzyme_list',
+                        type="string", dest='enzyme_list_string',
+                        help='comma separated list of enzymes')
+    parser.add_option('--species',
+                        type="string", dest='species',
+                        help='species')
+    return parser.parse_args( arguments[1:] )
+
+
+@main_function( parse_arguments )
+def main( options, arguments ):
+    if not options.input:
+        raise RuntimeError( 'missing --input option' )
+    if not options.output:
+        raise RuntimeError( 'missing --output option' )
+    if not options.primers_loc:
+        raise RuntimeError( 'missing --primers_loc option' )
+    if not options.scaffold_col:
+        raise RuntimeError( 'missing --scaffold_col option' )
+    if not options.pos_col:
+        raise RuntimeError( 'missing --pos_col option' )
+    if not options.enzyme_list_string:
+        raise RuntimeError( 'missing --enzyme_list option' )
+    if not options.species:
+        raise RuntimeError( 'missing --species option' )
+    
+    snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) )
+
+    out_fh = gd._openfile( options.output, 'w' )
+
+    enzyme_dict = {}
+    for enzyme in options.enzyme_list_string.split( ',' ):
+        enzyme = enzyme.strip()
+        if enzyme:
+            enzyme_dict[enzyme] = 1
+
+    primer_data_file = gd.get_filename_from_loc( options.species, options.primers_loc )
+    file_root, file_ext = os.path.splitext( primer_data_file )
+    primer_index_file = file_root + ".cdb"
+    primers = gd.PrimersFile( data_file=primer_data_file, index_file=primer_index_file )
+
+    comments_printed = False
+
+    while snps.next():
+        seq, pos = snps.get_seq_pos()
+        enzyme_list = primers.get_enzymes( seq, pos )
+        for enzyme in enzyme_list:
+            if enzyme in enzyme_dict:
+                if not comments_printed:
+                    for comment in snps.comments:
+                        out_fh.write( "%s\n" % comment )
+                    comments_printed = True
+                out_fh.write( "%s\n" % snps.line )
+                break
+
+    out_fh.close()
+
+if __name__ == "__main__":
+    main()
+
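The main_function decorator above is reusable wiring: parse_arguments runs
first, the wrapped main receives (options, arguments), and its return code
becomes the process exit status.  A self-contained sketch with a trivial,
hypothetical main, following the simpler select_snps.py variant below
(without the traceback handler)::

    import sys

    def main_function(parse_arguments=None):
        if parse_arguments is None:
            parse_arguments = lambda arguments: (None, arguments)
        def main_decorator(to_decorate):
            def decorated_main(arguments=None):
                if arguments is None:
                    arguments = sys.argv
                options, arguments = parse_arguments(arguments)
                sys.exit(to_decorate(options, arguments))
            return decorated_main
        return main_decorator

    @main_function()
    def main(options, arguments):
        print('%d argument(s) after the program name' % (len(arguments) - 1))
        return 0

    if __name__ == '__main__':
        main()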
diff -r 000000000000 -r 9071e359b9a3 tools/genome_diversity/select_restriction_enzymes.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genome_diversity/select_restriction_enzymes.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,90 @@
+<tool id="gd_select_restriction_enzymes" name="Specify" version="1.0.0">
+  <description>a set of restriction enzymes</description>
+
+  <command interpreter="python2.5">
+    select_restriction_enzymes.py "--input=$input" "--output=$output" "--primers_loc=${GALAXY_DATA_INDEX_DIR}/gd.primers.loc"
+    #if $override_metadata.choice == "0":
+      "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}"
+    #else
+      "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species"
+    #end if
+    "--enzyme_list=$enzymes"
+  </command>
+
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Selected SNPs dataset"/>
+    <conditional name="override_metadata">
+      <param name="choice" type="select" format="integer" label="choose columns">
+        <option value="0" selected="true">No, get columns from metadata</option>
+        <option value="1" >Yes, choose columns</option>
+      </param>
+      <when value="0">
+        <!-- no options -->
+      </when>
+      <when value="1">
+        <param name="scaf_col" type="data_column" data_ref="input" numerical="false" label="Column with scaffold"/>
+        <param name="pos_col" type="data_column" data_ref="input" numerical="true" label="Column with position"/>
+        <param name="species" type="select" label="Choose species">
+          <options from_file="gd.species.txt">
+            <column name="name" index="1"/>
+            <column name="value" index="0"/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+
+    <param name="enzymes" type="select" display="checkboxes" multiple="true" label="Choose enzymes">
+        <options from_file="gd.restriction_enzymes.txt">
+            <column name="name" index="0"/>
+            <column name="value" index="1"/>
+        </options>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="wsf" name="output" metadata_source="input"/>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="gd.sample.wsf" ftype="wsf"/>
+      <param name="choice" value="0"/>
+      <param name="enzymes" value="BanI,BstOI,Hsp92II"/>
+      <output name="output" file="gd.select_restriction_enzymes.wsf"/>
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+  It selects the SNPs that are differentially cut by at least one of the
+  specified restriction enzymes. The enzymes are required to cut the amplified
+  segment (for the specified PCR primers) only at the SNP.
+
+-----
+
+**Example**
+
+- input file::
+
+    chr2_75111355_75112576    314  A  C  L  F  chr2   75111676  C  F  15  4  53   2   9  48   Y  96   0.369  0.355  0.396  0
+    chr8_93901796_93905612   2471  A  C  A  A  chr8   93904264  A  A  8   0  51   10  2  14   Y  961  0.016  0.534  0.114  2
+    chr10_7434473_7435447    524   T  C  S  S  chr10  7435005   T  S  11  5  90   14  0  69   Y  626  0.066  0.406  0.727  0
+    chr14_80021455_80022064  138   G  A  H  H  chr14  80021593  G  H  14  0  69   9   6  124  Y  377  0.118  0.997  0.195  1
+    chr15_64470252_64471048  89    G  A  Y  Y  chr15  64470341  G  Y  5   6  109  14  0  69   Y  312  0.247  0.998  0.393  0
+    chr18_48070585_48071386  514   C  T  E  K  chr18  48071100  T  K  7   7  46   14  0  69   Y  2    0.200  0.032  0.163  0
+    chr18_50154905_50155664  304   A  G  Y  C  chr18  50155208  A  Y  4   2  17   5   1  22   Y  8    0.022  0.996  0.128  0
+    chr18_57379354_57380496  315   C  T  V  V  chr18  57379669  G  V  11  0  60   9   6  62   Y  726  0.118  0.048  0.014  1
+    chr19_14240610_14242055  232   C  T  A  V  chr19  14240840  C  A  18  8  56   15  5  42   Y  73   0.003  0.153  0.835  0
+    chr19_39866997_39874915  3117  C  T  P  P  chr19  39870110  C  P  3   7  65   14  2  32   Y  6    0.321  0.911  0.462  4
+    etc.
+
+- output file::
+
+    chr8_93901796_93905612   2471  A  C  A  A  chr8   93904264  A  A  8   0  51   10  2  14   Y  961  0.016  0.534  0.114  2
+    chr14_80021455_80022064  138   G  A  H  H  chr14  80021593  G  H  14  0  69   9   6  124  Y  377  0.118  0.997  0.195  1
+    chr18_57379354_57380496  315   C  T  V  V  chr18  57379669  G  V  11  0  60   9   6  62   Y  726  0.118  0.048  0.014  1
+    chr19_39866997_39874915  3117  C  T  P  P  chr19  39870110  C  P  3   7  65   14  2  32   Y  6    0.321  0.911  0.462  4
+    etc.
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/genome_diversity/select_snps.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genome_diversity/select_snps.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+
+import sys
+import math
+from optparse import OptionParser
+import genome_diversity as gd
+
+def main_function(parse_arguments=None):
+    if parse_arguments is None:
+        parse_arguments = lambda arguments: (None, arguments)
+    def main_decorator(to_decorate):
+        def decorated_main(arguments=None):
+            if arguments is None:
+                arguments = sys.argv
+            options, arguments = parse_arguments(arguments)
+            sys.exit(to_decorate(options, arguments))
+        return decorated_main
+    return main_decorator
+
+def parse_arguments(arguments):
+    parser = OptionParser()
+    parser.add_option('--input', dest='input')
+    parser.add_option('--output', dest='output')
+    parser.add_option('--chrlens_loc', dest='chrlens_loc')
+    parser.add_option('--num_snps', dest='num_snps')
+    parser.add_option('--ref_chrom_col', dest='ref_chrom_col')
+    parser.add_option('--ref_pos_col', dest='ref_pos_col')
+    parser.add_option('--species', dest='species')
+    return parser.parse_args(arguments[1:])
+
+@main_function(parse_arguments)
+def main(options, arguments):
+
+    ref_chrom_idx = to_int( options.ref_chrom_col ) -1
+    ref_pos_idx = to_int( options.ref_pos_col ) -1
+
+    if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx):
+        print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset."
+        sys.exit(1)
+
+    chrlens = gd.ChrLens( options.chrlens_loc, options.species )
+
+    total_len = 0
+    for chrom in chrlens:
+        total_len += chrlens.length(chrom)
+
+    total_requested = int( options.num_snps )
+    lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx )
+    selected = select_snps( data, total_len, total_requested )
+    out_data = fix_selection_and_order_like_input(data, selected, total_requested)
+    write_selected_snps( options.output, out_data, lines, comments )
+
+def to_int( value ):
+    try:
+        int_value = int( value )
+    except ValueError:
+        int_value = 0
+    return int_value
+
+def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ):
+    fh = open( filename, 'r' )
+    if (chrom_idx >= pos_idx):
+        needed = chrom_idx + 1
+    else:
+        needed = pos_idx + 1
+    lines = []
+    data = []
+    comments = []
+    line_idx = 0
+    line_num = 0
+    for line in fh:
+        line_num += 1
+        line = line.rstrip('\r\n')
+        if line:
+            if line.startswith('#'):
+                comments.append(line)
+            else:
+                elems = line.split('\t')
+                if len(elems) >= needed:
+                    chrom = elems[chrom_idx]
+                    try:
+                        pos = int(elems[pos_idx])
+                    except ValueError:
+                        sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) )
+                        sys.exit(1)
+                    lines.append(line)
+                    chrom_sort = chrom.lstrip('chr')
+                    data.append( [chrom_sort, chrom, pos, line_num, line_idx] )
+                    line_idx += 1
+    fh.close()
+    data = sorted( data, key=lambda x: (x[0], x[2]) )
+    return lines, data, comments
+
+def select_snps( data, total_len, requested ):
+    old_chrom = None
+    next_print = 0
+    selected = []
+    space = total_len / requested
+    for data_idx, datum in enumerate( data ):
+        chrom = datum[1]
+        pos = datum[2]
+        if chrom != old_chrom:
+            old_chrom = chrom
+            next_print = 0
+        if pos >= next_print:
+            selected.append(data_idx)
+            next_print += space
+    return selected
+
+def fix_selection_and_order_like_input(data, selected, requested):
+    total_selected = len( selected )
+    a = float( total_selected ) / requested
+    b = a / 2
+
+    idx_list = []
+    for i in range( requested ):
+        idx = int( math.ceil( i * a + b ) - 1 )
+        idx_list.append( idx )
+
+    out_data = []
+
+    for i, data_idx in enumerate(selected):
+        if total_selected > requested:
+            if i in idx_list:
+                out_data.append(data[data_idx])
+        else:
+            out_data.append(data[data_idx])
+
+    out_data = sorted( out_data, key=lambda x: x[3] )
+
+    return out_data
+
+def write_selected_snps( filename, data, lines, comments ):
+    fh = open( filename, 'w' )
+
+    for comment in comments:
+        fh.write("%s\n" % comment )
+
+    for datum in data:
+        line_idx = datum[4]
+        fh.write("%s\n" % lines[line_idx])
+
+    fh.close()
+
+if __name__ == "__main__":
+    main()
+
+
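The selection above runs in two passes: the first walks the SNPs in sorted
order and keeps the first one at or past each threshold, advancing the
threshold by space = total_len / requested (the threshold resets on every new
chromosome, so more than the requested number can survive); the second pass
then thins the survivors evenly.  A worked sketch with made-up numbers::

    import math

    # pass 1: one SNP per "space" window along a single chromosome
    positions = [100, 900, 1000, 2100, 2500, 3800, 3900]   # sorted positions
    total_len, requested = 4000, 3
    space = total_len // requested          # 1333 bases between selections

    selected, next_print = [], 0
    for idx, pos in enumerate(positions):
        if pos >= next_print:
            selected.append(idx)
            next_print += space
    print(selected)                         # [0, 3, 5]

    # pass 2: even thinning when pass 1 over-selects
    selected = [0, 3, 5, 8, 9]              # pretend 5 survivors, 3 requested
    a = float(len(selected)) / requested    # 5/3
    b = a / 2
    idx_list = [int(math.ceil(i * a + b) - 1) for i in range(requested)]
    print([selected[i] for i in idx_list])  # [0, 5, 9]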
diff -r 000000000000 -r 9071e359b9a3 tools/genome_diversity/select_snps.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/genome_diversity/select_snps.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,87 @@
+<tool id="gd_select_snps" name="Select" version="1.0.0">
+  <description>a specified number of SNPs</description>
+
+  <command interpreter="python">
+    select_snps.py "--input=$input" "--output=$output" "--chrlens_loc=${GALAXY_DATA_INDEX_DIR}/gd.chrlens.loc" "--num_snps=$num_snps"
+    #if $override_metadata.choice == "0":
+      "--ref_chrom_col=${input.metadata.ref}" "--ref_pos_col=${input.metadata.rPos}" "--species=${input.metadata.species}"
+    #else
+      "--ref_chrom_col=$ref_col" "--ref_pos_col=$rpos_col" "--species=$species"
+    #end if
+  </command>
+
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Selected SNPs dataset"/>
+    <param name="num_snps" type="integer" value="10" optional="false" min="1" label="Number of SNPs"/>
+    <conditional name="override_metadata">
+      <param name="choice" type="select" format="integer" label="choose columns">
+        <option value="0" selected="true">No, get columns from metadata</option>
+        <option value="1" >Yes, choose columns</option>
+      </param>
+      <when value="0">
+        <!-- no options -->
+      </when>
+      <when value="1">
+        <param name="ref_col" type="data_column" data_ref="input" numerical="false" label="Column with reference chromosome"/>
+        <param name="rpos_col" type="data_column" data_ref="input" numerical="true" label="Column with reference position"/>
+        <param name="species" type="select" label="Choose species">
+          <options from_file="gd.species.txt">
+            <column name="name" index="1"/>
+            <column name="value" index="0"/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="wsf" name="output" metadata_source="input"/>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="gd.sample.wsf" ftype="wsf"/>
+      <param name="num_snps" value="5"/>
+      <param name="choice" value="0"/>
+      <output name="output" file="gd.select_snps.wsf"/>
+    </test>
+  </tests>
+
+
+  <help>
+**What it does**
+
+  It attempts to select a specified number of SNPs from the dataset, making them
+  approximately uniformly spaced relative to the reference genome. The number
+  actually selected may be slightly more than the specified number.
+
+-----
+
+**Example**
+
+- input file::
+
+    chr2_75111355_75112576    314  A  C  L  F  chr2   75111676  C  F  15  4  53   2   9  48   Y  96   0.369  0.355  0.396  0
+    chr8_93901796_93905612   2471  A  C  A  A  chr8   93904264  A  A  8   0  51   10  2  14   Y  961  0.016  0.534  0.114  2
+    chr10_7434473_7435447    524   T  C  S  S  chr10  7435005   T  S  11  5  90   14  0  69   Y  626  0.066  0.406  0.727  0
+    chr14_80021455_80022064  138   G  A  H  H  chr14  80021593  G  H  14  0  69   9   6  124  Y  377  0.118  0.997  0.195  1
+    chr15_64470252_64471048  89    G  A  Y  Y  chr15  64470341  G  Y  5   6  109  14  0  69   Y  312  0.247  0.998  0.393  0
+    chr18_48070585_48071386  514   C  T  E  K  chr18  48071100  T  K  7   7  46   14  0  69   Y  2    0.200  0.032  0.163  0
+    chr18_50154905_50155664  304   A  G  Y  C  chr18  50155208  A  Y  4   2  17   5   1  22   Y  8    0.022  0.996  0.128  0
+    chr18_57379354_57380496  315   C  T  V  V  chr18  57379669  G  V  11  0  60   9   6  62   Y  726  0.118  0.048  0.014  1
+    chr19_14240610_14242055  232   C  T  A  V  chr19  14240840  C  A  18  8  56   15  5  42   Y  73   0.003  0.153  0.835  0
+    chr19_39866997_39874915  3117  C  T  P  P  chr19  39870110  C  P  3   7  65   14  2  32   Y  6    0.321  0.911  0.462  4
+    etc.
+
+- output file::
+
+    chr2_75111355_75112576    314  A  C  L  F  chr2   75111676  C  F  15  4  53   2   9  48   Y  96   0.369  0.355  0.396  0
+    chr8_93901796_93905612   2471  A  C  A  A  chr8   93904264  A  A  8   0  51   10  2  14   Y  961  0.016  0.534  0.114  2
+    chr10_7434473_7435447    524   T  C  S  S  chr10  7435005   T  S  11  5  90   14  0  69   Y  626  0.066  0.406  0.727  0
+    chr14_80021455_80022064  138   G  A  H  H  chr14  80021593  G  H  14  0  69   9   6  124  Y  377  0.118  0.997  0.195  1
+    chr15_64470252_64471048  89    G  A  Y  Y  chr15  64470341  G  Y  5   6  109  14  0  69   Y  312  0.247  0.998  0.393  0
+    chr18_48070585_48071386  514   C  T  E  K  chr18  48071100  T  K  7   7  46   14  0  69   Y  2    0.200  0.032  0.163  0
+    chr19_14240610_14242055  232   C  T  A  V  chr19  14240840  C  A  18  8  56   15  5  42   Y  73   0.003  0.153  0.835  0
+    etc.
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/BEAM2_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/BEAM2_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+#
+# Galaxy wrapper for Yu Zhang's BEAM2.  It adds two new options:
+#  significance=foo    renames significance.txt to foo after BEAM2 is run
+#  posterior=bar       renames posterior.txt    to bar after BEAM2 is run
+# 
+
+set -e
+
+export PATH=$PATH:$(dirname $0)
+
+## options
+significance=
+posterior=
+new_args=
+map=
+ped=
+
+TFILE="/tmp/BEAM2.$$.tmp"
+
+## separate significance and posterior arguments from arguments to BEAM2
+until [ $# -eq 0 ]
+do
+  case $1 in
+    significance=*)
+      significance=${1#significance=}
+      ;;
+    posterior=*)
+      posterior=${1#posterior=}
+      ;;
+    map=*)
+      map=${1#map=}
+      ;;
+    ped=*)
+      ped=${1#ped=}
+      ;;
+    *)
+      if [ -z "$new_args" ]; then
+        new_args=$1
+      else
+        new_args="$new_args $1"
+      fi
+      ;;
+  esac
+
+  shift
+done
+
+## convert input for use with BEAM2
+lped_to_geno.pl $map $ped > $TFILE
+if [ $? -ne 0 ]; then
+  echo "failed: lped_to_geno.pl $map $ped > $TFILE"
+  exit 1
+fi
+
+## run BEAM2
+BEAM2 $TFILE $new_args 1>/dev/null
+if [ $? -ne 0 ]; then
+  echo "failed: BEAM2 $TFILE $new_args"
+  exit 1
+fi
+
+mergeSnps.pl significance.txt $TFILE
+if [ $? -ne 0 ]; then
+  echo "failed: mergeSnps.pl significance.txt $TFILE"
+  exit 1
+fi
+
+## move output files
+mv significance.txt $significance
+mv posterior.txt $posterior
+
+## cleanup
+rm -f $TFILE
+
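The loop above peels the wrapper's own key=value options off the command line
and forwards everything else to BEAM2 in its original order.  A rough Python
rendering of that argument handling, for illustration only::

    import sys

    significance = posterior = map_file = ped_file = None
    passthrough = []
    for arg in sys.argv[1:]:
        if arg.startswith('significance='):
            significance = arg[len('significance='):]
        elif arg.startswith('posterior='):
            posterior = arg[len('posterior='):]
        elif arg.startswith('map='):
            map_file = arg[len('map='):]
        elif arg.startswith('ped='):
            ped_file = arg[len('ped='):]
        else:
            passthrough.append(arg)       # goes to BEAM2 unchanged

    print('BEAM2 args: %s' % ' '.join(passthrough))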
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/beam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/beam.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,137 @@
+<tool id="hgv_beam" name="BEAM" version="1.0.0">
+  <description>significant single- and multi-locus SNP associations in case-control studies</description>
+
+  <command interpreter="bash">
+    BEAM2_wrapper.sh map=${input.extra_files_path}/${input.metadata.base_name}.map ped=${input.extra_files_path}/${input.metadata.base_name}.ped $burnin $mcmc $pvalue significance=$significance posterior=$posterior
+  </command>
+
+  <inputs>
+    <param format="lped" name="input" type="data" label="Dataset"/>
+    <param name="burnin" label="Number of MCMC burn-in steps" type="integer" value="200" />
+    <param name="mcmc" label="Number of MCMC sampling steps" type="integer" value="200" />
+    <param name="pvalue" label="Significance cutoff (after Bonferroni adjustment)" type="float" value="0.05" />
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="significance" />
+    <data format="tabular" name="posterior" />
+  </outputs>
+
+  <requirements>
+    <requirement type="package">beam</requirement>
+    <requirement type="binary">mv</requirement>
+    <requirement type="binary">rm</requirement>
+  </requirements>
+
+  <!-- broken.  will be fixed soon.
+  <tests>
+    <test>
+      <param name='input' value='gpass_and_beam_input' ftype='lped' >
+        <metadata name='base_name' value='gpass_and_beam_input' />
+        <composite_data value='gpass_and_beam_input.ped' />
+        <composite_data value='gpass_and_beam_input.map' />
+        <edit_attributes type='name' value='gpass_and_beam_input' />
+      </param>
+      <param name="burnin" value="200"/>
+      <param name="mcmc" value="200"/>
+      <param name="pvalue" value="0.05"/>
+      <output name="significance" file="beam_output1.tab"/>
+      <output name="posterior" file="beam_output2.tab"/>
+    </test>
+  </tests>
+  -->
+
+  <help>
+.. class:: infomark
+
+This tool can take a long time to run, depending on the number of SNPs, the
+sample size, and the number of MCMC steps specified.  If you have hundreds
+of thousands of SNPs, it may take over a day.  The main tasks that slow down
+this tool are searching for interactions and dynamically partitioning the
+SNPs into blocks.  Optimization is certainly possible, but hasn't been done
+yet.  **If your only interest is to detect SNPs with primary effects (i.e.,
+single-SNP associations), please use the GPASS tool instead.**
+
+-----
+
+**Dataset formats**
+
+The input dataset must be in lped_ format.  The output datasets are both tabular_.
+(`Dataset missing?`_)
+
+.. _lped: ./static/formatHelp.html#lped
+.. _tabular: ./static/formatHelp.html#tabular
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+BEAM (Bayesian Epistasis Association Mapping) uses a Markov chain Monte Carlo (MCMC) method to infer SNP block structures and to detect both single-marker
+and interaction effects from case-control SNP data.
+It also partitions the SNPs into blocks based on linkage disequilibrium (LD).  Because the method is Bayesian, the outputs are posterior probabilities of association along with the block partitions, which provides uncertainty measures for both.  The method scales well from small to large sample sizes, and it is powerful in detecting gene-gene interactions, although slow for large datasets.
+
+-----
+
+**Example**
+
+- input map file::
+
+    1  rs0  0  738547
+    1  rs1  0  5597094
+    1  rs2  0  9424115
+    etc.
+
+- input ped file::
+
+    1 1 0 0 1  1  G G  A A  A A  A A  A A  A G  A A  G G  G G  A A  G G  G G  G G  A A  A A  A G  A A  G G  A G  A G  A A  G G  A A  G G  A A  G G  A G  A A  G G  A A  G G  A A  A G  A G  G G  A G  G G  G G  A A  A G  A A  G G  G G  G G  G G  A G  A A  A A  A A  A A
+    1 1 0 0 1  1  G G  A G  G G  A A  A A  A G  A A  G G  G G  G G  A A  G G  A G  A G  G G  G G  A G  G G  A G  A A  G G  A G  G G  A A  G G  G G  A G  A G  G G  A G  A A  A A  G G  G G  A G  A G  G G  A G  A A  A A  A G  G G  A G  G G  A G  G G  G G  A A  G G  A G
+    etc.
+
+- first output file, significance.txt::
+
+    ID   chr   position  results
+    rs0  chr1  738547    10 20 score= 45.101397 , df= 8 , p= 0.000431 , N=1225
+
+- second output file, posterior.txt::
+
+    id:  chr position  marginal + interaction = total posterior
+    0:   1 738547      0.0000 + 0.0000 = 0.0000
+    1:   1 5597094     0.0000 + 0.0000 = 0.0000
+    2:   1 9424115     0.0000 + 0.0000 = 0.0000
+    3:   1 13879818    0.0000 + 0.0000 = 0.0000
+    4:   1 13934751    0.0000 + 0.0000 = 0.0000
+    5:   1 16803491    0.0000 + 0.0000 = 0.0000
+    6:   1 17236854    0.0000 + 0.0000 = 0.0000
+    7:   1 18445387    0.0000 + 0.0000 = 0.0000
+    8:   1 21222571    0.0000 + 0.0000 = 0.0000
+    etc.
+
+    id:  chr position block_boundary  | allele counts in cases and controls
+    0:   1 738547      1.000          | 156 93 251 | 169 83 248 
+    1:   1 5597094     1.000          | 323 19 158 | 328 16 156 
+    2:   1 9424115     1.000          | 366 6 128 | 369 11 120 
+    3:   1 13879818    1.000          | 252 31 217 | 278 32 190 
+    4:   1 13934751    1.000          | 246 64 190 | 224 58 218 
+    5:   1 16803491    1.000          | 91 160 249 | 91 174 235 
+    6:   1 17236854    1.000          | 252 43 205 | 249 44 207 
+    7:   1 18445387    1.000          | 205 66 229 | 217 56 227 
+    8:   1 21222571    1.000          | 353 9 138 | 352 8 140 
+    etc.
+
+  The "id" field is an internally used index.
+
+-----
+
+**References**
+
+Zhang Y, Liu JS. (2007)
+Bayesian inference of epistatic interactions in case-control studies.
+Nat Genet. 39(9):1167-73. Epub 2007 Aug 26.
+
+Zhang Y, Zhang J, Liu JS. (2010)
+Block-based bayesian epistasis association mapping with application to WTCCC type 1 diabetes data.
+Submitted.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/ctd.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/ctd.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,80 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use LWP::UserAgent;
+require HTTP::Cookies;
+
+#######################################################
+# ctd.pl 
+# Submit a batch query to CTD and fetch results into galaxy history
+# usage: ctd.pl inFile idCol inputType resultType actionType outFile
+#######################################################
+
+if (!@ARGV or scalar @ARGV != 6) {
+   print "usage: ctd.pl inFile idCol inputType resultType actionType outFile\n";
+   exit;
+}
+
+my $in = shift @ARGV;
+my $col = shift @ARGV;
+if ($col < 1) {
+   print "The column number is with a 1 start\n";
+   exit 1;
+}
+my $type = shift @ARGV;
+my $resType = shift @ARGV;
+my $actType = shift @ARGV;
+my $out = shift @ARGV;
+
+my @data;
+open(FH, $in) or die "Couldn't open $in, $!\n";
+while (<FH>) {
+   chomp;
+   my @f = split(/\t/);
+   if (scalar @f < $col) { 
+      print "ERROR the requested column is not in the file $col\n";
+      exit 1;
+   }
+   push(@data, $f[$col-1]);
+}
+close FH or die "Couldn't close $in, $!\n";
+
+my $url = 'http://ctd.mdibl.org/tools/batchQuery.go';
+#my $url = 'http://globin.bx.psu.edu/cgi-bin/print-query';
+my $d = join("\n", @data);
+#a list maintains order, whereas a hash doesn't; order matters at CTD
+#posting the input file itself gives a "can't find file" error,
+#so the IDs are sent inline instead:
+#my @form = ('inputType', $type, 'inputTerms', '', 'report', $resType, 
+   #'queryFile', [$in, ''], 'queryFileColumn', $col, 'format', 'tsv', 'action', 'Submit');
+my @form = ('inputType', $type, 'inputTerms', $d, 'report', $resType,
+  'queryFile', '', 'format', 'tsv', 'action', 'Submit');
+if ($resType eq 'cgixns') { #only add if this type
+   push(@form, 'actionTypes', $actType);
+}
+my $ua = LWP::UserAgent->new;
+$ua->cookie_jar(HTTP::Cookies->new( () ));
+$ua->agent('Mozilla/5.0');
+my $page = $ua->post($url, \@form, 'Content_Type'=>'form-data');
+if ($page->is_success) {
+   open(FH, ">", $out) or die "Couldn't open $out, $!\n";
+   print FH "#";
+   print FH $page->content, "\n";
+   close FH or die "Couldn't close $out, $!\n";
+}else {
+   print "ERROR failed to get page from CTD, ", $page->status_line, "\n";
+   print $page->content, "\n";
+   my $req = $page->request();
+   print "Requested \n";
+   foreach my $k(keys %$req) { 
+      if ($k eq '_headers') {
+         my $t = $req->{$k};
+         foreach my $k2 (keys %$t) { print "$k2 => $t->{$k2}\n"; }
+      }else { print "$k => $req->{$k}\n"; }
+   }
+   exit 1;
+}
+exit;
+
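For illustration, a rough Python equivalent of the form post ctd.pl performs;
the field names and sample values are taken from the Perl above and the
examples in ctd.xml below, but the 2012-era endpoint may no longer respond::

    try:                                         # Python 3
        from urllib.parse import urlencode
        from urllib.request import urlopen
    except ImportError:                          # Python 2
        from urllib import urlencode, urlopen

    url = 'http://ctd.mdibl.org/tools/batchQuery.go'
    form = [('inputType', 'gene'), ('inputTerms', 'HBB'),
            ('report', 'diseases'), ('queryFile', ''),
            ('format', 'tsv'), ('action', 'Submit')]

    response = urlopen(url, urlencode(form).encode('ascii'))
    print(response.read()[:200])                 # start of the TSV reply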
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/ctd.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/ctd.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,320 @@
+<tool id="hgv_ctdBatch" name="CTD" version="1.0.0">
+  <description>analysis of chemicals, diseases, or genes</description>
+
+  <command interpreter="perl">
+    ctd.pl $input $numerical_column $inType.inputType
+    #if $inType.inputType == "disease"
+      $inType.report ANY
+    #else if $inType.reportType.report == "cgixns"
+      $inType.reportType.report $inType.reportType.actType
+    #else
+      $inType.reportType.report ANY
+    #end if
+    $out_file1
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Dataset" />
+    <param name="numerical_column" type="data_column" data_ref="input" label="Column with identifiers" />
+    <conditional name="inType">
+      <param name="inputType" label="Identifier type" type="select">
+        <option value="chem">Chemicals (MeSH names, synonyms or accession IDs, or CAS RNs)</option>
+        <option value="disease">Diseases (MeSH or OMIM names, synonyms or accession IDs)</option>
+        <option value="gene" selected="true">Genes (NCBI official symbols or accession IDs)</option>
+      </param>
+      <when value="chem">
+        <conditional name='reportType'>
+          <param name="report" label="Data to extract" type="select">
+            <option value="cgixns">Curated chemical-gene interactions</option>
+            <option value="genes">Curated gene associations</option>
+            <option value="pathways">Pathway associations</option>
+            <option value="diseases" selected="true">All disease relationships</option>
+            <option value="diseases_direct">  Direct disease relationships only</option>
+            <option value="diseases_inferred">  Inferred disease relationships only</option>
+            <option value="go">All GO associations</option>
+            <option value="go_p">  GO biological Processes only</option>
+            <option value="go_f">  GO molecular Functions only</option>
+            <option value="go_c">  GO cellular Components only</option>
+          </param>
+          <when value="genes">
+            <!-- do nothing -->
+          </when>
+          <when value="pathways">
+            <!-- do nothing -->
+          </when>
+          <when value="diseases">
+            <!-- do nothing -->
+          </when>
+          <when value="diseases_direct">
+            <!-- do nothing -->
+          </when>
+          <when value="diseases_inferred">
+            <!-- do nothing -->
+          </when>
+          <when value="go">
+            <!-- do nothing -->
+          </when>
+          <when value="go_p">
+            <!-- do nothing -->
+          </when>
+          <when value="go_f">
+            <!-- do nothing -->
+          </when>
+          <when value="go_c">
+            <!-- do nothing -->
+          </when>
+          <when value="cgixns">
+            <param name="actType" label="Interaction type" type="select">
+              <option value="ANY">ANY</option>
+              <option value="abundance">abundance</option>
+              <option value="activity">activity</option>
+              <option value="binding">binding</option>
+              <option value="cotreatment">cotreatment</option>
+              <option value="expression">expression</option>
+              <option value="folding">folding</option>
+              <option value="localization">localization</option>
+              <option value="metabolic processing">metabolic processing</option>
+              <option value="acetylation">- acetylation</option>
+              <option value="acylation">- acylation</option>
+              <option value="alkylation">- alkylation</option>
+              <option value="amination">- amination</option>
+              <option value="carbamoylation">- carbamoylation</option>
+              <option value="carboxylation">- carboxylation</option>
+              <option value="chemical synthesis">- chemical synthesis</option>
+              <option value="degradat
[...]
ion>
+              <option value="ADP-ribosylation">    - ADP-ribosylation</option>
+              <option value="sulfation">- sulfation</option>
+              <option value="sumoylation">- sumoylation</option>
+              <option value="ubiquitination">- ubiquitination</option>
+              <option value="mutagenesis">mutagenesis</option>
+              <option value="reaction">reaction</option>
+              <option value="response to chemical">response to chemical</option>
+              <option value="splicing">splicing</option>
+              <option value="stability">stability</option>
+              <option value="transport">transport</option>
+              <option value="secretion">- secretion</option>
+              <option value="export">    - export</option>
+              <option value="uptake">- uptake</option>
+              <option value="import">    - import</option>
+            </param>
+          </when>
+        </conditional>
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+
+  <!-- broken for now.  will be fixed soon.
+  <tests>
+    <test>
+      <param name="input" ftype="tabular" value="ctdBatchInput.txt" />
+      <param name="numerical_column" value="1" />
+      <param name="inputType" value="gene" />
+      <param name="report" value="diseases" />
+      <output name="out_file1" file="ctdBatchOut.txt" />
+    </test>
+  </tests>
+  -->
+
+  <help>
+**Dataset formats**
+
+The input and output datasets are tabular_.
+(`Dataset missing?`_)
+
+.. _tabular: ./static/formatHelp.html#tab
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+This tool extracts data related to the provided list of identifiers
+from the Comparative Toxicogenomics Database (CTD).  The fields
+extracted vary with the type of data requested; the first row
+of the output identifies the columns.
+
+For the curated chemical-gene interactions, you can also choose the
+interaction type from the search-and-select box.  The choices that
+start with '-' are a subset of a choice above them; you can choose
+either the general interaction type or a more specific one.
+
+Website: http://ctd.mdibl.org/
+
+-----
+
+**Examples**
+
+- input data file:
+    HBB
+
+- select Column = c1, Identifier type = Genes, and Data to extract = All disease relationships
+
+- output file::
+
+    #Input  GeneSymbol  GeneName          GeneID  DiseaseName                  DiseaseID     GeneDiseaseRelation         OmimIDs  PubMedIDs
+    hbb     HBB         hemoglobin, beta  3043    Abnormalities, Drug-Induced  MESH:D000014  inferred via Ethanol                 17676605|18926900
+    hbb     HBB         hemoglobin, beta  3043    Abnormalities, Drug-Induced  MESH:D000014  inferred via Valproic Acid           8875741
+    etc.
+
+Another example:
+
+- same input file:
+    HBB
+
+- select Column = c1, Identifier type = Genes, Data to extract = Curated chemical-gene interactions, and Interaction type = ANY
+
+- output file::
+
+    #Input  GeneSymbol  GeneName          GeneID  ChemicalName             ChemicalID  CasRN    Organism         OrganismID  Interaction                                         InteractionTypes  PubMedIDs
+    hbb     HBB         hemoglobin, beta  3043    1-nitronaphthalene       C016614     86-57-7  Macaca mulatta   9544        1-nitronaphthalene metabolite binds to HBB protein  binding           16453347
+    hbb     HBB         hemoglobin, beta  3043    2,6-diisocyanatotoluene  C026942     91-08-7  Cavia porcellus  10141       2,6-diisocyanatotoluene binds to HBB protein        binding           8728499
+    etc.
+
+-----
+
+**Reference**
+
+Davis AP, Murphy CG, Saraceni-Richards CA, Rosenstein MC, Wiegers TC, Mattingly CJ. (2009)
+Comparative Toxicogenomics Database: a knowledgebase and discovery tool for
+chemical-gene-disease networks.
+Nucleic Acids Res. 37(Database issue):D786-92. Epub 2008 Sep 9.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/disease_ontology_gene_fuzzy_selector.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/disease_ontology_gene_fuzzy_selector.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+##################################################################
+# Select genes that are associated with the diseases listed in the
+# disease ontology.
+# ontology: http://do-wiki.nubic.northwestern.edu/index.php/Main_Page
+# gene associations by FunDO: http://projects.bioinformatics.northwestern.edu/do_rif/
+# Sept 2010, switch to doLite
+# input: build outfile sourceFileLoc.loc term or partial term
+##################################################################
+
+if (!@ARGV or @ARGV < 3) { 
+   print "usage: disease_ontology_gene_selector.pl build outfile.txt sourceFile.loc [list of terms]\n";
+   exit;
+}
+
+my $build = shift @ARGV;
+my $out = shift @ARGV;
+my $in = shift @ARGV;
+my $term = shift @ARGV;
+$term =~ s/^'//; #remove quotes protecting from shell
+$term =~ s/'$//; 
+my $data;
+open(LOC, $in) or die  "Couldn't open $in, $!\n";
+while (<LOC>) {
+   chomp;
+   if (/^\s*#/) { next; }
+   my @f = split(/\t/);
+   if ($f[0] eq $build) { 
+      if ($f[1] eq 'disease associated genes') { 
+         $data = $f[2]; 
+      }
+   }
+}
+close LOC or die "Couldn't close $in, $!\n";
+if (!$data) { 
+   print "Error $build not found in $in\n";
+   exit; 
+}
+if (!defined $term) { 
+   print "No disease term entered\n";
+   exit;
+}
+
+#start with just fuzzy term matches
+open(OUT, ">", $out) or die "Couldn't open $out, $!\n";
+open(FH, $data) or die "Couldn't open data file $data, $!\n";
+$term =~ s/\s+/|/g; #use OR between words
+while (<FH>) {
+   chomp;
+   my @f = split(/\t/); #chrom start end strand geneName geneID disease
+   if ($f[6] =~ /($term)/i) { 
+      print OUT join("\t", @f), "\n";
+   }elsif ($term eq 'disease') { #print all with disease
+      print OUT join("\t", @f), "\n";
+   }
+}
+close FH or die "Couldn't close data file $data, $!\n";
+close OUT or die "Couldn't close $out, $!\n";
+
+exit;
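The fuzzy matching above turns the user's words into alternatives of a single
case-insensitive regex (s/\s+/|/g), so any one word may match inside a longer
disease term.  The same rule in a few lines of Python, with hypothetical
terms::

    import re

    term = 'lung carcinoma'                      # hypothetical user input
    pattern = re.compile('(%s)' % re.sub(r'\s+', '|', term), re.IGNORECASE)

    for disease in ['Lung cancer', 'Adenocarcinoma', 'Anemia']:
        if pattern.search(disease):
            print(disease)                       # matches the first two only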
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/freebayes.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/freebayes.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,124 @@
+<?xml version="1.0"?>
+<tool id="freebayes_wrapper" name="Call SNPS with Freebayes" version="0.5.0">
+  <requirements>
+    <requirement type="package">freebayes</requirement>
+  </requirements>
+  <description>Bayesian genetic variant detector</description>
+  <command>
+    ln -s $reference localref.fa;
+    ln -s $bamfile localbam.bam;
+    samtools faidx localref.fa;
+    samtools sort localbam.bam localbam.bam;
+    samtools index localbam.bam;
+    freebayes --fasta-reference localref.fa localbam.bam --vcf $output
+            #if $params.source_select == "full":
+                $params.showRefRepeats
+                -T $params.theta
+                -p $params.ploidy
+                $params.pooled
+                $params.mnps
+                $params.nosnps
+                -n $params.bestAlleles
+                $params.allAlleles
+                $params.duplicateReads
+                -M $params.refMapQuality
+                $params.ignoreRefAllele
+                $params.haploidReference
+                -m $params.minMapQuality
+                -q $params.minBaseQuality
+                $params.noFilters
+                -x $params.indelExclusionWindow
+                <!-- -D $readDependenceFactor -->
+                -V $params.diffusionPriorScalar
+                -W $params.postIntegBandwidth
+                -Y $params.postIntegBanddepth
+                -F $params.minAltFraction
+                -C $params.minAltCount
+                -G $params.minAltTotal
+                --min-coverage $params.minCoverage
+            #end if
+  </command>
+  <inputs>
+    <param format="fasta" name="reference" type="data" metadata_name="dbkey" label="Reference File"/>
+    <param format="bam" name="bamfile" type="data" label="Bam Alignment File"/>
+    <conditional name="params">
+      <param name="source_select" type="select" label="Freebayes Settings to Use" help="For straightforward mapping needs use Commonly Used settings. If you want full control use Full Parameter List">
+        <option value="pre_set">Commonly Used</option>
+        <option value="full">Full Parameter List</option>
+      </param>
+      <when value="pre_set"/>
+      <when value="full">
+        <param name="indels" type="select" label="Include insertion and deletion alleles in the analysis">
+          <option value="">No</option>
+          <option value="-i -N --report-all-alternates --left-align-indels">Yes</option>
+        </param>
+        <param name="theta" size="5" type="float" value="0.001" label="Theta" help="The expected mutation rate or pairwise nucleotide diversity among the population under analysis.  This serves as the single parameter to the Ewens Sampling Formula prior model"/>
+        <param name="showRefRepeats" type="select" label="Show Reference Repeats" help="Calculate and show information about reference repeats in the VCF output">
+          <option value="">No</option>
+          <option value="-_">Yes</option>
+        </param>
+        <param name="ploidy" size="5" type="integer" value="2" label="Ploidy" help="Sets the default ploidy for the analysis"/>
+        <param name="pooled" type="select" label="Pooled" help="Assume that samples result from pooled sequencing. When using this flag, set --ploidy to the number of alleles in each sample">
+          <option value="">No</option>
+          <option value="-J">Yes</option>
+        </param>
+        <param name="mnps" type="select" label="Include multi-nucleotide polymorphisms in the analysis">
+          <option value="">No</option>
+          <option value="--mnps">Yes</option>
+        </param>
+        <param name="nosnps" type="select" label="Ignore SNP alleles">
+          <option value="">No</option>
+          <option value="--no-snps">Yes</option>
+        </param>
+        <param name="duplicateReads" type="select" label="Include duplicate-marked alignments in the analysis">
+          <option value="">No</option>
+          <opt
[...]
e="10" label="Minimum Mapping Quality" help="Exclude alignments from analysis if they have a mapping quality less than Q"/>
+        <param name="minBaseQuality" size="5" type="integer" value="5" label="Minimum Base Quality" help="Exclude alleles from analysis if their supporting base quality is less than Q"/>
+        <param name="indelExclusionWindow" size="5" type="integer" value="0" label="Indel Exclusion Window" help="Ignore portions of alignments N bases from a putative insertion or deletion allele"/>
+        <param name="ignoreRefAllele" type="select" label="Ignore Reference Allele" help="By default, the reference allele is considered as another sample.  This flag excludes it from the analysis">
+          <option value="">No</option>
+          <option value="--ignore-reference-allele">Yes</option>
+        </param>
+        <param name="haploidReference" type="select" label="Haploid Reference" help="If using the reference sequence as a sample, consider it to be haploid">
+          <option value="">No</option>
+          <option value="--haploid-reference">Yes</option>
+        </param>
+        <param name="noFilters" type="select" label="No Filters" help="Do not use any input base and mapping quality filters. Equivalent to -m 0 -q 0 -R 0 -S 0">
+          <option value="">No</option>
+          <option value="--no-filters">Yes</option>
+        </param>
+        <!-- <param name="readDependenceFactor" size="5" type="float" value="0.9" label="Read Dependence Factor" help="Incorporate non-independence of reads by scaling successive observations by this factor during data likelihood calculations"/> -->
+        <param name="diffusionPriorScalar" size="5" type="float" value="1" label="Diffusion Prior Scalar" help="Downgrade the significance of P(genotype combo | allele frequency) by taking the Nth root of this component of the prior"/>
+        <param name="postIntegBandwidth" size="5" type="integer" value="2" label="Posterior Integration Bandwidth" help="Integrate all genotype combinations in our posterior space which lie no more than N steps from the most likely combination in terms of data likelihoods, taking the N steps from the most to least likely genotype for each individual"/>
+        <param name="postIntegBanddepth" size="5" type="integer" value="2" label="Posterior Integration Banddepth" help="Generate all genotype combinations for which up to this number of samples have up to their -W'th worst genotype according to data likelihood"/>
+        <param name="minAltFraction" size="5" type="integer" value="0" label="Minimum Alternative Fraction" help="Require at least this fraction of observations supporting an alternate allele within a single individual in order to evaluate the position"/>
+        <param name="minAltCount" size="5" type="integer" value="1" label="Minimum Alternative Count" help="Require at least this count of observations supporting an alternate allele within a single individual in order to evaluate the position"/>
+        <param name="minAltTotal" size="5" type="integer" value="1" label="Minimum Alternative Total" help="Require at least this count of observations supporting an alternate allele within the total population in order to use the allele in analysis"/>
+        <param name="minCoverage" size="5" type="integer" value="0" label="Minimum Coverage" help="Require at least this coverage to process a site"/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="vcf" name="output" metadata_source="reference" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="reference" ftype="fasta" value="mosaik_test_ref.fasta"/>
+      <param name="bamfile" ftype="bam" value="freebayes_in.bam"/>
+      <param name="source_select" value="pre_set"/>
+      <output name="output" file="freebayes_out.vcf" lines_diff="4"/>
+    </test>
+  </tests>
  <help>
+This tool uses Freebayes to call SNPs given a reference sequence and a BAM alignment file.
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/funDo.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/funDo.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,101 @@
+<tool id="hgv_funDo" name="FunDO" version="1.0.0">
+  <description>human genes associated with disease terms</description>
+
+  <command interpreter="perl">
+    disease_ontology_gene_fuzzy_selector.pl $build $out_file1 ${GALAXY_DATA_INDEX_DIR}/funDo.loc '$term'
+  </command>
+
+  <inputs>
+    <param name="build" type="select" label="Database build">
+      <options from_file="funDo.loc">
+        <column name="name" index="0"/>
+        <column name="value" index="0"/>
+        <filter type="unique_value" column="0"/>
+      </options>
+    </param>
+    <param name="term" size="40" type="text" label="Disease term(s)" />
+  </inputs>
+
+  <outputs>
+    <data format="interval" name="out_file1">
+    </data>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="term" value="lung"/>
+      <param name="build" value="hg18"/>
+      <output name="out_file1" file="funDo_output1.interval" />
+    </test>
+  </tests>
+
+  <help>
+**Dataset formats**
+
+There is no input dataset.  The output is in interval_ format.
+
+.. _interval: ./static/formatHelp.html#interval
+
+-----
+
+**What it does**
+
+This tool searches the disease-term field of the DOLite mappings
+used by the FunDO project and returns a set of genes that 
+are associated with terms matching the specified pattern.  (This is the
+reverse of what FunDO's own server does.)
+
+The search is case insensitive, and selects terms that contain any of
+the given words, either exactly or within a longer word (e.g. "nemia"
+selects not only "anemia", but also "hyperglycinemia", "tyrosinemias",
+and many other things).  Multiple words should be separated by spaces,
+not commas.  As a special case, entering the word "disease" returns all
+genes associated with any disease, even if that word does not actually
+appear in the term field.
+
+Website: http://django.nubic.northwestern.edu/fundo/
+
+-----
+
+**Example**
+
+Typing:: 
+
+    carcinoma
+
+results in::
+
+    1.     2.         3.         4. 5.       6.     7.
+    chr11  89507465   89565427   +  NAALAD2  10003  Adenocarcinoma
+    chr15  50189113   50192264   -  BCL2L10  10017  Carcinoma
+    chr7   150535855  150555250  -  ABCF2    10061  Clear cell carcinoma
+    chr7   150540508  150555250  -  ABCF2    10061  Clear cell carcinoma
+    chr10  134925911  134940397  -  ADAM8    101    Adenocarcinoma
+    chr10  134925911  134940397  -  ADAM8    101    Adenocarcinoma
+    etc.
+
+where the column contents are as follows::
+
+ 1. chromosome name
+ 2. start position of the gene
+ 3. end position of the gene
+ 4. strand
+ 5. gene name
+ 6. Entrez Gene ID
+ 7. disease term
+
+-----
+
+**References**
+
+Du P, Feng G, Flatow J, Song J, Holko M, Kibbe WA, Lin SM. (2009)
+From disease ontology to disease-ontology lite: statistical methods to adapt a general-purpose
+ontology for the test of gene-ontology associations.
+Bioinformatics. 25(12):i63-8.
+
+Osborne JD, Flatow J, Holko M, Lin SM, Kibbe WA, Zhu LJ, Danila MI, Feng G, Chisholm RL. (2009)
+Annotating the human genome with Disease Ontology.
+BMC Genomics. 10 Suppl 1:S6.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/gpass.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/gpass.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,79 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Basename;
+use File::Temp qw/ tempfile /;
+
+$ENV{'PATH'} .= ':' . dirname($0);
+
+#this is a wrapper for gpass that converts a linkage pedigree file to input 
+#for this program
+
+my($map, $ped, $out, $fdr) = @ARGV;
+
+if (!$map or !$ped or !$out or !$fdr) { die "missing args\n"; }
+
+my($fh, $name) = tempfile();
+#by default this file is removed when these variables go out of scope
+print $fh "map=$map ped=$ped\n";
+close $fh;  #converter will overwrite, just keep name
+
+#run converter 
+system("lped_to_geno.pl $map $ped > $name") == 0
+ or die "system lped_to_geno.pl $map $ped > $name failed\n";
+
+#system("cp $name tmp.middle");
+
+#run GPASS
+system("gpass $name -o $out -fdr $fdr 1>/dev/null") == 0
+ or die "system gpass $name -o $out -fdr $fdr, failed\n";
+
+#merge SNP data with results
+merge();
+
+exit;
+
+########################################
+
+#merge the input and output files so we have SNP data with the results
+sub merge {
+   open(FH, $out) or die "Couldn't open $out, $!\n";
+   my %res;
+   my @ind;
+   while (<FH>) {
+      chomp;
+      my $line = $_;
+      if ($line =~ /^(\d+)/) { $res{$1} = $line; push(@ind, $1); }
+      else { $res{'index'} = $line; }
+   }
+   close FH;
+   if (!@ind) { return; } #no results, leave alone
+   @ind = sort { $a <=> $b } @ind;
+   $res{'index'} =~ s/Index/#ID\tchr\tposition/;
+   #read input file to get SNP data
+   open(FH, $name) or die "Couldn't open $name, $!\n";
+   my $i = 0; #index is 0 based not counting header line
+   my $c = shift @ind;
+   while (<FH>) {
+      chomp; 
+      if (/^ID/) { next; }
+      my @f = split(/\s+/);
+      if ($i == $c) { 
+         $res{$i} =~ s/^$i/$f[0]\t$f[1]\t$f[2]/;
+         if (!@ind) { last; }
+         $c = shift @ind;
+      }
+      $i++;      
+   }
+   close FH;
+   #now reprint results with SNP data included
+   open(FH, ">", $out) or die "Couldn't write to $out, $!\n";
+   print FH $res{'index'}, "\n";
+   delete $res{'index'};
+   foreach $i (sort { $a <=> $b } keys %res) { #keep rows in index order
+      print FH $res{$i}, "\n";
+   }
+   close FH;
+}
+
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/gpass.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/gpass.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,112 @@
+<tool id="hgv_gpass" name="GPASS" version="1.0.0">
+  <description>significant single-SNP associations in case-control studies</description>
+
+  <command interpreter="perl">
+    gpass.pl ${input1.extra_files_path}/${input1.metadata.base_name}.map ${input1.extra_files_path}/${input1.metadata.base_name}.ped $output $fdr
+  </command>
+
+  <inputs>
+    <param name="input1" type="data" format="lped" label="Dataset"/>
+    <param name="fdr" type="float" value="0.05" label="FDR"/>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+
+  <requirements>
+    <requirement type="package">gpass</requirement>
+  </requirements>
+
+  <!-- we need to be able to set the seed for the random number generator
+  <tests>
+    <test>
+      <param name='input1' value='gpass_and_beam_input' ftype='lped' >
+        <metadata name='base_name' value='gpass_and_beam_input' />
+        <composite_data value='gpass_and_beam_input.ped' />
+        <composite_data value='gpass_and_beam_input.map' />
+        <edit_attributes type='name' value='gpass_and_beam_input' />
+      </param>
+      <param name="fdr" value="0.05" />
+      <output name="output" file="gpass_output.txt" />
+    </test>
+  </tests>
+  -->
+
+  <help>
+**Dataset formats**
+
+The input dataset must be in lped_ format, and the output is tabular_.
+(`Dataset missing?`_)
+
+.. _lped: ./static/formatHelp.html#lped
+.. _tabular: ./static/formatHelp.html#tab
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+GPASS (Genome-wide Poisson Approximation for Statistical Significance)
+detects significant single-SNP associations in case-control studies at a user-specified FDR.  Unlike previous methods, this tool can accurately approximate the genome-wide significance and FDR of SNP associations, while adjusting for millions of multiple comparisons, within seconds or minutes.
+
+The program has two main functionalities:
+
+1. Detect significant single-SNP associations at a user-specified false
+   discovery rate (FDR).
+
+   *Note*: a "typical" definition of FDR could be
+            FDR = E(# of false positive SNPs / # of significant SNPs)
+
+   This definition however is very inappropriate for association mapping, since SNPs are
+   highly correlated.  Our FDR is
+   defined differently to account for SNP correlations, and thus will obtain
+   a proper FDR in terms of "proportion of false positive loci".
+
+2. Approximate the significance of a list of candidate SNPs, adjusting for
+   multiple comparisons. If you have isolated a few SNPs of interest and want 
+   to know their significance in a GWAS, you can supply the GWAS data and let 
+   the program specifically test those SNPs.
+
+
+*Also note*: the number of SNPs in a study cannot be both too small and,
+at the same time, too clustered in a local region.  A few hundred SNPs,
+or tens of SNPs spread across different regions, will be fine.  The sample
+size cannot be too small either; around 100 or more individuals (cases +
+controls combined) will be fine.  Otherwise, use permutation.
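+
+To make the distinction concrete, here is a toy Python sketch (hypothetical
+calls and ground truth, purely illustrative) contrasting the SNP-level
+false-discovery proportion with the locus-level quantity described above::
+
+    significant = {"rs1": "locusA", "rs2": "locusA", "rs3": "locusB"}
+    false_snps = {"rs1", "rs2"}     # correlated false positives in one locus
+
+    snp_fdp = len(false_snps) / float(len(significant))              # 2/3
+    false_loci = set(significant[s] for s in false_snps)             # {"locusA"}
+    locus_fdp = len(false_loci) / float(len(set(significant.values())))  # 1/2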
+
+-----
+
+**Example**
+
+- input map file::
+
+    1  rs0  0  738547
+    1  rs1  0  5597094
+    1  rs2  0  9424115
+    etc.
+
+- input ped file::
+
+    1 1 0 0 1  1  G G  A A  A A  A A  A A  A G  A A  G G  G G  A A  G G  G G  G G  A A  A A  A G  A A  G G  A G  A G  A A  G G  A A  G G  A A  G G  A G  A A  G G  A A  G G  A A  A G  A G  G G  A G  G G  G G  A A  A G  A A  G G  G G  G G  G G  A G  A A  A A  A A  A A
+    1 1 0 0 1  1  G G  A G  G G  A A  A A  A G  A A  G G  G G  G G  A A  G G  A G  A G  G G  G G  A G  G G  A G  A A  G G  A G  G G  A A  G G  G G  A G  A G  G G  A G  A A  A A  G G  G G  A G  A G  G G  A G  A A  A A  A G  G G  A G  G G  A G  G G  G G  A A  G G  A G
+    etc.
+
+- output dataset, showing significant SNPs and their p-values and FDR::
+
+    #ID   chr   position   Statistics  adj-Pvalue  FDR
+    rs35  chr1  136606952  4.890849    0.991562    0.682138
+    rs36  chr1  137748344  4.931934    0.991562    0.795827
+    rs44  chr2  14423047   7.712832    0.665086    0.218776
+    etc.
+
+-----
+
+**Reference**
+
+Zhang Y, Liu JS. (2010)
+Fast and accurate significance approximation for genome-wide association studies.
+Submitted.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/hilbertvis.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/hilbertvis.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+
+input_file="$1"
+output_file="$2"
+chromInfo_file="$3"
+chrom="$4"
+score_col="$5"
+hilbert_curve_level="$6"
+summarization_mode="$7"
+chrom_col="$8"
+start_col="$9"
+end_col="${10}"
+strand_col="${11}"
+
+## use first sequence if chrom field is empty
+if [ -z "$chrom" ]; then
+    chrom=$( head -n 1 "$input_file" | cut -f$chrom_col )
+fi
+
+## get sequence length 
+if [ ! -r "$chromInfo_file" ]; then
+    echo "Unable to read chromInfo_file $chromInfo_file" 1>&2
+    exit 1
+fi
+
+chrom_len=$( awk '$1 == chrom {print $2}' chrom=$chrom $chromInfo_file )
+
+## error if we can't find the chrom_len
+if [ -z "$chrom_len" ]; then
+    echo "Can't find length for sequence \"$chrom\" in chromInfo_file $chromInfo_file" 1>&2
+    exit 1
+fi
+
+## make sure chrom_len is positive
+if [ $chrom_len -le 0 ]; then
+    echo "sequence \"$chrom\" length $chrom_len <= 0" 1>&2
+    exit 1
+fi
+
+## modify R script depending on the inclusion of a score column, strand information
+input_cols="\$${start_col}, \$${end_col}"
+col_types='beg=0, end=0, strand=""'
+
+# if strand_col == 0 (strandCol metadata is not set), assume everything's on the plus strand
+if [ $strand_col -ne 0 ]; then
+    input_cols="${input_cols}, \$${strand_col}"
+else
+    input_cols="${input_cols}, \\\"+\\\""
+fi
+
+# set plot value (either from data or use a constant value)
+if [ $score_col -eq -1 ]; then
+    value=1
+else
+    input_cols="${input_cols}, \$${score_col}"
+    col_types="${col_types}, score=0"
+    value='chunk$score[i]'
+fi
+
+R --vanilla &> /dev/null <<endOfCode
+library(HilbertVis);
+
+chrom_len <- ${chrom_len};
+chunk_size <- 1000;
+interval_count <- 0;
+invalid_strand <- 0;
+
+awk_cmd <- paste(
+  "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"}",
+    "\$${chrom_col} == \"${chrom}\"",
+      "{print ${input_cols}}' ${input_file}"
+);
+
+col_types <- list(${col_types});
+vec <- vector(mode="numeric", length=chrom_len);
+conn <- pipe(description=awk_cmd, open="r");
+
+repeat {
+  chunk <- scan(file=conn, what=col_types, sep="\t", nlines=chunk_size, quiet=TRUE);
+
+  if ((rows <- length(chunk\$beg)) == 0)
+        break;
+
+  interval_count <- interval_count + rows;
+
+  for (i in 1:rows) {
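+    # plus-strand intervals are mapped with direct 1-based coordinates;
+    # minus-strand intervals are mirrored onto the opposite end of the
+    # vector; any other strand value is counted as invalid and skipped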
+    if (chunk\$strand[i] == '+') {
+      start <- chunk\$beg[i] + 1;
+      stop <- chunk\$end[i];
+    } else if (chunk\$strand[i] == '-') {
+      start <- chrom_len - chunk\$end[i] - 1;
+      stop <- chrom_len - chunk\$beg[i];
+    } else {
+      invalid_strand <- invalid_strand + 1;
+      interval_count <- interval_count - 1;
+      next;
+    }
+    vec[start:stop] <- ${value};
+  }
+}
+
+close(conn);
+
+hMat <- hilbertImage(vec, level=$hilbert_curve_level, mode="$summarization_mode");
+pdf(file="$output_file", onefile=TRUE, width=8, height=10.5, paper="letter");
+showHilbertImage(hMat);
+dev.off();
+endOfCode
+
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/hilbertvis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/hilbertvis.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,117 @@
+<tool id="hgv_hilbertvis" name="HVIS" version="1.0.0">
+  <description>visualization of genomic data with the Hilbert curve</description>
+
+  <command interpreter="bash">
+    hilbertvis.sh $input $output $chromInfo "$chrom" $plot_value.score_col $level $mode
+    #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__)
+      1 4 5 7
+    #else
+      ${input.metadata.chromCol} ${input.metadata.startCol} ${input.metadata.endCol} ${input.metadata.strandCol}
+    #end if
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="interval,gff" label="Dataset">
+      <validator type="unspecified_build"/>
+      <validator type="metadata" check="chromCol" message="chromCol missing"/>
+      <validator type="metadata" check="startCol" message="startCol missing"/>
+      <validator type="metadata" check="endCol" message="endCol missing"/>
+    </param>
+    <param name="chrom" type="text" label="Sequence to plot" help="Name of sequence (from the chromosome column in the dataset) to plot.  If left blank, the first sequence in the dataset will be plotted."/>
+    <conditional name="plot_value">
+      <param name="choice" type="select" label="Value to plot">
+        <option value="score" selected="true">Score column from dataset</option>
+        <option value="exist">Same value for each base (existence)</option>
+      </param>
+      <when value="score">
+        <param name="score_col" type="data_column" data_ref="input" numerical="true" label="Score column"/>
+      </when>
+      <when value="exist">
+        <param name="score_col" type="hidden" value="-1"/>
+      </when>
+    </conditional>
+    <param name="level" type="integer" value="9" label="Level" help="Level of Hilbert curve.  The resulting image will have 2&lt;sup&gt;level&lt;/sup&gt; by 2&lt;sup&gt;level&lt;/sup&gt; pixels.">
+      <validator type="in_range" min="1" message="The level must be an integer &gt;= 1."/>
+    </param>
+    <param name="mode" type="select" label="Summarization mode" help="Method used to determine a value for a point in the plot which covers multiple values in the input.">
+      <option value="max">Maximal value in each bin</option>
+      <option value="min">Minimal value in each bin</option>
+      <option value="absmax" selected="true">Maximal absolute value in each bin</option>
+      <option value="mean">Mean value of each bin</option>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="pdf"/>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="hvis_mkar_chr22.tab"/>
+      <param name="chrom" value="chr22"/>
+      <param name="choice" value="score"/>
+      <param name="score_col" value="15"/>
+      <param name="level" value="9"/>
+      <param name="mode" value="absmax"/>
+      <output name="output" file="hvis_mkar_chr22.pdf" compare="sim_size" delta="7168"/>
+    </test>
+  </tests>
+
+  <help>
+**Dataset formats**
+
+The input format is interval_, and the output is an image in PDF format.
+(`Dataset missing?`_)
+
+.. _interval: ./static/formatHelp.html#interval
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+HilbertVis uses the Hilbert space-filling curve to visualize the structure of
+position-dependent data.  It maps the traditional one-dimensional line
+visualization onto a two-dimensional square.  For example, here is a diagram
+showing the path of a level-2 Hilbert curve.
+
+.. image:: ./static/images/hilbertvisDiagram.png
+
+The shade of each pixel represents the value for the corresponding bin of
+consecutive genomic positions, calculated according to the specified
+summarization mode.  The pixels are arranged so that bins that are close
+to each other on the data vector are represented by pixels that are close
+to each other in the plot.  In particular, adjacent bins are mapped to
+adjacent pixels.  Hence, a dark spot in the figure represents a peak; the
+area of the spot in the two-dimensional plot is proportional to the width
+of the peak in the one-dimensional data, and the darkness of the spot
+corresponds to the height of the peak.
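+
+The index-to-pixel mapping is the standard Hilbert-curve conversion.  As a
+rough Python sketch (independent of the HilbertVis code itself)::
+
+    def d2xy(n, d):
+        # map index d along a Hilbert curve to (x, y) in an n x n grid,
+        # where n is a power of 2
+        x = y = 0
+        s = 1
+        while s < n:
+            rx = 1 & (d // 2)
+            ry = 1 & (d ^ rx)
+            if ry == 0:                 # rotate the quadrant
+                if rx == 1:
+                    x, y = s - 1 - x, s - 1 - y
+                x, y = y, x
+            x, y = x + s * rx, y + s * ry
+            d //= 4
+            s *= 2
+        return x, y
+
+    n = 2 ** 2                          # a level-2 curve: 4 x 4 pixels
+    path = [d2xy(n, d) for d in range(n * n)]
+    # consecutive bins land on adjacent pixels (Manhattan distance 1)
+    assert all(abs(x1 - x2) + abs(y1 - y2) == 1
+               for (x1, y1), (x2, y2) in zip(path, path[1:]))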
+
+The input file is in interval format, and typically contains a column with
+scores or other numbers, such as conservation scores, SNP density, the
+coverage of aligned reads from ChIP-Seq data, etc.
+
+Website: http://www.ebi.ac.uk/huber-srv/hilbert/
+
+-----
+
+**Examples**
+
+Here are some examples from the HilbertVis homepage, using ChIP-Seq data.
+
+.. image:: ./static/images/hilbertvis1.png
+
+-----
+
+.. image:: ./static/images/hilbertvis2.png
+
+-----
+
+**Reference**
+
+Anders S. (2009)
+Visualization of genomic data with the Hilbert curve.
+Bioinformatics. 25(10):1231-5. Epub 2009 Mar 17.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/ldtools.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/ldtools.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,111 @@
+<tool id="hgv_ldtools" name="LD" version="1.0.0">
+  <description>linkage disequilibrium and tag SNPs</description>
+
+  <command interpreter="bash">
+    ldtools_wrapper.sh rsquare=$rsquare freq=$freq input=$input output=$output
+  </command>
+
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Dataset"/>
+    <param name="rsquare" label="r&lt;sup&gt;2&lt;/sup&gt; threshold" type="float" value="0.64">
+      <validator type="in_range" message="rsquare must be in range [0.00, 1.00]" min="0.00" max="1.00" />
+    </param>
+    <param name="freq" label="Minimum allele frequency threshold" type="float" value="0.00">
+      <validator type="in_range" message="freq must be in range (0.00, 0.50]" min="0.00" max="0.50" />
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="output" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="ldInput1.txt" />
+      <param name="rsquare" value="0.64" />
+      <param name="freq" value="0.00" />
+      <output name="output" file="ldOutput1.txt" />
+    </test>
+  </tests>
+
+  <help>
+**Dataset formats**
+
+The input and output datasets are tabular_.
+(`Dataset missing?`_)
+
+.. _tabular: ./static/formatHelp.html#tab
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+This tool can be used to analyze the patterns of linkage disequilibrium
+(LD) between polymorphic sites in a locus.  SNPs are grouped based on the
+threshold level of LD as measured by r\ :sup:`2` (regardless of genomic
+position), and a representative "tag SNP" is reported for each group.
+The other SNPs in the group are in LD with the tag SNP, but not necessarily
+with each other.
+
+The underlying algorithm is the same as the one used in ldSelect (Carlson
+et al. 2004).  However, this tool is implemented to be much faster and more
+efficient than ldSelect.
+
+The input is a tabular file with genotype information for each individual
+at each SNP site, in exactly four columns: site ID, sample ID, and the
+two allele nucleotides.
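+
+The grouping itself is greedy, in the spirit of ldSelect: repeatedly pick
+the SNP whose neighborhood (SNPs above the r-square threshold) covers the
+most not-yet-tagged SNPs, and report it as the tag for that group.  A
+minimal Python sketch over a toy neighborhood (not the tool's own code)::
+
+    neighbors = {
+        "rs1": {"rs2", "rs3"},          # pairs with r-square >= threshold
+        "rs2": {"rs1"},
+        "rs3": {"rs1"},
+        "rs4": set(),                   # in LD with nothing: its own tag
+    }
+    untagged = set(neighbors)
+    while untagged:
+        tag = max(untagged, key=lambda s: len(neighbors[s] & untagged))
+        group = (neighbors[tag] & untagged) | {tag}
+        print("%s\t%s" % (tag, ",".join(sorted(group - {tag}))))
+        untagged -= group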
+
+-----
+
+**Example**
+
+- input file::
+
+    rs2334386  NA20364  G  T
+    rs2334386  NA20363  G  G
+    rs2334386  NA20360  G  G
+    rs2334386  NA20359  G  G
+    rs2334386  NA20358  G  G
+    rs2334386  NA20356  G  G
+    rs2334386  NA20357  G  G
+    rs2334386  NA20350  G  G
+    rs2334386  NA20349  G  G
+    rs2334386  NA20348  G  G
+    rs2334386  NA20347  G  G
+    rs2334386  NA20346  G  G
+    rs2334386  NA20345  G  G
+    rs2334386  NA20344  G  G
+    rs2334386  NA20342  G  G
+    etc.
+
+- output file::
+
+    rs2238748  rs2793064,rs6518516,rs6518517,rs2283641,rs5993533,rs715590,rs2072123,rs2105421,rs2800954,rs1557847,rs807750,rs807753,rs5993488,rs8138035,rs2800980,rs2525079,rs5992353,rs712966,rs2525036,rs807743,rs1034727,rs807744,rs2074003
+    rs2871023  rs1210715,rs1210711,rs5748189,rs1210709,rs3788298,rs7284649,rs9306217,rs9604954,rs1210703,rs5748179,rs5746727,rs5748190,rs5993603,rs2238766,rs885981,rs2238763,rs5748165,rs9605996,rs9606001,rs5992398
+    rs7292006  rs13447232,rs5993665,rs2073733,rs1057457,rs756658,rs5992395,rs2073760,rs739369,rs9606017,rs739370,rs4493360,rs2073736
+    rs2518840  rs1061325,rs2283646,rs362148,rs1340958,rs361956,rs361991,rs2073754,rs2040771,rs2073740,rs2282684
+    rs2073775  rs10160,rs2800981,rs807751,rs5993492,rs2189490,rs5747997,rs2238743
+    rs5747263  rs12159924,rs2300688,rs4239846,rs3747025,rs3747024,rs3747023,rs2300691
+    rs433576   rs9605439,rs1109052,rs400509,rs401099,rs396012,rs410456,rs385105
+    rs2106145  rs5748131,rs2013516,rs1210684,rs1210685,rs2238767,rs2277837
+    rs2587082  rs2257083,rs2109659,rs2587081,rs5747306,rs2535704,rs2535694
+    rs807667   rs2800974,rs756651,rs762523,rs2800973,rs1018764
+    rs2518866  rs1206542,rs807467,rs807464,rs807462,rs712950
+    rs1110661  rs1110660,rs7286607,rs1110659,rs5992917,rs1110662
+    rs759076   rs5748760,rs5748755,rs5748752,rs4819925,rs933461
+    rs5746487  rs5992895,rs2034113,rs2075455,rs1867353
+    rs5748212  rs5746736,rs4141527,rs5748147,rs5748202
+    etc.
+
+-----
+
+**Reference**
+
+Carlson CS, Eberle MA, Rieder MJ, Yi Q, Kruglyak L, Nickerson DA. (2004)
+Selecting a maximally informative set of single-nucleotide polymorphisms for
+association analyses using linkage disequilibrium.
+Am J Hum Genet. 74(1):106-20. Epub 2003 Dec 15.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/ldtools_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/ldtools_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+#
+# Galaxy wrapper for Aakrosh Ratan's ldtools
+# 
+
+set -e
+
+export PATH=$PATH:$(dirname $0)
+
+## pagetag options
+input=
+rsquare=0.64
+freq=0.00
+sample=###
+
+## senatag options
+excluded=###
+required=###
+output=
+
+until [ $# -eq 0 ]
+do
+  case $1 in
+    rsquare=*)
+      rsquare=${1#rsquare=}
+      ;;
+    freq=*)
+      freq=${1#freq=}
+      ;;
+    input=*)
+      input=${1#input=}
+      ;;
+    output=*)
+      output=${1#output=}
+      ;;
+    *)
+      if [ -z "$new_args" ]; then
+        new_args=$1
+      else
+        new_args="$new_args $1"
+      fi
+      ;;
+  esac
+
+  shift
+done
+
+## run pagetag (tested inside 'if' so 'set -e' doesn't abort before we can report)
+if ! pagetag.py --rsquare $rsquare --freq $freq $input snps.txt neighborhood.txt &> /dev/null; then
+ echo "failed: pagetag.py --rsquare $rsquare --freq $freq $input snps.txt neighborhood.txt"
+ exit 1
+fi
+
+## run senatag
+if ! senatag.py neighborhood.txt snps.txt > $output 2> /dev/null; then
+ echo "failed: senatag.py neighborhood.txt snps.txt"
+ exit 1
+fi
+
+## cleanup
+rm -f snps.txt neighborhood.txt
+
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/linkToDavid.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/linkToDavid.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+###################################################
+# linkToDavid.pl
+# Generates a link to David for a list of gene IDs.
+###################################################
+
+if (!@ARGV or scalar @ARGV != 4) {
+   print "usage: linkToDavid.pl infile.tab 1basedCol idType outfile\n";
+   exit 1;
+}
+
+my $in = shift @ARGV;
+my $col = shift @ARGV;
+my $type = shift @ARGV;
+my $out = shift @ARGV;
+
+if ($col < 1) { 
+   print "ERROR the column number should be 1 based counting\n";
+   exit 1;
+}
+my @gene;
+open(FH, $in) or die "Couldn't open $in, $!\n";
+while (<FH>) {
+   chomp;
+   my @f = split(/\t/);
+   if (scalar @f < $col) {
+      print "ERROR there is no column $col in $in\n";
+      exit 1;
+   }
+   if ($f[$col-1]) { push(@gene, $f[$col-1]); }
+}
+close FH or die "Couldn't close $in, $!\n";
+
+if (scalar @gene > 400) {
+   print "ERROR David only allows 400 genes submitted via a link\n";
+   exit 1;
+}
+
+my $link = 'http://david.abcc.ncifcrf.gov/api.jsp?type=TYPE&ids=GENELIST&tool=summary';
+
+my $g = join(",", @gene);
+$link =~ s/GENELIST/$g/;
+$link =~ s/TYPE/$type/;
+#print output
+if (length $link > 2048) { 
+   print "ERROR too many genes to fit in URL, please select a smaller set\n";
+   exit 1;
+}
+open(FH, ">", $out) or die "Couldn't open $out, $!\n";
+print FH "<html><head><title>DAVID link</title></head><body>\n",
+      '<A TARGET=_BLANK HREF="', $link, '">click here to send of identifiers to DAVID</A>', "\n",
+      '</body></html>', "\n";
+close FH or die "Couldn't close $out, $!\n";
+
+exit;
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/linkToDavid.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/linkToDavid.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,110 @@
+<tool id="hgv_david" name="DAVID" version="1.0.0">
+  <description>functional annotation for a list of genes</description>
+
+  <command interpreter="perl">
+    linkToDavid.pl $input $numerical_column $type $out_file1
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Dataset" />
+    <param name="numerical_column" type="data_column" data_ref="input" label="Column with identifiers" />
+    <param name="type" label="Identifier type" type="select">
+      <option value="AFFYMETRIX_3PRIME_IVT_ID">AFFYMETRIX_3PRIME_IVT_ID</option>
+      <option value="AFFYMETRIX_EXON_GENE_ID">AFFYMETRIX_EXON_GENE_ID</option>
+      <option value="AFFYMETRIX_SNP_ID">AFFYMETRIX_SNP_ID</option>
+      <option value="AGILENT_CHIP_ID">AGILENT_CHIP_ID</option>
+      <option value="AGILENT_ID">AGILENT_ID</option>
+      <option value="AGILENT_OLIGO_ID">AGILENT_OLIGO_ID</option>
+      <option value="ENSEMBL_GENE_ID">ENSEMBL_GENE_ID</option>
+      <option value="ENSEMBL_TRANSCRIPT_ID">ENSEMBL_TRANSCRIPT_ID</option>
+      <option value="ENTREZ_GENE_ID">ENTREZ_GENE_ID</option>
+      <option value="FLYBASE_GENE_ID">FLYBASE_GENE_ID</option>
+      <option value="FLYBASE_TRANSCRIPT_ID">FLYBASE_TRANSCRIPT_ID</option>
+      <option value="GENBANK_ACCESSION">GENBANK_ACCESSION</option>
+      <option value="GENPEPT_ACCESSION">GENPEPT_ACCESSION</option>
+      <option value="GENOMIC_GI_ACCESSION">GENOMIC_GI_ACCESSION</option>
+      <option value="PROTEIN_GI_ACCESSION">PROTEIN_GI_ACCESSION</option>
+      <option value="ILLUMINA_ID">ILLUMINA_ID</option>
+      <option value="IPI_ID">IPI_ID</option>
+      <option value="MGI_ID">MGI_ID</option>
+      <option value="GENE_SYMBOL" selected="true">GENE_SYMBOL</option>
+      <option value="PFAM_ID">PFAM_ID</option>
+      <option value="PIR_ACCESSION">PIR_ACCESSION</option>
+      <option value="PIR_ID">PIR_ID</option>
+      <option value="PIR_NREF_ID">PIR_NREF_ID</option>
+      <option value="REFSEQ_GENOMIC">REFSEQ_GENOMIC</option>
+      <option value="REFSEQ_MRNA">REFSEQ_MRNA</option>
+      <option value="REFSEQ_PROTEIN">REFSEQ_PROTEIN</option>
+      <option value="REFSEQ_RNA">REFSEQ_RNA</option>
+      <option value="RGD_ID">RGD_ID</option>
+      <option value="SGD_ID">SGD_ID</option>
+      <option value="TAIR_ID">TAIR_ID</option>
+      <option value="UCSC_GENE_ID">UCSC_GENE_ID</option>
+      <option value="UNIGENE">UNIGENE</option>
+      <option value="UNIPROT_ACCESSION">UNIPROT_ACCESSION</option>
+      <option value="UNIPROT_ID">UNIPROT_ID</option>
+      <option value="UNIREF100_ID">UNIREF100_ID</option>
+      <option value="WORMBASE_GENE_ID">WORMBASE_GENE_ID</option>
+      <option value="WORMPEP_ID">WORMPEP_ID</option>
+      <option value="ZFIN_ID">ZFIN_ID</option>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="html" name="out_file1" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" ftype="tabular" value="linkToDavid.tabular" />
+      <param name="numerical_column" value="1" />
+      <param name="type" value="ENTREZ_GENE_ID" />
+      <output name="out_file1" file="linkToDavid_1.out" />
+    </test>
+  </tests>
+
+  <help>
+.. class:: infomark
+
+The list is limited to 400 IDs.
+
+-----
+
+**Dataset formats**
+
+The input dataset is in tabular_ format.  The output dataset is html_ with
+a link to the DAVID website as described below.
+(`Dataset missing?`_)
+
+.. _tabular: ./static/formatHelp.html#tab
+.. _html: ./static/formatHelp.html#html
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+This tool creates a link to the Database for Annotation,
+Visualization, and Integrated Discovery (DAVID) website at NIH,
+sending a list of IDs from the selected column of a tabular
+Galaxy dataset.  To follow the created link, click on the
+eye icon once the Galaxy tool has finished running.
+
+DAVID provides a comprehensive set of functional annotation tools
+to help investigators discover biological meaning behind large
+lists of genes.
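+
+For illustration, the link construction amounts to the following Python
+sketch (it mirrors the Perl wrapper above; the example IDs are arbitrary,
+and the 400-ID and URL-length limits still apply)::
+
+    ids = ["2099", "7157", "672"]              # IDs from the chosen column
+    url = ("http://david.abcc.ncifcrf.gov/api.jsp"
+           "?type=%s&ids=%s&tool=summary"
+           % ("ENTREZ_GENE_ID", ",".join(ids)))
+    assert len(url) <= 2048                    # same limit the wrapper enforces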
+
+-----
+
+**References**
+
+Huang DW, Sherman BT, Lempicki RA. (2009) Systematic and integrative analysis
+of large gene lists using DAVID bioinformatics resources.
+Nat Protoc. 4(1):44-57.
+
+Dennis G, Sherman BT, Hosack DA, Yang J, Gao W, Lane HC, Lempicki RA. (2003)
+DAVID: database for annotation, visualization, and integrated discovery.
+Genome Biol. 4(5):P3. Epub 2003 Apr 3.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/linkToGProfile.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/linkToGProfile.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,55 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+###################################################
+# linkToGProfile.pl
+# Generates a link to g:Profiler for a list of gene IDs.
+# g:Profiler a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200
+###################################################
+
+if (!@ARGV or scalar @ARGV != 4) {
+   print "usage: linkToGProfile.pl infile.tab 1basedCol idType outfile\n";
+   exit 1;
+}
+
+my $in = shift @ARGV;
+my $col = shift @ARGV;
+my $type = shift @ARGV;
+my $out = shift @ARGV;
+
+if ($col < 1) { 
+   print "ERROR the column number should be 1 based counting\n";
+   exit 1;
+}
+my @gene;
+open(FH, $in) or die "Couldn't open $in, $!\n";
+while (<FH>) {
+   chomp;
+   my @f = split(/\t/);
+   if (scalar @f < $col) {
+      print "ERROR there is no column $col in $in\n";
+      exit 1;
+   }
+   if ($f[$col-1]) { push(@gene, $f[$col-1]); }
+}
+close FH or die "Couldn't close $in, $!\n";
+
+my $link = 'http://biit.cs.ut.ee/gprofiler/index.cgi?organism=hsapiens&query=GENELIST&r_chr=1&r_start=start&r_end=end&analytical=1&domain_size_type=annotated&term=&significant=1&sort_by_structure=1&user_thr=1.00&output=png&prefix=TYPE';
+$link =~ s/TYPE/$type/;
+my $g = join("+", @gene);
+$link =~ s/GENELIST/$g/;
+#print output
+if (length $link > 2048) { 
+   print "ERROR too many genes to fit in URL, please select a smaller set\n";
+   exit 1;
+}
+open(FH, ">", $out) or die "Couldn't open $out, $!\n";
+print FH "<html><head><title>g:Profiler link</title></head><body>\n",
+      '<A TARGET=_BLANK HREF="', $link, '">click here to send list of identifiers to g:Profiler</A>', "\n",
+      '</body></html>', "\n";
+close FH or die "Couldn't close $out, $!\n";
+
+#also do link that prints text that could be pulled back into Galaxy?
+exit;
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/linkToGProfile.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/linkToGProfile.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,83 @@
+<tool id="hgv_linkToGProfile" name="g:Profiler" version="1.0.0">
+  <description>tools for functional profiling of gene lists</description>
+
+  <command interpreter="perl">
+    linkToGProfile.pl $input $numerical_column $type $out_file1
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Dataset" />
+    <param name="numerical_column" type="data_column" data_ref="input" numerical="True" label="Column with identifiers" />
+    <param name="type" label="Identifier type" type="select">
+      <option value="ENTREZGENE_ACC" selected="true">Entrez Gene Acc</option>
+      <option value="MIM_MORBID">OMIM Morbid Map</option>
+      <option value="MIM_GENE">OMIM Gene ID</option>
+      <option value="AFFY_HUGENE_1_0_ST_V1">AFFY_HUGENE_1_0_ST_V1</option>
+      <option value="HGNC_AUTOMATIC_GENE_ACC">HGNC_AUTOMATIC_GENE_ACC</option>
+      <option value="HGNC_MB001_ACC">HGNC_MB001_ACC</option>
+      <option value="HGNC_ACC">HGNC_ACC</option>
+      <option value="WIKIGENE_ACC">WIKIGENE_ACC</option>
+      <option value="DBASS5_ACC">DBASS5_ACC</option>
+      <option value="ILLUMINA_HUMANWG_6_V1">ILLUMINA_HUMANWG_6_V1</option>
+      <option value="AFFY_HUEX_1_0_ST_V2">AFFY_HUEX_1_0_ST_V2</option>
+      <option value="DBASS3_ACC">DBASS3_ACC</option>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="html" name="out_file1" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" ftype="tabular" value="linkToGProfile.tabular" />
+      <param name="numerical_column" value="2" />
+      <param name="type" value="ENTREZGENE_ACC" />
+      <output name="out_file1" file="linkToGProfile_1.out" />
+    </test>
+  </tests>
+
+  <help>
+**Dataset formats**
+
+The input dataset is tabular_ with a column of identifiers.
+The output dataset is html_ with a link to g:Profiler.
+(`Dataset missing?`_)
+
+.. _tabular: ./static/formatHelp.html#tab
+.. _html: ./static/formatHelp.html#html
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+This tool creates a link to the g:GOSt tool (Gene Group Functional
+Profiling), which is part of the g:Profiler site at the University
+of Tartu in Estonia.  g:GOSt retrieves the most significant Gene
+Ontology (GO) terms, KEGG and REACTOME pathways, and TRANSFAC motifs
+for a user-specified group of genes, proteins, or microarray probes.
+g:GOSt also allows analysis of ranked or ordered lists of genes,
+visual browsing of GO graph structure, interactive visualization of
+retrieved results, and many other features.  Multiple testing
+corrections are applied to extract only statistically important
+results.
+
+The g:GOSt form is pre-filled with gene, protein, or microarray probe
+IDs from the selected column of a tabular Galaxy dataset.  To follow
+the created link, click on the eye icon when the Galaxy tool has
+finished running.  Once at the g:Profiler site, scroll down to see
+the g:GOSt results.  You can also adjust the options in the g:GOSt
+form to your liking, or use the row of links between the form and
+the results to run other g:Profiler tools using the same list of IDs.
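+
+As with the DAVID tool, the heavy lifting is just URL construction.  A
+condensed Python sketch of what the Perl wrapper above does (only the key
+parameters are shown; the example IDs are arbitrary)::
+
+    ids = ["2099", "7157", "672"]
+    url = ("http://biit.cs.ut.ee/gprofiler/index.cgi"
+           "?organism=hsapiens&prefix=%s&query=%s"
+           % ("ENTREZGENE_ACC", "+".join(ids)))   # IDs are joined with '+'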
+
+-----
+
+**Reference**
+
+Reimand J, Kull M, Peterson H, Hansen J, Vilo J. (2007) g:Profiler -- a web-based
+toolset for functional profiling of gene lists from large-scale experiments.
+Nucleic Acids Res. 35(Web Server issue):W193-200. Epub 2007 May 3.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/lped_to_geno.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/lped_to_geno.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,90 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+#convert from a MAP and PED file to a genotype file 
+#assumes not many SNPs but lots of individuals
+# transposed formats are used when lots of SNPs (TPED, TFAM)
+
+if (!@ARGV or scalar @ARGV != 2) {
+   print "usage: lped_to_geno.pl infile.map infile.ped > outfile\n";
+   exit 1;
+}
+
+my $map = shift @ARGV;
+my $ped = shift @ARGV;
+
+my @snp; #array to hold SNPs from map file
+open(FH, $map) or die "Couldn't open $map, $!\n";
+while (<FH>) {
+   chomp; 
+   my @f = split(/\s+/); #3 or 4 columns
+   #chrom ID [distance|morgans] position
+   if (!exists $f[3]) { $f[3] = $f[2]; } #only 3 columns
+   #have to leave in so know which to skip later
+   #if ($f[3] < 0) { next; } #way of excluding SNPs
+   #if ($f[0] eq '0') { next; } #unplaced SNP
+   $f[0] = "chr$f[0]";
+   push(@snp, "$f[0]:$f[3]:$f[1]");
+}
+close FH or die "Couldn't finish $map, $!\n";
+
+#rows are individuals, columns are SNPs (7 & up)
+#need to print row per SNP
+my @allele; #alleles to go with @snp
+my @pheno;  #marker for phenotype
+open(FH, $ped) or die "Couldn't open $ped, $!\n";
+while (<FH>) {
+   chomp;
+   my @f = split(/\s+/);
+   if (!defined $f[5]) { die "ERROR undefined phenotype $f[0] $f[1] $f[2] $f[3] $f[4]\n"; }
+   push(@pheno, $f[5]);
+   my $j = 0;
+   for(my $i = 6; $i< $#f; $i+=2) {
+      if (!$allele[$j]) { $allele[$j] = ''; }
+      #can be ACTG or 1234 (for haploview etc) or 0 for missing
+      if ($f[$i] eq '1') { $f[$i] = 'A'; }
+      elsif ($f[$i] eq '2') { $f[$i] = 'C'; }
+      elsif ($f[$i] eq '3') { $f[$i] = 'G'; }
+      elsif ($f[$i] eq '4') { $f[$i] = 'T'; }
+      if ($f[$i+1] eq '1') { $f[$i+1] = 'A'; }
+      elsif ($f[$i+1] eq '2') { $f[$i+1] = 'C'; }
+      elsif ($f[$i+1] eq '3') { $f[$i+1] = 'G'; }
+      elsif ($f[$i+1] eq '4') { $f[$i+1] = 'T'; }
+      $f[$i] = uc($f[$i]);
+      $f[$i+1] = uc($f[$i+1]);
+      $allele[$j] .= " $f[$i]$f[$i+1]"; 
+      $j++;
+   }
+}
+close FH or die "Couldn't close $ped, $!\n";
+
+print "ID Chr Pos";
+foreach (@pheno) { if ($_ > 0) { print " ", $_ - 1; }} #go from 1/2 to 0/1
+print "\n";
+for(my $i =0; $i <= $#snp; $i++) { #foreach snp
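+   #use the first allele seen at this SNP as the reference when coding genotypes as 2/1/0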
+   $allele[$i] =~ /(\w)/;
+   my $nt = $1;
+   my $j = 0;
+   my @t = split(/:/, $snp[$i]);
+   if ($t[0] eq 'chr0' or $t[1] < 0) { next; } #skip this SNP
+   if ($t[0] eq 'chrX') { $t[0] = 'chr23'; }
+   elsif ($t[0] eq 'chrY') { $t[0] = 'chr24'; }
+   elsif ($t[0] eq 'chrXY') { $t[0] = 'chr23'; }
+   elsif ($t[0] eq 'chrMT') { $t[0] = 'chr25'; }
+   print "$t[2] $t[0] $t[1]";
+   $allele[$i] =~ s/^\s+//;
+   foreach my $p (split(/ +/, $allele[$i])) {
+      if ($pheno[$j] > 0) { #pheno 0 or -9 skip
+          #change AA BB AB to 2 0 1
+          if ($p eq "$nt$nt") { print " 2"; }
+          elsif ($p =~ /$nt/) { print " 1"; }
+          else { print " 0"; }
+      }
+      $j++;
+   }
+   print "\n";
+}
+
+exit;
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/lps.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/lps.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,304 @@
+<tool id="hgv_lps" name="LPS" version="1.0.0">
+  <description>LASSO-Patternsearch algorithm</description>
+
+  <command interpreter="bash">
+    lps_tool_wrapper.sh $lambda_fac $input_file $label_column $output_file $log_file
+    Initialization 0
+    #if $advanced.options == "true":
+      Sample $advanced.sample
+      Verbosity $advanced.verbosity
+      Standardize $advanced.standardize
+      initialLambda $advanced.initialLambda
+      #if $advanced.continuation.continuation == "1":
+        Continuation $advanced.continuation.continuation
+        continuationSteps $advanced.continuation.continuationSteps
+        accurateIntermediates $advanced.continuation.accurateIntermediates
+      #end if
+      printFreq $advanced.printFreq
+      #if $advanced.newton.newton == "1":
+        Newton $advanced.newton.newton
+        NewtonThreshold $advanced.newton.newtonThreshold
+      #end if
+      HessianSampleFraction $advanced.hessianSampleFraction
+      BB 0
+      Monotone 0
+      FullGradient $advanced.fullGradient
+      GradientFraction $advanced.gradientFraction
+      InitialAlpha $advanced.initialAlpha
+      AlphaIncrease $advanced.alphaIncrease
+      AlphaDecrease $advanced.alphaDecrease
+      AlphaMax $advanced.alphaMax
+      c1 $advanced.c1
+      MaxIter $advanced.maxIter
+      StopTol $advanced.stopTol
+      IntermediateTol $advanced.intermediateTol
+      FinalOnly $advanced.finalOnly
+    #end if
+  </command>
+
+  <inputs>
+    <param name="input_file" type="data" format="tabular" label="Dataset"/>
+    <param name="label_column" type="data_column" data_ref="input_file" numerical="true" label="Label column" help="Column containing outcome labels: +1 or -1."/>
+    <param name="lambda_fac" label="Lambda_fac" type="float" value="0.03" help="Target value of the regularization parameter, expressed as a fraction of the calculated lambda_max.">
+      <validator type="in_range" message="0.00 &lt; lambda_fac &lt;= 1.00" min="0.00" max="1.00"/>
+    </param>
+    <conditional name="advanced">
+      <param name="options" type="select" label="Advanced Options">
+        <option value="false" selected="true">Hide advanced options</option>
+        <option value="true">Show advanced options</option>
+      </param>
+      <when value="false">
+        <!-- no options -->
+      </when>
+      <when value="true">
+        <!-- HARDCODED: 'Sample' we don't support passing an array -->
+        <param name="sample" type="float" value="1.0" label="Sample fraction" help="Sample this fraction of the data set.">
+          <validator type="in_range" message="0.0 &lt;= sample &lt;= 1.0" min="0.0" max="1.0"/>
+        </param>
+        <!-- HARDCODED: 'Initialization' = 0 :: Initialize at beta=0 -->
+        <param name="verbosity" type="select" format="integer" label="Verbosity">
+          <option value="0" selected="true">Little output</option>
+          <option value="1">More output</option>
+          <option value="2">Still more output</option>
+        </param>
+        <param name="standardize" type="select" format="integer" label="Standardize" help="Scales and shifts each column so that it has mean zero and variance 1.">
+          <option value="0" selected="true">Don't standardize</option>
+          <option value="1">Standardize</option>
+        </param>
+        <param name="initialLambda" type="float" value="0.8" label="Initial lambda" help="First value of lambda to be used in the continuation scheme, expressed as a fraction of lambda_max.">
+          <validator type="in_range" message="0.0 &lt; initialLambda &lt; 1.0" min="0.0" max="1.0"/>
+        </param>
+        <conditional name="continuation">
+          <param name="continuation" type="select" format="integer" label="Continuation" help="Use continuation strategy to start with a larger value of lambda, decreasing it successively to lambda_fac.">
+            <option value="0" selected="true">Don't use continuation</option>
+
[...]
+e property of interest P.
+In simple terms, LPS calculates a weight for each of the other attributes
+in your dataset.  This weight indicates how "relevant" that attribute
+is for predicting whether or not a given subject has property P.
+The L1-regularization causes most of these weights to be equal to zero,
+which means LPS will find a "small" subset of the remaining N-1 attributes
+in your dataset that can be used to predict P.
+
+In other words, LPS can be used for feature selection.
+
+The input dataset is tabular, and must contain a label column which
+indicates whether or not a given row has property P.  In the current
+version of this tool, P must be encoded using +1 and -1.  The Lambda_fac
+parameter ranges from 0 to 1, and controls how sparse the weight
+vector will be.  At the low end, when Lambda_fac = 0, there will be
+no regularization.  At the high end, when Lambda_fac = 1, there will be
+"too much" regularization, and all of the weights will equal zero.
+
+The LPS tool creates two output datasets.  The first, called the results
+file, is a tabular dataset containing one column of weights for each
+value of the regularization parameter lambda that was tried.  The weight
+columns are in order from left to right by decreasing values of lambda.
+The first N-1 rows in each column are the weights for the N-1 attributes
+in your input dataset.  The final row is a constant, the intercept.
+
+Let **x** be a row from your input dataset and let **b** be a column
+from the results file.  To compute the probability that row **x** has
+a label value of +1:
+
+  Probability(row **x** has label value = +1) = 1 / [1 + exp{**x** \* **b**\[1..N-1\] + **b**\[N\]}]
+
+where **x** \* **b**\[1..N-1\] represents matrix multiplication.
+
+The second output dataset, called the log file, is a text file which
+contains additional data about the fitted L1-regularized logistic
+regression model.  These data include the number of features, the
+computed value of lambda_max, the actual values of lambda used, the
+optimal values of the log-likelihood and regularized log-likelihood
+functions, the number of non-zeros, and the number of iterations.
+
+Website: http://pages.cs.wisc.edu/~swright/LPS/
+
+-----
+
+**Example**
+
+- input file::
+
+    +1   1   0   0   0   0   1   0   1   1   ...
+    +1   1   1   1   0   0   1   0   1   1   ...
+    +1   1   0   1   0   1   0   1   0   1   ...
+    etc.
+
+- output results file::
+
+    0
+    0
+    0
+    0
+    0.025541
+    etc.
+
+- output log file::
+
+    Data set has 100 vectors with 50 features.
+      calculateLambdaMax: n=50, m=100, m+=50, m-=50
+      computed value of lambda_max: 5.0000e-01
+
+    lambda=2.96e-02 solution:
+      optimal log-likelihood function value: 6.46e-01
+      optimal *regularized* log-likelihood function value: 6.79e-01
+      number of nonzeros at the optimum:      5
+      number of iterations required:     43
+    etc.
+
+-----
+
+**References**
+
+Koh K, Kim S-J, Boyd S. (2007)
+An interior-point method for large-scale l1-regularized logistic regression.
+Journal of Machine Learning Research. 8:1519-1555.
+
+Shi W, Wahba G, Wright S, Lee K, Klein R, Klein B. (2008)
+LASSO-Patternsearch algorithm with application to ophthalmology and genomic data.
+Stat Interface. 1(1):137-153.
+
+<!--
+Wright S, Novak R, Figueiredo M. (2009)
+Sparse reconstruction via separable approximation.
+IEEE Transactions on Signal Processing. 57:2479-2403.
+
+Shi J, Yin W, Osher S, Sajda P. (2010)
+A fast hybrid algorithm for large scale l1-regularized logistic regression.
+Journal of Machine Learning Research. 11:713-741.
+
+Byrd R, Chin G, Neveitt W, Nocedal J. (2010)
+On the use of stochastic Hessian information in unconstrained optimization.
+Technical Report. Northwestern University. June 16, 2010.
+
+Wright S. (2010)
+Accelerated block-coordinate relaxation for regularized optimization.
+Technical Report. University of Wisconsin. August 10, 2010.
+-->
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/lps_tool_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/lps_tool_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# script for execution of deployed applications
+#
+# Sets up the MCR environment for the current $ARCH and executes 
+# the specified command.
+#
+
+export PATH=$PATH:$(dirname $0)
+
+MCRROOT=${MCRROOT:-/galaxy/software/linux2.6-x86_64/bin/MCR-7.11/v711}
+MWE_ARCH=glnxa64
+
+if [ "$MWE_ARCH" = "sol64" ] ; then
+  LD_LIBRARY_PATH=.:/usr/lib/lwp:${MCRROOT}/runtime/glnxa64
+else
+  LD_LIBRARY_PATH=.:${MCRROOT}/runtime/glnxa64
+fi
+
+LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/bin/glnxa64
+LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/sys/os/glnxa64
+
+if [ "$MWE_ARCH" = "maci" -o "$MWE_ARCH" = "maci64" ]; then
+  DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/System/Library/Frameworks/JavaVM.framework/JavaVM:/System/Library/Frameworks/JavaVM.framework/Libraries
+else
+  MCRJRE=${MCRROOT}/sys/java/jre/glnxa64/jre/lib/amd64
+  LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/native_threads
+  LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/server
+  LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/client
+  LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}
+fi
+
+XAPPLRESDIR=${MCRROOT}/X11/app-defaults
+
+export LD_LIBRARY_PATH XAPPLRESDIR
+
+lps_tool "$@"
+
+exit 0
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/mergeSnps.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/mergeSnps.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,57 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+#this merges the significance output with the SNPs so users get more than an index
+
+my($out, $snp) = @ARGV;
+
+if (!$out or !$snp) { die "missing args\n"; }
+
+#merge SNP data with results
+merge();
+
+exit;
+
+########################################
+
+#merge the input and output files so we have SNP data with the results
+sub merge {
+   open(FH, $out) or die "Couldn't open $out, $!\n";
+   my %res;
+   my @ind;
+   while (<FH>) {
+      chomp;
+      my $line = $_;
+      #0:      10 score= 14.224153 , df= 2 , p= 0.040760 , N=50
+      if ($line =~ /^(\d+):\s+(.*)/) { $res{$1} = $2; push(@ind, $1); }
+   }
+   close FH;
+   if (!@ind) { return; } #no results, leave alone
+   @ind = sort { $a <=> $b } @ind;
+   #read input file to get SNP data
+   open(FH, $snp) or die "Couldn't open $snp, $!\n";
+   my $i = 0; #0 based, not counting ID line
+   my $c = shift @ind;
+   while (<FH>) {
+      chomp; 
+      if (/^ID/) { next; }
+      my @f = split(/\s+/);
+      if ($i == $c) { 
+         $res{$i} = "$f[0]\t$f[1]\t$f[2]\t$res{$i}";
+         if (!@ind) { last; }
+         $c = shift @ind;
+      }
+      $i++;      
+   }
+   close FH;
+   #now reprint results with SNP data included
+   open(FH, ">", $out) or die "Couldn't write to $out, $!\n";
+   print FH "ID\tchr\tposition\tresults\n";
+   foreach $i (sort { $a <=> $b } keys %res) { #keep rows in index order
+      print FH $res{$i}, "\n";
+   }
+   close FH;
+}
+
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/pagetag.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/pagetag.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,297 @@
+#!/usr/bin/env python
+
+"""
+This accepts as input a file of the following format:
+
+    Site   Sample   Allele1   Allele2
+
+for example:
+
+    000834   D001    G       G
+    000834   D002    G       G
+    000834   D003    G       G
+    000834   D004    G       G
+    000834   D005    N       N
+    000834   E001    G       G
+    000834   E002    G       G
+    000834   E003    G       G
+    000834   E004    G       G
+    000834   E005    G       G
+    000963   D001    T       T
+    000963   D002    T       T
+    000963   D003    T       T
+    000963   D004    T       T
+    000963   D005    N       N
+    000963   E001    T       T
+    000963   E002    N       N
+    000963   E003    G       T
+    000963   E004    G       G
+    000963   E005    G       T
+
+and a rsquare threshold and outputs two files:
+
+a) a file of input snps (one on each line). A SNP is identified by the "Site"
+column in the input file
+
+b) a file where each line has the following:
+    SNP     list
+where SNP is one of the SNPs and the "list" is a comma separated list of SNPs
+that exceed the rsquare threshold with the first SNP.
+"""
+
+from sys import argv, stderr, exit
+from getopt import getopt, GetoptError
+
+__author__ = "Aakrosh Ratan"
+__email__  = "ratan@bx.psu.edu"
+
+# do we want the debug information to be printed?
+debug_flag = False
+
+# denote different combos of alleles in code
+HOMC  = str(1)
+HOMR  = str(2)
+HETE  = str(3)
+OTHER = str(4)
+
+indexcalculator = {(HOMC,HOMC) : 0,
+                   (HOMC,HOMR) : 1,
+                   (HOMC,HETE) : 2,
+                   (HOMR,HOMC) : 3,
+                   (HOMR,HOMR) : 4,
+                   (HOMR,HETE) : 5,
+                   (HETE,HOMC) : 6,
+                   (HETE,HOMR) : 7,
+                   (HETE,HETE) : 8}
+
+def read_inputfile(filename, samples):
+    input = {}
+
+    file = open(filename, "r")
+
+    for line in file:
+        position,sample,allele1,allele2 = line.split()
+
+        # if the user specified a list of samples, then only use those samples
+        if samples != None and sample not in samples: continue
+
+        if position in input:
+            v = input[position]
+            v[sample] = (allele1,allele2)
+        else:
+            v = {sample : (allele1, allele2)}
+            input[position] = v
+
+    file.close()
+    return input
+
+def annotate_locus(input, minorallelefrequency, snpsfile):
+    locus = {}
+    for k,v in input.items():
+        genotypes = [x for x in v.values()]
+        alleles   = [y for x in genotypes for y in x]
+        alleleset = list(set(alleles))
+        alleleset = list(set(alleles) - set(["N","X"]))
+
+        if len(alleleset) == 2:
+            genotypevec = ""
+            num1 = len([x for x in alleles if x == alleleset[0]])
+            num2 = len([x for x in alleles if x == alleleset[1]])
+
+            if num1 > num2:
+                major = alleleset[0]
+                minor = alleleset[1]
+                minorfreq = (num2 * 1.0)/(num1 + num2)
+            else:
+                major = alleleset[1]
+                minor = alleleset[0]
+                minorfreq = (num1 * 1.0)/(num1 + num2)
+
+            if minorfreq < minorallelefrequency: continue
+
+            for gen in genotypes:
+                if gen == (major,major):
+                    genotypevec += HOMC
+                elif gen == (minor,minor):
+                    genotypevec += HOMR
+                elif gen == (major, minor) or gen == (minor, major):
+                    genotypevec += HETE
+                else:
+                    genotypevec += OTHER
+
+            locus[k] = genotypevec,minorfreq
+        elif len(alleleset) > 2:
+            print >> snpsfile, k
+    return locus
+
+def calculateLD(loci, rsqthreshold):
+    snps = list(loci)
+    rsquare = {}
+
+    for index,loc1 in enumerate(snps):
+        for loc2 in snps[index + 1:]:
[...]
+                if rsq >= rsqthreshold:
+                    rsquare["%s %s" % (loc1,loc2)] = rsq
+
+    return rsquare
+
+def main(inputfile, snpsfile, neigborhoodfile, \
+         rsquare, minorallelefrequency, samples):
+    # read the input file
+    input = read_inputfile(inputfile, samples)
+    print >> stderr, "Read %d locations" % len(input)
+
+    # open the snpsfile to print
+    file = open(snpsfile, "w")
+
+    # annotate the inputs, remove the abnormal loci (which do not have 2 alleles
+    # and add the major and minor allele to each loci
+    loci = annotate_locus(input, minorallelefrequency, file)
+    print >> stderr, "Read %d interesting locations" % len(loci)
+
+    # print all the interesting loci as candidate snps
+    for k in loci.keys(): print >> file, k
+    file.close()
+    print >> stderr, "Finished creating the snpsfile"
+
+    # calculate the LD values and store it if it exceeds the threshold
+    lds = calculateLD(loci, rsquare)
+    print >> stderr, "Calculated all the LD values"
+
+    # create a list of SNPs
+    snps   = {}
+    ldvals = {}
+    for k,v in lds.items():
+        s1,s2 = k.split()
+        if s1 in snps: snps[s1].append(s2)
+        else         : snps[s1] = [s2]
+        if s2 in snps: snps[s2].append(s1)
+        else         : snps[s2] = [s1]
+
+        if s1 in ldvals: ldvals[s1].append(str(v))
+        else           : ldvals[s1] = [str(v)]
+        if s2 in ldvals: ldvals[s2].append(str(v))
+        else           : ldvals[s2] = [str(v)]
+
+    # print the snps to the output file
+    file = open(neigborhoodfile, "w")
+
+    for k,v in snps.items():
+        ldv = ldvals[k]
+        if debug_flag == True:
+            print >> file, "%s\t%s\t%s" % (k, ",".join(v), ",".join(ldv))
+        else:
+            print >> file, "%s\t%s" % (k, ",".join(v))
+
+    file.close()
+
+
+def read_list(filename):
+    file = open(filename, "r")
+    list = {}
+
+    for line in file:
+        list[line.strip()] = 1
+
+    file.close()
+    return list
+
+def usage():
+    f = stderr
+    print >> f, "usage:"
+    print >> f, "pagetag [options] input.txt snps.txt neighborhood.txt"
+    print >> f, "where input.txt is the prettybase file"
+    print >> f, "where snps.txt is the first output file with the snps"
+    print >> f, "where neighborhood.txt is the output neighborhood file"
+    print >> f, "where the options are:"
+    print >> f, "-h,--help : print usage and quit"
+    print >> f, "-d,--debug: print debug information"
+    print >> f, "-r,--rsquare: the rsquare threshold (default : 0.64)"
+    print >> f, "-f,--freq : the minimum MAF required (default: 0.0)"
+    print >> f, "-s,--sample : a list of samples to be clustered"
+
+if __name__ == "__main__":
+    try:
+        opts, args = getopt(argv[1:], "hds:r:f:",\
+                     ["help", "debug", "rsquare=","freq=", "sample="])
+    except GetoptError, err:
+        print str(err)
+        usage()
+        exit(2)
+
+    rsquare = 0.64
+    minorallelefrequency = 0.0
+    samples = None
+
+    for o, a in opts:
+        if o in ("-h", "--help"):
+            usage()
+            exit()
+        elif o in ("-d", "--debug"):
+            debug_flag = True
+        elif o in ("-r", "--rsquare"):
+            rsquare = float(a)
+        elif o in ("-f", "--freq"):
+            minorallelefrequency = float(a)
+        elif o in ("-s", "--sample"):
+            samples = read_list(a)
+        else:
+            assert False, "unhandled option"
+
+    if rsquare < 0.00 or rsquare > 1.00:
+        print >> stderr, "input value of rsquare should be in [0.00, 1.00]"
+        exit(3)
+
+    if minorallelefrequency < 0.0 or minorallelefrequency > 0.5:
+        print >> stderr, "input value of MAF should be (0.00,0.50]"
+        exit(4)
+
+    if len(args) != 3:
+        usage()
+        exit(5)
+
+    main(args[0], args[1], args[2], rsquare, minorallelefrequency, samples)
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/pass.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/pass.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,126 @@
+<tool id="hgv_pass" name="PASS" version="1.0.0">
+  <description>significant transcription factor binding sites from ChIP data</description>
+
+  <command interpreter="bash">
+    pass_wrapper.sh "$input" "$min_window" "$max_window" "$false_num" "$output"
+  </command>
+
+  <inputs>
+    <param format="gff" name="input" type="data" label="Dataset"/>
+    <param name="min_window" label="Smallest window size (by # of probes)" type="integer" value="2" />
+    <param name="max_window" label="Largest window size (by # of probes)" type="integer" value="6" />
+    <param name="false_num" label="Expected total number of false positive intervals to be called" type="float" value="5.0" help="N.B.: this is a &lt;em&gt;count&lt;/em&gt;, not a rate." />
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="output" />
+  </outputs>
+
+  <requirements>
+    <requirement type="package">pass</requirement>
+    <requirement type="binary">sed</requirement>
+  </requirements>
+
+  <!-- we need to be able to set the seed for the random number generator
+  <tests>
+    <test>
+      <param name="input" ftype="gff" value="pass_input.gff"/>
+      <param name="min_window" value="2"/>
+      <param name="max_window" value="6"/>
+      <param name="false_num" value="5"/>
+      <output name="output" file="pass_output.tab"/>
+    </test>
+  </tests>
+  -->
+
+  <help>
+**Dataset formats**
+
+The input is in GFF_ format, and the output is tabular_.
+(`Dataset missing?`_)
+
+.. _GFF: ./static/formatHelp.html#gff
+.. _tabular: ./static/formatHelp.html#tab
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+PASS (Poisson Approximation for Statistical Significance) detects
+significant transcription factor binding sites in the genome from
+ChIP data.  This is probably the only peak-calling method that
+accurately controls the false-positive rate and FDR in ChIP data,
+which is important given the huge discrepancy in results obtained
+from different peak-calling algorithms.  At the same time, this
+method achieves a similar or better power than previous methods.
+
+<!-- we don't have wrapper support for the "prior" file yet
+Another unique feature of this method is that it allows varying
+thresholds to be used for peak calling at different genomic
+locations.  For example, if a position lies in an open chromatin
+region, is depleted of nucleosome positioning, or a co-binding
+protein has been detected within the neighborhood, then the position
+is more likely to be bound by the target protein of interest, and
+hence a lower threshold will be used to call significant peaks.
+As a result, weak but real binding sites can be detected.
+-->
+
+-----
+
+**Hints**
+
+- ChIP-Seq data:
+
+  If the data is from ChIP-Seq, you need to convert the ChIP-Seq values
+  into z-scores before using this program.  It is also recommended that
+  you group read counts within a neighborhood together, e.g. in tiled
+  windows of 30bp.  In this way, the ChIP-Seq data will resemble
+  ChIP-chip data in format.
+
+- Choosing window size options:
+
+  The window size is related to the probe tiling density.  For example,
+  if the probes are tiled at every 100bp, then setting the smallest
+  window = 2 and largest window = 6 is appropriate, because the DNA
+  fragment size is around 300-500bp.
+
+-----
+
+**Example**
+
+- input file::
+
+    chr7  Nimblegen  ID  40307603  40307652  1.668944     .  .  .
+    chr7  Nimblegen  ID  40307703  40307752  0.8041307    .  .  .
+    chr7  Nimblegen  ID  40307808  40307865  -1.089931    .  .  .
+    chr7  Nimblegen  ID  40307920  40307969  1.055044     .  .  .
+    chr7  Nimblegen  ID  40308005  40308068  2.447853     .  .  .
+    chr7  Nimblegen  ID  40308125  40308174  0.1638694    .  .  .
+    chr7  Nimblegen  ID  40308223  40308275  -0.04796628  .  .  .
+    chr7  Nimblegen  ID  40308318  40308367  0.9335709    .  .  .
+    chr7  Nimblegen  ID  40308526  40308584  0.5143972    .  .  .
+    chr7  Nimblegen  ID  40308611  40308660  -1.089931    .  .  .
+    etc.
+
+  In GFF, a value of dot '.' is used to mean "not applicable".
+
+- output file::
+
+    ID  Chr   Start     End       WinSz  PeakValue  # of FPs  FDR
+    1   chr7  40310931  40311266  4      1.663446   0.248817  0.248817
+
+-----
+
+**References**
+
+Zhang Y. (2008)
+Poisson approximation for significance in genome-wide ChIP-chip tiling arrays.
+Bioinformatics. 24(24):2825-31. Epub 2008 Oct 25.
+
+Chen KB, Zhang Y. (2010)
+A varying threshold method for ChIP peak calling using multiple sources of information.
+Submitted.
+
+  </help>
+</tool>
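The ChIP-Seq hint above asks for read counts grouped into windows (e.g. 30bp)
and converted to z-scores before running PASS.  A minimal pre-processing
sketch in Python; the function name and binning scheme are assumptions for
illustration, not part of the PASS wrapper::

    # Bin per-base read counts into fixed-width windows and z-score the
    # window sums, so ChIP-Seq input resembles ChIP-chip intensities.
    def window_zscores(counts, width=30):
        sums = [sum(counts[i:i + width]) for i in range(0, len(counts), width)]
        n = float(len(sums))
        mean = sum(sums) / n
        sd = (sum((s - mean) ** 2 for s in sums) / n) ** 0.5
        if sd == 0:
            return [0.0] * len(sums)   # flat coverage: every z-score is 0
        return [(s - mean) / sd for s in sums]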
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/pass_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/pass_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+export PATH=$PATH:$(dirname $0)
+
+input=$1
+min_window=$2
+max_window=$3
+false_num=$4
+output=$5
+
+pass "$input" "$min_window" "$max_window" "$false_num" "$output" >/dev/null
+sed -i -e 's/\t\t*/\t/g' "$output"
+
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/senatag.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/senatag.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,243 @@
+#!/usr/bin/env python
+
+"""
+This tool takes the following files as input:
+a) input_snp  : A file with identifiers for SNPs (one on each line)
+b) ldfile     : A file where each line has the following
+                snp     list
+                where "snp" is an identifier for one SNP and "list" is a
+                comma-separated list of all the other SNPs that are in LD with
+                it (as per some threshold of rsquare)
+
+The output is a set of tag SNPs for the given datasets
+
+The algorithm is as follows:
+
+a) Construct a graph for each population, where each node is a SNP and two nodes
+are connected using an edge iff they are in LD.
+b) For each SNP, count the number of connected nodes that have not yet
+been visited.
+c) Find the SNP with the highest count and assign it to be a tag SNP.
+d) Mark that SNP and all the SNPs connected to it as "visited". This should be
+done for each population.
+e) Repeat steps b-d until all SNPs in all populations have been visited.
+"""
+
+from sys import argv, stderr, exit
+from getopt import getopt, GetoptError
+
+import os
+import heapq
+
+__author__ = "Aakrosh Ratan"
+__email__  = "ratan@bx.psu.edu"
+
+# do we want the debug information to be printed?
+debug_flag = False
+
+class node:
+    def __init__(self, name):
+        self.name    = name
+        self.edges   = []
+        self.visited = False
+
+    # return the number of nodes connected to this node, that have yet to be
+    # visited
+    def num_not_visited(self):
+        num = 0
+        for n in self.edges:
+            if n.visited == False: num += 1
+        return num 
+
+    def __cmp__(self, other):
+        return other.num_not_visited() - self.num_not_visited()
+
+    def __str__(self):  
+        return self.name
+
+class graph:
+    def __init__(self):
+        self.nodes = {}
+
+    def __str__(self):
+        string = ""
+        for n1 in self.nodes.values():
+            n2s = [x.name for x in n1.edges]
+            string += "%s %s\n" % (n1.name, ",".join(n2s))
+        return string[:-1]
+
+    def add_node(self, n):
+        self.nodes[n.name] = n
+
+    def add_edges(self, n1, n2):
+        assert n1.name in self.nodes
+        assert n2.name in self.nodes
+        n1.edges.append(n2)
+        n2.edges.append(n1)
+
+    def check_graph(self):
+        for n in self.nodes.values():
+            ms = [x for x in n.edges]
+            for m in ms:
+                if n not in m.edges:
+                    print >> stderr, "check : %s - %s" % (n,m)
+
+def construct_graph(ldfile, snpfile):
+    # construct the initial graph. add all the SNPs as nodes
+    g = graph()
+    file = open(snpfile, "r")
+
+    for line in file:
+        # ignore empty lines and add the remainder to the graph
+        if len(line.strip()) == 0: continue
+        n = node(line.strip())           
+        g.add_node(n)
+
+    file.close()
+    print >> stderr, "Added %d nodes to a graph" % len(g.nodes)
+  
+    # now add all the edges
+    file   = open(ldfile, "r")
+
+    for line in file:
+        tokens = line.split()
+        assert len(tokens) == 2
+
+        # if this node is in the graph, then we need to construct an edge from
+        # this node to all the nodes which are highly related to it
+        if tokens[0] in g.nodes:
+            n1  = g.nodes[tokens[0]]
+            n2s = [g.nodes[x] for x in tokens[1].split(",")]
+
+            for n2 in n2s:
+                g.add_edges(n1, n2)
+
+    file.close()
+    print >> stderr, "Added all edges to the graph"
+
+    return g
+     
+def check_output(g, tagsnps):
+    # find all the nodes in the graph 
+    allsnps = [x.name for x in g.nodes.values()]
+
+    # find the nodes that are covered by our tagsnps
+    mysnps = [x.name for x in tagsnps]
+
+    for n in tagsnps:
+        for m in n.edges:
+                mysnps.append(m.name)
+
+    mysnps = list(set(mysnps))
+
+    if set(allsnps) != set(mysnps):
+        diff = list(set(allsnps) - set(mysnps))
+        print >> stderr, "%s are not covered" % ",".join(diff)
+
+def main(ldfile, snpsfile, required, excluded):
+    # construct the graph
+    g = construct_graph(ldfile, snpsfile)
+    if debug_flag == True: g.check_graph()
+
+    tagsnps   = []
+    neighbors = {}
+
+    # take care of the SNPs that are required to be TagSNPs
+    for s in required:
+        t = g.nodes[s]
+
+        t.visited = True
+        ns = []
+
+        for n in t.edges:
+            if n.visited == False: ns.append(n.name)
+            n.visited = True 
+        
+        tagsnps.append(t)
+        neighbors[t.name] = list(set(ns))
+
+    # find the tag SNPs for this graph
+    data = [x for x in g.nodes.values()]
+    heapq.heapify(data)
+
+    while data:
+        s = heapq.heappop(data)
+
+        if s.visited == True or s.name in excluded: continue
+
+        s.visited = True
+        ns = []
+
+        for n in s.edges:
+            if n.visited == False: ns.append(n.name)
+            n.visited = True
+            
+        tagsnps.append(s)
+        neighbors[s.name] = list(set(ns))
+
+        heapq.heapify(data)
+
+    for s in tagsnps:
+        if len(neighbors[s.name]) > 0: 
+            print "%s\t%s" % (s, ",".join(neighbors[s.name]))
+            continue
+        print s
+        
+    if debug_flag == True: check_output(g, tagsnps) 
+       
+def read_list(filename):
+    assert os.path.exists(filename) == True
+    file = open(filename, "r")
+    list = {}
+
+    for line in file:
+        list[line.strip()] = 1
+
+    file.close()
+    return list
+           
+def usage():
+    f = stderr
+    print >> f, "usage:"
+    print >> f, "senatag [options] neighborhood.txt inputsnps.txt"
+    print >> f, "where inputsnps.txt is a file of snps from one population"
+    print >> f, "where neighborhood.txt is neighborhood details for the pop."
+    print >> f, "where the options are:"
+    print >> f, "-h,--help : print usage and quit"
+    print >> f, "-d,--debug: print debug information"
+    print >> f, "-e,--excluded : file with names of SNPs that cannot be TagSNPs"
+    print >> f, "-r,--required : file with names of SNPs that should be TagSNPs"
+
+if __name__ == "__main__":
+    try:
+        opts, args = getopt(argv[1:], "hdr:e:",\
+                     ["help", "debug", "required=", "excluded="])
+    except GetoptError, err:
+        print str(err)
+        usage()
+        exit(2) 
+
+    required = {}
+    excluded = {}
+
+    for o, a in opts:
+        if o in ("-h", "--help"):
+            usage()
+            exit()
+        elif o in ("-d", "--debug"):
+            debug_flag = True
+        elif o in ("-r", "--required"):
+            required = read_list(a)
+        elif o in ("-e", "--excluded"):
+            excluded = read_list(a)
+        else:
+            assert False, "unhandled option"
+
+    if len(args) != 2:
+        usage()
+        exit(3)
+
+    assert os.path.exists(args[0]) == True
+    assert os.path.exists(args[1]) == True
+    
+    main(args[0], args[1], required, excluded)
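The heap-driven loop in main() above reduces to a simple greedy rule: repeatedly
take the unvisited SNP that covers the most unvisited neighbors.  A toy trace of
that rule, with identifiers invented for illustration::

    # rs1 is in LD with rs2 and rs3; rs2 and rs3 are not linked to each
    # other, so rs1 alone tags all three SNPs.
    ld = {"rs1": ["rs2", "rs3"], "rs2": ["rs1"], "rs3": ["rs1"]}
    visited, tags = set(), []
    while len(visited) < len(ld):
        best = max((s for s in ld if s not in visited),
                   key=lambda s: sum(1 for m in ld[s] if m not in visited))
        tags.append(best)
        visited.add(best)
        visited.update(ld[best])
    print(tags)   # ['rs1']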
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/sift.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/sift.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,174 @@
+<tool id="hgv_sift" name="SIFT" version="1.0.0">
+  <description>predictions of functional sites</description>
+
+  <command interpreter="bash">
+    sift_variants_wrapper.sh "$input" "$output" "${input.metadata.dbkey}" "${GALAXY_DATA_INDEX_DIR}/sift_db.loc" "$chrom_col" "$pos_col" "$base" "$allele_col" "$strand_source.strand_col" "$comment_source.comment_col" "$output_opts"
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Dataset">
+      <validator type="unspecified_build"/>
+      <validator type="dataset_metadata_in_file" filename="sift_db.loc" metadata_name="dbkey" metadata_column="0" message="Data is currently not available for the specified build."/>
+    </param>
+    <param name="chrom_col"  type="data_column" data_ref="input" label="Column with chromosome"/>
+    <param name="pos_col"    type="data_column" data_ref="input" numerical="true" label="Column with position"/>
+    <param name="base" type="select" label="Position coordinates are">
+      <option value="1" selected="true">one-based</option>
+      <option value="0">zero-based</option>
+    </param>
+    <param name="allele_col" type="data_column" data_ref="input" label="Column with allele"/>
+    <conditional name="strand_source">
+      <param name="strand_choice" type="select" label="Strand info">
+        <option value="data_column" selected="true">a column in the dataset</option>
+        <option value="all_pos">all on sense/forward/+ strand</option>
+        <option value="all_neg">all on antisense/reverse/- strand</option>
+      </param>
+      <when value="data_column">
+        <param name="strand_col" type="data_column" data_ref="input" label="Column with strand"/>
+      </when>
+      <when value="all_pos">
+        <param name="strand_col" type="hidden" value="+"/>
+      </when>
+      <when value="all_neg">
+        <param name="strand_col" type="hidden" value="-"/>
+      </when>
+    </conditional>
+    <conditional name="comment_source">
+      <param name="comment_choice" type="select" label="Include comment column">
+        <option value="no" selected="true">no</option>
+        <option value="yes">yes</option>
+      </param>
+      <when value="no">
+        <param name="comment_col" type="hidden" value="-"/>
+      </when>
+      <when value="yes">
+        <param name="comment_col" type="data_column" data_ref="input" label="Column with comment"/>
+      </when>
+    </conditional>
+    <param name="output_opts" type="select" multiple="true" display="checkboxes" label="Include the following additional fields in the output">
+      <option value="A">Ensembl Gene ID</option>
+      <option value="B">Gene Name</option>
+      <option value="C">Gene Description</option>
+      <option value="D">Ensembl Protein Family ID</option>
+      <option value="E">Ensembl Protein Family Description</option>
+      <option value="F">Ensembl Transcript Status (Known / Novel)</option>
+      <option value="G">Protein Family Size</option>
+      <option value="H">Ka/Ks (Human-mouse)</option>
+      <option value="I">Ka/Ks (Human-macaque)</option>
+      <option value="J">OMIM Disease</option>
+      <option value="K">Allele Frequencies (All Hapmap Populations - weighted average)</option>
+      <option value="L">Allele Frequencies (CEU Hapmap population)</option>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="output" />
+  </outputs>
+
+  <requirements>
+    <requirement type="binary">awk</requirement>
+    <requirement type="binary">rm</requirement>
+    <requirement type="binary">sed</requirement>
+  </requirements>
+
+  <tests>
+    <test>
+      <param name="input" value="sift_variants.tab" ftype="tabular" dbkey="hg18"/>
+      <param name="chrom_col" value="1"/>
+      <param name="pos_col" value="3"/>
+      <param name="base" value="1"/>
+      <param name="allele_col" value="5"/>
+      <param name="strand_choice" value="data_column"/>
+      <param name="st[... middle of this hunk truncated by the changeset viewer ...]sts>
+
+  <help>
+.. class:: warningmark
+
+This currently works only for builds hg18 or hg19.
+
+-----
+
+**Dataset formats**
+
+The input and output datasets are tabular_.
+(`Dataset missing?`_)
+
+.. _tabular: ./static/formatHelp.html#tab
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+SIFT predicts whether an amino-acid substitution affects protein function,
+based on sequence homology and the physical properties of amino acids.
+SIFT can be applied to naturally occurring non-synonymous polymorphisms
+and laboratory-induced missense mutations.  This tool uses SQLite databases
+containing pre-computed SIFT scores and annotations for all possible nucleotide
+substitutions at each position in the human exome.  Allele frequency data
+are from the HapMap frequency database, and additional transcript and
+gene-level data are from Ensembl BioMart.
+
+The input dataset must contain columns for the chromosome, position, and
+alleles.  The alleles must be two nucleotides separated by '/',
+usually the reference allele and the allele of interest.
+The strand must either be in another column or all the same.
+The output contains a standard set of columns plus the additional ones that
+have been selected from the list above.
+
+Website: http://sift.jcvi.org/
+
+-----
+
+**Example**
+
+- input file::
+
+    chr3   81780820   +  T/C
+    chr2   230341630  +  G/A
+    chr2   43881517   +  A/T
+    chr2   43857514   +  T/C
+    chr6   88375602   +  G/A
+    chr22  29307353   -  T/A
+    chr10  115912482  -  G/T
+    chr10  115900918  -  C/T
+    chr16  69875502   +  G/T
+    etc.
+
+- output file::
+
+    #Chrom  Position   Strand  Allele  Codons   Transcript ID    Protein ID       Substitution  Region    dbSNP ID      SNP Type       Prediction  Score  Median Info  Num seqs at position  User Comment
+    chr3    81780820   +       T/C     AGA-gGA  ENST00000264326  ENSP00000264326  R190G         EXON CDS  rs2229519:C   Nonsynonymous  DAMAGING    0.04   3.06         149
+    chr2    230341630  +       G/T     -        ENST00000389045  ENSP00000373697  NA            EXON CDS  rs1803846:A   Unknown        Not scored  NA     NA           NA
+    chr2    43881517   +       A/T     ATA-tTA  ENST00000260605  ENSP00000260605  I230L         EXON CDS  rs11556157:T  Nonsynonymous  TOLERATED   0.47   3.19         7
+    chr2    43857514   +       T/C     TTT-TcT  ENST00000260605  ENSP00000260605  F33S          EXON CDS  rs2288709:C   Nonsynonymous  TOLERATED   0.61   3.33         6
+    chr6    88375602   +       G/A     GTT-aTT  ENST00000257789  ENSP00000257789  V217I         EXON CDS  rs2307389:A   Nonsynonymous  TOLERATED   0.75   3.17         13
+    chr22   29307353   +       T/A     ACC-tCC  ENST00000335214  ENSP00000334612  T264S         EXON CDS  rs42942:A     Nonsynonymous  TOLERATED   0.4    3.14         23
+    chr10   115912482  +       C/A     CGA-CtA  ENST00000369285  ENSP00000358291  R179L         EXON CDS  rs12782946:T  Nonsynonymous  TOLERATED   0.06   4.32         2
+    chr10   115900918  +       G/A     CAA-tAA  ENST00000369287  ENSP00000358293  Q271*         EXON CDS  rs7095762:T   Nonsynonymous  N/A         N/A    N/A          N/A
+    chr16   69875502   +       G/T     ACA-AaA  ENST00000338099  ENSP00000337512  T608K         EXON CDS  rs3096381:T   Nonsynonymous  TOLERATED   0.12   3.41         3
+    etc.
+
+-----
+
+**References**
+
+Ng PC, Henikoff S. (2001) Predicting deleterious amino acid substitutions.
+Genome Res. 11(5):863-74.
+
+Ng PC, Henikoff S. (2002) Accounting for human polymorphisms predicted to affect protein function.
+Genome Res. 12(3):436-46.
+
+Ng PC, Henikoff S. (2003) SIFT: Predicting amino acid changes that affect protein function.
+Nucleic Acids Res. 31(13):3812-4.
+
+Kumar P, Henikoff S, Ng PC. (2009) Predicting the effects of coding non-synonymous variants
+on protein function using the SIFT algorithm.
+Nat Protoc. 4(7):1073-81. Epub 2009 Jun 25.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/sift_variants_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/sift_variants_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,184 @@
+#!/usr/bin/env bash
+
+input_file=$1
+output_file=$2
+org=$3
+db_loc=$4
+chrom_col=$5
+pos_col=$6
+base=$7
+allele_col=$8
+strand_col=$9
+comment_col=${10}
+output_opts=${11}
+
+working_dir=$PWD
+sift_input="$working_dir/sift_input.txt"
+sift_output="$working_dir/sift_output.txt"
+
+################################################################################
+## make sure input file column selections are mutually exclusive              ##
+################################################################################
+ERROR=0
+declare -a col_use
+
+function check_col () {
+    local col=$1
+    local use=$2
+    local int=$3
+
+    if [ -n "${col//[0-9]}" ]; then
+        if [ $int -eq 1 ]; then
+            echo "ERROR: invalid value for $use column: $col" 1>&2
+            ERROR=1
+        fi
+        return
+    fi
+
+    local cur=${col_use[$col]}
+    if [ -n "$cur" ]; then
+        echo "ERROR: $use column is the same as $cur column" 1>&2
+        col_use[$col]="${cur},$use"
+        ERROR=1
+    else
+        col_use[$col]=$use
+    fi
+}
+
+check_col $chrom_col   'chromosome' 1
+check_col $pos_col     'position'   1
+check_col $allele_col  'allele'     1
+check_col $strand_col  'strand'     0
+check_col $comment_col 'comment'    0
+
+if [ $ERROR -ne 0 ]; then
+    exit 1
+fi
+
+################################################################################
+## get/check the db directory from the argument org,db_loc                    ##
+################################################################################
+db_dir=$( awk '$1 == org { print $2 }' org=$org $db_loc )
+
+if [ -z "$db_dir" ]; then
+    echo "Can't find dbkey \"$org\" in loc file \"$db_loc\"" 1>&2
+    exit 1
+fi
+
+if [ ! -d "$db_dir" ]; then
+    echo "Can't access SIFT database directory \"$db_dir\"" 1>&2
+    exit 1
+fi
+
+################################################################################
+## create input file for SIFT_exome_nssnvs.pl                                 ##
+################################################################################
+if [ ! -r "$input_file" ]; then
+    echo "Can't read input file \"$input_file\"" 1>&2
+    exit 1
+fi
+
+if [ $base -eq 0 ]; then
+    beg_col="$pos_col"
+    end_col="$pos_col + 1"
+    pos_adj='$2 = $2 - 1'
+else
+    beg_col="$pos_col - 1"
+    end_col="$pos_col"
+    pos_adj=''
+fi
+
+strand_cvt=''
+if [ \( "$strand_col" = "+" \) ]; then
+    strand='"1"'
+elif [ \( "$strand_col" = "-" \) ]; then
+    strand='"-1"'
+else
+    strand="\$$strand_col"
+    strand_cvt='if ('"${strand}"' == "+") {'"${strand}"' = "1"} else if ('"${strand}"' == "-") {'"${strand}"' = "-1"}'
+fi
+
+print_row='print $'"${chrom_col}"', $'"${beg_col}"', $'"${end_col}"', '"${strand}"', $'"${allele_col}"''
+if [ "$comment_col" != "-" ]; then
+    print_row=''"${print_row}"', $'"${comment_col}"''
+fi
+
+awk '
+BEGIN {FS="\t";OFS=","}
+$'"${chrom_col}"' ~ /^[cC][hH][rR]/ {$'"${chrom_col}"' = substr($'"${chrom_col}"',4)}
+{
+    '"${strand_cvt}"'
+    '"${print_row}"'
+}
+' "$input_file" > "$sift_input"
+
+################################################################################
+## run SIFT_exome_nssnvs.pl command line program                              ##
+################################################################################
+if [ "$output_opts" = "None" ]; then
+    output_opts=""
+else
+    output_opts=$( echo "$output_opts" | sed -e 's/,/ 1 -/g' )
+    output_opts="-$output_opts 1"
+fi
+
+SIFT_exome_nssnvs.pl -i "$sift_input" -d "$db_dir" -o "$working_dir" $output_opts &> "$sift_output"
+if [ $? -ne 0 ]; then
+    echo "failed: SIFT_exome_nssnvs.pl -i \"$sift_input\" -d \"$db_dir\" -o \"$working_dir\" $output_opts"
+    exit 1
+fi
+
+################################################################################
+## locate the SIFT_exome_nssnvs.pl output file                                ##
+################################################################################
+sift_pid=$( sed -n -e 's/^.*Your job id is \([0-9][0-9]*\) and is currently running.*$/\1/p' "$sift_output" )
+
+if [ -z "$sift_pid" ]; then
+    echo "Can't find SIFT pid in \"$sift_output\"" 1>&2
+    exit 1
+fi
+
+sift_outdir="$working_dir/$sift_pid"
+if [ ! -d "$sift_outdir" ]; then
+    echo "Can't access SIFT output directory \"$sift_outdir\"" 1>&2
+    exit 1
+fi
+
+sift_outfile="$sift_outdir/${sift_pid}_predictions.tsv"
+if [ ! -r "$sift_outfile" ]; then
+    echo "Can't access SIFT output file \"$sift_outfile\"" 1>&2
+    exit 1
+fi
+
+################################################################################
+## create galaxy output file                                                  ##
+################################################################################
+awk '
+BEGIN {FS="\t";OFS="\t"}
+NR == 1 {
+    $12 = "Num seqs at position"
+    $1 = "Chrom\tPosition\tStrand\tAllele"
+    print
+}
+NR != 1 {
+    $1 = "chr" $1
+    gsub(/,/, "\t", $1)
+    print
+}
+' "$sift_outfile" | awk '
+BEGIN {FS="\t";OFS="\t"}
+NR == 1 {
+    print "#" $0
+}
+NR != 1 {
+    if ($3 == "1") {$3 = "+"} else if ($3 == "-1") {$3 = "-"}
+    '"${pos_adj}"'
+    print
+}
+' > "$output_file"
+
+################################################################################
+## cleanup                                                                    ##
+################################################################################
+rm -rf "$sift_outdir" "$sift_input" "$sift_output"
+
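The first awk stage above rewrites each input row into the comma-separated form
SIFT_exome_nssnvs.pl expects: strip a leading "chr", bracket the position as
begin/end, and map +/- strands to 1/-1.  The same transform in Python, for
reference; the function and the default column numbers are invented for this
example (the real ones come from the tool's parameters)::

    # Mirror of the awk transform for a one-based, tab-separated input row.
    def to_sift_row(fields, chrom_col=1, pos_col=2, strand_col=3, allele_col=4):
        chrom = fields[chrom_col - 1]
        if chrom.lower().startswith("chr"):
            chrom = chrom[3:]                      # awk: /^[cC][hH][rR]/
        pos = int(fields[pos_col - 1])
        strand = {"+": "1", "-": "-1"}.get(fields[strand_col - 1],
                                           fields[strand_col - 1])
        return ",".join([chrom, str(pos - 1), str(pos), strand,
                         fields[allele_col - 1]])

    print(to_sift_row(["chr3", "81780820", "+", "T/C"]))
    # -> 3,81780819,81780820,1,T/C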
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/snpFreq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/snpFreq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,104 @@
+<tool id="hgv_snpFreq" name="snpFreq" version="1.0.0">
+  <description>significant SNPs in case-control data</description>
+
+  <command interpreter="perl">
+    snpFreq2.pl $input $group1_1 $group1_2 $group1_3 $group2_1 $group2_2 $group2_3 0.05 $output
+  </command>
+
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Dataset" />
+    <param name="group1_1" label="Column with genotype 1 count for group 1" type="data_column" data_ref="input" />
+    <param name="group1_2" label="Column with genotype 2 count for group 1" type="data_column" data_ref="input" />
+    <param name="group1_3" label="Column with genotype 3 count for group 1" type="data_column" data_ref="input" />
+    <param name="group2_1" label="Column with genotype 1 count for group 2" type="data_column" data_ref="input" />
+    <param name="group2_2" label="Column with genotype 2 count for group 2" type="data_column" data_ref="input" />
+    <param name="group2_3" label="Column with genotype 3 count for group 2" type="data_column" data_ref="input" />
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="output" />
+  </outputs>
+
+  <requirements>
+    <requirement type="binary">R</requirement>
+  </requirements>
+
+  <tests>
+    <test>
+      <param name="input" ftype="tabular" value="snpFreqInput.txt" dbkey="hg18" />
+      <param name="group1_1" value="4" />
+      <param name="group1_2" value="5" />
+      <param name="group1_3" value="6" />
+      <param name="group2_1" value="7" />
+      <param name="group2_2" value="8" />
+      <param name="group2_3" value="9" />
+      <output name="output" file="snpFreqTestOut.txt" />
+    </test>
+  </tests>
+
+  <help>
+
+**Dataset formats**
+
+The input is tabular_, with six columns of genotype counts.  The output is also tabular,
+and includes all of the input data plus the additional columns described below.
+(`Dataset missing?`_)
+
+.. _tabular: ./static/formatHelp.html#tab
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+This tool performs a basic analysis of bi-allelic SNPs in case-control
+data, using the R statistical environment and Fisher's exact test to
+identify SNPs with a significant difference in the allele frequencies
+between the two groups.  R's "qvalue" package is used to correct for
+multiple testing.
+
+The input file includes counts for each allele combination (AA aa Aa)
+for each group at each SNP position.  The assignment of codes (1 2 3)
+to these genotypes is arbitrary, as long as it is consistent for both
+groups.  Any other input columns are ignored in the computation, but
+are copied to the output.  The output appends eight additional columns,
+namely the minimum expected counts of the three genotypes for each
+group, the p-value, and the q-value.
+
+-----
+
+**Example**
+
+- input file::
+
+    chr1  210  211  38  4  15  56  0   1   x
+    chr1  228  229  55  0  2   56  0   1   x
+    chr1  230  231  46  0  11  55  0   2   x
+    chr1  234  235  43  0  14  55  0   2   x
+    chr1  236  237  55  0  2   13  10  34  x
+    chr1  437  438  55  0  2   46  0   11  x
+    chr1  439  440  56  0  1   55  0   2   x
+    chr1  449  450  56  0  1   13  20  24  x
+    chr1  518  519  56  0  1   38  4   15  x
+
+Here the group 1 genotype counts are in columns 4 - 6, while those
+for group 2 are in columns 7 - 9.
+
+Note that the "x" column has no meaning.  It was added to this example
+to show that extra columns can be included, and to make it easier
+to see where the new columns are appended in the output.
+
+- output file::
+
+    chr1  210  211  38  4  15  56  0   1   x  47    2   8     47    2   8     1.50219088598917e-05  6.32501425679652e-06
+    chr1  228  229  55  0  2   56  0   1   x  55.5  0   1.5   55.5  0   1.5   1                     0.210526315789474
+    chr1  230  231  46  0  11  55  0   2   x  50.5  0   6.5   50.5  0   6.5   0.0155644201009862    0.00409590002657532
+    chr1  234  235  43  0  14  55  0   2   x  49    0   8     49    0   8     0.00210854461554067   0.000739840215979182
+    chr1  236  237  55  0  2   13  10  34  x  34    5   18    34    5   18    6.14613878554783e-17  4.31307984950725e-17
+    chr1  437  438  55  0  2   46  0   11  x  50.5  0   6.5   50.5  0   6.5   0.0155644201009862    0.00409590002657532
+    chr1  439  440  56  0  1   55  0   2   x  55.5  0   1.5   55.5  0   1.5   1                     0.210526315789474
+    chr1  449  450  56  0  1   13  20  24  x  34.5  10  12.5  34.5  10  12.5  2.25757007974134e-18  2.37638955762246e-18
+    chr1  518  519  56  0  1   38  4   15  x  47    2   8     47    2   8     1.50219088598917e-05  6.32501425679652e-06
+
+  </help>
+</tool>
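The appended expected-count columns can be checked by hand: each is a
row-margin product over the pooled genotype counts, while the p- and q-values
come from R's fisher.test and the qvalue package, which snpFreq2.pl drives.
A short Python check using the first example row above::

    # Expected genotype counts under independence, matching the 47/2/8
    # values appended twice to the first example output row.
    g1 = [38, 4, 15]   # group 1 genotype counts (columns 4-6)
    g2 = [56, 0, 1]    # group 2 genotype counts (columns 7-9)
    n = float(sum(g1) + sum(g2))
    exp1 = [round((a + b) * sum(g1) / n, 3) for a, b in zip(g1, g2)]
    exp2 = [round((a + b) * sum(g2) / n, 3) for a, b in zip(g1, g2)]
    print(exp1, exp2)  # [47.0, 2.0, 8.0] [47.0, 2.0, 8.0]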
diff -r 000000000000 -r 9071e359b9a3 tools/human_genome_variation/snpFreq2.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/human_genome_variation/snpFreq2.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,107 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+#expected input: path to file, cols of counts (2 sets of 3), threshold
+if (!@ARGV or scalar @ARGV != 9) {
+   print "usage: snpFreq.pl /path/to/snps.txt <6 column numbers (1-based) with counts for alleles, first one group then another> threshold outfile\n";
+   exit 1;
+}
+
+#get and verify inputs
+my $file = shift @ARGV;
+my $a1 = shift @ARGV;
+if ($a1 =~ /\D/ or $a1 < 1) {
+   print "Error: the column number must be an integer greater than or equal to 1. Got $a1\n";
+   exit 1;
+}
+my $a2 = shift @ARGV;
+if ($a2 =~ /\D/ or $a2 < 1) {
+   print "Error: the column number must be an integer greater than or equal to 1. Got $a2\n";
+   exit 1;
+}
+my $a3 = shift @ARGV;
+if ($a3 =~ /\D/ or $a3 < 1) {
+   print "Error: the column number must be an integer greater than or equal to 1. Got $a3\n";
+   exit 1;
+}
+my $b1 = shift @ARGV;
+if ($b1 =~ /\D/ or $b1 < 1) {
+   print "Error: the column number must be an integer greater than or equal to 1. Got $b1\n";
+   exit 1;
+}
+my $b2 = shift @ARGV;
+if ($b2 =~ /\D/ or $b2 < 1) {
+   print "Error: the column number must be an integer greater than or equal to 1. Got $b2\n";
+   exit 1;
+}
+my $b3 = shift @ARGV;
+if ($b3 =~ /\D/ or $b3 < 1) {
+   print "Error: the column number must be an integer greater than or equal to 1. Got $b3\n";
+   exit 1;
+}
+my $thresh = shift @ARGV;
+if ($thresh !~ /^\d*\.?\d+$/) {
+   print "Error: the threshold must be a number. Got $thresh\n";
+   exit 1;
+}elsif ($thresh > .3) {
+   print "Error: the threshold cannot be greater than 0.3. Got $thresh\n";
+   exit 1;
+}
+my $outfile = shift @ARGV;
+
+#run a fishers exact test (using R) on whole table
+my $cmd = qq|options(warn=-1)
+           tab <- read.table('$file', sep="\t")
+           size <- length(tab[,1])
+           width <- length(tab[1,])
+           x <- 1:size
+           y <- matrix(data=0, nr=size, nc=6)
+           for(i in 1:size) {
+              m <- matrix(c(tab[i,$a1], tab[i,$b1], tab[i,$a2], tab[i,$b2], tab[i,$a3], tab[i,$b3]), nrow=2)
+              t <- fisher.test(m)
+              x[i] <- t\$p.value
+              if (x[i] >= 1) {
+                  x[i] <- .999
+              }
+              n <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3] + tab[i,$b1] + tab[i,$b2] + tab[i,$b3])
+              n_a <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3])
+              y[i,1] <- ((tab[i,$a1] + tab[i,$b1])*(n_a))/n
+              y[i,1] <- round(y[i,1],3)
+              y[i,2] <- ((tab[i,$a2] + tab[i,$b2])*(n_a))/n
+              y[i,2] <- round(y[i,2],3)
+              y[i,3] <- ((tab[i,$a3] + tab[i,$b3])*(n_a))/n
+              y[i,3] <- round(y[i,3],3)
+              n_b <- (tab[i,$b1] + tab[i,$b2] + tab[i,$b3])
+              y[i,4] <- ((tab[i,$a1] + tab[i,$b1])*(n_b))/n
+              y[i,4] <- round(y[i,4],3)
+              y[i,5] <- ((tab[i,$a2] + tab[i,$b2])*(n_b))/n
+              y[i,5] <- round(y[i,5],3)
+              y[i,6] <- ((tab[i,$a3] + tab[i,$b3])*(n_b))/n
+              y[i,6] <- round(y[i,6],3)
+           }|;
+           #results <- data.frame(tab[1:size,1:width], x[1:size])
+           #write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t")
+           #q()|;
+
+my $cmd2 = qq|suppressPackageStartupMessages(library(qvalue))
+              qobj <- qvalue(x[1:size], lambda=seq(0,0.90,$thresh), pi0.method="bootstrap", fdr.level=0.1, robust=FALSE, smooth.log.pi0 = FALSE)
+              q <- qobj\$qvalues
+              results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size], q[1:size])
+              write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t")
+              q()|;
+
+#for TESTING
+my $pr = qq|results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size])
+            write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t")
+              q()|;
+
+open(FT, "| R --slave --vanilla") 
+   or die "Couldn't call fisher.test, $!\n";
+print FT $cmd, "\n"; #fisher test
+print FT $cmd2, "\n"; #qvalues and results
+#print FT $pr, "\n";
+close FT or die "Couldn't finish fisher.test, $!\n";
+
+exit;
diff -r 000000000000 -r 9071e359b9a3 tools/hyphy/hyphy_branch_lengths_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/hyphy/hyphy_branch_lengths_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,55 @@
+#Dan Blankenberg
+#takes a command-line tree definition and an input multiple FASTA alignment file and runs the branch length analysis
+import os, sys
+from galaxy import eggs
+from galaxy.tools.util import hyphy_util
+
+#Retrieve hyphy path, this will need to be the same across the cluster
+tool_data = sys.argv.pop()
+HYPHY_PATH = os.path.join( tool_data, "HYPHY" )
+HYPHY_EXECUTABLE = os.path.join( HYPHY_PATH, "HYPHY" )
+
+#Read command line arguments
+input_filename = os.path.abspath(sys.argv[1].strip())
+output_filename = os.path.abspath(sys.argv[2].strip())
+tree_contents = sys.argv[3].strip()
+nuc_model = sys.argv[4].strip()
+base_freq = sys.argv[5].strip()
+model_options = sys.argv[6].strip()
+
+#Set up Temporary files for hyphy run
+#set up tree file
+tree_filename = hyphy_util.get_filled_temp_filename(tree_contents)
+
+#Guess if this is a single or multiple FASTA input file
+found_blank = False
+is_multiple = False
+for line in open(input_filename):
+    line = line.strip()
+    if line == "": found_blank = True
+    elif line.startswith(">") and found_blank:
+        is_multiple = True
+        break
+    else: found_blank = False
+
+#set up BranchLengths file
+BranchLengths_filename = hyphy_util.get_filled_temp_filename(hyphy_util.BranchLengths)
+if is_multiple: 
+    os.unlink(BranchLengths_filename)
+    BranchLengths_filename = hyphy_util.get_filled_temp_filename(hyphy_util.BranchLengthsMF)
+    print "Multiple Alignment Analyses"
+else: print "Single Alignment Analysis"
+
+#setup Config file
+config_filename = hyphy_util.get_branch_lengths_config_filename(input_filename, nuc_model, model_options, base_freq, tree_filename, output_filename, BranchLengths_filename)
+
+#Run Hyphy
+hyphy_cmd = "%s BASEPATH=%s USEPATH=/dev/null %s" % (HYPHY_EXECUTABLE, HYPHY_PATH, config_filename)
+hyphy = os.popen(hyphy_cmd, 'r')
+#print hyphy.read()
+hyphy.close()
+
+#remove temporary files
+os.unlink(BranchLengths_filename)
+os.unlink(tree_filename)
+os.unlink(config_filename)
diff -r 000000000000 -r 9071e359b9a3 tools/hyphy/hyphy_branch_lengths_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/hyphy/hyphy_branch_lengths_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,95 @@
+<?xml version="1.0"?>
+<tool name="Branch Lengths" id="hyphy_branch_lengths_wrapper1">
+
+ <description>Estimation</description>
+
+ <command interpreter="python">hyphy_branch_lengths_wrapper.py $input1 $out_file1 "$tree" "$model" "$base_freq" "Global" ${GALAXY_DATA_INDEX_DIR}</command>
+
+    <inputs>
+        <page>
+            <param format="fasta" name="input1" type="data" label="Fasta file"/>
+            <param name="tree" type="text" label="Tree Definition" size="20" help="For example: ((hg17,panTro1),(mm5,rn3),canFam1)"/>
+            <param name="model" type="select" label="Substitution Model">
+               <option value="000000">F81</option>
+                <option value="010010">HKY85</option>
+                <option value="012345">REV</option>
+            </param>
+<!--            <param name="model_options" type="select" label="Model Options">
+               <option value="Local">All model parameters are estimated independently for each branch</option>
+                <option value="Global">Model parameters are shared by all branches, branch lengths are estimated independently</option>
+                <option value="Global w/variation">Model parameters are shared by all branches, branch lengths come from a user-chosen distribution, whose parameters are estimated</option>
+                <option value="Global w/variation+HM">Model parameters are shared by all branches, branch lengths come from a user-chosen distribution, whose parameters are estimated; rates at adjacent sites are correlated via a simple Hidden Markov model with an autocorrelation parameter lambda</option>
+            </param> -->
+            <param name="base_freq" type="select" label="Base Frequencies">
+               <option value="Observed">Nucleotide frequencies collected from the data file will be used as equilibrium frequencies</option>
+                <option value="Equal">Equal (.25) frequencies are used as equilibrium frequencies</option>
+            </param>
+        </page>
+    </inputs>
+ <outputs>
+ <data name="out_file1" format="tabular" />
+ </outputs>
+    <tests>
+      <test>
+        <param name="input1" value="branchlength_in.fasta"/>
+        <param name="tree" value="((hg17,panTro1),(mm5,rn3),canFam1)"/>
+        <param name="model" value="012345"/>
+        <param name="base_freq" value="Observed"/>
+        <output name="out_file1" file="branchlength_out.tabular"/>
+      </test>
+    </tests>
+ <help>
+This tool takes a single or multiple FASTA alignment file and estimates branch lengths using HYPHY_, a maximum likelihood analysis package.
+
+For the tree definition, you only need to specify the species build names. For example, you could use the tree *((hg17,panTro1),(mm5,rn3),canFam1)*, if your FASTA file looks like this::
+
+    &gt;hg17.chr7(+):26907301-26907310|hg17_0
+    GTGGGAGGT
+    &gt;panTro1.chr6(+):28037319-28037328|panTro1_0
+    GTGGGAGGT
+    &gt;mm5.chr6(+):52104022-52104031|mm5_0
+    GTGGGAGGT
+    &gt;rn3.chr4(+):80734395-80734404|rn3_0
+    GTGGGAGGT
+    &gt;canFam1.chr14(+):42826409-42826418|canFam1_0
+    GTGGGAGGT
+
+    &gt;hg17.chr7(+):26907310-26907326|hg17_1
+    AGTCAGAGTGTCTGAG
+    &gt;panTro1.chr6(+):28037328-28037344|panTro1_1
+    AGTCAGAGTGTCTGAG
+    &gt;mm5.chr6(+):52104031-52104047|mm5_1
+    AGTCAGAGTGTCTGAG
+    &gt;rn3.chr4(+):80734404-80734420|rn3_1
+    AGTCAGAGTATCTGAG
+    &gt;canFam1.chr14(+):42826418-42826434|canFam1_1
+    AGTCAGAGTGTCTGAG
+
+    &gt;hg17.chr7(+):26907326-26907338|hg17_2
+    GTAGAAGACCCC
+    &gt;panTro1.chr6(+):28037344-28037356|panTro1_2
+    GTAGAAGACCCC
+    &gt;mm5.chr6(+):52104047-52104059|mm5_2
+    GTAGACGATGCC
+    &gt;rn3.chr4(+):80734420-80734432|rn3_2
+    GTAGATGATGCG
+    &gt;canFam1.chr14(+):42826434-42826446|canFam1_2
+    GTAGAAGACCCC
+
+    &gt;hg17.chr7(+):26907338-26907654|hg17_3
+    GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC
+    &gt;panTro1.chr6(+):28037356-28037672|panTro1_3
+    GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC
+    &gt;mm5.chr6(+):52104059-52104375|mm5_3
+    GGAGAAGGGGCACTGGGCGAGGGGCTAGATTTCTCAGATGAT---TCTTCCGTTTTCTCAT-----CGCTGCCAGG----AGGAGTGGCAGGGGAGATGGGCAGGAGCCCCTCCTTCTCACGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGCTGTAGGGACGCGGCAATCTCCACCCTGCGCGCTCGTGTAAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAATTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC
+    &gt;rn3.chr4(+):80734432-80734748|rn3_3
+    GGAGAAGGGGCGCTGGGCGAGGAGCTGGATTTCTCAGATGAT---TCTTCAGTTTTCTCAT-----CGCTTCCAGG----AGGGGTGGCGGGTGAAATGGGCAAGAGCCCCTCTTTCTCGCGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGTTGCAGGGACGCGGCTATCTCCACCCTGCGGGCTCTTGTTAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAGTTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCATACTCTCCAACTTTCC
+    &gt;canFam1.chr14(+):42826446-42826762|canFam1_3
+    GGAGACGGAATGCAGGGCGAGGAGCTGGATTTCTCTGAAGAT---TCCTCCGCCTTCTCCT-----CACTTCCTGG----CGGGGTGGCAGGGGAGATGGGCAAAAGGCCCTCTTTCTCTCGTTTCTTCTGCTTCATCCGGCGGTTCTGGAACCAGATCTTCACCTGGGTCTCGTTGAGCTGCAGGGATGCTGCGATCTCCACCCTGCGGGCGCGGGTCAGATACTTATTGAAGTGGAACTCCTTTTCCAGCTCGGTGAGCTGCTTGGTGGTGAAGTTGGTACGCACTGCATTCGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC
+    
+
+
+.. _HYPHY: http://www.hyphy.org
+ </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/hyphy/hyphy_dnds_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/hyphy/hyphy_dnds_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,55 @@
+#Guru
+#takes a FASTA alignment and a tree definition and estimates the dN/dS ratio
+import os, sys
+from galaxy import eggs
+from galaxy.tools.util import hyphy_util
+
+#Retrieve hyphy path, this will need to be the same across the cluster
+tool_data = sys.argv.pop()
+HYPHY_PATH = os.path.join( tool_data, "HYPHY" )
+HYPHY_EXECUTABLE = os.path.join( HYPHY_PATH, "HYPHY" )
+
+#Read command line arguments
+input_filename = os.path.abspath(sys.argv[1].strip())
+output_filename = os.path.abspath(sys.argv[2].strip())
+tree_contents = sys.argv[3].strip()
+nuc_model = sys.argv[4].strip()
+analysis = sys.argv[5].strip()
+
+if tree_contents == "":
+    print >> sys.stderr, "Please specify a valid tree definition."
+    sys.exit()
+        
+tree_filename = hyphy_util.get_filled_temp_filename(tree_contents)
+
+if analysis == "local":
+    fitter_filename = hyphy_util.get_filled_temp_filename(hyphy_util.SimpleLocalFitter)
+else:
+    fitter_filename = hyphy_util.get_filled_temp_filename(hyphy_util.SimpleGlobalFitter)
+
+tabwriter_filename = hyphy_util.get_filled_temp_filename(hyphy_util.TabWriter)
+FastaReader_filename = hyphy_util.get_filled_temp_filename(hyphy_util.FastaReader)
+#setup Config file
+config_filename = hyphy_util.get_dnds_config_filename(fitter_filename, tabwriter_filename, "Universal", tree_filename, input_filename, nuc_model, output_filename, FastaReader_filename)
+
+#Run Hyphy
+hyphy_cmd = "%s BASEPATH=%s USEPATH=/dev/null %s" % (HYPHY_EXECUTABLE, HYPHY_PATH, config_filename)
+hyphy = os.popen(hyphy_cmd, 'r')
+#print hyphy.read()
+hyphy.close()
+
+#remove temporary files
+os.unlink(fitter_filename)
+os.unlink(tabwriter_filename)
+os.unlink(tree_filename)
+os.unlink(FastaReader_filename)
+os.unlink(config_filename)
+
+if nuc_model == "000000":
+    model = "F81"
+elif nuc_model == "010010":
+    model = "HKY85"
+else:
+    model = "REV"
+    
+print "Analysis: %s; Model: %s; Tree: %s" %(analysis, model, tree_contents)
diff -r 000000000000 -r 9071e359b9a3 tools/hyphy/hyphy_dnds_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/hyphy/hyphy_dnds_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,101 @@
+<?xml version="1.0"?>
+<tool name="dN/dS Ratio" id="hyphy_dnds_wrapper1">
+
+ <description>Estimation</description>
+
+ <command interpreter="python">hyphy_dnds_wrapper.py $input1 $out_file1 "$tree" "$model" $analysis ${GALAXY_DATA_INDEX_DIR}</command>
+
+    <inputs>
+        <page>
+            <param format="fasta" name="input1" type="data" label="Fasta file"/>
+            <param name="analysis" type="select" label="Analysis to run on every gene">
+            <option value="global">Global</option>
+               <option value="local">Local</option>
+            </param>
+            <param name="tree" type="text" label="Tree Definition in Newick format" size="20" help="For example: ((hg17,panTro1),(mm5,rn3),canFam1)"/>
+            <param name="model" type="select" label="Substitution Model">
+               <option value="000000">F81</option>
+                <option value="010010">HKY85</option>
+                <option value="012345">REV</option>
+            </param>
+        </page>
+    </inputs>
+ <outputs>
+ <data name="out_file1" format="tabular" />
+ </outputs>
+    <tests>
+      <test>
+        <param name="input1" value="dnds_inp.fasta"/>
+        <param name="tree" value="((human, chimp), mouse)"/>
+        <param name="model" value="000000"/>
+        <param name="analysis" value="global"/>
+        <output name="out_file1" file="dnds_out.tabular"/>
+      </test>
+    </tests>
+ <help>
+
+.. class:: infomark
+
+This tool takes a FASTA alignment file and estimates the dN/dS ratio using HYPHY_, a maximum likelihood analysis package.
+
+-----
+
+.. class:: warningmark
+
+The tool returns an error message if no tree definition, or an invalid one, is supplied.
+Any block not containing as many species as the tree definition mentions will be omitted from the output.
+
+-----
+
+For the tree definition, you only need to specify the species build names. For example, you could use the tree *((hg17,panTro1),(mm5,rn3),canFam1)*, if your FASTA file looks like the example below. You may also use the **Neighbor Joining Tree Builder** tool to obtain the tree definition::
+
+    &gt;hg17.chr7(+):26907301-26907310|hg17_0
+    GTGGGAGGT
+    &gt;panTro1.chr6(+):28037319-28037328|panTro1_0
+    GTGGGAGGT
+    &gt;mm5.chr6(+):52104022-52104031|mm5_0
+    GTGGGAGGT
+    &gt;rn3.chr4(+):80734395-80734404|rn3_0
+    GTGGGAGGT
+    &gt;canFam1.chr14(+):42826409-42826418|canFam1_0
+    GTGGGAGGT
+
+    &gt;hg17.chr7(+):26907310-26907326|hg17_1
+    AGTCAGAGTGTCTGAG
+    &gt;panTro1.chr6(+):28037328-28037344|panTro1_1
+    AGTCAGAGTGTCTGAG
+    &gt;mm5.chr6(+):52104031-52104047|mm5_1
+    AGTCAGAGTGTCTGAG
+    &gt;rn3.chr4(+):80734404-80734420|rn3_1
+    AGTCAGAGTATCTGAG
+    &gt;canFam1.chr14(+):42826418-42826434|canFam1_1
+    AGTCAGAGTGTCTGAG
+
+    &gt;hg17.chr7(+):26907326-26907338|hg17_2
+    GTAGAAGACCCC
+    &gt;panTro1.chr6(+):28037344-28037356|panTro1_2
+    GTAGAAGACCCC
+    &gt;mm5.chr6(+):52104047-52104059|mm5_2
+    GTAGACGATGCC
+    &gt;rn3.chr4(+):80734420-80734432|rn3_2
+    GTAGATGATGCG
+    &gt;canFam1.chr14(+):42826434-42826446|canFam1_2
+    GTAGAAGACCCC
+
+    &gt;hg17.chr7(+):26907338-26907654|hg17_3
+    GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC
+    &gt;panTro1.chr6(+):28037356-28037672|panTro1_3
+    GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC
+    &gt;mm5.chr6(+):52104059-52104375|mm5_3
+    GGAGAAGGGGCACTGGGCGAGGGGCTAGATTTCTCAGATGAT---TCTTCCGTTTTCTCAT-----CGCTGCCAGG----AGGAGTGGCAGGGGAGATGGGCAGGAGCCCCTCCTTCTCACGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGCTGTAGGGACGCGGCAATCTCCACCCTGCGCGCTCGTGTAAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAATTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC
+    &gt;rn3.chr4(+):80734432-80734748|rn3_3
+    GGAGAAGGGGCGCTGGGCGAGGAGCTGGATTTCTCAGATGAT---TCTTCAGTTTTCTCAT-----CGCTTCCAGG----AGGGGTGGCGGGTGAAATGGGCAAGAGCCCCTCTTTCTCGCGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGTTGCAGGGACGCGGCTATCTCCACCCTGCGGGCTCTTGTTAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAGTTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCATACTCTCCAACTTTCC
+    &gt;canFam1.chr14(+):42826446-42826762|canFam1_3
+    GGAGACGGAATGCAGGGCGAGGAGCTGGATTTCTCTGAAGAT---TCCTCCGCCTTCTCCT-----CACTTCCTGG----CGGGGTGGCAGGGGAGATGGGCAAAAGGCCCTCTTTCTCTCGTTTCTTCTGCTTCATCCGGCGGTTCTGGAACCAGATCTTCACCTGGGTCTCGTTGAGCTGCAGGGATGCTGCGATCTCCACCCTGCGGGCGCGGGTCAGATACTTATTGAAGTGGAACTCCTTTTCCAGCTCGGTGAGCTGCTTGGTGGTGAAGTTGGTACGCACTGCATTCGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC
+    
+
+
+.. _HYPHY: http://www.hyphy.org
+ </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/hyphy/hyphy_nj_tree_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/hyphy/hyphy_nj_tree_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,58 @@
+#Dan Blankenberg
+#takes FASTA alignments and a distance metric and builds neighbor joining trees
+import os, sys
+from galaxy import eggs
+from galaxy.tools.util import hyphy_util
+
+#Retrieve hyphy path, this will need to be the same across the cluster
+tool_data = sys.argv.pop()
+HYPHY_PATH = os.path.join( tool_data, "HYPHY" )
+HYPHY_EXECUTABLE = os.path.join( HYPHY_PATH, "HYPHY" )
+
+#Read command line arguments
+input_filename = os.path.abspath(sys.argv[1].strip())
+output_filename1 = os.path.abspath(sys.argv[2].strip())
+output_filename2 = os.path.abspath(sys.argv[3].strip())
+distance_metric = sys.argv[4].strip()
+temp_ps_filename = hyphy_util.get_filled_temp_filename("")
+
+#Guess if this is a single or multiple FASTA input file
+found_blank = False
+is_multiple = False
+for line in open(input_filename):
+    line = line.strip()
+    if line == "": found_blank = True
+    elif line.startswith(">") and found_blank:
+        is_multiple = True
+        break
+    else: found_blank = False
+
+NJ_tree_shared_ibf = hyphy_util.get_filled_temp_filename(hyphy_util.NJ_tree_shared_ibf)
+
+#set up NJ_tree file
+NJ_tree_filename = hyphy_util.get_filled_temp_filename(hyphy_util.get_NJ_tree(NJ_tree_shared_ibf))
+#setup Config file
+config_filename = hyphy_util.get_nj_tree_config_filename(input_filename, distance_metric, output_filename1, temp_ps_filename, NJ_tree_filename)
+if is_multiple: 
+    os.unlink(NJ_tree_filename)
+    os.unlink(config_filename)
+    NJ_tree_filename = hyphy_util.get_filled_temp_filename(hyphy_util.get_NJ_treeMF(NJ_tree_shared_ibf))
+    config_filename = hyphy_util.get_nj_treeMF_config_filename(input_filename, output_filename1, temp_ps_filename, distance_metric, NJ_tree_filename)
+    print "Multiple Alignment Analyses"
+else: print "Single Alignment Analysis"
+
+
+#Run Hyphy
+hyphy_cmd = "%s BASEPATH=%s USEPATH=/dev/null %s" % (HYPHY_EXECUTABLE, HYPHY_PATH, config_filename)
+hyphy = os.popen(hyphy_cmd, 'r')
+#print hyphy.read()
+hyphy.close()
+
+#remove temporary files
+os.unlink(NJ_tree_filename)
+os.unlink(config_filename)
+
+
+#Convert PS to PDF
+if os.path.getsize(temp_ps_filename)>0: temp = os.popen("ps2pdf %s %s" % (temp_ps_filename, output_filename2), 'r').close()
+os.unlink(temp_ps_filename)
diff -r 000000000000 -r 9071e359b9a3 tools/hyphy/hyphy_nj_tree_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/hyphy/hyphy_nj_tree_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,46 @@
+<?xml version="1.0"?>
+<tool name="Neighbor Joining Tree" id="hyphy_nj_tree_wrapper1">
+    
+    <description>Builder</description>
+    
+    <command interpreter="python">hyphy_nj_tree_wrapper.py $input1 $out_file1 $out_file2 $distance_metric ${GALAXY_DATA_INDEX_DIR}</command>
+    
+    <inputs>
+        <page>
+            <param format="fasta" name="input1" type="data" label="Fasta file"/>
+            <param name="distance_metric" type="select" label="Distance Model">
+                  <option value="TN93">Tamura-Nei (93)</option>
+                <!-- <option value="TN93_RV">Tamura-Nei (93) distance and rate variation (unequal character frequencies, A->G, C->T and transversional bias corrections, gamma distributed rate variation from site to site)</option> -->
+                <!-- <option value="TN84">Tajima-Nei (84) distance (unequal character frequencies)</option> -->
+                <!-- <option value="K2P_RV">Kimura 2 parameter and rate variation (equal character frequencies, transition/transversion bias correction, gamma distributed rate variation from site to site)</option> -->
+                <option value="K2P">Kimura 2 parameter</option>
+                <option value="JC69">Jukes-Cantor</option>
+                <!-- <option value="T3P">Tamura 3-parameter (correction for GC content bias and transition/transversion bias)</option> -->
+                <!-- <option value="p_Distance">Number of observed substitutions per site</option> -->
+                <!-- <option value="Unaligned_LZ">Distance measure for unaligned sequences based on Lempel Ziv measure of information content</option> -->
+                <!-- <option value="Unaligned_LZ_FR">Distance measure for unaligned sequences based on Lempel Ziv measure of information content using the best choice forward and reverse string orientations</option> -->
+            </param>
+        </page>
+    </inputs>
+    <outputs>
+        <data name="out_file1" format="tabular" />
+        <data name="out_file2" format="pdf" />
+    </outputs>
+    <requirements>
+      <requirement type="binary">ps2pdf</requirement>
+    </requirements>
+    <tests>
+      <test>
+        <param name="input1" value="nj_tree_inp.fasta"/>
+        <param name="distance_metric" value="TN93"/>
+        <output name="out_file1" file="nj_tree_newick_out.tabular"/>
+        <output name="out_file2" file="nj_tree_pdf_out.pdf"/> 
+      </test>
+    </tests>
+    <help>
+This tool takes a single or multiple FASTA alignment file and builds Neighbor Joining Trees using HYPHY_, a maximum likelihood analysis package.
+
+.. _HYPHY: http://www.hyphy.org
+    </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/abyss.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/abyss.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,30 @@
+<tool id="abyss" name="ABySS" version="1.0.0">
+  <description>Short-read de Bruijn assembly</description>
+  <command interpreter="python">
+    quake_wrapper.py -k $k -r $input1 -p 8 > $output1
+  </command>
+  <inputs>
+    <param name="input1" format="fastq" type="data" label="Select FASTQ file to correct" />
+    <param name="k" type="integer" value="16" label="Size of k-mers to correct" />
+  </inputs>
+  <outputs>
+    <data format="fastq" name="output1" label="Error-corrected reads from ${on_string}" />
+  </outputs>
+  <help>
+
+**What it does**
+
+TBD.  Calls the ABySS assembler.
+
+**Parameter list**
+
+k
+
+**Output**
+
+Corrected reads
+
+  </help>
+</tool>
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/assembly_stats.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/assembly_stats.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+#
+#Copyright (c) 2011, Pacific Biosciences of California, Inc.
+#
+#All rights reserved.
+#
+#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+#    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+#    * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+#
+#THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+import sys, os
+from optparse import OptionParser
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( 'bx-python' )
+from bx.seq.fasta import FastaReader
+
+def getStats( fastaFile, genomeLength, minContigLength ):
+    lengths = []
+    stats = { "Num" : 0,
+              "Sum" : 0, 
+              "Max" : 0, 
+              "Avg" : 0,
+              "N50" : 0,
+              "99%" : 0 }
+    fasta_reader = FastaReader( open( fastaFile, 'rb' ) )
+    while True:
+        seq = fasta_reader.next()
+        if not seq:
+            break
+        if seq.length < minContigLength:
+            continue
+        lengths.append( seq.length )
+    if lengths:
+        stats[ 'Num' ] = len( lengths )
+        stats[ 'Sum' ] = sum( lengths )
+        stats[ 'Max' ] = max( lengths )
+        stats[ 'Avg' ] = int( sum( lengths ) / float( len( lengths ) ) )
+        stats[ 'N50' ] = 0
+        stats[ '99%' ] = 0
+        if genomeLength == 0:
+            genomeLength = sum( lengths )
+        lengths.sort()
+        lengths.reverse()
+        lenSum = 0
+        stats[ "99%" ] = len( lengths )
+        for idx, length in enumerate( lengths ):
+            lenSum += length
+            if ( lenSum > genomeLength / 2 ):
+                stats[ "N50" ] = length
+                break
+        lenSum = 0
+        for idx, length in enumerate( lengths ):
+            lenSum += length
+            if lenSum > genomeLength * 0.99:
+                stats[ "99%" ] = idx + 1
+                break
+    return stats
+
+def __main__():
+    #Parse Command Line
+    usage = 'Usage: %prog input output --minContigLength'
+    parser = OptionParser( usage=usage )
+    parser.add_option( "--minContigLength", dest="minContigLength", help="Minimum length of contigs to analyze" )
+    parser.add_option( "--genomeLength", dest="genomeLength", help="Length of genome for which to calculate N50s" )
+    parser.set_defaults( minContigLength=0, genomeLength=0 )
+    options, args = parser.parse_args()
+    input_fasta_file = args[ 0 ]
+    output_tabular_file = args[ 1 ]
+    statKeys = "Num Sum Max Avg N50 99%".split( " " )
+    stats = getStats( input_fasta_file, int( options.genomeLength ), int( options.minContigLength ) )
+    fout = open( output_tabular_file, "w" )
+    fout.write( "%s\n" % "\t".join( map( lambda key: str( stats[ key ] ), statKeys ) ) )
+    fout.close()
+
+if __name__=="__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/assembly_stats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/assembly_stats.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,54 @@
+<tool id="assembly_stats" name="Assembly Statistics" version="1.0.0">
+    <description>Calculate common measures of assembly quality</description>
+    <command interpreter="python">
+        assembly_stats.py $input1 $output1 --minContigLength=${minLength}
+    </command>
+    <inputs>
+        <param name="input1" format="fasta" type="data" label="Select FASTA file containing contigs"/>
+        <param name="minLength" type="integer" value="0" label="Minimum length of contigs to consider"/>
+    </inputs>
+    <outputs>
+        <data name="output1" format="tabular" label="Assembly statistics for ${on_string}"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input1" value="3.fasta" ftype="fasta"/>
+            <param name="minLength" value="100"/>
+            <output name="output1" ftype="tabular" file="assembly_stats.tabular" />
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
+Reports standard measures of *de novo* assembly quality, such as the number of contigs, the total contig length, the mean contig length, and N50.
+
+**Parameter list**
+
+Minimum length
+    Only include contigs of this size or greater for calculating statistics.
+
+**Output**
+
+Num contigs
+    Total number of contigs in the assembly
+
+Sum of contig lengths
+    Total sum of contig lengths
+
+Maximum contig length
+    Maximum of the contig lengths
+
+Mean contig length
+    Average contig length
+
+N50
+    Contig length at which 50% of the assembly is contained in contigs of this size or greater.
+
+99%
+    Number of contigs accounting for 99% of the observed assembly.
+
+    </help>
+</tool>
+
+
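The N50 logic in assembly_stats.py sorts contigs longest-first and reports the length at which the running sum first exceeds half the genome (or assembly) length. A minimal, self-contained restatement of that loop for reference (illustrative only, modern Python)::

    def n50(lengths, genome_length=0):
        # mirrors getStats(): genome length defaults to the assembly size
        total = genome_length or sum(lengths)
        running = 0
        for length in sorted(lengths, reverse=True):
            running += length
            if running > total / 2:
                return length
        return 0

    print(n50([10, 40, 30, 20]))  # 40 + 30 = 70 > 50, so N50 is 30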
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/cov_model.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/cov_model.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,238 @@
+#!/usr/bin/env python
+from optparse import OptionParser, SUPPRESS_HELP
+import os, random, quake
+
+############################################################
+# cov_model.py
+#
+# Given a file of kmer counts, reports the cutoff to use
+# to separate trusted/untrusted kmers.
+############################################################
+
+############################################################
+# main
+############################################################
+def main():
+    usage = 'usage: %prog [options] <counts file>'
+    parser = OptionParser(usage)
+    parser.add_option('--int', dest='count_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]')
+    parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff [default: %default]')
+    parser.add_option('--no_sample', dest='no_sample', action='store_true', default=False, help='Do not sample kmer coverages into kmers.txt because its already done [default: %default]')
+    # help='Model kmer coverage as a function of GC content of kmers [default: %default]'
+    parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP)
+    (options, args) = parser.parse_args()
+
+    if len(args) != 1:
+        parser.error('Must provide kmers counts file')
+    else:
+        ctsf = args[0]
+
+    if options.count_kmers:
+        model_cutoff(ctsf, options.ratio)
+        print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip()
+
+    else:
+        if options.model_gc:
+            model_q_gc_cutoffs(ctsf, 25000, options.ratio)
+        else:
+            model_q_cutoff(ctsf, 50000, options.ratio, options.no_sample)
+            print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip()
+
+
+############################################################
+# model_cutoff
+#
+# Make a histogram of kmers to give to R to learn the cutoff
+############################################################
+def model_cutoff(ctsf, ratio):
+    # make kmer histogram
+    cov_max = 0
+    for line in open(ctsf):
+        cov = int(line.split()[1])
+        if cov > cov_max:
+            cov_max = cov
+
+    kmer_hist = [0]*cov_max
+    for line in open(ctsf):
+        cov = int(line.split()[1])
+        kmer_hist[cov-1] += 1
+
+    cov_out = open('kmers.hist', 'w')
+    for cov in range(0,cov_max):
+        if kmer_hist[cov]:
+            print >> cov_out, '%d\t%d' % (cov+1,kmer_hist[cov])
+    cov_out.close()
+
+    os.system('R --slave --args %d < %s/cov_model.r 2> r.log' % (ratio,quake.quake_dir))
+
+
+############################################################
+# model_q_cutoff
+#
+# Sample kmers to give to R to learn the cutoff
+# 'div100' is necessary when the number of kmers is too
+# large for random.sample, so we only consider every 100th
+# kmer.
+############################################################
+def model_q_cutoff(ctsf, sample, ratio, no_sample=False):
+    if not no_sample:
+        # count number of kmer coverages
+        num_covs = 0
+        for line in open(ctsf):
+            num_covs += 1
+
+        # choose random kmer coverages
+        div100 = False
+        if sample >= num_covs:
+            rand_covs = range(num_covs)
+        else:
+            if num_covs > 1000000000:
+                div100 = True
+                rand_covs = random.sample(xrange(num_covs/100), sample)
+            else:
+                rand_covs = random.sample(xrange(num_covs), sample)
+        rand_covs.sort()
+
+        # print to file
+        out = open('kmers.txt', 'w')
+        kmer_i = 0
+        rand_i = 0
+        for line in open(ctsf):
+            if div100:
+                if kmer_i % 100 == 0 and kmer_i/100 == rand_covs[rand_i]:
+                    print >> out, line.split()[1]
+                    rand_i += 1
+                    if rand_i >= sample:
[...]
_i += 1
+        out.close()
+
+    os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r.log' % (ratio,quake.quake_dir))
+
+
+############################################################
+# model_q_gc_cutoffs
+#
+# Sample kmers to give to R to learn the cutoff for each
+# GC value
+############################################################
+def model_q_gc_cutoffs(ctsf, sample, ratio):
+    # count number of kmer coverages at each at
+    k = len(open(ctsf).readline().split()[0])
+    num_covs_at = [0]*(k+1)
+    for line in open(ctsf):
+        kmer = line.split()[0]
+        num_covs_at[count_at(kmer)] += 1
+
+    # for each AT bin
+    at_cutoffs = []
+    for at in range(1,k):
+        # sample covs
+        if sample >= num_covs_at[at]:
+            rand_covs = range(num_covs_at[at])
+        else:
+            rand_covs = random.sample(xrange(num_covs_at[at]), sample)
+        rand_covs.sort()
+
+        # print to file
+        out = open('kmers.txt', 'w')
+        kmer_i = 0
+        rand_i = 0
+        for line in open(ctsf):
+            (kmer,cov) = line.split()
+            if count_at(kmer) == at:
+                if kmer_i == rand_covs[rand_i]:
+                    print >> out, cov
+                    rand_i += 1
+                    if rand_i >= sample:
+                        break
+                kmer_i += 1
+        out.close()
+
+        os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at))
+
+        at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
+        if at in [1,k-1]:   # setting extremes to next closests
+            at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
+
+        os.system('mv kmers.txt kmers.at%d.txt' % at)
+        os.system('mv cutoff.txt cutoff.at%d.txt' % at)
+
+    out = open('cutoffs.gc.txt','w')
+    print >> out, '\n'.join(at_cutoffs)
+    out.close()
+
+
+############################################################
+# model_q_gc_cutoffs_bigmem
+#
+# Sample kmers to give to R to learn the cutoff for each
+# GC value
+############################################################
+def model_q_gc_cutoffs_bigmem(ctsf, sample, ratio):
+    # input coverages
+    k = 0
+    for line in open(ctsf):
+        (kmer,cov) = line.split()
+        if k == 0:
+            k = len(kmer)
+            at_covs = ['']*(k+1)
+        else:
+            at = count_at(kmer)
+            if at_covs[at]:
+                at_covs[at].append(cov)
+            else:
+                at_covs[at] = [cov]
+
+    for at in range(1,k):
+        print '%d %d' % (at,len(at_covs[at]))
+
+    # for each AT bin
+    at_cutoffs = []
+    for at in range(1,k):
+        # sample covs
+        if sample >= len(at_covs[at]):
+            rand_covs = at_covs[at]
+        else:
+            rand_covs = random.sample(at_covs[at], sample)
+
+        # print to file
+        out = open('kmers.txt', 'w')
+        for rc in rand_covs:
+            print >> out, rc
+        out.close()
+
+        os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at))
+
+        at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
+        if at in [1,k-1]:   # setting extremes to next closests
+            at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
+
+        os.system('mv kmers.txt kmers.at%d.txt' % at)
+        os.system('mv cutoff.txt cutoff.at%d.txt' % at)
+
+    out = open('cutoffs.gc.txt','w')
+    print >> out, '\n'.join(at_cutoffs)
+    out.close()
+
+
+############################################################
+# count_at
+#
+# Count A's and T's in the given sequence
+############################################################
+def count_at(seq):
+    return len([nt for nt in seq if nt in ['A','T']])
+
+
+############################################################
+# __main__
+############################################################
+if __name__ == '__main__':
+    main()
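model_cutoff() above first reduces the counts file to a coverage histogram before handing it to R. A compact modern-Python equivalent of that first step, assuming one "kmer coverage" pair per line as the script expects (hypothetical helper name)::

    from collections import Counter

    def kmer_coverage_histogram(counts_path):
        # coverage -> number of distinct k-mers observed at that coverage
        hist = Counter()
        with open(counts_path) as counts:
            for line in counts:
                hist[int(line.split()[1])] += 1
        return hist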
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/quake.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/quake.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+from optparse import OptionParser, SUPPRESS_HELP
+import os, random, sys
+import cov_model
+
+############################################################
+# quake.py
+#
+# Launch pipeline to correct errors in Illumina sequencing
+# reads.
+############################################################
+
+#r_dir = '/nfshomes/dakelley/research/error_correction/bin'
+quake_dir = os.path.abspath(os.path.dirname(sys.argv[0]))
+
+############################################################
+# main
+############################################################
+def main():
+    usage = 'usage: %prog [options]'
+    parser = OptionParser(usage)
+    parser.add_option('-r', dest='readsf', help='Fastq file of reads')
+    parser.add_option('-f', dest='reads_listf', help='File containing fastq file names, one per line or two per line for paired end reads.')
+    parser.add_option('-k', dest='k', type='int', help='Size of k-mers to correct')
+    parser.add_option('-p', dest='proc', type='int', default=4, help='Number of processes [default: %default]')
+    parser.add_option('-q', dest='quality_scale', type='int', default=-1, help='Quality value ascii scale, generally 64 or 33. If not specified, it will guess.')
+    parser.add_option('--no_count', dest='no_count', action='store_true', default=False, help='Kmers are already counted and in expected file [reads file].qcts or [reads file].cts [default: %default]')
+    parser.add_option('--no_cut', dest='no_cut', action='store_true', default=False, help='Coverage model is optimized and cutoff was printed to expected file cutoff.txt [default: %default]')
+    parser.add_option('--int', dest='counted_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]')
+    parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff.  Generally set between 10-1000 with lower numbers suggesting a lower threshold. [default: %default]')
+    # help='Model kmer coverage as a function of GC content of kmers [default: %default]'
+    parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP)
+    parser.add_option('--headers', action='store_true', default=False, help='Output original read headers (i.e. pass --headers to correct)' )
+    (options, args) = parser.parse_args()
+
+    if not options.readsf and not options.reads_listf:
+        parser.error('Must provide fastq file of reads with -r or file with list of fastq files of reads with -f')
+    if not options.k:
+        parser.error('Must provide k-mer size with -k')
+    if options.quality_scale == -1:
+        options.quality_scale = guess_quality_scale(options.readsf, options.reads_listf)
+
+    if options.counted_kmers:
+        cts_suf = 'cts'
+    else:
+        cts_suf = 'qcts'
+    if options.readsf:
+        ctsf = '%s.%s' % (os.path.splitext( os.path.split(options.readsf)[1] )[0], cts_suf)
+        reads_str = '-r %s' % options.readsf
+    else:
+        ctsf = '%s.%s' % (os.path.split(options.reads_listf)[1], cts_suf)
+        reads_str = '-f %s' % options.reads_listf
+
+    if not options.no_count and not options.no_cut:
+        count_kmers(options.readsf, options.reads_listf, options.k, ctsf, options.quality_scale)
+
+    if not options.no_cut:
+        # model coverage
+        if options.counted_kmers:
+            cov_model.model_cutoff(ctsf, options.ratio)
+        else:
+            if options.model_gc:
+                cov_model.model_q_gc_cutoffs(ctsf, 10000, options.ratio)
+            else:
+                cov_model.model_q_cutoff(ctsf, 25000, options.ratio)
+
+
+    if options.model_gc:
+        # run correct C++ code
+        os.system('%s/correct %s -k %d -m %s -a cutoffs.gc.txt -p %d -q %d' % (quake_dir,reads_str, options.k, ctsf, options.proc, options.quality_scale))
+
+    else:
+        cutoff = open('cutoff.txt').readline().rstrip()
+
+        # run correct C++ code
+        headers = '--headers' if options.headers else ''
+        os.system('%s/correct %s %s -k %d -m %s -c %s -p %d -q %d' % (quake_dir,headers, reads_str, options.k, ctsf, cutoff, options.proc, options.quality_scale))
+
+
+################################################################################
+# guess_quality_scale
+# Guess at ascii scale of quality values by examining
+# a bunch of reads and looking for quality values < 64,
+# in which case we set it to 33.
+################################################################################
+def guess_quality_scale(readsf, reads_listf):
+    reads_to_check = 1000
+    if not readsf:
+        readsf = open(reads_listf).readline().split()[0]
+
+    fqf = open(readsf)
+    reads_checked = 0
+    header = fqf.readline()
+    while header and reads_checked < reads_to_check:
+        seq = fqf.readline()
+        mid = fqf.readline()
+        qual = fqf.readline().rstrip()
+        reads_checked += 1
+        for q in qual:
+            if ord(q) < 64:
+                print 'Guessing quality values are on ascii 33 scale'
+                return 33
+        header = fqf.readline()
+
+    print 'Guessing quality values are on ascii 64 scale'
+    return 64
+        
+
+
+############################################################
+# count_kmers
+#
+# Count kmers in the reads file using AMOS count-kmers or
+# count-qmers
+############################################################
+def count_kmers(readsf, reads_listf, k, ctsf, quality_scale):
+    # find files
+    fq_files = []
+    if readsf:
+        fq_files.append(readsf)
+    else:
+        for line in open(reads_listf):
+            for fqf in line.split():
+                fq_files.append(fqf)
+
+    if ctsf[-4:] == 'qcts':
+        os.system('cat %s | %s/count-qmers -k %d -q %d > %s' % (' '.join(fq_files), quake_dir, k, quality_scale, ctsf))
+    else:
+        os.system('cat %s | %s/count-kmers -k %d > %s' % (' '.join(fq_files), quake_dir, k, ctsf))
+    
+            
+############################################################
+# __main__
+############################################################
+if __name__ == '__main__':
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/quake.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/quake.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,44 @@
+<tool id="quake" name="Quake" version="1.0.0">
+  <description>Quality-aware error correction</description>
+  <command interpreter="python">
+    quake_wrapper.py --default_cutoff=10 --headers -k $k -f $fofnfile -p 12 > $output1
+  </command>
+  <inputs>
+    <param name="input1" format="fastq" type="data" label="Select FASTQ file to correct" />
+    <param name="k" type="integer" value="16" label="Size of k-mers to correct" />
+  </inputs>
+  <configfiles>
+    <configfile name="fofnfile">
+${input1.file_name}
+    </configfile>
+  </configfiles>
+  <outputs>
+    <data format="fastq" name="output1" label="Error-corrected reads from ${on_string}" />
+  </outputs>
+  <help>
+
+**What it does**
+
+Applies the Quake_ algorithm for quality-aware correction of
+substitution errors in short reads.
+
+Kelley DR, Schatz MC, Salzberg SL.
+"Quake: quality-aware detection and correction of sequencing errors."
+*Genome Biol.* 2010;11(11):R116.
+
+.. _Quake: http://www.cbcb.umd.edu/software/quake
+
+**Parameter list**
+
+k
+    k-mer size for detecting spurious k-mers versus true k-mers from
+    the genome.  Recommendations for choosing a value of k can be found
+    here_.
+
+.. _here: http://www.cbcb.umd.edu/software/quake/faq.html
+
+**Output**
+
+A FASTQ file of corrected and trimmed reads.
+  </help>
+</tool>
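Quake's premise is that k-mers whose counts fall below the learned cutoff are untrusted and localize the errors within a read. A toy sketch of that classification step (hypothetical names, not the tool's code)::

    def untrusted_positions(read, k, counts, cutoff):
        # indices of k-mers in `read` whose observed count is below the cutoff
        return [i for i in range(len(read) - k + 1)
                if counts.get(read[i:i + k], 0) < cutoff]

    print(untrusted_positions('ACGTACGT', 4, {'ACGT': 9, 'CGTA': 8}, 5))  # [2, 3]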
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/quake_pe.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/quake_pe.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,53 @@
+<tool id="quake_pe" name="Quake PE" version="1.0.0">
+  <description>Quality-aware error correction for paired-end reads</description>
+  <command interpreter="python">
+    quake_wrapper.py --default_cutoff=$cutoff --headers -k $k -f $fofnfile -p 12 --output=$output1,$output2
+  </command>
+  <inputs>
+    <param name="input1" format="fastq" type="data" label="FASTQ file for forward reads" />
+    <param name="input2" format="fastq" type="data" label="FASTQ file for reverse reads" />
+    <param name="k" type="integer" value="16" label="Size of k-mers to correct" />
+    <param name="cutoff" type="integer" value="0" label="Default coverage cutoff if estimation fails"/>
+  </inputs>
+  <configfiles>
+    <configfile name="fofnfile">${input1.file_name} ${input2.file_name}
+    </configfile>
+  </configfiles>
+  <outputs>
+    <data format="fastq" name="output1" label="Error-corrected forward reads from ${on_string}" />
+    <data format="fastq" name="output2" label="Error-corrected reverse reads from ${on_string}" />
+  </outputs>
+  <help>
+
+**What it does**
+
+Applies the Quake_ algorithm for quality-aware correction of
+substitution errors in short reads.  This form of the tool is customized
+for correcting paired-end reads.
+
+Kelley DR, Schatz MC, Salzberg SL.
+"Quake: quality-aware detection and correction of sequencing errors."
+*Genome Biol.* 2010;11(11):R116.
+
+.. _Quake: http://www.cbcb.umd.edu/software/quake
+
+**Parameter list**
+
+K-mer size
+    k-mer size for detecting spurious k-mers versus true k-mers from
+    the genome.  Recommendations for choosing a value of k can be found
+    here_.
+
+Default coverage cutoff
+    If the appropriate coverage cutoff cannot be found, Quake can be
+    forced to proceed anyway with the supplied cutoff.  In this case,
+    the optimal cutoff can be estimated by examining the k-mer coverage
+    histogram by eye.
+
+.. _here: http://www.cbcb.umd.edu/software/quake/faq.html
+
+**Output**
+
+A FASTQ file of corrected and trimmed reads.
+  </help>
+</tool>
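Estimating the cutoff "by eye" means finding the valley between the low-coverage error peak and the main coverage peak of the histogram. A rough heuristic for that step, offered only as an assumption about what the eyeballing amounts to, not as Quake's model::

    def valley_cutoff(hist):
        # hist: coverage -> k-mer count; return the first local minimum
        covs = sorted(hist)
        for prev, cur, nxt in zip(covs, covs[1:], covs[2:]):
            if hist[prev] >= hist[cur] < hist[nxt]:
                return cur
        return covs[0] if covs else 0

    print(valley_cutoff({1: 900, 2: 300, 3: 40, 4: 60, 5: 200}))  # 3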
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/quake_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/quake_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,132 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2011, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+#    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+#    * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+#
+#THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+import sys
+import os
+import subprocess
+
+QUAKE_EXE = os.path.join( os.path.dirname(os.path.abspath(sys.argv[0])), 'quake.py' )
+cmdLine = sys.argv
+cmdLine.pop(0)
+
+#
+# horribly not robust, but it was a pain to rewrite everything with
+# optparse
+#
+j = -1
+cut = 0
+for i,arg in enumerate(cmdLine):
+    if '--default_cutoff' in arg:
+        j = i
+        cut = int(arg.split('=')[1])
+if j>=0:
+    cmdLine = cmdLine[:j] + cmdLine[j+1:]
+
+j = -1
+output=''
+for i,arg in enumerate(cmdLine):
+    if '--output' in arg:
+        j = i
+        output = arg.split('=')[1]
+if j>=0:
+    cmdLine = cmdLine[:j] + cmdLine[j+1:]
+
+def backticks( cmd, merge_stderr=True ):
+    """
+    Simulates the perl backticks (``) command with error-handling support
+    Returns ( command output as sequence of strings, error code, error message )
+    """
+    if merge_stderr:
+        _stderr = subprocess.STDOUT
+    else:
+        _stderr = subprocess.PIPE
+
+    p = subprocess.Popen( cmd, shell=True, stdin=subprocess.PIPE,
+                          stdout=subprocess.PIPE, stderr=_stderr,
+                          close_fds=True )
+
+    out = [ l[:-1] for l in p.stdout.readlines() ]
+
+    p.stdout.close()
+    if not merge_stderr:
+        p.stderr.close()
+
+    # need to allow process to terminate
+    p.wait()
+
+    errCode = p.returncode and p.returncode or 0
+    if p.returncode>0:
+        errorMessage = os.linesep.join(out)
+        output = []
+    else:
+        errorMessage = ''
+        output = out
+
+    return output, errCode, errorMessage
+
+def to_stdout():
+    def toCorFastq(f):
+        stem, ext = os.path.splitext( os.path.basename(f) )
+        dir = os.path.dirname(f)
+        corFastq = os.path.join(dir,'%s.cor%s' % (stem,ext) )
+        if not os.path.exists(corFastq):
+            print >>sys.stderr, "Can't find path %s" % corFastq
+            sys.exit(1)
+        return corFastq
+    if '-r' in cmdLine:
+        fastqFile = cmdLine[ cmdLine.index('-r')+1 ]
+        corFastq = toCorFastq(fastqFile)
+        infile = open( corFastq, 'r' )
+        for line in infile:
+            sys.stdout.write( line )
+        infile.close()
+    else:
+        fofnFile = cmdLine[ cmdLine.index('-f')+1 ]
+        infile = open(fofnFile,'r')
+        for line in infile:
+            line = line.strip()
+            if len(line)>0:
+                fastqFiles = line.split()
+                break
+        infile.close()
+        outs = output.split(',')
+        for o,f in zip(outs,fastqFiles):
+            cf = toCorFastq(f)
+            os.system( 'cp %s %s' % ( cf, o ) )
+
+def run():
+    cmd = '%s %s' % ( QUAKE_EXE, " ".join(cmdLine) )
+    output, errCode, errMsg = backticks( cmd )
+
+    if errCode==0:
+        to_stdout()
+    else:
+        # if Quake exits with an error in cutoff determination we  
+        # can force correction if requested
+        if 'cutoff.txt' in errMsg and cut>0:
+            outfile = open( 'cutoff.txt', 'w' )
+            print >>outfile, str(cut)
+            outfile.close()
+            cmd = '%s --no_count --no_cut %s' % ( QUAKE_EXE, " ".join(cmdLine) )
+            output, errCode, errMsg = backticks( cmd )
+        if errCode==0:
+            to_stdout()
+        else:
+            print >>sys.stderr, errMsg
+            sys.exit(1)
+
+if __name__=='__main__': run()
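The backticks() helper above predates subprocess.run; for comparison, a modern-Python sketch with the same (output lines, error code, error message) contract, stderr merging omitted for brevity::

    import subprocess

    def backticks(cmd):
        # run a shell command; on failure return its output as the message
        proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        lines = proc.stdout.splitlines()
        if proc.returncode:
            return [], proc.returncode, proc.stdout
        return lines, 0, ''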
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/smrtpipe.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/smrtpipe.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,5 @@
+#!/usr/bin/env python
+# EASY-INSTALL-SCRIPT: 'pbpy==0.1','smrtpipe.py'
+__requires__ = 'pbpy==0.1'
+import pkg_resources
+pkg_resources.run_script('pbpy==0.1', 'smrtpipe.py')
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/smrtpipe_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/smrtpipe_filter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,67 @@
+<tool id="smrtpipe_filter" name="SMRTpipe Filter" version="1.0.0">
+  <description>Produce filtered reads from a set of PacBio primary analysis outputs.</description>
+  <command interpreter="python">
+    smrtpipe_galaxy.py --output=data/filtered_subreads.fasta --galaxy_output=${outfile} ${iniFile}
+  </command>
+  <inputs>
+    <conditional name="source">
+      <param name="input_source" type="select" label="Choose the source for the analysis inputs">
+        <option value="path">Path to fofn or multiple bas.h5 paths</option>
+        <option value="history">History</option>
+      </param>
+      <when value="path">
+        <repeat name="inputFiles" title="Input files">
+          <param name="path" type="text" label="File path" size="75"/>
+        </repeat>
+      </when>
+      <when value="history">
+        <param name="input1" type="data" format="tabular" label="File containing input paths" />
+      </when>
+    </conditional>
+    <param name="minimum_readlength" type="integer" value="50" label="Minimum raw readlength" />
+    <param name="minimum_readscore" type="float" value="0.75" label="Minimum read quality" />
+  </inputs>
+  <configfiles>
+    <configfile name="iniFile">
+[input]
+#if $source.input_source=="history":
+#for $l in open($source.input1.file_name,'r'):
+$l
+#end for
+#else
+#for $p in $source.inputFiles
+${p.path}
+#end for
+#end if
+
+[S_Filter]
+filters=MinRL=${minimum_readlength},MinReadScore=${minimum_readscore}
+    </configfile>
+  </configfiles>
+  <outputs>
+    <data name="outfile" format="fasta" label="Filtered subreads" />
+  </outputs>
+  <help>
+
+**What it does**
+
+Filters PacBio bas.h5 files and produces a FASTA file of filtered subreads.
+
+In PacBio SMRT sequencing, the template format is a SMRTbell: a circular
+molecule with adapters at two locations in the circle.  The subreads are the
+portions of the read between adapters.
+
+**Parameter list**
+
+Minimum readlength
+    Only keep reads from ZMWs that produced this many bases or more.
+
+Minimum read quality
+    Only keep reads with overall quality scores of this value or more.  The read quality score is a *de novo* prediction of the accuracy of the read.
+
+**Output**
+
+FASTA file of filtered reads.
+
+  </help>
+</tool>
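The [S_Filter] line in the configfile encodes the two thresholds exposed as tool parameters. A trivial restatement of the per-read test they imply (illustrative; the real filtering happens inside SMRTpipe)::

    def passes_s_filter(read_length, read_score, min_rl=50, min_read_score=0.75):
        # filters=MinRL=...,MinReadScore=... keeps reads meeting both cutoffs
        return read_length >= min_rl and read_score >= min_read_score

    print(passes_s_filter(620, 0.81), passes_s_filter(40, 0.9))  # True False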
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/smrtpipe_galaxy.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/smrtpipe_galaxy.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,265 @@
+#!/usr/bin/python
+import sys
+import os
+import subprocess
+import optparse as op
+import xml.etree.cElementTree as et
+
+TRACE=False
+#
+# Turn on tracing to dump out __input__.xml and __settings__.xml somewhere
+#
+#TRACE=True
+#TRACE_PATH='/home/UNIXHOME/jsorenson'
+
+class SmrtpipeGalaxy:
+    """Wrapper for running smrtpipe under galaxy"""
+    def __init__( self, argv ):
+        self.__parseOptions( argv )
+
+    def __parseOptions( self, argv ):
+        usage = 'Usage: %prog [--help] [options] smrtpipe.ini'
+        parser = op.OptionParser( usage=usage, description=SmrtpipeGalaxy.__doc__ )
+        parser.add_option( "--output",
+                           help="Designate a file generated by smrtpipe as the expected output for galaxy" )
+        parser.add_option( "--nproc", type="int",
+                           help="Number of processes to use (-D NPROC)" )
+        parser.add_option( "--galaxy_output",
+                           help="File name provided by galaxy where output should be placed" )
+        parser.add_option( "--dry_run", action="store_true",
+                           help="Create auxiliary XML files and exit" )
+        parser.add_option( "--dat_extension",
+                           help="Soft link .dat files to have this extension (some pipelines require certain extensions)" )
+
+        parser.set_defaults( output=None, dry_run=False, galaxy_output=None,
+            dat_extension=None, nproc=0 )
+        self.options, self.args = parser.parse_args( argv )
+
+        if len(self.args)!=2:
+            parser.error( 'Expected 1 argument' )
+
+        self.configFile = self.args[1]
+
+    def __parseConfig( self ):
+        infile = open( self.configFile, 'r' )
+        section = None
+        sections = []
+        for line in infile:
+            l = line.strip()
+            if len(l)==0 or line.startswith('#'):
+                continue
+            if l.startswith('[') and l.endswith(']'):
+                section = section_factory( l[1:-1] )
+                sections.append(section)
+                continue
+            if section is None:
+                continue
+            if '=' in l:
+                section.addParameterLine(l)
+            else:
+                section.addLine(l)
+        infile.close()
+        return sections
+
+    def transferOutput( self ):
+        if not self.options.output or not self.options.galaxy_output:
+            return True, ''
+        if not os.path.exists(self.options.output):
+            return False, "Can't find file %s (job error?)" % self.options.output
+        os.system( 'cp %s %s' % (self.options.output, self.options.galaxy_output ))
+        return True, ''
+
+    def run( self ):
+        if not os.path.exists( self.configFile ):
+            print >>sys.stderr, "Can't find config file %s" % self.configFile
+            return 1
+
+        sections = self.__parseConfig()
+
+        if len(sections)==0:
+            print >>sys.stderr, "No sections found in %s" % self.configFile
+            return 1
+        if sections[0].name != 'input':
+            print >>sys.stderr, "No [input] section found in %s" % self.configFile
+            return 1
+
+        INPUT_FILE = '__input__.xml'
+        SETTINGS_FILE = '__settings__.xml'
+
+        sections[0].softLinkDats( self.options.dat_extension )
+        inputXml = sections[0].makeXmlElement()
+        write_xml_to_file( INPUT_FILE, inputXml )
+        if TRACE:
+            write_xml_to_file( os.path.join(TRACE_PATH,INPUT_FILE), inputXml )
+
+        settings = et.Element( 'smrtpipeSettings' )
+        for s in sections[1:]:
+            s.makeXmlElement( settings )
+
+        write_xml_to_file( SETTINGS_FILE, settings )
+        if TRACE:
+            write_xml_to_file( os.path.join(TRACE_PATH,SETTINGS_FILE), settings )
+
+        nproc = '-D NPROC=%d' % self.options.nproc if self.options.nproc>0 else ''
+        cmd = 'smrtpipe.py %s --params=%s xml:%s > smrtpi
[...]
elf, name ):
+        self._name = name
+        self._lines = []
+        self._vars = {}
+
+    @property
+    def name(self):
+        return self._name
+
+    def addLine( self, line ):
+        self._lines.append(line)
+
+    def addParameterLine( self, line ):
+        self.addLine(line)
+        i = line.find( '=' )
+        key = line[:i].strip()
+        value = line[i+1:].strip()
+        self._vars[key] = value
+
+    def makeXmlElement( self, settings ):
+        if self._name=='global':
+            root = et.SubElement( settings, "protocol", {'name':'generic'} )
+        else:
+            root = et.SubElement( settings, "module", {'name':self._name} )
+        for k,v in self._vars.iteritems():
+            param = et.SubElement( root, 'param', {'name':k} )
+            val = et.SubElement( param, 'value' )
+            val.text = v
+        return None
+
+    def __str__( self ):
+        "for debugging"
+        buffer = [ 'S { name=' ]
+        buffer.append(self._name)
+        buffer.append('; lines=%s' % ','.join(self._lines) )
+        for k,v in self._vars.iteritems():
+            buffer.append('; %s=%s' % (k,v) )
+        buffer.append(' }')
+        return ''.join(buffer)
+
+class InputSection( Section ):
+    def __init__( self, name ):
+        Section.__init__(self,name)
+
+    def softLinkDats( self, newExtension ):
+        if not newExtension:
+            return
+        newLines = []
+        for l in self._lines:
+            if ':' in l:
+                protocol = l[:l.find(':')+1]
+                file = l[l.find(':')+1:]
+            else:
+                protocol = ''
+                file = l
+            if os.path.exists(file) and file.endswith('.dat'):
+                newFile = '%s.%s' % ( file, newExtension )
+                if not os.path.exists(newFile):
+                    os.system( 'ln -s %s %s' % ( file, newFile ) )
+                newLines.append(protocol+newFile)
+            else:
+                newLines.append(l)
+        self._lines = newLines
+
+    def makeXmlElement( self, parent=None ):
+        root = et.Element( "pacbioAnalysisInputs" )
+        data = et.SubElement( root, 'dataReferences' )
+        iRef = 0
+        for l in self._lines:
+            def add(x,iRef):
+                if len(x)==0: return iRef
+                node = et.SubElement( data, 'url' )
+                if ':' in x:
+                    node.attrib[ 'ref' ] = x
+                else:
+                    node.attrib[ 'ref' ] = 'run:0000000-%04d' % iRef
+                    node2 = et.SubElement( node, 'location' )
+                    node2.text = x
+                return iRef+1
+            if l.endswith('fofn') and os.path.exists(l):
+                infile = open(l,'r')
+                for j,line in enumerate(infile): iRef=add(line.strip(),iRef)
+                infile.close()
+            else:
+                iRef=add(l,iRef)
+        return root
+
+def backticks( cmd, merge_stderr=True ):
+    """
+    Simulates the perl backticks (``) command with error-handling support
+    Returns ( command output as sequence of strings, error code, error message )
+    """
+    if merge_stderr:
+        _stderr = subprocess.STDOUT
+    else:
+        _stderr = subprocess.PIPE
+
+    p = subprocess.Popen( cmd, shell=True, stdin=subprocess.PIPE,
+                          stdout=subprocess.PIPE, stderr=_stderr,
+                          close_fds=True )
+
+    out = [ l[:-1] for l in p.stdout.readlines() ]
+
+    p.stdout.close()
+    if not merge_stderr:
+        p.stderr.close()
+
+    # need to allow process to terminate
+    p.wait()
+
+    errCode = p.returncode and p.returncode or 0
+    if p.returncode>0:
+        errorMessage = os.linesep.join(out)
+        output = []
+    else:
+        errorMessage = ''
+        output = out
+
+    return output, errCode, errorMessage
+
+if __name__=='__main__':
+    app = SmrtpipeGalaxy( sys.argv )
+    sys.exit( app.run() )
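The wrapper above turns each non-input INI section into a <module> element carrying one <param>/<value> pair per setting. A self-contained sketch of that mapping with a simplified signature (same XML shape, not the wrapper's API)::

    import xml.etree.ElementTree as ET

    def section_to_xml(settings, name, params):
        # one <module> per INI section, one <param><value> per key=value line
        module = ET.SubElement(settings, 'module', {'name': name})
        for key, value in params.items():
            param = ET.SubElement(module, 'param', {'name': key})
            ET.SubElement(param, 'value').text = value

    settings = ET.Element('smrtpipeSettings')
    section_to_xml(settings, 'S_Filter', {'filters': 'MinRL=50,MinReadScore=0.75'})
    print(ET.tostring(settings).decode())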
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/smrtpipe_hybrid.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/smrtpipe_hybrid.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+<tool id="smrtpipe_hybrid" name="AHA" version="1.0.0">
+  <description>Assemble contigs from a set of contigs and PacBio reads.</description>
+  <command interpreter="python">
+    smrtpipe_galaxy.py --nproc=24 --dat_extension=fasta --output=data/scaffold.fasta --galaxy_output=${outfile} ${iniFile}
+  </command>
+  <!--
+  <command>cp ${iniFile} ${outfile}</command>
+  -->
+  <inputs>
+    <param name="contigs" format="fasta" type="data" label="Starting Contigs"/>
+    <param name="reads" format="fasta" type="data" label="PacBio Reads"/>
+    <param name="schedule" type="text" value="6,3,75;6,3,75;5,3,75;5,3,75;6,2,75;6,2,75;5,2,75;5,2,75" label="Parameter Schedule" size="60"/>
+  </inputs>
+  <configfiles>
+    <configfile name="iniFile">
+[input]
+assembled_contigs:${contigs}
+file:${reads}
+
+[HybridAssembly]
+instrumentModel=RS
+cleanup=False
+untangler=pacbio
+#set $schedule2 = $schedule.replace('X',';')
+paramSchedule=${schedule2}
+dontFillin=False
+longReadsAsStrobe=True
+exactQueryIds=True
+rm4Opts=-minMatch 7 -minFrac 0.1 -minPctIdentity 65 -bestn 10 -noSplitSubreads
+numberProcesses=16
+cluster=False
+minRepeatLength=100000
+    </configfile>
+  </configfiles>
+  <outputs>
+    <data name="outfile" format="fasta" label="Hybrid assembly contigs from ${on_string}"/>
+  </outputs>
+  <help>
+
+**What it does**
+
+The AHA assembly algorithm is an AMOS_-based pipeline
+for finishing bacterial-sized
+genomes using draft contigs and PacBio reads.
+
+.. _AMOS: http://sourceforge.net/apps/mediawiki/amos
+
+**Parameter list**
+
+Parameter schedule
+    The parameter schedule is a semicolon-delimited list of triples.  Each triple represents an iteration of hybrid assembly (alignment/scaffolding/gap-filling).  The three parameters for each iteration are the Z-score, the number of reads required to define a link, and the minimum length of subreads used in links.
+
+**Output**
+
+FASTA file containing scaffolded and gap-filled contigs resulting from the 
+hybrid assembly.
+
+  </help>
+</tool>
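Each triple in the parameter schedule drives one hybrid-assembly iteration. Parsing it is mechanical; a short helper for clarity (hypothetical, not in the wrapper)::

    def parse_schedule(schedule):
        # semicolon-separated "Z-score,min reads per link,min subread length"
        return [tuple(int(x) for x in triple.split(','))
                for triple in schedule.split(';')]

    print(parse_schedule('6,3,75;5,2,75'))  # [(6, 3, 75), (5, 2, 75)]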
diff -r 000000000000 -r 9071e359b9a3 tools/ilmn_pacbio/soap_denovo.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ilmn_pacbio/soap_denovo.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,73 @@
+<tool id="soap_denovo" name="SOAPdenovo" version="1.0.0">
+  <description>Short-read de novo assembly</description>
+  <!--
+      # SOAPdenovo-127mer all -s ${soap_config} -o assembly -K ${k} -p 8 -d -D
+      # cat ${soap_config} > ${output1}
+      # cp ${soap_config} ${output1} &amp;&amp;
+  -->
+  <command>
+      SOAPdenovo-127mer all -s ${soap_config} -o assembly -K ${k} -p 24 -d -D -R
+  </command>
+  <inputs>
+    <conditional name="inputs">
+      <param name="read_type" type="select" label="Illumina read type">
+        <option value="single">Single fragment</option>
+        <option value="paired">Paired-end</option>
+      </param>
+      <when value="single">
+        <param name="input1" format="fastq" type="data" label="FASTQ file for reads"/>
+      </when>
+      <when value="paired">
+        <param name="input1" format="fastq" type="data" label="FASTQ file for forward reads"/>
+        <param name="input2" format="fastq" type="data" label="FASTQ file for reverse reads"/>
+        <param name="d" type="integer" value="500" label="Estimated insert size for paired-end reads" />
+      </when>
+    </conditional>
+    <param name="k" type="integer" value="23" label="Size of k for forming the de Bruijn overlap graph" />
+  </inputs>
+  <configfiles>
+    <configfile name="soap_config">max_rd_len=105
+[LIB]
+#if $inputs.read_type == "single"
+q=${inputs.input1.file_name}
+#else
+avg_ins=${inputs.d}
+asm_flags=3
+reverse_seq=0
+q1=${inputs.input1.file_name}
+q2=${inputs.input2.file_name}
+#end if
+    </configfile>
+  </configfiles>
+  <outputs>
+    <data name="assembled_contigs" format="fasta" from_work_dir="assembly.scafSeq" label="Assembled contigs from ${on_string}" />
+  </outputs>
+  <help>
+
+**What it does**
+
+Runs SOAPdenovo_ to generate a genome assembly
+using single-fragment or paired-end short reads.
+
+Li R, Zhu H, Ruan J, Qian W, Fang X, Shi Z, Li Y, Li S, Shan G, Kristiansen K, Li S, Yang H, Wang J, Wang J.
+"De novo assembly of human genomes with massively parallel short read sequencing."
+*Genome Res.* 2010 Feb;20(2):265-72.
+
+.. _SOAPdenovo: http://soap.genomics.org.cn/soapdenovo.html
+
+**Parameter list**
+
+k
+    k-mer size for constructing the de Bruijn graph.  The appropriate size of k is genome and data set dependent, but a good starting choice might be 75% of the read length.
+
+Insert size
+    For paired-end libraries, the expected insert size.
+
+**Output**
+
+A FASTA file of assembled scaffolds (SOAPdenovo's assembly.scafSeq output).
+
+  </help>
+</tool>
+
+
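The configfile template above is the whole interface to SOAPdenovo here: a max_rd_len line plus one [LIB] block. A sketch that emits equivalent text for either read type (paths are placeholders; assumed equivalent to the template, not taken from it)::

    def soap_config(q1, q2=None, avg_ins=500, max_rd_len=105):
        # paired-end when q2 is given, single-fragment otherwise
        lines = ['max_rd_len=%d' % max_rd_len, '[LIB]']
        if q2 is None:
            lines.append('q=%s' % q1)
        else:
            lines += ['avg_ins=%d' % avg_ins, 'asm_flags=3', 'reverse_seq=0',
                      'q1=%s' % q1, 'q2=%s' % q2]
        return '\n'.join(lines) + '\n'

    print(soap_config('fwd.fastq', 'rev.fastq'))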
diff -r 000000000000 -r 9071e359b9a3 tools/indels/indel_analysis.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/indels/indel_analysis.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,227 @@
+#!/usr/bin/env python
+
+"""
+Given an input sam file, provides analysis of the indels..
+
+usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]]
+   -i, --input=i: The sam file to analyze
+   -t, --threshold=t: The deletion frequency threshold
+   -I, --out_ins=I: The interval output file showing insertions
+   -D, --out_del=D: The interval output file showing deletions
+"""
+
+import re, sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def add_to_mis_matches( mis_matches, pos, bases ):
+    """
+    Adds the bases and counts to the mis_matches dict
+    """
+    for j, base in enumerate( bases ):
+        try:
+            mis_matches[ pos + j ][ base ] += 1
+        except KeyError:
+            try:
+                mis_matches[ pos + j ][ base ] = 1
+            except KeyError:
+                mis_matches[ pos + j ] = { base: 1 }
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    # prep output files
+    out_ins = open( options.out_ins, 'wb' )
+    out_del = open( options.out_del, 'wb' )
+    # patterns
+    pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' )
+    pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
+    # for tracking occurences at each pos of ref
+    mis_matches = {}
+    indels = {}
+    multi_indel_lines = 0
+    # go through all lines in input file
+    for i,line in enumerate( open( options.input, 'rb' ) ):
+        if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) :
+            split_line = line.split( '\t' )
+            chrom = split_line[2].strip()
+            pos = int( split_line[3].strip() )
+            cigar = split_line[5].strip()
+            bases = split_line[9].strip()
+            # if not an indel or match, exit
+            if chrom == '*':
+                continue
+            # find matches like 3M2D7M or 7M3I10M
+            match = {}
+            m = pat.match( cigar )
+            # unprocessable CIGAR
+            if not m:
+                m = pat_multi.match( cigar )
+                # skip this line if no match
+                if not m:
+                    continue
+                # account for multiple indels or operations we don't process
+                else:
+                    multi_indel_lines += 1
+            # get matching parts for the indel or full match if matching
+            else:
+                if not mis_matches.has_key( chrom ):
+                    mis_matches[ chrom ] = {}
+                    indels[ chrom ] = { 'D': {}, 'I': {} }
+                parts = m.groupdict()
+                if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ):
+                    match = parts
+            # see if matches meet filter requirements
+            if match:
+                # match/mismatch
+                if parts[ 'match_width' ]:
+                    add_to_mis_matches( mis_matches[ chrom ], pos, bases )
+                # indel
+                else:
+                    # pieces of CIGAR string
+                    left = int( match[ 'lmatch' ] )
+                    middle = int( match[ 'ins_del_width' ] )
+                    right = int( match[ 'rmatch' ] )
+                    left_bases = bases[ : left ]
+                    if match[ 'ins_del' ] == 'I':
+                        middle_bases = bases[ left : left + middle ]
+                    else:
+                        middle_bases = ''
+                    right_bases = bases[ -right : ]
+                    start = pos + left
+                    # add data to ref_pos dict for match/mismatch bases on left and on right
+                    add_to_mis_matches( mis_matches[ chr
[...]
 sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) )
+            except KeyError:
+                pass
+            # add deletions also touching this position
+            try:
+                sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) )
+            except KeyError:
+                pass
+            try:
+                sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) )
+            except KeyError:
+                pass
+            freqs[ chrom ][ pos ][ 'total' ] = sum_counts
+            # calculate actual frequencies
+            # deletions
+            # frequencies for deletions
+            try:
+                for d in indels[ chrom ][ 'D' ][ pos ].keys():
+                    freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts
+            except KeyError:
+                pass
+            # frequencies for matches/mismatches
+            try:
+                for base in mis_matches[ chrom ][ pos ].keys():
+                    try:
+                        prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts
+                        freqs[ chrom ][ pos ][ base ] = prop
+                    except ZeroDivisionError:
+                        freqs[ chrom ][ pos ][ base ] = 0.0
+            except KeyError:
+                pass
+            # insertions
+            try:
+                for bases in indels[ chrom ][ 'I' ][ pos ].keys():
+                    prop_start = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts )
+                    try:
+                        prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end )
+                    except ZeroDivisionError:
+                        prop_end = 0.0
+                    try:
+                        ins_freqs[ chrom ][ pos ][ bases ] = [ prop_start, prop_end ]
+                    except KeyError:
+                        ins_freqs[ chrom ][ pos ] = { bases: [ prop_start, prop_end ] }
+            except KeyError, e:
+                pass
+    # output to files if meet threshold requirement
+    threshold = float( options.threshold )
+    #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' )
+    #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' )
+    for chrom in chroms:
+        # deletions file
+        poses = indels[ chrom ][ 'D' ].keys()
+        poses.sort()
+        for pos in poses:
+            start = pos
+            dels = indels[ chrom ][ 'D' ][ start ].keys()
+            dels.sort()
+            for d in dels:
+                end = start + d
+                prop = freqs[ chrom ][ start ][ d ]
+                if prop > threshold :
+                    out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) )
+        # insertions file
+        poses = indels[ chrom ][ 'I' ].keys()
+        poses.sort()
+        for pos in poses:
+            start = pos
+            end = pos + 1
+            ins_bases = indels[ chrom ][ 'I' ][ start ].keys()
+            ins_bases.sort()
+            for bases in ins_bases:
+                prop_start = ins_freqs[ chrom ][ start ][ bases ][0]
+                prop_end = ins_freqs[ chrom ][ start ][ bases ][1]
+                if prop_start > threshold or prop_end > threshold:
+                    out_ins.write( '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\n' % ( chrom, start, end, bases, indels[ chrom ][ 'I' ][ start ][ bases ], 100.0 * prop_start, 100.0 * prop_end ) )
+    # close out files
+    out_del.close()
+    out_ins.close()
+    # if skipped lines because of more than one indel, output message
+    if multi_indel_lines > 0:
+        sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
+
+if __name__=="__main__": __main__()
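indel_analysis.py only processes reads whose CIGAR is a pure match (nM) or a single-indel nM/nI-or-nD/nM shape, counting everything else as skipped. The same pattern check in isolation (simplified group names, illustrative only)::

    import re

    # pure match, or left-match / single indel / right-match
    PAT = re.compile(r'^(?:(?P<l>\d+)M(?P<w>\d+)(?P<op>[ID])(?P<r>\d+)M|(?P<m>\d+)M)$')

    m = PAT.match('7M3I10M')
    print(m.group('l'), m.group('op'), m.group('w'), m.group('r'))  # 7 I 3 10
    print(bool(PAT.match('3M2D7M')), bool(PAT.match('5M1I3M2D4M')))  # True False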
diff -r 000000000000 -r 9071e359b9a3 tools/indels/indel_analysis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/indels/indel_analysis.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,167 @@
+<tool id="indel_analysis" name="Indel Analysis" version="1.0.0">
+  <description></description>
+  <command interpreter="python">
+    indel_analysis.py
+      --input=$input1
+      --threshold=$threshold
+      --out_ins=$out_ins
+      --out_del=$out_del
+  </command>
+  <inputs>
+    <param format="sam" name="input1" type="data" label="Select sam file to analyze" />
+    <param name="threshold" type="float" value="0.015" size="5" label="Frequency threshold" help="Cutoff" />
+  </inputs>
+  <outputs>
+    <data format="interval" name="out_del" />
+    <data format="interval" name="out_ins" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="indel_analysis_in1.sam" ftype="sam"/>
+      <param name="threshold" value="0.017"/>
+      <output name="out_del" file="indel_analysis_out1.interval" ftype="interval"/>
+      <output name="out_ins" file="indel_analysis_out2.interval" ftype="interval"/>
+    </test>
+    <test>
+      <param name="input1" value="indel_analysis_in2.sam" ftype="sam"/>
+      <param name="threshold" value="0.09"/>
+      <output name="out_del" file="indel_analysis_out3.interval" ftype="interval"/>
+      <output name="out_ins" file="indel_analysis_out4.interval" ftype="interval"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+Given an input sam file, this tool provides analysis of the indels. It filters out matches that do not meet the frequency threshold. The way this frequency of occurrence is calculated differs for deletions and insertions. The CIGAR string's "M" can indicate an exact match or a mismatch. For SAM containing the following bits of information (assuming the reference "ACTGCTCGAT")::
+
+ CHROM  POS   CIGAR  SEQ
+   ref    3  2M1I3M  TACTTC
+   ref    1  2M1D3M  ACGCT
+   ref    4  4M2I3M  GTTCAAGAT
+   ref    2  2M2D3M  CTCCG
+   ref    1  3M1D4M  AACCTGG
+   ref    6  3M1I2M  TTCAAT
+   ref    5  3M1I3M  CTCTGTT
+   ref    7      4M  CTAT
+   ref    5      5M  CGCTA
+   ref    3  2M1D2M  TGCC
+
+The following totals would be calculated (this is an intermediate step and not output)::
+
+ -------------------------------------------------------------------------------------------------------
+  POS  BASE  NUMREADS  DELPROPCALC  DELPROP  INSPROPSTARTCALC  INSSTARTPROP  INSPROPENDCALC  INSENDPROP
+ -------------------------------------------------------------------------------------------------------
+    1     A         2          2/2     1.00               ---           ---             ---         ---
+    2     A         1          1/3     0.33               ---           ---             ---         ---
+          C         2          2/3     0.67               ---           ---             ---         ---
+    3     C         1          1/5     0.20               ---           ---             ---         ---
+          T         3          3/5     0.60               ---           ---             ---         ---
+          -         1          1/5     0.20               ---           ---             ---         ---
+    4     A         1          1/6     0.17               ---           ---             ---         ---
+          G         3          3/6     0.50               ---           ---             ---         ---
+          -         1          1/6     0.17               ---           ---             ---         ---
+         --         1          1/6     0.17               ---           ---             ---         ---
+    5     C         4          4/7     0.57               ---           ---             ---         ---
+          T         2          2/7     0.29               ---           ---             ---         ---
+          -         1          1/7     0.14               ---           ---             ---         ---
+         +C         1          ---      ---               1/7          0.14             1/9        0.11
+    6     C         2          2/9     0.22               ---           --- [...]
+
+ [...]
+
+ r501     16   chrM    6   23      7M1I13M   *   0   0          TCTGTGCCTACCAGACATTCA   +=$2;?@BA@?-,.+4=4=4A          XT:A:U  NM:i:3  X0:i:1  X1:i:1  XM:i:2  XO:i:1  XG:i:1  MD:Z:28C36G9        XA:Z:chrM,+134263658,14M1I61M,4;
+ r1288    16   chrM    8   37      11M1I7M   *   0   0            TCACTTACCTGTACACACA   /*F2;?@%A@?-,.+4=4=            XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T0T1A69
+ r1902     0   chr1    4   37      7M2D18M   *   0   0        AGTCTCTTACCTGACGGTTATGA   &lt;2;?@BA@?-,.+4=4=4AA663        XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r2204    16   chrM    9    0          19M   *   0   0            CTGGTACCTGACAGGTATC   2;?@BA@?-,.+4=4=4AA            XT:A:R  NM:i:1  X0:i:2  X1:i:0  XM:i:1  XO:i:0  XG:i:0  MD:Z:0T75           XA:Z:chrM,-564927,76M,1;
+ r2314    16   chrM    6   37      10M2D8M   *   0   0               TCACTCTTACGTCTGA   &lt;2;?@BA@?-,.+4=4               XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:25A5^CA45
+ r3001     0   chrM   13   37   3M1D5M2I7M   *   0   0              TACAGTCACCCTCATCA   &lt;2;?@BA/(@?-,$&amp;                XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r3218     0   chr1   13   37       8M1D7M   *   0   0                TACAGTCACTCATCA   &lt;2;?@BA/(@?-,$&amp;                XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r4767    16   chr2    3   37      15M2I7M   *   0   0       CAGACTCTCTTACCAAAGACAGAC   &lt;2;?@BA/(@?-,.+4=4=4AA66       XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T1A4T65
+ r5333     0   chrM    5   37      17M1D8M   *   0   0       GTCTCTCATACCAGACAACGGCAT   FB3$@BA/(@?-,.+4=4=4AA66       XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:45C10^C0C5C13
+ r6690    16   chrM    7   23          20M   *   0   0           CTCTCTTACCAGACAGACAT   2;?@BA/(@?-,.+4=4=4A           XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76             XA:Z:chrM,-568532,76M,1;
+ r7211     0   chrM    7   37          24M   *   0   0       CGACAGAGACAAAATAACATTTAA   //&lt;2;?@BA@?-,.+4=442;;6:       XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:2  XO:i:1  XG:i:1  MD:Z:73G0G0
+ r9922    16   chrM    4    0       7M3I9M   *   0   0            CCAGACATTTGAAATCAGG   F/D4=44^D++26632;;6            XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r9987    16   chrM    4    0      9M1I18M   *   0   0   AGGTTCTCATTACCTGACACTCATCTTG   G/AD6"/+4=4426632;;6:&lt;2;?@BA   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r10145   16   chr1   16    0       5M2D7M   *   0   0                   CACATTGTTGTA   G//+4=44=4AA                   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r10324   16   chrM   15    0       6M1D5M   *   0   0                   CCGTTCTACTTG   A@??8.G//+4=                   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r12331   16   chrM   17    0       4M2I6M   *   0   0                  AGTCGAATACGTG   632;;6:&lt;2;?@B                  XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r12914   16   chr2   24    0       4M3I3M   *   0   0                     ACTACCCCAA   G//+4=42,.                     XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+
+The following will be produced (deletions file followed by insertions file)::
+
+ chr1   11   13   1   100.00
+ chr1   21   22   1    25.00
+ chr1   21   23   1    25.00
+ chrM   16   18   1     9.09
+ chrM   19   20   1     8.33
+ chrM   21   22   1     9.09
+ chrM   22   23   1     9.09
+
+ chr2   18   19    AA   1   50.00   50.00
+ chr2   28   29   CCC   1   50.00   50.00
+ chrM   11   12   TTT   1    9.09    9.09
+ chrM   13   14     C   1    9.09    9.09
+ chrM   13   14     T   1    9.09    9.09
+ chrM   19   20     T   1    7.69    8.33
+ chrM   21   22    GA   1    8.33    8.33
+
+
+  </help>
+</tool>
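As an aside on the arithmetic in the intermediate table above: DELPROP is simply the fraction of reads covering a reference position that show a deletion there. A minimal Python sketch of that calculation (not part of the changeset; the counts are taken from POS 3 of the sample table)::

    # Observed bases at reference position 3; '-' marks a deleted base.
    cover = { 'C': 1, 'T': 3, '-': 1 }
    num_reads = sum( cover.values() )             # 5 reads cover POS 3
    del_prop = cover[ '-' ] / float( num_reads )  # 1/5 = 0.20, as in the table
    # Positions are reported only if the proportion meets the frequency threshold.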
diff -r 000000000000 -r 9071e359b9a3 tools/indels/indel_sam2interval.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/indels/indel_sam2interval.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+
+"""
+Converts indels from a SAM file into interval format, optionally also writing insertion-only and deletion-only BED files.
+
+usage: %prog [options]
+   -i, --input=i: The input SAM file
+   -u, --include_base=u: Whether or not to include the base for insertions
+   -c, --collapse=c: Whether to collapse multiple occurrences of a location onto a single line, with counts shown
+   -o, --int_out=o: The interval output file for the converted SAM file
+   -b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file
+   -d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file
+"""
+
+import re, sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def numeric_sort( text1, text2 ):
+    """
+    For two items containing space-separated text, compares equivalent pieces
+    numerically if both numeric or as text otherwise
+    """
+    pieces1 = text1.split()
+    pieces2 = text2.split()
+    if len( pieces1 ) == 0:
+        return 1
+    if len( pieces2 ) == 0:
+        return -1
+    for i, pc1 in enumerate( pieces1 ):
+        if i == len( pieces2 ):
+            return 1
+        if not pieces2[i].isdigit():
+            if pc1.isdigit():
+                return -1
+            else:
+                if pc1 > pieces2[i]:
+                    return 1
+                elif pc1 < pieces2[i]:
+                    return -1
+        else:
+            if not pc1.isdigit():
+                return 1
+            else:
+                if int( pc1 ) > int( pieces2[i] ):
+                    return 1
+                elif int( pc1 ) < int( pieces2[i] ):
+                    return -1
+    if i < len( pieces2 ) - 1:
+        return -1
+    return 0
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+
+    # open up output files
+    output = open( options.int_out, 'wb' )
+    if options.bed_ins_out != 'None':
+        output_bed_ins = open( options.bed_ins_out, 'wb' )
+    else:
+        output_bed_ins = None
+    if options.bed_del_out != 'None':
+        output_bed_del = open( options.bed_del_out, 'wb' )
+    else:
+        output_bed_del = None
+
+    # the pattern to match, assuming just one indel per cigar string
+    pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
+    pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
+
+    # go through all lines in input file
+    out_data = {}
+    multi_indel_lines = 0
+    for line in open( options.input, 'rb' ):
+        if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
+            split_line = line.split( '\t' )
+            # skip lines that lack the expected number of tab-separated fields
+            if len( split_line ) < 12:
+                continue
+            # grab relevant pieces
+            cigar = split_line[5].strip()
+            pos = int( split_line[3] )
+            chr = split_line[2]
+            base_string = split_line[9]
+            # parse cigar string
+            m = pat_indel.match( cigar )
+            if not m:
+                m = pat_multi.match( cigar )
+                # skip this line if no match
+                if not m:
+                    continue
+                # account for multiple indels or operations we don't process
+                else:
+                    multi_indel_lines += 1
+                continue
+            else:
+                match = m.groupdict()
+            left = int( match[ 'lmatch' ] )
+            middle = int( match[ 'ins_del_width' ] )
+            middle_type = match[ 'ins_del' ]
+            bases = base_string[ left : left + middle ]
+            # calculate start and end positions, and output to insertion or deletion file
+            start = left + pos
+            if middle_type == 'D':
+                end = start + middle
+                data = [ chr, start, end, 'D' ]
+                if options.include_base == "true":
+                    data.append( '-' )
+            else:
+                end = start + 1
+                data = [ chr, start, end, 'I' ]
+                if options.include_base == "true":
+                    data.append( bases )
+            location = '\t'.join( [ '%s' % d for d in data ] )
+            try:
+                out_data[ location ] += 1
+            except KeyError:
+                out_data[ location ] = 1
+    # output to interval file
+    # get all locations and sort
+    locations = out_data.keys()
+    locations.sort( numeric_sort )
+    last_line = ''
+    # output each location, either with counts or each occurrence
+    for loc in locations:
+        sp_loc = loc.split( '\t' )
+        cur_line = '\t'.join( sp_loc[:3] )
+        if options.collapse == 'true':
+            output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) )
+            if output_bed_del and sp_loc[3] == 'D':
+                output_bed_del.write( '%s\n' % cur_line )
+            if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line:
+                output_bed_ins.write( '%s\n' % cur_line )
+                last_line = cur_line
+        else:
+            for i in range( out_data[ loc ] ):
+                output.write( '%s\n' % loc )
+                if output_bed_del or output_bed_ins:
+                    if output_bed_del and sp_loc[3] == 'D':
+                        output_bed_del.write( '%s\n' % cur_line )
+                    if output_bed_ins and sp_loc[3] == 'I':
+                        output_bed_ins.write( '%s\n' % cur_line )
+
+    # cleanup, close files
+    if output_bed_ins:
+        output_bed_ins.close()
+    if output_bed_del:
+        output_bed_del.close()
+    output.close()
+
+    # if skipped lines because of more than one indel, output message
+    if multi_indel_lines > 0:
+        sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
+
+if __name__=="__main__": __main__()
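As a standalone illustration of the single-indel CIGAR pattern used above, here is how a hypothetical read with CIGAR 7M1I13M at POS 6 (mirroring read r501 from the help examples) is decomposed; the result matches the "chrM 13 14 I" interval in the sample output::

    import re

    pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
    match = pat_indel.match( '7M1I13M' ).groupdict()
    left = int( match[ 'lmatch' ] )            # 7 aligned bases precede the indel
    middle = int( match[ 'ins_del_width' ] )   # indel width: 1
    start = left + 6                           # left + POS = 13
    # insertions span a single reference base; deletions span their full width
    end = start + 1 if match[ 'ins_del' ] == 'I' else start + middle
    print( '%s %s %s' % ( start, end, match[ 'ins_del' ] ) )   # 13 14 I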
diff -r 000000000000 -r 9071e359b9a3 tools/indels/indel_sam2interval.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/indels/indel_sam2interval.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,139 @@
+<tool id="indel_sam2interval" name="Extract indels" version="1.0.0">
+  <description>from SAM</description>
+  <command interpreter="python">
+    indel_sam2interval.py
+      --input=$input1
+      --include_base=$include_base
+      --collapse=$collapse
+      --int_out=$output1
+      #if $ins_out.include_ins_out == "true"
+        --bed_ins_out=$output2
+      #else
+        --bed_ins_out="None"
+      #end if
+      #if $del_out.include_del_out == "true"
+        --bed_del_out=$output3
+      #else
+        --bed_del_out="None"
+      #end if
+  </command>
+  <inputs>
+    <param format="sam" name="input1" type="data" label="Select dataset to convert" />
+    <param name="include_base" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Include the relevant base(s) for each insertion (and a dash (-) for deletions)" />
+    <param name="collapse" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Collapse repeated locations onto single line with counts" />
+    <conditional name="ins_out">
+      <param name="include_ins_out" type="select" label="Include insertions output bed file?">
+        <option value="true">Yes</option>
+        <option value="false">No</option>
+      </param>
+      <when value="true" />
+      <when value="false" />
+    </conditional>
+    <conditional name="del_out">
+      <param name="include_del_out" type="select" label="Include deletions output bed file?">
+        <option value="true">Yes</option>
+        <option value="false">No</option>
+      </param>
+      <when value="true" />
+      <when value="false" />
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="interval" name="output1" />
+    <data format="bed" name="output2">
+      <filter>ins_out[ "include_ins_out" ] == "true"</filter>
+    </data>
+    <data format="bed" name="output3">
+      <filter>del_out[ "include_del_out" ] == "true"</filter>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="indel_sam2interval_in1.sam" ftype="sam"/>
+      <param name="include_base" value="true"/>
+      <param name="collapse" value="true"/>
+      <param name="include_ins_out" value="true" />
+      <param name="include_del_out" value="true" />
+      <output name="output1" file="indel_sam2interval_out1.interval" ftype="interval"/>
+      <output name="output2" file="indel_sam2interval_out2.bed" ftype="bed"/>
+      <output name="output3" file="indel_sam2interval_out3.bed" ftype="bed"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+Given a SAM file containing indels, converts these to an interval file with a column indicating whether each entry is an insertion or a deletion, and can also create a BED file for each type (one for insertions, one for deletions). The interval file can be combined with other like files to create a table useful for analysis with the Indel Analysis Table tool. The BED files can be useful for visualizing the reads.
+
+-----
+
+**Example**
+
+Suppose you have the following mapping results::
+
+ r327     16   chrM   11   37      8M1D10M   *   0   0             CTTACCAGATAGTCATCA   -+&lt;2;?@BA@?-,.+4=4             XT:A:U  NM:i:1  X0:i:1  X1:i:0  XM:i:0  XO:i:1  XG:i:1  MD:Z:41^C35
+ r457      0   chr1   14   37          14M   *   0   0                 ACCTGACAGATATC   =/DF;?@1A@?-,.                 XT:A:U  NM:i:0  X0:i:1  X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r501     16   chrM    6   23      7M1I13M   *   0   0          TCTGTGCCTACCAGACATTCA   +=$2;?@BA@?-,.+4=4=4A          XT:A:U  NM:i:3  X0:i:1  X1:i:1  XM:i:2  XO:i:1  XG:i:1  MD:Z:28C36G9        XA:Z:chrM,+134263658,14M1I61M,4;
+ r1288    16   chrM    8   37      11M1I7M   *   0   0            TCACTTACCTGTACACACA   /*F2;?@%A@?-,.+4=4=            XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T0T1A69
+ r1902     0   chr1    4   37      7M2D18M   *   0   0        AGTCTCTTACCTGACGGTTATGA   &lt;2;?@BA@?-,.+4=4=4AA663        XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r2204    16   chrM    9    0          19M   *   0   0            CTGGTACCTGACAGGTATC   2;?@BA@?-,.+4=4=4AA            XT:A:R  NM:i:1  X0:i:2  X1:i:0  XM:i:1  XO:i:0  XG:i:0  MD:Z:0T75           XA:Z:chrM,-564927,76M,1;
+ r2314    16   chrM    6   37      10M2D8M   *   0   0               TCACTCTTACGTCTGA   &lt;2;?@BA@?-,.+4=4               XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:25A5^CA45
+ r3001     0   chrM   13   37   3M1D5M2I7M   *   0   0              TACAGTCACCCTCATCA   &lt;2;?@BA/(@?-,$&amp;                XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r3218     0   chr1   13   37       8M1D7M   *   0   0                TACAGTCACTCATCA   &lt;2;?@BA/(@?-,$&amp;                XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r4767    16   chr2    3   37      15M2I7M   *   0   0       CAGACTCTCTTACCAAAGACAGAC   &lt;2;?@BA/(@?-,.+4=4=4AA66       XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T1A4T65
+ r5333     0   chrM    5   37      17M1D8M   *   0   0       GTCTCTCATACCAGACAACGGCAT   FB3$@BA/(@?-,.+4=4=4AA66       XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:45C10^C0C5C13
+ r6690    16   chrM    7   23          20M   *   0   0           CTCTCTTACCAGACAGACAT   2;?@BA/(@?-,.+4=4=4A           XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76             XA:Z:chrM,-568532,76M,1;
+ r7211     0   chrM    7   37          24M   *   0   0       CGACAGAGACAAAATAACATTTAA   //&lt;2;?@BA@?-,.+4=442;;6:       XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:2  XO:i:1  XG:i:1  MD:Z:73G0G0
+ r7899    69      *    0    0            *   *   0   0       CTGCGTGTTGGTGTCTACTGGGGT   #%#'##$#$##&amp;%#%$$$%#%#'#
+ r9192   133      *    0    0            *   *   0   0       GTGCGTCGGGGAGGGTGCTGTCGG   ######%#$%#$$###($###&amp;&amp;%
+ r9922    16   chrM    4    0       7M3I9M   *   0   0            CCAGACATTTGAAATCAGG   F/D4=44^D++26632;;6            XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r9987    16   chrM    4    0      9M1I18M   *   0   0   AGGTTCTCATTACCTGACACTCATCTTG   G/AD6"/+4=4426632;;6:&lt;2;?@BA   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r10145   16   chr1   16    0       5M2D7M   *   0   0                   CACATTGTTGTA   G//+4=44=4AA                   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r10324   16   chrM   15    0       6M1D5M   *   0   0                   CCGTTCTACTTG   A@??8.G//+4=                   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r12331   16   chrM   17    0       4M2I6M   *   0   0                  AGTCGAATACGTG   632;;6:&lt;2;?@B                  XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r12914   16   chr2   24    0       4M3I3M   *   0   0                     ACTACCCCAA   G//+4=42,.                     XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r13452   16   chrM   13    0      3M1D11M   *   0   0                 TACGTCACTCATCA   IIIABCCCICCCCI                 XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+
+
+The following three files will be produced (Interval, Insertions BED and Deletions BED)::
+
+ chr1   11   13   D     -   1
+ chr1   21   22   D     -   1
+ chr1   21   23   D     -   1
+ chr2   18   19   I    AA   1
+ chr2   28   29   I   CCC   1
+ chrM   11   12   I   TTT   1
+ chrM   13   14   I     C   1
+ chrM   13   14   I     T   1
+ chrM   16   17   D     -   1
+ chrM   16   18   D     -   1
+ chrM   19   20   D     -   1
+ chrM   19   20   I     T   1
+ chrM   21   22   D     -   1
+ chrM   21   22   I    GA   1
+ chrM   22   23   D     -   1
+
+ chr2   18   19
+ chr2   28   29
+ chrM   11   12
+ chrM   13   14
+ chrM   13   14
+ chrM   19   20
+ chrM   21   22
+
+ chr1   11   13
+ chr1   21   22
+ chr1   21   23
+ chrM   16   17
+ chrM   16   18
+ chrM   19   20
+ chrM   21   22
+ chrM   22   23
+
+For more information on SAM, please consult the `SAM format description`__.
+
+.. __: http://www.ncbi.nlm.nih.gov/pubmed/19505943
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/indels/indel_table.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/indels/indel_table.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+
+"""
+Combines several interval files containing indels with counts. All input files need to have the same number of columns.
+
+usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]]
+   -1, --input1=1: The first input file
+   -s, --sum1=s: Whether or not to include the totals from first file in overall total
+   -2, --input2=2: The second input file
+   -S, --sum2=S: Whether or not to include the totals from second file in overall total
+   -o, --output=o: The interval output file for the combined files
+"""
+
+import re, sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def numeric_sort( text1, text2 ):
+    """
+    For two items containing space-separated text, compares equivalent pieces
+    numerically if both numeric or as text otherwise
+    """
+    pieces1 = text1.split()
+    pieces2 = text2.split()
+    if len( pieces1 ) == 0:
+        return 1
+    if len( pieces2 ) == 0:
+        return -1
+    for i, pc1 in enumerate( pieces1 ):
+        if i == len( pieces2 ):
+            return 1
+        if not pieces2[i].isdigit():
+            if pc1.isdigit():
+                return -1
+            else:
+                if pc1 > pieces2[i]:
+                    return 1
+                elif pc1 < pieces2[i]:
+                    return -1
+        else:
+            if not pc1.isdigit():
+                return 1
+            else:
+                if int( pc1 ) > int( pieces2[i] ):
+                    return 1
+                elif int( pc1 ) < int( pieces2[i] ):
+                    return -1
+    if i < len( pieces2 ) - 1:
+        return -1
+    return 0
+
+def __main__():
+    # Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    inputs = [ options.input1, options.input2 ]
+    includes = [ options.sum1, options.sum2 ]
+    inputs.extend( [ a for i, a in enumerate( args ) if i % 2 == 0 ] )
+    includes.extend( [ a for i, a in enumerate( args ) if i % 2 == 1 ] )
+    num_cols = 0
+    counts = {}
+    # read in data from all files and get total counts
+    try:
+        for i, input in enumerate( inputs ):
+            for line in open( input, 'rb' ):
+                sp_line = line.strip().split( '\t' )
+                # set num_cols on first pass
+                if num_cols == 0:
+                    if len( sp_line ) < 4:
+                        raise Exception, 'There need to be at least 4 columns in the file: Chrom, Start, End, and Count'
+                    num_cols = len( sp_line )
+                # deal with differing number of columns
+                elif len( sp_line ) != num_cols:
+                    raise Exception, 'All of the files need to have the same number of columns (current %s != %s of first line)' % ( len( sp_line ), num_cols )
+                # get actual counts for each indel
+                indel = '\t'.join( sp_line[:-1] )
+                try:
+                    count = int( sp_line[-1] )
+                except ValueError, e:
+                    raise Exception, 'The last column of each file must be numeric, with the count of the number of instances of that indel: %s' % str( e )
+                # create the entry on first sight so that files excluded from
+                # the overall total still get per-file counts recorded
+                if indel not in counts:
+                    counts[ indel ] = { 'tot': 0 }
+                # total across all included files
+                if includes[i] == "true":
+                    counts[ indel ][ 'tot' ] += count
+                # counts for ith file
+                counts[ indel ][i] = count
+    except Exception, e:
+        stop_err( 'Failed to read all input files:\n%s' % str( e ) )
+    # output combined results to table file
+    try:
+        output = open( options.output, 'wb' )
+        count_keys = counts.keys()
+        count_keys.sort( numeric_sort )
+        for indel in count_keys:
+            count_out = [ str( counts[ indel ][ 'tot' ] ) ]
+            for i in range( len( inputs ) ):
+                try:
+                    count_out.append( str( counts[ indel ][i] ) )
+                except KeyError:
+                    count_out.append( '0' )
+            output.write( '%s\t%s\n' % ( indel, '\t'.join( count_out ) ) )
+        output.close()
+    except Exception, e:
+        stop_err( 'Failed to output data: %s' % str( e ) )
+
+if __name__=="__main__": __main__()
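For illustration, the cmp-style numeric_sort comparator above orders keys such as 'chrM 21 22' ahead of 'chrM 303 304' because numeric pieces compare as integers rather than as text. A rough key-based equivalent, a sketch that assumes corresponding pieces have matching types (not part of the changeset)::

    def numeric_key( text ):
        # digits compare as integers, everything else as text
        return [ int( p ) if p.isdigit() else p for p in text.split() ]

    locs = [ 'chrM 303 304', 'chrM 21 22', 'chr1 100 101' ]
    print( sorted( locs, key = numeric_key ) )
    # ['chr1 100 101', 'chrM 21 22', 'chrM 303 304']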
diff -r 000000000000 -r 9071e359b9a3 tools/indels/indel_table.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/indels/indel_table.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,122 @@
+<tool id="indel_table" name="Indel Analysis Table" version="1.0.0">
+  <description>for combining indel interval data</description>
+  <command interpreter="python">
+    indel_table.py
+      --input1=$input1
+      --sum1=$sum1
+      --input2=$input2
+      --sum2=$sum2
+      --output=$output1
+      #for $i in $inputs
+        ${i.input}
+        ${i.sum}
+      #end for
+  </command>
+  <inputs>
+    <param format="interval" name="input1" type="data" label="Select first file to add" />
+    <param name="sum1" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Include first file's totals in overall total" />
+    <param format="interval" name="input2" type="data" label="Select second file to add" />
+    <param name="sum2" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Include second file's totals in overall total" />
+    <repeat name="inputs" title="Input Files">
+      <param name="input" label="Add file" type="data" format="interval" />
+      <param name="sum" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Include file's totals in overall total" />
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="interval" name="output1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="indel_table_in1.interval" ftype="interval" />
+      <param name="sum1" value="true"/>
+      <param name="input2" value="indel_table_in2.interval" ftype="interval" />
+      <param name="sum2" value="true" />
+      <param name="input" value="indel_table_in3.interval" ftype="interval" />
+      <param name="sum" value="true" />
+      <output name="output1" file="indel_table_out1.interval" ftype="interval" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+Creates a table allowing for analysis and comparison of indel data. Combines any number of interval files that have been produced by the tool that converts indel SAM data to interval format. Includes overall total counts for all or some files. The tool has the option to not include a given file's counts in the total column. This could be useful for combined data if the counts for certain indels might be included more than once.
+
+The exact columns of the output will depend on the columns of the input. Here is the detailed specification of the output columns::
+
+                          Column  Description
+ -------------------------------  ----------------------------------------------------------------------------------
+  1 ... m                "Indel"  All the "indel" columns, which contain the info that will be checked for equality
+  m + 1        Total Occurrences  Total number of occurrences of this indel across all (included) files
+  m + 2   Occurrences for File 1  Number of occurrences of this indel for first file
+  m + 3   Occurrences for File 2  Number of occurrences of this indel for second file
+  [m + ...]                [...]  [Number of occurrences of this indel for ... file]
+
+The most likely columns would be from the output of the Convert SAM to Interval/BED tool, so: Chromosome, Start position, End position, I/D (Insertion/Deletion), -/&lt;base(s)&gt; (Deletion/Inserted base(s)), Total Occurrences (across files), Occurrences for File 1, Occurrences for File 2, etc. See below for an example.
+
+
+-----
+
+**Example**
+
+Suppose you have the following 4 files::
+
+ chrM    300    301   D   -    6
+ chrM    303    304   D   -   19
+ chrM    359    360   D   -    1
+ chrM    410    411   D   -    1
+ chrM    435    436   D   -    1
+
+ chrM    410    411   D   -    1
+ chrM    714    715   D   -    1
+ chrM    995    997   D   -    1
+ chrM   1168   1169   I   A    1
+ chrM   1296   1297   D   -    1
+
+ chrM    300    301   D   -    8
+ chrM    525    526   D   -    1
+ chrM    958    959   D   -    1
+ chrM    995    996   D   -    3
+ chrM   1168   1169   I   C    1
+ chrM   1296   1297   D   -    1
+
+ chrM    303    304   D   -   22
+ chrM    410    411   D   -    1
+ chrM    435    436   D   -    1
+ chrM    714    715   D   -    1
+ chrM    753    754   I   A    1
+ chrM   1168   1169   I   A    1
+
+and the fifth file::
+
+ chrM    303    304   D   -   22
+ chrM    410    411   D   -    2
+ chrM    435    436   D   -    1
+ chrM    714    715   D   -    2
+ chrM    753    754   I   A    1
+ chrM    995    997   D   -    1
+ chrM   1168   1169   I   A    2
+ chrM   1296   1297   D   -    1
+
+The following will be produced if you include the first four files in the sum, but not the fifth::
+
+ chrM    300    301   D   -   14    6   0   8    0    0
+ chrM    303    304   D   -   41   19   0   0   22   22
+ chrM    359    360   D   -    1    1   0   0    0    0
+ chrM    410    411   D   -    3    1   1   0    1    2
+ chrM    435    436   D   -    2    1   0   0    1    2
+ chrM    525    526   D   -    1    0   0   1    0    0
+ chrM    714    715   D   -    2    0   1   0    1    2
+ chrM    753    754   I   A    1    0   0   0    1    1
+ chrM    958    959   D   -    1    0   0   1    0    0
+ chrM    995    996   D   -    3    0   0   3    0    0
+ chrM    995    997   D   -    1    0   1   0    0    1
+ chrM   1168   1169   I   A    2    0   1   0    1    2
+ chrM   1168   1169   I   C    1    0   0   1    0    0
+ chrM   1296   1297   D   -    2    0   1   1    0    1
+
+The first numeric column holds the total of the next four columns, but not the fifth.
+
+
+  </help>
+</tool>
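A minimal sketch of the bookkeeping behind the table above (not part of the changeset; the two rows are hypothetical and mimic the example, with the second file excluded from the overall total)::

    counts = {}
    # (file index, indel key, count, include this file in the overall total?)
    rows = [ ( 0, 'chrM\t303\t304\tD\t-', 19, True ),
             ( 1, 'chrM\t303\t304\tD\t-', 22, False ) ]
    for i, indel, count, include in rows:
        entry = counts.setdefault( indel, { 'tot': 0 } )
        if include:
            entry[ 'tot' ] += count
        entry[ i ] = count
    print( counts )
    # {'chrM\t303\t304\tD\t-': {'tot': 19, 0: 19, 1: 22}}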
diff -r 000000000000 -r 9071e359b9a3 tools/indels/sam_indel_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/indels/sam_indel_filter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+"""
+Allows user to filter out non-indels from SAM.
+
+usage: %prog [options]
+   -i, --input=i: Input SAM file to be filtered
+   -q, --quality_threshold=q: Minimum quality value for adjacent bases
+   -a, --adjacent_bases=a: Number of adjacent bases on each side to check qualities
+   -o, --output=o: Filtered output SAM file
+"""
+
+import re, sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    # prep output file
+    output = open( options.output, 'wb' )
+    # patterns
+    pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
+    pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
+    try:
+        qual_thresh = int( options.quality_threshold )
+        if qual_thresh < 0 or qual_thresh > 93:
+            raise ValueError
+    except ValueError:
+        stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' )
+    try:
+        adj_bases = int( options.adjacent_bases )
+        if adj_bases < 1:
+            raise ValueError
+    except ValueError:
+        stop_err( 'The number of adjacent bases should be an integer of at least 1.' )
+    # record lines skipped because of more than one indel
+    multi_indel_lines = 0
+    # go through all lines in input file
+    for line in open( options.input, 'rb' ):
+        if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
+            split_line = line.split( '\t' )
+            cigar = split_line[5].strip()
+            # find matches like 3M2D7M or 7M3I10M
+            match = {}
+            m = pat.match( cigar )
+            # if unprocessable CIGAR
+            if not m:
+                m = pat_multi.match( cigar )
+                # skip this line if no match
+                if not m:
+                    continue
+                # account for multiple indels or operations we don't process
+                else:
+                    multi_indel_lines += 1
+            # otherwise get matching parts
+            else:
+                match = m.groupdict()
+            # process for indels
+            if match:
+                left = int( match[ 'lmatch' ] )
+                right = int( match[ 'rmatch' ] )
+                if match[ 'ins_del' ] == 'I':
+                    middle = int( match[ 'ins_del_width' ] )
+                else:
+                    middle = 0
+                # if there are enough adjacent bases to check, then do so
+                if left >= adj_bases and right >= adj_bases:
+                    quals = split_line[10]
+                    eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ]
+                    qual_thresh_met = True
+                    for q in eligible_quals:
+                        if ord( q ) - 33 < qual_thresh:
+                            qual_thresh_met = False
+                            break
+                    # if filter reqs met, output line
+                    if qual_thresh_met:
+                        output.write( line )
+    # close out file
+    output.close()
+    # if skipped lines because of more than one indel, output message
+    if multi_indel_lines > 0:
+        sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
+
+if __name__=="__main__": __main__()
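A standalone sketch of the quality-window test performed above (not part of the changeset; the numbers are hypothetical but echo the tool's first test case: CIGAR 17M1I5M, threshold 14, two adjacent bases)::

    left, middle, adj_bases, qual_thresh = 17, 1, 2, 14
    quals = '00/02!!0//1200210AA44/1'   # Sanger-scale ASCII qualities
    # qualities flanking (and inside) the insertion
    window = quals[ left - adj_bases : left + middle + adj_bases ]
    passed = all( ord( q ) - 33 >= qual_thresh for q in window )
    print( '%s %s' % ( window, passed ) )   # 10AA4 True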
diff -r 000000000000 -r 9071e359b9a3 tools/indels/sam_indel_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/indels/sam_indel_filter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,77 @@
+<tool id="sam_indel_filter" name="Filter Indels" version="1.0.0">
+  <description>for SAM</description>
+  <command interpreter="python">
+    sam_indel_filter.py
+      --input=$input1
+      --quality_threshold=$quality_threshold
+      --adjacent_bases=$adjacent_bases
+      --output=$out_file1
+  </command>
+  <inputs>
+    <param format="sam" name="input1" type="data" label="Select dataset to filter" />
+    <param name="quality_threshold" type="integer" value="40" label="Quality threshold for adjacent bases" help="Takes Phred value assuming Sanger scale; usually between 0 and 40, but up to 93" />
+    <param name="adjacent_bases" type="integer" value="1" label="The number of adjacent bases to match on either side of the indel" help="If one side is shorter than this width, the read will be excluded" />
+  </inputs>
+  <outputs>
+    <data format="sam" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="sam_indel_filter_in1.sam" ftype="sam"/>
+      <param name="quality_threshold" value="14"/>
+      <param name="adjacent_bases" value="2"/>
+      <output name="out_file1" file="sam_indel_filter_out1.sam" ftype="sam"/>
+    </test>
+    <test>
+      <param name="input1" value="sam_indel_filter_in1.sam" ftype="sam"/>
+      <param name="quality_threshold" value="29"/>
+      <param name="adjacent_bases" value="5"/>
+      <output name="out_file1" file="sam_indel_filter_out2.sam" ftype="sam"/>
+    </test>
+    <test>
+      <param name="input1" value="sam_indel_filter_in2.sam" ftype="sam"/>
+      <param name="quality_threshold" value="7"/>
+      <param name="adjacent_bases" value="1"/>
+      <output name="out_file1" file="sam_indel_filter_out3.sam" ftype="sam"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+Allows extraction of indel-containing alignments from SAM produced by BWA. Currently it can only handle alignments with a single insertion or deletion, and will skip any alignment containing more than one indel. It matches CIGAR strings (column 6 in the SAM file) like 5M3I5M or 4M2D10M, so there must be a match or mismatch of sufficient length on either side of the indel.
+
+-----
+
+**Example**
+
+Suppose you have the following::
+
+ r770    89  ref        116   37  17M1I5M          =   72131356   0   CACACTGTGACAGACAGCGCAGC   00/02!!0//1200210AA44/1  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+ r770   181  ref        116    0      24M          =   72131356   0  TTGGTGCGCGCGGTTGAGGGTTGG  $$(#%%#$%#%####$%%##$###
+ r1945  177  ref   41710908    0      23M  190342418  181247988   0   AGAGAGAGAGAGAGAGAGAGAGA   SQQWZYURVYWX]]YXTSY]]ZM  XT:A:R  CM:i:0  SM:i:0   AM:i:0  X0:i:163148            XM:i:0  XO:i:0  XG:i:0  MD:Z:23
+ r3671  117  ref  190342418    0      24M          =  190342418   0  CTGGCGTTCTCGGCGTGGATGGGT  #####$$##$#%#%%###%$#$##
+ r3671  153  ref  190342418   37  16M1I6M          =  190342418   0   TCTAACTTAGCCTCATAATAGCT   /&lt;&lt;!"0///////00/!!0121/  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+ r3824  117  ref   80324999    0      24M          =   80324999   0  TCCAGTCGCGTTGTTAGGTTCGGA  #$#$$$#####%##%%###**#+/
+ r3824  153  ref   80324999   37  8M1I14M          =   80324999   0   TTTAGCCCGAAATGCCTAGAGCA   4;6//11!"11100110////00  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+ r4795   81  ref   26739130    0      23M   57401793   57401793   0   TGGCATTCCTGTAGGCAGAGAGG   AZWWZS]!"QNXZ]VQ]]]/2]]  XT:A:R  CM:i:2  SM:i:0   AM:i:0  X0:i:3    X1:i:0  XM:i:2  XO:i:0  XG:i:0  MD:Z:23
+ r4795  161  ref   57401793   37      23M   26739130   26739130   0   GATCACCCAGGTGATGTAACTCC   ]WV]]]]WW]]]]]]]]]]PU]]  XT:A:U  CM:i:0  SM:i:37  AM:i:0  X0:i:1    X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:23
+ r4800   16  ref        241  255  15M1D8M          =          0   0   CGTGGCCGGCGGGCCGAAGGCAT   IIIIIIIIIICCCCIII?IIIII  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+ r5377  170  ref   59090793   37      23M   26739130   26739130   0   TATCAATAAGGTGATGTAACTCG   ]WV]ABAWW]]]]]P]P//GU]]  XT:A:U  CM:i:0  SM:i:37  AM:i:0  X0:i:1    X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:23
+ r5612  151  ref  190342418   37  19M1I3M          =  190342418   0   TCTAACTTAGCCTCATAATAGCT   /&lt;&lt;!"0/4//7//00/BC0121/  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+
+
+To select only alignments with indels, you need to determine the minimum quality you want the adjacent bases to have, as well as the number of adjacent bases to check. If you set the quality threshold to 47 and the number of bases to check to 2, you will get the following output::
+
+ r770    89  ref        116   37  17M1I5M          =   72131356   0   CACACTGTGACAGACAGCGCAGC   00/02!!0//1200210AA44/1  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+ r4800   16  ref        241  255  15M1D8M          =          0   0   CGTGGCCGGCGGGCCGAAGGCAT   IIIIIIIIIICCCCIII?IIIII  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+ r5612  151  ref  190342418   37  19M1I3M          =  190342418   0   TCTAACTTAGCCTCATAATAGCT   /&lt;&lt;!"0/4//7//00/BC0121/  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+
+
+For more information on SAM, please consult the `SAM format description`__.
+
+.. __: http://www.ncbi.nlm.nih.gov/pubmed/19505943
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/maf/genebed_maf_to_fasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/genebed_maf_to_fasta.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,98 @@
+<tool id="GeneBed_Maf_Fasta2" name="Stitch Gene blocks" version="1.0.1">
+  <description>given a set of coding exon intervals</description>
+  <command interpreter="python">
+    #if $maf_source_type.maf_source == "user" #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_file --mafIndex=$maf_source_type.maf_file.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --mafSourceType=$maf_source_type.maf_source --geneBED --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR}
+    #else                                     #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_identifier --interval_file=$input1 --output_file=$out_file1 --mafSourceType=$maf_source_type.maf_source  --geneBED --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR}
+    #end if# --overwrite_with_gaps=$overwrite_with_gaps
+  </command>
+  <inputs>
+    <param name="input1" type="data" format="bed" label="Gene BED File">
+      <validator type="unspecified_build" />
+      <validator type="expression" message="Input must be in BED12 format.">value.metadata.columns &gt;= 12</validator> <!-- allow 12+ columns, not as strict as possible. TODO: only list bed files with 12+ columns -->
+    </param>
+    <conditional name="maf_source_type">
+      <param name="maf_source" type="select" label="MAF Source">
+        <option value="cached" selected="true">Locally Cached Alignments</option>
+        <option value="user">Alignments in Your History</option>
+      </param>
+      <when value="user">
+        <param name="maf_file" type="data" format="maf" label="MAF File">
+          <validator type="dataset_ok_validator" />
+          <options>
+            <filter type="data_meta" ref="input1" key="dbkey" />
+          </options>
+        </param>
+        <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
+          <options>
+            <filter type="data_meta" ref="maf_file" key="species" />
+          </options>
+        </param>
+      </when>
+      <when value="cached">
+        <param name="maf_identifier" type="select" label="MAF Type" >
+          <options from_file="maf_index.loc">
+            <column name="name" index="0"/>
+            <column name="value" index="1"/>
+            <column name="dbkey" index="2"/>
+            <column name="species" index="3"/>
+            <filter type="data_meta" ref="input1" key="dbkey" column="2" multiple="True" separator=","/>
+            <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/>
+          </options>
+        </param> 
+        <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
+          <options from_file="maf_index.loc">
+            <column name="uid" index="1"/>
+            <column name="value" index="3"/>
+            <column name="name" index="3"/>
+            <filter type="param_value" ref="maf_identifier" name="uid" column="1"/>
+            <filter type="multiple_splitter" column="3" separator=","/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+    <param name="overwrite_with_gaps" type="select" label="Split into Gapless MAF blocks" help="When set to Yes, blocks are divided around gaps appearing in any species. This will prevent gaps occurring in the interior of the sequence for an aligning species from overwriting a nucleotide found for the same position in a lower-scoring block.">
+      <option value="True" selected="true">No</option>
+      <option value="False">Yes</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="8.bed"/>
+      <param name="maf_source" value="cached"/>in aligning species
+      <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/>
+      <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>
+      <param name="overwrite_with_gaps" value="True"/>
+      <output name="out_file1" file="gene_bed_maf_to_fasta_out.fasta" />
+    </test>
+    <test>
+      <param name="input1" value="8.bed"/>
+      <param name="maf_source" value="user"/>
+      <param name="maf_file" value="4.maf"/>
+      <param name="species" value="hg17,panTro1"/>
+      <param name="overwrite_with_gaps" value="True"/>
+      <output name="out_file1" file="gene_bed_maf_to_fasta_user_out.fasta" />
+    </test>
+  </tests> 
+  <help>
+
+**What it does**
+
+The coding sequence of a gene is usually composed of several coding exons. Each coding exon is an individual genomic region; concatenated, these regions constitute the coding sequence. A single genomic region can be covered by multiple alignment blocks, and in many cases it is desirable to stitch these alignment blocks together. This tool accepts a list of gene-based intervals, in the Gene BED format. For every interval it performs the following:
+
+  * finds all MAF blocks that overlap the coding regions;
+  * sorts MAF blocks by alignment score;
+  * stitches blocks together and resolves overlaps based on alignment score (a toy sketch of this step follows the tool definition below);
+  * outputs alignments in FASTA format.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+  </help>
+</tool>
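A toy illustration of the stitch-and-resolve step referenced in the help above: blocks are applied in order of increasing alignment score so that higher-scoring blocks overwrite lower-scoring ones position by position (a sketch with made-up coordinates, not the tool's actual MAF machinery)::

    sequence = {}                      # reference position -> chosen base
    blocks = [ ( 1.0, 10, 'ACGT' ),    # (alignment score, ref start, bases)
               ( 2.0, 12, 'GGA' ) ]
    for score, start, bases in sorted( blocks ):
        for offset, base in enumerate( bases ):
            sequence[ start + offset ] = base
    print( ''.join( sequence[ p ] for p in sorted( sequence ) ) )   # ACGGA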
diff -r 000000000000 -r 9071e359b9a3 tools/maf/interval2maf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/interval2maf.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+
+"""
+Reads a list of intervals and a maf. Produces a new maf containing the
+blocks or parts of blocks in the original that overlapped the intervals.
+
+If a MAF file (rather than a UID) is provided, it is indexed before being processed.
+
+NOTE: If two intervals overlap the same block it will be written twice.
+
+usage: %prog maf_file [options]
+   -d, --dbkey=d: Database key, i.e. hg17
+   -c, --chromCol=c: Column of Chr
+   -s, --startCol=s: Column of Start
+   -e, --endCol=e: Column of End
+   -S, --strandCol=S: Column of Strand
+   -t, --mafType=t: Type of MAF source to use
+   -m, --mafFile=m: Path of source MAF file, if not using cached version
+   -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
+   -i, --interval_file=i:       Input interval file
+   -o, --output_file=o:      Output MAF file
+   -p, --species=p: Species to include in output
+   -P, --split_blocks_by_species=P: Split blocks by species
+   -r, --remove_all_gap_columns=r: Remove all Gap columns
+   -l, --indexLocation=l: Override default maf_index.loc file
+   -z, --mafIndexFile=z: Directory of local maf index file ( maf_index.loc or maf_pairwise.loc )
+"""
+
+#Dan Blankenberg
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+import bx.align.maf
+import bx.intervals.io
+from galaxy.tools.util import maf_utilities
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+    index = index_filename = None
+    mincols = 0
+    
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    
+    if options.dbkey: dbkey = options.dbkey
+    else: dbkey = None
+    if dbkey in [None, "?"]:
+        maf_utilities.tool_fail( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )
+    
+    species = maf_utilities.parse_species_option( options.species )
+    
+    if options.chromCol: chromCol = int( options.chromCol ) - 1
+    else: 
+        maf_utilities.tool_fail( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." )
+    
+    if options.startCol: startCol = int( options.startCol ) - 1
+    else: 
+        maf_utilities.tool_fail( "Start column not set, click the pencil icon in the history item to set the metadata attributes." )
+    
+    if options.endCol: endCol = int( options.endCol ) - 1
+    else: 
+        maf_utilities.tool_fail( "End column not set, click the pencil icon in the history item to set the metadata attributes." )
+    
+    if options.strandCol: strandCol = int( options.strandCol ) - 1
+    else: 
+        strandCol = -1
+    
+    if options.interval_file: interval_file = options.interval_file
+    else: 
+        maf_utilities.tool_fail( "Input interval file has not been specified." )
+    
+    if options.output_file: output_file = options.output_file
+    else: 
+        maf_utilities.tool_fail( "Output file has not been specified." )
+    
+    split_blocks_by_species = remove_all_gap_columns = False
+    if options.split_blocks_by_species and options.split_blocks_by_species == 'split_blocks_by_species':
+        split_blocks_by_species = True
+        if options.remove_all_gap_columns and options.remove_all_gap_columns == 'remove_all_gap_columns':
+            remove_all_gap_columns = True
+    else:
+        remove_all_gap_columns = True
+    #Finish parsing command line
+    
+    #Open indexed access to MAFs
+    if options.mafType:
+        if options.indexLocation:
+            index = maf_utilities.maf_index_by_uid( options.mafType, options.indexLocation )
+        else:
+            index = maf_utilities.maf_index_by_uid( options.mafType, options.mafIndexFile )
+        if index is None:
+            maf_utilities.tool_fail( "The MAF source specified (%s) appears to be invalid." % ( options.mafType ) )
+    elif options.mafFile:
+        index, index_filename = maf_utilities.open_or_build_maf_index( options.mafFile, options.mafIndex, species = [dbkey] )
+        if index is None:
+            maf_utilities.tool_fail( "Your MAF file appears to be malformed." )
+    else:
+        maf_utilities.tool_fail( "Desired source MAF type has not been specified." )
+    
+    #Create MAF writer
+    out = bx.align.maf.Writer( open(output_file, "w") )
+    
+    #Iterate over input regions 
+    num_blocks = 0
+    num_regions = None
+    for num_regions, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chromCol, start_col = startCol, end_col = endCol, strand_col = strandCol, fix_strand = True, return_header = False, return_comments = False ) ):
+        src = maf_utilities.src_merge( dbkey, region.chrom )
+        for block in index.get_as_iterator( src, region.start, region.end ):
+            if split_blocks_by_species:
+                blocks = [ new_block for new_block in maf_utilities.iter_blocks_split_by_species( block ) if maf_utilities.component_overlaps_region( new_block.get_component_by_src_start( dbkey ), region ) ]
+            else:
+                blocks = [ block ]
+            for block in blocks:
+                block = maf_utilities.chop_block_by_region( block, src, region )
+                if block is not None:
+                    if species is not None:
+                        block = block.limit_to_species( species )
+                    block = maf_utilities.orient_block_by_region( block, src, region )
+                    if remove_all_gap_columns:
+                        block.remove_all_gap_columns()
+                    out.write( block )
+                    num_blocks += 1
+    
+    #Close output MAF
+    out.close()
+    
+    #remove index file if created during run
+    maf_utilities.remove_temp_index_file( index_filename )
+    
+    if num_blocks:
+        print "%i MAF blocks extracted for %i regions." % ( num_blocks, ( num_regions + 1 ) )
+    elif num_regions is not None:
+        print "No MAF blocks could be extracted for %i regions." % ( num_regions + 1 )
+    else:
+        print "No valid regions have been provided."
+    
+if __name__ == "__main__": __main__()
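The chop step above clips each MAF block to the requested region; reduced to plain coordinates, the idea is interval intersection (a simplified sketch, not the bx-python implementation)::

    def chop( block_start, block_end, region_start, region_end ):
        # overlap of [block_start, block_end) with [region_start, region_end)
        start = max( block_start, region_start )
        end = min( block_end, region_end )
        return ( start, end ) if start < end else None   # None means no overlap

    print( chop( 100, 250, 180, 400 ) )   # (180, 250): block trimmed at the region edge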
diff -r 000000000000 -r 9071e359b9a3 tools/maf/interval2maf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/interval2maf.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,294 @@
+<tool id="Interval2Maf1" name="Extract MAF blocks" version="1.0.1">
+  <description>given a set of genomic intervals</description>
+  <command interpreter="python">
+    #if $maf_source_type.maf_source == "user" #interval2maf.py --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafFile=$maf_source_type.mafFile --mafIndex=$maf_source_type.mafFile.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --mafIndexFile=${GALAXY_DATA_INDEX_DIR}/maf_index.loc --species=$maf_source_type.species
+    #else                                     #interval2maf.py --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafType=$maf_source_type.mafType --interval_file=$input1 --output_file=$out_file1 --mafIndexFile=${GALAXY_DATA_INDEX_DIR}/maf_index.loc --species=$maf_source_type.species
+    #end if# --split_blocks_by_species=$split_blocks_by_species_selector.split_blocks_by_species
+    #if $split_blocks_by_species_selector.split_blocks_by_species == "split_blocks_by_species"#
+        --remove_all_gap_columns=$split_blocks_by_species_selector.remove_all_gap_columns
+    #end if
+  </command>
+  <inputs>
+    <param format="interval" name="input1" type="data" label="Choose intervals">
+      <validator type="unspecified_build" />
+    </param>
+    <conditional name="maf_source_type">
+      <param name="maf_source" type="select" label="MAF Source">
+        <option value="cached" selected="true">Locally Cached Alignments</option>
+        <option value="user">Alignments in Your History</option>
+      </param>
+      <when value="user">
+        <param format="maf" name="mafFile" label="Choose alignments" type="data">
+          <options>
+            <filter type="data_meta" ref="input1" key="dbkey" />
+          </options>
+          <validator type="dataset_ok_validator" />
+        </param>
+        <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
+          <options>
+            <filter type="data_meta" ref="mafFile" key="species" />
+          </options>
+        </param>
+      </when>
+      <when value="cached">
+        <param name="mafType" type="select" label="Choose alignments">
+          <options from_data_table="indexed_maf_files">
+            <!--
+            <column name="name" index="0"/>
+            <column name="value" index="1"/>
+            <column name="dbkey" index="2"/>
+            <column name="species" index="3"/>
+            -->
+            <filter type="data_meta" ref="input1" key="dbkey" column="dbkey" multiple="True" separator=","/>
+            <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/>
+          </options>
+        </param>
+        <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
+          <options from_data_table="indexed_maf_files">
+            <column name="uid" index="1"/>
+            <column name="value" index="3"/>
+            <column name="name" index="3"/>
+            <filter type="param_value" ref="mafType" column="uid"/>
+            <filter type="multiple_splitter" column="name" separator=","/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+    <conditional name="split_blocks_by_species_selector">
+      <param name="split_blocks_by_species" type="select" label="Split blocks by species" help="Not usually applicable. See help below for more information.">
+        <option value="split_blocks_by_species">Split by species</
[...]
TCCCGCGGTCCCTTCCTGTACCTTGTC---AG 
+  s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG 
+  s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG 
+  s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG 
+  s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG 
+  s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG 
+  s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG 
+  s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG 
+  s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG 
+  s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+  </help>
+</tool>
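
The command template above routes to interval2maf.py with either a MAF from the history (plus its precomputed index) or a locally cached, indexed MAF. Outside Galaxy, the core extraction step can be approximated with bx-python directly; a minimal sketch, assuming a hypothetical local `alignments.maf` and the species1 interval from the example data above (the real script adds indexed lookup, species filtering, and block splitting)::

  # Sketch: keep MAF blocks whose species1.chr1 component overlaps an interval.
  # File names and the interval are illustrative; bx-python must be installed.
  import bx.align.maf

  src, start, end = "species1.chr1", 147984600, 147984700
  writer = bx.align.maf.Writer( open( "overlapping.maf", "w" ) )
  for block in bx.align.maf.Reader( open( "alignments.maf" ) ):
      c = block.get_component_by_src( src )
      # overlap test in forward-strand coordinates
      if c is not None and c.get_forward_strand_start() < end and c.get_forward_strand_end() > start:
          writer.write( block )
  writer.close()
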
diff -r 000000000000 -r 9071e359b9a3 tools/maf/interval2maf_pairwise.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/interval2maf_pairwise.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,50 @@
+<tool id="Interval2Maf_pairwise1" name="Extract Pairwise MAF blocks" version="1.0.1">
+  <description>given a set of genomic intervals</description>
+  <command interpreter="python">interval2maf.py --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafType=$mafType --interval_file=$input1 --output_file=$out_file1 --indexLocation=${GALAXY_DATA_INDEX_DIR}/maf_pairwise.loc</command>
+  <inputs>
+    <param name="input1" type="data" format="interval" label="Interval File">
+      <validator type="unspecified_build" />
+    </param>
+    <param name="mafType" type="select" label="Choose MAF source">
+      <options from_file="maf_pairwise.loc">
+        <column name="name" index="0"/>
+        <column name="value" index="1"/>
+        <column name="dbkey" index="2"/>
+        <column name="species" index="3"/>
+        <filter type="data_meta" ref="input1" key="dbkey" column="2" multiple="True" separator=","/>
+        <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/>
+      </options>
+    </param> 
+   </inputs>
+  <outputs>
+    <data format="maf" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="8.bed" dbkey="hg17" format="bed"/>
+      <param name="mafType" value="PAIRWISE_hg17_fr1"/>
+      <output name="out_file1" file="Interval2Maf_pairwise_out.maf"/>
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool takes genomic coordinates, superimposes them on pairwise alignments (in MAF format) stored on the Galaxy site, and excises alignment blocks corresponding to each set of coordinates. Alignment blocks that extend past START and/or END positions of an interval are trimmed. Note that a single genomic interval may correspond to two or more alignment blocks. 
+
+-----
+
+**Example** 
+
+Here a single interval is superimposed on three MAF blocks. Blocks 1 and 3 are trimmed because they extend beyond the boundaries of the interval:
+
+.. image:: ./static/images/maf_icons/interval2maf.png
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+  </help>
+</tool>
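
Trimming, as described in the help above, amounts to slicing each overlapping block in the coordinates of the primary species. A hedged sketch of that step, assuming a plus-strand primary component and hypothetical file names (the tool itself does this through the indexed interval2maf.py machinery)::

  # Sketch: trim blocks to the part covering hg17.chr7:127471530-127471570.
  import bx.align.maf

  src, start, end = "hg17.chr7", 127471530, 127471570
  writer = bx.align.maf.Writer( open( "trimmed.maf", "w" ) )
  for block in bx.align.maf.Reader( open( "pairwise.maf" ) ):
      c = block.get_component_by_src( src )
      if c is None or c.get_forward_strand_end() <= start or c.get_forward_strand_start() >= end:
          continue   # block does not touch the interval
      # slice_by_component trims the block, in src coordinates, to the overlap
      trimmed = block.slice_by_component( src, max( start, c.start ), min( end, c.end ) )
      writer.write( trimmed )
  writer.close()
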
diff -r 000000000000 -r 9071e359b9a3 tools/maf/interval_maf_to_merged_fasta.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/interval_maf_to_merged_fasta.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+
+"""
+Reads an interval or gene BED and a MAF Source.
+Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates.
+
+Alignment blocks are layered on top of each other based upon score.
+
+usage: %prog maf_file [options]
+   -d, --dbkey=d: Database key, ie hg17
+   -c, --chromCol=c: Column of Chr
+   -s, --startCol=s: Column of Start
+   -e, --endCol=e: Column of End
+   -S, --strandCol=S: Column of Strand
+   -G, --geneBED: Input is a Gene BED file, process and join exons as one region
+   -t, --mafSourceType=t: Type of MAF source to use
+   -m, --mafSource=m: Path of source MAF file, if not using cached version
+   -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
+   -i, --interval_file=i:       Input interval file
+   -o, --output_file=o:      Output FASTA file
+   -p, --species=p: Species to include in output
+   -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species.
+   -z, --mafIndexFileDir=z: Directory of local maf_index.loc file
+
+usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR
+"""
+
+#Dan Blankenberg
+from galaxy import eggs
+from galaxy.tools.util import maf_utilities
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+import bx.intervals.io
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def __main__():
+    
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    mincols = 0
+    strand_col = -1
+    
+    if options.dbkey:
+        primary_species = options.dbkey
+    else:
+        primary_species = None
+    if primary_species in [None, "?", "None"]:
+        stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )
+    
+    include_primary = True
+    secondary_species = maf_utilities.parse_species_option( options.species )
+    if secondary_species:
+        species = list( secondary_species ) # make copy of species list
+        if primary_species in secondary_species:
+            secondary_species.remove( primary_species )
+        else:
+            include_primary = False
+    else:
+        species = None
+    
+    if options.interval_file:
+        interval_file = options.interval_file
+    else: 
+        stop_err( "Input interval file has not been specified." )
+    
+    if options.output_file:
+        output_file = options.output_file
+    else: 
+        stop_err( "Output file has not been specified." )
+    
+    if not options.geneBED:
+        if options.chromCol:
+            chr_col = int( options.chromCol ) - 1
+        else: 
+            stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." )
+        
+        if options.startCol:
+            start_col = int( options.startCol ) - 1
+        else: 
+            stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." )
+        
+        if options.endCol:
+            end_col = int( options.endCol ) - 1
+        else: 
+            stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." )
+        
+        if options.strandCol:
+            strand_col = int( options.strandCol ) - 1
+    
+    mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir
+    
+    overwrite_with_gaps = True
+    if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false':
+        overwrite_with_gaps = False
+    
+    #Finish parsing command line
+        
+    #get 
[...]
" )
+    
+    #open output file
+    output = open( output_file, "w" )
+    
+    if options.geneBED:
+        region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() )
+    else:
+        region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) )
+    
+    #Step through intervals
+    regions_extracted = 0
+    line_count = 0
+    for line_count, line in region_enumerator:
+        try:
+            if options.geneBED: #Process as Gene BED
+                try:
+                    starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line )
+                    #create spliced alignment object
+                    alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
+                    primary_name = secondary_name = fields[3]
+                    alignment_strand = fields[5]
+                except Exception, e:
+                    print "Error loading exon positions from input line %i: %s" % ( line_count, e )
+                    continue
+            else: #Process as standard intervals
+                try:
+                    #create spliced alignment object
+                    alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
+                    primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end )
+                    secondary_name = ""
+                    alignment_strand = line.strand
+                except Exception, e:
+                    print "Error loading region positions from input line %i: %s" % ( line_count, e )
+                    continue
+            
+            #Write alignment to output file
+            #Output primary species first, if requested
+            if include_primary:
+                output.write( ">%s.%s\n" %( primary_species, primary_name ) )
+                if alignment_strand == "-":
+                    output.write( alignment.get_sequence_reverse_complement( primary_species ) )
+                else:
+                    output.write( alignment.get_sequence( primary_species ) )
+                output.write( "\n" )
+            #Output all remaining species
+            for spec in secondary_species or alignment.get_species_names( skip = primary_species ):
+                if secondary_name:
+                    output.write( ">%s.%s\n" % ( spec, secondary_name ) )
+                else:
+                    output.write( ">%s\n" % ( spec ) )
+                if alignment_strand == "-":
+                    output.write( alignment.get_sequence_reverse_complement( spec ) )
+                else:
+                    output.write( alignment.get_sequence( spec ) )
+                output.write( "\n" )
+            
+            output.write( "\n" )
+            
+            regions_extracted += 1
+        
+        except Exception, e:
+            print "Unexpected error from input line %i: %s" % ( line_count, e )
+            continue
+    
+    #close output file
+    output.close()
+    
+    #remove index file if created during run
+    maf_utilities.remove_temp_index_file( index_filename )
+    
+    #Print message about success for user
+    if regions_extracted > 0:
+        print "%i regions were processed successfully." % ( regions_extracted )
+    else:
+        print "No regions were processed successfully."
+        if line_count > 0 and options.geneBED:
+            print "This tool requires your input file to conform to the 12 column BED standard."
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/interval_maf_to_merged_fasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/interval_maf_to_merged_fasta.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,114 @@
+<tool id="Interval_Maf_Merged_Fasta2" name="Stitch MAF blocks" version="1.0.1">
+  <description>given a set of genomic intervals</description>
+  <command interpreter="python">
+    #if $maf_source_type.maf_source == "user" #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_file --mafIndex=$maf_source_type.maf_file.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafSourceType=$maf_source_type.maf_source --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR}
+    #else                                     #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_identifier --interval_file=$input1 --output_file=$out_file1 --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafSourceType=$maf_source_type.maf_source --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR}
+    #end if# --overwrite_with_gaps=$overwrite_with_gaps
+  </command>
+  <inputs>
+    <page>
+        <param format="interval" name="input1" type="data" label="Choose intervals">
+          <validator type="unspecified_build" />
+        </param>
+        <conditional name="maf_source_type">
+            <param name="maf_source" type="select" label="MAF Source">
+              <option value="cached" selected="true">Locally Cached Alignments</option>
+              <option value="user">Alignments in Your History</option>
+            </param>
+            <when value="user">
+              <param name="maf_file" type="data" format="maf" label="MAF File">
+                <options>
+                  <filter type="data_meta" ref="input1" key="dbkey" />
+                </options>
+                <validator type="dataset_ok_validator" />
+              </param>
+              <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
+                <options>
+                  <filter type="data_meta" ref="maf_file" key="species" />
+                </options>
+              </param>
+            </when>
+            <when value="cached">
+              <param name="maf_identifier" type="select" label="MAF Type" >
+                <options from_file="maf_index.loc">
+                  <column name="name" index="0"/>
+                  <column name="value" index="1"/>
+                  <column name="dbkey" index="2"/>
+                  <column name="species" index="3"/>
+                  <filter type="data_meta" ref="input1" key="dbkey" column="2" multiple="True" separator=","/>
+                  <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/>
+                </options>
+              </param> 
+              <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
+                <options from_file="maf_index.loc">
+                  <column name="uid" index="1"/>
+                  <column name="value" index="3"/>
+                  <column name="name" index="3"/>
+                  <filter type="param_value" ref="maf_identifier" name="uid" column="1"/>
+                  <filter type="multiple_splitter" column="3" separator=","/>
+                </options>
+              </param>
+            </when>
+        </conditional>
+        <param name="overwrite_with_gaps" type="select" label="Split into Gapless MAF blocks" help="When set to Yes, blocks are divided around gaps appearing in any species. This will prevent gaps occurring in the interior of the sequence for an aligning species from overwriting a nucleotide found for the same position in a lower-scoring block.">
+          <option value="True" selected="true">No</option>
+          <option value="False">Yes</option>
+        </param>
+    </page>
+   </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="13.bed" dbkey="hg18" ftype="bed"/>
+      <param name="maf_source" value="cached"/>
+      <param name="maf_identifier" value="17_WAY_MULTIZ_hg18"/>
+      <param name="species" value="hg18,mm8"/>
+      <param name="overwrite_with_gaps" value="True"/>
+      <output name="out_file1" file="interval_maf_to_merged_fasta_out3.fasta" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>
+      <param name="maf_source" value="cached"/>
+      <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/>
+      <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>
+      <param name="overwrite_with_gaps" value="True"/>
+      <output name="out_file1" file="interval_maf_to_merged_fasta_out.dat" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>
+      <param name="maf_source" value="user"/>
+      <param name="maf_file" value="5.maf"/>
+      <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>
+      <param name="overwrite_with_gaps" value="True"/>
+      <output name="out_file1" file="interval_maf_to_merged_fasta_user_out.dat" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+A single genomic region can be covered by multiple alignment blocks. In many cases it is desirable to stitch these alignment blocks together. This tool accepts a list of genomic intervals. For every interval it performs the following:
+
+  * finds all MAF blocks that overlap the interval;
+  * sorts MAF blocks by alignment score;
+  * stitches blocks together and resolves overlaps based on alignment score;
+  * outputs alignments in FASTA format.
+
+------
+
+**Example**
+
+Here three MAF blocks overlapping a single interval are stitched together. Space between blocks 2 and 3 is filled with gaps:
+
+.. image:: ./static/images/maf_icons/stitchMaf.png
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+  </help>
+</tool>
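
The stitching behavior described in the help above can be pictured as painting blocks onto the interval in order of increasing score, so that higher-scoring blocks overwrite lower-scoring ones column by column. A toy sketch of just that resolution step (positions and bases are made up)::

  # Sketch: resolve overlapping blocks by score across a 10-column interval.
  # Each block is (score, start_column, bases); '-' marks uncovered columns.
  interval_len = 10
  blocks = [ ( 100.0, 0, "ACGTA" ),     # columns 0-4
             (  50.0, 3, "TTTTT" ),     # columns 3-7, lowest score
             ( 200.0, 6, "GGGG"  ) ]    # columns 6-9, highest score
  stitched = [ "-" ] * interval_len
  # paint lowest score first; higher scores painted later overwrite it
  for score, start, bases in sorted( blocks ):
      for i, base in enumerate( bases ):
          stitched[ start + i ] = base
  print "".join( stitched )   # -> ACGTATGGGG
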
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_by_block_number.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_by_block_number.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+"""
+Reads a list of block numbers and a maf. Produces a new maf containing the
+blocks specified by number.
+"""
+
+import sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from galaxy.tools.util import maf_utilities
+import bx.align.maf
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+    input_block_filename = sys.argv[1].strip()
+    input_maf_filename = sys.argv[2].strip()
+    output_filename1 = sys.argv[3].strip()
+    block_col = int( sys.argv[4].strip() ) - 1
+    if block_col < 0:
+        print >> sys.stderr, "Invalid column specified"
+        sys.exit(0)
+    species = maf_utilities.parse_species_option( sys.argv[5].strip() )
+    
+    maf_writer = bx.align.maf.Writer( open( output_filename1, 'w' ) )
+    #we want to maintain order of block file and write blocks as many times as they are listed
+    failed_lines = []
+    for ctr, line in enumerate( open( input_block_filename, 'r' ) ):
+        try:
+            block_wanted = int( line.split( "\t" )[block_col].strip() )
+        except:
+            failed_lines.append( str( ctr ) )
+            continue
+        try:
+            for count, block in enumerate( bx.align.maf.Reader( open( input_maf_filename, 'r' ) ) ):
+                if count == block_wanted:
+                    if species:
+                        block = block.limit_to_species( species )
+                    maf_writer.write( block )
+                    break
+        except:
+            print >>sys.stderr, "Your MAF file appears to be malformed."
+            sys.exit()
+    if len( failed_lines ) > 0: print "Failed to extract from %i lines (%s)." % ( len( failed_lines ), ",".join( failed_lines ) )
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_by_block_number.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_by_block_number.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,40 @@
+<tool id="maf_by_block_number1" name="Extract MAF by block number" version="1.0.1">
+  <description>given a set of block numbers and a MAF file</description>
+  <command interpreter="python">maf_by_block_number.py $input1 $input2 $out_file1 $block_col $species</command>
+  <inputs>
+    <param format="txt" name="input1" type="data" label="Block Numbers"/>
+    <param format="maf" name="input2" label="MAF File" type="data"/>
+    <param name="block_col" type="data_column" label="Column containing Block number" data_ref="input1" accept_default="True" />
+    <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
+      <options>
+        <filter type="data_meta" ref="input2" key="species" />
+      </options>
+    </param>
+   </inputs>
+  <outputs>
+    <data format="maf" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="maf_by_block_numbers.dat"/>
+      <param name="input2" value="3.maf"/>
+      <param name="block_col" value="1"/>
+      <param name="species" value="hg17,panTro1,mm5,rn3,canFam1"/>
+      <output name="out_file1" file="maf_by_block_number_out.dat" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool takes a list of block numbers, one per line, and extracts the corresponding MAF blocks from the provided file. Block numbers start at 0.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+  </help>
+</tool>
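
Note that maf_by_block_number.py re-opens and re-reads the whole MAF once per line of the block-number file, which keeps memory flat and preserves the requested order. When the MAF fits in memory, the same ordering can be had in a single pass; a minimal sketch with hypothetical file names::

  # Sketch: cache all blocks once, then emit them in the requested order.
  import bx.align.maf

  blocks = list( bx.align.maf.Reader( open( "input.maf" ) ) )
  writer = bx.align.maf.Writer( open( "output.maf", "w" ) )
  for line in open( "block_numbers.txt" ):
      try:
          wanted = int( line.split( "\t" )[0].strip() )
      except ValueError:
          continue   # skip lines without a leading block number
      if 0 <= wanted < len( blocks ):
          writer.write( blocks[wanted] )
  writer.close()
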
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_filter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,65 @@
+#Dan Blankenberg
+#Filters a MAF file according to the provided code file, which is generated in maf_filter.xml <configfiles>
+#Also allows filtering by number of columns in a block, and limiting output species
+import sys, os, shutil
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.align.maf
+from galaxy.tools.util import maf_utilities
+
+def main():
+    #Read command line arguments
+    try:
+        script_file = sys.argv.pop( 1 )
+        maf_file = sys.argv.pop( 1 )
+        out_file = sys.argv.pop( 1 )
+        additional_files_path = sys.argv.pop( 1 )
+        species = maf_utilities.parse_species_option( sys.argv.pop( 1 ) )
+        min_size = int( sys.argv.pop( 1 ) )
+        max_size = int( sys.argv.pop( 1 ) )
+        if max_size < 1: max_size = sys.maxint
+        min_species_per_block = int( sys.argv.pop( 1 ) )
+        exclude_incomplete_blocks = int( sys.argv.pop( 1 ) )
+        if species:
+            num_species = len( species )
+        else:
+            num_species = len( sys.argv.pop( 1 ).split( ',') )
+    except:
+        print >>sys.stderr, "One or more arguments is missing.\nUsage: maf_filter.py maf_filter_file input_maf output_maf path_to_save_debug species_to_keep"
+        sys.exit()
+    
+    #Open input and output MAF files
+    try:
+        maf_reader = bx.align.maf.Reader( open( maf_file,'r' ) )
+        maf_writer = bx.align.maf.Writer( open( out_file,'w' ) )
+    except:
+        print >>sys.stderr, "Your MAF file appears to be malformed."
+        sys.exit()
+    
+    #Save script file for debugging/verification info later
+    os.mkdir( additional_files_path )
+    shutil.copy( script_file, os.path.join( additional_files_path, 'debug.txt' ) )
+    
+    #Loop through blocks, running filter on each
+    #'maf_block' and 'ret_val' are used/shared in the provided code file
+    #'ret_val' should be set to True if the block is to be kept
+    i = -1   # stays -1 if the MAF file yields no blocks
+    blocks_kept = 0
+    for i, maf_block in enumerate( maf_reader ):
+        if min_size <= maf_block.text_size <= max_size:
+            local = {'maf_block':maf_block, 'ret_val':False}
+            execfile( script_file, {}, local )
+            if local['ret_val']:
+                #Species limiting must be done after filters as filters could be run on non-requested output species
+                if species:
+                    maf_block = maf_block.limit_to_species( species )
+                if len( maf_block.components ) >= min_species_per_block and ( not exclude_incomplete_blocks or len( maf_block.components ) >= num_species ):
+                    maf_writer.write( maf_block )
+                    blocks_kept += 1
+    maf_writer.close()
+    maf_reader.close()
+    if i < 0: print "Your file contains no valid maf_blocks."
+    else: print 'Kept %s of %s blocks (%.2f%%).' % ( blocks_kept, i + 1, float( blocks_kept ) / float( i + 1 ) * 100.0 )
+
+if __name__ == "__main__":
+    main()
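
The code file passed as `script_file` is generated by the `<configfiles>` section of maf_filter.xml (next diff); it must define the per-block test itself, reading `maf_block` from the namespace maf_filter.py provides and leaving its verdict in `ret_val`. A hand-written equivalent, as a minimal sketch with an illustrative species and chromosome::

  # Sketch of a filter code file: keep a block only if its hg17 component,
  # when present, lies on chr7.  maf_filter.py supplies 'maf_block' and
  # reads 'ret_val' back out of this script's namespace.
  def maf_block_pass_filter( maf_block ):
      component = maf_block.get_component_by_src_start( "hg17" )
      if component is not None:
          if component.src.split( "." )[-1] != "chr7":
              return False
      return True

  ret_val = maf_block_pass_filter( maf_block )
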
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_filter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,202 @@
+<tool id="MAF_filter" name="Filter MAF" version="1.0.1">
+  <description>by specified attributes</description>
+  <command interpreter="python">maf_filter.py $maf_filter_file $input1 $out_file1 $out_file1.files_path $species $min_size $max_size $min_species_per_block $exclude_incomplete_blocks ${input1.metadata.species}</command>
+  <inputs>
+    <page>
+      <param name="input1" type="data" format="maf" label="MAF File"/>
+      <param name="min_size" label="Minimum Size" value="0" type="integer"/>
+      <param name="max_size" label="Maximum Size" value="0" type="integer" help="A maximum size less than 1 indicates no limit"/>
+      <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
+        <options>
+          <filter type="data_meta" ref="input1" key="species" />
+        </options>
+      </param>
+      <param name="min_species_per_block" type="select" label="Exclude blocks which have only one species" >
+        <option value="2">Yes</option>
+        <option value="1" selected="True">No</option>
+      </param>
+      <param name="exclude_incomplete_blocks" type="select" label="Exclude blocks which have missing species" >
+        <option value="1">Yes</option>
+        <option value="0" selected="True">No</option>
+      </param>
+      <repeat name="maf_filters" title="Filter">
+        <param name="species1" type="select" label="When Species" multiple="false">
+          <options>
+            <filter type="data_meta" ref="input1" key="species" />
+          </options>
+        </param>
+        <conditional name="species1_attributes">
+          <param name="species1_attribute_type" type="select" label="Species Attribute">
+            <option value="attribute_strand">Strand</option>
+            <option value="attribute_chr" selected="true">Chromosome</option>
+          </param>
+          <when value="attribute_strand">
+            <param name="species1_is_isnot" type="select" label="Conditional">
+              <option value="==">Is</option>
+              <option value="!=">Is Not</option>
+            </param>
+            <param name="species1_attribute" type="select" label="Strand">
+              <option value="+" selected="true">+</option>
+              <option value="-">-</option>
+            </param>
+            <repeat name="filter_condition" title="Filter Condition">
+              <param name="species2" type="select" label="Species" multiple="false">
+                <options>
+                  <filter type="data_meta" ref="input1" key="species" />
+                </options>
+              </param>
+              <conditional name="species2_attributes">
+                <param name="species2_attribute_type" type="select" label="Species Attribute">
+                  <option value="attribute_strand" selected="true">Strand</option>
+                  <option value="attribute_chr">Chromosome</option>
+                </param>
+                <when value="attribute_strand">
+                  <param name="species2_is_isnot" type="select" label="Conditional">
+                    <option value="==">Is</option>
+                    <option value="!=">Is Not</option>
+                  </param>
+                  <param name="species2_attribute" type="select" label="Strand">
+                    <option value="+" selected="true">+</option>
+                    <option value="-">-</option>
+                  </param>
+                </when>
+                <when value="attribute_chr">
+                  <param name="species2_is_isnot" type="select" label="Conditional">
+                    <option value="in">Is</option>
+                    <option value="not in">Is Not</option>
+                  </param>
+                  <param name="species2_attribute" type="text" label="Chromosome" value="chr1"/>
+   
[...]
$filter_condition in $maf_filter['species1_attributes']['filter_condition']:
+            secondary_component = maf_block.get_component_by_src_start( """$filter_condition['species2'].value.encode( 'string_escape' )""".decode( 'string_escape' ) )
+#if $filter_condition['species2_attributes']['species2_attribute_type'] == 'attribute_chr':
+            if secondary_component is not None:
+                if not ( secondary_component.src.split( "." )[-1] $is_isnot_valid.get( $filter_condition['species2_attributes']['species2_is_isnot'].value.strip(), 'is in' ) """$filter_condition['species2_attributes']['species2_attribute'].value.encode( 'string_escape' )""".decode( 'string_escape' ).split( "," ) ):
+                    return False
+#else:
+            if secondary_component is not None:
+                if not ( secondary_component.strand $is_isnot_valid.get( $filter_condition['species2_attributes']['species2_is_isnot'].value.strip(), '==' ) """$filter_condition['species2_attributes']['species2_attribute'].value.encode( 'string_escape' )""".decode( 'string_escape' ) ):
+                    return False
+#end if
+#end for
+#end for
+    return True
+ret_val = maf_block_pass_filter( maf_block )
+</configfile>
+  </configfiles>
+  <outputs>
+    <data format="maf" name="out_file1" />
+  </outputs>
+<!--
+  <tests>
+    <test>
+      <param name="input1" value="4.maf"/>
+      <param name="species" value="bosTau2,canFam2,hg17,panTro1,rheMac2,rn3"/>
+      <param name="exclude_incomplete_blocks" value="0"/>
+      <param name="min_species_per_block" value="1"/>
+      <param name="min_size" value="0"/>
+      <param name="max_size" value="0"/>
+      <param name="species1" value="hg17"/>
+      <param name="species2" value="hg17"/>
+      <param name="species1_attribute_type" value="attribute_chr"/>
+      <param name="species1_is_isnot" value="in"/>
+      <param name="species1_attribute" value="chr1"/>
+      <param name="filter_condition"/> Test will ERROR when this is set or when it is not set.
+      
+      <output name="out_file1" file="cf_maf_limit_to_species.dat"/>
+    </test>
+  </tests>
+-->
+<help>
+This tool allows you to build complex filters to be applied to each alignment block of a MAF file. You can define constraints on species based upon chromosome and strand, and you can specify comma-separated lists of chromosomes where appropriate.
+
+.. class:: infomark
+
+For example, this tool is useful to restrict a set of alignments to only those blocks which contain alignments between chromosomes that are considered homologous.
+
+-----
+
+.. class:: warningmark
+
+If a species is not found in a particular block, all filters on that species are ignored.
+
+-----
+
+This tool allows the user to remove any undesired species from a MAF file. If no species are specified then all species will be kept. If species are specified, columns which contain only gaps are removed. The options for this are:
+
+ * **Exclude blocks which have missing species** - suppose you want to restrict an 8-way alignment to human, mouse, and rat.  The tool will first remove all other species. Next, if this option is set to **YES** the tool WILL NOT return MAF blocks that do not include human, mouse, or rat. This means that all alignment blocks returned by the tool will have exactly three sequences in this example.
+
+ * **Exclude blocks which have only one species** - if this option is set to **YES** all single sequence alignment blocks WILL NOT be returned.
+
+-----
+
+You can also provide a size range and limit your output to the MAF blocks which fall within the specified range.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_limit_size.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_limit_size.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+"""
+Removes blocks that fall outside of specified size range.
+"""
+
+import sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.align.maf
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+
+    input_maf_filename = sys.argv[1].strip()
+    output_filename1 = sys.argv[2].strip()
+    min_size = int( sys.argv[3].strip() )
+    max_size = int( sys.argv[4].strip() )
+    if max_size < 1: max_size = sys.maxint
+    maf_writer = bx.align.maf.Writer( open( output_filename1, 'w' ) )
+    try:
+        maf_reader = bx.align.maf.Reader( open( input_maf_filename, 'r' ) )
+    except:
+        print >>sys.stderr, "Your MAF file appears to be malformed."
+        sys.exit()
+    
+    blocks_kept = 0
+    i = -1   # stays -1 if the MAF file yields no blocks
+    for i, m in enumerate( maf_reader ):
+        if min_size <= m.text_size <= max_size:
+            maf_writer.write( m )
+            blocks_kept += 1
+    maf_writer.close()
+    if i < 0:
+        print "Your MAF file contains no blocks."
+    else:
+        print 'Kept %s of %s blocks (%.2f%%).' % ( blocks_kept, i + 1, float( blocks_kept ) / float( i + 1 ) * 100.0 )
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_limit_size.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_limit_size.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,36 @@
+<tool id="maf_limit_size1" name="Filter MAF blocks" version="1.0.1">
+  <description>by Size</description>
+  <command interpreter="python">maf_limit_size.py $input1 $out_file1 $min_size $max_size</command>
+  <inputs>
+    <page>
+        <param format="maf" name="input1" label="MAF File" type="data"/>
+        <param name="min_size" label="Minimum Size" value="0" type="integer"/>
+        <param name="max_size" label="Maximum Size" value="0" type="integer" help="A maximum size less than 1 indicates no limit"/>
+    </page>
+   </inputs>
+  <outputs>
+    <data format="maf" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="3.maf" ftype="maf" />
+      <param name="min_size" value="0"/>
+      <param name="max_size" value="0"/>
+      <output name="out_file1" file="maf_limit_size1_out.maf" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool takes a MAF file and a size range and extracts the MAF blocks which fall within the specified range.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+  </help>
+</tool>
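
Here the size being tested is `text_size`, the width of the alignment block in columns (gaps included), rather than the genomic span of any one species. A quick sketch of what the range test keeps (path and bounds are illustrative)::

  # Sketch: report which blocks a min_size=1, max_size=60 run would keep.
  import bx.align.maf

  for block in bx.align.maf.Reader( open( "input.maf" ) ):
      kept = "kept" if 1 <= block.text_size <= 60 else "dropped"
      print block.text_size, "columns ->", kept
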
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_limit_to_species.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_limit_to_species.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+
+"""
+Read a maf file and write out a new maf with only blocks having the 
+required species, after dropping any other species and removing
+columns containing only gaps.
+
+usage: %prog species,species2,... input_maf output_maf allow_partial min_species_per_block
+"""
+#Dan Blankenberg
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.align.maf
+from galaxy.tools.util import maf_utilities
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+
+    species = maf_utilities.parse_species_option( sys.argv[1] )
+    if species:
+        spec_len = len( species )
+    else:
+        spec_len = 0
+    try:
+        maf_reader = bx.align.maf.Reader( open( sys.argv[2],'r' ) )
+        maf_writer = bx.align.maf.Writer( open( sys.argv[3],'w' ) )
+    except:
+        print >>sys.stderr, "Your MAF file appears to be malformed."
+        sys.exit()
+    allow_partial = False
+    if int( sys.argv[4] ): allow_partial = True
+    min_species_per_block = int( sys.argv[5] )
+    
+    maf_blocks_kept = 0
+    for m in maf_reader:
+        if species:
+            m = m.limit_to_species( species )
+        m.remove_all_gap_columns()
+        spec_in_block_len = len( maf_utilities.get_species_in_block( m ) )
+        if ( not species or allow_partial or spec_in_block_len == spec_len ) and spec_in_block_len > min_species_per_block:
+            maf_writer.write( m )
+            maf_blocks_kept += 1
+    
+    maf_reader.close()
+    maf_writer.close()
+    
+    print "Restricted to species: %s." % ", ".join( species )
+    print "%i MAF blocks have been kept." % maf_blocks_kept
+
+if __name__ == "__main__": 
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_limit_to_species.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_limit_to_species.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,51 @@
+<tool id="MAF_Limit_To_Species1" name="Filter MAF blocks">
+  <description>by Species</description>
+  <command interpreter="python">maf_limit_to_species.py $species $input1 $out_file1 $allow_partial $min_species</command>
+  <inputs>
+    <param name="input1" type="data" format="maf" label="MAF file"/>
+    <param name="allow_partial" type="select" label="Exclude blocks which have missing species" >
+      <option value="1">No</option>
+      <option value="0">Yes</option>
+    </param>
+    <param name="min_species" type="select" label="Exclude blocks which have only one species" >
+      <option value="1">Yes</option>
+      <option value="0">No</option>
+    </param>
+    <param name="species" type="select" label="Species to keep" display="checkboxes" multiple="true">
+      <options>
+        <filter type="data_meta" ref="input1" key="species" />
+      </options>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="maf" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="4.maf"/>
+      <param name="species" value="bosTau2,canFam2,hg17,panTro1,rheMac2,rn3"/>
+      <param name="allow_partial" value="0"/>
+      <param name="min_species" value="0"/>
+      <output name="out_file1" file="cf_maf_limit_to_species.dat"/>
+    </test>
+  </tests>
+  <help>
+
+**What It Does**
+
+This tool allows the user to remove any undesired species from a MAF file. Columns which contain only gaps are removed. The options for this tool are:
+
+ * **Exclude blocks which have missing species** - suppose you want to restrict an 8-way alignment to human, mouse, and rat.  The tool will first remove all other species. Next, if this option is set to **YES** the tool WILL NOT return MAF blocks that do not include human, mouse, or rat. This means that all alignment blocks returned by the tool will have exactly three sequences in this example.
+
+ * **Exclude blocks which have only one species** - if this option is set to **YES** all single sequence alignment blocks WILL NOT be returned.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+  </help>
+</tool>
+
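
A minimal sketch of the per-block logic the script applies, with an illustrative species list and file names (the script itself also honors the allow_partial and minimum-species settings)::

  # Sketch: restrict blocks to three species, drop all-gap columns, and
  # require every wanted species to be present (allow_partial off).
  import bx.align.maf

  wanted = [ "hg17", "mm5", "rn3" ]
  writer = bx.align.maf.Writer( open( "restricted.maf", "w" ) )
  for block in bx.align.maf.Reader( open( "8way.maf" ) ):
      block = block.limit_to_species( wanted )
      block.remove_all_gap_columns()
      # assumes one row per species; the script counts distinct species instead
      if len( block.components ) == len( wanted ):
          writer.write( block )
  writer.close()
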
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_reverse_complement.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_reverse_complement.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+
+"""
+Reads a MAF file. Produces a MAF file containing
+the reverse complement for each block in the source file.
+
+usage: %prog input_maf_file output_maf_file
+"""
+#Dan Blankenberg
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.align.maf
+from galaxy.tools.util import maf_utilities
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+    #Parse Command Line
+    input_file = sys.argv.pop( 1 )
+    output_file = sys.argv.pop( 1 )
+    species = maf_utilities.parse_species_option( sys.argv.pop( 1 ) )
+    
+    try:
+        maf_writer = bx.align.maf.Writer( open( output_file, 'w' ) )
+    except:
+        print >>sys.stderr, "Unable to open output file"
+        sys.exit()
+    try:
+        count = 0
+        for maf in bx.align.maf.Reader( open( input_file ) ):
+            maf = maf.reverse_complement()
+            if species:
+                maf = maf.limit_to_species( species )
+            maf_writer.write( maf )
+            count += 1   #count blocks actually written; enumerate's last index was off by one
+    except:
+        print >>sys.stderr, "Your MAF file appears to be malformed."
+        sys.exit()
+    print "%i regions were reverse complemented." % count
+    maf_writer.close()
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_reverse_complement.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_reverse_complement.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,53 @@
+<tool id="MAF_Reverse_Complement_1" name="Reverse Complement" version="1.0.1">
+  <description>a MAF file</description>
+  <command interpreter="python">maf_reverse_complement.py $input1 $out_file1 $species</command>
+  <inputs>
+    <page>
+        <param format="maf" name="input1" label="Alignment File" type="data"/>
+        <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
+          <options>
+            <filter type="data_meta" ref="input1" key="species" />
+          </options>
+        </param>
+    </page>
+   </inputs>
+  <outputs>
+    <data format="maf" name="out_file1" metadata_source="input1"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="3.maf" dbkey="hg17" format="maf"/>
+      <param name="species" value="hg17,panTro1,mm5,rn3,canFam1"/>
+      <output name="out_file1" file="maf_reverse_complement_out.dat"/>
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool takes a MAF file and creates a new MAF file in which each block has been reverse complemented.
+
+**Example**
+  
+This MAF Block::
+
+  a score=8157.000000
+  s hg17.chr7    127471526 58 + 158628139 AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG
+  s panTro1.chr6 129885407 58 + 161576975 AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG
+  s mm5.chr6      28904928 54 + 149721531 AA----CGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG
+
+becomes::
+
+  a score=8157.000000
+  s hg17.chr7     31156555 58 - 158628139 CCTCTTCCACTATAGACCTCCTTAAACAAAATAATGAAAAATGAATAAACCACAAATT
+  s panTro1.chr6  31691510 58 - 161576975 CCTCTTCCACTATAGACCTCCTTAAACAAAATAATGAAAAACGAATAAACCACAAATT
+  s mm5.chr6     120816549 54 - 149721531 CCTCTTCCACTGAGGAATTTCTTTTTTTAAATGATGAGCAATCAATGAAACG----TT
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+  </help>
+</tool>
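
The coordinate change in the example follows the MAF convention that `start` is counted along the given strand, so reverse complementing a component maps its start to `src_size - (start + size)`. Checking the first row of the example above::

  # hg17.chr7: start 127471526, size 58, src_size 158628139 on '+'
  print 158628139 - ( 127471526 + 58 )   # -> 31156555, the '-' strand start shown above
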
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_split_by_species.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_split_by_species.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+"""
+Read a maf and split blocks by unique species combinations 
+"""
+import sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.align import maf
+from galaxy.tools.util import maf_utilities
+from galaxy.util import string_as_bool
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():    
+    try:
+        maf_reader = maf.Reader( open( sys.argv[1] ) )
+    except Exception, e:
+        maf_utilities.tool_fail( "Error opening MAF: %s" % e )
+    try:
+        out = maf.Writer( open( sys.argv[2], "w") )
+    except Exception, e:
+        maf_utilities.tool_fail( "Error opening file for output: %s" % e )
+    try:
+        collapse_columns = string_as_bool( sys.argv[3] )
+    except Exception, e:
+        maf_utilities.tool_fail( "Error determining collapse columns value: %s" % e )
+    
+    start_count = 0
+    end_count = 0
+    for start_count, start_block in enumerate( maf_reader ):
+        for block in maf_utilities.iter_blocks_split_by_species( start_block ):
+            if collapse_columns:
+                block.remove_all_gap_columns()
+            out.write( block )
+            end_count += 1
+    out.close()
+    
+    if end_count:
+        print "%i alignment blocks created from %i original blocks." % ( end_count, start_count + 1 )
+    else:
+        print "No alignment blocks were created."
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_split_by_species.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_split_by_species.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,223 @@
+<tool id="MAF_split_blocks_by_species1" name="Split MAF blocks" version="1.0.0">
+  <description>by Species</description>
+  <command interpreter="python">maf_split_by_species.py $input1 $out_file1 $collapse_columns</command>
+  <inputs>
+    <param format="maf" name="input1" type="data" label="MAF file to split"/>
+    <param name="collapse_columns" type="select" label="Collapse empty alignment columns" help="Removes columns that are gaps in all sequences">
+      <option value="True" selected="true">Yes</option>
+      <option value="False">No</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="maf" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="maf_split_by_species_in.maf"/>
+      <param name="collapse_columns" value="True"/>
+      <output name="out_file1" file="maf_split_by_species_collapsed_out.maf"/>
+    </test>
+    <test>
+      <param name="input1" value="maf_split_by_species_in.maf"/>
+      <param name="collapse_columns" value="False"/>
+      <output name="out_file1" file="maf_split_by_species_not_collapsed_out.maf"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool examines each MAF block for multiple occurrences of a species in a single block. When this occurs, a block is split into multiple blocks where every combination of one sequence per species per block is represented.
+
+The interface for this tool has two inputs: 
+
+ * **MAF file to split**. Choose multiple alignments from history to be split by species.
+ * **Collapse empty alignment columns**. Should alignment columns containing only gaps in the new blocks be removed.
+
+-----
+
+**Example 1**: **Collapse empty alignment columns is Yes**:
+
+For the following alignment::
+
+  ##maf version=1
+  a score=2047408.0
+  s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG 
+  s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG 
+  s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG 
+  s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG 
+  s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+
+the tool will create **a single** history item containing 12 alignment blocks (notice that no columns contain only gaps)::
+
+  ##maf version=1
+  a score=2047408.0
+  s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG 
+  s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG 
+  s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT-----
[...]
CGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG 
+  s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG 
+  s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG 
+  s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG 
+  s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG 
+  s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG 
+  s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+  
+  a score=2047408.0
+  s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG 
+  s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG 
+  s species3.chr3  68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG 
+
+-------
+
+.. class:: infomark
+
+**About formats**
+
+**MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes. 
+
+ - The .maf format is line-oriented. Each multiple alignment ends with a blank line.
+ - Each sequence in an alignment is on a single line.
+ - Lines starting with # are considered to be comments.
+ - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment.
+ - Some MAF files may contain two optional line types: 
+
+   - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; 
+   - An "e" line containing information about the size of the gap between the alignments that span the current block.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+    </help>
+</tool>
+
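
The number of blocks produced from one input block is the product of the per-species row counts, since every combination of one row per species is emitted: Example 1 has four species1 rows, three species2 rows, and one species3 row, giving the 12 blocks shown. A sketch of that enumeration with itertools::

  # Sketch: count one-row-per-species combinations for Example 1's block.
  import itertools

  rows = { "species1": 4, "species2": 3, "species3": 1 }
  choices = [ range( count ) for count in rows.values() ]
  print len( list( itertools.product( *choices ) ) )   # -> 12 blocks
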
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_stats.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_stats.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+"""
+Reads a list of intervals and a maf. Outputs a new set of intervals with statistics appended.
+"""
+
+import sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.intervals.io
+from bx.bitset import BitSet
+from galaxy.tools.util import maf_utilities
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+    maf_source_type = sys.argv.pop( 1 )
+    input_maf_filename = sys.argv[1].strip()
+    input_interval_filename = sys.argv[2].strip()
+    output_filename = sys.argv[3].strip()
+    dbkey = sys.argv[4].strip()
+    try:
+        chr_col  = int( sys.argv[5].strip() ) - 1
+        start_col = int( sys.argv[6].strip() ) - 1
+        end_col = int( sys.argv[7].strip() ) - 1
+    except:
+        print >>sys.stderr, "You appear to be missing metadata. You can specify your metadata by clicking on the pencil icon associated with your interval file."
+        sys.exit()
+    summary = sys.argv[8].strip()
+    if summary.lower() == "true": summary = True
+    else: summary = False
+
+    mafIndexFile = "%s/maf_index.loc" % sys.argv[9]
+    try:
+        maf_index_filename = sys.argv[10].strip()
+    except:
+        maf_index_filename = None
+    index = index_filename = None
+    if maf_source_type == "user":
+        #index maf for use here
+        index, index_filename = maf_utilities.open_or_build_maf_index( input_maf_filename, maf_index_filename, species = [dbkey] )
+        if index is None:
+            print >>sys.stderr, "Your MAF file appears to be malformed."
+            sys.exit()
+    elif maf_source_type == "cached":
+        #access existing indexes
+        index = maf_utilities.maf_index_by_uid( input_maf_filename, mafIndexFile )
+        if index is None:
+            print >> sys.stderr, "The MAF source specified (%s) appears to be invalid." % ( input_maf_filename )
+            sys.exit()
+    else:
+        print >>sys.stderr, 'Invalid source type specified: %s' % maf_source_type
+        sys.exit()
+        
+    out = open(output_filename, 'w')
+    
+    num_region = None
+    species_summary = {}
+    total_length = 0
+    #loop through interval file
+    for num_region, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( input_interval_filename, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ) ):
+        src = "%s.%s" % ( dbkey, region.chrom )
+        region_length = region.end - region.start
+        total_length += region_length
+        coverage = { dbkey: BitSet( region_length ) }
+        
+        
+        for block in index.get_as_iterator( src, region.start, region.end ):
+            for spec in maf_utilities.get_species_in_block( block ):
+                if spec not in coverage: coverage[spec] = BitSet( region_length )
+            for block in maf_utilities.iter_blocks_split_by_species( block ):
+                if maf_utilities.component_overlaps_region( block.get_component_by_src( src ), region ):
+                    #need to chop and orient the block
+                    block = maf_utilities.orient_block_by_region( maf_utilities.chop_block_by_region( block, src, region ), src, region, force_strand = '+' )
+                    start_offset, alignment = maf_utilities.reduce_block_by_primary_genome( block, dbkey, region.chrom, region.start )
+                    for i in range( len( alignment[dbkey] ) ):
+                        for spec, text in alignment.items():
+                            if text[i] != '-':
+                                coverage[spec].set( start_offset + i )
+        if summary:
+            #record summary
+            for key in coverage.keys():
+                if key not in species_summary: species_summary[key] = 0
+                species_summary[key] = species_summary[key] + coverage[key].count_range()
+        else:
+            #print coverage for interval
+            coverage_sum = coverage[dbkey].count_range()
+            out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), dbkey, coverage_sum, region_length - coverage_sum ) )
+            keys = coverage.keys()
+            keys.remove( dbkey )
+            keys.sort()
+            for key in keys:
+                coverage_sum = coverage[key].count_range()
+                out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), key, coverage_sum, region_length - coverage_sum ) )
+    if summary:
+        out.write( "#species\tnucleotides\tcoverage\n" )
+        for spec in species_summary:
+            out.write( "%s\t%s\t%.4f\n" % ( spec, species_summary[spec], float( species_summary[spec] ) / total_length ) )
+    out.close()
+    if num_region is not None:
+        print "%i regions were processed with a total length of %i." % ( num_region + 1, total_length )
+    maf_utilities.remove_temp_index_file( index_filename )
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_stats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_stats.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,120 @@
+<tool id="maf_stats1" name="MAF Coverage Stats" version="1.0.1">
+  <description>Alignment coverage information</description>
+  <command interpreter="python">
+    maf_stats.py
+    #if $maf_source_type.maf_source == "user":
+      $maf_source_type.maf_source $input2 $input1 $out_file1 $dbkey ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} $summary
+    #else:
+      $maf_source_type.maf_source $maf_source_type.mafType $input1 $out_file1 $dbkey ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} $summary
+    #end if
+    ${GALAXY_DATA_INDEX_DIR}
+    #if $maf_source_type.maf_source == "user":
+    $input2.metadata.maf_index
+    #end if
+  </command>
+  <inputs>
+    <param format="interval" name="input1" label="Interval File" type="data">
+      <validator type="unspecified_build" />
+    </param>
+    <conditional name="maf_source_type">
+      <param name="maf_source" type="select" label="MAF Source">
+        <option value="cached" selected="true">Locally Cached Alignments</option>
+        <option value="user">Alignments in Your History</option>
+      </param>
+      <when value="user">
+        <param format="maf" name="input2" label="MAF File" type="data">
+          <options>
+            <filter type="data_meta" ref="input1" key="dbkey" />
+          </options>
+          <validator type="dataset_ok_validator" />
+        </param>
+      </when>
+      <when value="cached">
+        <param name="mafType" type="select" label="MAF Type">
+          <options from_file="maf_index.loc">
+            <column name="name" index="0"/>
+            <column name="value" index="1"/>
+            <column name="dbkey" index="2"/>
+            <filter type="data_meta" ref="input1" key="dbkey" column="2" multiple="True" separator=","/>
+            <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/>
+          </options>
+        </param> 
+      </when>
+    </conditional>
+    <param name="summary" type="select" label="Type of Output">
+      <option value="false" selected="true">Coverage by Region</option>
+      <option value="true">Summarize Coverage</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="interval" name="out_file1" metadata_source="input1">
+      <change_format>
+        <when input="summary" value="true" format="tabular" />
+      </change_format>
+    </data>
+  </outputs>
+  <requirements>
+    <requirement type="python-module">numpy</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" dbkey="hg17" format="bed"/>
+      <param name="maf_source" value="cached"/>
+      <param name="mafType" value="8_WAY_MULTIZ_hg17"/>
+      <output name="out_file1" file="maf_stats_interval_out.dat"/>
+      <param name="summary" value="false"/>
+    </test>
+    <test>
+      <param name="input1" value="1.bed" dbkey="hg17" format="bed"/>
+      <param name="maf_source" value="cached"/>
+      <param name="mafType" value="8_WAY_MULTIZ_hg17"/>
+      <output name="out_file1" file="maf_stats_summary_out.dat"/>
+      <param name="summary" value="true"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool takes a MAF file and an interval file and reports coverage information by interval for each species.
+If a column does not exist in the reference genome, it is not included in the output.
+
+Consider the interval: "chrX 1000 1100 myInterval"
+  Let's suppose we want to do stats on three-way alignments for H, M, and R. The result will look like this:
+
+    chrX 1000 1100 myInterval H XXX YYY 
+    
+    chrX 1000 1100 myInterval M XXX YYY 
+    
+    chrX 1000 1100 myInterval R XXX YYY 
+    
+
+  where XXX and YYY are:
+
+    XXX = number of nucleotides
+    
+    YYY = number of gaps
+
+----
+
+Alternatively, you can request only summary information for a set of intervals:
+  
+  ========  ===========  ========
+  #species  nucleotides  coverage
+  ========  ===========  ========
+  hg18      30639        0.2372
+  rheMac2   7524         0.0582
+  panTro2   30390        0.2353
+  ========  ===========  ========
+
+  where **coverage** is the number of nucleotides divided by the total length of the provided intervals.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_thread_for_species.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_thread_for_species.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+
+"""
+Read a maf file and write out a new maf with only blocks having all of
+the passed in species, after dropping any other species and removing columns 
+containing only gaps. This will attempt to fuse together any blocks
+which are adjacent after the unwanted species have been dropped. 
+
+usage: %prog input_maf output_maf species1,species2
+"""
+#Dan Blankenberg
+import sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.align.maf
+
+from bx.align.tools.thread import *
+from bx.align.tools.fuse import *
+
+def main():
+    input_file = sys.argv.pop( 1 )
+    output_file = sys.argv.pop( 1 )
+    species = sys.argv.pop( 1 ).split( ',' )
+    
+    try:
+        maf_reader = bx.align.maf.Reader( open( input_file ) )
+    except:
+        print >> sys.stderr, "Unable to open source MAF file"
+        sys.exit()
+    try:
+        maf_writer = FusingAlignmentWriter( bx.align.maf.Writer( open( output_file, 'w' ) ) )
+    except:
+        print >> sys.stderr, "Unable to open output file"
+        sys.exit()
+    try:
+        for m in maf_reader:            
+            new_components = m.components
+            if species != ['None']:
+                new_components = get_components_for_species( m, species )
+            if new_components: 
+                remove_all_gap_columns( new_components )
+                m.components = new_components
+                m.score = 0.0 
+                maf_writer.write( m )
+    except Exception, e:
+        print >> sys.stderr, "Error steping through MAF File: %s" % e
+        sys.exit()
+    maf_reader.close()
+    maf_writer.close()
+    
+    print "Restricted to species: %s." % ", ".join( species )
+    
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_thread_for_species.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_thread_for_species.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,60 @@
+<tool id="MAF_Thread_For_Species1" name="Join MAF blocks">
+  <description>by Species</description>
+  <command interpreter="python">maf_thread_for_species.py $input1 $out_file1 $species</command>
+  <inputs>
+    <param format="maf" name="input1" type="data" label="MAF file"/>
+    <param name="species" type="select" label="Species to keep" display="checkboxes" multiple="true">
+      <options>
+        <filter type="data_meta" ref="input1" key="species" />
+      </options>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="maf" name="out_file1"  metadata_source="input1"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="3.maf" format="maf"/>
+      <param name="species" value="hg17,panTro1"/>
+      <output name="out_file1" file="maf_thread_for_species.dat"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool merges MAF blocks from a MAF file which are adjoining in each specified species. Columns containing only gaps are removed, and species which are not selected are removed from the output.
+
+**Example**
+
+Specifying the desired species as hg17 and panTro1 with this MAF file::
+
+  ##maf version=1
+  a score=60426.000000
+  s hg17.chr7    127471195 331 + 158628139 gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCC-------------------------------AAATACT-GCCACTGATGTCCTG-----ATGGAGGTA-------TGAA-------------------AACATCCACTAA
+  s panTro1.chr6 129885076 331 + 161576975 gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCC-------------------------------AAATACT-GCCACTGATGTCCTG-----ATGGAGGTA-------TGAA-------------------AACATCCACTAA
+  s mm5.chr6      28904571 357 + 149721531 CTCCACTCTCGTTTGCTGTT----------------CTGTCACCATGGAAACAAA-CGAGGGTGGTCCAGTTACTATCTTGACTGCAGCTGGCAGTCAGTT-GCCACT-----CAGGAATAAGGCTATGCCATT-GATCCACTGAACCGTGATCTGGAAACCTGGCTGTTGTTT-------CAAGCCTTGGGGCCAGTTTGCGGTGTTACTCATGA--CTCTAAGATCGTGTGCTTG----CTGCAGGAAGAGACAGCAAGGGGGTTACATTTAAAAAGCCCCCAGTTTAGCTATAGGCAGGCCAACAGGTGTAAAAATACTCACTAGTAATGGGCTGAACTCATGGAGGTAGCATTAGTGAGACACTGTAACTGTTTTTTTAAAAATCACTAA
+  s rn3.chr4      56178191 282 + 187371129 CTTCACTCTCATTTGCTGTT----------------CTGTCACTATGGAGACAAACACAGGCTAGCCCAGTTACTATCTTGATCACAGCAGCT-GTCAGCTAGCTGCCACTCACAGGAATAAGGCCATACCATT-GATCCACTGAACCTTGATCTAGGAATTTGGC----------------------TGGGGCCAGTTTGCGGTGTCACTCATGA--CTCTAAGATTGTGTGTTTG----CTCCAGGAAGAGACGGCAAGAGGATTACCTTTAAAAGGTTC---------------------------------GGAGTCTAGCTGTAGACAGCCCA-----ATG--GGTA-------TAAC-------------------AATACTCACTAA
+
+  a score=8157.000000
+  s hg17.chr7    127471526 58 + 158628139 AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG
+  s panTro1.chr6 129885407 58 + 161576975 AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG
+  s mm5.chr6      28904928 54 + 149721531 AA----CGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG
+
+results in::
+
+  ##maf version=1
+  a score=0.0
+  s hg17.chr7    127471195 389 + 158628139 gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGGTATGAAAACATCCACTAAAATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG 
+  s panTro1.chr6 129885076 389 + 161576975 gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGGTATGAAAACATCCACTAAAATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG 
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+  </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_to_bed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_to_bed.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+
+"""
+Read a maf and output intervals for specified list of species.
+"""
+import sys, os, tempfile
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.align import maf
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+        
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    #where to store files that become additional output
+    database_tmp_dir = sys.argv[5]
+    
+    species = sys.argv[3].split(',')
+    partial = sys.argv[4]
+    out_files = {}
+    primary_spec = None
+    
+    if "None" in species:
+        species = {}
+        try:
+            for i, m in enumerate( maf.Reader( open( input_filename, 'r' ) ) ):
+                for c in m.components:
+                    spec,chrom = maf.src_split( c.src )
+                    if not spec or not chrom:
+                        spec = chrom = c.src
+                    species[spec] = ""
+            species = species.keys()
+        except:
+            print >>sys.stderr, "Invalid MAF file specified"
+            return
+        
+    if "?" in species:
+        print >>sys.stderr, "Invalid dbkey specified"
+        return
+        
+    
+    for i in range( 0, len( species ) ):
+        spec = species[i]
+        if i == 0:
+            out_files[spec] = open( output_filename, 'w' )
+            primary_spec = spec
+        else:
+            out_files[spec] = tempfile.NamedTemporaryFile( mode = 'w', dir = database_tmp_dir, suffix = '.maf_to_bed' )
+            filename = out_files[spec].name
+            out_files[spec].close()
+            out_files[spec] = open( filename, 'w' )
+    num_species = len( species )
+    
+    print "Restricted to species:", ",".join( species )
+    
+    file_in = open( input_filename, 'r' )
+    maf_reader = maf.Reader( file_in )
+    
+    block_num = -1
+    
+    for i, m in enumerate( maf_reader ):
+        block_num += 1
+        if "None" not in species:
+            m = m.limit_to_species( species )
+        l = m.components
+        if len(l) < num_species and partial == "partial_disallowed": continue
+        for c in l:
+            spec,chrom = maf.src_split( c.src )
+            if not spec or not chrom:
+                spec = chrom = c.src
+            if spec not in out_files.keys():
+                out_files[spec] = tempfile.NamedTemporaryFile( mode='w', dir = database_tmp_dir, suffix = '.maf_to_bed' )
+                filename = out_files[spec].name
+                out_files[spec].close()
+                out_files[spec] = open( filename, 'w' )
+            
+            if c.strand == "-":
+                out_files[spec].write( chrom + "\t" + str( c.src_size - c.end ) + "\t" + str( c.src_size - c.start ) + "\t" + spec + "_" + str( block_num ) + "\t" + "0\t" + c.strand + "\n" )
+            else:
+                out_files[spec].write( chrom + "\t" + str( c.start ) + "\t" + str( c.end ) + "\t" + spec + "_" + str( block_num ) + "\t" + "0\t" + c.strand + "\n" )
+            
+    file_in.close()
+    for file_out in out_files.keys():
+        out_files[file_out].close()
+
+    for spec in out_files.keys():
+        if spec != primary_spec:
+            print "#FILE\t" + spec + "\t" + os.path.join( database_tmp_dir, os.path.split( out_files[spec].name )[1] )
+        else:
+            print "#FILE1\t" + spec + "\t" + out_files[spec].name
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_to_bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_to_bed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,136 @@
+<tool id="MAF_To_BED1" name="Maf to BED" force_history_refresh="True">
+  <description>Converts a MAF formatted file to the BED format</description>
+  <command interpreter="python">maf_to_bed.py $input1 $out_file1 $species $complete_blocks $__new_file_path__</command>
+  <inputs>
+    <param format="maf" name="input1" type="data" label="MAF file to convert"/>
+    <param name="species" type="select" label="Select species" display="checkboxes" multiple="true" help="a separate history item will be created for each checked species">
+      <options>
+        <filter type="data_meta" ref="input1" key="species" />
+      </options>
+    </param>
+    <param name="complete_blocks" type="select" label="Exclude blocks which have a requested species missing">
+      <option value="partial_allowed">include blocks with missing species</option>
+      <option value="partial_disallowed">exclude blocks with missing species</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="bed" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="4.maf"/>
+      <param name="species" value="hg17"/>
+      <param name="complete_blocks" value="partial_disallowed"/>
+      <output name="out_file1" file="cf_maf_to_bed.dat"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool converts every MAF block to an interval line (in BED format; scroll down for a description of the MAF and BED formats) describing the position of that alignment block within the corresponding genome.
+
+The interface for this tool contains two pages (steps): 
+
+ * **Step 1 of 2**. Choose multiple alignments from history to be converted to BED format.
+ * **Step 2 of 2**. Choose species from the alignment to be included in the output and specify how to deal with alignment blocks that lack one or more species:
+
+   *  **Choose species** - the tool reads the alignment provided during Step 1 and generates a list of species contained within that alignment. Using checkboxes you can specify the taxa to be included in the output (only the reference genome, shown in **bold**, is selected by default). If you select more than one species, more than one history item will be created.
+   *  **Choose to include/exclude blocks with missing species** - if an alignment block does not contain any one of the species you selected within the **Choose species** menu and this option is set to **exclude blocks with missing species**, then coordinates of such a block **will not** be included in the output (see **Example 2** below).
+
+
+-----
+
+**Example 1**: **Include only reference genome** (hg18 in this case) and **include blocks with missing species**:
+
+For the following alignment::
+
+  ##maf version=1
+  a score=68686.000000
+  s hg18.chr20     56827368 75 +  62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- 
+  s panTro2.chr20  56528685 75 +  62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- 
+  s rheMac2.chr10  89144112 69 -  94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- 
+  s mm8.chr2      173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- 
+  s canFam2.chr24  46551822 67 +  50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C 
+
+  a score=10289.000000
+  s hg18.chr20    56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG 
+
+the tool will create **a single** history item containing the following (**note** that field 4 is added to the output and is numbered iteratively: hg18_0, hg18_1 etc.)::
+
+  chr20    56827368    56827443   hg18_0   0   +
+  chr20    56827443    56827480   hg18_1   0   +
+
+-----
+
+**Example 2**: **Include hg18 and mm8** and **exclude blocks with missing species**:
+
+For the following alignment::
+
+  ##maf version=1
+  a score=68686.000000
+  s hg18.chr20     56827368 75 +  62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- 
+  s panTro2.chr20  56528685 75 +  62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- 
+  s rheMac2.chr10  89144112 69 -  94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- 
+  s mm8.chr2      173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- 
+  s canFam2.chr24  46551822 67 +  50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C 
+
+  a score=10289.000000
+  s hg18.chr20    56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG 
+
+the tool will create **two** history items (one for hg18 and one for mm8) containing the following (**note** that both history items contain only one line describing the first alignment block. The second MAF block is not included in the output because it does not contain mm8):
+
+History item **1** (for hg18)::
+
+   chr20    56827368    56827443   hg18_0   0   +
+
+History item **2** (for mm8)::
+
+   chr2    173910832   173910893    mm8_0   0   +
+
+-------
+
+.. class:: infomark
+
+**About formats**
+
+**MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes. 
+
+ - The .maf format is line-oriented. Each multiple alignment ends with a blank line.
+ - Each sequence in an alignment is on a single line.
+ - Lines starting with # are considered to be comments.
+ - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment.
+ - Some MAF files may contain two optional line types: 
+
+   - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; 
+   - An "e" line containing information about the size of the gap between the alignments that span the current block.
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and a number of additional optional ones:
+
+The first three BED fields (required) are::
+
+    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+Additional (optional) fields are::
+
+    4. name - The name of the BED line.
+    5. score - A score between 0 and 1000.
+    6. strand - Defines the strand - either '+' or '-'.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+    </help>
+    <code file="maf_to_bed_code.py"/>
+</tool>
+
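
The block structure described in the help above (an "a" line opening each paragraph, one "s" line per sequence, a blank line closing the block) can be read with a few lines of plain Python (a sketch only; the tools themselves use the bx.align.maf reader)::

    def read_maf_blocks(handle):
        block = None
        for line in handle:
            line = line.rstrip('\n')
            if line.startswith('#'):
                continue                      # comment line
            if line.startswith('a'):
                block = {'a': line, 's': []}  # a new alignment block
            elif line.startswith('s') and block is not None:
                block['s'].append(line.split())
            elif not line.strip() and block is not None:
                yield block                   # blank line ends the block
                block = None
        if block is not None:
            yield block

Each yielded block holds the raw "a" line and the whitespace-split fields (src, start, size, strand, srcSize, text) of its "s" lines.
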
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_to_bed_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_to_bed_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.align import maf
+from galaxy import datatypes, config, jobs 
+from shutil import move
+
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+    output_data = out_data.items()[0][1]
+    history = output_data.history
+    if history == None:
+        print "unknown history!"
+        return
+    new_stdout = ""
+    split_stdout = stdout.split("\n")
+    basic_name = output_data.name
+    output_data_list = []
+    for line in split_stdout:
+        if line.startswith("#FILE1"):
+            fields = line.split("\t")
+            dbkey = fields[1]
+            filepath = fields[2]
+            output_data.dbkey = dbkey
+            output_data.name = basic_name + " (" + dbkey + ")"
+            app.model.context.add( output_data )
+            app.model.context.flush()
+            output_data_list.append(output_data)
+        elif line.startswith("#FILE"):
+            fields = line.split("\t")
+            dbkey = fields[1]
+            filepath = fields[2]
+            newdata = app.model.HistoryDatasetAssociation( create_dataset = True, sa_session = app.model.context )
+            newdata.set_size()
+            newdata.extension = "bed"
+            newdata.name = basic_name + " (" + dbkey + ")"
+            app.model.context.add( newdata )
+            app.model.context.flush()
+            history.add_dataset( newdata )
+            app.security_agent.copy_dataset_permissions( output_data.dataset, newdata.dataset )
+            app.model.context.add( history )
+            app.model.context.flush()
+            try:
+                move(filepath,newdata.file_name)
+                newdata.info = newdata.name
+                newdata.state = newdata.states.OK
+            except:
+                newdata.info = "The requested file is missing from the system."
+                newdata.state = newdata.states.ERROR
+            newdata.dbkey = dbkey
+            newdata.init_meta()
+            newdata.set_meta()
+            newdata.set_peek()
+            app.model.context.flush()
+            output_data_list.append(newdata)
+        else:
+            new_stdout = new_stdout + line
+    for data in output_data_list:
+        if data.state == data.states.OK:
+            data.info = new_stdout
+            app.model.context.add( data )
+            app.model.context.flush()
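
The hook above consumes a small stdout protocol emitted by maf_to_bed.py: a "#FILE1" line names the primary species' output, and one "#FILE" line per additional species carries a dbkey and the temporary file to be promoted into the history. A standalone sketch of that parsing (illustrative function, not part of the hook)::

    def parse_file_markers(stdout):
        primary, extras = None, []
        for line in stdout.splitlines():
            if line.startswith('#FILE1'):     # test '#FILE1' before '#FILE'
                tag, dbkey, path = line.split('\t')
                primary = (dbkey, path)
            elif line.startswith('#FILE'):
                tag, dbkey, path = line.split('\t')
                extras.append((dbkey, path))
        return primary, extras

    print(parse_file_markers('#FILE1\thg18\t/out.bed\n#FILE\tmm8\t/tmp/x.maf_to_bed'))
    # (('hg18', '/out.bed'), [('mm8', '/tmp/x.maf_to_bed')])
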
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_to_fasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_to_fasta.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,199 @@
+<tool id="MAF_To_Fasta1" name="MAF to FASTA" version="1.0.1">
+  <description>Converts a MAF formatted file to FASTA format</description>
+  <command interpreter="python">
+    #if $fasta_target_type.fasta_type == "multiple" #maf_to_fasta_multiple_sets.py $input1 $out_file1 $fasta_target_type.species $fasta_target_type.complete_blocks
+    #else                                           #maf_to_fasta_concat.py $fasta_target_type.species $input1 $out_file1
+    #end if#
+  </command>
+  <inputs>
+    <param format="maf" name="input1" type="data" label="MAF file to convert"/>
+    <conditional name="fasta_target_type">
+      <param name="fasta_type" type="select" label="Type of FASTA Output">
+        <option value="multiple" selected="true">Multiple Blocks</option>
+        <option value="concatenated">One Sequence per Species</option>
+      </param>
+      <when value="multiple">
+        <param name="species" type="select" label="Select species" display="checkboxes" multiple="true" help="checked taxa will be included in the output">
+          <options>
+            <filter type="data_meta" ref="input1" key="species" />
+          </options>
+        </param>
+        <param name="complete_blocks" type="select" label="Choose to">
+          <option value="partial_allowed">include blocks with missing species</option>
+          <option value="partial_disallowed">exclude blocks with missing species</option>
+        </param>
+      </when>
+      <when value="concatenated">
+        <param name="species" type="select" label="Species to extract" display="checkboxes" multiple="true">
+          <options>
+            <filter type="data_meta" ref="input1" key="species" />
+          </options>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="3.maf" ftype="maf"/>
+      <param name="fasta_type" value="concatenated"/>
+      <param name="species" value="canFam1"/>
+      <output name="out_file1" file="cf_maf2fasta_concat.dat" ftype="fasta"/>
+    </test>
+    <test>
+      <param name="input1" value="4.maf" ftype="maf"/>
+      <param name="fasta_type" value="multiple"/>
+      <param name="species" value="hg17,panTro1,rheMac2,rn3,mm7,canFam2,bosTau2,dasNov1"/>
+      <param name="complete_blocks" value="partial_allowed"/>
+      <output name="out_file1" file="cf_maf2fasta_new.dat" ftype="fasta"/>
+    </test>
+  </tests>
+  <help>
+
+**Types of MAF to FASTA conversion**
+
+ * **Multiple Blocks** converts a single MAF block to a single FASTA block. For example, if you have 6 MAF blocks, they will be converted to 6 FASTA blocks.
+ * **One Sequence per Species** converts MAF blocks to a single aggregated FASTA block. For example, if you have 6 MAF blocks, they will be converted and concatenated into a single FASTA block.
+
+-------
+
+**What it does**
+
+This tool converts MAF blocks to FASTA format and concatenates them into a single FASTA block or outputs multiple FASTA blocks separated by empty lines.
+
+The interface for this tool contains two pages (steps): 
+
+ * **Step 1 of 2**. Choose multiple alignments from history to be converted to FASTA format.
+ * **Step 2 of 2**. Choose the type of output as well as the species from the alignment to be included in the output.
+
+   Multiple Block output has additional options:
+
+   *  **Choose species** - the tool reads the alignment provided during Step 1 and generates a list of species contained within that alignment. Using checkboxes you can specify taxa to be included in the output (all species are selected by default).
+   *  **Choose to include/exclude blocks with missing species** - if an alignment block does not contain any one of the species you selected within the **Choose species** menu and this option is set to **exclude blocks with missing speci
[...]
CCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C 
+
+  a score=10289.000000
+  s hg18.chr20    56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG 
+
+will be converted to::
+
+  &gt;hg18.chr20(+):56827368-56827443|hg18_0
+  GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+  &gt;panTro2.chr20(+):56528685-56528760|panTro2_0
+  GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-
+  &gt;rheMac2.chr10(-):89144112-89144181|rheMac2_0
+  GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+  &gt;mm8.chr2(+):173910832-173910893|mm8_0
+  AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------
+  &gt;canFam2.chr24(+):46551822-46551889|canFam2_0
+  CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C
+
+  &gt;hg18.chr20(+):56827443-56827480|hg18_1
+  ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+  &gt;panTro2.chr20(+):56528760-56528797|panTro2_1
+  ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG
+  &gt;rheMac2.chr10(-):89144181-89144218|rheMac2_1
+  ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG
+
+-----
+
+**Example 2b**: Multiple Block Approach **Include hg18 and mm8** and **exclude blocks with missing species**:
+
+The following alignment::
+
+  ##maf version=1
+  a score=68686.000000
+  s hg18.chr20     56827368 75 +  62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- 
+  s panTro2.chr20  56528685 75 +  62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- 
+  s rheMac2.chr10  89144112 69 -  94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- 
+  s mm8.chr2      173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- 
+  s canFam2.chr24  46551822 67 +  50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C 
+
+  a score=10289.000000
+  s hg18.chr20    56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG 
+
+will be converted to (**note** that the second MAF block, which does not have mm8, is not included in the output)::
+
+  &gt;hg18.chr20(+):56827368-56827443|hg18_0
+  GACAGGGTGCATCTGGGAGGGCCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC
+  &gt;mm8.chr2(+):173910832-173910893|mm8_0
+  AGAAGGATCCACCT---------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------
+
+------
+
+.. class:: infomark
+
+**About formats**
+
+ **MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes. 
+
+ - The .maf format is line-oriented. Each multiple alignment ends with a blank line.
+ - Each sequence in an alignment is on a single line.
+ - Lines starting with # are considered to be comments.
+ - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment.
+ - Some MAF files may contain two optional line types: 
+
+   - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; 
+   - An "e" line containing information about the size of the gap between the alignments that span the current block.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_to_fasta_concat.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_to_fasta_concat.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Read a maf and output a single block fasta file, concatenating blocks
+
+usage: %prog species1,species2 maf_file out_file
+"""
+#Dan Blankenberg
+import sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.align import maf
+from galaxy.tools.util import maf_utilities
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+    try:
+        species = maf_utilities.parse_species_option( sys.argv[1] )
+    except Exception, e:
+        maf_utilities.tool_fail( "Error determining species value: %s" % e )
+    try:
+        input_filename = sys.argv[2]
+    except Exception, e:
+        maf_utilities.tool_fail( "Error reading MAF filename: %s" % e )
+    try:
+        file_out = open( sys.argv[3], 'w' )
+    except Exception, e:
+        maf_utilities.tool_fail( "Error opening file for output: %s" % e )
+    
+    if species:
+        print "Restricted to species: %s" % ', '.join( species )
+    else:
+        print "Not restricted to species."
+    
+    if not species:
+        try:
+            species = maf_utilities.get_species_in_maf( input_filename )
+        except Exception, e:
+            maf_utilities.tool_fail( "Error determining species in input MAF: %s" % e )
+    
+    for spec in species:
+        file_out.write( ">" + spec + "\n" )
+        try:
+            for start_block in maf.Reader( open( input_filename, 'r' ) ):
+                for block in maf_utilities.iter_blocks_split_by_species( start_block ):
+                    block.remove_all_gap_columns() #remove extra gaps
+                    component = block.get_component_by_src_start( spec ) #blocks only have one occurrence of a particular species, so this is safe
+                    if component:
+                        file_out.write( component.text )
+                    else:
+                        file_out.write( "-" * block.text_size )
+        except Exception, e:
+            maf_utilities.tool_fail( "Your MAF file appears to be malformed: %s" % e )
+        file_out.write( "\n" )
+    file_out.close()
+
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_to_fasta_multiple_sets.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_to_fasta_multiple_sets.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+
+"""
+Read a maf and output a multiple block fasta file.
+"""
+#Dan Blankenberg
+import sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.align import maf
+from galaxy.tools.util import maf_utilities
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+    try:
+        maf_reader = maf.Reader( open( sys.argv[1] ) )
+    except Exception, e:
+        maf_utilities.tool_fail( "Error opening input MAF: %s" % e )
+    try:
+        file_out = open( sys.argv[2], 'w' )
+    except Exception, e:
+        maf_utilities.tool_fail( "Error opening file for output: %s" % e )
+    try:
+        species = maf_utilities.parse_species_option( sys.argv[3] )
+        if species:
+            num_species = len( species )
+        else:
+            num_species = 0
+    except Exception, e:
+        maf_utilities.tool_fail( "Error determining species value: %s" % e )
+    try:
+        partial = sys.argv[4]
+    except Exception, e:
+        maf_utilities.tool_fail( "Error determining keep partial value: %s" % e )
+    
+    if species:
+        print "Restricted to species: %s" % ', '.join( species )
+    else:
+        print "Not restricted to species."
+    
+    for block_num, block in enumerate( maf_reader ):
+        if species:
+            block = block.limit_to_species( species )
+            if len( maf_utilities.get_species_in_block( block ) ) < num_species and partial == "partial_disallowed": continue
+        spec_counts = {}
+        for component in block.components:
+            spec, chrom = maf_utilities.src_split( component.src )
+            if spec not in spec_counts:
+                spec_counts[ spec ] = 0
+            else:
+                spec_counts[ spec ] += 1
+            file_out.write( "%s\n" % maf_utilities.get_fasta_header( component, { 'block_index' : block_num, 'species' : spec, 'sequence_index' : spec_counts[ spec ] }, suffix = "%s_%i_%i" % ( spec, block_num, spec_counts[ spec ] ) ) )
+            file_out.write( "%s\n" % component.text )
+        file_out.write( "\n" )
+    file_out.close()
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_to_interval.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_to_interval.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+"""
+Read a maf and output intervals for specified list of species.
+"""
+import sys, os
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.align import maf
+from galaxy.tools.util import maf_utilities
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():    
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    output_id = sys.argv[3]
+    #where to store files that become additional output
+    database_tmp_dir =  sys.argv[4]
+    primary_spec = sys.argv[5]
+    species = sys.argv[6].split( ',' )
+    all_species = sys.argv[7].split( ',' )
+    partial = sys.argv[8]
+    keep_gaps = sys.argv[9]
+    out_files = {}
+    
+    if "None" in species:
+        species = []
+    
+    if primary_spec not in species:
+        species.append( primary_spec )
+    if primary_spec not in all_species:
+        all_species.append( primary_spec )
+    
+    all_species.sort()
+    for spec in species:
+        if spec == primary_spec:
+            out_files[ spec ] = open( output_filename, 'wb+' )
+        else:
+            out_files[ spec ] = open( os.path.join( database_tmp_dir, 'primary_%s_%s_visible_interval_%s' % ( output_id, spec, spec ) ), 'wb+' )
+        out_files[ spec ].write( '#chrom\tstart\tend\tstrand\tscore\tname\t%s\n' % ( '\t'.join( all_species ) ) )
+    num_species = len( all_species )
+    
+    file_in = open( input_filename, 'r' )
+    maf_reader = maf.Reader( file_in )
+    
+    for i, m in enumerate( maf_reader ):
+        for j, block in enumerate( maf_utilities.iter_blocks_split_by_species( m ) ):
+            if len( block.components ) < num_species and partial == "partial_disallowed": continue
+            sequences = {}
+            for c in block.components:
+                spec, chrom = maf_utilities.src_split( c.src )
+                if keep_gaps == 'remove_gaps':
+                    sequences[ spec ] = c.text.replace( '-', '' )
+                else:
+                    sequences[ spec ] = c.text
+            sequences = '\t'.join( [ sequences.get( spec, '' ) for spec in all_species ] )
+            for spec in species:
+                c = block.get_component_by_src_start( spec )
+                if c is not None:
+                    spec2, chrom = maf_utilities.src_split( c.src )
+                    assert spec2 == spec, Exception( 'Species name inconsistency found in component: %s != %s' % ( spec, spec2 ) )
+                    out_files[ spec ].write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( chrom, c.forward_strand_start, c.forward_strand_end, c.strand, m.score, "%s_%s_%s" % (spec, i, j), sequences ) )
+    file_in.close()
+    for file_out in out_files.values():
+        file_out.close()
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/maf/maf_to_interval.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/maf_to_interval.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,133 @@
+<tool id="MAF_To_Interval1" name="MAF to Interval" force_history_refresh="True">
+  <description>Converts a MAF formatted file to the Interval format</description>
+  <command interpreter="python">maf_to_interval.py $input1 $out_file1 $out_file1.id $__new_file_path__ $input1.dbkey $species $input1.metadata.species $complete_blocks $remove_gaps</command>
+  <inputs>
+    <param format="maf" name="input1" type="data" label="MAF file to convert"/>
+    <param name="species" type="select" label="Select additional species" display="checkboxes" multiple="true" help="The species matching the dbkey of the alignment is always included. A separate history item will be created for each species.">
+      <options>
+        <filter type="data_meta" ref="input1" key="species" />
+        <filter type="remove_value" meta_ref="input1" key="dbkey" />
+      </options>
+    </param>
+    <param name="complete_blocks" type="select" label="Exclude blocks which have a species missing">
+      <option value="partial_allowed">include blocks with missing species</option>
+      <option value="partial_disallowed">exclude blocks with missing species</option>
+    </param>
+    <param name="remove_gaps" type="select" label="Remove Gap characters from sequences">
+      <option value="keep_gaps">keep gaps</option>
+      <option value="remove_gaps">remove gaps</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="interval" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="4.maf" dbkey="hg17"/>
+      <param name="complete_blocks" value="partial_disallowed"/>
+      <param name="remove_gaps" value="keep_gaps"/>
+      <param name="species" value="panTro1" />
+      <output name="out_file1" file="maf_to_interval_out_hg17.interval"/>
+      <output name="out_file1" file="maf_to_interval_out_panTro1.interval"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool converts every MAF block to a set of genomic intervals describing the position of that alignment block within a corresponding genome. Sequences from aligning species are also included in the output.
+
+The interface for this tool contains several options: 
+
+ * **MAF file to convert**. Choose multiple alignments from history to be converted to the interval format.
+ * **Choose species**. Choose additional species from the alignment to be included in the output.
+ * **Exclude blocks which have a species missing**. If an alignment block does not contain any one of the species found in the alignment set and this option is set to **exclude blocks with missing species**, then coordinates of such a block **will not** be included in the output (see **Example 2** below).
+ * **Remove Gap characters from sequences**. Gaps can be removed from sequences before they are output.
+
+
+-----
+
+**Example 1**: **Include only reference genome** (hg18 in this case) and **include blocks with missing species**:
+
+For the following alignment::
+
+  ##maf version=1
+  a score=68686.000000
+  s hg18.chr20     56827368 75 +  62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- 
+  s panTro2.chr20  56528685 75 +  62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- 
+  s rheMac2.chr10  89144112 69 -  94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- 
+  s mm8.chr2      173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- 
+  s canFam2.chr24  46551822 67 +  50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C 
+
+  a score=10289.000000
+  s hg18.chr20    56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG 
+
+the tool will create **a single** history item containing the foll
[...]
CACCTCCCAACTCAAATGCCC-------	GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-	GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+  chr20	56827443	56827480	+	10289.0	hg18_1_0		ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG		ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG	ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG
+
+
+-----
+
+**Example 2**: **Include hg18 and mm8** and **exclude blocks with missing species**:
+
+For the following alignment::
+
+  ##maf version=1
+  a score=68686.000000
+  s hg18.chr20     56827368 75 +  62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- 
+  s panTro2.chr20  56528685 75 +  62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- 
+  s rheMac2.chr10  89144112 69 -  94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- 
+  s mm8.chr2      173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- 
+  s canFam2.chr24  46551822 67 +  50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C 
+
+  a score=10289.000000
+  s hg18.chr20    56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG 
+  s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG 
+
+the tool will create **two** history items (one for hg18 and one for mm8) containing the following (**note** that both history items contain only one line describing the first alignment block. The second MAF block is not included in the output because it does not contain mm8):
+
+History item **1** (for hg18)::
+
+   #chrom	start	end	strand	score	name	canFam2	hg18	mm8	panTro2	rheMac2
+   chr20	56827368	56827443	+	68686.0	hg18_0_0	CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C	GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-	AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------	GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-	GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+
+
+History item **2** (for mm8)::
+
+   #chrom	start	end	strand	score	name	canFam2	hg18	mm8	panTro2	rheMac2
+   chr2	173910832	173910893	+	68686.0	mm8_0_0	CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C	GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-	AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------	GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-	GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------
+
+
+-------
+
+.. class:: infomark
+
+**About formats**
+
+**MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes. 
+
+ - The .maf format is line-oriented. Each multiple alignment ends with a blank line.
+ - Each sequence in an alignment is on a single line.
+ - Lines starting with # are considered to be comments.
+ - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment.
+ - Some MAF files may contain two optional line types: 
+
+   - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; 
+   - An "e" line containing information about the size of the gap between the alignments that span the current block.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+
+    </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/maf/vcf_to_maf_customtrack.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/vcf_to_maf_customtrack.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,151 @@
+#Dan Blankenberg
+from optparse import OptionParser
+import sys
+import galaxy_utils.sequence.vcf
+
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.align.maf
+
+UNKNOWN_NUCLEOTIDE = '*'
+
+class PopulationVCFParser( object ):
+    def __init__( self, reader, name ):
+        self.reader = reader
+        self.name = name
+        self.counter = 0
+    def next( self ):
+        rval = []
+        vc = self.reader.next()
+        for i, allele in enumerate( vc.alt ):
+            rval.append( ( '%s_%i.%i' % ( self.name, i + 1, self.counter + 1 ), allele ) )
+        self.counter += 1
+        return ( vc, rval )
+    def __iter__( self ):
+        while True:
+            yield self.next()
+
+class SampleVCFParser( object ):
+    def __init__( self, reader ):
+        self.reader = reader
+        self.counter = 0
+    def next( self ):
+        rval = []
+        vc = self.reader.next()
+        alleles = [ vc.ref ] + vc.alt
+        
+        if 'GT' in vc.format:
+            gt_index = vc.format.index( 'GT' )
+            for sample_name, sample_value in zip( vc.sample_names, vc.sample_values ):
+                gt_indexes = []
+                for i in sample_value[ gt_index ].replace( '|', '/' ).replace( '\\', '/' ).split( '/' ): #Do we need to consider phase here?
+                    try:
+                        gt_indexes.append( int( i ) )
+                    except:
+                        gt_indexes.append( None )
+                for i, allele_i in enumerate( gt_indexes ):
+                    if allele_i is not None:
+                        rval.append( ( '%s_%i.%i' % ( sample_name, i + 1, self.counter + 1 ), alleles[ allele_i ] ) )
+        self.counter += 1
+        return ( vc, rval )
+    def __iter__( self ):
+        while True:
+            yield self.next()
+
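+# A standalone sketch (illustration only, not part of this tool) of the
+# GT handling in SampleVCFParser.next() above: a genotype such as "0/1",
+# "0|1" or "0\1" is normalized to '/'-separated indexes into the allele
+# list [ ref ] + alt, with non-numeric entries (e.g. '.') becoming None:
+#
+#   def parse_gt( gt_value, ref, alt ):
+#       alleles = [ ref ] + alt
+#       calls = []
+#       for i in gt_value.replace( '|', '/' ).replace( '\\', '/' ).split( '/' ):
+#           try:
+#               calls.append( alleles[ int( i ) ] )
+#           except ( ValueError, IndexError ):
+#               calls.append( None )
+#       return calls
+#
+#   parse_gt( '0|1', 'A', [ 'G' ] )  ->  [ 'A', 'G' ]
+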
+def main():
+    usage = "usage: %prog [options] output_file dbkey inputfile pop_name"
+    parser = OptionParser( usage=usage )
+    parser.add_option( "-p", "--population", action="store_true", dest="population", default=False, help="Create MAF on a per population basis")
+    parser.add_option( "-s", "--sample", action="store_true", dest="sample", default=False, help="Create MAF on a per sample basis")
+    parser.add_option( "-n", "--name", dest="name", default='Unknown Custom Track', help="Name for Custom Track")
+    parser.add_option( "-g", "--galaxy", action="store_true", dest="galaxy", default=False, help="Tool is being executed by Galaxy (adds extra error messaging).")
+    
+
+    ( options, args ) = parser.parse_args()
+    
+    if len ( args ) < 3:
+        if options.galaxy:
+            print >>sys.stderr, "It appears that you forgot to specify an input VCF file; click 'Add new VCF...' to add at least one input.\n"
+        parser.error( "Need to specify an output file, a dbkey and at least one input file" )
+    
+    if not ( options.population ^ options.sample ):
+        parser.error( 'You must specify either a per population conversion or a per sample conversion, but not both' )
+    
+    out = open( args.pop(0), 'wb' )
+    out.write( 'track name="%s" visibility=pack\n' %  options.name.replace( "\"", "'" ) )
+    
+    maf_writer = bx.align.maf.Writer( out )
+    
+    dbkey = args.pop(0)
+    
+    vcf_files = []
+    if options.population:
+        i = 0
+        while args:
+            filename = args.pop( 0 )
+            pop_name = args.pop( 0 ).replace( ' ', '_' )
+            if not pop_name:
+                pop_name = 'population_%i' % ( i + 1 )
+            vcf_files.append( PopulationVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ), pop_name  ) )
+            i += 1
+    else:
+        while args:
+            filename = args.pop( 0 )
+            vcf_files.append( SampleVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ) ) )
+    
+    non_spec_skipped = 0
+    for vcf_file in vcf_files:
+        for vc, variants in vcf_file:
+            num_ins = 0
+            num_dels = 0
+            for variant_name, variant_text in variants:
+                if 'D' in variant_text:
+                    num_dels = max( num_dels, int( variant_text[1:] ) )
+                elif 'I' in variant_text:
+                    num_ins = max( num_ins, len( variant_text ) - 1 )
+            
+            alignment = bx.align.maf.Alignment()
+            ref_text = vc.ref + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) )
+            start_pos = vc.pos - 1
+            if num_dels and start_pos:
+                ref_text = UNKNOWN_NUCLEOTIDE + ref_text
+                start_pos -= 1
+            alignment.add_component( bx.align.maf.Component( src='%s.%s%s' % (
+                 dbkey, ("chr" if not vc.chrom.startswith("chr") else ""), vc.chrom ),
+                 start = start_pos, size = len( ref_text.replace( '-', '' ) ),
+                 strand = '+', src_size = start_pos + len( ref_text ),
+                 text = ref_text ) )
+            for variant_name, variant_text in variants:
+                #FIXME:
+                ## skip non-spec. compliant data, see: http://1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 for format spec
+                ## this check is due to data having indels not represented in the published format spec, 
+                ## e.g. 1000 genomes pilot 1 indel data: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/indels/CEU.SRP000031.2010_03.indels.sites.vcf.gz
+                if variant_text and variant_text[0] in [ '-', '+' ]:
+                    non_spec_skipped += 1
+                    continue
+                
+                #do we need a left padding unknown nucleotide (do we have deletions)?
+                if num_dels and start_pos:
+                    var_text = UNKNOWN_NUCLEOTIDE
+                else:
+                    var_text = ''
+                if 'D' in variant_text:
+                    cur_num_del = int( variant_text[1:] )
+                    pre_del = min( len( vc.ref ), cur_num_del )
+                    post_del = cur_num_del - pre_del
+                    var_text = var_text + '-' * pre_del + '-' * num_ins + '-' * post_del
+                    var_text = var_text + UNKNOWN_NUCLEOTIDE * ( len( ref_text ) - len( var_text ) )
+                elif 'I' in variant_text:
+                    cur_num_ins = len( variant_text ) - 1
+                    var_text = var_text + vc.ref + variant_text[1:] + '-' * ( num_ins - cur_num_ins ) + UNKNOWN_NUCLEOTIDE * max( 0, ( num_dels - 1 ) )
+                else:
+                    var_text = var_text + variant_text + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) )    
+                alignment.add_component( bx.align.maf.Component( src=variant_name, start = 0, size = len( var_text.replace( '-', '' ) ), strand = '+', src_size = len( var_text.replace( '-', '' ) ), text = var_text ) )
+            maf_writer.write( alignment )
+
+    maf_writer.close()
+    
+    if non_spec_skipped:
+        print 'Skipped %i non-specification compliant indels.' % non_spec_skipped
+
+if __name__ == "__main__": main()
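
The script above packs every alternate allele of a VCF record into one gapped MAF block, sizing the gap columns from the largest insertion and deletion. A standalone sketch of that padding arithmetic for the VCFv3.3 'D&lt;n&gt;'/'I&lt;seq&gt;' indel notation (inputs are hypothetical; the real tool also left-pads a '*' when a deletion starts past position 0)::

  UNKNOWN_NUCLEOTIDE = '*'

  def pad_alignment(ref, alts):
      # alts: VCFv3.3 alt strings, e.g. 'D4' (delete 4 bases) or 'IGA' (insert GA)
      num_ins = max([len(a) - 1 for a in alts if a.startswith('I')] + [0])
      num_dels = max([int(a[1:]) for a in alts if a.startswith('D')] + [0])
      ref_text = ref + '-' * num_ins + UNKNOWN_NUCLEOTIDE * (num_dels - len(ref))
      rows = [('ref', ref_text)]
      for a in alts:
          if a.startswith('D'):
              pre = min(len(ref), int(a[1:]))
              text = '-' * pre + '-' * num_ins + '-' * (int(a[1:]) - pre)
              text += UNKNOWN_NUCLEOTIDE * (len(ref_text) - len(text))
          elif a.startswith('I'):
              text = ref + a[1:] + '-' * (num_ins - (len(a) - 1)) + UNKNOWN_NUCLEOTIDE * max(0, num_dels - 1)
          else:
              text = a + '-' * num_ins + UNKNOWN_NUCLEOTIDE * (num_dels - len(ref))
          rows.append((a, text))
      return rows

  # The microsat1 record from the help example below: REF=G, ALT=D4,IGA
  for name, text in pad_alignment('G', ['D4', 'IGA']):
      print('%s\t%s' % (name, text))   # -> G--***, ------, GGA***
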
diff -r 000000000000 -r 9071e359b9a3 tools/maf/vcf_to_maf_customtrack.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/vcf_to_maf_customtrack.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,127 @@
+<tool id="vcf_to_maf_customtrack1" name="VCF to MAF Custom Track">
+  <description>for display at UCSC</description>
+  <command interpreter="python">vcf_to_maf_customtrack.py '$out_file1'
+    #if $vcf_source_type.vcf_file
+    '${vcf_source_type.vcf_file[0].vcf_input.dbkey}'
+    #else
+    '?'
+    #end if
+    ${vcf_source_type.vcf_source} -n '$track_name'
+    #for $vcf_repeat in $vcf_source_type.vcf_file
+    '${vcf_repeat.vcf_input}'
+    #if $vcf_source_type.vcf_source == '-p'
+      '${vcf_repeat.population_name}'
+    #end if
+    #end for
+    -g
+  </command>
+  <inputs>
+    <param name="track_name" type="text" label="Custom Track Name" value="Galaxy Custom Track" size="30" />
+    <conditional name="vcf_source_type">
+      <param name="vcf_source" type="select" label="VCF Source Source Type">
+        <option value="-p" selected="true">Per Population (file)</option>
+        <option value="-s">Per Sample</option>
+      </param>
+      <when value="-p">
+        <repeat name="vcf_file" title="VCF population file" min="1">
+          <param format="tabular" name="vcf_input" type="data" label="VCF file"/>
+          <param name="population_name" type="text" label="Name for this population" value=""/>
+        </repeat>
+      </when>
+      <when value="-s">
+        <repeat name="vcf_file" title="VCF sample file" min="1">
+          <param format="tabular" name="vcf_input" type="data" label="VCF file"/>
+          <!-- add column count validator >= 8? -->
+        </repeat>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="mafcustomtrack" name="out_file1" />
+  </outputs>
+<!--  <tests>
+    <test>
+      <param name="track_name" value="Galaxy Custom Track"/>
+      <param name="vcf_source" value="Per Population"/>
+      <param name="vcf_input" value="vcf_to_maf_in.vcf" ftype="tabular"/>
+      <param name="population_name" value=""/>
+      <output name="out_file1" file="vcf_to_maf_population_out.mafcustomtrack"/>
+    </test>
+    <test>
+      <param name="track_name" value="Galaxy Custom Track"/>
+      <param name="vcf_source" value="Per Sample"/>
+      <param name="vcf_input" value="vcf_to_maf_in.vcf" ftype="tabular"/>
+      <output name="out_file1" file="vcf_to_maf_sample_out.mafcustomtrack"/>
+    </test>
+  </tests> -->
+  <help>
+**What it does**
+
+This tool converts a Variant Call Format (VCF) file into a Multiple Alignment Format (MAF) custom track file suitable for display at genome browsers. 
+
+This file should be used for display purposes only (e.g as a UCSC Custom Track). Performing an analysis using the output created by this tool as input is not recommended; the source VCF file should be used when performing an analysis.
+
+*Unknown nucleotides* are represented as '*' as required to allow the display to draw properly; these include e.g. reference bases which appear before a deletion and are not available without querying the original reference sequence.
+
+**Example**
+
+Starting with a VCF::
+
+  ##fileformat=VCFv3.3
+  ##fileDate=20090805
+  ##source=myImputationProgramV3.1
+  ##reference=1000GenomesPilot-NCBI36
+  ##phasing=partial
+  ##INFO=NS,1,Integer,"Number of Samples With Data"
+  ##INFO=DP,1,Integer,"Total Depth"
+  ##INFO=AF,-1,Float,"Allele Frequency"
+  ##INFO=AA,1,String,"Ancestral Allele"
+  ##INFO=DB,0,Flag,"dbSNP membership, build 129"
+  ##INFO=H2,0,Flag,"HapMap2 membership"
+  ##FILTER=q10,"Quality below 10"
+  ##FILTER=s50,"Less than 50% of samples have data"
+  ##FORMAT=GT,1,String,"Genotype"
+  ##FORMAT=GQ,1,Integer,"Genotype Quality"
+  ##FORMAT=DP,1,Integer,"Read Depth"
+  ##FORMAT=HQ,2,Integer,"Haplotype Quality"
+  #CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  NA00001 NA00002 NA00003
+  20  14370   rs6054257   G   A   29  0   NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51  1|0:48:8:51,51  1/1:43:5:-1,-1
+  20  17330   .   T   A   3   q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50  0|1:3:5:65,3    0/0:41:3:-1,-1
+  20  1110696 rs6040355   A   G,T 67  0   NS=2;DP=10;AF=0.333,0.667;AA=T;DB   GT:GQ:DP:HQ 1|2:21:6:23,27  2|1:2:0:18,2    2/2:35:4:-1,-1
+  20  1230237 .   T   .   47  0   NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60  0|0:48:4:51,51  0/0:61:2:-1,-1
+  20  1234567 microsat1   G   D4,IGA  50  0   NS=3;DP=9;AA=G  GT:GQ:DP    0/1:35:4    0/2:17:2    1/1:40:3
+  
+
+
+
+Under the following conditions: **VCF Source type:** *Per Population (file)*, **Name for this population:** *CHB+JPT*
+Results in the following MAF custom track::
+
+  track name="Galaxy Custom Track" visibility=pack
+  ##maf version=1
+  a score=0
+  s hg18.chr20  14369 1 + 14370 G 
+  s CHB+JPT_1.1     0 1 +     1 A 
+  
+  a score=0
+  s hg18.chr20  17329 1 + 17330 T 
+  s CHB+JPT_1.2     0 1 +     1 A 
+  
+  a score=0
+  s hg18.chr20  1110695 1 + 1110696 A 
+  s CHB+JPT_1.3       0 1 +       1 G 
+  s CHB+JPT_2.3       0 1 +       1 T 
+  
+  a score=0
+  s hg18.chr20  1230236 1 + 1230237 T 
+  s CHB+JPT_1.4       0 1 +       1 . 
+  
+  a score=0
+  s hg18.chr20  1234565 5 + 1234572 *G--*** 
+  s CHB+JPT_1.5       0 1 +       1 *------ 
+  s CHB+JPT_2.5       0 7 +       7 *GGA*** 
+  
+
+    </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/meme/._meme.xml
Binary file tools/meme/._meme.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/meme/fimo.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/meme/fimo.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,230 @@
+<tool id="meme_fimo" name="FIMO" version="0.0.1">
+  <description>- Find Individual Motif Occurrences</description>
+  <command interpreter="python">fimo_wrapper.py 'fimo --o "${$html_outfile.files_path}" --verbosity "1"
+
+  #if str( $options_type.options_type_selector ) == 'advanced':
+  --max-seq-length "${options_type.max_seq_length}"
+  --max-stored-scores "${options_type.max_stored_scores }"
+  --motif-pseudo "${options_type.motif_pseudo}"
+  ${options_type.norc}
+  --output-pthresh "${options_type.output_pthresh}"
+
+
+  #for $motif in $options_type.motifs:
+    --motif "${motif.motif}"
+  #end for
+
+  #if str( $options_type.bgfile_type.bgfile_type_selector ) == 'motif-file':
+    --bgfile "motif-file"
+  #elif str( $options_type.bgfile_type.bgfile_type_selector ) == 'bgfile':
+    --bgfile "${options_type.bgfile_type.bgfile}"
+  #end if
+
+  #if str( $options_type.qvalue_type.qvalue_type_selector ) == 'no-qvalue':
+    --no-qvalue
+  #else:
+    --output-qthresh "${options_type.qvalue_type.output_qthresh}"
+  #end if
+  #end if
+
+  "${input_motifs}"
+
+  #if str( $fasta_type.fasta_type_selector ) == 'history':
+    "${fasta_type.input_database}"
+  #else:
+    "${ filter( lambda x: str( x[0] ) == str( $fasta_type.input_database ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][3] }"
+  #end if
+
+  '
+
+  '${html_outfile.files_path}'
+
+  '${html_outfile}'
+
+  '${interval_outfile}'
+
+  '${txt_outfile}'
+
+  '${xml_outfile}'
+
+  '${gff_outfile}'
+
+  </command>
+  <inputs>
+    <param format="memexml" name="input_motifs" type="data" label="'MEME output' formatted file"/>
+
+    <conditional name="fasta_type">
+      <param name="fasta_type_selector" type="select" label="Source for sequence to search">
+        <option value="cached">Locally Cached sequences</option>
+        <option value="history" selected="true">Sequences from your history</option>
+      </param>
+      <when value="cached">
+        <param name="input_database" type="select" label="Genome to search">
+          <options from_data_table="all_fasta">
+          </options>
+        </param>
+      </when>
+      <when value="history">
+         <param format="fasta" name="input_database" type="data" label="Sequences"/>
+      </when>
+    </conditional>
+
+      <conditional name="options_type">
+        <param name="options_type_selector" type="select" label="Options Configuration">
+          <option value="basic" selected="true">Basic</option>
+          <option value="advanced">Advanced</option>
+        </param>
+        <when value="basic">
+          <!-- do nothing here -->
+        </when>
+        <when value="advanced">
+
+    <conditional name="bgfile_type">
+      <param name="bgfile_type_selector" type="select" label="Background file type">
+        <option value="motif-file">Use Frequencies from Motif File</option>
+        <option value="default" selected="true">Use frequencies from non-redundant database (default)</option>
+        <option value="bgfile">Use Frequencies from Background File</option>
+      </param>
+      <when value="motif-file">
+      <!-- do nothing here -->
+      </when>
+      <when value="default">
+      <!-- do nothing here -->
+      </when>
+      <when value="bgfile">
+        <param name="bgfile" type="data" format="txt" optional="True" label="Background Model" />
+      </when>
+    </conditional>
+
+    <repeat name="motifs" title="Limit to specified motif">
+      <param name="motif" type="text" value="" label="Specify motif by id" />
+    </repeat>
+
+    <param name="max_seq_length" type="integer" value="250000000" label="Maximum input sequence length" />
+    <param name="max_stored_scores" type="integer" value="100000" label="Maximum score count to store" />
+    <param name="motif_pseudo" type="float" value="0.1" label="Pseudocount to add to counts in motif matrix" />
+    <param name="norc" label="Do no
[...]
umn="0" value="seq" keep="True"/>
+                <filter type="param_value" ref="fasta_type.input_database" column="1"/>
+              </option>
+            </action>
+          </when>
+        </conditional>
+      </actions>
+    </data>
+    <data format="tabular" name="txt_outfile" label="${tool.name} on ${on_string} (text)">
+      <actions>
+        <conditional name="fasta_type.fasta_type_selector">
+          <when value="cached">
+            <action type="metadata" name="dbkey">
+              <option type="from_data_table" name="all_fasta" column="1" offset="0">
+                <filter type="param_value" ref="fasta_type.input_database" column="0"/>
+              </option>
+            </action>
+          </when>
+        </conditional>
+      </actions>
+    </data>
+    <data format="tabular" name="gff_outfile" label="${tool.name} on ${on_string} (almost-gff)">
+      <actions>
+        <conditional name="fasta_type.fasta_type_selector">
+          <when value="cached">
+            <action type="metadata" name="dbkey">
+              <option type="from_data_table" name="all_fasta" column="1" offset="0">
+                <filter type="param_value" ref="fasta_type.input_database" column="0"/>
+              </option>
+            </action>
+          </when>
+        </conditional>
+      </actions>
+    </data>
+    <data format="cisml" name="xml_outfile" label="${tool.name} on ${on_string} (xml)">
+      <actions>
+        <conditional name="fasta_type.fasta_type_selector">
+          <when value="cached">
+            <action type="metadata" name="dbkey">
+              <option type="from_data_table" name="all_fasta" column="1" offset="0">
+                <filter type="param_value" ref="fasta_type.input_database" column="0"/>
+              </option>
+            </action>
+          </when>
+        </conditional>
+      </actions>
+    </data>
+    <data format="interval" name="interval_outfile" label="${tool.name} on ${on_string} (interval)">
+      <actions>
+        <conditional name="fasta_type.fasta_type_selector">
+          <when value="cached">
+            <action type="metadata" name="dbkey">
+              <option type="from_data_table" name="all_fasta" column="1" offset="0">
+                <filter type="param_value" ref="fasta_type.input_database" column="0"/>
+              </option>
+            </action>
+          </when>
+        </conditional>
+      </actions>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_motifs" value="meme/meme/meme_output_xml_1.xml" ftype="memexml"/>
+      <param name="fasta_type_selector" value="history"/>
+      <param name="input_database" value="phiX.fasta" ftype="fasta"/>
+      <param name="options_type_selector" value="basic"/>
+      <param name="non_commercial_use" value="True"/>
+      <output name="html_outfile" file="meme/fimo/fimo_output_html_1.html" lines_diff="12"/>
+      <output name="txt_outfile" file="meme/fimo/fimo_output_txt_1.txt" lines_diff="0"/>
+      <output name="gff_outfile" file="meme/fimo/fimo_output_almost-gff_1.txt" lines_diff="0"/>
+      <output name="xml_outfile" file="meme/fimo/fimo_output_xml_1.xml" lines_diff="8"/>
+      <output name="interval_outfile" file="meme/fimo/fimo_output_interval_1.txt" lines_diff="0"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.**
+
+.. class:: infomark
+
+**To cite FIMO:**
+`Grant CE, Bailey TL, Noble WS. FIMO: scanning for occurrences of a given motif. Bioinformatics. 2011 Apr 1;27(7):1017-8. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21330290&gt;`_
+
+
+For detailed information on FIMO, click here_. To view the license_.
+
+.. _here: http://meme.nbcr.net/meme/fimo-intro.html
+.. _license: http://meme.nbcr.net/meme/COPYRIGHT.html
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/meme/fimo_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/meme/fimo_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+
+"""
+Read text output from FIMO and create an interval file.
+"""
+import sys, tempfile, subprocess, shutil, os
+from galaxy_utils.sequence.transform import DNA_reverse_complement
+
+buffsize = 1048576
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():
+    assert len( sys.argv ) == 8, "Wrong number of arguments"
+    sys.argv.pop(0)
+    fimo_cmd = sys.argv.pop(0)
+    html_path = sys.argv.pop(0)
+    html_out = sys.argv.pop(0)
+    interval_out = sys.argv.pop(0)
+    txt_out = sys.argv.pop(0)
+    xml_out = sys.argv.pop(0)
+    gff_out = sys.argv.pop(0)
+    
+    #run fimo
+    try:
+        tmp_stderr = tempfile.NamedTemporaryFile()
+        #tmp_stderr = open( tmp_filename, 'wb' )
+        proc = subprocess.Popen( args=fimo_cmd, shell=True, stderr=tmp_stderr )
+        returncode = proc.wait()
+        #tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        #tmp_stderr = open( tmp, 'rb' )
+        tmp_stderr.seek(0)
+        stderr = ''
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        raise Exception, 'Error running FIMO:\n' + str( e )
+
+    shutil.move( os.path.join( html_path, 'fimo.txt' ), txt_out )
+    shutil.move( os.path.join( html_path, 'fimo.gff' ), gff_out )
+    shutil.move( os.path.join( html_path, 'fimo.xml' ), xml_out )
+    shutil.move( os.path.join( html_path, 'fimo.html' ), html_out )
+    
+    out_file = open( interval_out, 'wb' )
+    out_file.write( "#%s\n" % "\t".join( ( "chr", "start", "end", "pattern name", "score", "strand", "matched sequence", "p-value", "q-value" ) ) )
+    for line in open( txt_out ):
+        if line.startswith( '#' ): continue
+        fields = line.rstrip( "\n\r" ).split( "\t" )
+        start, end = int( fields[2] ), int( fields[3] )
+        sequence = fields[7]
+        if start > end:
+            start, end = end, start #flip start and end, and set strand
+            strand = "-"
+            sequence = DNA_reverse_complement( sequence ) #we want sequences relative to strand; FIMO always provides + stranded sequence
+        else:
+            strand = "+"
+        start -= 1 #make 0-based start position
+        out_file.write( "%s\n" % "\t".join( [ fields[1], str( start ), str( end ), fields[0], fields[4], strand, sequence, fields[5], fields[6] ] ) )
+    out_file.close()
+
+if __name__ == "__main__": main()
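
The start/end flip above is the whole coordinate story: FIMO reports minus-strand hits with start greater than end and always gives plus-strand sequence. A minimal sketch of that normalisation, using a simple stand-in for galaxy_utils' DNA_reverse_complement::

  def revcomp(s):
      comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
              'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}
      return ''.join(comp.get(c, 'N') for c in reversed(s))

  def fimo_hit_to_interval(chrom, start, end, sequence):
      if start > end:
          start, end = end, start          # flip coordinates, note the strand
          strand = '-'
          sequence = revcomp(sequence)     # report sequence relative to its strand
      else:
          strand = '+'
      return (chrom, start - 1, end, strand, sequence)  # 0-based, half-open

  print(fimo_hit_to_interval('chr1', 105, 101, 'ACGTT'))
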
diff -r 000000000000 -r 9071e359b9a3 tools/meme/meme.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/meme/meme.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,349 @@
+<tool id="meme_meme" name="MEME" version="1.0.0">
+  <requirements><requirement type='package'>meme</requirement></requirements>
+  <description>- Multiple Em for Motif Elicitation</description>
+  <command>meme "$input1" -o "${html_outfile.files_path}"
+  -nostatus
+
+  ##-p 8 ##number of processors
+
+  #if str( $options_type.options_type_selector ) == 'advanced':
+  -sf "${ str( $options_type.sf ).replace( ' ', '_' ) }"
+  -${options_type.alphabet_type.alphabet_type_selector}
+  -mod "${options_type.mod_type.mod_type_selector}"
+  -nmotifs "${options_type.nmotifs}"
+  -wnsites "${options_type.wnsites}"
+  -maxsize "${options_type.maxsize}"
+
+  #if $options_type.evt &lt; float('inf'):
+    -evt "${options_type.evt}"
+  #end if
+
+  #if str( $options_type.mod_type.mod_type_selector ) != 'oops':
+    #if str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'nsites':
+      -nsites "${options_type.mod_type.motif_occurrence_type.nsites}"
+    #elif str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'min_max_sites':
+      -minsites "${options_type.mod_type.motif_occurrence_type.minsites}" -maxsites "${options_type.mod_type.motif_occurrence_type.maxsites}"
+    #end if
+  #end if
+
+  #if str( $options_type.motif_width_type.motif_width_type_selector ) == 'exact':
+    -w "${options_type.motif_width_type.width}"
+  #else
+    -minw "${options_type.motif_width_type.minw}" -maxw "${options_type.motif_width_type.maxw}"
+  #end if
+
+  #if str( $options_type.motif_trim_type.motif_trim_type_selector ) == 'nomatrim':
+    -nomatrim
+  #else
+    -wg "${options_type.motif_trim_type.wg}" -ws "${options_type.motif_trim_type.ws}" ${options_type.motif_trim_type.noendgaps}
+  #end if
+
+  #if str( $options_type.bfile ) != 'None':
+    -bfile "${options_type.bfile}"
+  #end if
+
+  #if str( $options_type.pspfile ) != 'None':
+    -psp "${options_type.pspfile}"
+  #end if
+
+  #if str( $options_type.alphabet_type.alphabet_type_selector ) == "dna":
+    ${options_type.alphabet_type.revcomp} ${options_type.alphabet_type.pal}
+  #end if
+
+  -maxiter "${options_type.maxiter}" -distance "${options_type.distance}"
+
+  -prior "${options_type.alphabet_type.prior_type.prior_type_selector}"
+  #if str( $options_type.alphabet_type.prior_type.prior_type_selector ) != 'addone':
+    -b "${options_type.alphabet_type.prior_type.prior_b}"
+    #if str( $options_type.alphabet_type.prior_type.plib ) != 'None':
+      -plib "${options_type.alphabet_type.prior_type.plib}"
+    #end if
+  #end if
+
+  #if str( $options_type.alphabet_type.spmap_type.spmap_type_selector ) == 'cons':
+    -cons "${options_type.alphabet_type.spmap_type.cons}"
+  #else
+    -spmap "${options_type.alphabet_type.spmap_type.spmap_type_selector}"
+    -spfuzz "${options_type.alphabet_type.spmap_type.spfuzz}"
+  #end if
+
+  #if str( $options_type.branching_type.branching_type_selector ) == 'x_branch':
+    -x_branch -bfactor "${options_type.branching_type.bfactor}" -heapsize "${options_type.branching_type.heapsize}"
+  #end if
+
+  ##-maxsize "1000000" ##remove hardcoded maxsize? should increase number of processors instead
+
+  #end if
+
+  2&gt;&amp;1 || echo "Error running MEME."
+
+
+  &amp;&amp; mv ${html_outfile.files_path}/meme.html ${html_outfile}
+
+  &amp;&amp; mv ${html_outfile.files_path}/meme.txt ${txt_outfile}
+
+  &amp;&amp; mv ${html_outfile.files_path}/meme.xml ${xml_outfile}
+
+  </command>
+  <inputs>
+    <param format="fasta" name="input1" type="data" label="Sequences"/>
+
+      <conditional name="options_type">
+        <param name="options_type_selector" type="select" label="Options Configuration">
+          <option value="basic" selected="true">Basic</option>
+          <option value="advanced">Advanced</option>
+        </param>
+        <when value="basic">
+          <!-- do nothing here -->
+        </when>
+
[...]
    <param name="maxw" type="integer" value="50" label="Max width of motif to search" />
+        </when>
+      </conditional>
+
+      <conditional name="motif_trim_type">
+        <param name="motif_trim_type_selector" type="select" label="Motif trim type">
+          <option value="nomatrim">No motif trim</option>
+          <option value="trim" selected="true">Trim motif</option>
+        </param>
+        <when value="nomatrim">
+          <!-- no values here -->
+        </when>
+        <when value="trim">
+          <param name="wg" type="integer" value="11" label="Gap cost" />
+          <param name="ws" type="integer" value="1" label="Space cost" />
+          <param name="noendgaps" label="Do not penalize endgaps" type="boolean" truevalue="-noendgaps" falsevalue="" checked="False"/>
+        </when>
+      </conditional>
+
+    <param name="bfile" type="data" format="txt" optional="True" label="Background Model" />
+    <param name="pspfile" type="data" format="txt" optional="True" label="Position-Specific Prior" />
+
+    <param name="maxiter" type="integer" value="50" label="Number of iterations of EM to run" />
+    <param name="distance" type="float" value="0.001" label="Convergence criterion" />
+
+      <conditional name="branching_type">
+        <param name="branching_type_selector" type="select" label="x-branching type">
+          <option value="x_branch">Perform x-branching</option>
+          <option value="no_x_branch" selected="true">No x-branching</option>
+        </param>
+        <when value="no_x_branch">
+          <!-- no values here -->
+        </when>
+        <when value="x_branch">
+          <param name="bfactor" type="integer" value="3" label="Number of iterations of branching" />
+          <param name="heapsize" type="integer" value="64" label="Maximum number of heaps to use" />
+        </when>
+      </conditional>
+
+    </when>
+  </conditional>
+
+  <param name="non_commercial_use" label="I certify that I am not using this tool for commercial purposes." type="boolean" truevalue="NON_COMMERCIAL_USE" falsevalue="COMMERCIAL_USE" checked="False">
+    <validator type="expression" message="This tool is only available for non-commercial use.">value == True</validator>
+  </param>
+
+  </inputs>
+  <outputs>
+    <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/>
+    <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (text)"/>
+    <data format="memexml" name="xml_outfile" label="${tool.name} on ${on_string} (xml)"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="meme/meme/meme_input_1.fasta" ftype="fasta" dbkey="hg19"/>
+      <param name="options_type_selector" value="basic"/>
+      <param name="non_commercial_use" value="True"/>
+      <output name="html_outfile" file="meme/meme/meme_output_html_1.html" lines_diff="12"/>
+      <output name="txt_outfile" file="meme/meme/meme_output_txt_1.txt" lines_diff="12"/>
+      <output name="xml_outfile" file="meme/meme/meme_output_xml_1.xml" lines_diff="8"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.**
+
+If you want to specify sequence weights, you must include them at the top of your input FASTA file.
+
+.. class:: infomark
+
+**To cite MEME:**
+Timothy L. Bailey and Charles Elkan, "Fitting a mixture model by expectation maximization to discover motifs in biopolymers", Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology, pp. 28-36, AAAI Press, Menlo Park, California, 1994.
+
+
+For detailed information on MEME, click here_. To view the license_.
+
+.. _here: http://meme.nbcr.net/meme/meme-intro.html
+.. _license: http://meme.nbcr.net/meme/COPYRIGHT.html
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/blat_coverage_report.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/blat_coverage_report.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+
+import os, sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def reverse_complement(s):
+    complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."}
+    reversed_s = []
+    for i in s:
+        reversed_s.append(complement_dna[i])
+    reversed_s.reverse()
+    return "".join(reversed_s)
+
+def __main__():
+    nuc_index = {'a':0,'t':1,'c':2,'g':3}
+    diff_hash = {}    # key = (chrom, index)
+    infile = sys.argv[1]
+    outfile = sys.argv[2]
+    invalid_lines = 0
+    invalid_chars = 0
+    data_id = ''
+    data_seq = ''
+
+    for i, line in enumerate( open( infile ) ):
+        line = line.rstrip( '\r\n' )
+        if not line or line.startswith( '#' ):
+            continue
+        fields = line.split()
+        if len(fields) != 23:    # standard number of pslx columns
+            invalid_lines += 1
+            continue
+        if not fields[0].isdigit():
+            invalid_lines += 1
+            continue
+        read_id = fields[9]
+        chrom = fields[13]
+        try:
+            block_count = int(fields[17])
+        except:
+            invalid_lines += 1
+            continue
+        block_size = fields[18].split(',')
+        read_start = fields[19].split(',')
+        chrom_start = fields[20].split(',')
+        read_seq = fields[21].split(',')
+        chrom_seq = fields[22].split(',')
+
+        for j in range(block_count):
+            try:
+                this_block_size = int(block_size[j])
+                this_read_start = int(read_start[j])
+                this_chrom_start = int(chrom_start[j])
+            except:
+                invalid_lines += 1
+                break
+            this_read_seq = read_seq[j]
+            this_chrom_seq = chrom_seq[j]
+            
+            if not this_read_seq.isalpha():
+                continue
+            if not this_chrom_seq.isalpha():
+                continue
+            
+            # brute force check of coverage
+            for k in range(this_block_size):
+                cur_index = this_chrom_start+k
+                sub_a = this_read_seq[k:(k+1)].lower()
+                sub_b = this_chrom_seq[k:(k+1)].lower()
+                if not diff_hash.has_key((chrom, cur_index)):
+                    try:
+                        diff_hash[(chrom, cur_index)] = [0,0,0,0,sub_b.upper()]    # a, t, c, g, ref. nuc.
+                    except Exception, e:
+                        stop_err( str( e ) )
+                if sub_a in ['a','t','c','g']:
+                    diff_hash[(chrom, cur_index)][nuc_index[(sub_a)]] += 1
+                else:
+                    invalid_chars += 1
+                        
+    outputfh = open(outfile, 'w')
+    outputfh.write( "##title\tlocation\tref.\tcov.\tA\tT\tC\tG\n" )
+    keys = diff_hash.keys()
+    keys.sort()
+    for i in keys:
+        (chrom, location) = i
+        sum = diff_hash[ (i) ][ 0 ] + diff_hash[ ( i ) ][ 1 ] + diff_hash[ ( i ) ][ 2 ] + diff_hash[ ( i ) ][ 3 ]    # did not include N's
+        if sum == 0:
+            continue
+        ratio_A = diff_hash[ ( i ) ][ 0 ] * 100.0 / sum
+        ratio_T = diff_hash[ ( i ) ][ 1 ] * 100.0 / sum
+        ratio_C = diff_hash[ ( i ) ][ 2 ] * 100.0 / sum
+        ratio_G = diff_hash[ ( i ) ][ 3 ] * 100.0 / sum
+        (title_head, title_tail) = os.path.split(chrom)
+        result = "%s\t%s\t%s\t%d\tA(%0.0f)\tT(%0.0f)\tC(%0.0f)\tG(%0.0f)\n" % ( title_tail, location, diff_hash[(i)][4], sum, ratio_A, ratio_T, ratio_C, ratio_G ) 
+        outputfh.write(result)
+    outputfh.close()
+
+    if invalid_lines:
+        print 'Skipped %d invalid lines. ' % ( invalid_lines )
+    if invalid_chars:
+        print 'Skipped %d invalid characters in the alignment. ' % (invalid_chars)
+        
+if __name__ == '__main__': __main__()
\ No newline at end of file
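
A condensed sketch of the per-position tally above (hypothetical block values standing in for fields split out of a pslx row)::

  def tally_block(diff_hash, chrom, chrom_start, read_seq, chrom_seq):
      nuc_index = {'a': 0, 't': 1, 'c': 2, 'g': 3}
      for k, (a, b) in enumerate(zip(read_seq.lower(), chrom_seq.lower())):
          key = (chrom, chrom_start + k)
          if key not in diff_hash:
              diff_hash[key] = [0, 0, 0, 0, b.upper()]  # A,T,C,G counts + ref base
          if a in nuc_index:
              diff_hash[key][nuc_index[a]] += 1

  counts = {}
  tally_block(counts, 'chr', 614777, 'acg', 'acg')
  tally_block(counts, 'chr', 614777, 'tcg', 'acg')
  print(sorted(counts.items()))  # position 614777 now has 1 A and 1 T
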
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/blat_coverage_report.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/blat_coverage_report.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,75 @@
+<tool id="generate_coverage_report" name="Polymorphism of the Reads">
+ <description>the percentage of reads supporting each nucleotide at each location</description>
+ <command interpreter="python">blat_coverage_report.py $input1 $output1</command>
+ <inputs>
+ <param name="input1" type="data" format="tabular" label="Alignment result"/>
+ </inputs>
+ <outputs>
+ <data name="output1" format="tabular"/>
+ </outputs> 
+ <tests>
+ <test>
+ <param name="input1" value="blat_coverage_report_test1.txt" ftype="tabular" />
+ <output name="output1" file="blat_coverage_report_test1.out" />
+ </test>
+ </tests>
+ <help>
+
+.. class:: warningmark
+
+**IMPORTANT**. This tool only works with BLAT **standard** or **pslx** output formats (hint: to produce pslx output, add **-out=pslx** to the BLAT command).
+
+-----
+
+**What it does**
+
+ The tool generates a table of 8 columns, as follows:
+
+- 1st column: chromosome id.
+
+- 2nd column: chromosome location.
+
+- 3rd column: the nucleotide from reference genome at the chromosome location (2nd column).
+
+- 4th column: total coverage of the reads (number of reads that were mapped to the chromosome location).
+
+- 5th column: percentage of reads that support nucleotide **A** at this location.
+
+- 6th column: percentage of reads that support nucleotide **T** at this location.
+
+- 7th column: percentage of reads that support nucleotide **C** at this location.
+
+- 8th column: percentage of reads that support nucleotide **G** at this location.
+
+
+-----
+
+**Example**
+
+- The BLAT pslx results look like the following (tab separated with sequence at the end)::
+
+ 30 0 0 0 0 0 0 0 + seq0 30 0 30 chr 4639675 4549207 4549237 1 30, 0, 4549207, cggacagcgccgccaccaacaaagccacca, cggacagcgccgccaccaacaaagccacca,
+ 30 0 0 0 0 0 0 0 + seq1 30 0 30 chr 4639675 614777 614807 1 30, 0, 614777, aaaacaccggatgctccggcgctggcagat, aaaacaccggatgctccggcgctggcagat,
+ 28 1 0 0 0 0 0 0 + seq2 30 0 29 chr 4639675 3289283 3289312 1 29, 0, 3289283, tttgcttttagtacaccggattcagaacc, tttgctttcagtacaccggattcagaacc,
+ 30 0 0 0 0 0 0 0 + seq4 30 0 30 chr 4639675 2665584 2665614 1 30, 0, 2665584, cacgctacgtgcgcccccgcccagaaggcg, cacgctacgtgcgcccccgcccagaaggcg,
+
+ The 14th column is the chromosome id, and the 16th and 17th columns show the chromosome start and end positions to which the reads were mapped.
+
+- The report shows the overall coverage of reads at each chromosome location (partial result)::
+
+   +-------+----------+------+------+--------+------+--------+------+
+   | title | location | ref. | cov. |   A    |  T   |   C    |  G   |
+   +-------+----------+------+------+--------+------+--------+------+
+   |  chr  |   614777 |  A   |  1   | A(100) | T(0) | C(0)   | G(0) |
+   |  chr  |   614778 |  A   |  1   | A(100) | T(0) | C(0)   | G(0) |
+   |  chr  |   614779 |  A   |  1   | A(100) | T(0) | C(0)   | G(0) |
+   +-------+----------+------+------+--------+------+--------+------+
+
+-----
+
+**Reference**
+
+ **BLAT**: Kent, W James, BLAT--the BLAST-like alignment tool. (2002) Genome Research:12(4) 656-664.
+
+ </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/blat_mapping.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/blat_mapping.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+import os, sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def reverse_complement(s):
+    complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."}
+    reversed_s = []
+    for i in s:
+        reversed_s.append(complement_dna[i])
+    reversed_s.reverse()
+    return "".join(reversed_s)
+    
+def __main__():
+    nuc_index = {'a':0,'t':1,'c':2,'g':3,'n':4}
+    coverage = {}        # key = (chrom, index)
+    invalid_lines = 0
+    invalid_chrom = 0
+    infile = sys.argv[1]
+    outfile = sys.argv[2]
+
+    for i, line in enumerate( open( infile ) ):
+        line = line.rstrip('\r\n')
+        if not line or line.startswith('#'):
+            continue
+        fields = line.split()
+        if len(fields) < 21:                # standard number of pslx columns
+            invalid_lines += 1
+            continue 
+        if not fields[0].isdigit():
+            invalid_lines += 1
+            continue
+        chrom = fields[13]
+        if not chrom.startswith( 'chr' ):
+            invalid_lines += 1
+            invalid_chrom += 1
+            continue
+        try:
+            block_count = int(fields[17])
+        except:
+            invalid_lines += 1
+            continue
+        block_size = fields[18].split(',')
+        chrom_start = fields[20].split(',')
+
+        for j in range( block_count ):
+            try:
+                this_block_size = int(block_size[j])
+                this_chrom_start = int(chrom_start[j])
+            except:
+                invalid_lines += 1
+                break
+            # brute force coverage
+            for k in range( this_block_size ):
+                cur_index = this_chrom_start + k
+                if coverage.has_key( ( chrom, cur_index ) ):
+                    coverage[(chrom, cur_index)] += 1
+                else:
+                    coverage[(chrom, cur_index)] = 1
+                
+    # generate a index file
+    outputfh = open(outfile, 'w')
+    keys = coverage.keys()
+    keys.sort()
+    previous_chrom = ''
+    for i in keys:
+        (chrom, location) = i
+        sum = coverage[(i)]
+        if chrom != previous_chrom:
+            outputfh.write( 'variableStep chrom=%s\n' % ( chrom ) )
+            previous_chrom = chrom
+        outputfh.write( "%s\t%s\n" % ( location, sum ) )
+    outputfh.close()
+    
+    if invalid_lines:
+        invalid_msg = "Skipped %d invalid lines" % invalid_lines
+        if invalid_chrom:
+            invalid_msg += ", including %d lines with chrom id errors; chrom ids must begin with 'chr' to map correctly to the UCSC Genome Browser." % invalid_chrom
+        print invalid_msg
+
+if __name__ == '__main__': __main__()
\ No newline at end of file
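
The output step above relies only on sorting the (chrom, position) keys: each chromosome change starts a new variableStep section. A self-contained sketch with made-up counts::

  import sys

  def write_wiggle(coverage, out):
      previous_chrom = None
      for chrom, pos in sorted(coverage):
          if chrom != previous_chrom:
              out.write('variableStep chrom=%s\n' % chrom)
              previous_chrom = chrom
          out.write('%d\t%d\n' % (pos, coverage[(chrom, pos)]))

  write_wiggle({('chr22', 100): 3, ('chr22', 101): 4, ('chr21', 5): 1}, sys.stdout)
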
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/blat_mapping.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/blat_mapping.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<tool id="blat2wig" name="Coverage of the Reads">
+  <description>in wiggle format</description>
+  <command interpreter="python">blat_mapping.py $input1 $output1</command>
+  <inputs>
+    <param name="input1" type="data" format="tabular" label="Alignment result"/>
+  </inputs>
+  <outputs>
+    <data name="output1" format="wig"/>
+  </outputs> 
+  <tests>
+    <test>
+      <param name="input1" value="blat_mapping_test1.txt" ftype="tabular" />
+      <output name="output1" file="blat_mapping_test1.out" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+ To generate acceptable files, please use the alignment program **BLAT** with the option **-out=pslx**.
+
+.. class:: warningmark
+
+ Please edit the database information by clicking the pencil icon next to your dataset, and select the corresponding genome build.
+
+-----
+
+**What it does**
+
+ This tool takes **BLAT pslx** output and returns a wig-like file showing the number of reads (coverage) mapped at each chromosome location. Use the **Graph/Display Data --> Build custom track** tool to display the coverage in the UCSC Genome Browser.
+
+-----
+
+**Example**
+
+ Showing reads coverage on human chromosome 22 (partial result) in UCSC Genome Browser Custom Track:
+
+ .. image:: ./static/images/blat_mapping_example.png
+  :width: 600
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/blat_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/blat_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+
+import os, sys, tempfile
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+    
+def check_nib_file( dbkey, GALAXY_DATA_INDEX_DIR ):
+    nib_file = "%s/alignseq.loc" % GALAXY_DATA_INDEX_DIR
+    nib_path = ''
+    nibs = {}
+    for i, line in enumerate( file( nib_file ) ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( "#" ):
+            fields = line.split( '\t' )
+            if len( fields ) < 3:
+                continue
+            if fields[0] == 'seq':
+                nibs[( fields[1] )] = fields[2]
+    if nibs.has_key( dbkey ):
+        nib_path = nibs[( dbkey )]
+    return nib_path
+
+def check_twobit_file( dbkey, GALAXY_DATA_INDEX_DIR ):
+    twobit_file = "%s/twobit.loc" % GALAXY_DATA_INDEX_DIR
+    twobit_path = ''
+    twobits = {}
+    for i, line in enumerate( file( twobit_file ) ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( "#" ): 
+            fields = line.split( '\t' )
+            if len( fields ) < 2:
+                continue
+            twobits[( fields[0] )] = fields[1]
+    if twobits.has_key( dbkey ):
+        twobit_path = twobits[( dbkey )]
+    return twobit_path
+
+def __main__():
+    # I/O
+    source_format = sys.argv[1]        # 0: dbkey; 1: upload file
+    target_file = sys.argv[2]
+    query_file = sys.argv[3]
+    output_file = sys.argv[4]
+    min_iden = sys.argv[5]
+    tile_size = sys.argv[6]
+    one_off = sys.argv[7]
+    
+    try:
+        float(min_iden)    
+    except:
+        stop_err('Invalid value for minimal identity.')
+    
+    try:  
+        test = int(tile_size)
+        assert test >= 6 and test <= 18
+    except:
+        stop_err('Invalid value for tile size. DNA word size must be between 6 and 18.')
+        
+    try:
+        test = int(one_off)
+        assert test >= 0 and test <= int(tile_size)
+    except:
+        stop_err('Invalid value for mismatch numbers in the word')
+        
+    GALAXY_DATA_INDEX_DIR = sys.argv[8]
+
+    all_files = []
+    if source_format == '0':
+        # check target genome
+        dbkey = target_file
+        nib_path = check_nib_file( dbkey, GALAXY_DATA_INDEX_DIR )
+        twobit_path = check_twobit_file( dbkey, GALAXY_DATA_INDEX_DIR )
+        if not os.path.exists( nib_path ) and not os.path.exists( twobit_path ):
+            stop_err("No sequences are available for %s, request them by reporting this error." % dbkey)
+    
+        # check the query file, see whether all of them are legitimate sequence
+        if nib_path and os.path.isdir( nib_path ):
+            compress_files = os.listdir(nib_path)
+            target_path = nib_path
+        elif twobit_path:
+            compress_files = [twobit_path]
+            target_path = ""
+        else:
+            stop_err("Requested genome build has no available sequence.")
+            
+        for file in compress_files:
+            file = "%s/%s" % ( target_path, file )
+            file = os.path.normpath(file)
+            all_files.append(file)
+    else:
+        all_files = [target_file]
+        
+    for detail_file_path in all_files:
+        output_tempfile = tempfile.NamedTemporaryFile().name
+        command = "blat %s %s %s -oneOff=%s -tileSize=%s -minIdentity=%s -mask=lower -noHead -out=pslx 2>&1" % ( detail_file_path, query_file, output_tempfile, one_off, tile_size, min_iden )
+        os.system( command )
+        os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) )
+        os.remove( output_tempfile )
+        
+if __name__ == '__main__': __main__()
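
Not part of the changeset: the same BLAT invocation sketched with subprocess.check_call instead of os.system, so a non-zero exit status raises an error instead of passing silently (all paths and values here are hypothetical)::

  import subprocess

  def run_blat(target, query, out_pslx, one_off=0, tile_size=11, min_iden=90.0):
      # mirrors the flags assembled by the wrapper above
      subprocess.check_call(['blat', target, query, out_pslx,
                             '-oneOff=%d' % one_off,
                             '-tileSize=%d' % tile_size,
                             '-minIdentity=%s' % min_iden,
                             '-mask=lower', '-noHead', '-out=pslx'])

  run_blat('genome.2bit', 'reads.fa', 'reads.pslx')
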
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/blat_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/blat_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,99 @@
+<tool id="blat_wrapper" name="BLAT" version="1.0.0">
+  <description> compare sequencing reads against UCSC genome builds</description>
+  <command interpreter="python">
+    #if $source.source_select=="database" #blat_wrapper.py 0 $source.dbkey $input_query $output1 $iden $tile_size $one_off
+    #else                                 #blat_wrapper.py 1 $source.input_target $input_query $output1 $iden $tile_size $one_off
+    #end if# ${GALAXY_DATA_INDEX_DIR}
+  </command>
+ <inputs>
+ <conditional name="source">
+ <param name="source_select" type="select" label="Target source">
+ <option value="database">Genome Build</option>
+ <option value="input_ref">Your Upload File</option>
+ </param>
+ <when value="database">
+ <param name="dbkey" type="genomebuild" label="Genome" />
+ </when>
+ <when value="input_ref">
+ <param name="input_target" type="data" format="fasta" label="Reference sequence" />
+  </when>
+ </conditional>
+ <param name="input_query" type="data" format="fasta" label="Sequence file"/>
+ <param name="iden" type="float" size="15" value="90.0" label="Minimal identity (-minIdentity)" />
+ <param name="tile_size" type="integer" size="15" value="11" label="Minimal size of exact match (-tileSize)" help="Must be between 6 and 18."/>
+ <param name="one_off" type="integer" size="15" value="0" label="Number of mismatch in the word (-oneOff)" help="Must be between 0 and 2." />
+ </inputs>
+ <outputs>
+ <data name="output1" format="tabular"/>
+ </outputs>
+ <requirements>
+   <requirement type="binary">blat</requirement>
+ </requirements>
+ <tests>
+ <test>
+ <param name="source_select" value="database" />
+ <param name="dbkey" value="eschColi_K12" />
+ <param name="input_query" value="blat_wrapper_test1.fa" ftype="fasta"/>
+ <param name="iden" value="90.0" />
+ <param name="tile_size" value="11" />
+ <param name="one_off" value="0" />
+ <output name="output1" file="blat_wrapper_test1.out" />
+ </test>
+ </tests>
+ <help>
+
+.. class:: warningmark 
+
+Using a smaller word size (*Minimal Size of Exact Match*) will increase the computational time.
+
+.. class:: warningmark 
+
+Using a larger mismatch number (*Number of Mismatch in the Word*) will increase the computational time.
+
+-----
+
+**What it does**
+
+This tool currently uses the **BLAT** alignment program. Your short reads file is searched against a genome build or another uploaded file.
+
+-----
+
+**Example**
+
+- Input a multiple fasta file::
+
+ &gt;seq1
+ TGGTAATGGTGGTTTTTTTTTTTTTTTTTTATTTTT
+
+- Use the default settings:
+
+  - alignment identity must be higher than or equal to 90%.
+  
+  - minimal size of exact match to trigger an alignment is 11.
+  
+  - allow 0 mismatches in the above exact match size.
+  
+- Search against ce2 (C. elegans March 2004), partial result::
+
+ 25 1 0 0 0 0 0 0 + seq1 36 10 36 chrI 15080483 9704438 9704464 1 26, 10, 9704438, ggttttttttttttttttttattttt, ggtttttttttttttttttttttttt,
+ 27 0 0 0 0 0 1 32 + seq1 36 9 36 chrI 15080483 1302536 1302595 2 21,6, 9,30, 1302536,1302589, tggtttttttttttttttttt,attttt, tggtttttttttttttttttt,attttt,
+
+-----
+
+**Parameters**
+
+- *Minimal Identity* (**-minIdentity**): the minimum sequence identity between the query and target alignments, in percent. Default is 90.
+
+- *Minimal Size of Exact Match* (**-tileSize**): the size of a match that will trigger an alignment. Default is 11; usually between 8 and 12, and must be between 6 and 18.
+
+- *Number of Mismatches in the Word* (**-oneOff**): the number of mismatches allowed in the word (tile size) that still triggers an alignment. Default is 0.
+
+-----
+
+**Reference**
+
+ **BLAT**: Kent, W James, BLAT--the BLAST-like alignment tool. (2002) Genome Research:12(4) 656-664.
+
+
+ </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/convert_SOLiD_color2nuc.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/convert_SOLiD_color2nuc.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+"""
+convert SOLiD color-space data to a nucleotide sequence
+example: T011213122200221123032111221021210131332222101
+         TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
+"""
+
+import sys, os
+
+def stop_err(msg):
+    
+    sys.stderr.write(msg)
+    sys.stderr.write('\n')
+    sys.exit()
+    
+def color2base(color_seq):
+
+    first_nuc = ['A','C','G','T']
+    code_matrix = {}
+    code_matrix['0'] = ['A','C','G','T']
+    code_matrix['1'] = ['C','A','T','G']
+    code_matrix['2'] = ['G','T','A','C']
+    code_matrix['3'] = ['T','G','C','A']
+
+    overlap_nuc = ''
+    nuc_seq = ''
+    
+    seq_prefix = prefix = color_seq[0].upper()
+    color_seq = color_seq[1:]
+                
+    if not (seq_prefix in first_nuc):
+        stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' %seq_prefix )
+
+    for code in color_seq:
+        
+        if not (code in ['0','1','2','3']):
+            stop_err('Expected digits (0, 1, 2, 3) in the color-coding data; the file contains a character outside this set: %s' % code)
+        
+        second_nuc = code_matrix[code]
+        overlap_nuc = second_nuc[first_nuc.index(prefix)]
+        nuc_seq += overlap_nuc
+        prefix = overlap_nuc
+
+    return seq_prefix, nuc_seq
+
+def __main__():
+
+    infilename = sys.argv[1]
+    keep_prefix = sys.argv[2].lower()
+    outfilename = sys.argv[3]
+
+    outfile = open(outfilename,'w')
+
+    prefix = ''
+    color_seq = ''
+    for i, line in enumerate(file(infilename)):
+        line = line.rstrip('\r\n')
+
+        if not line: continue
+        if line.startswith("#"): continue
+    
+        if line.startswith(">"):
+            
+            if color_seq:
+                prefix, nuc_seq = color2base(color_seq)
+                
+                if keep_prefix == 'yes':
+                    nuc_seq = prefix + nuc_seq
+                
+                outfile.write(title+'\n')
+                outfile.write(nuc_seq+'\n')
+                
+            title = line
+            color_seq = ''
+        else:
+            color_seq += line
+            
+    if color_seq:
+        prefix, nuc_seq = color2base(color_seq)
+                
+        if keep_prefix == 'yes':
+            nuc_seq = prefix + nuc_seq
+
+        outfile.write(title+'\n')
+        outfile.write(nuc_seq+'\n')
+            
+    outfile.close()
+    
+if __name__=='__main__': __main__()
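
A compact sketch of the decoding rule implemented above: each color digit selects the next base from the matrix row for that digit, indexed by the previous base. The test pair comes from the module docstring::

  FIRST_NUC = ['A', 'C', 'G', 'T']
  CODE_MATRIX = {'0': ['A', 'C', 'G', 'T'],
                 '1': ['C', 'A', 'T', 'G'],
                 '2': ['G', 'T', 'A', 'C'],
                 '3': ['T', 'G', 'C', 'A']}

  def decode(color_seq):
      prefix = color_seq[0].upper()   # leading nucleotide anchors the decoding
      out = []
      for code in color_seq[1:]:
          prefix = CODE_MATRIX[code][FIRST_NUC.index(prefix)]
          out.append(prefix)
      return ''.join(out)

  assert decode('T011213122200221123032111221021210131332222101') == \
         'TGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT'
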
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/convert_SOLiD_color2nuc.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/convert_SOLiD_color2nuc.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,72 @@
+<tool id="color2nuc" name="Convert Color Space" version="1.0.0">
+<description> to Nucleotides </description>
+<command interpreter="python">convert_SOLiD_color2nuc.py $input1 $input2 $output1 </command>
+
+<inputs>
+    <param name="input1" type="data" format="txt" label="SOLiD color coding file" />
+    <param name="input2" type="select" label="Keep prefix nucleotide">
+     <option value="yes">Yes</option>
+     <option value="no">No</option>
+    </param>
+</inputs>
+<outputs>
+   <data name="output1" format="fasta" />
+</outputs>
+<!-- 
+<tests>
+ <test>
+ <param name="input1" value="convert_SOLiD_color2nuc_test1.txt" ftype="txt" />
+ <param name="input2" value="no" />
+ <output name="output1" file="convert_SOLiD_color2nuc_test1.out" />
+ </test>
+</tests>
+-->
+<help>
+
+.. class:: warningmark
+
+The tool was designed for color space files generated from an ABI SOLiD sequencer. The file format must be fasta-like: the title starts with a ">" character, and each color space sequence starts with a leading nucleotide.
+
+-----
+
+**What it does**
+
+This tool converts a color space sequence to nucleotides. The leading character must be a nucleotide: A, C, G, or T. 
+
+-----
+
+**Example**
+
+- If the color space file looks like this::
+
+ &gt;seq1
+ A013
+ &gt;seq2
+ T011213122200221123032111221021210131332222101
+
+- If you would like to **keep** the leading nucleotide::
+
+ &gt;seq1
+ AACG
+ &gt;seq2
+ TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
+
+- If you **do not want to keep** the leading nucleotide (the nucleotide sequence will be one base shorter than the color-space sequence)::
+
+ &gt;seq1
+ ACG
+ &gt;seq2
+ TGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
+
+-----
+
+**ABI SOLiD Color Coding Alignment matrix**
+
+ Each di-nucleotide is represented by a single digit: 0 to 3. The matrix is symmetric, thus the leading nucleotide is necessary to determine the sequence (otherwise there are four possibilities).
+
+
+ .. image:: ./static/images/dualcolorcode.png
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/fastqsolexa_to_fasta_qual.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/fastqsolexa_to_fasta_qual.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+
+"""
+convert a fastqsolexa file into separate sequence and quality-score files.
+
+assumes each sequence and each quality-score string is contained on a single line,
+in the following order:
+1st line: @title_of_seq
+2nd line: nucleotides
+3rd line: +title_of_qualityscore (may be omitted)
+4th line: quality scores
+(in one of three forms: a. space-separated digits, b. ASCII codes with the first character as the coding base, c. ASCII codes without the leading character)
+
+Usage:
+%python fastqsolexa_to_fasta_qual.py <your_fastqsolexa_filename> <output_seq_filename> <output_score_filename>
+"""
+
+import sys, os
+from math import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s" % msg )
+    sys.exit()
+
+def __main__():
+    infile_name = sys.argv[1]
+    outfile_seq = open( sys.argv[2], 'w' )
+    outfile_score = open( sys.argv[3], 'w' )
+    datatype = sys.argv[4]
+    seq_title_startswith = ''
+    qual_title_startswith = ''
+    default_coding_value = 64
+    fastq_block_lines = 0
+    
+    for i, line in enumerate( file( infile_name ) ):
+        line = line.rstrip()
+        if not line or line.startswith( '#' ):
+            continue
+        fastq_block_lines = ( fastq_block_lines + 1 ) % 4
+        line_startswith = line[0:1]
+        if fastq_block_lines == 1:
+            # first line is @title_of_seq
+            if not seq_title_startswith:
+                seq_title_startswith = line_startswith
+            if line_startswith != seq_title_startswith:
+                outfile_seq.close()
+                outfile_score.close()
+                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
+            read_title = line[1:]
+            outfile_seq.write( '>%s\n' % line[1:] )
+        elif fastq_block_lines == 2:
+            # second line is nucleotides
+            read_length = len( line )
+            outfile_seq.write( '%s\n' % line )
+        elif fastq_block_lines == 3:
+            # third line is +title_of_qualityscore ( might be skipped )
+            if not qual_title_startswith:
+                qual_title_startswith = line_startswith
+            if line_startswith != qual_title_startswith:
+                outfile_seq.close()
+                outfile_score.close()
+                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )    
+            quality_title = line[1:]
+            if quality_title and read_title != quality_title:
+                outfile_seq.close()
+                outfile_score.close()
+                stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differs from score title "%s".' % ( i + 1, read_title, quality_title ) )
+            if not quality_title:
+                outfile_score.write( '>%s\n' % read_title )
+            else:
+                outfile_score.write( '>%s\n' % line[1:] )
+        else:
+            # fourth line is quality scores
+            qual = ''
+            fastq_integer = True
+            # peek: ascii or digits?
+            val = line.split()[0]
+            try: 
+                check = int( val )
+                fastq_integer = True
+            except:
+                fastq_integer = False
+                
+            if fastq_integer:
+                # digits
+                qual = line
+            else:
+                # ascii
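+                # A line one char longer than the read carries its own coding
+                # base as the first char (form b); a line of equal length uses
+                # the default offset of 64 (form c).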
+                quality_score_length = len( line )
+                if quality_score_length == read_length + 1:
+                    # first char is qual_score_startswith
+                    qual_score_startswith = ord( line[0:1] )
+                    line = line[1:]
+                elif quality_score_length == read_length:
+                    qual_score_startswith = default_coding_value
+                else:
+                    stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )
+                for j, char in enumerate( line ):
+                    score = ord( char ) - qual_score_startswith    # 64
+                    qual = "%s%s " % ( qual, str( score ) )
+            outfile_score.write( '%s\n' % qual )
+              
+    outfile_seq.close()
+    outfile_score.close()
+
+if __name__ == "__main__": __main__() 
+    
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/fastqsolexa_to_fasta_qual.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/fastqsolexa_to_fasta_qual.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,91 @@
+<tool id="fastqsolexa_to_fasta_qual" name="FASTQSOLEXA-to-FASTA-QUAL" version="1.0.0">
+  <description>extracts sequences and quality scores from FASTQSOLEXA data</description>
+  <command interpreter="python">fastqsolexa_to_fasta_qual.py $input1 $output1 $output2 $input1.extension</command>
+  <inputs>
+    <param name="input1" type="data" format="fastqsolexa" label="Fastqsolexa file"/>
+  </inputs>
+  <outputs>
+    <data name="output1" format="fasta"/>
+    <data name="output2" format="qualsolexa"/>
+  </outputs>
+  <tests>
+    <!-- NOTE: this tool generates 2 output files, but our functional tests currently only handle the last one generated -->
+    <test>
+      <param name="input1" value="1.fastqsolexa" ftype="fastqsolexa" />
+      <output name="output1" file="fastqsolexa_to_fasta_qual_out4.fasta" />
+    </test>
+    <test>
+      <param name="input1" value="2.fastqsolexa" ftype="fastqsolexa" />
+      <output name="output1" file="fastqsolexa_to_fasta_qual_out2.fasta" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+IMPORTANT: This tool currently only supports data where the quality scores are either integers or ASCII characters with a coding base (offset) of 64.
+
+-----
+
+**What it does**
+
+This tool extracts sequences and quality scores from FASTQ data ( Solexa variant ), producing a FASTA dataset and a QUAL dataset.
+
+-----
+
+**Example 1**
+
+- Converting the following Solexa fastq data::
+
+    @seq1  
+    GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT  
+    +seq1  
+    hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh  
+    @seq2  
+    GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG  
+    +seq2  
+    hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO
+
+- will extract the following sequences::
+
+    >seq1
+    GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT
+    >seq2
+    GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG
+    
+- and quality scores::
+
+    >seq1
+    40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 16 23 0 40 40 40 40 40 40 
+    >seq2
+    40 40 40 40 40 40 40 40 40 40 40 40 40 40 25 40 40 33 40 40 40 40 23 40 1 40 6 40 19 9 10 7 3 40 15 
+
+**Example 2**
+
+- Converting the following Solexa fastq data::
+
+    @HANNIBAL_1_FC302VTAAXX:2:1:228:167
+    GAATTGATCAGGACATAGGACAACTGTAGGCACCAT
+    +HANNIBAL_1_FC302VTAAXX:2:1:228:167
+    40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4
+    @HANNIBAL_1_FC302VTAAXX:2:1:156:340
+    GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG
+    +HANNIBAL_1_FC302VTAAXX:2:1:156:340
+    40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9
+
+- will extract the following sequences::
+
+    >HANNIBAL_1_FC302VTAAXX:2:1:228:167
+    GAATTGATCAGGACATAGGACAACTGTAGGCACCAT
+    >HANNIBAL_1_FC302VTAAXX:2:1:156:340
+    GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG
+
+- and quality scores::
+
+    >HANNIBAL_1_FC302VTAAXX:2:1:228:167
+    40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4
+    >HANNIBAL_1_FC302VTAAXX:2:1:156:340
+    40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9
+
+    </help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/mapping_to_ucsc.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/mapping_to_ucsc.py Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,204 @@\n+#!/usr/bin/env python\n+\n+from galaxy import eggs\n+import sys, tempfile, os\n+\n+assert sys.version_info[:2] >= (2.4)\n+\n+def stop_err(msg):\n+    sys.stderr.write(msg)\n+    sys.exit()\n+    \n+def main():\n+\n+    out_fname = sys.argv[1]\n+    in_fname = sys.argv[2]\n+    chr_col = int(sys.argv[3])-1\n+    coord_col = int(sys.argv[4])-1\n+    track_type = sys.argv[5]\n+    if track_type == \'coverage\' or track_type == \'both\': \n+        coverage_col = int(sys.argv[6])-1\n+        cname = sys.argv[7]\n+        cdescription = sys.argv[8]\n+        ccolor = sys.argv[9].replace(\'-\',\',\')\n+        cvisibility = sys.argv[10]\n+    if track_type == \'snp\' or track_type == \'both\':\n+        if track_type == \'both\':\n+            j = 5\n+        else:\n+            j = 0 \n+        #sname = sys.argv[7+j]\n+        sdescription = sys.argv[6+j]\n+        svisibility = sys.argv[7+j]\n+        #ref_col = int(sys.argv[10+j])-1\n+        read_col = int(sys.argv[8+j])-1\n+    \n+\n+    # Sort the input file based on chromosome (alphabetically) and start co-ordinates (numerically)\n+    sorted_infile = tempfile.NamedTemporaryFile()\n+    try:\n+        os.system("sort -k %d,%d -k %dn -o %s %s" %(chr_col+1,chr_col+1,coord_col+1,sorted_infile.name,in_fname))\n+    except Exception, exc:\n+        stop_err( \'Initialization error -> %s\' %str(exc) )\n+\n+    #generate chr list\n+    sorted_infile.seek(0)\n+    chr_vals = []\n+    for line in file( sorted_infile.name ):\n+        line = line.strip()\n+        if not(line):\n+            continue\n+        try:\n+            fields = line.split(\'\\t\')\n+            chr = fields[chr_col]\n+            if chr not in chr_vals:\n+                chr_vals.append(chr)\n+        except:\n+            pass\n+    if not(chr_vals):   \n+        stop_err("Skipped all lines as invalid.")\n+        \n+    if track_type == \'coverage\' or track_type == \'both\':\n+        if track_type == \'coverage\':\n+            fout = open( out_fname, "w" )\n+        else:\n+            fout = tempfile.NamedTemporaryFile()\n+        fout.write(\'\'\'track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\\n\'\'\' \\\n+                      % ( cname, cdescription, ccolor, cvisibility ))\n+    if track_type == \'snp\' or track_type == \'both\':\n+        fout_a = tempfile.NamedTemporaryFile()\n+        fout_t = tempfile.NamedTemporaryFile()\n+        fout_g = tempfile.NamedTemporaryFile()\n+        fout_c = tempfile.NamedTemporaryFile()\n+        fout_ref = tempfile.NamedTemporaryFile()\n+        \n+        fout_a.write(\'\'\'track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\\n\'\'\' \\\n+                      % ( "Track A", sdescription, \'255,0,0\', svisibility ))\n+        fout_t.write(\'\'\'track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\\n\'\'\' \\\n+                      % ( "Track T", sdescription, \'0,255,0\', svisibility ))\n+        fout_g.write(\'\'\'track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\\n\'\'\' \\\n+                      % ( "Track G", sdescription, \'0,0,255\', svisibility ))\n+        fout_c.write(\'\'\'track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\\n\'\'\' \\\n+                      % ( "Track C", sdescription, \'255,0,255\', svisibility ))\n+        \n+        \n+    sorted_infile.seek(0)\n+    for line in file( sorted_infile.name ):\n+        line = line.strip()\n+        if not(line):\n+            continue\n+        
try:\n+            fields = line.split(\'\\t\')\n+            chr = fields[chr_col]\n+            start = int(fields[coord_col])\n+            assert start > 0\n+        except:\n+            continue\n+        try:\n+            ind = chr_vals.index(chr)    #encountered chr for the 1st time\n+            del chr_vals[ind]\n+            prev_start = \'\'\n+            header = "variableStep chrom=%s\\n" %(chr)\n+            if track_type == \'coverage\' or track_type == \'both\':\n+                coverage = int(fields[coverage_col])\n+                line1 = "%s\\t%s\\n" %(start,coverage)\n+                fout.write("%s%s" %(header,'..b'= c = 0\n+                fout_a.write("%s" %(header))\n+                fout_t.write("%s" %(header))\n+                fout_g.write("%s" %(header))\n+                fout_c.write("%s" %(header))\n+                try:\n+                    #ref_nt = fields[ref_col].capitalize()\n+                    read_nt = fields[read_col].capitalize()\n+                    try:\n+                        nt_ind = [\'A\',\'T\',\'G\',\'C\'].index(read_nt)\n+                        if nt_ind == 0:\n+                            a+=1\n+                        elif nt_ind == 1:\n+                            t+=1\n+                        elif nt_ind == 2:\n+                            g+=1\n+                        else:\n+                            c+=1\n+                    except ValueError:\n+                        pass\n+                except:\n+                    pass\n+            prev_start = start\n+        except ValueError:\n+            if start != prev_start:\n+                if track_type == \'coverage\' or track_type == \'both\':\n+                    coverage = int(fields[coverage_col])\n+                    fout.write("%s\\t%s\\n" %(start,coverage)) \n+                if track_type == \'snp\' or track_type == \'both\':\n+                    if a:\n+                        fout_a.write("%s\\t%s\\n" %(prev_start,a))\n+                    if t:\n+                        fout_t.write("%s\\t%s\\n" %(prev_start,t))\n+                    if g:\n+                        fout_g.write("%s\\t%s\\n" %(prev_start,g))\n+                    if c:\n+                        fout_c.write("%s\\t%s\\n" %(prev_start,c))\n+                    a = t = g = c = 0\n+                    try:\n+                        #ref_nt = fields[ref_col].capitalize()\n+                        read_nt = fields[read_col].capitalize()\n+                        try:\n+                            nt_ind = [\'A\',\'T\',\'G\',\'C\'].index(read_nt)\n+                            if nt_ind == 0:\n+                                a+=1\n+                            elif nt_ind == 1:\n+                                t+=1\n+                            elif nt_ind == 2:\n+                                g+=1\n+                            else:\n+                                c+=1\n+                        except ValueError:\n+                            pass\n+                    except:\n+                        pass\n+                prev_start = start\n+            else:\n+                if track_type == \'snp\' or track_type == \'both\':\n+                    try:\n+                        #ref_nt = fields[ref_col].capitalize()\n+                        read_nt = fields[read_col].capitalize()\n+                        try:\n+                            nt_ind = [\'A\',\'T\',\'G\',\'C\'].index(read_nt)\n+                            if nt_ind == 0:\n+                                a+=1\n+           
                 elif nt_ind == 1:\n+                                t+=1\n+                            elif nt_ind == 2:\n+                                g+=1\n+                            else:\n+                                c+=1\n+                        except ValueError:\n+                            pass\n+                    except:\n+                        pass\n+    \n+    if track_type == \'snp\' or track_type == \'both\':\n+        if a:\n+            fout_a.write("%s\\t%s\\n" %(prev_start,a))\n+        if t:\n+            fout_t.write("%s\\t%s\\n" %(prev_start,t))\n+        if g:\n+            fout_g.write("%s\\t%s\\n" %(prev_start,g))\n+        if c:\n+            fout_c.write("%s\\t%s\\n" %(prev_start,c))\n+            \n+        fout_a.seek(0)\n+        fout_g.seek(0)\n+        fout_t.seek(0)\n+        fout_c.seek(0)    \n+    \n+    if track_type == \'snp\':\n+        os.system("cat %s %s %s %s >> %s" %(fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname))\n+    elif track_type == \'both\':\n+        fout.seek(0)\n+        os.system("cat %s %s %s %s %s | cat > %s" %(fout.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname))\n+if __name__ == "__main__":\n+    main()\n\\ No newline at end of file\n'
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/mapping_to_ucsc.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/mapping_to_ucsc.xml Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,202 @@\n+<tool id="mapToUCSC" name="Format mapping data" version="1.0.0">\n+  <description> as UCSC custom track</description>\n+  <command interpreter="python">\n+  \tmapping_to_ucsc.py \n+  \t$out_file1\n+  \t$input\n+  \t$chr_col\n+  \t$coord_col\n+  \t$track.track_type\n+  \t#if $track.track_type == "coverage" or $track.track_type == "both"\n+  \t$track.coverage_col\n+    "${track.cname}"\n+    "${track.cdescription}"\n+    "${track.ccolor}"\n+    "${track.cvisibility}"\n+    #end if\n+    #if $track.track_type == "snp" or $track.track_type == "both"\n+    "${track.sdescription}"\n+    "${track.svisibility}"\n+     $track.col2\n+    #end if\n+  </command>\n+  <inputs>\n+    <param format="tabular" name="input" type="data" label="Select mapping data"/>\n+    <param name="chr_col" type="data_column" data_ref="input" label="Column for reference chromosome" />\n+    <param name="coord_col" type="data_column" data_ref="input" numerical="True" label="Numerical column for reference co-ordinate" />\n+    <conditional name="track">\n+      <param name="track_type" type="select" label="Display">\n+    \t<option value="snp" selected="true">SNPs</option>\n+        <option value="coverage">Read coverage</option>\n+    \t<option value="both">Both</option>\n+      </param>\n+      <when value = "coverage">\n+      <param name="coverage_col" type="data_column" data_ref="input" numerical="True" label="Numerical column for read coverage" />\n+      <param name="cname" type="text" size="15" value="User Track" label="Coverage track name">\n+        <validator type="length" max="15"/>\n+      </param>\n+      <param name="cdescription" type="text" value="User Supplied Coverage Track (from Galaxy)" label="Coverage track description">\n+        <validator type="length" max="60" size="15"/>\n+      </param>\n+      <param label="Coverage track Color" name="ccolor" type="select">\n+            <option selected="yes" value="0-0-0">Black</option>\n+            <option value="255-0-0">Red</option>\n+            <option value="0-255-0">Green</option>\n+            <option value="0-0-255">Blue</option>\n+            <option value="255-0-255">Magenta</option>\n+            <option value="0-255-255">Cyan</option>\n+            <option value="255-215-0">Gold</option>\n+            <option value="160-32-240">Purple</option>\n+            <option value="255-140-0">Orange</option>\n+            <option value="255-20-147">Pink</option>\n+            <option value="92-51-23">Dark Chocolate</option>\n+            <option value="85-107-47">Olive green</option>\n+      </param>\n+      <param label="Coverage track Visibility" name="cvisibility" type="select">\n+            <option selected="yes" value="1">Dense</option>\n+            <option value="2">Full</option>\n+            <option value="3">Pack</option>\n+            <option value="4">Squish</option>\n+            <option value="0">Hide</option>\n+      </param>\n+      </when>\n+      \n+      <when value = "snp">\n+      <!-- \n+      <param name="col1" type="data_column" data_ref="input" label="Column containing the reference nucleotide" />\n+       -->\n+      <param name="col2" type="data_column" data_ref="input" label="Column containing the read nucleotide" />\n+      <!-- \n+      <param name="sname" type="text" size="15" value="User Track-2" label="SNP track name">\n+        <validator type="length" max="15"/>\n+      </param>\n+       -->\n+      <param name="sdescription" type="text" value="User Supplied Track (from Galaxy)" label="SNP track 
description">\n+        <validator type="length" max="60" size="15"/>\n+      </param>\n+      <param label="SNP track Visibility" name="svisibility" type="select">\n+            <option selected="yes" value="1">Dense</option>\n+            <option value="2">Full</option>\n+            <option value="3">Pack</option>\n+            <option value="4">Squish</option>\n+            <option value="0">Hide</option>\n+      </param>\n+      </when>\n+      \n+      <when value = "both">\n+      <param name="coverage_col" type="data_column" data_ref='..b'255-215-0">Gold</option>\n+            <option value="160-32-240">Purple</option>\n+            <option value="255-140-0">Orange</option>\n+            <option value="255-20-147">Pink</option>\n+            <option value="92-51-23">Dark Chocolate</option>\n+            <option value="85-107-47">Olive green</option>\n+      </param>\n+      <param label="Coverage track Visibility" name="cvisibility" type="select">\n+            <option selected="yes" value="1">Dense</option>\n+            <option value="2">Full</option>\n+            <option value="3">Pack</option>\n+            <option value="4">Squish</option>\n+            <option value="0">Hide</option>\n+      </param>\n+      <!-- \n+      <param name="col1" type="data_column" data_ref="input" label="Column containing the reference nucleotide" />\n+       -->\n+      <param name="col2" type="data_column" data_ref="input" label="Column containing the read nucleotide" />\n+      <!-- \n+      <param name="sname" type="text" size="15" value="User Track-2" label="SNP track name">\n+        <validator type="length" max="15"/>\n+      </param>\n+       -->\n+      <param name="sdescription" type="text" size="15" value="User Supplied Track (from Galaxy)" label="SNP track description">\n+        <validator type="length" max="60"/>\n+      </param>\n+      <param label="SNP track Visibility" name="svisibility" type="select">\n+            <option selected="yes" value="1">Dense</option>\n+            <option value="2">Full</option>\n+            <option value="3">Pack</option>\n+            <option value="4">Squish</option>\n+            <option value="0">Hide</option>\n+      </param>\n+      </when>\n+    </conditional>\n+  </inputs>\n+  <outputs>\n+    <data format="customtrack" name="out_file1"/>\n+  </outputs>\n+\n+  \n+ <help> \n+\n+.. class:: infomark\n+\n+**What it does**\n+\n+This tool turns mapping data generated by short read mappers into a format that can be displayed in the UCSC genome browser as a custom track. \n+\n+-----\n+\n+.. class:: warningmark\n+\n+**Note**\n+\n+This tool requires the mapping data to contain at least the following information: \n+\n+chromosome, genome coordinate, read nucleotide (if option to display is SNPs), read coverage (if option to display is Read coverage). 
\n+\n+-----\n+\n+**Example**\n+\n+For the following Mapping data::\n+\n+   #chr g_start read_id          read_coord g_nt read_nt qual read_coverage\n+   chrM    1   1:29:1672:1127/1    11        G    G       40  134\n+   chrM    1   1:32:93:933/1       4         G    A       40  134\n+   chrM    1   1:34:116:2032/1     11        G    A       40  134\n+   chrM    1   1:39:207:964/1      1         G    G       40  134\n+   chrM    2   1:3:359:848/1       1         G    C       40  234\n+   chrM    2   1:40:1435:1013/1    1         G    G       40  234\n+   chrM    3   1:40:730:972/1      9         G    G       40  334\n+   chrM    4   1:42:1712:921/2     31        G    T       35  434\n+   chrM    4   1:44:1649:493/1     4         G    G       40  434\n+\n+running this tool to display both SNPs and Read coverage will return the following tracks, containing aggregated data per genome co-ordinate::\n+\n+   track type=wiggle_0 name="Coverage Track" description="User Supplied Track (from Galaxy)" color=0,0,0 visibility=1\n+   variableStep chrom=chrM\n+   1   134\n+   2   234\n+   3   334\n+   4   434\n+   track type=wiggle_0 name="Track A" description="User Supplied SNP Track (from Galaxy)" color=255,0,0 visibility=1\n+   variableStep chrom=chrM\n+   1   2\n+   track type=wiggle_0 name="Track T" description="User Supplied SNP Track (from Galaxy)" color=0,255,0 visibility=1\n+   variableStep chrom=chrM\n+   4   1\n+   track type=wiggle_0 name="Track G" description="User Supplied SNP Track (from Galaxy)" color=0,0,255 visibility=1\n+   variableStep chrom=chrM\n+   1   2\n+   2   1\n+   3   1\n+   4   1\n+   track type=wiggle_0 name="Track C" description="User Supplied SNP Track (from Galaxy)" color=255,0,255 visibility=1\n+   variableStep chrom=chrM\n+   2   1\n+   \n+  </help>  \n+</tool>\n'
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/megablast_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/megablast_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+"""
+run megablast for metagenomics data
+
+usage: %prog [options]
+   -d, --db_build=d: The database to use
+   -i, --input=i: Input FASTQ candidate file
+   -w, --word_size=w: Size of best perfect match
+   -c, --identity_cutoff=c: Report hits at or above this identity
+   -e, --eval_cutoff=e: Expectation value cutoff
+   -f, --filter_query=f: Filter out low complexity regions
+   -x, --index_dir=x: Data index directory
+   -o, --output=o: Output file
+   
+usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file
+"""
+
+import os, subprocess, sys, tempfile
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    query_filename = options.input.strip()
+    output_filename = options.output.strip()
+    mega_word_size = options.word_size        # -W
+    mega_iden_cutoff = options.identity_cutoff      # -p
+    mega_evalue_cutoff = options.eval_cutoff      # -e
+    mega_temp_output = tempfile.NamedTemporaryFile().name
+    GALAXY_DATA_INDEX_DIR = options.index_dir
+    DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR
+
+    # megablast parameters
+    try:
+        int( mega_word_size )    
+    except:
+        stop_err( 'Invalid value for word size' )
+    try:
+        float( mega_iden_cutoff )
+    except:
+        stop_err( 'Invalid value for identity cut-off' )
+    try:
+        float( mega_evalue_cutoff )
+    except:
+        stop_err( 'Invalid value for Expectation value' )
+
+    if not os.path.exists( os.path.split( options.db_build )[0] ):
+        stop_err( 'Cannot locate the target database directory. Please check your location file.' )
+
+    # arguments for megablast
+    megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null" \
+        % ( options.db_build, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query ) 
+
+    print megablast_command
+
+    tmp = tempfile.NamedTemporaryFile().name
+    try:
+        tmp_stderr = open( tmp, 'wb' )
+        proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open( tmp, 'rb' )
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+        if os.path.exists( tmp ):
+            os.unlink( tmp )
+    except Exception, e:
+        if os.path.exists( mega_temp_output ):
+            os.unlink( mega_temp_output )
+        if os.path.exists( tmp ):
+            os.unlink( tmp )
+        stop_err( 'Error indexing reference sequence. ' + str( e ) )
+
+    output = open( output_filename, 'w' )
+    invalid_lines = 0
+    for i, line in enumerate( file( mega_temp_output ) ):
+        line = line.rstrip( '\r\n' )
+        fields = line.split()
+        try:
+            # get gi and length of that gi seq
+            gi, gi_len = fields[1].split( '_' )
+            # convert the last column (causing problem in filter tool) to float
+            fields[-1] = float( fields[-1] )
+            new_line = "%s\t%s\t%s\t%s\t%0.1f" % ( fields[0], gi, gi_len, '\t'.join( fields[2:-1] ), fields[-1] )
+        except:
+            new_line = line
+            invalid_lines += 1
+        output.write( "%s\n" % new_line )
+    output.close()
+
+    if os.path.exists( mega_temp_output ):
+        os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
+
+    if invalid_lines:
+        print "Unable to parse %d lines. Keep the default format." % invalid_lines
+
+    # megablast generates a file called error.log, if empty, delete it, if not, show the contents
+    if os.path.exists( './error.log' ):
+        for i, line in enumerate( file( './error.log' ) ):
+            line = line.rstrip( '\r\n' )
+            print line
+        os.remove( './error.log' )
+
+if __name__ == "__main__" : __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/megablast_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/megablast_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,87 @@
+<tool id="megablast_wrapper" name="Megablast" version="1.1.0">
+    <description> compare short reads against htgs, nt, and wgs databases</description>
+    <command interpreter="python">
+      megablast_wrapper.py
+        --db_build="${ filter( lambda x: str( x[0] ) == str( $source_select ), $__app__.tool_data_tables[ 'blastdb' ].get_fields() )[0][-1] }"
+        --input=$input_query
+        --word_size=$word_size
+        --identity_cutoff=$iden_cutoff
+        --eval_cutoff=$evalue_cutoff 
+        --filter_query=$filter_query
+        --index_dir=${GALAXY_DATA_INDEX_DIR}
+        --output=$output1
+    </command>
+    <inputs>
+        <param name="input_query" type="data" format="fasta" label="Compare these sequences"/> 
+        <param name="source_select" type="select" display="radio" label="against target database">
+            <options from_data_table="blastdb" />
+        </param>
+        <param name="word_size" type="select" label="using word size" help="Size of best perfect match (-W)">
+            <option value="28">28</option>
+            <option value="16">16</option>
+        </param>
+        <param name="iden_cutoff" type="float" size="15" value="90.0" label="report hits above this identity (-p)" help="no cutoff if 0" />
+        <param name="evalue_cutoff" type="float" size="15" value="0.001" label="set expectation value cutoff (-e)" />
+        <param name="filter_query" type="select" label="Filter out low complexity regions? (-F)">
+            <option value="T">Yes</option>
+            <option value="F">No</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="output1" format="tabular"/>
+    </outputs>
+    <requirements>
+        <requirement type="package">megablast</requirement>
+    </requirements>
+    <tests>
+        <test>
+            <param name="input_query" value="megablast_wrapper_test1.fa" ftype="fasta"/>
+            <!-- source_select needs to match the entry in the blastdb.loc file, which includes the last update date if appropriate --> 
+            <param name="source_select" value="/galaxy/data/blastdb/phiX/phiX" />
+            <param name="word_size" value="28" />
+            <param name="iden_cutoff" value="99.0" />
+            <param name="evalue_cutoff" value="10.0" />
+            <param name="filter_query" value="T" />
+            <output name="output1" file="megablast_wrapper_test1.out"/> 
+        </test>
+    </tests>
+    <help>
+    
+.. class:: warningmark
+
+**Note**. Database searches may take a substantial amount of time. For large input datasets it is advisable to allow overnight processing.
+
+-----
+
+**What it does**
+
+This tool runs **megablast** (for information about megablast, please see the reference below), a high-performance nucleotide local aligner developed by Webb Miller and colleagues.
+
+-----
+
+**Output format**
+
+The output of this tool contains 13 tab-delimited columns:
+
+1. Id of your sequence 
+2. GI of the database hit 
+3. Length of the database hit
+4. % identity
+5. Alignment length
+6. # mismatches
+7. # gaps
+8. Start position in your sequence
+9. End position in your sequence
+10. Start position in database hit
+11. End position in database hit
+12. E-value
+13. Bit score
+
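+For example, a single output line might look like this (all values are hypothetical)::
+
+  read_1  405832  2908  98.3  60  1  0  1  60  4994  5053  5e-26  117
+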
+-------
+
+**Reference**
+
+Zhang Z., Schwartz S., Wagner L., and Miller W. (2000). A greedy algorithm for aligning DNA sequences. Journal of Computational Biology 7(1-2): 203-214.
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/megablast_xml_parser.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/megablast_xml_parser.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+    
+import sys, os, re
+
+if sys.version_info[:2] >= ( 2, 5 ):
+    import xml.etree.cElementTree as ElementTree
+else:
+    from galaxy import eggs
+    import pkg_resources; pkg_resources.require( "elementtree" )
+    from elementtree import ElementTree
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+    source  = sys.argv[1]
+    hspTags = [
+           "Hsp_bit-score",
+           "Hsp_evalue",
+           "Hsp_query-from",
+           "Hsp_query-to",
+           "Hsp_hit-from",
+           "Hsp_hit-to",
+           "Hsp_query-frame",
+           "Hsp_hit-frame",
+           "Hsp_identity",
+           "Hsp_align-len",
+           "Hsp_qseq",
+           "Hsp_hseq",
+           "Hsp_midline"
+          ]
+    hspData = []
+
+    # get an iterable
+    try: 
+        context = ElementTree.iterparse( source, events=( "start", "end" ) )
+    except:
+        stop_err( "Invalid data format." )
+    # turn it into an iterator
+    context = iter( context )
+    # get the root element
+    try:
+        event, root = context.next()
+    except:
+        stop_err( "Invalid data format." )
+
+    outfile = open( sys.argv[2], 'w' )
+    try:
+        for event, elem in context:
+           # for every <Iteration> tag
+           if event == "end" and elem.tag == "Iteration":
+               query = elem.findtext( "Iteration_query-def" )
+               qLen = elem.findtext( "Iteration_query-len" )
+               # for every <Hit> within <Iteration>
+               for hit in elem.findall( "Iteration_hits/Hit" ):
+                   subject = hit.findtext( "Hit_id" )
+                   if re.search( '^gi', subject ):
+                       subject = subject.split('|')[1]
+                   sLen = hit.findtext( "Hit_len" )
+                   # for every <Hsp> within <Hit>
+                   for hsp in hit.findall( "Hit_hsps/Hsp" ):
+                        outfile.write( "%s\t%s\t%s\t%s" % ( query, qLen, subject, sLen ) )
+                        for tag in hspTags:
+                            outfile.write("\t%s" %(hsp.findtext( tag )))
+                            #hspData.append( hsp.findtext( tag ) )
+                        #hspData = []
+                        outfile.write('\n')
+               # prevents ElementTree from growing large datastructure
+               root.clear()
+               elem.clear()
+    except:
+        outfile.close()
+        stop_err( "The input data is malformed, or there is more than one dataset in the input file. Error: %s" % sys.exc_info()[1] )
+
+    outfile.close()
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/megablast_xml_parser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/megablast_xml_parser.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,60 @@
+<tool id="megablast_xml_parser" name="Parse blast XML output">
+<description></description>
+<command interpreter="python">megablast_xml_parser.py $input1 $output1</command>
+<inputs>
+  <param name="input1" type="data" format="blastxml" label="Megablast XML output" />
+</inputs>
+<outputs>
+  <data name="output1" format="tabular"/>
+</outputs>
+<tests>
+  <test>
+    <param name="input1" value="megablast_xml_parser_test1.gz" ftype="blastxml" />
+    <output name="output1" file="megablast_xml_parser_test1_out.tabular" ftype="tabular" />
+  </test>
+</tests>
+<help>
+
+**What it does**
+
+This tool processes the XML output of any NCBI BLAST tool (if you run your own BLAST jobs, the XML output can be generated with the **-m 7** option).
+
+-----
+
+**Output fields**
+
+This tool returns tab-delimited output with the following fields::
+
+    Description                               Example
+    ----------------------------------------- ----------------- 
+
+    1. Name of the query sequence             Seq1
+    2. Length of the query sequence           30
+    3. Name of target sequence                gnl|BL_ORD_ID|0
+    4. Length of target sequence              5528445
+    5. Alignment bit score                    59.96
+    6. E-value                                8.38112e-11
+    7. Start of alignment within query        1
+    8. End of alignment within query          30
+    9. Start of alignment within target       5436010
+   10. End of alignment within target         5436039
+   11. Query frame                            1
+   12. Target frame                           1
+   13. Number of identical bases within       29 
+       the alignment
+   14. Alignment length                       30 
+   15. Aligned portion (sequence) of query    CGGACAGCGCCGCCACCAACAAAGCCACCA
+   16. Aligned portion (sequence) of target   CGGACAGCGCCGCCACCAACAAAGCCATCA
+   17. Midline indicating positions of        ||||||||||||||||||||||||||| || 
+       matches within the alignment
+
+------
+       
+.. class:: infomark
+
+Note that this form of output does not contain an alignment identity value. However, it can be computed by dividing the number of identical bases within the alignment (Field 13) by the alignment length (Field 14) using the *Text Manipulation->Compute* tool.
+
+
+
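+For instance, 29 identical bases over an alignment length of 30 give 29 / 30 = 0.967, i.e. roughly 96.7% identity. In the *Compute* tool this would be an expression along the lines of **c13/c14** (columns are referred to as c1, c2, and so on).
+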
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/rmap_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/rmap_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+import os, sys, tempfile
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+    
+
+def __main__():
+    
+    # I/O
+    target_path = sys.argv[1]
+    infile = sys.argv[2]
+    read_len = sys.argv[3]              # -w
+    align_len = sys.argv[4]             # -h
+    mismatch = sys.argv[5]              # -m
+    output_file = sys.argv[6]
+    
+    # first guess the read length
+    guess_read_len = 0
+    seq = ''
+    for i, line in enumerate(open(infile)):
+        line = line.rstrip('\r\n')
+        if line.startswith('>'):
+            if seq:
+                guess_read_len = len(seq)
+                break
+        else:
+            seq += line
+            
+    try: 
+        test = int(read_len)
+        if test == 0:
+            read_len = str(guess_read_len)
+        else:
+            assert test >= 20 and test <= 64
+    except:
+        stop_err('Invalid value for read length. Must be between 20 and 64.')
+    
+    try:
+        int(align_len)    
+    except:
+        stop_err('Invalid value for minimal length of a hit.')
+    
+    try:
+        int(mismatch)
+        #assert test >= 0 and test <= int(0.1*int(read_len))
+    except:
+        stop_err('Invalid value for mismatch numbers in an alignment.')
+    
+    all_files = []
+    if os.path.isdir(target_path):
+        
+        # check target genome
+        fa_files = os.listdir(target_path)
+            
+        for file in fa_files:
+            file = "%s/%s" % ( target_path, file )
+            file = os.path.normpath(file)
+            all_files.append(file)
+    else:
+        stop_err("No sequences for %s are available for search, please report this error." %(target_path))
+   
+    for detail_file_path in all_files:
+        output_tempfile = tempfile.NamedTemporaryFile().name
+        command = "rmap -h %s -w %s -m %s -c %s %s -o %s 2>&1" % ( align_len, read_len, mismatch, detail_file_path, infile, output_tempfile )
+        #print command
+        try:
+            os.system( command )
+        except Exception, e:
+            stop_err( str( e ) )
+
+        try:
+            os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) )
+        except Exception, e:
+            stop_err( str( e ) )
+        
+        try:
+            os.remove( output_tempfile )
+        except:
+            pass
+        
+        
+if __name__ == '__main__': __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/rmap_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/rmap_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,84 @@
+<tool id="rmap_wrapper" name="RMAP" version="1.0.0">
+    <description>for Solexa Short Reads Alignment</description>
+    <command interpreter="python">
+    #if $trim.choice=="No": #rmap_wrapper.py $database $input_seq 0 $align_len $mismatch $output1
+    #else: #rmap_wrapper.py $database $input_seq $trim.read_len $align_len $mismatch $output1
+    #end if
+    </command>
+    <inputs>
+        <param name="database" type="select" display="radio" label="Target database">
+ <options from_file="faseq.loc">
+   <column name="name" index="0"/>
+   <column name="value" index="0"/>
+ </options>
+        </param>
+        <param name="input_seq" type="data" format="fasta" label="Sequence file"/>
+        <param name="align_len" type="integer" size="15" value="11" label="Minimal length of a hit (-h)" help="seed" />
+        <param name="mismatch" type="select" label="Number of mismatches allowed (-m)">
+            <option value="0">0</option>
+            <option value="1">1</option>
+            <option value="3">3</option>
+            <option value="5">5</option>
+        </param>
+        <conditional name="trim">
+            <param name="choice" type="select" label="To trim the reads">
+                <option value="No">No</option>
+                <option value="Yes">Yes</option>
+            </param>
+            <when value="No">
+            </when>
+            <when value="Yes">
+                <param name="read_len" type="integer" size="15" value="36" label="Read length (-w)"/> 
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="output1" format="bed"/>
+    </outputs>
+    <requirements>
+      <requirement type="binary">rmap</requirement>
+    </requirements>
+    <!--     
+    <tests>
+        <test>
+            <param name="database" value="/galaxy/data/faseq/test" />
+            <param name="input_seq" value="rmap_wrapper_test1.fasta" ftype="fasta"/>
+            <param name="read_len" value="36" />
+            <param name="align_len" value="36" />
+            <param name="mismatch" value="3" />
+            <output name="output1" file="rmap_wrapper_test1.bed"/> 
+        </test>
+    </tests>
+     -->
+    <help>
+    
+.. class:: warningmark
+
+ RMAP was developed for **Solexa** reads. 
+
+.. class:: infomark
+
+**TIP**. The tool will guess the length of the reads; however, if you choose to trim the reads, the *Read length* must be between 20 and 64. Reads longer than the specified value will be trimmed at the 3' end.
+
+-----
+
+**What it does**
+
+This tool runs **rmap** (for more information, please see the reference below), mapping Solexa reads onto a genome build.   
+
+-----
+
+**Parameters**
+
+- *Minimal Length of a Hit* (**-h**): the seed length, i.e. the minimal exact-match length
+- *Number of Mismatches Allowed* (**-m**): the maximal number of mismatches allowed in an alignment
+- *Read Length* (**-w**): the maximal length of the reads; reads longer than this threshold will be truncated at the 3' end (see the example command below)
+
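+For reference, the wrapper assembles an rmap command line of the following shape (file names are illustrative)::
+
+    rmap -h 11 -w 36 -m 3 -c /path/to/genome.fa input_reads.fasta -o output.bed
+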
+-----
+
+**Reference**
+
+ **RMAP** was developed by Dr. Andrew D. Smith and Dr. Zhenyu Xuan at the Cold Spring Harbor Laboratory. Please see http://rulai.cshl.edu/rmap/
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/rmapq_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/rmapq_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+
+import os, sys, tempfile
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+    
+
+def __main__():
+    
+    # I/O
+    target_path = sys.argv[1]
+    infile = sys.argv[2]
+    scorefile = sys.argv[3]
+    high_score = sys.argv[4]            # -q
+    high_len = sys.argv[5]              # -M
+    read_len = sys.argv[6]              # -w
+    align_len = sys.argv[7]             # -h
+    mismatch = sys.argv[8]              # -m
+    output_file = sys.argv[9]
+    
+    try: 
+        float(high_score)
+    except:
+        stop_err('Invalid value for minimal quality score.')
+
+    try:
+        int(high_len)
+    except:
+        stop_err('Invalid value for minimal high quality bases.')
+            
+    # first guess the read length
+    guess_read_len = 0
+    seq = ''
+    for i, line in enumerate(open(infile)):
+        line = line.rstrip('\r\n')
+        if line.startswith('>'):
+            if seq:
+                guess_read_len = len(seq)
+                break
+        else:
+            seq += line
+            
+    try: 
+        test = int(read_len)
+        if test == 0:
+            read_len = str(guess_read_len)
+        else:
+            assert test >= 20 and test <= 64
+    except:
+        stop_err('Invalid value for read length. Must be between 20 and 64.')
+
+    
+    try:
+        int(align_len)    
+    except:
+        stop_err('Invalid value for minimal length of a hit.')
+    
+    try:
+        int(mismatch)
+    except:
+        stop_err('Invalid value for mismatch numbers in an alignment.')
+    
+    all_files = []
+    if os.path.isdir(target_path):
+        # check target genome
+        fa_files = os.listdir(target_path)
+            
+        for file in fa_files:
+            file = "%s/%s" % ( target_path, file )
+            file = os.path.normpath(file)
+            all_files.append(file)
+    else:
+        stop_err("No sequences for %s are available for search, please report this error." %(target_path))
+   
+    for detail_file_path in all_files:
+        output_tempfile = tempfile.NamedTemporaryFile().name
+        command = "rmapq -q %s -M %s -h %s -w %s -m %s -Q %s -c %s %s -o %s 2>&1" % ( high_score, high_len, align_len, read_len, mismatch, scorefile, detail_file_path, infile, output_tempfile )
+        #print command
+        try:
+            os.system( command )
+        except Exception, e:
+            stop_err( str( e ) )
+
+        try:
+            assert os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) ) == 0
+        except Exception, e:
+            stop_err( str( e ) )
+        
+        try:
+            os.remove( output_tempfile )
+        except:
+            pass
+
+            
+if __name__ == '__main__': __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/rmapq_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/rmapq_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,92 @@
+<tool id="rmapq_wrapper" name="RMAPQ" version="1.0.0">
+    <description>for Solexa Short Reads Alignment with Quality Scores</description>
+    <command interpreter="python">
+    #if $trim.choice=="No": #rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len 0 $align_len $mismatch $output1
+    #else: #rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len $trim.read_len $align_len $mismatch $output1
+    #end if
+    </command>
+    <inputs>
+        <param name="database" type="select" display="radio" label="Target database">
+ <options from_file="faseq.loc">
+   <column name="name" index="0"/>
+   <column name="value" index="0"/>
+ </options>
+        </param>
+        <param name="input_seq" type="data" format="fasta" label="Sequence file"/>
+        <param name="input_score" type="data" format="qualsolexa" label="Quality score file"/>
+        <param name="high_score" type="float" size="15" value="40" label="Minimum score for high-quality base (-q)"/>
+        <param name="high_len" type="integer" size="15" value="36" label="Minimal high-quality bases (-M)"/>
+        <param name="align_len" type="integer" size="15" value="11" label="Minimal length of a hit (-h)" help="seed"/>
+        <param name="mismatch" type="select" label="Number of mismatches allowed (-m)">
+            <option value="0">0</option>
+            <option value="1">1</option>
+            <option value="3">3</option>
+            <option value="5">5</option>
+        </param>
+        <conditional name="trim">
+            <param name="choice" type="select" label="To trim the reads">
+                <option value="No">No</option>
+                <option value="Yes">Yes</option>
+            </param>
+            <when value="No">
+            </when>
+            <when value="Yes">
+                <param name="read_len" type="integer" size="15" value="36" label="Read length (-w)" /> 
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="output1" format="bed"/>
+    </outputs>
+    <requirements>
+      <requirement type="binary">rmapq</requirement>
+    </requirements>
+    <!-- 
+    <tests>
+        <test>
+            <param name="database" value="/galaxy/data/faseq/test" />
+            <param name="input_seq" value="rmapq_wrapper_test1.fasta" ftype="fasta"/>
+            <param name="input_score" value="rmapq_wrapper_test1.qual" ftype="qualsolexa" />
+            <param name="high_score" value="40" />
+            <param name="high_len" value="36" />
+            <param name="read_len" value="36" />
+            <param name="align_len" value="36" />
+            <param name="mismatch" value="3" />
+            <output name="output1" file="rmapq_wrapper_test1.bed"/> 
+        </test>
+    </tests>
+    -->
+    <help>
+    
+.. class:: warningmark
+
+ RMAPQ was developed for **Solexa** reads. 
+
+.. class:: infomark
+
+**TIP**. The tool will guess the length of the reads; however, if you choose to trim the reads, the *Maximal Length of the Reads* must be between 20 and 64. Reads longer than the specified value will be trimmed at the 3' end.
+
+-----
+
+**What it does**
+
+This tool runs **rmapq** (for more information, please see the reference below), searching against a genome build with sequence qualities.   
+
+-----
+
+**Parameters**
+
+- *Minimal High-quality Bases* (**-M**): the minimal number of high-quality bases
+- *Minimum Score for High-quality Base* (**-q**): the minimal quality score for a base to count as high quality
+- *Minimal Length of a Hit* (**-h**): the minimal length of an exact match or seed
+- *Number of Mismatches Allowed* (**-m**): the maximal number of mismatches allowed in an alignment
+- *Read Length* (**-w**): the maximal length of the reads; reads longer than this threshold will be truncated at the 3' end (see the example command below)
+
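+For reference, the wrapper assembles an rmapq command line of the following shape (file names are illustrative)::
+
+    rmapq -q 40 -M 36 -h 11 -w 36 -m 3 -Q scores.qual -c /path/to/genome.fa input_reads.fasta -o output.bed
+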
+-----
+
+**Reference**
+
+ **RMAP** was developed by Dr. Andrew D. Smith and Dr. Zhenyu Xuan at the Cold Spring Harbor Laboratory. Please see http://rulai.cshl.edu/rmap/
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/short_reads_figure_high_quality_length.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/short_reads_figure_high_quality_length.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,165 @@
+#!/usr/bin/env python
+
+import os, sys, math, tempfile, zipfile, re
+from rpy import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def unzip( filename ):
+    zip_file = zipfile.ZipFile( filename, 'r' )
+    tmpfilename = tempfile.NamedTemporaryFile().name
+    for name in zip_file.namelist():
+        file( tmpfilename, 'a' ).write( zip_file.read( name ) )
+    zip_file.close()
+    return tmpfilename
+
+def __main__():
+    infile_score_name = sys.argv[1].strip()
+    outfile_R_name = sys.argv[2].strip()
+    
+    try:
+        score_threshold = int( sys.argv[3].strip() )
+    except:
+        stop_err( 'Threshold for quality score must be numerical.' )
+
+    infile_is_zipped = False
+    if zipfile.is_zipfile( infile_score_name ):
+        infile_is_zipped = True
+        infile_name = unzip( infile_score_name )
+    else:
+        infile_name = infile_score_name
+
+    # detect whether it's tabular or fasta format
+    seq_method = None
+    data_type = None
+    for i, line in enumerate( file( infile_name ) ):
+        line = line.rstrip( '\r\n' )
+        if not line or line.startswith( '#' ):
+            continue
+        if data_type == None:
+            if line.startswith( '>' ):
+                data_type = 'fasta'
+                continue
+            elif len( line.split( '\t' ) ) > 0:
+                fields = line.split()
+                for score in fields:
+                    try:
+                        int( score )
+                        data_type = 'tabular'
+                        seq_method = 'solexa'
+                        break
+                    except:
+                        break
+        elif data_type == 'fasta':
+            fields = line.split()
+            for score in fields:
+                try: 
+                    int( score )
+                    seq_method = '454'
+                    break
+                except:
+                    break
+        if i == 100:
+            break
+
+    if data_type is None:
+        stop_err( 'This tool can only use fasta data or tabular data.' ) 
+    if seq_method is None:
+        stop_err( 'Invalid data for fasta format.')
+
+    cont_high_quality = []
+    invalid_lines = 0
+    invalid_scores = 0                       
+    if seq_method == 'solexa':
+        for i, line in enumerate( open( infile_name ) ):
+            line = line.rstrip( '\r\n' )
+            if not line or line.startswith( '#' ):
+                continue
+            locs = line.split( '\t' )
+            for j, base in enumerate( locs ):
+                nuc_errors = base.split()
+                try:
+                    nuc_errors[0] = int( nuc_errors[0] )
+                    nuc_errors[1] = int( nuc_errors[1] )
+                    nuc_errors[2] = int( nuc_errors[2] )
+                    nuc_errors[3] = int( nuc_errors[3] )
+                    big = max( nuc_errors )
+                except:
+                    invalid_scores += 1
+                    big = 0
+                if j == 0:
+                    cont_high_quality.append(1)
+                else:
+                    if big >= score_threshold:
+                        cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
+                    else:
+                        cont_high_quality.append(1)
+    else: # seq_method == '454'
+        tmp_score = ''
+        for i, line in enumerate( open( infile_name ) ):
+            line = line.rstrip( '\r\n' )
+            if not line or line.startswith( '#' ):
+                continue
+            if line.startswith( '>' ):
+                if len( tmp_score ) > 0:
+                    locs = tmp_score.split()
+                    for j, base in enumerate( locs ):
+                        try:
+                            base = int( base )
+                        except:
+                            invalid_scores += 1
+                            base = 0
+                        if j == 0:
+                            cont_high_quality.append(1)
+                        else:
+                            if base >= score_threshold:
+                                cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
+                            else:
+                                cont_high_quality.append(1)
+                tmp_score = ''
+            else:
+                tmp_score = "%s %s" % ( tmp_score, line )
+        if len( tmp_score ) > 0:
+            locs = tmp_score.split()
+            for j, base in enumerate( locs ):
+                try:
+                    base = int( base )
+                except:
+                    invalid_scores += 1
+                    base = 0
+                if j == 0:
+                    cont_high_quality.append(1)
+                else:
+                    if base >= score_threshold:
+                        cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
+                    else:
+                        cont_high_quality.append(1)
+
+    # generate pdf figures
+    cont_high_quality = array ( cont_high_quality )
+    outfile_R_pdf = outfile_R_name 
+    r.pdf( outfile_R_pdf )
+    title = "Histogram of continuous high quality scores"
+    xlim_range = [ 1, max( cont_high_quality ) ]
+    nclass = max( cont_high_quality )
+    if nclass > 100:
+        nclass = 100
+    r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass)
+    r.dev_off()    
+
+    if infile_is_zipped and os.path.exists( infile_name ):
+        # Need to delete temporary file created when we unzipped the infile archive
+        os.remove( infile_name )
+
+    if invalid_lines > 0: 
+        print 'Skipped %d invalid lines. ' % invalid_lines
+    if invalid_scores > 0:
+        print 'Skipped %d invalid scores. ' % invalid_scores
+
+    r.quit( save="no" )
+
+if __name__=="__main__":__main__()
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/short_reads_figure_high_quality_length.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/short_reads_figure_high_quality_length.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,75 @@
+<tool id="hist_high_quality_score" name="Histogram">
+<description> of high quality score reads </description>
+
+<command interpreter="python">short_reads_figure_high_quality_length.py $input1 $output1 $input2</command>
+
+<inputs>
+<page>
+    <param name="input1" type="data" format="qualsolexa,qual454,txtseq.zip" label="Quality score file" help="No dataset? Read tip below"/>
+    <param name="input2" type="integer" size="5" value="20" label="Quality score threshold" />
+</page>
+</inputs>
+<outputs>
+   <data name="output1" format="pdf" />
+</outputs>
+<requirements>
+ <requirement type="python-module">rpy</requirement>
+</requirements>
+<tests>
+ <test>
+ <param name="input1" value="solexa.qual" ftype="qualsolexa" />
+ <param name="input2" value="5" />
+   <output name="output1" file="solexa_high_quality_hist.pdf" ftype="pdf"/>
+ </test>
+ <test>
+ <param name="input1" value="454.qual" ftype="qual454" />
+ <param name="input2" value="5" />
+ <output name="output1" file="454_high_quality_hist.pdf" ftype="pdf"/>
+ </test>
+</tests>
+
+<help>
+
+.. class:: warningmark
+
+To use this tool, your dataset needs to be in the *Quality Score* format. Click the pencil icon next to your dataset to set the datatype to *Quality Score* (see below for examples).
+
+-----
+
+**What it does**
+
+This tool takes Quality Files generated by Roche (454), Illumina (Solexa), or ABI SOLiD machines and builds a histogram of the lengths of contiguous high quality segments within reads.
+
+-----
+
+**Examples of Quality Data**
+
+Roche (454) or ABI SOLiD data::
+
+ &gt;seq1
+ 23 33 34 25 28 28 28 32 23 34 27 4 28 28 31 21 28
+
+Illumina (Solexa) data::
+
+  -40 -40 40 -40  -40 -40 -40 40  
+
+-----
+
+**Note**
+
+- Quality score data::
+
+ &gt;seq1
+ 23 33 34 25 28 28 28 32 23 34 27 4 28 28 31 21 28
+
+- If the threshold is set to 20:
+
+  - The low quality score of 4 in the middle separates two segments of lengths 11 and 5.
+
+  - The histogram will be built based on the numbers (11, 5); see the sketch below.
+  
+- For Illumina (Solexa) data, only the maximum of the four values at each position will be used.
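+
+The segment computation can be sketched in a few lines of Python (an illustrative,
+hypothetical helper that mirrors the description above; it is not part of this tool)::
+
+    def segment_lengths( scores, threshold=20 ):
+        # lengths of maximal runs of scores at or above the threshold
+        lengths, run = [], 0
+        for s in scores:
+            if s >= threshold:
+                run += 1
+            elif run > 0:
+                lengths.append( run )
+                run = 0
+        if run > 0:
+            lengths.append( run )
+        return lengths
+
+For the quality line above, ``segment_lengths()`` returns ``[11, 5]``, the numbers
+the histogram is built from.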
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/short_reads_figure_score.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/short_reads_figure_score.py Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,248 @@\n+#!/usr/bin/env python\n+"""\n+boxplot:\n+- box: first quartile and third quartile\n+- line inside the box: median\n+- outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile\n+           IQR = third quartile - first quartile\n+- The smallest/largest value that is not an outlier is connected to the box by with a horizontal line.\n+"""\n+\n+import os, sys, math, tempfile, re\n+from rpy import *\n+\n+assert sys.version_info[:2] >= ( 2, 4 )\n+\n+def stop_err( msg ):\n+    sys.stderr.write( "%s\\n" % msg )\n+    sys.exit()\n+\n+def merge_to_20_datapoints( score ):\n+    number_of_points = 20\n+    read_length = len( score )\n+    step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) )\n+    scores = []\n+    point = 1\n+    point_sum = 0\n+    step_average = 0\n+    score_points = 0\n+    \n+    for i in xrange( 1, read_length ):\n+        if i < ( point * step ):\n+            point_sum += int( score[i] )\n+            step_average += 1\n+        else:\n+            point_avg = point_sum * 1.0 / step_average\n+            scores.append( point_avg )\n+            point += 1\n+            point_sum = 0\n+            step_average = 0                       \n+    if step_average > 0:\n+        point_avg = point_sum * 1.0 / step_average\n+        scores.append( point_avg )\n+    if len( scores ) > number_of_points:\n+        last_avg = 0\n+        for j in xrange( number_of_points - 1, len( scores ) ):\n+            last_avg += scores[j]\n+        last_avg = last_avg / ( len(scores) - number_of_points + 1 )\n+    else:    \n+        last_avg = scores[-1]\n+    score_points = []\n+    for k in range( number_of_points - 1 ):\n+        score_points.append( scores[k] )\n+    score_points.append( last_avg )\n+    return score_points\n+\n+def __main__():\n+\n+    invalid_lines = 0\n+\n+    infile_score_name = sys.argv[1].strip()\n+    outfile_R_name = sys.argv[2].strip()\n+\n+    infile_name = infile_score_name\n+\n+    # Determine tabular or fasta format within the first 100 lines\n+    seq_method = None\n+    data_type = None\n+    for i, line in enumerate( file( infile_name ) ):\n+        line = line.rstrip( \'\\r\\n\' )\n+        if not line or line.startswith( \'#\' ):\n+            continue\n+        if data_type == None:\n+            if line.startswith( \'>\' ):\n+                data_type = \'fasta\'\n+                continue\n+            elif len( line.split( \'\\t\' ) ) > 0:\n+                fields = line.split()\n+                for score in fields:\n+                    try:\n+                        int( score )\n+                        data_type = \'tabular\'\n+                        seq_method = \'solexa\'\n+                        break\n+                    except:\n+                        break\n+        elif data_type == \'fasta\':\n+            fields = line.split()\n+            for score in fields:\n+                try: \n+                    int( score )\n+                    seq_method = \'454\'\n+                    break\n+                except:\n+                    break\n+        if i == 100:\n+            break\n+\n+    if data_type is None:\n+        stop_err( \'This tool can only use fasta data or tabular data.\' ) \n+    if seq_method is None:\n+        stop_err( \'Invalid data for fasta format.\')\n+\n+    # Determine fixed length or variable length within the first 100 lines\n+    read_length = 0\n+    variable_length = False\n+    if seq_method == \'solexa\':\n+        for i, line in 
enumerate( file( infile_name ) ):\n+            line = line.rstrip( \'\\r\\n\' )\n+            if not line or line.startswith( \'#\' ):\n+                continue\n+            scores = line.split(\'\\t\')\n+            if read_length == 0:\n+                read_length = len( scores )\n+            if read_length != len( scores ):\n+                variable_length = True\n+                break\n+            if i == 100:\n+                break\n+    elif seq_method == \'454\':\n+        score = \'\'\n+        for i, line in enumerate( file( infile_name ) ):\n+            line = line.rstrip( \'\\r\\n\' )\n+    '..b'numerate( open( infile_name ) ):\n+            line = line.rstrip( \'\\r\\n\' )\n+            if not line or line.startswith( \'#\' ):\n+                continue\n+            tmp_array = []\n+            scores = line.split( \'\\t\' )\n+            for bases in scores:\n+                nuc_errors = bases.split()\n+                try:\n+                    nuc_errors[0] = int( nuc_errors[0] )\n+                    nuc_errors[1] = int( nuc_errors[1] )\n+                    nuc_errors[2] = int( nuc_errors[2] )\n+                    nuc_errors[3] = int( nuc_errors[3] )\n+                    big = max( nuc_errors )\n+                except:\n+                    #print \'Invalid numbers in the file. Skipped.\'\n+                    invalid_scores += 1\n+                    big = 0\n+                tmp_array.append( big )                        \n+            score_points.append( tmp_array )\n+    elif seq_method == \'454\':\n+        # skip the last fasta sequence\n+        score = \'\'\n+        for i, line in enumerate( open( infile_name ) ):\n+            line = line.rstrip( \'\\r\\n\' )\n+            if not line or line.startswith( \'#\' ):\n+                continue\n+            if line.startswith( \'>\' ):\n+                if len( score ) > 0:\n+                    score = [\'0\'] + score.split()\n+                    read_length = len( score )\n+                    tmp_array = []\n+                    if not variable_length:\n+                        score.pop(0)\n+                        score_points.append( score )\n+                        tmp_array = score\n+                    elif read_length > read_length_threshold:\n+                        score_points_tmp = merge_to_20_datapoints( score )\n+                        score_points.append( score_points_tmp )\n+                        tmp_array = score_points_tmp\n+                score = \'\'\n+            else:\n+                score = "%s %s" % ( score, line )\n+        if len( score ) > 0:\n+            score = [\'0\'] + score.split()\n+            read_length = len( score )\n+            if not variable_length:\n+                score.pop(0)\n+                score_points.append( score )\n+            elif read_length > read_length_threshold:\n+                score_points_tmp = merge_to_20_datapoints( score )\n+                score_points.append( score_points_tmp )\n+                tmp_array = score_points_tmp\n+\n+    # reverse the matrix, for R\n+    for i in range( number_of_points - 1 ):\n+        tmp_array = []\n+        for j in range( len( score_points ) ):\n+            try:\n+                tmp_array.append( int( score_points[j][i] ) )\n+            except:\n+                invalid_lines += 1\n+        score_matrix.append( tmp_array )\n+\n+    # generate pdf figures\n+    #outfile_R_pdf = outfile_R_name \n+    #r.pdf( outfile_R_pdf )\n+    outfile_R_png = outfile_R_name\n+    r.bitmap( 
outfile_R_png )\n+    \n+    title = "boxplot of quality scores"\n+    empty_score_matrix_columns = 0\n+    for i, subset in enumerate( score_matrix ):\n+        if not subset:\n+            empty_score_matrix_columns += 1\n+            score_matrix[i] = [0]\n+            \n+    if not variable_length:\n+        r.boxplot( score_matrix, xlab="location in read length", main=title )\n+    else:\n+        r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title )\n+        x_old_range = []\n+        x_new_range = []\n+        step = read_length_threshold / number_of_points \n+        for i in xrange( 0, read_length_threshold, step ):\n+            x_old_range.append( ( i / step ) )\n+            x_new_range.append( i )\n+        r.axis( 1, x_old_range, x_new_range )\n+    r.dev_off()\n+\n+    if invalid_scores > 0:\n+        print \'Skipped %d invalid scores. \' % invalid_scores\n+    if invalid_lines > 0:\n+        print \'Skipped %d invalid lines. \' % invalid_lines\n+    if empty_score_matrix_columns > 0:\n+        print \'%d missing scores in score_matrix. \' % empty_score_matrix_columns\n+\n+    r.quit(save = "no")\n+\n+if __name__=="__main__":__main__()\n'
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/short_reads_figure_score.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/short_reads_figure_score.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,87 @@
+<tool id="quality_score_distribution" name="Build base quality distribution" version="1.0.2">
+<description></description>
+
+<command interpreter="python">short_reads_figure_score.py $input1 $output1 </command>
+
+<inputs>
+<page>
+    <param name="input1" type="data" format="qualsolexa, qual454" label="Quality score file" help="No dataset? Read tip below"/>
+</page>
+</inputs>
+
+<outputs>
+   <data name="output1" format="png" />
+</outputs> 
+<requirements>
+ <requirement type="python-module">rpy</requirement>
+</requirements>
+<tests>
+ <test>
+ <param name="input1" value="solexa.qual" ftype="qualsolexa" />
+   <output name="output1" file="solexaScore.png" ftype="png" />
+ </test>
+ <test>
+ <param name="input1" value="454.qual" ftype="qual454" />
+ <output name="output1" file="454Score.png" ftype="png" />
+ </test>
+</tests>
+<help>
+
+.. class:: warningmark
+
+To use this tool, your dataset needs to be in the *Quality Score* format. Click the pencil icon next to your dataset to set the datatype to *Quality Score* (see below for examples).
+
+-----
+
+**What it does**
+
+This tool takes Quality Files generated by Roche (454), Illumina (Solexa), or ABI SOLiD machines and builds a graph showing the score distribution, like the one below. Such a graph allows you to perform an initial evaluation of data quality in a single pass.
+
+-----
+
+**Examples of Quality Data**
+
+Roche (454) or ABI SOLiD data::
+
+ &gt;seq1
+ 23 33 34 25 28 28 28 32 23 34 27 4 28 28 31 21 28
+
+Illumina (Solexa) data::
+
+  -40 -40 40 -40  -40 -40 -40 40  
+
+-----
+
+**Output example**
+
+Quality scores are summarized as boxplot (Roche 454 FLX data):
+
+.. image:: ./static/images/short_reads_boxplot.png
+
+where the **X-axis** is the coordinate along the read and the **Y-axis** is the quality score, adjusted to comply with the Phred score metric. Units on the X-axis depend on whether your data comes from Roche (454) or from Illumina (Solexa) and ABI SOLiD machines:
+
+  - For Roche (454), the X-axis (shown above) indicates **relative** position (in %) within reads, as this technology produces reads of different lengths;
+  - For Illumina (Solexa) and ABI SOLiD, the X-axis shows **absolute** position in nucleotides within reads.
+  
+Every box on the plot shows the following values::
+
+       o     &lt;---- Outliers
+       o
+      -+-    &lt;---- Upper Extreme Value that is no more than
+       |           1.5 box lengths (IQR) away from the box
+       |
+    +--+--+  &lt;---- Upper Quartile
+    |     |
+    +-----+  &lt;---- Median
+    |     |
+    +--+--+  &lt;---- Lower Quartile 
+       |
+       |
+      -+-    &lt;---- Lower Extreme Value that is no more than
+                   1.5 box lengths (IQR) away from the box
+       o     &lt;---- Outlier
+
+
+     
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/short_reads_trim_seq.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/short_reads_trim_seq.py Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,234 @@\n+#!/usr/bin/env python\n+"""\n+trim reads based on the quality scores\n+input: read file and quality score file\n+output: trimmed read file\n+"""\n+\n+import os, sys, math, tempfile, re\n+\n+assert sys.version_info[:2] >= ( 2, 4 )\n+\n+def stop_err( msg ):\n+    sys.stderr.write( "%s\\n" % msg )\n+    sys.exit()\n+\n+def append_to_outfile( outfile_name, seq_title, segments ):\n+    segments = segments.split( \',\' )\n+    if len( segments ) > 1:\n+        outfile = open( outfile_name, \'a\' )\n+        for i in range( len( segments ) ):\n+            outfile.write( "%s_%d\\n%s\\n" % ( seq_title, i, segments[i] ) )\n+        outfile.close()\n+    elif segments[0]:\n+        outfile = open( outfile_name, \'a\' )\n+        outfile.write( "%s\\n%s\\n" % ( seq_title, segments[0] ) )\n+        outfile.close()\n+\n+def trim_seq( seq, score, arg, trim_score, threshold ):\n+    seq_method = \'454\'\n+    trim_pos = 0\n+    # trim after a certain position\n+    if arg.isdigit():\n+        keep_homopolymers = False\n+        trim_pos = int( arg )    \n+        if trim_pos > 0 and trim_pos < len( seq ):\n+            seq = seq[0:trim_pos]\n+    else:\n+        keep_homopolymers = arg==\'yes\'\n+        \n+    new_trim_seq = \'\'\n+    max_segment = 0\n+\n+    for i in range( len( seq ) ):\n+        if i >= len( score ):\n+            score.append(-1)   \n+        if int( score[i] ) >= trim_score:\n+            pass_nuc = seq[ i:( i + 1 ) ]\n+        else:\n+            if keep_homopolymers and ( (i == 0 ) or ( seq[ i:( i + 1 ) ].lower() == seq[ ( i - 1 ):i ].lower() ) ):\n+                pass_nuc = seq[ i:( i + 1 ) ]\n+            else:\n+                pass_nuc = \' \'    \n+        new_trim_seq = \'%s%s\' % ( new_trim_seq, pass_nuc )\n+        # find the max substrings\n+        segments = new_trim_seq.split()\n+        max_segment = \'\'\n+        len_max_segment = 0\n+        if threshold == 0:\n+            for seg in segments:\n+                if len_max_segment < len( seg ):\n+                    max_segment = \'%s,\' % seg\n+                    len_max_segment = len( seg )\n+                elif len_max_segment == len( seg ):\n+                    max_segment = \'%s%s,\' % ( max_segment, seg )\n+        else:\n+            for seg in segments:\n+                if len( seg ) >= threshold:\n+                    max_segment = \'%s%s,\' % ( max_segment, seg )\n+    return max_segment[ 0:-1 ]\n+\n+def __main__():\n+    \n+    try:\n+        threshold_trim = int( sys.argv[1].strip() )\n+    except:\n+        stop_err( "Minimal quality score must be numeric." )\n+    try:\n+        threshold_report = int( sys.argv[2].strip() )\n+    except:\n+        stop_err( "Minimal length of trimmed reads must be numeric." 
)\n+    outfile_seq_name = sys.argv[3].strip()\n+    infile_seq_name = sys.argv[4].strip()\n+    infile_score_name = sys.argv[5].strip()\n+    arg = sys.argv[6].strip()\n+\n+    seq_infile_name = infile_seq_name\n+    score_infile_name = infile_score_name\n+    \n+\n+    # Determine quailty score format: tabular or fasta format within the first 100 lines\n+    seq_method = None\n+    data_type = None\n+    for i, line in enumerate( file( score_infile_name ) ):\n+        line = line.rstrip( \'\\r\\n\' )\n+        if not line or line.startswith( \'#\' ):\n+            continue\n+        if data_type == None:\n+            if line.startswith( \'>\' ):\n+                data_type = \'fasta\'\n+                continue\n+            elif len( line.split( \'\\t\' ) ) > 0:\n+                fields = line.split()\n+                for score in fields:\n+                    try:\n+                        int( score )\n+                        data_type = \'tabular\'\n+                        seq_method = \'solexa\'\n+                        break\n+                    except:\n+                        break\n+        elif data_type == \'fasta\':\n+            fields = line.split()\n+            for score in fields:\n+                try: \n+                    int( score )\n+                    seq_method = \'454\'\n+                    break\n+                except:\n+            '..b'al in score_line.split():\n+                                    try:\n+                                        int( val ) \n+                                    except:\n+                                        score_file.close()\n+                                        stop_err( "Non-numerical value \'%s\' in score file." % val )\n+                                if not score:\n+                                    score = score_line\n+                                else:\n+                                    score = \'%s %s\' % ( score, score_line )                                        \n+                    elif data_type == \'tabular\':\n+                        score = score_file.readline().rstrip(\'\\r\\n\')\n+                        loc = score.split( \'\\t\' )\n+                        for base in loc:\n+                            nuc_error = base.split()\n+                            try:\n+                                nuc_error[0] = int( nuc_error[0] )\n+                                nuc_error[1] = int( nuc_error[1] )\n+                                nuc_error[2] = int( nuc_error[2] )\n+                                nuc_error[3] = int( nuc_error[3] )\n+                                big = max( nuc_error )\n+                            except:\n+                                score_file.close()\n+                                stop_err( "Invalid characters in line %d: \'%s\'" % ( i, line ) )\n+                            scores.append( big )\n+                    if scores:\n+                        new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )\n+                        append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )  \n+                                \n+                seq_title = line\n+                seq = None\n+            else:\n+                if not seq:\n+                    seq = line\n+                else:\n+                    seq = "%s%s" % ( seq, line )\n+        if seq:\n+            scores = []\n+            if data_type == \'fasta\':\n+                score = None\n+                while score_line:\n+       
             score_line = score_file.readline().rstrip( \'\\r\\n\' )\n+                    if not score_line or score_line.startswith( \'#\' ) or score_line.startswith( \'>\' ):\n+                        continue\n+                    for val in score_line.split():\n+                        try:\n+                            int( val )\n+                        except:\n+                            score_file.close()\n+                            stop_err( "Non-numerical value \'%s\' in score file." % val )\n+                    if not score:\n+                        score = score_line\n+                    else:\n+                        score = "%s %s" % ( score, score_line ) \n+                if score: \n+                    scores = score.split()\n+            elif data_type == \'tabular\':\n+                score = score_file.readline().rstrip(\'\\r\\n\')\n+                loc = score.split( \'\\t\' )\n+                for base in loc:\n+                    nuc_error = base.split()\n+                    try:\n+                        nuc_error[0] = int( nuc_error[0] )\n+                        nuc_error[1] = int( nuc_error[1] )\n+                        nuc_error[2] = int( nuc_error[2] )\n+                        nuc_error[3] = int( nuc_error[3] )\n+                        big = max( nuc_error )\n+                    except:\n+                        score_file.close()\n+                        stop_err( "Invalid characters in line %d: \'%s\'" % ( i, line ) )\n+                    scores.append( big )\n+            if scores:\n+                new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )\n+                append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )  \n+        score_file.close()\n+    else:\n+        stop_err( "Cannot locate sequence file \'%s\'or score file \'%s\'." % ( seq_infile_name, score_infile_name ) )    \n+\n+if __name__ == "__main__": __main__()\n'
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/short_reads_trim_seq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/short_reads_trim_seq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,93 @@
+<tool id="trim_reads" name="Select high quality segments" version="1.0.0">
+<description></description>
+
+<command interpreter="python">
+ short_reads_trim_seq.py $trim $length $output1 $input1 $input2 $sequencing_method_choice.input3
+</command>
+<inputs>
+<page>
+    <param name="input1" type="data" format="fasta" label="Reads" />
+    <param name="input2" type="data" format="qualsolexa,qual454" label="Quality scores" />
+ <param name="trim" type="integer" size="5" value="20" label="Minimal quality score" help="bases scoring below this value will trigger splitting"/>
+    <param name="length" type="integer" size="5" value="100" label="Minimal length of contiguous segment" help="report all high quality segments above this length. Setting this option to '0' will cause the program to return a single longest run of high quality bases per read" />
+    <conditional name="sequencing_method_choice">
+        <param name="sequencer" type="select" label="Select technology">
+            <option value="454">Roche (454) or ABI SOLiD</option>
+            <option value="Solexa">Illumina (Solexa)</option>
+        </param>
+        <when value="454">
+            <param name="input3" type="select" label="Low quality bases in homopolymers" help="if set to 'DO NOT trigger splitting' the program will not count low quality bases that are within or adjacent to homonucleotide runs.  This will significantly reduce fragmentation of 454 data">
+                <option value="yes">DO NOT trigger splitting </option>
+                <option value="no">trigger splitting</option>
+            </param>
+        </when>
+        <when value="Solexa">
+            <param name="input3" type="integer" size="5" value="0" label="Restrict length of each read to" help="('0' = do not trim) The quality of Solexa reads drops towards the end. This option allows selecting the specified number of nucleotides from the beginning and then running the tool." />
+        </when> 
+    </conditional>
+</page>
+</inputs>
+
+<outputs>
+    <data name="output1" format="fasta" />
+</outputs>
+
+<tests>
+ <test>
+ <param name="sequencer" value="454" />
+ <param name="input1" value="454.fasta" ftype="fasta" />
+ <param name="input2" value="454.qual" ftype="qual454" />
+ <param name="input3" value="no" />
+ <param name="trim" value="20" />
+ <param name="length" value="0" />
+ <output name="output1" file="short_reads_trim_seq_out1.fasta" />
+ </test>
+ <test>
+ <param name="sequencer" value="Solexa" />
+ <param name="input1" value="solexa.fasta" ftype="fasta" />
+ <param name="input2" value="solexa.qual" ftype="qualsolexa" />
+ <param name="input3" value="0" />
+ <param name="trim" value="20" />
+ <param name="length" value="0" />
+ <output name="output1" file="short_reads_trim_seq_out2.fasta" />
+ </test>
+</tests>
+
+<help>
+  
+.. class:: warningmark
+
+To use this tool, your dataset needs to be in the *Quality Score* format. Click the pencil icon next to your dataset to set the datatype to *Quality Score* (see below for examples).
+
+-----
+
+**What it does**
+
+This tool finds high quality segments within sequencing reads generated by Roche (454), Illumina (Solexa), or ABI SOLiD machines.
+
+-----
+
+**Example**
+
+
+Suppose this is your sequencing read::
+  
+   5'---------*-------------*------**----3'
+   
+where **dashes** (-) are HIGH quality bases (above 20) and **asterisks** (*) are LOW quality bases (below 20). If the **Minimal length of contiguous segment** is set to **5** (of course, only for the purposes of this example), the tool will return::
+
+   5'---------
+               -------------
+                             -------
+
+You can see that the tool simply splits the read on low quality bases and then returns all segments longer than 5. **Note** that the output of this tool will likely contain a higher number of shorter sequences compared to the original input. If we set the **Minimal length of contiguous segment** to **0**, the tool will return only the single longest segment::
+
+               -------------
+               
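+
+A minimal sketch of this selection logic in Python (illustrative and hypothetical;
+the actual tool additionally handles homopolymer runs and per-technology quality
+formats)::
+
+    def high_quality_segments( seq, scores, min_score=20, min_length=5 ):
+        # mask low quality bases with spaces, then split on the gaps
+        masked = ''
+        for base, score in zip( seq, scores ):
+            if score >= min_score:
+                masked += base
+            else:
+                masked += ' '
+        segments = masked.split()
+        if min_length == 0:
+            # keep only the single longest segment
+            return [ max( segments, key=len ) ] if segments else []
+        return [ seg for seg in segments if len( seg ) >= min_length ]
+
+With the example read above, a threshold of 5 returns all three segments, while 0
+returns only the longest one.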
+
+               
+
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/shrimp_color_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/shrimp_color_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+"""
+SHRiMP wrapper : Color space
+"""
+
+import os, sys, tempfile, os.path, re
+
+assert sys.version_info[:2] >= (2, 4)
+
+def stop_err( msg ):
+    
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+
+def __main__():
+    
+    # SHRiMP path
+    shrimp = 'rmapper-cs'
+    
+    # I/O
+    input_target_file = sys.argv[1]                  # fasta
+    input_query_file = sys.argv[2]
+    shrimp_outfile    = sys.argv[3]                # shrimp output
+            
+    # SHRiMP parameters
+    spaced_seed = '1111001111'
+    seed_matches_per_window = '2'
+    seed_hit_taboo_length = '4'
+    seed_generation_taboo_length = '0'
+    seed_window_length = '115.0'
+    max_hits_per_read = '100'
+    max_read_length = '1000'
+    kmer = '-1'
+    sw_match_value = '100'
+    sw_mismatch_value = '-150'
+    sw_gap_open_ref = '-400'
+    sw_gap_open_query = '-400'
+    sw_gap_ext_ref = '-70'
+    sw_gap_ext_query = '-70'
+    sw_crossover_penalty = '-140'
+    sw_full_hit_threshold = '68.0'
+    sw_vector_hit_threshold = '60.0'
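+    # these defaults mirror the values exposed in the tool's XML parameter list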
+    
+    # TODO: put the threshold on each of these parameters
+    if len(sys.argv) > 4:
+        
+        # the spaced seed must consist solely of 1s and 0s; isdigit() would also
+        # accept the digits 2-9, so check the characters explicitly
+        if sys.argv[4] and not sys.argv[4].strip('01'):
+            spaced_seed = sys.argv[4]
+        else:
+            stop_err('Spaced seed must be a combination of 1s and 0s.')
+        
+        seed_matches_per_window = sys.argv[5]
+        seed_hit_taboo_length = sys.argv[6]
+        seed_generation_taboo_length = sys.argv[7]
+        seed_window_length = sys.argv[8]
+        max_hits_per_read = sys.argv[9]
+        max_read_length = sys.argv[10]
+        kmer = sys.argv[11]
+        sw_match_value = sys.argv[12]
+        sw_mismatch_value = sys.argv[13]
+        sw_gap_open_ref = sys.argv[14]
+        sw_gap_open_query = sys.argv[15]
+        sw_gap_ext_ref = sys.argv[16]
+        sw_gap_ext_query = sys.argv[17]
+        sw_crossover_penalty = sys.argv[18]
+        sw_full_hit_threshold = sys.argv[19]
+        sw_vector_hit_threshold = sys.argv[20]
+        
+    # temp file for shrimp log file
+    shrimp_log = tempfile.NamedTemporaryFile().name
+    
+    # SHRiMP command
+    command = ' '.join([shrimp,  '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-x', sw_crossover_penalty, '-h', sw_full_hit_threshold, '-v', sw_vector_hit_threshold, input_query_file, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
+    
+    try:
+        os.system(command)
+    except Exception, e:
+        # query_fasta/query_qual exist only in shrimp_wrapper.py; this wrapper
+        # creates no temporary query files, so there is nothing to clean up here
+        stop_err(str(e))
+    
+    # check SHRiMP output: count number of lines
+    num_hits = 0
+    if shrimp_outfile:
+        for i, line in enumerate(file(shrimp_outfile)):
+            line = line.rstrip('\r\n')
+            if not line or line.startswith('#'): continue
+            try:
+                fields = line.split()
+                num_hits += 1
+            except Exception, e:
+                stop_err(str(e))
+                
+    if num_hits == 0:   # no hits generated
+        err_msg = ''
+        if shrimp_log:
+            for i, line in enumerate(file(shrimp_log)):
+                if line.startswith('error'):            # deal with memory error: 
+                    err_msg += line                     # error: realloc failed: Cannot allocate memory
+                if re.search('Reads Matched', line):    # deal with zero hits
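+                    # assumes a stats line like '    Reads Matched:    N    (p%)';
+                    # after slicing off the first 8 characters, the third
+                    # whitespace-separated field holds the matched-read count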
+                    if int(line[8:].split()[2]) == 0:
+                        err_msg = 'Zero hits found.\n' 
+        stop_err('SHRiMP Failed due to:\n' + err_msg)
+        
+        
+    # remove temp. files
+    if os.path.exists(shrimp_log): os.remove(shrimp_log)
+
+    
+if __name__ == '__main__': __main__()
+    
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/shrimp_color_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/shrimp_color_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,181 @@\n+<tool id="shrimp_color_wrapper" name="SHRiMP for Color-space" version="1.0.0">\n+  <description>reads mapping against reference sequence </description>\n+  <command interpreter="python">\n+    #if $param.skip_or_full=="skip" #shrimp_color_wrapper.py $input_target $input_query $output1 \n+    #else                           #shrimp_color_wrapper.py $input_target $input_query $output1 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_crossover_penalty $param.sw_full_hit_threshold $param.sw_vector_hit_threshold  \n+    #end if#\n+  </command>\n+    <inputs>\n+        <page>\n+        <param name="input_query" type="data" format="csfasta" label="Align sequencing reads" help="No dataset? Read tip below"/>\n+        <param name="input_target" type="data" format="fasta" label="against reference" />\n+        <conditional name="param">\n+            <param name="skip_or_full" type="select" label="SHRiMP settings to use" help="For most mapping needs use Commonly used settings. If you want full control use Full List">\n+                <option value="skip">Commonly used</option>\n+                <option value="full">Full Parameter List</option>\n+            </param>\n+            <when value="skip" />\n+            <when value="full">\n+                <param name="spaced_seed"                   type="text"     size="30"   value="1111001111"    label="Spaced Seed" />\n+                <param name="seed_matches_per_window"       type="integer"  size="5"    value="2"               label="Seed Matches per Window" />\n+                <param name="seed_hit_taboo_length"         type="integer"  size="5"    value="4"               label="Seed Hit Taboo Length" />\n+                <param name="seed_generation_taboo_length"  type="integer"  size="5"    value="0"               label="Seed Generation Taboo Length" />\n+                <param name="seed_window_length"            type="float"    size="10"   value="115.0"           label="Seed Window Length"          help="in percentage"/>\n+                <param name="max_hits_per_read"             type="integer"  size="10"   value="100"             label="Maximum Hits per Read" />\n+                <param name="max_read_length"               type="integer"  size="10"   value="1000"            label="Maximum Read Length" />\n+                <param name="kmer"                          type="integer"  size="10"   value="-1"              label="Kmer Std. 
Deviation Limit"   help="-1 as None"/>\n+                <param name="sw_match_value"                type="integer"  size="10"   value="100"             label="S-W Match Value" />\n+                <param name="sw_mismatch_value"             type="integer"  size="10"   value="-150"            label="S-W Mismatch Value" />\n+                <param name="sw_gap_open_ref"               type="integer"  size="10"   value="-400"            label="S-W Gap Open Penalty (Reference)" />\n+                <param name="sw_gap_open_query"             type="integer"  size="10"   value="-400"            label="S-W Gap Open Penalty (Query)" />\n+                <param name="sw_gap_ext_ref"                type="integer"  size="10"   value="-70"             label="S-W Gap Extend Penalty (Reference)" />\n+                <param name="sw_gap_ext_query"              type="integer"  size="10"   value="-70"             label="S-W Gap Extend Penalty (Query)" />\n+                <param name="sw_crossover_penalty"          type="integer"  size="10"   value="-140"            label="S-W Crossover Penalty" />               \n+                <param name="sw_full_hit_threshold"         type="float"    size="10"   value="68.0"            label="S-W Full Hit Threshold"      help="in percentage'..b' per Window                 (default: 2)\n+          The number of seed matches per window dictates how many seeds \n+          must match within some window length of the genome before that \n+          region is considered for Smith-Waterman alignment. A lower \n+          value will increase sensitivity while drastically increasing \n+          running time. Higher values will have the opposite effect.\n+    -t    Seed Hit Taboo Length                   (default: 4)\n+          The seed taboo length specifies how many target genome bases \n+          or colours must exist prior to a previous seed match in order \n+          to count another seed match as a hit.\n+    -9    Seed Generation Taboo Length            (default: 0)\n+          \n+    -w    Seed Window Length                      (default: 115.00%)\n+          This parameter specifies the genomic span in bases (or colours) \n+          in which *seed_matches_per_window* must exist before the read \n+          is given consideration by the Simth-Waterman alignment machinery.\n+    -o    Maximum Hits per Read                   (default: 100)\n+          This parameter specifies how many hits to remember for each read. \n+          If more hits are encountered, ones with lower scores are dropped \n+          to make room.\n+    -r    Maximum Read Length                     (default: 1000)\n+          This parameter specifies the maximum length of reads that will \n+          be encountered in the dataset. If larger reads than the default \n+          are used, an appropriate value must be passed to *rmapper*.\n+    -d    Kmer Std. Deviation Limit               (default: -1 [None])\n+          This option permits pruning read kmers, which occur with \n+          frequencies greater than *kmer_std_dev_limit* standard \n+          deviations above the average. This can shorten running \n+          time at the cost of some sensitivity. \n+          *Note*: A negative value disables this option.            
\n+    -m    S-W Match Value                         (default: 100)\n+          The value applied to matches during the Smith-Waterman score calculation.\n+    -i    S-W Mismatch Value                      (default: -150)\n+          The value applied to mismatches during the Smith-Waterman \n+          score calculation.\n+    -g    S-W Gap Open Penalty (Reference)        (default: -400)\n+          The value applied to gap opens along the reference sequence \n+          during the Smith-Waterman score calculation.\n+          *Note*: Note that for backward compatibility, if -g is set \n+          and -q is not set, the gap open penalty for the query will \n+          be set to the same value as specified for the reference.\n+    -q    S-W Gap Open Penalty (Query)            (default: -400)\n+          The value applied to gap opens along the query sequence during \n+          the Smith-Waterman score calculation.        \n+    -e    S-W Gap Extend Penalty (Reference)      (default: -70)\n+          The value applied to gap extends during the Smith-Waterman score calculation.\n+          *Note*: Note that for backward compatibility, if -e is set \n+          and -f is not set, the gap exten penalty for the query will \n+          be set to the same value as specified for the reference. \n+    -f    S-W Gap Extend Penalty (Query)          (default: -70)\n+          The value applied to gap extends during the Smith-Waterman score calculation.\n+    -x\n+    -h    S-W Full Hit Threshold                  (default: 68.00%)\n+          In letter-space, this parameter determines the threshold \n+          score for both vectored and full Smith-Waterman alignments. \n+          Any values less than this quantity will be thrown away.\n+          *Note* This option differs slightly in meaning between letter-space and color-space.\n+    -v\n+    \n+\n+-----\n+\n+**Reference**\n+ \n+ **SHRiMP**: Stephen M. Rumble, Michael Brudno, Phil Lacroute, Vladimir Yanovsky, Marc Fiume, Adrian Dalca. shrimp at cs dot toronto dot edu. \n+\n+</help>\n+</tool>\n'
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/shrimp_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/shrimp_wrapper.py Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,624 @@\n+#!/usr/bin/env python\n+\n+"""\n+TODO\n+1. decrease memory usage\n+2. multi-fasta fastq file, ex. 454\n+3. split reads into small chuncks?\n+\n+SHRiMP wrapper\n+\n+Inputs: \n+1. reference seq \n+2. reads\n+\n+Outputs: \n+1. table of 8 columns:\n+         chrom   ref_loc     read_id     read_loc    ref_nuc     read_nuc    quality     coverage\n+2. SHRiMP output\n+         \n+Parameters:\n+    -s    Spaced Seed                             (default: 111111011111)\n+    -n    Seed Matches per Window                 (default: 2)\n+    -t    Seed Hit Taboo Length                   (default: 4)\n+    -9    Seed Generation Taboo Length            (default: 0)\n+    -w    Seed Window Length                      (default: 115.00%)\n+    -o    Maximum Hits per Read                   (default: 100)\n+    -r    Maximum Read Length                     (default: 1000)\n+    -d    Kmer Std. Deviation Limit               (default: -1 [None])\n+\n+    -m    S-W Match Value                         (default: 100)\n+    -i    S-W Mismatch Value                      (default: -150)\n+    -g    S-W Gap Open Penalty (Reference)        (default: -400)\n+    -q    S-W Gap Open Penalty (Query)            (default: -400)\n+    -e    S-W Gap Extend Penalty (Reference)      (default: -70)\n+    -f    S-W Gap Extend Penalty (Query)          (default: -70)\n+    -h    S-W Hit Threshold                       (default: 68.00%)\n+\n+Command:\n+%rmapper -s spaced_seed -n seed_matches_per_window -t seed_hit_taboo_length -9 seed_generation_taboo_length -w seed_window_length -o max_hits_per_read -r max_read_length -d kmer -m sw_match_value -i sw_mismatch_value -g sw_gap_open_ref -q sw_gap_open_query -e sw_gap_ext_ref -f sw_gap_ext_query -h sw_hit_threshold <query> <target> > <output> 2> <log> \n+\n+SHRiMP output:\n+>7:2:1147:982/1 chr3    +   36586562    36586595    2   35  36  2900    3G16G13\n+>7:2:1147:982/1 chr3    +   95338194    95338225    4   35  36  2700    9T7C14\n+>7:2:587:93/1   chr3    +   14913541    14913577    1   35  36  2960    19--16\n+\n+"""\n+\n+import os, sys, tempfile, os.path, re\n+\n+assert sys.version_info[:2] >= (2.4)\n+\n+def stop_err( msg ):\n+    \n+    sys.stderr.write( "%s\\n" % msg )\n+    sys.exit()\n+\n+def reverse_complement(s):\n+    \n+    complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":".", "-":"-"}\n+    reversed_s = []\n+    for i in s:\n+        reversed_s.append(complement_dna[i])\n+    reversed_s.reverse()\n+    return "".join(reversed_s)\n+\n+def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size):\n+    \n+    invalid_editstring_char = 0\n+    \n+    all_score_file = score_files.split(\',\')\n+    \n+    if len(all_score_file) != hit_per_read: stop_err(\'One or more query files is missing. 
Please check your dataset.\')\n+        \n+    temp_table_name = tempfile.NamedTemporaryFile().name\n+    temp_table = open(temp_table_name, \'w\')\n+    \n+    outfile = open(table_outfile,\'w\')\n+    \n+    # reference seq: not a single fasta seq\n+    refseq = {}\n+    chrom_cov = {}\n+    seq = \'\'\n+    \n+    for i, line in enumerate(file(ref_file)):\n+        line = line.rstrip()\n+        if not line or line.startswith(\'#\'): continue\n+        \n+        if line.startswith(\'>\'):\n+            if seq:\n+                if refseq.has_key(title):\n+                    pass\n+                else:\n+                    refseq[title] = seq\n+                    chrom_cov[title] = {}\n+                seq = \'\'\n+            title = line[1:]\n+        else:\n+            seq += line\n+    if seq:\n+        if not refseq.has_key(title):\n+            refseq[title] = seq\n+            chrom_cov[title] = {}\n+\n+    # find hits : one end and/or the other\n+    hits = {}\n+    for i, line in enumerate(file(result_file)):\n+        line = line.rstrip()\n+        if not line or line.startswith(\'#\'): continue\n+        \n+        #FORMAT: readname contigname strand contigstart contigend readstart readend readlength score edit'..b", '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])\n+    \n+        try:\n+            os.system(command)\n+        except Exception, e:\n+            if os.path.exists(query_fasta): os.remove(query_fasta)\n+            if os.path.exists(query_qual): os.remove(query_qual)\n+            stop_err(str(e))\n+            \n+    else: # paired\n+        command_end1 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])\n+        command_end2 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target_file, '>>', shrimp_outfile, '2>>', shrimp_log])\n+        \n+        try:\n+            os.system(command_end1)\n+            os.system(command_end2)\n+        except Exception, e:\n+            if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)\n+            if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)\n+            if os.path.exists(query_qual_end1): os.remove(query_qual_end1)\n+            if os.path.exists(query_qual_end2): os.remove(query_qual_end2)\n+            stop_err(str(e))\n+    \n+    # check SHRiMP output: count number of lines\n+    num_hits = 0\n+    if shrimp_outfile:\n+        for i, line in enumerate(file(shrimp_outfile)):\n+            line = line.rstrip('\\r\\n')\n+            if not line or line.startswith('#'): continue\n+            try:\n+    
            fields = line.split()\n+                num_hits += 1\n+            except Exception, e:\n+                stop_err(str(e))\n+                \n+    if num_hits == 0:   # no hits generated\n+        err_msg = ''\n+        if shrimp_log:\n+            for i, line in enumerate(file(shrimp_log)):\n+                if line.startswith('error'):            # deal with memory error: \n+                    err_msg += line                     # error: realloc failed: Cannot allocate memory\n+                if re.search('Reads Matched', line):    # deal with zero hits\n+                    if int(line[8:].split()[2]) == 0:\n+                        err_msg = 'Zero hits found.\\n' \n+        stop_err('SHRiMP Failed due to:\\n' + err_msg)\n+        \n+    # convert to table\n+    if type_of_reads == 'single':\n+        return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual, table_outfile, hit_per_read, insertion_size)\n+    else:\n+        return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual_end1+','+query_qual_end2, table_outfile, hit_per_read, insertion_size)\n+        \n+    # remove temp. files\n+    if type_of_reads == 'single':\n+        if os.path.exists(query_fasta): os.remove(query_fasta)\n+        if os.path.exists(query_qual): os.remove(query_qual)\n+    else:\n+        if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)\n+        if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)\n+        if os.path.exists(query_qual_end1): os.remove(query_qual_end1)\n+        if os.path.exists(query_qual_end2): os.remove(query_qual_end2)    \n+    \n+    if os.path.exists(shrimp_log): os.remove(shrimp_log)\n+\n+    \n+if __name__ == '__main__': __main__()\n+    \n"
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/shrimp_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/shrimp_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,279 @@\n+<tool id="shrimp_wrapper" name="SHRiMP for Letter-space" version="1.0.0">\n+  <description>reads mapping against reference sequence </description>\n+  <command interpreter="python">\n+    #if     ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip") #shrimp_wrapper.py $input_target $output1 $output2 $input_query\n+    #elif   ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip") #shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size\n+    #elif   ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full") #shrimp_wrapper.py $input_target $output1 $output2 $input_query                                                              $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold \n+    #elif   ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full") #shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold\n+    #end if#\n+  </command>\n+    <inputs>\n+        <page>\n+        <conditional name="type_of_reads">\n+            <param name="single_or_paired" type="select" label="Single- or Paired-ends">\n+                <option value="single">Single-end</option>\n+                <option value="paired">Paired-end</option>\n+            </param>\n+            <when value="single">\n+                <param name="input_query" type="data" format="fastqsolexa" label="Align sequencing reads" help="No dataset? Read tip below"/>\n+            </when>\n+            <when value="paired">\n+                <param name="insertion_size" type="integer" size="5" value="600" label="Insertion length between two ends" help="bp" />\n+                <param name="input1" type="data" format="fastqsolexa" label="Align sequencing reads, one end" />\n+                <param name="input2" type="data" format="fastqsolexa" label="and the other end" />\n+            </when> \n+        </conditional>\n+        <param name="input_target" type="data" format="fasta" label="against reference" />\n+        <conditional name="param">\n+            <param name="skip_or_full" type="select" label="SHRiMP settings to use" help="For most mapping needs use Commonly used settings. 
If you want full control use Full List">\n+                <option value="skip">Commonly used</option>\n+                <option value="full">Full Parameter List</option>\n+            </param>\n+            <when value="skip" />\n+            <when value="full">\n+                <param name="spaced_seed"                   type="text"     size="30"   value="111111011111"    label="Spaced Seed" />\n+                <param name="seed_matches_per_window"       type="integer"  size="5"    value="2"               label="Seed Matches per Window" />\n+                <param name="seed_hit_taboo_length"         type="integer"  size="5"    value="4"               label="Seed Hit Taboo Length" />\n+                <param name="seed_generation_taboo_length"  type="integer"  size="5"    value="0"               label="Seed Generation Taboo Length" />\n+                <param name="seed_window_length"            type="float"    size="10"   value="115.0"           label="Seed Window Length"          help="in percentage"/>\n+                <param'..b'   -n    Seed Matches per Window                 (default: 2)\n+          The number of seed matches per window dictates how many seeds \n+          must match within some window length of the genome before that \n+          region is considered for Smith-Waterman alignment. A lower \n+          value will increase sensitivity while drastically increasing \n+          running time. Higher values will have the opposite effect.\n+    -t    Seed Hit Taboo Length                   (default: 4)\n+          The seed taboo length specifies how many target genome bases \n+          or colors must exist prior to a previous seed match in order \n+          to count another seed match as a hit.\n+    -9    Seed Generation Taboo Length            (default: 0)\n+          \n+    -w    Seed Window Length                      (default: 115.00%)\n+          This parameter specifies the genomic span in bases (or colours) \n+          in which *seed_matches_per_window* must exist before the read \n+          is given consideration by the Simth-Waterman alignment machinery.\n+    -o    Maximum Hits per Read                   (default: 100)\n+          This parameter specifies how many hits to remember for each read. \n+          If more hits are encountered, ones with lower scores are dropped \n+          to make room.\n+    -r    Maximum Read Length                     (default: 1000)\n+          This parameter specifies the maximum length of reads that will \n+          be encountered in the dataset. If larger reads than the default \n+          are used, an appropriate value must be passed to *rmapper*.\n+    -d    Kmer Std. Deviation Limit               (default: -1 [None])\n+          This option permits pruning read kmers, which occur with \n+          frequencies greater than *kmer_std_dev_limit* standard \n+          deviations above the average. This can shorten running \n+          time at the cost of some sensitivity. \n+          *Note*: A negative value disables this option.            
\n+    -m    S-W Match Value                         (default: 100)\n+          The value applied to matches during the Smith-Waterman score calculation.\n+    -i    S-W Mismatch Value                      (default: -150)\n+          The value applied to mismatches during the Smith-Waterman \n+          score calculation.\n+    -g    S-W Gap Open Penalty (Reference)        (default: -400)\n+          The value applied to gap opens along the reference sequence \n+          during the Smith-Waterman score calculation.\n+          *Note*: Note that for backward compatibility, if -g is set \n+          and -q is not set, the gap open penalty for the query will \n+          be set to the same value as specified for the reference.\n+    -q    S-W Gap Open Penalty (Query)            (default: -400)\n+          The value applied to gap opens along the query sequence during \n+          the Smith-Waterman score calculation.        \n+    -e    S-W Gap Extend Penalty (Reference)      (default: -70)\n+          The value applied to gap extends during the Smith-Waterman score calculation.\n+          *Note*: Note that for backward compatibility, if -e is set \n+          and -f is not set, the gap exten penalty for the query will \n+          be set to the same value as specified for the reference. \n+    -f    S-W Gap Extend Penalty (Query)          (default: -70)\n+          The value applied to gap extends during the Smith-Waterman score calculation.\n+    -h    S-W Hit Threshold                       (default: 68.00%)\n+          In letter-space, this parameter determines the threshold \n+          score for both vectored and full Smith-Waterman alignments. \n+          Any values less than this quantity will be thrown away.\n+          *Note* This option differs slightly in meaning between letter-space and color-space.\n+\n+\n+-----\n+\n+**Reference**\n+ \n+ **SHRiMP**: Stephen M. Rumble, Michael Brudno, Phil Lacroute, Vladimir Yanovsky, Marc Fiume, Adrian Dalca. shrimp at cs dot toronto dot edu. \n+\n+</help>\n+</tool>\n'
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/split_paired_reads.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+"""
+Split fixed length paired end reads
+"""
+
+import os, sys
+
+if __name__ == '__main__':
+    
+    infile = sys.argv[1]
+    outfile_end1 = open(sys.argv[2], 'w')
+    outfile_end2 = open(sys.argv[3], 'w')
+    
+    i = 0
+    
+    for line in file( infile ):
+        line = line.rstrip()
+        
+        if not line:
+            continue 
+        
+        end1 = ''
+        end2 = ''
+        
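+        # a fastq record spans four lines: @title, sequence, '+'title, quality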
+        line_index = i % 4
+        
+        if line_index == 0:
+            end1 = line + '/1'
+            end2 = line + '/2'
+        
+        elif line_index == 1:
+            seq_len = len(line)/2
+            end1 = line[0:seq_len]
+            end2 = line[seq_len:]
+        
+        elif line_index == 2:
+            end1 = line + '/1'
+            end2 = line + '/2'
+        
+        else:
+            qual_len = len(line)/2
+            end1 = line[0:qual_len]
+            end2 = line[qual_len:]
+            
+        outfile_end1.write('%s\n' %(end1))
+        outfile_end2.write('%s\n' %(end2))
+        
+        i += 1
+        
+    if i % 4 != 0:
+        sys.stderr.write("WARNING: Number of lines in the input file was not divisible by 4.\nCheck consistency of the input fastq file.\n")
+    outfile_end1.close()
+    outfile_end2.close()    
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/metag_tools/split_paired_reads.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,57 @@
+<tool id="split_paired_reads" name="Split paired end reads" version="1.0.0">
+  <description></description>
+  <command interpreter="python">
+    split_paired_reads.py $input $output1 $output2
+  </command>
+    <inputs>
+        <param name="input" type="data" format="fastqsanger" label="Your paired-end file" />
+    </inputs>
+    <outputs>
+        <data name="output1" format="fastqsanger"/>
+        <data name="output2" format="fastqsanger"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="3.fastqsanger" ftype="fastqsanger"/>
+            <output name="output1" file="split_pair_reads_1.fastqsanger" ftype="fastqsanger"/>
+            <output name="output2" file="split_pair_reads_2.fastqsanger" ftype="fastqsanger"/>
+        </test>
+    </tests>
+<help>
+        
+**What it does**
+
+Splits a single fastq dataset representing a paired-end run into two datasets (one for each end). This tool works only for datasets where both ends have **the same** length.
+
+-----
+
+**Input formats**
+
+A multiple-fastq file, for example::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758
+    GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758
+    hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+
+-----
+
+**Outputs**
+
+One end::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+    GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+    hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+
+The other end::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+    GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+    hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+    
+</help>
+</tool>
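Editor's note: the wrapper's <command> line maps directly onto three positional arguments, so the same split can be run outside Galaxy (file names here are hypothetical)::

    python split_paired_reads.py combined.fastqsanger end_1.fastqsanger end_2.fastqsanger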
diff -r 000000000000 -r 9071e359b9a3 tools/multivariate_stats/cca.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/multivariate_stats/cca.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+infile = sys.argv[1]
+x_cols = sys.argv[2].split(',')
+y_cols = sys.argv[3].split(',')
+
+x_scale = x_center = "FALSE"
+if sys.argv[4] == 'both':
+    x_scale = x_center = "TRUE"
+elif sys.argv[4] == 'center':
+    x_center = "TRUE"
+elif sys.argv[4] == 'scale':
+    x_scale = "TRUE"
+    
+y_scale = y_center = "FALSE"
+if sys.argv[5] == 'both':
+    y_scale = y_center = "TRUE"
+elif sys.argv[5] == 'center':
+    y_center = "TRUE"
+elif sys.argv[5] == 'scale':
+    y_scale = "TRUE"
+
+std_scores = "FALSE"   
+if sys.argv[6] == "yes":
+    std_scores = "TRUE"
+    
+outfile = sys.argv[7]
+outfile2 = sys.argv[8]
+
+fout = open(outfile,'w')
+elems = []
+for i, line in enumerate( file ( infile )):
+    line = line.rstrip('\r\n')
+    if len( line )>0 and not line.startswith( '#' ):
+        elems = line.split( '\t' )
+        break 
+    if i == 30:
+        break # Hopefully we'll never get here...
+
+if len( elems )<1:
+    stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+x_vals = []
+
+for k,col in enumerate(x_cols):
+    x_cols[k] = int(col)-1
+    x_vals.append([])
+
+y_vals = []
+
+for k,col in enumerate(y_cols):
+    y_cols[k] = int(col)-1
+    y_vals.append([])
+
+skipped = 0
+for ind,line in enumerate( file( infile )):
+    if line and not line.startswith( '#' ):
+        try:
+            fields = line.strip().split("\t")
+            valid_line = True
+            for col in x_cols+y_cols:
+                try:
+                    float(fields[col])  # raises on non-numeric; a bare assert would also wrongly reject zero values
+                except:
+                    skipped += 1
+                    valid_line = False
+                    break
+            if valid_line:
+                for k,col in enumerate(x_cols):
+                    try:
+                        xval = float(fields[col])
+                    except:
+                        xval = float('nan')
+                    x_vals[k].append(xval)
+                for k,col in enumerate(y_cols):
+                    try:
+                        yval = float(fields[col])
+                    except:
+                        yval = float('nan')
+                    y_vals[k].append(yval)
+        except:
+            skipped += 1
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+y_vals1 = numpy.asarray(y_vals).transpose()
+
+x_dat= r.list(array(x_vals1))
+y_dat= r.list(array(y_vals1))
+
+try:
+    r.suppressWarnings(r.library("yacca"))
+except:
+    stop_err("Missing R library yacca.")
+    
+set_default_mode(NO_CONVERSION)
+try:
+    xcolnames = ["c%d" %(el+1) for el in x_cols]
+    ycolnames = ["c%d" %(el+1) for el in y_cols]
+    cc = r.cca(x=x_dat, y=y_dat, xlab=xcolnames, ylab=ycolnames, xcenter=r(x_center), ycenter=r(y_center), xscale=r(x_scale), yscale=r(y_scale), standardize_scores=r(std_scores))
+    ftest = r.F_test_cca(cc)
+except RException, rex:
+    stop_err("Encountered error while performing CCA on the input data: %s" %(rex))
+
+set_default_mode(BASIC_CONVERSION)
+summary = r.summary(cc)
+
+ncomps = len(summary['corr'])
+comps = summary['corr'].keys()
+corr = summary['corr'].values()
+xlab = summary['xlab']
+ylab = summary['ylab']
+
+for i in range(ncomps):
+    corr[i] = summary['corr']['CV %s' %(i+1)]    # emit correlations in CV order
+
+ftest=ftest.as_py()
+print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+print >>fout, "#Correlation\t%s" %("\t".join(["%.4g" % el for el in corr]))
+print >>fout, "#F-statistic\t%s" %("\t".join(["%.4g" % el for el in ftest['statistic']]))
+print >>fout, "#p-value\t%s" %("\t".join(["%.4g" % el for el in ftest['p.value']]))
+
+print >>fout, "#X-Coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+for i,val in enumerate(summary['xcoef']):
+    print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val]))
+
+print >>fout, "#Y-Coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+for i,val in enumerate(summary['ycoef']):
+    print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val]))
+       
+print >>fout, "#X-Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+for i,val in enumerate(summary['xstructcorr']):
+    print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val]))
+
+print >>fout, "#Y-Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+for i,val in enumerate(summary['ystructcorr']):
+    print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val]))
+
+print >>fout, "#X-CrossLoadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+for i,val in enumerate(summary['xcrosscorr']):
+    print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val]))
+
+print >>fout, "#Y-CrossLoadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+for i,val in enumerate(summary['ycrosscorr']):
+    print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val]))
+
+r.pdf( outfile2, 8, 8 )
+#r.plot(cc)
+for i in range(ncomps):
+    r.helio_plot(cc, cv = i+1, main = r.paste("Explained Variance for CV",i+1), type = "variance")
+r.dev_off()
\ No newline at end of file
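Editor's note: the script expects its eight positional arguments in the order laid out in the wrapper below; a hypothetical direct invocation mirroring the bundled functional test would be::

    python cca.py iris.tabular 3,4 1,2 both scale yes cca_out1.tabular cca_out2.pdf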
diff -r 000000000000 -r 9071e359b9a3 tools/multivariate_stats/cca.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/multivariate_stats/cca.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,95 @@
+<tool id="cca1" name="Canonical Correlation Analysis" version="1.0.0">
+  <description> </description>
+  <command interpreter="python">
+    cca.py 
+      $input1
+      $x_cols
+      $y_cols
+      $x_scale
+      $y_scale
+      $std_scores
+      $out_file1
+      $out_file2
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
+    <param name="x_cols" label="Select columns containing X variables " type="data_column" data_ref="input1" numerical="True" multiple="true" >
+        <validator type="no_options" message="Please select at least one column."/>
+    </param>
+    <param name="y_cols" label="Select columns containing Y variables " type="data_column" data_ref="input1" numerical="True" multiple="true" >
+        <validator type="no_options" message="Please select at least one column."/>
+    </param>
+    <param name="x_scale" type="select" label="Type of Scaling for X variables" help="Can be used to center and/or scale variables">
+        <option value="none" selected="true">None</option>
+        <option value="center">Center only</option>
+        <option value="scale">Scale only</option>
+        <option value="both">Center and Scale</option>
+    </param>
+    <param name="y_scale" type="select" label="Type of Scaling for Y variables" help="Can be used to center and/or scale variables">
+        <option value="none" selected="true">None</option>
+        <option value="center">Center only</option>
+        <option value="scale">Scale only</option>
+        <option value="both">Center and Scale</option>
+    </param>
+    <param name="std_scores" type="select" label="Report standardized scores?" help="Selecting 'Yes' will rescale scores (and coefficients) to produce scores of unit variance">
+        <option value="no" selected="true">No</option>
+        <option value="yes">Yes</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+    <data format="pdf" name="out_file2" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+    <test>
+        <param name="input1" value="iris.tabular"/>
+        <param name="x_cols" value="3,4"/>
+        <param name="y_cols" value="1,2"/>
+        <param name="x_scale" value="both"/>
+        <param name="y_scale" value="scale"/>
+        <param name="std_scores" value="yes"/>
+        <output name="out_file1" file="cca_out1.tabular"/>
+        <output name="out_file2" file="cca_out2.pdf"/>
+    </test>
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Datasets-&gt;Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses functions from the 'yacca' R package to perform Canonical Correlation Analysis (CCA) on the input data. It outputs two files: one containing the summary statistics of the CCA, and the other containing helioplots, which display the structural loadings of the X and Y variables on the different canonical components.
+
+*Carter T. Butts (2009). yacca: Yet Another Canonical Correlation Analysis Package. R package version 1.1.*
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all predictor and response variables as continuous numeric variables. Running the tool on categorical variables may produce incorrect results.
+
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be excluded from the analysis.
+
+- The summary statistics in the output are described below:
+
+  - correlation: Canonical correlation between the canonical variates (i.e. transformed variables)
+  - F-statistic: F-value obtained from F Test for Canonical Correlations Using Rao's Approximation
+  - p-value: denotes significance of canonical correlations
+  - Coefficients: represent the coefficients of X and Y variables on each canonical variate
+  - Loadings: represent the correlations between the original variables in each set and their respective canonical variates 
+  - CrossLoadings: represent the correlations between the original variables in each set and the opposite canonical variates 
+  
+  </help>
+</tool>
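Editor's note: since the summary file interleaves '#'-labelled header rows with data rows, downstream parsing is easiest block by block. A minimal sketch (file name hypothetical; not part of this changeset)::

    # Group cca.py's output into blocks keyed by their '#' row labels.
    def read_cca_summary(path):
        blocks, current = {}, None
        for line in open(path):
            fields = line.rstrip('\n').split('\t')
            if fields[0].startswith('#'):
                current = fields[0].lstrip('#')
                # one-line stats (Correlation, F-statistic, p-value) keep
                # their values on the header row itself
                blocks[current] = {'header': fields[1:], 'rows': []}
            elif current is not None:
                blocks[current]['rows'].append(fields)
        return blocks

    summary = read_cca_summary('cca_out1.tabular')
    print(summary['Correlation']['header'])    # canonical correlations
    print(summary['X-Coefficients']['rows'])   # one row per X variable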
diff -r 000000000000 -r 9071e359b9a3 tools/multivariate_stats/kcca.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/multivariate_stats/kcca.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+
+"""
+Run kernel CCA using kcca() from R 'kernlab' package
+
+usage: %prog [options]
+   -i, --input=i: Input file
+   -o, --output1=o: Summary output
+   -x, --x_cols=x: X-Variable columns
+   -y, --y_cols=y: Y-Variable columns
+   -k, --kernel=k: Kernel function
+   -f, --features=f: Number of canonical components to return
+   -s, --sigma=s: sigma
+   -d, --degree=d: degree
+   -l, --scale=l: scale
+   -t, --offset=t: offset
+   -r, --order=r: order
+
+usage: %prog input output1 x_cols y_cols kernel features sigma(or_None) degree(or_None) scale(or_None) offset(or_None) order(or_None)
+"""
+
+from galaxy import eggs
+import sys, string
+from rpy import *
+import numpy
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+#Parse Command Line
+options, args = doc_optparse.parse( __doc__ )
+#{'options= kernel': 'rbfdot', 'var_cols': '1,2,3,4', 'degree': 'None', 'output2': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_260.dat', 'output1': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_259.dat', 'scale': 'None', 'offset': 'None', 'input': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_256.dat', 'sigma': '1.0', 'order': 'None'}
+
+infile = options.input
+x_cols = options.x_cols.split(',')
+y_cols = options.y_cols.split(',')
+kernel = options.kernel
+outfile = options.output1
+ncomps = int(options.features)
+fout = open(outfile,'w')
+
+if ncomps < 1:
+    print "You chose to return '0' canonical components. Please try rerunning the tool with number of components = 1 or more."
+    sys.exit()
+elems = []
+for i, line in enumerate( file ( infile )):
+    line = line.rstrip('\r\n')
+    if len( line )>0 and not line.startswith( '#' ):
+        elems = line.split( '\t' )
+        break 
+    if i == 30:
+        break # Hopefully we'll never get here...
+
+if len( elems )<1:
+    stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+x_vals = []
+for k,col in enumerate(x_cols):
+    x_cols[k] = int(col)-1
+    x_vals.append([])
+y_vals = []
+for k,col in enumerate(y_cols):
+    y_cols[k] = int(col)-1
+    y_vals.append([])
+NA = 'NA'
+skipped = 0
+for ind,line in enumerate( file( infile )):
+    if line and not line.startswith( '#' ):
+        try:
+            fields = line.strip().split("\t")
+            valid_line = True
+            for col in x_cols+y_cols:
+                try:
+                    float(fields[col])  # raises on non-numeric; a bare assert would also wrongly reject zero values
+                except:
+                    skipped += 1
+                    valid_line = False
+                    break
+            if valid_line:
+                for k,col in enumerate(x_cols):
+                    try:
+                        xval = float(fields[col])
+                    except:
+                        xval = float('nan')
+                    x_vals[k].append(xval)
+                for k,col in enumerate(y_cols):
+                    try:
+                        yval = float(fields[col])
+                    except:
+                        yval = float('nan')
+                    y_vals[k].append(yval)
+        except:
+            skipped += 1
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+y_vals1 = numpy.asarray(y_vals).transpose()
+
+x_dat= r.list(array(x_vals1))
+y_dat= r.list(array(y_vals1))
+
+try:
+    r.suppressWarnings(r.library('kernlab'))
+except:
+    stop_err('Missing R library kernlab')
+            
+set_default_mode(NO_CONVERSION)
+if kernel=="rbfdot" or kernel=="anovadot":
+    pars = r.list(sigma=float(options.sigma))
+elif kernel=="polydot":
+    pars = r.list(degree=float(options.degree),scale=float(options.scale),offset=float(options.offset))
+elif kernel=="tanhdot":
+    pars = r.list(scale=float(options.scale),offset=float(options.offset))
+elif kernel=="besseldot":
+    pars = r.list(degree=float(options.degree),sigma=float(options.sigma),order=float(options.order))
+elif kernel=="anovadot":
+    pars = r.list(degree=float(options.degree),sigma=float(options.sigma))
+else:
+    pars = r.list()
+    
+try:
+    kcc = r.kcca(x=x_dat, y=y_dat, kernel=kernel, kpar=pars, ncomps=ncomps)
+except RException, rex:
+    stop_err("Encountered error while performing kCCA on the input data: %s" %(rex))
+
+set_default_mode(BASIC_CONVERSION)    
+kcor = r.kcor(kcc)
+if ncomps == 1:
+    kcor = [kcor]
+xcoef = r.xcoef(kcc)
+ycoef = r.ycoef(kcc)
+
+print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+
+print >>fout, "#Correlation\t%s" %("\t".join(["%.4g" % el for el in kcor]))
+    
+print >>fout, "#Estimated X-coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+for obs,val in enumerate(xcoef):
+    print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val]))
+
+print >>fout, "#Estimated Y-coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+for obs,val in enumerate(ycoef):
+    print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val]))
diff -r 000000000000 -r 9071e359b9a3 tools/multivariate_stats/kcca.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/multivariate_stats/kcca.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,150 @@
+<tool id="kcca1" name="Kernel Canonical Correlation Analysis" version="1.0.0">
+  <description> </description>
+  <command interpreter="python">
+    kcca.py 
+      --input=$input1
+      --output1=$out_file1
+      --x_cols=$x_cols
+      --y_cols=$y_cols
+      --kernel=$kernelChoice.kernel
+      --features=$features
+      #if $kernelChoice.kernel == "rbfdot" or $kernelChoice.kernel == "anovadot":
+      --sigma=$kernelChoice.sigma
+      --degree="None"
+      --scale="None"
+      --offset="None"
+      --order="None"
+      #elif $kernelChoice.kernel == "polydot":
+      --sigma="None"
+      --degree=$kernelChoice.degree
+      --scale=$kernelChoice.scale
+      --offset=$kernelChoice.offset
+      --order="None"
+      #elif $kernelChoice.kernel == "tanhdot":
+      --sigma="None"
+      --degree="None"
+      --scale=$kernelChoice.scale
+      --offset=$kernelChoice.offset
+      --order="None"
+      #elif $kernelChoice.kernel == "besseldot":
+      --sigma=$kernelChoice.sigma
+      --degree=$kernelChoice.degree
+      --scale="None"
+      --offset="None"
+      --order=$kernelChoice.order
+      #elif $kernelChoice.kernel == "anovadot":
+      --sigma=$kernelChoice.sigma
+      --degree=$kernelChoice.degree
+      --scale="None"
+      --offset="None"
+      --order="None"
+      #else:
+      --sigma="None"
+      --degree="None"
+      --scale="None"
+      --offset="None"
+      --order="None"
+      #end if
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
+    <param name="x_cols" label="Select columns containing X variables " type="data_column" data_ref="input1" numerical="True" multiple="true" >
+        <validator type="no_options" message="Please select at least one column."/>
+    </param>
+    <param name="y_cols" label="Select columns containing Y variables " type="data_column" data_ref="input1" numerical="True" multiple="true" >
+        <validator type="no_options" message="Please select at least one column."/>
+    </param>
+    <param name="features" size="10" type="integer" value="2" label="Number of canonical components to return" help="Enter an integer value greater than 0"/>
+    <conditional name="kernelChoice">
+        <param name="kernel" type="select" label="Kernel function">
+            <option value="rbfdot" selected="true">Gaussian Radial Basis Function</option>
+            <option value="polydot">Polynomial</option>
+            <option value="vanilladot">Linear</option>
+            <option value="tanhdot">Hyperbolic</option>
+            <option value="laplacedot">Laplacian</option>
+            <option value="besseldot">Bessel</option>
+            <option value="anovadot">ANOVA Radial Basis Function</option>
+            <option value="splinedot">Spline</option>
+        </param>
+        <when value="vanilladot" />
+        <when value="splinedot" />
+        <when value="rbfdot">
+            <param name="sigma" size="10" type="float" value="1" label="sigma (inverse kernel width)" />
+        </when>
+        <when value="laplacedot">
+            <param name="sigma" size="10" type="float" value="1" label="sigma (inverse kernel width)" />
+        </when>
+        <when value="polydot">
+            <param name="degree" size="10" type="float" value="1" label="degree" />
+            <param name="scale" size="10" type="float" value="1" label="scale" />
+            <param name="offset" size="10" type="float" value="1" label="offset" />
+        </when>
+        <when value="tanhdot">
+            <param name="scale" size="10" type="float" value="1" label="scale" />
+            <param name="offset" size="10" type="float" value="1" label="offset" />
+        </when>
+        <when value="besseldot">
+            <param name="sigma" size="10" type="float" value="1" label="sigma" />
+            <param name="order" size="10" type="float" value="1" label="order" />
+            <param name="degree" size="10" type="float" value="1" label="degree" />
+        </when>
+        <when value="anovadot">
+            <param name="sigma" size="10" type="float" value="1" label="sigma" />
+            <param name="degree" size="10" type="float" value="1" label="degree" />
+        </when>
+    </conditional>    
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+    <test>
+        <param name="input1" value="iris.tabular"/>
+        <param name="x_cols" value="1,2"/>
+        <param name="y_cols" value="3,4"/>
+        <param name="kernel" value="anovadot"/>
+        <param name="features" value="4"/>
+        <param name="sigma" value="0.1"/>
+        <param name="degree" value="2"/>
+        <output name="out_file1" file="kcca_out1.tabular" compare="re_match"/>
+    </test>
+    <test>
+        <param name="input1" value="iris.tabular"/>
+        <param name="x_cols" value="3,4"/>
+        <param name="y_cols" value="1,2"/>
+        <param name="kernel" value="rbfdot"/>
+        <param name="features" value="2"/>
+        <param name="sigma" value="0.5"/>
+        <output name="out_file1" file="kcca_out2.tabular" compare="re_match"/>
+    </test>
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Datasets-&gt;Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses functions from the 'kernlab' R package to perform Kernel Canonical Correlation Analysis (kCCA) on the input data.
+
+*Alexandros Karatzoglou, Alex Smola, Kurt Hornik, Achim Zeileis (2004). kernlab - An S4 Package for Kernel Methods in R. Journal of Statistical Software 11(9), 1-20. URL http://www.jstatsoft.org/v11/i09/*
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+This tool currently treats all variables as continuous numeric variables. Running the tool on categorical variables may produce incorrect results. Rows containing non-numeric (or missing) data in any of the chosen columns will be excluded from the analysis.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/multivariate_stats/kpca.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/multivariate_stats/kpca.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+
+"""
+Run kernel PCA using kpca() from R 'kernlab' package
+
+usage: %prog [options]
+   -i, --input=i: Input file
+   -o, --output1=o: Summary output
+   -p, --output2=p: Figures output
+   -c, --var_cols=c: Variable columns
+   -k, --kernel=k: Kernel function
+   -f, --features=f: Number of principal components to return
+   -s, --sigma=s: sigma
+   -d, --degree=d: degree
+   -l, --scale=l: scale
+   -t, --offset=t: offset
+   -r, --order=r: order
+
+usage: %prog input output1 output2 var_cols kernel features sigma(or_None) degree(or_None) scale(or_None) offset(or_None) order(or_None)
+"""
+
+from galaxy import eggs
+import sys, string
+from rpy import *
+import numpy
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+#Parse Command Line
+options, args = doc_optparse.parse( __doc__ )
+#{'options= kernel': 'rbfdot', 'var_cols': '1,2,3,4', 'degree': 'None', 'output2': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_260.dat', 'output1': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_259.dat', 'scale': 'None', 'offset': 'None', 'input': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_256.dat', 'sigma': '1.0', 'order': 'None'}
+
+infile = options.input
+x_cols = options.var_cols.split(',')
+kernel = options.kernel
+outfile = options.output1
+outfile2 = options.output2
+ncomps = int(options.features)
+fout = open(outfile,'w')
+
+elems = []
+for i, line in enumerate( file ( infile )):
+    line = line.rstrip('\r\n')
+    if len( line )>0 and not line.startswith( '#' ):
+        elems = line.split( '\t' )
+        break 
+    if i == 30:
+        break # Hopefully we'll never get here...
+
+if len( elems )<1:
+    stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+x_vals = []
+
+for k,col in enumerate(x_cols):
+    x_cols[k] = int(col)-1
+    x_vals.append([])
+
+NA = 'NA'
+skipped = 0
+for ind,line in enumerate( file( infile )):
+    if line and not line.startswith( '#' ):
+        try:
+            fields = line.strip().split("\t")
+            for k,col in enumerate(x_cols):
+                try:
+                    xval = float(fields[col])
+                except:
+                    xval = float('nan')
+                x_vals[k].append(xval)
+        except:
+            skipped += 1
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+dat= r.list(array(x_vals1))
+
+try:
+    r.suppressWarnings(r.library('kernlab'))
+except:
+    stop_err('Missing R library kernlab')
+            
+set_default_mode(NO_CONVERSION)
+if kernel=="rbfdot" or kernel=="anovadot":
+    pars = r.list(sigma=float(options.sigma))
+elif kernel=="polydot":
+    pars = r.list(degree=float(options.degree),scale=float(options.scale),offset=float(options.offset))
+elif kernel=="tanhdot":
+    pars = r.list(scale=float(options.scale),offset=float(options.offset))
+elif kernel=="besseldot":
+    pars = r.list(degree=float(options.degree),sigma=float(options.sigma),order=float(options.order))
+elif kernel=="anovadot":
+    pars = r.list(degree=float(options.degree),sigma=float(options.sigma))
+else:
+    pars = r.list()
+    
+try:
+    kpc = r.kpca(x=r.na_exclude(dat), kernel=kernel, kpar=pars, features=ncomps)
+except RException, rex:
+    stop_err("Encountered error while performing kPCA on the input data: %s" %(rex))
+set_default_mode(BASIC_CONVERSION)
+    
+eig = r.eig(kpc)
+pcv = r.pcv(kpc)
+rotated = r.rotated(kpc)
+
+eigv = []
+for i in range(ncomps):
+    eigv.append(eig['Comp.%s' %(i+1)])    # eigenvalues in component order
+
+print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+
+print >>fout, "#Eigenvalue\t%s" %("\t".join(["%.4g" % el for el in eig.values()]))
+    
+print >>fout, "#Principal component vectors\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+for obs,val in enumerate(pcv):
+    print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val]))
+
+print >>fout, "#Rotated values\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+for obs,val in enumerate(rotated):
+    print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val]))
+
+r.pdf( outfile2, 8, 8 )
+if ncomps != 1:
+    r.pairs(rotated,labels=r.list(range(1,ncomps+1)),main="Scatterplot of rotated values")
+else:
+    r.plot(rotated, ylab='Comp.1', main="Scatterplot of rotated values")
+r.dev_off()
\ No newline at end of file
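Editor's note: the kernel-to-kpar dispatch above is duplicated between kcca.py and kpca.py. A table-driven sketch of the same mapping (illustration only, not part of this changeset)::

    # Which parameters each kernlab kernel consumes; vanilladot and
    # splinedot take none.
    KERNEL_PARS = {
        'rbfdot':     ('sigma',),
        'laplacedot': ('sigma',),
        'polydot':    ('degree', 'scale', 'offset'),
        'tanhdot':    ('scale', 'offset'),
        'besseldot':  ('degree', 'sigma', 'order'),
        'anovadot':   ('degree', 'sigma'),
    }

    def kernel_pars(kernel, options):
        names = KERNEL_PARS.get(kernel, ())
        return dict((name, float(getattr(options, name))) for name in names)

With a table like this, adding a kernel means touching one dict instead of parallel if/elif chains in two scripts.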
diff -r 000000000000 -r 9071e359b9a3 tools/multivariate_stats/kpca.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/multivariate_stats/kpca.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,140 @@
+<tool id="kpca1" name="Kernel Principal Component Analysis" version="1.0.0">
+  <description> </description>
+  <command interpreter="python">
+    kpca.py 
+      --input=$input1
+      --output1=$out_file1
+      --output2=$out_file2
+      --var_cols=$var_cols
+      --kernel=$kernelChoice.kernel
+      --features=$features
+      #if $kernelChoice.kernel == "rbfdot" or $kernelChoice.kernel == "anovadot":
+      --sigma=$kernelChoice.sigma
+      --degree="None"
+      --scale="None"
+      --offset="None"
+      --order="None"
+      #elif $kernelChoice.kernel == "polydot":
+      --sigma="None"
+      --degree=$kernelChoice.degree
+      --scale=$kernelChoice.scale
+      --offset=$kernelChoice.offset
+      --order="None"
+      #elif $kernelChoice.kernel == "tanhdot":
+      --sigma="None"
+      --degree="None"
+      --scale=$kernelChoice.scale
+      --offset=$kernelChoice.offset
+      --order="None"
+      #elif $kernelChoice.kernel == "besseldot":
+      --sigma=$kernelChoice.sigma
+      --degree=$kernelChoice.degree
+      --scale="None"
+      --offset="None"
+      --order=$kernelChoice.order
+      #elif $kernelChoice.kernel == "anovadot":
+      --sigma=$kernelChoice.sigma
+      --degree=$kernelChoice.degree
+      --scale="None"
+      --offset="None"
+      --order="None"
+      #else:
+      --sigma="None"
+      --degree="None"
+      --scale="None"
+      --offset="None"
+      --order="None"
+      #end if
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
+    <param name="var_cols" label="Select columns containing input variables " type="data_column" data_ref="input1" numerical="True" multiple="true" >
+        <validator type="no_options" message="Please select at least one column."/>
+    </param>
+    <param name="features" size="10" type="integer" value="2" label="Number of principal components to return" help="To return all, enter 0"/>
+    <conditional name="kernelChoice">
+        <param name="kernel" type="select" label="Kernel function">
+            <option value="rbfdot" selected="true">Gaussian Radial Basis Function</option>
+            <option value="polydot">Polynomial</option>
+            <option value="vanilladot">Linear</option>
+            <option value="tanhdot">Hyperbolic</option>
+            <option value="laplacedot">Laplacian</option>
+            <option value="besseldot">Bessel</option>
+            <option value="anovadot">ANOVA Radial Basis Function</option>
+            <option value="splinedot">Spline</option>
+        </param>
+        <when value="vanilladot" />
+        <when value="splinedot" />
+        <when value="rbfdot">
+            <param name="sigma" size="10" type="float" value="1" label="sigma (inverse kernel width)" />
+        </when>
+        <when value="laplacedot">
+            <param name="sigma" size="10" type="float" value="1" label="sigma (inverse kernel width)" />
+        </when>
+        <when value="polydot">
+            <param name="degree" size="10" type="integer" value="1" label="degree" />
+            <param name="scale" size="10" type="integer" value="1" label="scale" />
+            <param name="offset" size="10" type="integer" value="1" label="offset" />
+        </when>
+        <when value="tanhdot">
+            <param name="scale" size="10" type="integer" value="1" label="scale" />
+            <param name="offset" size="10" type="integer" value="1" label="offset" />
+        </when>
+        <when value="besseldot">
+            <param name="sigma" size="10" type="integer" value="1" label="sigma" />
+            <param name="order" size="10" type="integer" value="1" label="order" />
+            <param name="degree" size="10" type="integer" value="1" label="degree" />
+        </when>
+        <when value="anovadot">
+            <param name="sigma" size="10" type="integer" value="1" label="sigma" />
+            <param name="degree" size="10" type="integer" value="1" label="degree" />
+        </when>
+    </conditional>    
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+    <data format="pdf" name="out_file2" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+    <test>
+        <param name="input1" value="iris.tabular"/>
+        <param name="var_cols" value="1,2,3,4"/>
+        <param name="kernel" value="polydot"/>
+        <param name="features" value="2"/>
+        <param name="offset" value="0"/>
+        <param name="scale" value="1"/>
+        <param name="degree" value="2"/>
+        <output name="out_file1" file="kpca_out1.tabular"/>
+        <output name="out_file2" file="kpca_out2.pdf"/>
+    </test>
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Datasets-&gt;Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses functions from the 'kernlab' R package to perform Kernel Principal Component Analysis (kPCA) on the input data. It outputs two files: one containing the summary statistics of the kPCA, and the other containing a scatterplot matrix of the rotated values reported by kPCA.
+
+*Alexandros Karatzoglou, Alex Smola, Kurt Hornik, Achim Zeileis (2004). kernlab - An S4 Package for Kernel Methods in R. Journal of Statistical Software 11(9), 1-20. URL http://www.jstatsoft.org/v11/i09/*
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+This tool currently treats all variables as continuous numeric variables. Running the tool on categorical variables may produce incorrect results. Rows containing non-numeric (or missing) data in any of the chosen columns will be excluded from the analysis.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/multivariate_stats/pca.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/multivariate_stats/pca.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+infile = sys.argv[1]
+x_cols = sys.argv[2].split(',')
+method = sys.argv[3]
+outfile = sys.argv[4]
+outfile2 = sys.argv[5]
+
+if method == 'svd':
+    scale = center = "FALSE"
+    if sys.argv[6] == 'both':
+        scale = center = "TRUE"
+    elif sys.argv[6] == 'center':
+        center = "TRUE"
+    elif sys.argv[6] == 'scale':
+        scale = "TRUE"
+    
+fout = open(outfile,'w')
+elems = []
+for i, line in enumerate( file ( infile )):
+    line = line.rstrip('\r\n')
+    if len( line )>0 and not line.startswith( '#' ):
+        elems = line.split( '\t' )
+        break 
+    if i == 30:
+        break # Hopefully we'll never get here...
+
+if len( elems )<1:
+    stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+x_vals = []
+
+for k,col in enumerate(x_cols):
+    x_cols[k] = int(col)-1
+    x_vals.append([])
+
+NA = 'NA'
+skipped = 0
+for ind,line in enumerate( file( infile )):
+    if line and not line.startswith( '#' ):
+        try:
+            fields = line.strip().split("\t")
+            valid_line = True
+            for k,col in enumerate(x_cols):
+                try:
+                    xval = float(fields[col])
+                except:
+                    skipped += 1 
+                    valid_line = False
+                    break
+            if valid_line:
+                for k,col in enumerate(x_cols):
+                    xval = float(fields[col])
+                    x_vals[k].append(xval)
+        except:
+            skipped += 1
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+dat= r.list(array(x_vals1))
+
+set_default_mode(NO_CONVERSION)
+try:
+    if method == "cor":
+        pc = r.princomp(r.na_exclude(dat), cor = r("TRUE"))
+    elif method == "cov":
+        pc = r.princomp(r.na_exclude(dat), cor = r("FALSE"))
+    elif method=="svd":
+        pc = r.prcomp(r.na_exclude(dat), center = r(center), scale = r(scale))
+except RException, rex:
+    stop_err("Encountered error while performing PCA on the input data: %s" %(rex))
+
+set_default_mode(BASIC_CONVERSION)
+summary = r.summary(pc, loadings="TRUE")
+ncomps = len(summary['sdev'])
+
+if type(summary['sdev']) == type({}):
+    comps_unsorted = summary['sdev'].keys()
+    comps=[]
+    sd = summary['sdev'].values()
+    for i in range(ncomps):
+        sd[i] = summary['sdev'].values()[comps_unsorted.index('Comp.%s' %(i+1))]
+        comps.append('Comp.%s' %(i+1))
+elif type(summary['sdev']) == type([]):
+    sd = summary['sdev']
+    comps=[]
+    for i in range(ncomps):
+        comps.append('Comp.%s' %(i+1))
+
+print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+print >>fout, "#Std. deviation\t%s" %("\t".join(["%.4g" % el for el in sd]))
+total_var = 0
+vars = []
+for s in sd:
+    var = s*s
+    total_var += var
+    vars.append(var)
+for i,var in enumerate(vars):
+    vars[i] = vars[i]/total_var
+       
+print >>fout, "#Proportion of variance explained\t%s" %("\t".join(["%.4g" % el for el in vars]))
+
+print >>fout, "#Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+xcolnames = ["c%d" %(el+1) for el in x_cols]
+if 'loadings' in summary: #in case of princomp
+    loadings = 'loadings'
+elif 'rotation' in summary: #in case of prcomp
+    loadings = 'rotation'
+for i,val in enumerate(summary[loadings]):
+    print >>fout, "%s\t%s" %(xcolnames[i], "\t".join(["%.4g" % el for el in val]))
+
+print >>fout, "#Scores\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
+if 'scores' in summary: #in case of princomp
+    scores = 'scores'
+elif 'x' in summary: #in case of prcomp
+    scores = 'x'
+for obs,sc in enumerate(summary[scores]):
+    print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in sc]))
+
+r.pdf( outfile2, 8, 8 )
+r.biplot(pc)
+r.dev_off()
\ No newline at end of file
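Editor's note: the proportion-of-variance loop above can be checked by hand, since each component's variance is the square of its standard deviation, normalised by the total. A vectorised equivalent in numpy (the sdev values are hypothetical)::

    import numpy
    sd = numpy.asarray([2.05, 0.49, 0.28, 0.15])  # hypothetical sdev row
    print((sd ** 2) / (sd ** 2).sum())            # proportion of variance per component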
diff -r 000000000000 -r 9071e359b9a3 tools/multivariate_stats/pca.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/multivariate_stats/pca.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,100 @@
+<tool id="pca1" name="Principal Component Analysis" version="1.0.2">
+  <description> </description>
+  <command interpreter="python">
+    pca.py 
+      $input1
+      $var_cols
+      $methodChoice.method
+      $out_file1
+      $out_file2
+      #if $methodChoice.method == "svd":
+      $methodChoice.scale
+      #end if
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
+    <param name="var_cols" label="Select columns containing input variables " type="data_column" data_ref="input1" numerical="True" multiple="true" >
+        <validator type="no_options" message="Please select at least one column."/>
+    </param>
+    <conditional name="methodChoice">
+        <param name="method" type="select" label="Method" help="The correlation matrix can only be used if there are no constant variables">
+            <option value="cor" selected="true">Eigenvectors of Correlation (princomp)</option>
+            <option value="cov">Eigenvectors of Covariance (princomp)</option>
+            <option value="svd">Singular Value Decomposition (prcomp)</option>
+        </param>
+        <when value="cor" />
+        <when value="cov" />
+        <when value="svd">
+            <param name="scale" type="select" label="Centering and Scaling" help="Can be used to center and/or scale variables">
+                <option value="none" selected="true">None</option>
+                <option value="center">Center only</option>
+                <option value="scale">Scale only</option>
+                <option value="both">Center and Scale</option>
+            </param>        
+        </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+    <data format="pdf" name="out_file2" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+    <test>
+        <param name="input1" value="iris.tabular"/>
+        <param name="var_cols" value="1,2,3,4"/>
+        <param name="method" value="cor"/>
+        <output name="out_file1" file="pca_out1.tabular"/>
+        <output name="out_file2" file="pca_out2.pdf"/>
+    </test>
+    <test>
+        <param name="input1" value="iris.tabular"/>
+        <param name="var_cols" value="1,2,3,4"/>
+        <param name="method" value="cov"/>
+        <output name="out_file1" file="pca_out3.tabular"/>
+        <output name="out_file2" file="pca_out4.pdf"/>
+    </test>
+    <test>
+        <param name="input1" value="iris.tabular"/>
+        <param name="var_cols" value="1,2,3,4"/>
+        <param name="method" value="svd"/>
+        <param name="scale" value="both"/>
+        <output name="out_file1" file="pca_out5.tabular"/>
+        <output name="out_file2" file="pca_out6.pdf"/>
+    </test>
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Datasets-&gt;Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool performs Principal Component Analysis on the given numeric input data, using functions from the R statistical package: 'princomp' (for the eigenvector-based solution) and 'prcomp' (for the singular-value-decomposition-based solution). It outputs two files: one containing the summary statistics of the PCA, and the other containing biplots of the observations and principal components.
+
+*R Development Core Team (2009). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.*
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all variables as continuous numeric variables. Running the tool on categorical variables may produce incorrect results. Rows containing non-numeric (or missing) data in any of the chosen columns will be excluded from the analysis.
+
+- The summary statistics in the output are described below:
+
+  - Std. deviation: Standard deviations of the principal components
+  - Loadings: a list of eigenvectors/variable loadings
+  - Scores: Scores of the input data on the principal components
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mutation/visualize.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mutation/visualize.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,391 @@
+#!/usr/bin/env python
+
+'''
+Mutation Visualizer tool
+'''
+
+from __future__ import division
+
+import sys, csv, os, math
+import optparse
+
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "SVGFig" )
+import svgfig as svg
+
+
+SVGPan = """
+/**
+ *  SVGPan library 1.2
+ * ====================
+ *
+ * Given an unique existing element with id "viewport", including the
+ * library into any SVG adds the following capabilities:
+ *
+ *  - Mouse panning
+ *  - Mouse zooming (using the wheel)
+ *  - Object dragging
+ *
+ * Known issues:
+ *
+ *  - Zooming (while panning) on Safari has still some issues
+ *
+ * Releases:
+ *
+ * 1.2, Sat Mar 20 08:42:50 GMT 2010, Zeng Xiaohui
+ *      Fixed a bug with browser mouse handler interaction
+ *
+ * 1.1, Wed Feb  3 17:39:33 GMT 2010, Zeng Xiaohui
+ *      Updated the zoom code to support the mouse wheel on Safari/Chrome
+ *
+ * 1.0, Andrea Leofreddi
+ *      First release
+ *
+ * This code is licensed under the following BSD license:
+ *
+ * Copyright 2009-2010 Andrea Leofreddi (a.leofreddi@itcharm.com). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *       of conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Andrea Leofreddi ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Andrea Leofreddi OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation are those of the
+ * authors and should not be interpreted as representing official policies, either expressed
+ * or implied, of Andrea Leofreddi.
+ */
+
+var root = document.documentElement;
+
+var state = 'none', stateTarget, stateOrigin, stateTf;
+
+setupHandlers(root);
+
+/**
+ * Register handlers
+ */
+function setupHandlers(root){
+        setAttributes(root, {
+                "onmouseup" : "add(evt)",
+                "onmousedown" : "handleMouseDown(evt)",
+                "onmousemove" : "handleMouseMove(evt)",
+                "onmouseup" : "handleMouseUp(evt)",
+                //"onmouseout" : "handleMouseUp(evt)", // Decomment this to stop the pan functionality when dragging out of the SVG element
+        });
+
+        if(navigator.userAgent.toLowerCase().indexOf('webkit') >= 0)
+                window.addEventListener('mousewheel', handleMouseWheel, false); // Chrome/Safari
+        else
+                window.addEventListener('DOMMouseScroll', handleMouseWheel, false); // Others
+}
+
+/**
+ * Instance an SVGPoint object with given event coordinates.
+ */
+function getEventPoint(evt) {
+        var p = root.createSVGPoint();
+
+        p.x = evt.clientX;
+        p.y = evt.clientY;
+
+        return p;
+}
+
+/**
+ * Sets the current transform matrix of an element.
+ */
+function setCTM(element, matrix) {
+        var s = "matrix(" + matrix.a + "," + matrix.b + "," + matrix.c + "," + matrix.d + "," + matrix.e + "," + matrix.f + ")";
+
[...]
+        [...]of the dataset." % ( opts.ref_col, ref, count ) )
+        # display positions
+        if opts.zoom == 'interactive':
+            textx = 0
+        else:
+            textx = 7
+        bt = svg.SVG("tspan", str(position), style="font-family:Verdana;font-size:25%")
+        s.append(svg.SVG("text", bt, x=textx, y=34+(count*(HEIGHT+GAP)), stroke="none", fill="black"))
+        s.append(svg.SVG("rect", x=0, y=30+(count*(HEIGHT+GAP)), width=14, height=HEIGHT,
+                         stroke='none', fill=colors[ref.upper()], fill_opacity=0.2))
+
+        for sample_index, sample in enumerate(samples):
+            n_a = int(row[int(sample['a_col'])-1])
+            n_c = int(row[int(sample['a_col'])+1-1])
+            n_g = int(row[int(sample['a_col'])+2-1])
+            n_t = int(row[int(sample['a_col'])+3-1])
+            total = int(row[int(sample['totals_col'])-1])
+            # validate
+            base_error = validate_bases(n_a, n_c, n_g, n_t, total)
+            if base_error:
+                stop_error("For sample %i (%s), the number of base %s reads is more than the coverage on row %i." % (sample_index+1,
+                                                                                                                     sample['name'],
+                                                                                                                     base_error,
+                                                                                                                     count))
+
+            if total:
+                x = 16+(sample_index*(WIDTH+GAP))
+                y = 30+(count*(HEIGHT+GAP))
+                width = WIDTH
+                height = HEIGHT
+                if count%2:
+                    s.append(svg.SVG("rect", x=x, y=y, width=width, height=height,
+                                     stroke='none', fill='grey', fill_opacity=0.25))
+                else:
+                    s.append(svg.SVG("rect", x=x, y=y, width=width, height=height,
+                                     stroke='none', fill='grey', fill_opacity=0.25))
+
+                for base, value in enumerate([n_a, n_c, n_g, n_t]):
+                    width = int(math.ceil(value / total * WIDTH))
+                    s.append(svg.SVG("rect", x=x, y=y, width=width, height=BAR_WIDTH,
+                                     stroke='none', fill=colors[bases[base]], fill_opacity=0.6))
+                    y = y + BAR_WIDTH
+
+        count=count+1
+
+    if opts.zoom == 'interactive':
+        canv = svg.canvas(s)
+        canv.save(opts.output_file)
+        import fileinput
+        flag = False
+        for line in fileinput.input(opts.output_file, inplace=1):
+            if line.startswith('<svg'):
+                print '<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">'
+                flag = True
+                continue
+            else:
+                if flag:
+                    print '<script type="text/javascript">%s</script>' % SVGPan
+                flag = False
+            print line,
+    else:
+        zoom = int(opts.zoom)
+        w = "%ipx" % (x*(10+zoom))
+        h = "%ipx" % (y*(2+zoom))
+        canv = svg.canvas(s, width=w, height=h, viewBox="0 0 %i %i" %(x+100, y+100))
+        canv.save(opts.output_file)
+
+if __name__ == '__main__':
+    parser = optparse.OptionParser()
+    parser.add_option('-i', '--input-file', dest='input_file', action='store')
+    parser.add_option('-o', '--output-file', dest='output_file', action='store')
+    parser.add_option('-z', '--zoom', dest='zoom', action='store', default='1')
+    parser.add_option('-p', '--position_col', dest='position_col', action='store', default='c0')
+    parser.add_option('-r', '--ref_col', dest='ref_col', action='store', default='c1')
+    (opts, args) = parser.parse_args()
+    main(opts, args)
+    sys.exit(0)
+
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/mutation/visualize.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mutation/visualize.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,106 @@
+<tool id="mutation_visualize" name="Mutation Visualization" version="1.0.0">
+  <description></description>
+  <command interpreter="python">
+    visualize.py 
+        --input-file=$input1
+        --output-file=$out_file1
+        --zoom=$zoom_value
+        --position_col=$position_col
+        --ref_col=$ref_col
+        #for $f in $sample_chooser:
+            "${f.name}"
+            ${f.a_col}
+            ${f.totals_col}
+        #end for
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Compare sequences in"></param>
+    <param name="position_col" type="data_column" data_ref="input1" multiple="false" numerical="false" label="Position Column" help="" />
+    <param name="ref_col" type="data_column" data_ref="input1" multiple="false" numerical="false" label="Reference Base Column" help="" />
+
+    <repeat name="sample_chooser" title="Sample">
+      <param name="name" type="text" label="Label" help="Optional" />
+      <param name="a_col" type="data_column" data_ref="input1" multiple="false" numerical="false" label="Base A Column" help="" />
+      <param name="totals_col" type="data_column" data_ref="input1" multiple="false" numerical="false" label="Coverage Column" help="" />
+    </repeat>
+
+    <param name="zoom_value" type="select" label="Zoom">
+        <option value="interactive">Interactive</option>
+        <option value="1">1x</option>
+        <option value="2">2x</option>
+        <option value="3">3x</option>
+        <option value="4">4x</option>
+        <option value="5">5x</option>
+        <option value="6">6x</option>
+        <option value="7">7x</option>
+        <option value="8">8x</option>
+        <option value="9">9x</option>
+        <option value="10">10x</option>
+    </param>
+    
+  </inputs>
+  <outputs>
+    <data format="svg" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="mutation_data1.txt" ftype="tabular" />
+      <param name="position_col" value="2" />
+      <param name="ref_col" value="4" />
+      <param name="zoom_value" value="interactive" />
+      <param name="name" value="s1" />
+      <param name="a_col" value="5" />
+      <param name="totals_col" value="9" />
+      <output name="output" file="mutation_data1_interactive.svg" ftype="svg" />
+    </test>
+    <test>
+      <param name="input1" value="mutation_data1.txt" ftype="tabular" />
+      <param name="position_col" value="2" />
+      <param name="ref_col" value="4" />
+      <param name="zoom_value" value="3" />
+      <param name="name" value="s1" />
+      <param name="a_col" value="5" />
+      <param name="totals_col" value="9" />
+      <output name="output" file="mutation_data1_zoom3x.svg" ftype="svg" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool allows you to visualize mutations described in a tabular input file. It generates an SVG image which can be viewed in any web browser.
+
+You will need to specify the position and reference columns in the input file. Then click 'Add new Sample' for each sample in the input file that you would like to visualize. For each sample, specify the column for base 'A' and the coverage (totals) column, and optionally enter a label.
+This tool assumes that the columns for bases A, C, G, and T are placed consecutively and in that order in the input file.
+
+Interactivity: if the interactive zoom option is selected, the resulting image can be zoomed in or out with the mouse scroll wheel and panned by dragging with the left mouse button.
+
+-----
+
+**General Example**
+
+Given the input file::
+  
+  chrM    72      73      G   26394   4       49  0   26447   26398   1   23389   3       45  0   23437   23392   1
+  chrM    149     150     T   11      50422   2   96  50531   50435   1   4       45417   1   65  45487   45422   1
+  
+To visualize the two samples in the input file, the following parameters are selected before running the tool::
+  
+  Position column:        2
+  Reference Base column:  4
+  Sample 1 Label:         gm blood 
+  Sample 1 Base A column: 5 
+  Sample 1 Totals column: 9 
+  Sample 2 Label:         gm cheek 
+  Sample 2 Base A column: 12 
+  Sample 2 Totals column: 16 
+
+Visualization output:
+
+.. image:: ./static/images/mutation_visualization_example.png 
+   :width: 150
+   
+Here the left-most column shows the position, with the reference base's color as its background. Each column to its right describes one sample.
+In the output above, the blue bar is the longest, meaning that base A is the most frequent base at position 72 in both samples.
+
+  </help>
+</tool>
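Editor's note: the bar lengths in the figure come from the width = int(math.ceil(value / total * WIDTH)) scaling in visualize.py. A worked check on the example row above (WIDTH here is an assumed stand-in for the script's real column-width constant)::

    from __future__ import division
    import math

    WIDTH = 12                 # assumed value; see the constant in visualize.py
    n_a, total = 26394, 26447  # base-A count and coverage for chrM position 72
    print(int(math.ceil(n_a / total * WIDTH)))  # -> 12, so base A fills nearly the whole bar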
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/.DS_Store
Binary file tools/mytools/.DS_Store has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._.DS_Store
Binary file tools/mytools/._.DS_Store has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._StartGenometriCorr.xml
Binary file tools/mytools/._StartGenometriCorr.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._Start_GenometriCorr.R
Binary file tools/mytools/._Start_GenometriCorr.R has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._align2database.py
Binary file tools/mytools/._align2database.py has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._align2database.xml
Binary file tools/mytools/._align2database.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._align2multiple.xml
Binary file tools/mytools/._align2multiple.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._alignr.py
Binary file tools/mytools/._alignr.py has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._alignr.xml
Binary file tools/mytools/._alignr.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._alignvis.xml
Binary file tools/mytools/._alignvis.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._altschulEriksonDinuclShuffle.py
Binary file tools/mytools/._altschulEriksonDinuclShuffle.py has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._bed_to_bam.xml
Binary file tools/mytools/._bed_to_bam.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._bedclean.xml
Binary file tools/mytools/._bedclean.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._bedsort.xml
Binary file tools/mytools/._bedsort.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._bigWigAverageOverBed.xml
Binary file tools/mytools/._bigWigAverageOverBed.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._binaverage.xml
Binary file tools/mytools/._binaverage.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._bowtie2bed.pl
Binary file tools/mytools/._bowtie2bed.pl has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._bowtie2bed.xml
Binary file tools/mytools/._bowtie2bed.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._bwBinavg.xml
Binary file tools/mytools/._bwBinavg.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._cdf.r
Binary file tools/mytools/._cdf.r has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._cdf.xml
Binary file tools/mytools/._cdf.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._closestBed.xml
Binary file tools/mytools/._closestBed.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._collapseBed.py
Binary file tools/mytools/._collapseBed.py has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._collapseBed.xml
Binary file tools/mytools/._collapseBed.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._collapseTab.xml
Binary file tools/mytools/._collapseTab.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._convertEnsembl.xml
Binary file tools/mytools/._convertEnsembl.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._dreme.xml
Binary file tools/mytools/._dreme.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._endbias.xml
Binary file tools/mytools/._endbias.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._fastamarkov.xml
Binary file tools/mytools/._fastamarkov.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._fastashuffle1.xml
Binary file tools/mytools/._fastashuffle1.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._fastashuffle2.xml
Binary file tools/mytools/._fastashuffle2.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._fastqdump.xml
Binary file tools/mytools/._fastqdump.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._fimo2-old.xml
Binary file tools/mytools/._fimo2-old.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._fimo2.xml
Binary file tools/mytools/._fimo2.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._fimo2bed.py
Binary file tools/mytools/._fimo2bed.py has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._fimo2bed.xml
Binary file tools/mytools/._fimo2bed.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._genomeView.xml
Binary file tools/mytools/._genomeView.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._genomeview-old2.r
Binary file tools/mytools/._genomeview-old2.r has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._genomeview.r
Binary file tools/mytools/._genomeview.r has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._genomeview_notused
Binary file tools/mytools/._genomeview_notused has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._headtail.xml
Binary file tools/mytools/._headtail.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._intersectSig.xml
Binary file tools/mytools/._intersectSig.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._intersectbed.xml
Binary file tools/mytools/._intersectbed.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._intervalSize.xml
Binary file tools/mytools/._intervalSize.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._iupac2meme.xml
Binary file tools/mytools/._iupac2meme.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._makebigwig.sh
Binary file tools/mytools/._makebigwig.sh has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._makebigwig.sh-old
Binary file tools/mytools/._makebigwig.sh-old has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._makebigwig.xml
Binary file tools/mytools/._makebigwig.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._makewindow.xml
Binary file tools/mytools/._makewindow.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._meme.xml
Binary file tools/mytools/._meme.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._memelogo.xml
Binary file tools/mytools/._memelogo.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._metaintv.xml
Binary file tools/mytools/._metaintv.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._metaintv_ext.xml
Binary file tools/mytools/._metaintv_ext.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._phastCons.xml
Binary file tools/mytools/._phastCons.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._plotmatrix.xml
Binary file tools/mytools/._plotmatrix.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._r_wrapper.sh
Binary file tools/mytools/._r_wrapper.sh has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._r_wrapper_old.sh
Binary file tools/mytools/._r_wrapper_old.sh has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._random_interval.py
Binary file tools/mytools/._random_interval.py has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._random_interval.xml
Binary file tools/mytools/._random_interval.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._removeDuplicate.xml
Binary file tools/mytools/._removeDuplicate.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._resize.xml
Binary file tools/mytools/._resize.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._revcompl.py
Binary file tools/mytools/._revcompl.py has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._revcompl.xml
Binary file tools/mytools/._revcompl.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._sampline.py
Binary file tools/mytools/._sampline.py has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._seq2meme.py
Binary file tools/mytools/._seq2meme.py has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._seq2meme.xml
Binary file tools/mytools/._seq2meme.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._seqshuffle.py
Binary file tools/mytools/._seqshuffle.py has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._shuffleBed.py
Binary file tools/mytools/._shuffleBed.py has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._shuffleBed.xml
Binary file tools/mytools/._shuffleBed.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._shuffleSequenceUsingAltschulErikson.txt
Binary file tools/mytools/._shuffleSequenceUsingAltschulErikson.txt has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._spatial_proximity.xml
Binary file tools/mytools/._spatial_proximity.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._splicesite.xml
Binary file tools/mytools/._splicesite.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._splicesitescore
Binary file tools/mytools/._splicesitescore has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._stats.txt
Binary file tools/mytools/._stats.txt has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/._venn.xml
Binary file tools/mytools/._venn.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/AATAAA.motif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/AATAAA.motif Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+MEME version 4.4
+
+ALPHABET= ACGT
+
+strands: + -
+
+Background letter frequencies (from uniform background):
+A 0.25000 C 0.25000 G 0.25000 T 0.25000 
+
+MOTIF AATAAA 
+
+letter-probability matrix: alength= 4 w= 6 nsites= 20 E= 0
+  1.000000   0.000000   0.000000   0.000000
+  1.000000   0.000000   0.000000   0.000000
+  0.000000   0.000000   0.000000   1.000000
+  1.000000   0.000000   0.000000   0.000000
+  1.000000   0.000000   0.000000   0.000000
+  1.000000   0.000000   0.000000   0.000000
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/StartGenometriCorr.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/StartGenometriCorr.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+<tool id="genometric_correlation" name="Genometric Correlation">
+<description>between two files of genomic intervals</description>
+<command interpreter="Rscript --vanilla">
+Start_GenometriCorr.R $config $query $reference $output_options $output
+</command>
+<inputs>
+<param format="text" name="config" type="data" label="Configuration file"/>
+<param format="text" name="query" type="data" label="Query intervals file"/>
+<param format="text" name="reference" type="data" label="Reference intervals file"/>
+<param name="output_options" type="select" label="Type of output">
+<option value="plot">ECDF plots</option>
+<option value="vis">Graphic visualization</option>
+<option value="stats">Text output of statistics</option>
+<option value="plot_vis">All</option>
+</param>
+</inputs>
+<outputs>
+<data name="output" format="pdf"/>
+</outputs>
+<help>
+This tool determines the statistical relationship (if any) between two sets of genomic intervals. Output can be text only, plot (ECDF curves), or a more colorful graphic.
+</help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/Start_GenometriCorr.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/Start_GenometriCorr.R Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,105 @@
+# Start_GenometriCorr.R
+
+###################################################
+#                                                 #
+#  command-line interface to GenometriCorr        #
+#  functions, for use with Galaxy.                #
+#                                                 #
+###################################################
+
+capture.output <- function (result, pdffile, output_options)
+{
+   if(output_options != "stats")
+   {
+      pdf(file=pdffile, width=10, height=19, paper="special")
+   
+      if (output_options != "vis")   #need to do a plot
+      {
+         mymat <- matrix(ncol=3, nrow=4)
+         mymat[1,1] <- 1
+         mymat[1,2] <- 2
+         mymat[1,3] <- 3
+         mymat[2,1] <- 4
+         mymat[2,2] <- 5
+         mymat[2,3] <- 6
+         mymat[3,1] <- 7
+         mymat[3,2] <- 8
+         mymat[3,3] <- 9
+         mymat[4,1] <- 10
+         mymat[4,2] <- 11
+         mymat[4,3] <- 12
+       
+         layout(mymat, heights=c(0.2,0.2,0.2,0.2))
+         plot(result, pdffile, make.new=FALSE)
+      }
+      if (output_options != "plot")  #need to do the bigger graphic
+      {
+         mymat <- matrix(ncol=2, nrow=8)
+         mymat[1,1] <- 2
+         mymat[1,2] <- 3
+         mymat[2,1] <- 4
+         mymat[2,2] <- 4
+         mymat[3,1] <- 1
+         mymat[3,2] <- 1
+         mymat[4,1] <- 5
+         mymat[4,2] <- 6
+         mymat[5,1] <- 7
+         mymat[5,2] <- 7
+         mymat[6,1] <- 8
+         mymat[6,2] <- 9 
+         mymat[7,1] <- 10
+         mymat[7,2] <- 10
+         mymat[8,1] <- 11
+         mymat[8,2] <- 12
+         layoutresults <- 3
+         
+         layout(mymat, heights=c(0.05,0.05,0.15,0.15,0.15,0.15,0.15,0.15))
+         visualize(result, pdffile, make.new=FALSE) 
+      }
+      dev.off()
+   } 
+   
+   if (output_options == "stats")
+   {
+      show(result)
+   }
+}
+
+
+
+# Reads the command line arguments
+args <- commandArgs(trailingOnly=T)
+
+suppressPackageStartupMessages(library('GenometriCorr',  warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('graphics',  warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('gdata',  warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('gplots',  warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('gtools',  warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('caTools',  warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('grid',  warn.conflicts=F, verbose=F))
+
+
+
+# Variables
+query_file <- ""
+reference_file <- ""
+config_file <- ""
+output_options <- ""
+
+# Parse the command line arguments
+
+config_file <- args[1]
+query_file <- as.character(args[2])
+reference_file <- as.character(args[3])
+output_options <- args[4]
+pdffile <- args[5]
+
+conf<-new("GenometriCorrConfig",config_file)
+
+print('OK')
+
+result<-suppressWarnings(suppressPackageStartupMessages(GenometriCorr:::run.config(conf,query=query_file,reference=reference_file)))
+print('OK2')
+
+hideoutput <- capture.output(result, pdffile=args[5], output_options)
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/align2database.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/align2database.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,104 @@
+'''
+align multiple bed to one bed
+python align_multiple.py hmChIP_mm9_peak_bed/mm9-GSE19999_PolII_P25_all.cod.bed hmChIP_mm9_peak_bed/ test.txt test.pdf 100 5000
+'''
+
+import os,sys,random
+def main():
+    queryfile = sys.argv[1]
+    inpath = sys.argv[2]
+    outputdata = sys.argv[3]
+    outputerr = sys.argv[4]
+    barplotpdf = sys.argv[5]
+    min_feat = sys.argv[6] # min features overlap
+    windowsize = sys.argv[7]
+    anchor = sys.argv[8]
+    span = sys.argv[9] # loess smooth parameter
+        
+    inpath = inpath.rstrip('/')
+    #os.system('rm '+inpath+'/*tmp*')
+
+    infiles = os.listdir(inpath)
+
+    #print len(infiles),' files\n'
+    i = 0
+    for infile in infiles:
+        if 'tmp' in infile:
+            #os.system('rm '+inpath+'/'+infile)
+            continue
+        i = i +1
+        print i,infile
+        output = infile.split('/')[-1]+'-to-'+queryfile.split('/')[-1]#'.summary'
+        if anchor == 'database':
+            command = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -b '+inpath+'/'+infile+' -a '+queryfile+' -o '+output+' --summary-only -q -w '+windowsize
+        else:
+            command = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -a '+inpath+'/'+infile+' -b '+queryfile+' -o '+output+' --summary-only -q -w '+windowsize            
+        #print command+'\n'
+        os.system(command)
+    print 'start visualization...'
+    # visualize
+    rscriptfile = 'f-'+str(random.random())+'.r'
+    r = open(rscriptfile,'w')
+    r.write("files <- dir('.','summary',full.name=T)\n")
+    #r.write("print(files)\n")    
+    r.write("x <- read.table(files[1])\n")
+    r.write("err <- read.table(gsub('summary','standarderror',files[1]))\n")
+    r.write("for (filename in files[2:length(files)]){\n")
+    r.write("   x <- rbind(x,read.table(filename))\n")
+    r.write("   err <- rbind(err,read.table(gsub('summary','standarderror',filename)))\n")    
+    r.write("}\n")
+    r.write("x <- x[x[,2] > "+min_feat+",]\n")
+    r.write("err <- err[err[,2] > "+min_feat+",]\n")    
+    r.write("name <- as.character(x[,1])\n")
+    r.write("nfeat <- x[,2]\n")
+    r.write("x <- x[,3:ncol(x)]\n")
+    r.write("err <- err[,3:ncol(err)]\n")    
+    r.write("for (i in 1:nrow(x)) {\n")
+    r.write("    name[i] <- strsplit(name[i],'-to-')[[1]][1]\n")
+    r.write("}\n")
+    # remove rows that have no variation, which cause problem in heatmap. This is the case when align to itself
+    r.write("toremove <- seq(nrow(x))\n")
+    r.write("for (i in 1:nrow(x)){\n")
+    r.write("    toremove[i] <- all(x[i,] == x[i,1])\n")
+    r.write("}\n")
+    r.write("x <- x[!toremove,]\n")
+    r.write("err <- err[!toremove,]\n")
+    r.write("name <- name[!toremove]\n")
+    r.write("nfeat <- nfeat[!toremove]\n")
+    r.write("write.table(cbind(name,nfeat,x),file='"+outputdata+"',sep ='\\t',quote=F,row.names=F,col.names=F)\n")
+    r.write("write.table(cbind(name,nfeat,err),file='"+outputerr+"',sep ='\\t',quote=F,row.names=F,col.names=F)\n")
+        
+    r.write("pdf('"+barplotpdf+"')\n")
+    r.write("heatmap(t(scale(t(as.matrix(x,nc=ncol(x))))),Colv=NA,labRow=name,labCol=NA,margins=c(1,8),cexRow=0.02+1/log(nrow(x)),col=heat.colors(1000))\n")
+
+    if windowsize != '0' :
+        r.write("xticks <- seq(-"+windowsize+","+windowsize+",length.out=100)\n")
+        r.write("xlab <- 'relative position (bp)'\n")
+    else:
+        r.write("xticks <- seq(100)\n")
+        r.write("xlab <- 'relative position (bin)'\n")
+        
+    #r.write("plot(xticks,colSums(t(scale(t(as.matrix(x,nc=ncol(x)))))),xlab='relative position (bp)',type='l',lwd=2,main='total signal')\n")
+    r.write("for (i in 1:nrow(x)) {\n")
+    r.write("   avg <- x[i,]/nfeat[i]\n")
+    #r.write("   maxv <- max(avg)\n")
+    #r.write("   minv <- min(avg)\n")
+    #r.write("   medv <- median(avg)\n")
+    #r.write("   if (maxv < "+fold+"*medv | minv*"+fold+">medv){next}\n")
+    #smooth
+    if float(span) >= 0.1:
+        r.write("   avg = loess(as.numeric(avg)~xticks,span="+span+")$fitted\n")
+        r.write("   err[i,] = loess(as.numeric(err[i,])~xticks,span="+span+")$fitted\n")
+    r.write("   par(cex=1.5)\n")
+    r.write("   plot(xticks,avg,ylab='average coverage',main=paste(name[i],'\n n=',nfeat[i],sep=''),xlab=xlab,type='l',lwd=1,ylim=c(min(avg-err[i,]),max(avg+err[i,])))\n")   
+    r.write("   polygon(c(xticks,rev(xticks)),c(avg+err[i,],rev(avg-err[i,])),col='slateblue1',border=NA)\n")
+    r.write("   lines(xticks,avg,type='l',lwd=1)\n")   
+    r.write("}\n")
+    r.write("dev.off()\n")
+    r.close()
+    os.system("R --vanilla < "+rscriptfile)    
+    os.system('rm '+rscriptfile)
+    os.system('rm *.summary')
+    os.system('rm *.standarderror')
+
+main()
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/align2database.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/align2database.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,54 @@
+<tool id="align2database" name="align-to-database">
+  <description> features </description>
+  <command interpreter="python"> align2database.py $query $database $output_coverage $output_standarderror $output_plot $minfeat $windowsize $anchor $span > $outlog </command>
+  <inputs>
+    <param name="query" type="data" format="interval" label="Query intervals" help= "keep it small (less than 1,000,000 lines)"/>
+    <param name="database" type="select" label="Feature database">
+     <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/feature_database" selected="true">All mm9 features (over 200)</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation">Annotated mm9 features</option>   
+     <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/CLIP">protein bound RNA (CLIP) mm9 </option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/conservedmiRNAseedsite">conserved miRNA target sites mm9 </option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/all-feature">Human ChIP hmChIP database hg18</option>
+      <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/gene-feature">Human gene features hg18</option>
+       <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/hg19/conservedmiRNAseedsite">conserved miRNA target sites hg19 </option>
+    </param>
+    <param name="anchor" label="Anchor to query features" help="default is anchoring to database features" type="boolean" truevalue="query" falsevalue="database" checked="False"/>
+        <param name="windowsize" size="10" type="integer" value="5000" label="Window size (-w)"  help="will create new intervals of w bp flanking the original center; set to 0 to keep the original interval size"/>
+    <param name="minfeat" size="10" type="integer" value="100" label="Minimum number of query intervals hits" help="database features overlapping with too few query intervals are discarded"/>
+        <param name="span" size="10" type="float" value="0.1" label="loess span: smoothing parameter" help="value less than 0.1 disables smoothing"/>
+    <param name="outputlabel" size="80" type="text" label="Output label" value="test"/>
+   
+</inputs>
+  <outputs>
+      <data format="txt" name="outlog" label="${outputlabel} (log)"/> 
+    <data format="tabular" name="output_standarderror" label="${outputlabel} (standard error)"/> 
+    <data format="tabular" name="output_coverage" label="${outputlabel} (coverage)"/> 
+    <data format="pdf" name="output_plot" label="${outputlabel} (plot)"/> 
+  </outputs>
+  <help>
+
+**Example output**
+
+.. image:: ./static/operation_icons/align_multiple2.png
+
+
+**What it does**
+
+This tool aligns a query interval set (such as ChIP peaks) to a database of features (such as other ChIP peaks or TSS/splice sites), then calculates and plots the relative distance of database features to the query intervals. Two types of feature databases are currently available:  
+
+-- **ChIP peaks** from 191 ChIP experiments (processed from hmChIP database, see individual peak/BED files in **Shared Data**)
+
+-- **Annotated gene features**, such as: TSS, TES, 5'ss, 3'ss, CDS start and end, miRNA seed matches, enhancers, CpG island, microsatellite, small RNA, poly A sites (3P-seq-tags), miRNA genes, and tRNA genes. 
+
+Two output files are generated. One is the coverage/profile for each feature in the database that meets the minimum overlap with the query set: the first two columns are the feature name and the total number of overlapping intervals from the query, and columns 3 through 102 are the coverage in each bin. The other file is a PDF plotting both a heatmap of all features and the average coverage of each individual database feature.
+
+
+**How it works**
+
+For each interval/peak in the query file, a window (default 10,000bp) is created around the center of the interval and divided into 100 bins. For each database feature set (such as Pol II peaks), the tool counts how many intervals in the database feature file overlap with each bin. The count is then averaged over all query intervals that have at least one hit in at least one bin. The plotted 'average coverage' therefore represents the fraction of query intervals (only those with hits; the number is shown in each plot title) that have a database feature covering that bin. In the extreme case where the database feature set is identical to the query, every query interval is covered at its center, and the average coverage of the center bin is 1.
+
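+As a rough sketch (a hypothetical Python helper, not the tool's actual code), the per-bin counting works like this::
+
+  def bin_hits(center, window, nbins, features):
+      # mark which of the nbins bins spanning [center-window, center+window)
+      # are covered by at least one database feature (start, end)
+      left, right = center - window, center + window
+      binsize = 2.0 * window / nbins
+      hits = [0] * nbins
+      for start, end in features:
+          if end <= left or start >= right:
+              continue  # feature misses the window entirely
+          lo = max(int((start - left) / binsize), 0)
+          hi = min(int((end - 1 - left) / binsize), nbins - 1)
+          for b in range(lo, hi + 1):
+              hits[b] = 1
+      return hits
+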
+The heatmap is scaled for each row before clustering.
+
+  </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/align2multiple.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/align2multiple.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,109 @@
+<tool id="align2multiple" name="align-to-multiple">
+  <description>features</description>
+  <command>cat $script_file | R --vanilla --slave > $logfile </command>
+  <inputs>   
+      <param name="query" type="data" format="interval" label="Query intervals" help= "keep it small (less than 1,000,000 lines)"/>
+      <param name="label" type="text" value="" size="30" label="Data Label"/>
+    <param name="windowsize" size="10" type="integer" value="5000" label="radius of the window"  help="will create new intervals of w bp flanking the original center; set to 0 to keep the original interval size"/>
+    <param name="nbins" size="10" type="integer" value="20" label="Number of bins dividing the window"/>
+    <param name="sort" label="Sort intervals" help="Sort by the center of the first input, then the second input, then third..." type="boolean" truevalue="sort" falsevalue="none" checked="True"/>
+    <repeat name="series" title="input file">
+      <param name="label" type="text" value="" size="30" label="Data Label"/>
+      <param name="input" type="data" format="interval" label="Dataset"/>
+    </repeat>       
+  </inputs>
+
+  <configfiles>
+    <configfile name="script_file">
+      ## Setup R error handling to go to stderr
+      cat('\n[',date(),'] Start running job\n')
+      options(warn=-1)
+      windowsize = as.integer("$windowsize")
+      labels = '$label'
+      ## align query to itself
+      cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -a $query -b $query -o $label-$label --profile-only -q -w $windowsize -n $nbins'
+      cat('\n[',date(),'] ',cmd,'\n')
+      system(cmd)
+      ## align other sets to query
+      #for $i,$s in enumerate( $series )
+        labels = c(labels,'$s.label.value')
+        cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -a $s.input.file_name -b $query -o $label-$s.label.value --profile-only -q -w $windowsize -n $nbins'
+        cat('\n[',date(),'] ',cmd,'\n')
+        system(cmd)
+      #end for
+      cat('\n[',date(),'] Read output\n')
+      ## read output of query2query
+      print(paste(labels[1],labels[1],sep='-'))
+      x = read.table(paste(labels[1],labels[1],sep='-'))
+      ids = as.character(x[,1])
+      nfeat = nrow(x)
+      x = as.matrix(x[,3:ncol(x)])
+      nbin = ncol(x)
+            
+      ## a table mapping id to position
+      ind = list()
+      for (i in 1:nfeat){
+          ind[[ids[i]]] = i
+      }
+      ## read other output files
+      for (i in 2:length(labels)){
+          print(paste(labels[1],labels[i],sep='-'))
+          x0 = read.table(paste(labels[1],labels[i],sep='-'))
+          ids0 = as.character(x0[,1])
+          x0 = as.matrix(x0[,3:ncol(x0)])
+          x1 = matrix(0,nfeat,nbin)
+          for (j in 1:nrow(x0)){
+              #cat(j,'\t',ids0[j],'\t',ind[[ids0[j]]],'\n')
+              x1[ind[[ids0[j]]],] = x0[j,]                    
+          }
+          x = cbind(x,x1)          
+      }  
+      ## reorder
+      if ("${sort}" == "sort"){
+          cat('\n[',date(),'] Sort intervals\n')
+          for (i in rev(2:length(labels))){
+              x = x[order(x[,i*nbin-nbin/2]>0),]
+          }
+      }
+      png("${out_file1}")
+      ##par(mfrow=c(2,length(labels)),mar=c(1,1,4,1))
+      layout(matrix(seq(2*length(labels)),nrow=2,byrow=T),heights=c(1,5))
+      cat('\n[',date(),'] Plot summary\n')
+      par(mar=c(0,0,4,0)+0.1)
+      for (i in 1:length(labels)){
+          plot(colSums(x[,((i-1)*nbin+1):(i*nbin)]),type='l',axes=F,main=labels[i])
+      }
+      cat('\n[',date(),'] Plot heatmap\n')
+      par(mar=c(0,0,0,0)+0.1)
+      for (i in 1:length(labels)){
+          image(-t(log2(1+x[,((i-1)*nbin+1):(i*nbin)])),axes=F)
+      }
+      dev.off()  
+      cat('\n[',date(),'] Finished\n')
+
+    </configfile>
+  </configfiles>
+
+  <outputs>
+    <data format="txt" name="logfile" label="${tool.name} on ${on_string}: (log)" />
+    <data format="png" name="out_file1" label="${tool.name} on ${on_string}: (plot)" />
+  </outputs>
+
+<help>
+.. class:: infomark
+
+This tool allows you to check the co-localization pattern of multiple interval sets. All interval sets are aligned to the center of the intervals in the query interval set.
+
+Each row represents a window of a certain size around the center of one interval in the query set, such as a ChIP peak. Each heatmap shows the positions of other features in the SAME window (the same row in each heatmap represents the same interval/genomic position).
+
+
+The example below shows that half of all Fox2 peaks are within 1kb of a TSS. Of the half outside TSS, about one half carries H3K4me1, and two thirds of those are further depleted of H3K4me3.  
+
+-----
+
+**Example**
+
+.. image:: ./static/images/align2multiple.png
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/alignr.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/alignr.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,353 @@
+'''
+the script takes two files as input, and computes the coverage of
+features in input 1 across features in input 2. Features in input 2 are
+divided into bins and coverage is computed for each bin.
+
+please check the help information by typing:
+
+    python align.py -h
+
+
+requirement:
+    please install the following tools first:
+    bedtools:   for read/region overlapping, http://code.google.com/p/bedtools/
+
+'''
+
+import os,sys,os.path
+from optparse import OptionParser
+
+def lineCount(filename):
+    with open(filename) as f:
+        for i, l in enumerate(f):
+            pass
+    return i + 1
+
+def combineFilename(f1,f2):
+    '''
+    fuse two file names into one
+    '''
+    return f1.split('/')[-1]+'-'+f2.split('/')[-1]
+
+def checkFormat(filename1,filename2,input1format):
+    '''
+    check the format of input files
+    '''
+
+    # file1
+    # read the first line, see how many fields
+    ncol1 = 6
+    if input1format == "BED":
+        f = open(filename1)
+        line = f.readline().strip().split('\t')
+        ncol1 = len(line)
+        if ncol1 < 3:
+            print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
+            sys.exit(1)
+        f.close()
+
+    # file2
+    f = open(filename2)
+    line = f.readline().strip().split('\t')
+    ncol2 = len(line)
+    if ncol2 < 3:
+        print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
+        sys.exit(1)
+
+    return ncol1,ncol2
+
+
+def makeBed(filename,ncol):
+    '''
+    pad the file out to 6 columns
+    '''
+    f = open(filename)
+    outfile = filename+'.tmp.bed'
+    outf = open(outfile,'w')
+    if ncol == 3:
+        for line in f:
+            outf.write(line.strip()+'\t.\t0\t+\n')
+    elif ncol == 4:
+        for line in f:
+            outf.write(line.strip()+'\t0\t+\n')
+    if ncol == 5:
+        for line in f:
+            outf.write(line.strip()+'\t+\n')
+    f.close()
+    outf.close()
+    return outfile
+
+def makeWindow(filename,window):
+
+    outfile = filename+'-window='+str(window)+'.tmp.bed'
+    if not os.path.exists(outfile):
+        f=open(filename)
+        out = open(outfile,'w')
+        lines = f.readlines()
+        if 'track' in lines[0]:
+            del lines[0]
+        for line in lines:
+            flds = line.strip().split('\t')
+
+            #new position
+            center = (int(flds[1]) + int(flds[2]))/2
+            start = center - window
+            end = center + window
+            if start >= 0:
+                flds[1] = str(start)
+                flds[2] = str(end)
+                out.write('\t'.join(flds)+'\n')
+        f.close()
+        out.close()
+    return outfile
+
+def groupReadsMapped2aRegion(filename,ncol):
+    '''
+    read output from intersectBed
+    find all reads mapped to each region
+    '''
+    try:
+        f=open(filename)
+        #If filename cannot be opened, print an error message and exit
+    except IOError:
+        print "could not open",filename,"Are you sure this file exists?"
+        sys.exit(1)
+    lines = f.readlines()
+
+    allReadsStart = {}
+    allReadsEnd = {}
+    regionStrand = {}
+    regionStart = {}
+    regionEnd = {}
+
+    for line in lines:
+        flds = line.strip().split('\t')
+        key = '_'.join(flds[ncol:(ncol+4)])
+        if not allReadsStart.has_key(key):
+            allReadsStart[key] = list()
+            allReadsEnd[key] = list()
+        #print flds[ncol+0],flds[ncol+1],flds[ncol+2]
+        allReadsStart[key].append(int(flds[1]))
+        allReadsEnd[key].append(int(flds[2]))
+        regionStrand[key] = flds[ncol+5]
+        regionStart[key] = int(flds[ncol+1])
+        regionEnd[key] = int(flds[ncol+2])
+    return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd)
+
[... middle of this file truncated in the source dump; the diff resumes mid-line below ...]
+    ...ions.inputB))
+
+    if not options.plot:
+        if options.aformat == "BAM":
+            cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split '
+        else:
+            cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB)
+        if options.strandness:
+            cmd = cmd + ' -s'
+        cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed'
+        if not options.quiet:
+            print "search for overlappings: "+cmd
+        status = os.system(cmd)
+        if status != 0:
+            sys.exit(1)
+
+    if not options.quiet:
+        print 'group reads mapped to the same region...'
+
+    allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol)
+
+    if len(allReadsStart) == 0:
+        if not options.quiet:
+            print 'no overlap found!!'
+        os.system('rm *tmp.*')
+        sys.exit(1)
+
+    if not options.quiet:
+        print 'count number of reads mapped to each bin...'
+
+    RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins)
+
+    if options.output_data == None:
+        options.output_data = output+'.txt'
+
+    if options.summary_only:
+        saveSummary(options.output_data,RegionProfile,options.nbins)
+    else:
+        saveProfile(options.output_data,RegionProfile,nRead)
+
+    if not options.quiet:
+        print 'results saved to: '+ options.output_data
+
+    if not (options.summary_only or options.profile_only ):
+        # visualize
+
+        if options.window < 1:
+            xlab = 'relative position (bins)'
+        else:
+            xlab = 'relative position (bp)'
+
+        if options.output_plot == None:
+            options.output_plot = output+'.pdf'
+
+        title = options.plot_title+'\n n = '+str(len(RegionProfile))
+
+        rscript = open("tmp.r","w")
+        rscript.write("x <- read.table('"+options.output_data+"')\n")
+        rscript.write("pdf('"+options.output_plot+"')\n")
+        rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n")
+        rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n")
+
+        if options.window == 0:
+            rscript.write("xticks <- seq("+str(options.nbins)+")\n")
+        else:
+            rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n")
+
+        if options.ylim != 'min,max':
+            rscript.write("ylim=c("+options.ylim+")\n")
+        else:
+            rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+        rscript.write("par(cex=1.5)\n")
+        #smooth
+        if options.span >= 0.1:
+            rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n")
+            rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n")
+        rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n")
+        rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n")
+        rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
+        #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n")
+        #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n")
+        #rscript.write("lines(xticks,avg,lwd=2)\n")
+        #rscript.write("lines(xticks,avg-err,col='green')\n")
+        #rscript.write("lines(xticks,avg+err,col='green')\n")
+        rscript.write("dev.off()\n")
+        rscript.close()
+
+        os.system("R --vanilla < tmp.r")
+
+    # remove intermediate output
+    os.system('rm *tmp.*')
+
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/alignr.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/alignr.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,142 @@
+<tool id="alignr" name="align">
+  <description>two interval sets</description>
+  <command interpreter="python"> alignr.py -a $inputa -w $windowsize -n $nbins -o $output_data -v $output_plot $stranded  -q -l $outputlabel --ylim=$ylim --span $span
+    #if $inputb_source_type.inputb_select == "user":
+          -b "$inputb"
+    #else:
+        -b "${inputb_source_type.selectedb.fields.value}"
+    #end if
+    #if $inputa_format.inputa_select == "BAM":
+    -f BAM
+    #end if
+  </command>
+  <inputs>
+    <conditional name="inputa_format">
+     <param name="inputa_select" type="select" label="Select your first input format" >
+ <option value="BED" selected="true">BED-like (chrNum start end ...) </option>
+ <option value="BAM"> BAM</option>
+     </param>
+     <when value="BED">
+     <param name="inputa" type="data" format="interval" label="Input file for the first interval set (-a)"/>
+     </when>
+     <when value="BAM">
+     <param name="inputa" type="data" format="bam" label="Input file for the first interval set (-a)"/>
+     </when>
+    </conditional>
+    <conditional name="inputb_source_type">
+        <param name="inputb_select" type="select" label="Input source for the second interval set">
+            <option value="mm9ucsc" selected="true">mm9 ucsc knownGene annotations</option>
+            <option value="mm9refseq">mm9 refseq gene annotations</option>
+            <option value="mm9ensembl">mm9 ensembl gene annotations</option>
+            <option value="hg18ucsc" >hg18 ucsc knownGene annotations</option>
+            <option value="hg18refseq">hg18 refseq gene annotations</option>
+            <option value="hg18ensembl">hg18 ensembl gene annotations</option>
+            <option value="user">Dataset in Your History</option>
+        </param>
+        <when value="user">
+            <param name="inputb" type="data" format="interval" label="Input file for the second interval set (-b)" />
+        </when>
+        <when value="mm9ucsc">
+            <param name="selectedb" type="select" label="Input for the second interval set (-b)" >
+                <options from_file="aligndb-mm9-knownGene.loc">
+                    <column name="name" index="0"/>
+                    <column name="value" index="1"/>
+                </options>
+            </param>
+        </when>
+        <when value="mm9refseq">
+            <param name="selectedb" type="select" label="Input for the second interval set (-b)" >
+                <options from_file="aligndb-mm9-refGene.loc">
+                    <column name="name" index="0"/>
+                    <column name="value" index="1"/>
+                </options>
+            </param>
+        </when>
+        <when value="mm9ensembl">
+            <param name="selectedb" type="select" label="Input for the second interval set (-b)" >
+                <options from_file="aligndb-mm9-ensGene.loc">
+                    <column name="name" index="0"/>
+                    <column name="value" index="1"/>
+                </options>
+            </param>
+        </when>
+        <when value="hg18ucsc">
+            <param name="selectedb" type="select" label="Input for the second interval set (-b)" >
+                <options from_file="aligndb-hg18-knownGene.loc">
+                    <column name="name" index="0"/>
+                    <column name="value" index="1"/>
+                </options>
+            </param>
+        </when>
+        <when value="hg18refseq">
+            <param name="selectedb" type="select" label="Input for the second interval set (-b)" >
+                <options from_file="aligndb-hg18-refGene.loc">
+                    <column name="name" index="0"/>
+                    <column name="value" index="1"/>
+                </options>
+            </param>
+        </when>
+        <when value="hg18ensembl">
+            <param name="selectedb" type="select" label="Input for the second interval set (-b)" >
+                <options from_file="aligndb-hg18-ensGene.loc">
+                    <column name="name" index="0"/>
+                    <column name="value" index="1"/>
+                </options>
+            </param>
+        </when>
+                                                
+    </conditional>    
+    <param name="windowsize" size="10" type="integer" value="0" label="Change input 2 interval size (-w)"  help="will create new intervals of w bp flanking the original center; set to 0 to keep the original interval size"/>
+    <param name="nbins" size="10" type="integer" value="100" label="Number of bins dividing intervals in input 2 (-n)"/>
+    <param name="span" size="10" type="float" value="0.1" label="loess span: smoothing parameter" help="value less than 0.1 disables smoothing"/>
+    <param name="stranded" label="Check if require overlap on the same strand (-s)" type="boolean" truevalue="-s" falsevalue="" checked="False"/>
+    <param name="outputlabel" size="80" type="text" label="Output label" value="test"/>
+    <param name="ylim" size="10" type="text" label="set ylim of the plot" value="min,max" help="e.g. 0,1 (default is the min and max of the signal)"/>
+   
+</inputs>
+  <outputs>
+    <data format="tabular" name="output_data" label="${outputlabel} (data)"/> 
+    <data format="pdf" name="output_plot" label="${outputlabel} (plot)"/> 
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool aligns two sets of intervals, finds overlaps, calculates and plots the coverage of the first set across the second set. Applications include:  
+
+- check read distribution around TSS/poly A site/splice site/motif site/miRNA target site
+- check relative position/overlap of two lists of ChIP-seq peaks
+
+Two output files are generated. One is the coverage/profile for each interval in input 2: the first two columns are the interval ID and the total number of overlapping intervals from input 1, and columns 3 through nbins+2 are the coverage in each bin. The other file is a PDF plotting the average coverage of each bin. To modify the visualization, please download the coverage file and make your own plots.
+
+-----
+
+**Annotated features**
+
+Currently supports mouse genome build mm9 and human hg18. Each interval spans 1000bp upstream and 1000bp downstream of a feature such as TSS. Features with overlapping exons in the intronic/intergenic part of the 2000bp interval are removed.
+
+-----
+
+**Usage**
+
+  -h, --help        show this help message and exit
+  -a INPUTA         (required) input file A, BED-like (first 3 columns: chr, start, end) or BAM format. The
+                    script computes the depth of coverage of features in file
+                    A across the features in file B
+  -b INPUTB         (required) input file B, BED format or MACS peak file.
+                    Requires a unique name for each line in column 4
+  -m                inputB is a MACS peak file.
+  -f AFORMAT        Format of input file A. Can be BED (default) or BAM
+  -w WINDOW         Generate new inputB by making a window of 2 x WINDOW bp
+                    (in total) flanking the center of each input feature
+  -n NBINS          number of bins. Features in B are binned, and the coverage
+                    is computed for each bin. Default is 100
+  -s                enforce strandness: require overlapping on the same
+                    strand. Default is off
+  -p                load an existing intersectBed output file
+  -q                suppress output on screen
+  -o OUTPUTPROFILE  (optional) output profile name.
+  -v PLOTFILE       (optional) plot file name
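+
+For example, a typical command line (hypothetical file names), mirroring the tool's Galaxy invocation::
+
+    python alignr.py -a reads.bed -b tss.bed -w 1000 -n 100 -o coverage.txt -v profile.pdf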
+  </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/alignvis.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/alignvis.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,76 @@
+import sys,os
+
+infile = sys.argv[1]
+outfile = sys.argv[2]
+uselog = sys.argv[3]
+subset = sys.argv[4]
+reorder = sys.argv[5]
+color = sys.argv[6]
+scale = sys.argv[7] # rescale each row
+rscript = open('tmp.r','w')
+
+rscript.write("x <- read.table('"+infile+"')\n")
+rscript.write("nfeat <- nrow(x) \n")
+rscript.write("nbin <- ncol(x) - 2\n")
+rscript.write("totalcov <- x[,2]\n")
+rscript.write("x <- x[,3:ncol(x)]\n")
+
+if subset =='subset':
+    rscript.write("if (nfeat*nbin > 100000) {\n")
+    rscript.write("  nfeat2 <- as.integer(100000/nbin)\n")
+    rscript.write("  subind <- sample(seq(nfeat),nfeat2)\n")
+    rscript.write("  x <- x[subind,]\n")
+    rscript.write("  totalcov <- totalcov[subind]\n")
+    rscript.write("}\n")
+
+rscript.write("pdf('"+outfile+"')\n")
+
+if uselog == 'uselog':
+    rscript.write("x <- -(log(1+as.matrix(x,nc=ncol(x)-2)))\n")
+else:
+    rscript.write("x <- -as.matrix(x,nc=ncol(x)-2)\n")
+if scale == 'scale':
+    rscript.write("x <- scale(x)\n")
+if reorder == 'average':
+    rscript.write("hc <- hclust(dist(x),method= 'average')\n")
+    rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'centroid':
+    rscript.write("hc <- hclust(dist(x),method= 'centroid')\n")
+    rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'complete':
+    rscript.write("hc <- hclust(dist(x),method= 'complete')\n")
+    rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'single':
+    rscript.write("hc <- hclust(dist(x),method= 'single')\n")
+    rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'median':
+    rscript.write("hc <- hclust(dist(x),method= 'median')\n")
+    rscript.write("x <- x[hc$order,]\n")    
+elif reorder == 'sort_by_total':
+    rscript.write("srt <- sort(totalcov,index.return=T)\n")
+    rscript.write("x <- x[srt$ix,]\n")
+elif reorder == 'sort_by_center':
+    rscript.write("srt <- sort(x[,as.integer(nbin/2)],index.return=T)\n")
+    rscript.write("x <- x[srt$ix,]\n")
+if color == 'heat':
+    rscript.write("colormap = heat.colors(1000)\n")
+elif color == 'topo':
+    rscript.write("colormap = topo.colors(1000)\n")
+elif color == 'rainbow':
+    rscript.write("colormap = rainbow(1000)\n")
+elif color == 'terrain':
+    rscript.write("colormap = terrain.colors(1000)\n")
+else:
+    rscript.write("colormap = gray.colors(1000)\n")
+
+#rscript.write("qt <- quantile(as.vector(x),probs=c(0.1,0.9))\n")
+#rscript.write("breaks <- c(min(as.vector(x)),seq(qt[1],qt[2],length.out=99),max(as.vector(x)))\n")
+#rscript.write("image(t(x),col=colormap,breaks=breaks,axes=F)\n")
+rscript.write("image(t(x),col=colormap,axes=F)\n")
+rscript.write("dev.off()\n")
+
+rscript.close()
+
+os.system("R --slave < tmp.r")
+os.system("rm tmp.r")
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/alignvis.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/alignvis.r Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,8 @@
+args <- commandArgs(TRUE)
+x <- read.table(args[1])
+pdf(args[2])
+#visualize the profile with heatmap 
+srt <- sort(x[,2],index.return=T) # sort by total number of reads
+image(-t(log(as.matrix(x[srt$ix[1:nrow(x)],3:ncol(x)],nc=ncol(x)-2))),col=gray.colors(100))
+dev.off()
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/alignvis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/alignvis.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,42 @@
+<tool id="alignvis" name="heatmap">
+  <description>of align output</description>
+  <command interpreter="python"> alignvis.py $input $output $uselog $subset $reorder $color $scale </command>
+  <inputs>
+    <param name="input" format="tabular" type="data" label="Output file from the 'align' tool"/>
+    <param name="uselog" label="log transform the data" type="boolean" truevalue="uselog" falsevalue="none" checked="True"/>
+    <param name="subset" label="sample a subset if the data is too large" type="boolean" truevalue="subset" falsevalue="none" checked="True"/>
+    <param name="scale" label="normalize by row/feature" type="boolean" truevalue="scale" falsevalue="none" checked="False"/>
+    <param name="reorder" type="select" label="reorder features (rows)">
+      <option value="none" selected="true">None</option>
+      <option value="sort_by_total">Sort rows by sum</option>
+      <option value="sort_by_center">Sort row by center </option>
+      <option value="average">Cluster rows (average)</option>    
+      <option value="median">Cluster rows (median) </option>    
+      <option value="centroid">Cluster rows (centroid)</option>    
+      <option value="complete">Cluster rows (complete)</option>    
+      <option value="single">Cluster rows (single)</option> 
+          </param>
+             
+    <param name="color" type="select" label="color scheme">
+    <option value="heat" selected="true">heat</option>
+    <option value="gray">gray</option>
+    <option value="rainbow">rainbow</option>    
+    <option value="topo">topo</option>    
+    <option value="terrain">terrain</option>    
+    </param>
+  </inputs>
+  <outputs>
+    <data format="pdf" name="output" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool generates a heatmap from the output of the 'align' tool. Each row is the color-coded coverage of one feature, and features are sorted by their total coverage in the interval.  
+
+**Example**
+
+.. image:: ./static/operation_icons/heatmap.png
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/altschulEriksonDinuclShuffle.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/altschulEriksonDinuclShuffle.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,150 @@
+#! /usr/bin/env python
+
+# altschulEriksonDinuclShuffle.py
+# P. Clote, Oct 2003
+# NOTE: One cannot use function "count(s,word)" to count the number
+# of occurrences of dinucleotide word in string s, since the built-in
+# function counts only nonoverlapping words, presumably in a left to
+# right fashion.
+
+
+import sys,string,random
+
+
+
+def computeCountAndLists(s):
+  #WARNING: Use of function count(s,'UU') returns 1 on word UUU
+  #since it apparently counts only nonoverlapping words UU
+  #For this reason, we work with the indices.
+
+  #Initialize lists and mono- and dinucleotide dictionaries
+  List = {} #List is a dictionary of lists
+  List['A'] = []; List['C'] = [];
+  List['G'] = []; List['T'] = [];
+  nuclList   = ["A","C","G","T"]
+  s       = s.upper()
+  s       = s.replace("U","T")
+  nuclCnt    = {}  #empty dictionary
+  dinuclCnt  = {}  #empty dictionary
+  for x in nuclList:
+    nuclCnt[x]=0
+    dinuclCnt[x]={}
+    for y in nuclList:
+      dinuclCnt[x][y]=0
+
+  #Compute count and lists
+  nuclCnt[s[0]] = 1
+  nuclTotal     = 1
+  dinuclTotal   = 0
+  for i in range(len(s)-1):
+    x = s[i]; y = s[i+1]
+    List[x].append( y )
+    nuclCnt[y] += 1; nuclTotal  += 1
+    dinuclCnt[x][y] += 1; dinuclTotal += 1
+  assert (nuclTotal==len(s))
+  assert (dinuclTotal==len(s)-1)
+  return nuclCnt,dinuclCnt,List
+
+
+def chooseEdge(x,dinuclCnt):
+  numInList = 0
+  for y in ['A','C','G','T']:
+    numInList += dinuclCnt[x][y]
+  z = random.random()
+  denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']
+  numerator = dinuclCnt[x]['A']
+  if z < float(numerator)/float(denom):
+    dinuclCnt[x]['A'] -= 1
+    return 'A'
+  numerator += dinuclCnt[x]['C']
+  if z < float(numerator)/float(denom):
+    dinuclCnt[x]['C'] -= 1
+    return 'C'
+  numerator += dinuclCnt[x]['G']
+  if z < float(numerator)/float(denom):
+    dinuclCnt[x]['G'] -= 1
+    return 'G'
+  dinuclCnt[x]['T'] -= 1
+  return 'T'
+
+def connectedToLast(edgeList,nuclList,lastCh):
+  D = {}
+  for x in nuclList: D[x]=0
+  for edge in edgeList:
+    a = edge[0]; b = edge[1]
+    if b==lastCh: D[a]=1
+  for i in range(2):
+    for edge in edgeList:
+      a = edge[0]; b = edge[1]
+      if D[b]==1: D[a]=1
+  ok = 0
+  for x in nuclList:
+    if x!=lastCh and D[x]==0: return 0
+  return 1
+
+
+
+def eulerian(s):
+  nuclCnt,dinuclCnt,List = computeCountAndLists(s)
+  #compute nucleotides appearing in s
+  nuclList = []
+  for x in ["A","C","G","T"]:
+    if x in s: nuclList.append(x)
+  #compute numInList[x] = number of dinucleotides beginning with x
+  numInList = {}
+  for x in nuclList:
+    numInList[x]=0
+    for y in nuclList:
+      numInList[x] += dinuclCnt[x][y]
+  #create dinucleotide shuffle L 
+  firstCh = s[0]  #start with first letter of s
+  lastCh  = s[-1]
+  edgeList = []
+  for x in nuclList:
+    if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] )
+  ok = connectedToLast(edgeList,nuclList,lastCh)
+  return ok,edgeList,nuclList,lastCh
+
+
+def shuffleEdgeList(L):
+  n = len(L); barrier = n
+  for i in range(n-1):
+    z = int(random.random() * barrier)
+    tmp = L[z]
+    L[z]= L[barrier-1]
+    L[barrier-1] = tmp
+    barrier -= 1
+  return L
+
+def dinuclShuffle(s):
+  ok = 0
+  while not ok:
+    ok,edgeList,nuclList,lastCh = eulerian(s)
+  nuclCnt,dinuclCnt,List = computeCountAndLists(s)
+
+  #remove last edges from each vertex list, shuffle, then add back
+  #the removed edges at end of vertex lists.
+  for [x,y] in edgeList: List[x].remove(y)
+  for x in nuclList: shuffleEdgeList(List[x])
+  for [x,y] in edgeList: List[x].append(y)
+
+  #construct the eulerian path
+  L = [s[0]]; prevCh = s[0]
+  for i in range(len(s)-2):
+    ch = List[prevCh][0] 
+    L.append( ch )
+    del List[prevCh][0]
+    prevCh = ch
+  L.append(s[-1])
+  t = string.join(L,"")
+  return t
+
+
+
+if __name__ == '__main__':
+  if len(sys.argv)!=3:
+    print "Usage: python altschulEriksonDinuclShuffle.py GCATCGA 5"
+    sys.exit(1)
+  s = sys.argv[1].upper()
+  for i in range(int(sys.argv[2])):
+    print dinuclShuffle(s)
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/bedClean.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/bedClean.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,37 @@
+import sys
+
+def readChrSize(filename):
+    f = open(filename)
+    chrSize = {}
+    for line in f:
+        chrom,size = line.strip().split()
+        chrSize[chrom]=int(size)
+    f.close()
+    return chrSize
+
+def cleanFile(filename,chrSize,outfile):
+    f = open(filename)
+    out = open(outfile,'w')
+    i = 0
+    for line in f:
+        i = i + 1
+        flds = line.strip().split('\t')
+        if len(flds) < 3:
+            print 'line',i,'incomplete line:\n',line
+        elif chrSize.has_key(flds[0]):
+            if int(flds[1]) > int(flds[2]):
+                tmp = flds[1]
+                flds[1] = flds[2]
+                flds[2] = tmp
+            if int( flds[1]) < 0 or int(flds[2]) <0:
+                print 'line',i,'negative coordinates:\n',line
+            elif int(flds[2]) > chrSize[flds[0]]:
+                print 'line',i,'end larger than chr size:\n',line
+            else:
+                out.write('\t'.join(flds)+'\n')
+        else:
+            print 'line',i,'chromosome',flds[0],'not found!\n',line
+    f.close()
+    out.close()
+
+cleanFile(sys.argv[1],readChrSize(sys.argv[2]),sys.argv[3])
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/bed_to_bam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/bed_to_bam.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+<tool id="bedToBam" name="bedToBam">
+  <description>convert BED or GFF or VCF to BAM</description>
+  <command>bedToBam -i $input -g $genome $bed12 $mapq $ubam > $outfile </command>
+  <inputs>
+    <param name="input" format="bed,gff,vcf" type="data" label="Input file (BED,GFF,VCF)" help="BED files must be at least BED4 to be amenable to BAM (needs name field)"/>
+    <param name="genome" type="select" label="Select genome">
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option>
+    </param>
+    <param name="mapq" size="10" type="integer" value="255" label="Set the mapping quality for the BAM records"/>
+    <param name="bed12" label="The BED file is in BED12 format" help="The BAM CIGAR string will reflect BED blocks" type="boolean" truevalue="-bed12" falsevalue="" checked="False"/>
+    <param name="ubam" label="Write uncompressed BAM output" help="Default is to write compressed BAM" type="boolean" truevalue="-ubam" falsevalue="" checked="False"/>
+  </inputs>
+  <outputs>
+    <data format="bam" name="outfile" />
+  </outputs>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/bedclean.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/bedclean.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,33 @@
+<tool id="bedclean" name="clean interval">
+  <description>remove off-chromosome lines</description>
+  <command interpreter="python">bedclean.py $input $genome $output > $log  </command>
+  <inputs>
+     <param name="input" type="data" format="interval" label="Original interval file"/>
+    <param name="genome" type="select" label="Select genome">
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="output" label="${tool.name} on ${on_string} (interval)"/>
+    <data format="txt" name="log" label="${tool.name} on ${on_string} (log)"/>
+  </outputs>
+  <help>
+
+**Description**
+
+Removes lines that
+
+1. are comment or track name lines,
+
+2. lie on chr*_random,
+
+3. have negative coordinates, or
+
+4. have an end coordinate larger than the chromosome size.
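+
+The underlying command line has the form (file names are illustrative)::
+
+    python bedclean.py input.bed mouse.mm9.genome output.bed > output.log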
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/bedsort.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/bedsort.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+<tool id="bedsort" name="sort">
+  <description>an interval file by chr and start</description>
+  <command> head -n $skip $input > $output
+  &amp;&amp; tail -n+`expr $skip + 1` $input | sort -k1,1 -k2,2g >> $output    
+  </command>
+  <inputs>
+     <param name="input" type="data" format="bed" label="Input interval file"/>
+     <param name="skip" type="integer" value="0" label="top lines to skip" help="these lines are copied to the output first, unsorted (e.g. a header)"/>
+  </inputs>
+  <outputs>
+    <data format="bed" name="output" />
+  </outputs>
+  <help>
+
+**Description**
+
+Unix command used::
+
+    head -n $skip $input > $output
+    tail -n+`expr $skip + 1` $input | sort -k1,1 -k2,2g >> $output
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/bigWigAverageOverBed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/bigWigAverageOverBed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,12 @@
+<tool id="bigWigAverageOverBed" name="bigWigAverageOverBed">
+  <description>average interval coverage</description>
+  <command>bigWigAverageOverBed $bw $bed $outtab -bedOut=$outbed 2> err </command>
+  <inputs>
+    <param name="bw" format="bigwig" type="data" label="BigWig file"/>
+    <param name="bed" format="bed" type="data" label="Bed file"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="outtab" label="${tool.name} on ${on_string} (tab)"/>
+    <data format="bed" name="outbed" label="${tool.name} on ${on_string} (bed)"/>
+  </outputs>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/binaverage.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/binaverage.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,74 @@
+<tool id="binaverage" name="bin and average">
+  <description>of numeric columns</description>
+  <command>cat $script_file | R --vanilla --slave > $out_log </command>
+  <inputs>
+      <param name="input" type="data" format="tabular" label="Data file"/>
+      <param name="data_avg" type="integer" value="1" label="Column number of the data to average"/>
+      <param name="label_avg" type="text" value="label-avg" size="30" label="data label"/>    
+       <param name="log_avg" label="log2 transform the data" type="boolean" truevalue="logavg" falsevalue="none" checked="False"/> 
+       <param name="data_bin" type="integer" value="2" label="Column number of the data used to make bins"/>
+      <param name="label_bin" type="text" value="label-bin" size="30" label="data label"/> 
+      <param name="log_bin" label="log2 transform the data" type="boolean" truevalue="logbin" falsevalue="none" checked="False"/> 
+      <param name="nbin" type="integer" value="3" label="number of bins"/>
+      <param name="bintype" type="select" label="Bin by rank or by value" >
+   <option value="rank" selected="true">by rank: bins have the same number of data points</option>
+   <option value="value">by value: bins may have different number of data points</option>
+      </param>  
+      <param name="legendloc" type="select" label="legend location on CDF plot" >
+   <option value="bottomright" selected="true">bottomright</option>
+   <option value="bottomleft">bottomleft</option>
+   <option value="bottom">bottom</option>
+   <option value="left">left</option>
+   <option value="topleft">topleft</option>
+   <option value="top">top</option>
+   <option value="topright">topright</option>      
+   <option value="right">right</option>
+   <option value="center">center</option>  
+      </param>
+    
+      <param name="title" type="text" value="bin-average" size="50" label="title of this analysis"/>       
+         
+  </inputs>
+
+  <configfiles>
+    <configfile name="script_file">
+      ## suppress R warning messages
+      options(warn=-1)
+      source("/Users/xuebing/galaxy-dist/tools/mytools/cdf.r")
+      x = read.table("${input}",sep='\t')
+      x = x[,c($data_bin,$data_avg)]
+      label_avg = "${label_avg}"
+      label_bin = "${label_bin}"
+      if ("${log_bin}" == "logbin"){
+          x[,1] = log2(1+x[,1])
+          label_bin = paste('log2',label_bin)
+      }
+      if ("${log_avg}" == "logavg"){
+          x[,2] = log2(1+x[,2])
+          label_avg = paste('log2',label_avg)
+      }
+      res = binaverage(x,$nbin,"${bintype}")
+      attach(res)
+      for (i in 1:${nbin}){
+          print(paste(label_bin,labels[i],sep=':'))
+          print(summary(binned[[i]]))
+      }      
+      pdf("${out_file}")
+      mycdf(binned,"${title}",labels,"$legendloc",label_avg,label_bin)
+      dev.off() 
+    </configfile>
+  </configfiles>
+
+  <outputs>
+    <data format="txt" name="out_log" label="${title}: (log)" />
+    <data format="pdf" name="out_file" label="${title}: (plot)" />
+  </outputs>
+
+<help>
+
+.. class:: infomark
+
+This tool generates a barplot and a CDF plot comparing values of a numeric column that are binned by a second numeric column. The input should have at least two numeric columns: one is used to group rows into bins, and the values in the other column are then compared across bins using a barplot, a CDF plot, and KS tests.
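+
+For intuition, "by rank" binning followed by averaging can be sketched in
+Python (illustrative only; the tool itself uses the R routine binaverage()
+defined in cdf.r)::
+
+    import numpy
+
+    def bin_by_rank(bin_col, avg_col, nbin):
+        # sort rows by the binning column, split them into nbin groups
+        # of (nearly) equal size, then average the other column per group
+        order = numpy.argsort(bin_col)
+        groups = numpy.array_split(numpy.asarray(avg_col, dtype=float)[order], nbin)
+        return [g.mean() for g in groups]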
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/binnedAverage.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/binnedAverage.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,77 @@
+'''
+get binned score of intervals,allow extension
+'''
+
+import os,sys,numpy,random,string
+
+from resize import *
+
+from bx.bbi.bigwig_file import BigWigFile
+
+def binning(x,n):
+    # make n bin of x
+    y = numpy.zeros(n,dtype=float)
+    if len(x) == 0:
+        return y
+    step = float(len(x))/n
+    for k in range(n):
+        i = int(step*k)
+        j = int(step*(k+1)) + 1
+        y[k] = x[i:j].mean()
+        #print i,j,k,y[k]
+    return y
+
+def getBinnedScore(bwfile,intvfile,outfile,outplot,nbin):
+    '''
+    get binned average and std
+    '''
+    fbw = open(bwfile)
+    bw = BigWigFile(file=fbw)
+    fin = open(intvfile)
+    out = open(outfile,'w')
+    zeros = '\t'.join(['0']*nbin)
+    for line in fin:
+        #chrom,start,end,name,score,strand
+        line = line.strip()
+        flds = line.split('\t')
+        #get the score at base resolution as an array
+        scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
+        if scores is None: # 'is None' avoids numpy's elementwise '==' comparison
+            print 'not found:\t',line
+            out.write(line+'\t'+zeros+'\n')
+            continue
+        # reverse if on minus strand
+        if flds[5] == '-':
+            scores = scores[::-1]
+        # no score = 0    
+        scores = numpy.nan_to_num(scores)
+        # bin the data
+        binned = binning(scores,nbin)
+        out.write(line+'\t'+'\t'.join(map(str,binned))+'\n')
+    fin.close()
+    out.close()
+    # plot
+    if nbin > 1:
+        tmp = "".join(random.sample(string.letters+string.digits, 8))
+        rscript = open(tmp,"w")
+        rscript.write("options(warn=-1)\n")
+        rscript.write("x <- read.table('"+outfile+"',sep='\t')\n")
+        rscript.write("x <- x[,(ncol(x)+1-"+str(nbin)+"):ncol(x)]\n")
+        rscript.write("pdf('"+outplot+"')\n")
+        rscript.write("avg <- apply(x,2,mean)\n")
+        rscript.write("err <- apply(x,2,sd)/sqrt(nrow(x))\n")
+        rscript.write("print(avg)\n")
+        rscript.write("print(err)\n")
+        rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+        rscript.write("xticks <- seq(ncol(x))\n")
+        rscript.write("plot(xticks,avg,xlab='',ylab='average',type='l',lwd=0,ylim=ylim)\n")   
+        rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n")
+        rscript.write("lines(xticks,avg,type='l',lwd=1)\n")   
+        rscript.write("dev.off()\n")
+        rscript.close()
+        os.system("R --vanilla < "+tmp)
+        os.system("rm "+tmp)
+
+print sys.argv
+prog,bwfile,intvfile,nbin,outfile,outplot = sys.argv
+getBinnedScore(bwfile,intvfile,outfile,outplot,int(nbin))
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/bowtie2bed.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/bowtie2bed.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+
+# write bowtie output to bed file
+
+# perl bowtie2bed.pl s_5_trimmed.map outputfile 200
+# input
+#  inputfile
+# extension
+
+$inputfile = $ARGV[0];
+$extension = $ARGV[2];
+$outputfile = $ARGV[1];#$inputfile.".extended_$extension"."bp.bed";
+
+print "input file: $inputfile\n";
+print "output file: $outputfile\n";
+print "track name: $outputfile\n";
+
+open (IN,$inputfile) or die $!;
+open (OUT,">$outputfile") or die $!;
+
+print OUT "track name=$outputfile itemRgb=On\n";
+
+while(<IN>)
+{
+    @flds = split/\t/;
+    $flds[0] =~ s/ /-/g; # replace spaces with dashes
+
+    if ($flds[1] eq "+")
+    {
+     print OUT join("\t",$flds[2],$flds[3],$flds[3]+$extension+length($flds[4]),$flds[0],1,$flds[1],$flds[3],$flds[3]+length($flds[4]),"255,0,0","\n");
+    }
+    else
+    {
+     
+     print OUT join("\t",$flds[2],max(0,$flds[3]-$extension),$flds[3]+length($flds[4]),$flds[0],1,$flds[1],$flds[3],$flds[3]+length($flds[4]),"0,255,0","\n");
+    }
+}
+close(IN);
+close(OUT);
+
+sub max
+{
+ my ($x,$y) = @_;
+ return $x>$y?$x:$y;
+}
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/bowtie2bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/bowtie2bed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,44 @@
+<tool id="bowtie2bed" name="bowtie-to-bed">
+  <description> converter and read extender</description>
+  <command interpreter="perl">bowtie2bed.pl $input $out_file1 $extendLength </command>
+  <inputs>
+    <param name="input" format="txt" type="data" label="Bowtie map result file"/>
+    <param name="extendLength" size="10" type="integer" value="200" label="Extend 3' end (bp)"/>
+  </inputs>
+  <outputs>
+    <data format="bed" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="test.map" ftype="TXT"/>
+      <param name="extendLength" value="200"/>
+      <output name="out_file1" file="testmap.bed"/>
+    </test>
+  </tests>
+  <help>
+
+
+**What it does**
+
+This tool converts a bowtie map-format output file to BED format, with the option to extend the 3' end of each read.
+
+- Sequence and quality information is lost after conversion
+- The output contains a track line as its first row
+
+-----
+
+**Example**
+
+Converting the following bowtie mapped reads::
+
+  SRR073078.2 HWUSI-EAS465_8_1_1_524 length=36 - chr2 112499209 AGTGTGACTGCATCTCTTCCTTCGTGGGGCTNCAGT ...
+  SRR073078.3 HWUSI-EAS465_8_1_1_1054 length=36 + chr17 75877120 CCACNCCTCCTTTCAAAACACACTGCCAGGTGCGTC ...
+
+will result in::
+
+  track name=/home/xuebing/Research/galaxy/galaxy-dist/database/files/000/dataset_5.dat itemRgb=On
+  chr2 112499109 112499245 SRR073078.2-HWUSI-EAS465_8_1_1_524-length=36 1 - 112499209 112499245 0,255,0
+  chr17 75877120 75877256 SRR073078.3-HWUSI-EAS465_8_1_1_1054-length=36 1 + 75877120 75877156 255,0,0
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/bwBinavg.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/bwBinavg.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,44 @@
+<tool id="bwbinavg" name="bigwig summary">
+  <description>for intervals</description>
+  <command interpreter="python">getGenomicScore.py $input $output $score_type $bwfile $nbin $strand $outplot $span</command>
+  <inputs>
+      <param name="input" format="interval" type="data" label="Interval file"/>
+      <param name="bwfile" format="bigwig" type="data" label="BigWig file"/>
+      <param name="score_type" type="select" label="Select score summary type" >
+   <option value="mean" selected="true">mean</option>
+   <option value="max">maximum</option>
+   <option value="min">minimum</option>
+   <option value="std">standard deviation</option>
+   <option value="coverage">coverage:fraction covered</option>
+      </param>
+      <param name="nbin" type="integer" value="1" label="number of bins"/>          
+        <param name="strand" type="integer" value="0" label="Specify the strand column" help="leave 0 to ignore strand information. Only matters if using more than 1 bin"/>   
+        <param name="span" size="10" type="float" value="0.1" label="loess span: smoothing parameter" help="a value less than 0.1 disables smoothing"/>
+  </inputs>
+  <outputs>
+     <data format="pdf" name="outplot" label="${tool.name} on ${on_string}[plot]"/>
+    <data format="interval" name="output" label="${tool.name} on ${on_string}[data]"/>
+  </outputs>
+  <help>
+
+.. class:: infomark
+
+Each interval is binned, and the average base-resolution score/coverage/density from the bigwig file is appended as new columns at the end of the original file.
+
+**Example**
+
+If your original data has the following format:
+
++-----+-----+---+------+
+|chrom|start|end|other2|
++-----+-----+---+------+
+
+and you choose to divide each interval into 3 bins and return the mean scores of each bin, your output will look like this:
+
++-----+-----+---+------+-----+-----+-----+
+|chrom|start|end|other2|mean1|mean2|mean3|
++-----+-----+---+------+-----+-----+-----+
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/cdf-old-not-used/._cdf.xml
Binary file tools/mytools/cdf-old-not-used/._cdf.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/cdf-old-not-used/._cdf2-old.xml
Binary file tools/mytools/cdf-old-not-used/._cdf2-old.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/cdf-old-not-used/cdf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/cdf-old-not-used/cdf.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,55 @@
+'''
+python wrapper for cdf plot
+'''
+
+import os,sys
+
+filename = sys.argv[1]
+output = sys.argv[2]
+c1, c2 = sys.argv[3].strip().split(',')
+label1,label2 = sys.argv[5].strip().split(',')
+title = sys.argv[6].strip()
+log = sys.argv[4]
+
+rf = open('tmp.r','w')
+
+rf.write("options(warn=-1)\n")
+
+rf.write("x <- read.table('"+filename+"')\n")
+
+rf.write("mycdf <- function(x1,x2) {\n")
+rf.write("       sx1 <- sort(x1)\n")
+rf.write("     sy1 <- c(1:length(sx1))/length(sx1)\n")
+rf.write("     sx2 <- sort(x2)\n")
+rf.write("     sy2 <- c(1:length(sx2))/length(sx2)\n")
+rf.write("       mi <- min(c(x1,x2))\n")
+rf.write("       ma <- max(c(x1,x2))\n")
+rf.write("       sx1 <- c(mi,sx1,ma)\n")
+rf.write("       sx2 <- c(mi,sx2,ma)\n")
+rf.write("       sy1 <- c(0,sy1,1)\n")
+rf.write("       sy2 <- c(0,sy2,1)\n")
+rf.write("       pv <- ks.test(x1,x2)$p.value\n")
+rf.write("       list(sx1,sy1,sx2,sy2,pv)\n")
+rf.write("}\n")
+
+rf.write("list1 <- x[,"+c1+"]\n")
+rf.write("list2 <- x[,"+c2+"]\n")
+
+if log == 'log':
+    rf.write("list1 <- log2(1+list1) \n")
+    rf.write("list2 <- log2(1+list2) \n")
+    #rf.write("list1[list1 == -Inf] <- min(list1[list1 != -Inf])\n")
+rf.write("res <- mycdf(list1,list2)\n")
+
+rf.write("pdf('"+output+"')\n")
+rf.write("plot(res[[1]],res[[2]],type='l',lty=1,lwd=2,col='red',main=paste('"+title+"',format(res[[5]], digit=2,scientific = T),sep=' , p='),xlab='value',ylab='cumulative frequency')\n")
+rf.write("lines(res[[3]],res[[4]],type='l',lty=2,lwd=2,col='blue')\n")
+rf.write("label1 <- paste('"+label1+"','( n1=',length(list1),')')\n")
+rf.write("label2 <- paste('"+label2+"','( n2=',length(list2),')')\n")
+rf.write("legend('bottomright',c(label1,label2),col=c('red','blue'), lty=1:2, lwd=2, bty='n')\n")
+rf.write("boxplot(list1,list2,names=c(label1,label2))\n")
+rf.write("dev.off()\n")
+
+rf.close()
+os.system("R --vanilla < tmp.r")    
+os.system('rm tmp.r')
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/cdf-old-not-used/cdf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/cdf-old-not-used/cdf.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,21 @@
+<tool id="cdf" name="CDF">
+  <description>plot of two columns</description>
+  <command interpreter="python">cdf.py $input $output $columns $log $labels $title </command>
+  <inputs>
+    <param name="input" format="tabular" type="data" label="input file"/>
+    <param name="columns" size="4" type="text" label="column numbers (sep by comma)" help="i.e. plot CDF for data in column 2 and 3" value="2,3"/>
+    <param name="title" size="50" type="text" label="Figure title (no space allowed)" value="CDF-plot"/>
+    <param name="labels" size="50" type="text" label="Figure legend (sep by comma,no space allowed)" value="sample,control"/>
+    <param name="log" label="log transform the data" type="boolean" truevalue="log" falsevalue="none" checked="False"/>
+  </inputs>
+  <outputs>
+    <data format="pdf" name="output" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool plots the CDF of two data columns in the input file. A KS test p-value is also shown. 
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/cdf-old-not-used/cdf2-old.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/cdf-old-not-used/cdf2-old.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,24 @@
+<tool id="cdf2" name="CDF">
+  <description>plot of two columns from two files</description>
+  <command interpreter="python">cdf2.py $input1 $input2 $output $c1 $c2 $labels $log  $title </command>
+  <inputs>
+    <param name="input1" format="tabular" type="data" label="input file 1"/>
+    <param name="c1" size="2" type="text" label="column number" value="1"/>
+    <param name="input2" format="tabular" type="data" label="input file 2"/>
+    <param name="c2" size="2" type="text" label="column number" value="1"/>
+    <param name="title" size="50" type="text" label="Figure title (no space allowed) " value="CDF-plot"/>
+    <param name="labels" size="50" type="text" label="Figure legend (sep by comma)" value="sample,control"/>
+    <param name="log" label="log transform the data" type="boolean" truevalue="log" falsevalue="none" checked="False"/>
+
+  </inputs>
+  <outputs>
+    <data format="pdf" name="output" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool plots the CDF of two data columns in two input files. A KS test p-value is also shown. 
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/cdf-old-not-used/cdf2.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/cdf-old-not-used/cdf2.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,69 @@
+'''
+python wrapper for cdf plot
+'''
+
+import os,sys
+
+filename1 = sys.argv[1]
+filename2 = sys.argv[2]
+output = sys.argv[3]
+c1 = sys.argv[4].strip()
+c2 = sys.argv[5].strip()
+label1,label2 = sys.argv[6].strip().split(',')
+title = sys.argv[8].strip()
+log = sys.argv[7]
+
+
+
+rf = open('tmp.r','w')
+
+rf.write("options(warn=-1)\n")
+
+rf.write("x1 <- read.table('"+filename1+"')\n")
+rf.write("x2 <- read.table('"+filename2+"')\n")
+
+
+rf.write("mycdf <- function(x1,x2) {\n")
+rf.write("       sx1 <- sort(x1)\n")
+rf.write("     sy1 <- c(1:length(sx1))/length(sx1)\n")
+rf.write("     sx2 <- sort(x2)\n")
+rf.write("     sy2 <- c(1:length(sx2))/length(sx2)\n")
+rf.write("       mi <- min(c(x1,x2))\n")
+rf.write("       ma <- max(c(x1,x2))\n")
+rf.write("       sx1 <- c(mi,sx1,ma)\n")
+rf.write("       sx2 <- c(mi,sx2,ma)\n")
+rf.write("       sy1 <- c(0,sy1,1)\n")
+rf.write("       sy2 <- c(0,sy2,1)\n")
+rf.write("       pv <- ks.test(x1,x2)$p.value\n")
+rf.write("       list(sx1,sy1,sx2,sy2,pv)\n")
+rf.write("}\n")
+
+rf.write("list1 <- x1[,"+c1+"]\n")
+rf.write("list2 <- x2[,"+c2+"]\n")
+
+rf.write("list1[list1 > 1e10] <- 1e10\n")
+rf.write("list2[list2 > 1e10] <- 1e10\n")
+rf.write("list1[list1 < -1e10] <- -1e10\n")
+rf.write("list2[list2 < -1e10] <- -1e10\n")
+
+rf.write("max(list1)\n")
+rf.write("max(list2)\n")
+rf.write("min(list1)\n")
+rf.write("min(list2)\n")
+
+
+
+if log == 'log':
+    rf.write("list1 <- log2(list1) \n")
+    rf.write("list2 <- log2(list2) \n")
+rf.write("res <- mycdf(list1,list2)\n")
+
+rf.write("pdf('"+output+"')\n")
+rf.write("plot(res[[1]],res[[2]],type='l',lty=1,lwd=2,col='red',main=paste('"+title+"',format(res[[5]], digit=2,scientific = T),sep=' , p='),xlab='value',ylab='cumulative frequency')\n")
+rf.write("lines(res[[3]],res[[4]],type='l',lty=2,lwd=2,col='blue')\n")
+rf.write("legend('bottomright',c('"+label1+"','"+label2+"'),col=c('red','blue'), lty=1:2, lwd=2, bty='n')\n")
+rf.write("dev.off()\n")
+
+rf.close()
+os.system("R --vanilla < tmp.r")    
+os.system('rm tmp.r')
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/cdf.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/cdf.r Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,103 @@
+# bin and average
+binaverage = function(x,nbin,rankORvalue){
+# use x[,1] to bin x[,2]
+ binned = list()
+ if (rankORvalue == 'value'){
+ mi = min(x[,1])
+ ma = max(x[,1])
+ bins = seq(mi,ma,length.out=nbin+1)
+ bins[1] = bins[1] - abs(mi)/100
+ bins[nbin+1] = bins[nbin+1] + abs(ma)/100
+ for (i in 1:nbin){
+ binned[[i]] = x[x[,1] >= bins[i] & x[,1] < bins[i+1],2]
+ }
+ bins[1] = bins[1] + abs(mi)/100
+ bins[nbin+1] = bins[nbin+1] - abs(ma)/100
+ } else {
+ x = x[order(x[,1]),]
+ step = round(nrow(x)/nbin)
+ bins = x[1,1]
+ for (i in 1:(nbin-1)){
+ binned[[i]] = x[((i-1)*step+1):(i*step),2]
+ bins = c(bins,x[i*step+1,1])
+ }
+ binned[[nbin]] = x[((nbin-1)*step+1):nrow(x),2]
+ bins[nbin+1] = x[nrow(x),1]
+ }
+# bin label
+ labels = character(0)
+ for (i in 1:nbin){
+ labels = c(labels,paste(format(bins[i],digits=2,nsmall=2),format(bins[i+1],digits=2,nsmall=2),sep='~'))
+ }
+    list(binned=binned,bins=bins,labels=labels)
+}
+#res = binaverage(x,3,'rank')
+
+# CDF plot and KS.test
+mycdf = function(list,title,labels,legendposition,xlabel,legend_title){
+    L = length(list)
+    
+    # barplot for mean and std
+    avg = numeric(L)
+    err = numeric(L)
+    for (i in 1:L){
+        avg[i] = mean(list[[i]])
+        err[i] = sd(list[[i]])
+    }
+    #print(list[[1]])
+    #print(list[[2]])
+    #print(avg)
+    #print(err)
+ par(cex=1.5,mar=c(8,6,6,4))
+    xticks <- barplot(avg,names.arg=labels,las=2,ylab=xlabel,main='mean and standard deviation',xlab=legend_title,ylim=c(0,max(avg+err)))
+    arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0)
+    
+ if (L>1){
+    # ks test
+ cat('\nKS test:\n')
+ cat('sample1\tsample2\tp-value\n')
+ cat('-------------------------------------------------\n')
+    for (i in 1:(L-1)){
+        for (j in (i+1):L){
+        cat(labels[i],'\t',labels[j],'\t')
+        ks = ks.test(list[[i]],list[[j]])
+        pv = max(2.2e-16,ks$p.value)
+     pv = format(pv,digits=3,nsmall=2)
+ cat(pv,'\n')
+        }
+    }
+ cat('-------------------------------------------------\n')
+ }
+ if (L == 2){
+ title = paste(title,'\np=',pv,sep='')
+ }
+    # cdf plot
+    listx = list()
+    listy = list()
+    mi = 1e10
+    ma = -1e10
+    for (i in 1:L){
+        mi = min(mi,list[[i]])
+        ma = max(ma,list[[i]])
+        listx[[i]] = sort(list[[i]])
+        listy[[i]] = c(1:length(list[[i]]))/length(list[[i]])
+    }
+    # pad each curve with the global min/max so all CDFs span the same x range
+    for (i in 1:L){
+        listx[[i]] = c(mi,listx[[i]],ma)
+        listy[[i]] = c(0,listy[[i]],1)
+    }
+#par(xlog=(xlog=='xlog'))
+    plot(listx[[1]],listy[[1]],type='l',lty=1,lwd=2,col=2,main=title,xlab=xlabel,ylab='cumulative frequency')
+    for (i in 2:L){
+        lines(listx[[i]],listy[[i]],type='l',lty=i,lwd=2,col=i+1)
+    }
+    # legend
+    for (i in 1:L){
+        labels[i] = paste(labels[i],', n=',length(list[[i]]),sep='')
+    }
+    legend(legendposition,legend=labels,col=2:(L+1), lty=1:L,lwd=2, bty='n',title=legend_title)
+}
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/cdf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/cdf.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,78 @@
+<tool id="cdf" name="CDF">
+  <description>plot of multiple numeric columns</description>
+  <command>cat $script_file | R --vanilla --slave > $out_log </command>
+  <inputs>
+    <param name="title" type="text" value="CDF plot" size="30" label="Plot title"/>
+    <param name="xlabel" type="text" value="value" size="30" label="xlabel"/>
+    <param name="log" label="log2 transform the data" type="boolean" truevalue="log" falsevalue="none" checked="False"/>
+   <param name="zero" label="remove zero" type="boolean" truevalue="zero" falsevalue="none" checked="False"/> 
+  <param name="legendloc" type="select" label="legend location on CDF plot" >
+   <option value="bottomright" selected="true">bottomright</option>
+   <option value="bottomleft">bottomleft</option>
+   <option value="bottom">bottom</option>
+   <option value="left">left</option>
+   <option value="topleft">topleft</option>
+   <option value="top">top</option>
+   <option value="topright">topright</option>      
+   <option value="right">right</option>
+   <option value="center">center</option>  
+      </param>       
+    <repeat name="series" title="sample">
+      <param name="label" type="text" value="" size="30" label="data label"/>
+      <param name="input" type="data" format="tabular" label="dataset"/>
+      <param name="column" type="integer" value="2" label="column number (-1 for last column)"/>
+    </repeat>  
+         
+  </inputs>
+
+  <configfiles>
+    <configfile name="script_file">
+      ## suppress R warning messages
+      options(warn=-1)
+      source("/Users/xuebing/galaxy-dist/tools/mytools/cdf.r")
+      uselog = as.character("${log}")
+      zero = as.character("${zero}")
+      title = as.character("${title}")
+      xlabel = as.character("${xlabel}")
+        if (uselog=='log'){
+            xlabel = paste('log2',xlabel)
+        }                  
+
+      labels = character(0)
+      x = list()
+      #for $i, $s in enumerate( $series )
+        labels = c(labels,"${s.label.value}")
+        x0 = read.table( "${s.input.file_name}" ,sep='\t')
+        col = ${s.column.value}
+        if (col == -1) {col = ncol(x0)}
+        x0 = x0[,col]
+        if (zero == 'zero'){
+            x0 = x0[x0 != 0]
+        }
+        if (uselog=='log'){
+            x0=log2(1+x0)
+        }
+        print("${s.label.value}")
+        summary(x0)
+        x[[$i+1]] = x0
+      #end for
+      pdf("${out_file}")
+      mycdf(x,title,labels,"${legendloc}",xlabel,'')
+      dev.off() 
+    </configfile>
+  </configfiles>
+
+  <outputs>
+    <data format="txt" name="out_log" label="${tool.name} on ${on_string}: (log)" />
+    <data format="pdf" name="out_file" label="${tool.name} on ${on_string}: (plot)" />
+  </outputs>
+
+<help>
+
+.. class:: infomark
+
+This tool generates a barplot and a CDF plot comparing multiple numeric columns from different files.
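+
+For intuition, the empirical CDF drawn for each sample (see mycdf() in cdf.r)
+can be sketched in Python (illustrative only)::
+
+    import numpy
+
+    def ecdf(values):
+        # x: sorted values; y: cumulative fraction of points up to each x
+        x = numpy.sort(values)
+        y = numpy.arange(1, len(x) + 1) / float(len(x))
+        return x, y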
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/closestBed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/closestBed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,32 @@
+<tool id="closestbed" name="closestBed">
+  <description>find closest features</description>
+  <command>closestBed -a $inputa -b $inputb $strandness -d $no -t $tie > $output_data
+  </command>
+  <inputs>
+      <param name="inputa" type="data" format="interval,bam,bed,gff,vcf" label="Input A (-a)"/>
+      <param name="inputb" type="data" format="interval,bam,bed,gff,vcf" label="Input B (-b)"/>          
+      <param name="strandness" type="select" label="Strand requirement" >
+ <option value="" selected="true"> none </option>
+        <option value="-s" > -s: closest feature on the same strand</option>
+        <option value="-S" > -S: closest feature on the opposite strand </option>
+      </param>
+      
+    <param name="no" label="Only look for non-overlapping features" type="boolean" truevalue="-no" falsevalue="" checked="False"/>
+              <param name="tie" type="select" label="How ties are handled" >
+ <option value="all" selected="true"> report all ties </option>
+        <option value="first" > report the first that occurred</option>
+        <option value="last" > report the last that occurred </option>
+      </param>
+        </inputs>
+  <outputs>
+    <data format="input" name="output_data"/> 
+  </outputs>
+  <help>
+
+**What it does**
+
+This is a wrapper for closestBed.
+
+  </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/collapseBed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/collapseBed.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,58 @@
+'''
+collapse intervals
+'''
+
+def collapseInterval_strand(filename):
+    uniqintv = {}
+    data = {}
+    f = open(filename)
+    header = f.readline()
+    if 'chr' in header:
+        flds = header.strip().split('\t')
+        key = '\t'.join([flds[0],flds[1],flds[2],flds[5]])
+        uniqintv[key] = 1
+        data[key] = flds
+    for line in f:
+        flds = line.strip().split('\t')
+        key = '\t'.join([flds[0],flds[1],flds[2],flds[5]])
+        if uniqintv.has_key(key):
+            uniqintv[key] = uniqintv[key] + 1
+        else:
+            uniqintv[key] = 1
+            data[key] = flds
+    f.close()        
+    for key in uniqintv.keys():
+        print '\t'.join(data[key]+[str(uniqintv[key])])
+        #flds = key.split('\t')
+        #print '\t'.join([flds[0],flds[1],flds[2],'.',str(uniqintv[key]),flds[3]])
+
+def collapseInterval(filename):
+    uniqintv = {}
+    data = {}
+    f = open(filename)
+    header = f.readline()
+    if 'chr' in header:
+        flds = header.strip().split('\t')
+        key = '\t'.join([flds[0],flds[1],flds[2]])
+        uniqintv[key] = 1
+        data[key] = flds
+    for line in f:
+        flds = line.strip().split('\t')
+        key = '\t'.join([flds[0],flds[1],flds[2]])
+        if uniqintv.has_key(key):
+            uniqintv[key] = uniqintv[key] + 1
+        else:
+            uniqintv[key] = 1
+            data[key] = flds
+    f.close()        
+    for key in uniqintv.keys():
+        print '\t'.join(data[key]+[str(uniqintv[key])])
+        #flds = key.split('\t')
+        #print '\t'.join([flds[0],flds[1],flds[2],'.',str(uniqintv[key])])       
+
+import sys
+
+if sys.argv[2] == 'strand':
+    collapseInterval_strand(sys.argv[1])
+else:
+    collapseInterval(sys.argv[1])
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/collapseBed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/collapseBed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+<tool id="collapseBed" name="collapse">
+  <description>intervals</description>
+  <command interpreter="python">collapseBed2.py $input $strand $score > $outfile </command>
+  <inputs>
+    <param name="input" format="interval" type="data" label="Original file"/>
+    <param name="strand" size="10" type="integer" value="6" label="strand column" help="set 0 to ignore strand information" />
+    <param name="score" size="10" type="integer" value="5" label="for duplicate lines, keep the one with max value in column" help="set 0 to ignore score information" />
+    </inputs>
+  <outputs>
+    <data format="input" name="outfile" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool collapses genomic intervals that have the same position (and strand, if specified) and outputs a set of unique intervals.
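+
+The underlying command line has the form (file names are illustrative)::
+
+    python collapseBed2.py input.bed 6 5 > unique.bed
+
+where 6 is the strand column and 5 is the score column used to pick which
+duplicate to keep.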
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/collapseBed2.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/collapseBed2.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,36 @@
+'''
+collapse intervals
+'''
+
+def collapseInterval_strand(filename,c_strand,c_score):
+    # keeping max column c
+    uniqintv = {}
+    data = {}
+    f = open(filename)
+    header = f.readline()
+    if 'chr' in header:
+        flds = header.strip().split('\t')
+        key = '\t'.join([flds[0],flds[1],flds[2],flds[c_strand]])
+        uniqintv[key] = float(flds[c_score])
+        data[key] = flds
+    for line in f:
+        flds = line.strip().split('\t')
+        key = '\t'.join([flds[0],flds[1],flds[2],flds[c_strand]])
+        if not uniqintv.has_key(key):
+            uniqintv[key] = float(flds[c_score])
+            data[key] = flds
+        elif uniqintv[key] < float(flds[c_score]):
+            uniqintv[key] = float(flds[c_score])
+            data[key] = flds
+            
+    f.close()        
+    for key in uniqintv.keys():
+        print '\t'.join(data[key])
+        
+import sys
+
+if sys.argv[2] == '0': # ignore strand: fall back to column 1 (chrom), already part of the key
+    sys.argv[2] = 1
+if sys.argv[3] == '0': # ignore score: fall back to column 2 (start), already part of the key
+    sys.argv[3] = 2
+collapseInterval_strand(sys.argv[1],int(sys.argv[2])-1,int(sys.argv[3])-1)
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/collapseTab.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/collapseTab.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,37 @@
+'''
+collapse tabular files, with key columns, and max columns
+'''
+
+def collapseTab(filename,c_key,c_max):
+    # keeping rows with max value in column c_max
+    nCol = max(max(c_key),c_max)
+    c_max = c_max - 1
+    for i in range(len(c_key)):
+        c_key[i] = c_key[i] - 1
+    uniqintv = {}
+    data = {}
+    f = open(filename)
+    for line in f:
+        flds = line.strip().split('\t')
+        if len(flds) < nCol:
+            continue
+        key = ''
+        for i in c_key:
+            key = key + flds[i] + '\t' # entries of c_key were already shifted to 0-based above
+        if not uniqintv.has_key(key):
+            uniqintv[key] = float(flds[c_max])
+            data[key] = flds
+        elif uniqintv[key] < float(flds[c_max]):
+            uniqintv[key] = float(flds[c_max])
+            data[key] = flds
+
+    f.close()        
+    for key in uniqintv.keys():
+        print '\t'.join(data[key])
+        
+import sys
+
+# convert string to number list
+c_key = map(int,sys.argv[2].split(','))
+c_max = int(sys.argv[3])
+collapseTab(sys.argv[1],c_key,c_max)
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/collapseTab.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/collapseTab.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+<tool id="collapseTab" name="collapse tabular">
+  <description>files</description>
+  <command interpreter="python">collapseTab.py $input $key $max > $outfile </command>
+  <inputs>
+    <param name="input" format="tabular" type="data" label="Original file"/>
+    <param name="key" size="10" type="text" value="1,2,3" label="key column(s)" help="columns to define unique rows" />
+    <param name="max" size="10" type="text" value="5" label="for lines with identical key, keep the one with max value in this column" help="must be numeric" />
+    </inputs>
+  <outputs>
+    <data format="input" name="outfile" />
+  </outputs>
+  <help>
+
+**What it does**
+
+Similar to 'Group' but returns the entire line.  
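+
+For example, with key columns "1,2" and max column 3, the input::
+
+    chr1  100  5  a
+    chr1  100  9  b
+    chr2  200  1  c
+
+collapses to the row holding the column-3 maximum for each key::
+
+    chr1  100  9  b
+    chr2  200  1  c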
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/convertEnsembl.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/convertEnsembl.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,26 @@
+'''
+convert ensembl bed to ucsc
+add chr to chromosome
+1 = +
+-1 = -
+'''
+
+import sys
+f = open(sys.argv[1])
+out = open(sys.argv[2],'w')
+skip = int(sys.argv[3])
+
+for i in range(skip):
+    f.readline()
+
+for line in f:
+    flds = line.strip().split('\t')
+    flds[0] = 'chr'+flds[0]
+    if flds[5] == '1':
+        flds[5] = '+'
+    else:
+        flds[5] = '-'
+    out.write('\t'.join(flds)+'\n')
+f.close()
+out.close()
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/convertEnsembl.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/convertEnsembl.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,18 @@
+<tool id="convertens" name="convert ensembl">
+  <description>to ucsc</description>
+  <command interpreter="python">convertEnsembl.py $input $output $skip  </command>
+  <inputs>
+    <param name="input" format="interval" type="data" label="Original file"/>
+    <param name="skip" size="10" type="integer" value="0" label="Number of beginning lines to skip"/>
+  </inputs>
+  <outputs>
+    <data format="input" name="output" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool converts an Ensembl-based interval file to UCSC format: it adds 'chr' to the chromosome number (column 1) and replaces '1' and '-1' with '+' and '-' in column 6, respectively.
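+
+For example, the Ensembl-style line::
+
+    1    100    200    geneA    0    1
+
+becomes::
+
+    chr1    100    200    geneA    0    +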
+
+ </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/dreme.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/dreme.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,50 @@
+<tool id="dreme" name="DREME">
+  <description>short motif discovery</description>
+  <command interpreter="python">/Users/xuebing/bin/dreme.py -p $input -png     -e $ethresh
+    #if $background_select.bg_select == "fromfile":
+        -n "${bgfile}"
+    #end if
+
+  &amp;&amp; mv dreme_out/dreme.html ${html_outfile}
+  
+  &amp;&amp; mv dreme_out/dreme.txt ${txt_outfile}
+
+  &amp;&amp; mv dreme_out/dreme.xml ${xml_outfile}
+  
+  &amp;&amp; rm -rf dreme_out
+  
+  </command>
+  <inputs>
+      <param name="input" type="data" format="fasta" label="Sequence file (FASTA)"/>      
+     <conditional name="background_select">
+     <param name="bg_select" type="select" label="Background sequence" >
+ <option value="shuffle" selected="true">shuffle the original sequence</option>
+ <option value="fromfile">load from file</option>
+     </param>
+     <when value="fromfile">
+     <param name="bgfile" type="data" format="fasta" label="Background sequence file (FASTA)"/>
+     </when>
+    </conditional>
+          
+      <param name="ethresh" size="10" type="float" value="0.05" label="E-value threshold"/>
+  </inputs>
+  <outputs>
+
+    <data format="xml" name="xml_outfile" label="${tool.name} on ${on_string} (xml)"/>
+    <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (motif)"/>
+    <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/>    
+  </outputs>
+  <help>
+
+**What it does**
+
+http://meme.sdsc.edu/meme/doc/dreme.html
+
+DREME (Discriminative Regular Expression Motif Elicitation) finds relatively short motifs (up to 8 bases) fast, and can perform discriminative motif discovery if given a negative set: sequences unlikely to contain a motif of interest that is, however, likely to be found in the main ("positive") sequence set. If you do not provide a negative set, the program shuffles the positive set to provide a background (in the role of the negative set).
+
+The input to DREME is one or two sets of DNA sequences. The program uses a Fisher Exact Test to determine the significance of each motif found in the positive set as compared with its representation in the negative set, using a significance threshold that may be set on the command line.
+
+DREME achieves its high speed by restricting its search to regular expressions based on the IUPAC alphabet representing bases and ambiguous characters, and by using a heuristic estimate of generalised motifs' statistical significance. 
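+
+For intuition, the per-motif significance computation can be sketched as a
+2x2 Fisher exact test (a Python illustration with made-up counts and a scipy
+dependency; DREME implements its own test internally)::
+
+    from scipy.stats import fisher_exact
+
+    #            with motif, without motif
+    table = [[25, 75],    # positive sequences
+             [ 5, 95]]    # shuffled / negative sequences
+    odds, pvalue = fisher_exact(table, alternative='greater')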
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/dreme_out/dreme.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/dreme_out/dreme.html Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,3045 @@
[3,045 lines of generated HTML omitted: the stock DREME 4.7.0 results page for the run "dreme -p test.fa -desc xxxx" (embedded meme.css styling, Discovered Motifs table, citation, and model parameters); the original survives in the source only as a truncated escaped-byte dump]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/dreme_out/dreme.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/dreme_out/dreme.txt Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,21 @@
+# DREME 4.7.0
+# command:   dreme -p test.fa -desc xxxx
+# host:      SHARPLAB.MIT.EDU
+# when:      Sun Dec 11 09:26:43 EST 2011
+# positives: 3
+#      from: test.fa (Sat Dec 10 12:52:18 EST 2011)
+# negatives: 3
+#      from: shuffled positives
+#
+# xxxx
+
+
+MEME version 4.7.0
+
+ALPHABET= ACGT
+
+strands: + -
+
+Background letter frequencies (from dataset):
+A 0.243 C 0.270 G 0.243 T 0.243
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/dreme_out/dreme.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/dreme_out/dreme.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,84 @@
+<?xml version='1.0' encoding='UTF-8' standalone='yes'?>
+<!DOCTYPE dreme[
+<!ELEMENT dreme (model, motifs, run_time)>
+<!ATTLIST dreme version CDATA #REQUIRED release CDATA #REQUIRED>
+<!ELEMENT model 
+  (command_line, positives, negatives, background, stop, ngen, add_pv_thresh, 
+  seed, host, when, description?)>
+<!ELEMENT command_line (#PCDATA)>
+<!ELEMENT positives EMPTY>
+<!ATTLIST positives 
+  name CDATA #REQUIRED count CDATA #REQUIRED file CDATA #REQUIRED 
+  last_mod_date CDATA #REQUIRED>
+<!--  
+  negatives must have a file and last_mod_date specified when the from
+  attribute is file.
+-->
+<!ELEMENT negatives EMPTY>
+<!ATTLIST negatives 
+  name CDATA #REQUIRED count CDATA #REQUIRED from (shuffled|file) #REQUIRED
+  file CDATA #IMPLIED last_mod_date CDATA #IMPLIED>
+<!-- 
+  background allows DNA and RNA (AA is not going to be supported with DREME) 
+  however currently only DNA is implemented. Note that when type is dna the
+  value for T must be supplied and when the type is rna the value for U must
+  be supplied. The sum of the frequencies must be 1 (with a small error).
+-->
+<!ELEMENT background EMPTY>
+<!ATTLIST background 
+  type (dna|rna) #REQUIRED
+  A CDATA #REQUIRED C CDATA #REQUIRED G CDATA #REQUIRED 
+  T CDATA #IMPLIED U CDATA #IMPLIED 
+  from (dataset|file) #REQUIRED 
+  file CDATA #IMPLIED last_mod_date CDATA #IMPLIED>
+<!ELEMENT stop EMPTY>
+<!ATTLIST stop 
+  evalue CDATA #IMPLIED count CDATA #IMPLIED time CDATA #IMPLIED>
+<!ELEMENT ngen (#PCDATA)>
+<!ELEMENT seed (#PCDATA)>
+<!ELEMENT add_pv_thresh (#PCDATA)>
+<!ELEMENT host (#PCDATA)>
+<!ELEMENT when (#PCDATA)>
+<!ELEMENT description (#PCDATA)>
+<!ELEMENT motifs (motif+)>
+<!ELEMENT motif (pos+, match+)>
+<!ATTLIST motif
+  id CDATA #REQUIRED seq CDATA #REQUIRED length CDATA #REQUIRED 
+  nsites CDATA #REQUIRED p CDATA #REQUIRED n CDATA #REQUIRED
+  pvalue CDATA #REQUIRED evalue CDATA #REQUIRED unerased_evalue CDATA #REQUIRED>
+<!--
+  pos allows DNA and RNA (AA is not going to be supported with DREME)
+  however currently only DNA is implemented. When the type in the background
+  is 'dna' pos must have a T attribute and when it is 'rna' pos must have a
+  U attribute
+-->
+<!ELEMENT pos EMPTY>
+<!ATTLIST pos
+  i CDATA #REQUIRED A CDATA #REQUIRED C CDATA #REQUIRED G CDATA #REQUIRED 
+  T CDATA #IMPLIED U CDATA #IMPLIED>
+<!ELEMENT match EMPTY>
+<!ATTLIST match
+  seq CDATA #REQUIRED p CDATA #REQUIRED n CDATA #REQUIRED 
+  pvalue CDATA #REQUIRED evalue CDATA #REQUIRED>
+<!ELEMENT run_time EMPTY>
+<!ATTLIST run_time
+  cpu CDATA #REQUIRED real CDATA #REQUIRED stop (evalue|count|time) #REQUIRED>
+]>
+<dreme version="4.7.0" release="Wed Sep 28 17:30:10 EST 2011">
+  <model>
+    <command_line>dreme -p test.fa -desc xxxx</command_line>
+    <positives name="test" count="3" file="test.fa" last_mod_date="Sat Dec 10 12:52:18 EST 2011" />
+    <negatives name="shuffled positive sequences" count="3" from="shuffled"/>
+    <background type="dna" A="0.243" C="0.270" G="0.243" T="0.243" from="dataset"/>
+    <stop evalue="0.05"/>
+    <ngen>100</ngen>
+    <add_pv_thresh>0.01</add_pv_thresh>
+    <seed>1</seed>
+    <host>SHARPLAB.MIT.EDU</host>
+    <when>Sun Dec 11 09:26:43 EST 2011</when>
+  <description>xxxx</description>
+  </model>
+  <motifs>
+  </motifs>
+  <run_time cpu="0.01" real="0.01" stop="evalue"/>
+</dreme>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/endbias.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/endbias.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,52 @@
+'''
+usage:
+
+python endbias.py utr5-coverage utr3-coverage outputfile
+'''
+import sys,math
+
+def getCoverage(filename):
+    f = open(filename)
+    coverage = {}
+    for line in f:
+        flds = line.strip().split('\t')
+        score = float(flds[4])
+        name = (flds[0].split('utr'))[0].strip('_')
+        if coverage.has_key(name):
+            if score > coverage[name]:
+                coverage[name] = score
+        else:
+            coverage[name] = score
+    return coverage
+
+def endBias(filename,utr5,utr3):
+    out = open(filename,'w')
+    for txpt in utr5.keys():
+        if utr3.has_key(txpt):
+            out.write('\t'.join([txpt,str(utr5[txpt]),str(utr3[txpt]),str(math.log((1+utr5[txpt])/(1+utr3[txpt]),2))])+'\n')
+    out.close()
+   
+   
+utr5 = getCoverage(sys.argv[1])
+utr3 = getCoverage(sys.argv[2])
+endBias(sys.argv[3],utr5,utr3)
+            
+'''
+
+utr5 = getCoverage('hmga2-utr5.coverage')
+utr3 = getCoverage('hmga2-utr3.coverage')
+logratio, cov5,cov3= endBias(utr5,utr3)
+2**pylab.median(logratio.values())
+
+log2utr5 = pylab.log2(pylab.array(cov5)+1)
+log2utr3 = pylab.log2(pylab.array(cov3)+1)
+  
+pylab.plot(log2utr5,log2utr3,'bo')   
+
+pylab.show()               
+
+utr5 = getCoverage('control-utr5.coverage')
+utr3 = getCoverage('control-utr3.coverage')
+logratio, cov5,cov3= endBias(utr5,utr3)
+2**pylab.median(logratio.values())
+'''
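The bias written to the last output column is log2 of the pseudocounted coverage ratio, so a transcript with maximum 5' UTR coverage 7 and 3' UTR coverage 3 scores exactly 1::

    >>> import math
    >>> math.log((1 + 7.0) / (1 + 3.0), 2)    # log2((1+utr5)/(1+utr3))
    1.0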
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/endbias.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/endbias.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,11 @@
+<tool id="endbias" name="bias">
+  <description>of UTR coverage</description>
+  <command interpreter="python"> endbias.py $input1 $input2 $output </command>
+  <inputs>
+    <param name="input1" format="txt" type="data" label="5' UTR coverage" help="tabular output from bigWigAverageOverBed"/>
+    <param name="input2" format="txt" type="data" label="3' UTR coverage" help="tabular output from bigWigAverageOverBed"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output" />
+  </outputs>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fasta-dinucleotide-shuffle.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fasta-dinucleotide-shuffle.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,223 @@
+#!/usr/bin/python
+
+import sys, string, random
+import sequence
+
+# 
+# turn on psyco to speed up by 3X
+#
+if __name__=='__main__': 
+  try:
+    import psyco
+    #psyco.log()
+    psyco.full()
+    psyco_found = True
+  except ImportError:
+#    psyco_found = False
+    pass
+#  print >> sys.stderr, "psyco_found", psyco_found
+
+
+# altschulEriksonDinuclShuffle.py
+# P. Clote, Oct 2003
+
+def computeCountAndLists(s):
+
+  #Initialize lists and mono- and dinucleotide dictionaries
+  List = {} #List is a dictionary of lists
+  List['A'] = []; List['C'] = [];
+  List['G'] = []; List['T'] = [];
+  # FIXME: is this ok?
+  List['N'] = []
+  nuclList   = ["A","C","G","T","N"]
+  s       = s.upper()
+  #s       = s.replace("U","T")
+  nuclCnt    = {}  #empty dictionary
+  dinuclCnt  = {}  #empty dictionary
+  for x in nuclList:
+    nuclCnt[x]=0
+    dinuclCnt[x]={}
+    for y in nuclList:
+      dinuclCnt[x][y]=0
+
+  #Compute count and lists
+  nuclCnt[s[0]] = 1
+  nuclTotal     = 1
+  dinuclTotal   = 0
+  for i in range(len(s)-1):
+    x = s[i]; y = s[i+1]
+    List[x].append( y )
+    nuclCnt[y] += 1; nuclTotal  += 1
+    dinuclCnt[x][y] += 1; dinuclTotal += 1
+  assert (nuclTotal==len(s))
+  assert (dinuclTotal==len(s)-1)
+  return nuclCnt,dinuclCnt,List
+
+
+def chooseEdge(x,dinuclCnt):
+  z = random.random()
+  denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']+dinuclCnt[x]['N']
+  numerator = dinuclCnt[x]['A']
+  if z < float(numerator)/float(denom):
+    dinuclCnt[x]['A'] -= 1
+    return 'A'
+  numerator += dinuclCnt[x]['C']
+  if z < float(numerator)/float(denom):
+    dinuclCnt[x]['C'] -= 1
+    return 'C'
+  numerator += dinuclCnt[x]['G']
+  if z < float(numerator)/float(denom):
+    dinuclCnt[x]['G'] -= 1
+    return 'G'
+  numerator += dinuclCnt[x]['T']
+  if z < float(numerator)/float(denom):
+    dinuclCnt[x]['T'] -= 1
+    return 'T'
+  dinuclCnt[x]['N'] -= 1
+  return 'N'
+
+def connectedToLast(edgeList,nuclList,lastCh):
+  D = {}
+  for x in nuclList: D[x]=0
+  for edge in edgeList:
+    a = edge[0]; b = edge[1]
+    if b==lastCh: D[a]=1
+  for i in range(3):
+    for edge in edgeList:
+      a = edge[0]; b = edge[1]
+      if D[b]==1: D[a]=1
+  ok = 0
+  for x in nuclList:
+    if x!=lastCh and D[x]==0: return 0
+  return 1
+
+def eulerian(s):
+  nuclCnt,dinuclCnt,List = computeCountAndLists(s)
+  #compute nucleotides appearing in s
+  nuclList = []
+  for x in ["A","C","G","T","N"]:
+    if x in s: nuclList.append(x)
+  #create dinucleotide shuffle L 
+  firstCh = s[0]  #start with first letter of s
+  lastCh  = s[-1]
+  edgeList = []
+  for x in nuclList:
+    if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] )
+  ok = connectedToLast(edgeList,nuclList,lastCh)
+  return ok,edgeList,nuclList,lastCh
+
+
+def shuffleEdgeList(L):
+  n = len(L); barrier = n
+  for i in range(n-1):
+    z = int(random.random() * barrier)
+    tmp = L[z]
+    L[z]= L[barrier-1]
+    L[barrier-1] = tmp
+    barrier -= 1
+  return L
+
+def dinuclShuffle(s):
+  ok = 0
+  while not ok:
+    ok,edgeList,nuclList,lastCh = eulerian(s)
+  nuclCnt,dinuclCnt,List = computeCountAndLists(s)
+
+  #remove last edges from each vertex list, shuffle, then add back
+  #the removed edges at end of vertex lists.
+  for [x,y] in edgeList: List[x].remove(y)
+  for x in nuclList: shuffleEdgeList(List[x])
+  for [x,y] in edgeList: List[x].append(y)
+
+  #construct the eulerian path
+  L = [s[0]]; prevCh = s[0]
+  for i in range(len(s)-2):
+    ch = List[prevCh][0] 
+    L.append( ch )
+    del List[prevCh][0]
+    prevCh = ch
+  L.append(s[-1])
+  t = string.join(L,"")
+  return t
+
+def main():
+
+        #
+        # defaults
+        #
+        file_name = None
+        seed = 1
+        copies = 1
+
+        #
+        # get command line arguments
+        #
+        usage = """USAGE:
+        %s [options]
+
+        -f <filename>   file name (required)
+        -t <tag>        added to shuffled sequence names
+        -s <seed>       random seed; default: %d
+        -c <n>          make <n> shuffled copies of each sequence; default: %d
+        -h              print this usage message
+        """ % (sys.argv[0], seed, copies)
+
+        # no arguments: print usage
+        if len(sys.argv) == 1:
+                print >> sys.stderr, usage; sys.exit(1)
+
+        tag = ""
+
+        # parse command line
+        i = 1
+        while i < len(sys.argv):
+                arg = sys.argv[i]
+                if (arg == "-f"):
+                        i += 1
+                        try: file_name = sys.argv[i]
+                        except: print >> sys.stderr, usage; sys.exit(1)
+                elif (arg == "-t"):
+                        i += 1
+                        try: tag = sys.argv[i]
+                        except: print >> sys.stderr, usage; sys.exit(1)
+                elif (arg == "-s"):
+                        i += 1
+                        try: seed = string.atoi(sys.argv[i])
+                        except: print >> sys.stderr, usage; sys.exit(1)
+                elif (arg == "-c"):
+                        i += 1
+                        try: copies = string.atoi(sys.argv[i])
+                        except: print >> sys.stderr, usage; sys.exit(1)
+                elif (arg == "-h"):
+                        print >> sys.stderr, usage; sys.exit(1)
+                else:
+                        print >> sys.stderr, "Unknown command line argument: " + arg
+                        sys.exit(1)
+                i += 1
+
+        # check that required arguments given
+        if (file_name == None):
+                print >> sys.stderr, usage; sys.exit(1)
+
+        random.seed(seed)
+
+        # read sequences
+        seqs = sequence.readFASTA(file_name, 'Extended DNA')
+
+        for s in seqs:
+                str = s.getString()
+                #FIXME altschul can't handle ambigs
+                name = s.getName()
+
+                #print >> sys.stderr, ">%s" % name
+
+                for i in range(copies):
+
+                        shuffledSeq = dinuclShuffle(str)
+
+                        if (copies == 1):
+                                print >> sys.stdout, ">%s\n%s" % (name+tag, shuffledSeq)
+                        else:
+                                print >> sys.stdout, ">%s_%d\n%s" % (name+tag, i, shuffledSeq)
+
+if __name__ == '__main__': main()
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fastamarkov.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fastamarkov.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,21 @@
+<tool id="fastamarkov" name="background model">
+  <description>of DNA sequence</description>
+  <command>cat $input | fasta-get-markov -m $m $norc > $output 2> err.txt
+    
+    </command>
+  <inputs>
+      <param name="input" type="data" format="fasta" label="Sequence file (FASTA)"/>
+      <param name="m" size="10" type="integer" value="0" label="Order of Markov model to use"/>
+    <param name="norc" label="Combine forward and reverse complement frequencies" type="boolean" truevalue="" falsevalue="-norc" checked="True"/>
+  </inputs>
+  <outputs>
+    <data format="txt" name="output" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool generates a Markov background model from a FASTA sequence file.
+
+  </help>
+</tool>
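fasta-get-markov is MEME's utility; for intuition only, the order-0 case (-m 0, plain letter frequencies) can be sketched in a few lines — order0_background is a hypothetical helper, not the wrapped tool::

    from collections import Counter

    def order0_background(fasta_path):
        '''Order-0 background model: letter frequencies over a FASTA file.'''
        counts = Counter()
        for line in open(fasta_path):
            if not line.startswith('>'):
                counts.update(line.strip().upper())
        total = float(sum(counts[b] for b in 'ACGT'))
        return dict((b, counts[b] / total) for b in 'ACGT')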
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fastashuffle1.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fastashuffle1.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,17 @@
+<tool id="seqshuffle" name="shuffle sequences">
+  <description>preserving mono-nucleotide frequency</description>
+  <command>cat $input | fasta-shuffle-letters > $output  </command>
+  <inputs>
+     <param name="input" type="data" format="fasta" label="Original FASTA sequence file"/>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="output" />
+  </outputs>
+  <help>
+
+**Description**
+
+Shuffles the nucleotides within each sequence, preserving each sequence's mononucleotide frequencies but not its dinucleotide frequencies.
+
+  </help>
+</tool>
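The operation itself is a plain random permutation of each sequence's letters (fasta-shuffle-letters is MEME's implementation); a minimal sketch of the per-sequence step::

    import random

    def shuffle_letters(seq):
        '''Random permutation of seq; mononucleotide counts are unchanged.'''
        letters = list(seq)
        random.shuffle(letters)
        return ''.join(letters)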
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fastashuffle2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fastashuffle2.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,24 @@
+<tool id="seqshuffle2" name="shuffle sequence">
+  <description>preserving dinucleotide frequency</description>
+  <command interpreter="python">fasta-dinucleotide-shuffle.py -f $input -t $tag -c $n -s $seed > $output </command>
+  <inputs>
+    <param name="input" format="fasta" type="data" label="Original sequence file"/>
+    <param name="tag" type="text" size="40" value="-shuffled" label="tag added to shuffled sequence name"/>
+    <param name="n" type="integer" value="1" label="number of shuffled copies for each sequence"/>
+    <param name="seed" type="integer" value="1" label="random seed" help="the same seed gives the same random sequences"/>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="output" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool shuffles the sequences in the input file but preserves the dinucleotide frequency of each sequence. 
+
+The code implements the Altschul-Erikson dinucleotide shuffle algorithm, described in "Significance of nucleotide sequence alignments: A method for random sequence permutation that preserves dinucleotide and codon usage", S.F. Altschul and B.W. Erikson, Mol. Biol. Evol., 2(6):526--538, 1985. 
+
+Code adapted from http://bioinformatics.bc.edu/clotelab/RNAdinucleotideShuffle/dinucleotideShuffle.html
+
+  </help>
+</tool>
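A property worth checking, assuming dinuclShuffle() from fasta-dinucleotide-shuffle.py above is in scope: every overlapping dinucleotide occurs exactly as often in the shuffled copy, and the first and last letters stay fixed::

    from collections import Counter

    def dinucleotide_counts(seq):
        '''Counter of overlapping dinucleotides in seq.'''
        return Counter(seq[i:i+2] for i in range(len(seq) - 1))

    s = 'ACGTACGTTGCAAGGTC'
    t = dinuclShuffle(s)                       # from fasta-dinucleotide-shuffle.py
    assert dinucleotide_counts(t) == dinucleotide_counts(s)
    assert t[0] == s[0] and t[-1] == s[-1]     # endpoints are preserved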
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fastqdump.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fastqdump.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,18 @@
+<tool id="fastqdump" name="fastq-dump">
+  <description>convert SRA to FASTQ</description>
+  <command>/Users/xuebing/tools/sratoolkit.2.1.9-mac32/fastq-dump -A $input -M $minReadLen -Z > $out_file1 </command>
+  <inputs>
+    <param name="input" format="sra" type="data" label="Original file (SRA)"/>
+    <param name="minReadLen" size="10" type="integer" value="10" label="minimum read length to output"/>
+  </inputs>
+  <outputs>
+    <data format="fastq" name="out_file1" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This is a wrapper for the fastq-dump tool from the SRA Toolkit. See http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo2-old.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo2-old.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,63 @@
+<tool id="fimo" name="motif search">
+  <description>using FIMO</description>
+  <command> fimo 
+    #if $background_select.bg_select == "fromfile":
+    -bgfile $bgfile
+    #end if
+    
+  $norc --output-pthresh $pth --verbosity 1 $motif $database 
+  &amp;&amp; mv fimo_out/fimo.html ${html_outfile}
+  
+  &amp;&amp; mv fimo_out/fimo.txt ${txt_outfile}
+  
+  &amp;&amp; rm -rf fimo_out
+  
+  </command>
+  <inputs>
+    <conditional name="motif_selection">   
+        <param name="motif_sel" type="select" label="Motif Source">
+            <option value="user" selected="true">Use motif file in Your History</option>
+            <option value="known" >Known motif</option>
+        </param>
+        ##<when value="known">
+        ##    <param name="motif" type="select" label="Select motif">
+        ##        <option value="/Users/xuebing/galaxy-dist/tool-data/motif-database/5primerSpliceSite" selected="true">mouse 5 primer splice site</option>
+        ##        <option value="/Users/xuebing/galaxy-dist/tool-data/motif-database/5primerSpliceSite">mouse 3 primer splice site</option>
+        ##        <option value="/Users/xuebing/galaxy-dist/tool-data/motif-database/TATA-Box.meme">TATA box</option>
+        ##    </param>
+        ##</when>
+        <when value="user">
+            <param name="motif" type="data" format="txt" label="Motif file" help="created using the tool create-motif-file, or import from Shared Data"/>
+        </when>
+    </conditional>     
+         
+    <param name="database" type="data" format="fasta" label="Sequence file (FASTA)"/>
+      
+    <conditional name="background_select">
+     <param name="bg_select" type="select" label="Background model" >
+   <option value="uniform" selected="true">uniform</option>
+   <option value="fromfile">load from file</option>
+     </param>
+     <when value="fromfile">
+     <param name="bgfile" type="data" format="txt" label="File for background model"/>
+     </when>
+    </conditional>
+      
+      <param name="pth" size="10" type="float" value="0.01" label="p-value threshold"/>
+    <param name="norc" label="Do not score the reverse complement DNA strand. Both strands are scored by default" type="boolean" truevalue="-norc" falsevalue="" checked="False"/>
+  </inputs>
+  <outputs>
+    <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/>
+    <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (txt)"/>
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool uses FIMO to find matches of a motif in a FASTA file. For more details, see:
+
+http://meme.sdsc.edu/meme/fimo-intro.html
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo2.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,47 @@
+<tool id="fimo" name="motif search">
+  <description>using FIMO</description>
+  <command> fimo 
+    #if $background_select.bg_select == "fromfile":
+    -bgfile $bgfile
+    #end if
+    
+  $norc --max-stored-scores 5000000 --output-pthresh $pth --verbosity 1 $motif $database 
+  &amp;&amp; mv fimo_out/fimo.html ${html_outfile}
+  
+  &amp;&amp; mv fimo_out/fimo.txt ${txt_outfile}
+  
+  &amp;&amp; rm -rf fimo_out
+  
+  </command>
+  <inputs>
+    
+            <param name="motif" type="data" format="txt" label="Motif file" help="created using the tool create-motif-file, or import from Shared Data"/>         
+    <param name="database" type="data" format="fasta" label="Sequence file (FASTA)"/>
+      
+    <conditional name="background_select">
+     <param name="bg_select" type="select" label="Background model" >
+   <option value="uniform" selected="true">uniform</option>
+   <option value="fromfile">load from file</option>
+     </param>
+     <when value="fromfile">
+     <param name="bgfile" type="data" format="txt" label="File for background model"/>
+     </when>
+    </conditional>
+      
+      <param name="pth" size="10" type="float" value="0.0001" label="p-value threshold"/>
+    <param name="norc" label="Do not score the reverse complement DNA strand. Both strands are scored by default" type="boolean" truevalue="-norc" falsevalue="" checked="False"/>
+  </inputs>
+  <outputs>
+    <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/>
+    <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (txt)"/>
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool uses FIMO to find matches of a motif in a FASTA file. For more details, see:
+
+http://meme.sdsc.edu/meme/fimo-intro.html
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo2bed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo2bed.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,46 @@
+'''
+#pattern name sequence name start stop score p-value q-value matched sequence
+constitutive-donor mm9_chr1_39533592_39535592_- 1815 1823 12.032 4.26e-06 0.397 CAGGTAAGT
+constitutive-donor mm9_chr1_59313750_59315750_+ 1889 1897 12.032 4.26e-06 0.397 CAGGTAAGT
+
+#pattern name sequence name start stop score p-value q-value matched sequence
+constitutive-donor mm9_chr1_172019075_172021075_- 1947 1955 12.032 4.26e-06 0.843 CAGGTAAGT
+constitutive-donor mm9_chr1_15300532_15302532_+ 156 164 12.032 4.26e-06 0.843 CAGGTAAGT
+'''
+
+import sys
+
+def fimo2bed(filename,rc):
+    '''
+    parse fimo output (1-based, inclusive match coordinates) to make a bed file
+    rc: whether the scanned sequences had been reverse complemented
+    '''
+    f = open(filename)
+    header = f.readline()
+    for line in f:
+        pattern,posi,begin,stop,score,pv,qv,seq = line.strip().split('\t')
+        flds = posi.split('_')
+        start = flds[-3]
+        end = flds[-2]
+        strand = flds[-1]
+        chrom = '_'.join(flds[1:-3]) #'chrX_random'
+        if not rc:
+            if strand == '+':
+                start1 = str(int(start) + int(begin)-1)
+                end1 = str(int(start) + int(stop))
+                print '\t'.join([chrom,start1,end1,seq,score,strand]) 
+            else:
+                start1 = str(int(end) - int(stop))
+                end1 = str(int(end) - int(begin)+1)
+                print '\t'.join([chrom,start1,end1,seq,score,strand])
+        else:
+            if strand == '-':
+                start1 = str(int(start) + int(begin)-1)
+                end1 = str(int(start) + int(stop))
+                print '\t'.join([chrom,start1,end1,seq,score,'+']) 
+            else:
+                start1 = str(int(end) - int(stop))
+                end1 = str(int(end) - int(begin)+1)
+                print '\t'.join([chrom,start1,end1,seq,score,'-'])      
+
+fimo2bed(sys.argv[1],sys.argv[2]=='rc')                                   
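Worked through on the first sample line in the docstring: the id mm9_chr1_39533592_39535592_- carries the genomic window, so without reverse complementing (second argument not 'rc') a minus-strand match at 1815-1823 maps back from the window end::

    python fimo2bed.py fimo.txt none > hits.bed
    # start1 = 39535592 - 1823     = 39533769   (0-based BED start)
    # end1   = 39535592 - 1815 + 1 = 39533778
    # =>  chr1  39533769  39533778  CAGGTAAGT  12.032  -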
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo2bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo2bed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+<tool id="fimo2bed" name="fimo-to-bed">
+  <description>convert FIMO output to BED</description>
+  <command interpreter="python">fimo2bed.py $input $rc > $output</command>
+  <inputs>
+    <param name="input" format="txt" type="data" label="FIMO output file"/>
+    <param name="rc" label="Check if the sequences are reverse complement" type="boolean" truevalue="rc" falsevalue="none" checked="False"/>
+  </inputs>
+  <outputs>
+    <data format="bed" name="output" />
+  </outputs>
+  <help>
+
+  Only works if your original FIMO input fasta sequences have ids like:: 
+  
+    mm9_chr15_99358448_99360448_-
+  
+  
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo_out/cisml.css
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo_out/cisml.css Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,24 @@
+table {
+ text-align: center;
+ margin-left: auto;
+ margin-right: auto;
+ margin-top: 1%;
+ border: 2px solid;
+ border-collapse: collapse;
+}
+
+th {
+ background-color: wheat;
+}
+
+td,th {
+ border: 2px solid black;
+}
+
+h2 {
+ text-align: center;
+}
+
+h3 {
+ text-align: center;
+}
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo_out/cisml.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo_out/cisml.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,1683 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?xml-stylesheet type="text/xsl" href="fimo-to-html.xsl"?>
+<!-- Begin document body -->
+<cis-element-search
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://zlab.bu.edu/schema/cisml cisml.xsd"
+  xmlns="http://zlab.bu.edu/schema/cisml"
+  xmlns:mem="http://noble.gs.washington.edu/meme"
+>
+<program-name>fimo</program-name>
+<parameters>
+<pattern-file>AATAAA.motif</pattern-file>
+<sequence-file>/Users/xuebing/Downloads/hotair-target.fa.fasta</sequence-file>
+<site-pvalue-cutoff>0.0001</site-pvalue-cutoff>
+<sequence-filtering on-off="off"/>
+</parameters>
+<pattern accession="AATAAA" name="AATAAA">
+<scanned-sequence accession="hg18_chr2_176729347_176730199_+" name="hg18_chr2_176729347_176730199_+">
+</scanned-sequence>
+<scanned-sequence accession="hg18_chr16_62574196_62575125_+" name="hg18_chr16_62574196_62575125_+">
+</scanned-sequence>
[... the remaining empty scanned-sequence entries (832 sequences in all) are truncated in the source dump ...]
+<scanned-sequence accession="hg18_chr4_49340971_49341478_+" name="hg18_chr4_49340971_49341478_+">
+</scanned-sequence>
+</pattern>
+</cis-element-search>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo_out/fimo-to-html.xsl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo_out/fimo-to-html.xsl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,761 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<xsl:stylesheet version="1.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:cis="http://zlab.bu.edu/schema/cisml"
+  xmlns:fimo="http://noble.gs.washington.edu/schema/cisml"
+  xmlns:mem="http://noble.gs.washington.edu/meme"
+>
+  <xsl:output method="html" indent="yes"
+    doctype-public="-//W3C//DTD HTML 4.01 Transitional//EN"
+    doctype-system="http://www.w3.org/TR/html4/loose.dtd"
+  />
[... the body of the stylesheet — the /fimo root template plus the html_head, version, reference, database_and_motifs, high_scoring_motif_occurrences, debugging_information, wrap_sequence, annotate_motif_names, annotate_pvalues and annotate_best_match templates — is truncated in the source dump ...]
+</xsl:stylesheet>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo_out/fimo.gff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo_out/fimo.gff Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,1 @@
+##gff-version 3
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo_out/fimo.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo_out/fimo.html Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,161 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html xmlns:cis="http://zlab.bu.edu/schema/cisml" xmlns:fimo="http://noble.gs.washington.edu/schema/cisml" xmlns:mem="http://noble.gs.washington.edu/meme">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title>FIMO Results</title>
+<style type="text/css">
+        td.left {text-align: left;}
+        td.right {text-align: right; padding-right: 1cm;}
+      </style>
+</head>
+<body bgcolor="#D5F0FF">
+<a name="top_buttons"></a><hr>
+<table summary="buttons" align="left" cellspacing="0"><tr>
+<td bgcolor="#00FFFF"><a href="#database_and_motifs"><b>Database and Motifs</b></a></td>
+<td bgcolor="#DDFFDD"><a href="#sec_i"><b>High-scoring Motif Occurences</b></a></td>
+<td bgcolor="#DDDDFF"><a href="#debugging_information"><b>Debugging Information</b></a></td>
+</tr></table>
+<br><br><hr>
+<center><big><b>FIMO - Motif search tool</b></big></center>
+<hr>
+<p>
+      FIMO version 4.7.0, 
+      (Release date: 0 EST 20)
+    </p>
+<p>
+      For further information on how to interpret these results
+      or to get a copy of the FIMO software please access
+      <a href="http://meme.nbcr.net">http://meme.nbcr.net</a></p>
+<p>If you use FIMO in your research, please cite the following paper:<br>
+      Charles E. Grant, Timothy L. Bailey, and William Stafford Noble,
+      "FIMO: Scanning for occurrences of a given motif",
+      <i>Bioinformatics</i>, <b>27</b>(7):1017-1018, 2011.
+    </p>
+<hr>
+<center><big><b><a name="database_and_motifs">DATABASE AND MOTIFS</a></b></big></center>
+<hr>
+<div style="padding-left: 0.75in; line-height: 1em; font-family: monospace;">
+<p>
+        DATABASE 
+        /Users/xuebing/Downloads/hotair-target.fa.fasta<br>
+        Database contains 
+        832
+        sequences,
+        573269
+        residues
+      </p>
+<p>
+        MOTIFS 
+        AATAAA.motif 
+        (nucleotide)
+        <table>
+<thead><tr>
+<th style="border-bottom: 1px dashed;">MOTIF</th>
+<th style="border-bottom: 1px dashed; padding-left: 1em;">WIDTH</th>
+<th style="border-bottom: 1px dashed; padding-left: 1em;" align="left">
+              BEST POSSIBLE MATCH
+            </th>
+</tr></thead>
+<tbody>
+<tr>
+<td align="right">AATAAA</td>
+<td align="right" style="padding-left: 1em;">6</td>
+<td align="left" style="padding-left: 1em;">AATAAA</td>
+</tr>
+<tr>
+<td align="right">AATAAA</td>
+<td align="right" style="padding-left: 1em;">6</td>
+<td align="left" style="padding-left: 1em;">TTTATT</td>
+</tr>
+</tbody>
+</table></p>
+<p>
+        Random model letter frequencies 
+        (from non-redundant database):
+        <br>A 0.275 C 0.225 G 0.225 T 0.275 </p>
+</div>
+<hr>
+<center><big><b><a name="sec_i">SECTION I: HIGH-SCORING MOTIF OCCURENCES</a></b></big></center>
+<hr>
+<ul>
+<li> Each of the following 
+    0
+    motif occurrences has 
+    
+        p-value less than 
+        0.0001</li>
+<li> The p-value of a motif occurrence is defined as the
+    probability of a random sequence of the same length as the motif
+    matching that position of the sequence with as good or better a score.
+    </li>
+<li> The score for the match of a position in a sequence to a motif
+    is computed by summing the appropriate entries from each column of
+    the position-dependent scoring matrix that represents the motif.
+    </li>
+<li>The table is sorted by increasing p-value.</li>
+<li>If the start position is larger than the end position,
+    the motif occurrence is on the reverse strand.
+    </li>
+</ul>
+<table border="1">
+<thead><tr>
+<th>Motif</th>
+<th>Sequence Name</th>
+<th>Strand</th>
+<th>Start</th>
+<th>End</th>
+<th>p-value</th>
+<th>Matched Sequence</th>
+</tr></thead>
+<tbody></tbody>
+</table>
+<hr>
+<center><big><b><a name="debugging_information">DEBUGGING INFORMATION</a></b></big></center>
+<hr>
+<p>
+    Command line:
+    </p>
+<pre>fimo --verbosity 1 AATAAA.motif /Users/xuebing/Downloads/hotair-target.fa.fasta</pre>
+<p>
+    Settings:
+    </p>
+<pre><table>
+<tr>
+<td style="padding-right: 2em">output directory = fimo_out</td>
+<td style="padding-left: 5em; padding-right: 2em">MEME file name = AATAAA.motif</td>
+<td style="padding-left: 5em; padding-right: 2em">sequence file name = /Users/xuebing/Downloads/hotair-target.fa.fasta</td>
+</tr>
+<tr>
+<td style="padding-right: 2em">background file name = (null)</td>
+<td style="padding-left: 5em; padding-right: 2em">motif name = motif</td>
+<td style="padding-left: 5em; padding-right: 2em">allow clobber = true</td>
+</tr>
+<tr>
+<td style="padding-right: 2em">compute q-values = true</td>
+<td style="padding-left: 5em; padding-right: 2em">output p-threshold set = false</td>
+<td style="padding-left: 5em; padding-right: 2em">output q-threshold set = false</td>
+</tr>
+<tr>
+<td style="padding-right: 2em">text only = false</td>
+<td style="padding-left: 5em; padding-right: 2em">scan both strands = true</td>
+<td style="padding-left: 5em; padding-right: 2em">max sequence length = 250000000</td>
+</tr>
+<tr>
+<td style="padding-right: 2em">output q-value threshold =   1</td>
+<td style="padding-left: 5em; padding-right: 2em">output p-value threshold = 0.0001</td>
+<td style="padding-left: 5em; padding-right: 2em">pseudocount = 0.1</td>
+</tr>
+<tr>
+<td style="padding-right: 2em">verbosity = 1</td>
+<td style="padding-left: 5em; padding-right: 2em"></td>
+<td align="right"></td>
+</tr>
+</table></pre>
+<p>
+      This information can be useful in the event you wish to report a
+      problem with the FIMO software.
+    </p>
+<hr>
+<span style="background-color: #DDDDFF"><a href="#top_buttons"><b>Go to top</b></a></span>
+</body>
+</html>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo_out/fimo.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo_out/fimo.txt Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,1 @@
+#pattern name sequence name start stop score p-value q-value matched sequence
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo_out/fimo.wig
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo_out/fimo.wig Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,1 @@
+track type=wiggle_0 name="motif AATAAA" description="fimo scan of motif AATAAA" windowingFunction=maximum alwaysZero=on
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/fimo_out/fimo.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/fimo_out/fimo.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?xml-stylesheet type="text/xsl" href="fimo-to-html.xsl"?>
+<!-- Begin document body -->
+<fimo version="4.7.0" release="0 EST 20">
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation=  xmlns:fimo="http://noble.gs.washington.edu/schema/fimo"
+>
+<command-line>fimo --verbosity 1 AATAAA.motif /Users/xuebing/Downloads/hotair-target.fa.fasta</command-line>
+<settings>
+<setting name="output directory">fimo_out</setting>
+<setting name="MEME file name">AATAAA.motif</setting>
+<setting name="sequence file name">/Users/xuebing/Downloads/hotair-target.fa.fasta</setting>
+<setting name="background file name">(null)</setting>
+<setting name="motif name">motif</setting>
+<setting name="allow clobber">true</setting>
+<setting name="compute q-values">true</setting>
+<setting name="output p-threshold set">false</setting>
+<setting name="output q-threshold set">false</setting>
+<setting name="text only">false</setting>
+<setting name="scan both strands">true</setting>
+<setting name="max sequence length">250000000</setting>
+<setting name="output q-value threshold">  1</setting>
+<setting name="output p-value threshold">0.0001</setting>
+<setting name="pseudocount">0.1</setting>
+<setting name="verbosity">1</setting>
+</settings>
+<sequence-data num-sequences="832" num-residues="573269" />
+<alphabet>nucleotide</alphabet>
+<motif name="AATAAA" width="6" best-possible-match="AATAAA"/>
+<motif name="AATAAA" width="6" best-possible-match="TTTATT"/>
+<background source="non-redundant database">
+<value letter="A">0.275</value>
+<value letter="C">0.225</value>
+<value letter="G">0.225</value>
+<value letter="T">0.275</value>
+</background>
+<cisml-file>cisml.xml</cisml-file>
+</fimo>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/genomeView.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/genomeView.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,108 @@
+<tool id="genomeview" name="whole genome">
+  <description>plot and correlation</description>
+  <command>cat $script_file | R --vanilla --slave 2> err.log </command>
+  <inputs>
+    <param name="genome" type="select" label="Select genome">
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option>
+    </param>    
+    <param name="resolution" type="integer" label="resolution" value="5000" help="resolution in bps. It must be between 200 and 10,000,000">
+      <validator type="in_range" max="1000000000" min="200" message="Resolution is out of range, Resolution has to be between 200 to 100000000" />
+    </param>
+    <param name="log" label="plot the log" type="boolean" truevalue="log" falsevalue="none" checked="False"/>
+    <param name="union" label="compute correlation in union regions" help="ignore regions covered by neither interval sets. Recommended for sparse data under high resolution when most regions are empty" type="boolean" truevalue="union" falsevalue="none" checked="False"/>    
+    <repeat name="series" title="input file">
+      <param name="label" type="text" value="" size="30" label="Data Label"/>
+      <param name="input" type="data" format="interval" label="Dataset"/>
+    </repeat>       
+  </inputs>
+
+  <configfiles>
+    <configfile name="script_file">
+      ## Setup R error handling to go to stderr
+      options(warn=-1)
+      source("/Users/xuebing/galaxy-dist/tools/mytools/genomeview.r")
+      genome = read.table( "${genome}")
+      uselog = as.character("${log}")
+      union = as.character("${union}")
+      resolution = as.integer("${resolution}")
+      cat('resolution=',resolution,'\n')
+      offset = caloffset(genome)
+      mcov = matrix(ncol=1,nrow=as.integer(offset[length(offset)] / resolution))
+      ## Open output PDF file
+      pdf( "${out_file1}" ,height=4,width=20)
+      labels = character(0)
+      ## Determine range of all series in the plot
+      #for $i, $s in enumerate( $series )
+        x = read.table( "${s.input.file_name}" )
+        res = coverage(x,genome,offset,resolution)
+        plotcov(res,genome,offset,"${s.label.value}",uselog)
+        labels = c(labels,"${s.label.value}")
+        attach(res)
+        mcov = cbind(mcov,cov)
+        detach(res)
+      #end for
+      dev.off() 
+      pdf("${out_file2}")
+      mcov = mcov[,-1]
+      nSample = length(labels)
+      if (nSample > 1) {
+          if (union == 'union') {
+              cm = matrix(0,nrow=nSample,ncol=nSample)
+              for (i in 1:(nSample-1)) {
+                  cm[i,i] = 1
+                  for (j in (i+1):nSample){
+                      cm[i,j] = union_correlation(mcov[,i],mcov[,j])
+                      cm[j,i] = cm[i,j]        
+                  }
+              }
+              cm[nSample,nSample] = 1
+          } else {
+          cm = cor(mcov)
+          }
+          rm(mcov)
+          ##heatmap(-cm,margins=c(8,8),sym=T,scale='none',labRow=labels,labCol=labels)
+          ##heatmap2(cm,'none',TRUE,c(8,8),labels,labels)
+          x = cm
+          ## first pass: run heatmap() only to obtain the row/column ordering
+          h = heatmap(-x,scale='none',sym=T,margins=c(8,8),labRow=labels,labCol=labels)
+          attach(h)
+          x = x[rowInd,colInd]
+          tx = numeric(0)
+          ty = numeric(0)
+          txt = character(0)
+          for (i in 1:nrow(x)){
+              for (j in 1:ncol(x)){
+                  tx = c(tx,i)
+                  ty = c(ty,ncol(x)-j+1)
+                  txt = c(txt,round(x[i,j]*100)/100)
+              }
+          }
+          ## second pass: redraw with the correlation value printed in each cell
+          heatmap(-x,scale='none',sym=T,margins=c(8,8),labRow=labels[rowInd],labCol=labels[colInd],add.expr=text(tx,ty,txt,col='black'))
+          library(gplots)
+          heatmap.2(cm,margins=c(8,8),scale='none',key=TRUE,trace='none',symkey=T,symbreaks=T,col=bluered,labRow=labels,labCol=labels,symm=T)
+      }
+      dev.off() 
+    </configfile>
+  </configfiles>
+
+  <outputs>
+    <data format="pdf" name="out_file1" label="${tool.name} on ${on_string}: (plot)" />
+    <data format="pdf" name="out_file2" label="${tool.name} on ${on_string}: (correlation)" />
+  </outputs>
+
+<help>
+.. class:: infomark
+
+This tool plots the genome-wide coverage of one or more interval sets across all chromosomes at a chosen resolution, and also plots the pairwise correlation matrix when multiple interval sets are provided.
+
+-----
+
+**Example**
+
+.. image:: ./static/images/correlationmatrix.png
+.. image:: ./static/images/wholegenome.png
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/genomeview-old2.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/genomeview-old2.r Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,52 @@
+
+caloffset = function(genome){
+    total_len = sum(as.numeric(genome[,2]))
+    offset = 0
+    for (i in 1:nrow(genome)) {
+        offset = c(offset,offset[i]+genome[i,2])        
+    }
+    offset    
+}
+
+coverage = function(intervals,genome,offset,resolution) {
+
+    nChr = length(offset) - 1
+    total_len = offset[nChr+1]
+    nbin = as.integer(total_len / resolution)
+    #cat('nbin=',nbin,'genomelen=',total_len,'\n')
+    cov = numeric(nbin)#coverage
+    col = numeric(nbin)#color
+    for (i in 1:nChr) {
+        d = intervals[intervals[,1]==as.character(genome[i,1]),2:3]
+        if (nrow(d) > 0){
+            #cat('dim(d)=',dim(d),'\n')
+            d = ceiling((d+offset[i])*nbin/total_len)
+            for (j in 1:nrow(d)){
+                cov[d[j,1]:d[j,2]] = cov[d[j,1]:d[j,2]] + 1
+            }
+        }
+        col[ceiling(offset[i]*nbin/total_len):ceiling(offset[i+1]*nbin/total_len)] = i
+    }
+    list(nbin=nbin,cov=cov,col=col)
+}
+
+# plot coverage
+# res = genomeView(x,genome,100000)
+plotcov = function(res,genome,offset,title,uselog) {
+    if (uselog == 'log'){
+        res$cov = log10(res$cov+1)
+    }
+    ymax = max(res$cov)
+    par(mar=c(5,5,5,1))
+    plot(seq(length(res$cov)),res$cov,type='h',cex=0.1,cex.axis=2,cex.lab=2,cex.main=3,col=res$col,xaxt='n',main=title,xlab='chromosome',ylab='coverage',frame.plot=F,ylim=c(0,ymax))
+    xticks = numeric(nrow(genome))
+    for (i in 1:nrow(genome)){
+       xticks[i] = (offset[i]+offset[i+1])/2*res$nbin/offset[length(offset)]
+    }
+    mtext(genome[,1],side=1,at=xticks,adj=1,las=2,col=seq(nrow(genome)))
+}
+
+union_correlation = function(x,y){
+    z = x>0 | y>0
+    cor(x[z],y[z])
+}
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/genomeview.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/genomeview.r Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,70 @@
+
+caloffset = function(genome){
+    total_len = sum(as.numeric(genome[,2]))
+    offset = 0
+    for (i in 1:nrow(genome)) {
+        offset = c(offset,offset[i]+genome[i,2])        
+    }
+    offset    
+}
+
+coverage = function(intervals,genome,offset,resolution) {
+
+    nChr = length(offset) - 1
+    total_len = offset[nChr+1]
+    nbin = as.integer(total_len / resolution)
+    cov = numeric(nbin)#coverage
+    col = numeric(nbin)#color
+    for (i in 1:nChr) {
+        d = intervals[intervals[,1]==as.character(genome[i,1]),2:3]
+        d = ceiling(((d[,1]+d[,2])/2+offset[i])*nbin/total_len)
+        t = table(d)
+        pos = as.numeric(row.names(t))
+        cov[pos] = cov[pos] + as.numeric(t)
+        col[pos] = i
+    }
+    list(nbin=nbin,cov=cov,col=col)
+}
+
+# plot coverage
+# res = genomeView(x,genome,100000)
+plotcov = function(res,genome,offset,title,uselog) {
+    if (uselog == 'log'){
+        res$cov = log10(res$cov+1)
+    }
+    ymax = max(res$cov)
+    #print(ymax)
+    par(mar=c(5,5,5,1))
+    plot(seq(length(res$cov)),res$cov,type='h',cex=0.1,cex.axis=2,cex.lab=2,cex.main=3,col=res$col,xaxt='n',main=title,xlab='chromosome',ylab='coverage',frame.plot=F,ylim=c(0,ymax))
+    xticks = numeric(nrow(genome))
+    for (i in 1:nrow(genome)){
+       xticks[i] = (offset[i]+offset[i+1])/2*res$nbin/offset[length(offset)]
+    }
+    mtext(genome[,1],side=1,at=xticks,adj=1,las=2,col=seq(nrow(genome)))
+}
+
+union_correlation = function(x,y){
+    z = x>0 | y>0
+    cor(x[z],y[z])
+}
+
+
+heatmap2 = function(x,scale,sym,margins,labRow,labCol){
+    h = heatmap(x,scale=scale,sym=sym,margins=margins,labRow=labRow,labCol=labCol)
+    x = x[h$rowInd,h$colInd]
+    tx = numeric(0)
+    ty = numeric(0)
+    txt = character(0)
+    for (i in 1:nrow(x)){
+        for (j in 1:ncol(x)){
+            tx <- c(tx,i)
+            ty <- c(ty,ncol(x)-j+1)
+            txt <- c(txt,format(x[i,j],digits=2,nsmall=2))
+        }    
+    }
+    #heatmap(x,scale=scale,sym=sym,margins=margins,labRow=labRow[h$rowInd],labCol=labCol[h$colInd],add.expr=text(1:4,1:4,1:4))
+    heatmap(x,scale=scale,sym=sym,margins=margins,labRow=labRow[h$rowInd],labCol=labCol[h$colInd],add.expr=text(tx,ty,txt))
+
+}
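caloffset() and coverage() above lay the chromosomes end to end and count interval midpoints per genome-wide bin; the same bookkeeping as a minimal Python sketch (function names are illustrative)::

    def cumulative_offsets(chrom_sizes):
        '''chrom_sizes: list of (name, length) pairs, as in the .genome files.'''
        offsets, total = {}, 0
        for name, length in chrom_sizes:
            offsets[name] = total
            total += length
        return offsets, total

    def bin_midpoints(intervals, chrom_sizes, resolution):
        '''Count interval midpoints per genome-wide bin of width resolution.'''
        offsets, total = cumulative_offsets(chrom_sizes)
        nbin = max(1, total // resolution)
        cov = [0] * nbin
        for chrom, start, end in intervals:
            mid = offsets[chrom] + (start + end) // 2
            cov[min(nbin - 1, mid * nbin // total)] += 1
        return cov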
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/genomeview_notused
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/genomeview_notused Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,45 @@
+
+caloffset = function(genome){
+    total_len = sum(as.numeric(genome[,2]))
+    offset = 0
+    for (i in 1:nrow(genome)) {
+        offset = c(offset,offset[i]+genome[i,2])        
+    }
+    offset    
+}
+
+coverage = function(intervals,genome,offset,resolution) {
+
+    nChr = length(offset) - 1
+    total_len = offset[nChr+1]
+    nbin = as.integer(total_len / resolution)
+
+    pos = numeric(0)
+    cov = numeric(0) #coverage
+    col = numeric(0) #color
+    for (i in 1:nChr) {
+        d = intervals[intervals[,1]==as.character(genome[i,1]),2:3]
+        d = ceiling(((d[,1]+d[,2])/2+offset[i])*nbin/total_len)
+        t = table(d)
+        pos = c(pos,as.numeric(row.names(t)))
+        cov = c(cov, as.numeric(t))
+        col = c(col,numeric(length(t))+i)
+    }
+    list(nbin=nbin,pos=pos,cov=cov,col=col)
+}
+
+# plot coverage
+# res = genomeView(x,genome,100000)
+plotcov = function(res,genome,offset,title,uselog) {
+    if (uselog == 'log'){
+        res$cov = log10(res$cov+1)
+    }
+    ymax = max(res$cov)
+    par(mar=c(5,5,5,1))
+    plot(res$pos,res$cov,type='h',cex=0.1,cex.axis=2,cex.lab=2,cex.main=3,col=res$col,xaxt='n',main=title,xlab='chromosome',ylab='coverage',frame.plot=F,xlim=c(0,res$nbin),ylim=c(0,ymax))
+    xticks = numeric(nrow(genome))
+    for (i in 1:nrow(genome)){
+       xticks[i] = (offset[i]+offset[i+1])/2*res$nbin/offset[length(offset)]
+    }
+    mtext(genome[,1],side=1,at=xticks,adj=1,las=2,col=seq(nrow(genome)))
+}
b
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/getGenomicScore.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/getGenomicScore.py Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,79 @@
+
+import random,string,os,sys
+
+def getScore(intvfile,outfile,summary_type,bwfilepath,nbin,strand,outplot,span):
+    f = open(intvfile)
+    tmpsh = "".join(random.sample(string.letters+string.digits, 8))
+    tmpout = "".join(random.sample(string.letters+string.digits, 8))
+    tmp = open(tmpsh,'w')
+    if os.path.isdir(bwfilepath):
+        for line in f:
+            flds = line.strip().split('\t')
+            cmd = 'bigWigSummary -type='+summary_type+' '+bwfilepath+'/'+flds[0]+'.bw '+flds[0]+' '+flds[1]+' '+flds[2]+' '+nbin+' >> '+tmpout+' 2>>'+tmpout+'\n'
+            tmp.write(cmd)
+    else:
+        for line in f:
+            flds = line.strip().split('\t')
+            cmd = 'bigWigSummary -type='+summary_type+' '+bwfilepath+' '+flds[0]+' '+flds[1]+' '+flds[2]+' '+nbin+' >> '+tmpout+' 2>>'+tmpout+'\n'
+            tmp.write(cmd)
+    f.close()        
+    # remove blank lines and bigWigSummary error lines (those starting with "Can")
+    tmp.write("sed '/^$/d' "+tmpout+'>'+tmpout+".1\n")
+    tmp.write("sed '/^Can/d' "+tmpout+".1 >"+tmpout+".2\n")
+    # set n/a to 0
+    tmp.write("sed 's/n\/a/0/g' "+tmpout+".2 >"+tmpout+".3\n")
+    # replace text with 0
+    zeros = ''.join(['0\t']*int(nbin))
+    tmp.write("sed 's/^[a-zA-Z]/"+zeros+"/' "+tmpout+".3 >"+tmpout+".4\n")
+    # cut the first nbin columns
+    tmp.write("cut -f 1-"+nbin+" "+tmpout+".4 > "+tmpout+".5\n")     
+    tmp.write("paste "+intvfile+" "+tmpout+".5 >"+outfile+"\n")
+    tmp.close()
+    os.system('chmod +x '+tmpsh)
+    os.system('./'+tmpsh)
+    #os.system('rm '+tmpout+'*')
+    #os.system('rm '+tmpsh)
+
+    # strandness: need to reverse bins for - strand
+    if int(nbin) > 1 and strand > 0: # nbin arrives as a string from sys.argv
+        strand = strand - 1 # python is 0 based
+        os.system('mv '+outfile+' '+tmpout)
+        f = open(tmpout)
+        out = open(outfile,'w')
+        for line in f:
+            flds=line.strip().split('\t')
+            if flds[strand] == '+':
+                out.write(line)
+            else:
+                scores = flds[-int(nbin):]
+                scores.reverse()
+                flds = flds[:-int(nbin)]+scores
+                out.write('\t'.join(flds)+'\n')
+        os.system('rm '+tmpout)
+        f.close()
+        out.close()
+    # plot
+    if int(nbin) > 1:
+        rscript = open(tmpsh,"w")
+        rscript.write("options(warn=-1)\n")
+        rscript.write("x <- read.table('"+outfile+"',sep='\t')\n")
+        rscript.write("x <- x[,(ncol(x)+1-"+nbin+"):ncol(x)]\n")
+        rscript.write("pdf('"+outplot+"')\n")
+        rscript.write("avg <- apply(x,2,mean)\n")
+        rscript.write("err <- apply(x,2,sd)/sqrt(nrow(x))\n")
+        rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+        rscript.write("xticks <- seq(ncol(x))-(1+ncol(x))/2\n")
+        if span >= 0.1:
+            rscript.write("avg = loess(avg~xticks,span="+str(span)+")$fitted\n")
+            rscript.write("err = loess(err~xticks,span="+str(span)+")$fitted\n")
+        rscript.write("par(cex=1.5)\n")
+        rscript.write("plot(xticks,avg,ylab='average conservation score',xlab='relative position (bin)',type='l',lwd=0,ylim=ylim)\n")   
+        rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n")
+        rscript.write("lines(xticks,avg,type='l',lwd=1)\n")   
+        rscript.write("dev.off()\n")
+        rscript.close()
+        os.system("R --vanilla < "+tmpsh)
+        os.system("rm "+tmpsh)
+
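+# command-line usage, inferred from the call below:
+#   python getGenomicScore.py <intervals> <outfile> <mean|max|min|std|coverage> \
+#     <bigwig file or directory> <nbin> <strand column (0=ignore)> <outplot.pdf> <loess span>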
+getScore(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],int(sys.argv[6]),sys.argv[7],float(sys.argv[8]))
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/headtail.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/headtail.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,30 @@
+<tool id="headtail" name="head-or-tail">
+  <description>of a file</description>
+  <command>$headortail -n $nline $input > $out_file1 </command>
+  <inputs>
+    <param name="input" format="txt" type="data" label="Original file"/>
+    <param name="nline" size="10" type="integer" value="10" label="Number of lines to output"/>
+    <param name="headortail" type="select" label="Head or Tail">
+        <option value="head" selected="true">head</option>
+        <option value="tail">tail</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <output name="out_file1" file="testmap.head"/>
+      <param name="input" value="test.map" ftype="TXT"/>
+      <param name="n" value="10"/>
+      <param name="headortail"  value="head" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool is a wrapper for the Unix head/tail commands, which show lines at the beginning or at the end of a file.
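+
+For example, choosing "head" with 10 lines is equivalent to running::
+
+  head -n 10 input.txt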
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/intersectSig.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/intersectSig.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,90 @@
+'''
+find overlap and test significance
+'''
+
+import os,sys
+
+def lineCount(filename):
+    if os.stat(filename).st_size == 0:
+        return 0
+    with open(filename) as f:
+        for i, l in enumerate(f):
+            pass
+    return i+1
+
+def intersect(fileA,fileB,outfile,fraction,reciprocal):
+    # return fileA intervals that overlap with interval in fileB
+    cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' -u -wa -f '+fraction +' '+ reciprocal + '>'+outfile
+    #print cmd
+    os.system(cmd)
+    
+def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N):
+    # shuffle fileA N times, return the distribution of overlaps
+    nOverlap = []
+    for i in range(N):
+        # shuffle fileA using shuffleBed
+        #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled'
+        # using random_interval.py
+        cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile
+        os.system(cmd)
+        intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal)
+        nOverlap.append(lineCount('tmp'))
+    os.system('rm tmp')
+    os.system('rm fileA.shuffled')
+    return nOverlap
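+
+# the shuffled counts form the null distribution; main() feeds them to R,
+# which reports an empirical p-value (the smaller of the two tail frequencies)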
+
+def main():
+    fileA = sys.argv[1]
+    fileB = sys.argv[2]
+    outfile = sys.argv[3]
+    outplot = sys.argv[4]
+    outshuffle = sys.argv[5]
+    N = int(sys.argv[6]) # times to shuffle
+    genomefile = sys.argv[7]
+    fraction = sys.argv[8]
+    if len(sys.argv) == 10:
+        reciprocal = sys.argv[9] # can only be '-r'
+    else:
+        reciprocal = ''
+
+    #print sys.argv
+
+    # number of lines in input
+    nA = lineCount(fileA)
+    nB = lineCount(fileB)    
+
+    # intersect on real data
+    intersect(fileA,fileB,outfile,fraction,reciprocal)
+    # number of overlaps
+    nOverlapReal = lineCount(outfile)
+
+    #print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal
+    
+    # shuffle fileA to estimate background
+    nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N)
+    out = open(outshuffle,'w')
+    out.write("\t".join(map(str,nOverlapNull)))
+    out.close()
+
+    # plot histogram
+    rscript = open('tmp.r','w')
+    rscript.write("options(warn=-1)\n")
+    rscript.write("x0 <- "+str(nOverlapReal)+"\n")
+    rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n")
+    rscript.write("library(MASS)\n")
+    rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n")
+    rscript.write("title <- paste('actual:chance = ',x0,':',format(mean(x),digits=1,nsmall=1),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n")
+    rscript.write("pdf('"+outplot+"')\n")
+    rscript.write("library(grid)\n")
+    rscript.write("library(VennDiagram)\n")
+    rscript.write("venn <- venn.diagram(x=list(A=1:"+str(nA)+",B="+str(nA-nOverlapReal+1)+":"+str(nA+nB-nOverlapReal)+"),filename=NULL,fill=c('red','blue'),col='transparent',alpha=0.5,label.col='black',cex=3,lwd=0,fontfamily='serif',fontface='bold',cat.col = c('red', 'blue'),cat.cex=3,cat.fontfamily='serif',cat.fontface='bold')\n")
+    rscript.write("grid.draw(venn)\n")
+    rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n")
+    rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n")
+    rscript.write("points(x0,0,col='red')\n")
+    rscript.write("dev.off()\n")
+    rscript.close()
+    os.system("R --vanilla < tmp.r")    
+    os.system('rm tmp.r')
+main()
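+
+# usage (inferred from main() above):
+#   python intersectSig.py fileA fileB outfile outplot outshuffle N genomefile fraction [-r]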
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/intersectSig.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/intersectSig.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,30 @@
+<tool id="intersectsig" name="test overlap">
+  <description>of two interval lists</description>
+  <command interpreter="python"> intersectSig.py $fileA $fileB $outfile $outplot $outshuffle $n $genome $fraction $reciprocal </command>
+  <inputs>
+    <param name="fileA" type="data" format="interval" label="Return intervals in file A" />
+    <param name="fileB" type="data" format="interval" label="that overlap with intervals in file B" />
+    <param name="genome" type="select" label="Select genome">
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option>
+    </param>
+    <param name="fraction" size="10" type="float" value="1e-9" label="Minimum overlap required as a fraction of interval in file A" help="Default is 1E-9 (i.e., 1bp)."/>
+ <param name="reciprocal" label="Require that the fraction overlap be reciprocal for A and B" type="boolean" truevalue="-r" falsevalue="" checked="False"/>
+    <param name="n" size="10" type="integer" value="100" label="Number of permutations to run" help="File A is shuffled this number of times and the number of random overlaps is used to estimate the null distribution and compute the p value"/>
+</inputs>
+  <outputs>
+    <data format="interval" name="outfile" label="${tool.name} on ${on_string}:overlap"/> 
+    <data format="txt" name="outshuffle" label="${tool.name} on ${on_string}:null"/> 
+    <data format="pdf" name="outplot" label="${tool.name} on ${on_string}:plot"/> 
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool uses intersectBed to find intervals in the first dataset that overlap with intervals in the second dataset. To estimate the significance of the overlap, the first dataset is shuffled and then intersected with the second dataset, generating a null distribution of the number of overlaps. The tool returns a Venn diagram, a histogram of the null distribution, the overlapping intervals from the first input, and the null distribution of overlap counts.
+
+  </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/intersectbed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/intersectbed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,103 @@
+<tool id="intersectbed" name="intersectBed">
+  <description>intersect two interval sets</description>
+  <command> intersectBed -a $inputa -b $inputb $output_opt $strandness $r -f $f $split > $output_data
+  </command>
+  <inputs>
+      <param name="inputa" type="data" format="interval,bam,bed,gff,vcf" label="Input A (-a)"/>
+      <param name="inputb" type="data" format="interval,bam,bed,gff,vcf" label="Input B (-b)"/>          
+      <param name="output_opt" type="select" label="Output style" >
+ <option value="-wa" selected="true"> -wa: entry in A that overlaps B</option>
+        <option value="-wb" > -wb: entry in B that overlaps A</option>
+        <option value="-wo" > -wo: A,B, and num bases overlap </option>
+        <option value="-wao" > -wao: A,B, and num bases overlap </option>
+        <option value="-u" > -u: A only </option>
+        <option value="-c" > -c: A, num B features overlap </option>
+        <option value="-v" > -v: A without overlap </option>
+      </param>
+  
+    <param name="f" size="10" type="float" value="1E-9" label="Minimum overlap required as a fraction of A"/>
+
+      <param name="strandness" type="select" label="Strand requirement" >
+ <option value="" selected="true"> none </option>
+        <option value="-s" > -s: require overlap on the same strand</option>
+        <option value="-S" > -S: require overlap on the opposite strand </option>
+      </param>
+      
+    <param name="r" label="Require that the fraction overlap be reciprocal for A and B (-r)." type="boolean" truevalue="-r" falsevalue="" checked="False"/>
+        <param name="split" label="Treat'split' BAM or BED12 entries as distinct BED intervals (-split)." type="boolean" truevalue="-split" falsevalue="" checked="False"/></inputs>
+  <outputs>
+    <data format="bed" name="output_data"/> 
+  </outputs>
+  <help>
+
+**What it does**
+
+This is a wrapper for intersectBed.
+
+    Program: intersectBed (v2.13.3)
+    Author:  Aaron Quinlan (aaronquinlan@gmail.com)
+    Summary: Report overlaps between two feature files.
+
+Usage::
+
+    intersectBed [OPTIONS] -a (bed/gff/vcf) -b (bed/gff/vcf)
+
+Options:: 
+ -abam The A input file is in BAM format.  Output will be BAM as well.
+
+ -ubam Write uncompressed BAM output. Default is to write compressed BAM.
+
+ -bed When using BAM input (-abam), write output as BED. The default
+ is to write output in BAM when using -abam.
+
+ -wa Write the original entry in A for each overlap.
+
+ -wb Write the original entry in B for each overlap.
+ - Useful for knowing _what_ A overlaps. Restricted by -f and -r.
+
+ -wo Write the original A and B entries plus the number of base
+ pairs of overlap between the two features.
+ - Overlaps restricted by -f and -r.
+   Only A features with overlap are reported.
+
+ -wao Write the original A and B entries plus the number of base
+ pairs of overlap between the two features.
+ - Overlapping features restricted by -f and -r.
+   However, A features w/o overlap are also reported
+   with a NULL B feature and overlap = 0.
+
+ -u Write the original A entry _once_ if _any_ overlaps found in B.
+ - In other words, just report the fact >=1 hit was found.
+ - Overlaps restricted by -f and -r.
+
+ -c For each entry in A, report the number of overlaps with B.
+ - Reports 0 for A entries that have no overlap with B.
+ - Overlaps restricted by -f and -r.
+
+ -v Only report those entries in A that have _no overlaps_ with B.
+ - Similar to "grep -v" (an homage).
+
+ -f Minimum overlap required as a fraction of A.
+ - Default is 1E-9 (i.e., 1bp).
+ - FLOAT (e.g. 0.50)
+
+ -r Require that the fraction overlap be reciprocal for A and B.
+ - In other words, if -f is 0.90 and -r is used, this requires
+   that B overlap 90% of A and A _also_ overlaps 90% of B.
+
+ -s Require same strandedness.  That is, only report hits in B that
+ overlap A on the _same_ strand.
+ - By default, overlaps are reported without respect to strand.
+
+ -S Require different strandedness.  That is, only report hits in B that
+ overlap A on the _opposite_ strand.
+ - By default, overlaps are reported without respect to strand.
+
+ -split Treat "split" BAM or BED12 entries as distinct BED intervals.
+
+ -sorted Use the "chromsweep" algorithm for sorted (-k1,1 -k2,2n) input
+ NOTE: this will trust, but not enforce that data is sorted. Caveat emptor.
+
+  </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/intervalOverlap.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/intervalOverlap.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,82 @@
+'''
+find overlap and test significance
+'''
+
+import os,sys
+
+def lineCount(filename):
+    i = 0
+    with open(filename) as f:
+        for i, l in enumerate(f):
+            pass
+    return i + 1
+
+def intersect(fileA,fileB,outfile,fraction,reciprocal):
+    # return fileA intervals that overlap with interval in fileB
+    cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' -wo -f '+fraction +' '+ reciprocal + '>'+outfile
+    #print cmd
+    os.system(cmd)
+
+def parseIntersect(filename):
+    # stub: count overlapped A and B entries (not yet implemented; currently unused)
+    nA = 0
+    nB = 0
+    return nA,nB
+    
+def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N):
+    # shuffle fileA N times, return the distribution of overlaps
+    nOverlap = []
+    for i in range(N):
+        # shuffle fileA using shuffleBed
+        #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled'
+        # using random_interval.py
+        cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile
+        os.system(cmd)
+        intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal)
+        nOverlap.append(lineCount('tmp'))
+    os.system('rm tmp')
+    os.system('rm fileA.shuffled')
+    return nOverlap
+
+def main():
+    fileA = sys.argv[1]
+    fileB = sys.argv[2]
+    outfile = sys.argv[3]
+    outplot = sys.argv[4]
+    N = int(sys.argv[5]) # times to shuffle
+    genomefile = sys.argv[6]
+    fraction = sys.argv[7]
+    if len(sys.argv) == 9:
+        reciprocal = sys.argv[8] # can only be '-r'
+    else:
+        reciprocal = ''
+
+    print sys.argv
+    
+    # intersect on real data
+    intersect(fileA,fileB,outfile,fraction,reciprocal)
+    # number of overlaps
+    nOverlapReal = lineCount(outfile)
+
+    print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal
+    
+    # shuffle fileA to estimate background
+    nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N)
+
+    # plot histogram
+    rscript = open('tmp.r','w')
+    rscript.write("x0 <- "+str(nOverlapReal)+"\n")
+    rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n")
+    rscript.write("library(MASS)\n")
+    rscript.write("\n")
+    rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n")
+    rscript.write("title <- paste('actual:chance = ',x0,':',round(mean(x)),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n")
+    rscript.write("pdf('"+outplot+"')\n")
+    rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n")
+    rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n")
+    rscript.write("points(x0,0,col='red')\n")
+    rscript.write("dev.off()")
+    rscript.close()
+    os.system("R --vanilla < tmp.r")    
+    os.system('rm tmp.r')
+main()
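+
+# usage (inferred from main() above):
+#   python intervalOverlap.py fileA fileB outfile outplot N genomefile fraction [-r]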
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/intervalSize.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/intervalSize.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,18 @@
+'''
+plot histogram of interval size
+'''
+
+import os,sys
+
+inputfile = sys.argv[1]
+outputfile = sys.argv[2]
+
+rf = open('tmp.r','w')
+rf.write("x <- read.table('"+inputfile+"')\n")
+rf.write("len <- x[,3]-x[,2]\n")
+rf.write("pdf('"+outputfile+"')\n")
+rf.write("hist(len,breaks=100,xlab='interval size',main=paste('mean=',mean(len),sep=''))\n")
+rf.write("dev.off()")
+rf.close()
+os.system("R --vanilla < tmp.r")    
+os.system('rm tmp.r')
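+
+# usage, e.g.: python intervalSize.py intervals.bed histogram.pdf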
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/intervalSize.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/intervalSize.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,17 @@
+<tool id="intervalsize" name="interval size">
+  <description>distribution</description>
+  <command interpreter="python">intervalSize.py $input $output</command>
+  <inputs>
+    <param name="input" format="txt" type="data" label="Plot the size distribution of the following file"/>
+  </inputs>
+  <outputs>
+    <data format="pdf" name="output" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool generates a histogram of the interval size.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/iupac2meme.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/iupac2meme.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+<tool id="iupac2meme" name="create-motif-file">
+  <description>from one sequence</description>
+  <command>iupac2meme 
+    #if $background_select.bg_select == "fromfile":
+    -bg $bgfile
+    #end if
+  -numseqs $numseqs $logodds $motif > $output </command>
+  <inputs>
+      <param name="motif" size="20" type="text" value="AATAAA" label="motif sequence" help="IUPAC motif, such as ACGGWNYCGT"/>
+      <conditional name="background_select">
+     <param name="bg_select" type="select" label="Background model" >
+   <option value="uniform" selected="true">uniform</option>
+   <option value="fromfile">load from file</option>
+     </param>
+     <when value="fromfile">
+     <param name="bgfile" type="data" format="txt" label="File for background model"/>
+     </when>
+    </conditional>
+    
+      <param name="numseqs" size="10" type="integer" value="20" label="Number of sequences (-numseqs)" help="assume frequencies based on this many sequences; default: 20"/>
+    <param name="logodds" label="also output log-odds (PSSM)" help="output the log-odds (PSSM) and frequency (PSPM) motifs; default: PSPM motif only" type="boolean" truevalue="-logodds" falsevalue="" checked="False"/>
+  </inputs>
+  <outputs>
+    <data format="txt" name="output" label="$motif-meme"/>
+  </outputs>
+  <help>
+
+**Description**
+
+Convert an IUPAC motif into a MEME version 4 formatted file suitable for use with FIMO and other MEME Suite programs.
+
+See additional information: 
+
+http://meme.sdsc.edu/meme/doc/iupac2meme.html
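+
+Example (a sketch: build a motif file for the poly(A) signal AATAAA, assuming 20 sequences)::
+
+  iupac2meme -numseqs 20 AATAAA > AATAAA.meme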
+
+**IUPAC code**::
+
+    Nucleotide Code:  Base:
+    ----------------  -----
+    A.................Adenine
+    C.................Cytosine
+    G.................Guanine
+    T (or U)..........Thymine (or Uracil)
+    R.................A or G
+    Y.................C or T
+    S.................G or C
+    W.................A or T
+    K.................G or T
+    M.................A or C
+    B.................C or G or T
+    D.................A or G or T
+    H.................A or C or T
+    V.................A or C or G
+    N.................any base
+
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/makebigwig.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/makebigwig.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,57 @@
+# use of output: move to public_html and visualize in ucsc genome browser with the following:
+# track name="xxx" color=0,0,255 type=bigWig bigDataUrl=http://rous.mit.edu/~wuxbl/xxx.bw
+
+if [ $# -lt 6 ]
+then
+ echo "./bigwig.sh infile outtag bam/bed sorted/none genome strand/none [-split]"
+ exit
+fi
+
+f=$1
+outf=$2
+extension=$3
+sorted=$4
+genome=$5
+strand=$6
+split=$7
+i=i
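+# $i selects the genomeCoverageBed input flag: "-i" for BED, "-ibam" for BAM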
+if [ $extension = bam ]
+then
+ i=ibam
+ if [ $sorted != sorted ]
+ then
+   echo 'sorting bam file...=>' $f.sorted.bam
+   samtools sort $f $f.sorted
+   f=$f.sorted.bam
+ fi
+else
+ if [ $sorted != sorted ]
+ then
+   echo 'sorting bed file...=>' $f.sorted.bed
+   sort -k1,1 $f > $f.sorted.bed
+   f=$f.sorted.bed
+ fi
+fi
+
+ echo 'making bedgraph file...=>' $f.bedgraph
+ if [ $strand != strand ]
+ then
+  genomeCoverageBed -bg -$i $f -g $genome $split > $f.bedgraph
+  echo 'making bigwig file...=>' $outf
+  bedGraphToBigWig $f.bedgraph $genome $outf
+ else
+  genomeCoverageBed -bg -$i $f -g $genome $split -strand + > $f+.bedgraph
+  genomeCoverageBed -bg -$i $f -g $genome $split -strand - > $f-.bedgraph
+  echo 'making bigwig file for + strand...=>' $outf+.bw
+  bedGraphToBigWig $f+.bedgraph $genome $outf+.bw
+  echo 'making bigwig file for - strand...=>' $outf-.bw
+  bedGraphToBigWig $f-.bedgraph $genome $outf-.bw
+ fi
+
+# remove intermediate files
+if [ $sorted != sorted ]
+  then
+   rm $f
+fi
+rm $f*.bedgraph
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/makebigwig.sh-old
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/makebigwig.sh-old Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+# make bigwig file for genome browser visualization
+# usage
+# makebigwig.sh <infilename> <outfile> bedorbam sorted genome strand -split
+# input file types: *.bed, *.bam
+
+# use of output: move to public_html and visualize in ucsc genome browser with the following:
+# track name="xxx" color=0,0,255 type=bigWig bigDataUrl=http://rous.mit.edu/~wuxbl/xxx.bw
+
+if [ $# -lt 6 ]
+then 
+ echo "./makebigwig.sh infile outfile bedorbam sorted genome [-split -strand]"
+ exit
+fi
+
+f=$1
+outf=$2
+extension=$3
+sorted=$4
+genome=$5
+strand=$6
+split=$7
+i=i
+echo 'genome:' $genome
+echo 'strand:' $strand
+
+if [ $extension = bam ]
+then
+ i=ibam
+ if [ $sorted != sorted ]
+ then 
+   echo 'sorting bam file...=>' $f.sorted.bam
+   samtools sort $f $f.sorted
+   f=$f.sorted.bam
+ fi
+else
+ if [ $sorted != sorted ]
+ then
+   echo 'sorting bed file...=>' $f.sorted.bed
+   sort -k1,1 -k2,2g $f > $f.sorted.bed
+   f=$f.sorted.bed
+ fi
+fi
+
+ echo 'making bedgraph file...=>' $f.bedgraph 
+ if [ $strand != strand ]
+ then
+  genomeCoverageBed -bg -$i $f -g $genome $split > $f.bedgraph
+  echo 'making bigwig file...=>' $f.bw
+  bedGraphToBigWig $f.bedgraph $genome $outf
+ else
+  genomeCoverageBed -bg -$i $f -g $genome $split -strand + > $f+.bedgraph
+  genomeCoverageBed -bg -$i $f -g $genome $split -strand - > $f-.bedgraph
+  echo 'making bigwig file for + strand...' $f+.bw
+  bedGraphToBigWig $f+.bedgraph $genome $outf+
+  echo 'making bigwig file for - strand...=>' $f-.bw
+  bedGraphToBigWig $f-.bedgraph $genome $outf-
+ fi
+ rm $f
+
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/makebigwig.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/makebigwig.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,37 @@
+<tool id="makebigwig" name="make bigwig">
+  <description>from BED or BAM</description>
+  <command interpreter="sh"> makebigwig.sh $input $outfile 
+    #if $inputa_format.bedorbam == "bed":
+    bed
+    #else:
+    bam
+    #end if
+    $sorted $genome none $split > $log 2>&amp;1 </command>
+  <inputs>
+      <conditional name="inputa_format">
+     <param name="bedorbam" type="select" label="Select input format" >
+ <option value="bed" selected="true">BED</option>
+ <option value="bam"> BAM</option>
+     </param>
+     <when value="bed">
+     <param name="input" type="data" format="bed" label="Input file"/>
+     </when>
+     <when value="bam">
+     <param name="input" type="data" format="bam" label="Input file"/>
+     </when>
+    </conditional>
+    <param name="genome" type="select" label="Select genome">
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option>
+    </param>
+    <param name="sorted" label="Check if the input is sorted" type="boolean" truevalue="sorted" falsevalue="none" checked="False"/>
+    <param name="split" label="Split junctions" help="Treat 'split' BAM or BED12 entries as distinct BED intervals." type="boolean" truevalue="-split" falsevalue="" checked="False"/>
+  </inputs>
+  <outputs>
+    <data format="txt" name="log" label="makebigwig LOG" />
+        <data format="bigwig" name="outfile" />
+
+  </outputs>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/makewindow.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/makewindow.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+def makeWindow(filename,outfile,window):
+    window = window/2
+    f=open(filename)
+    out = open(outfile,'w')
+    for line in f:
+        flds = line.strip().split()
+        #new position
+        center = (int(flds[1]) + int(flds[2]))/2
+        start = center - window
+        end = center + window
+        if start >= 0:
+            flds[1] = str(start)
+            flds[2] = str(end)
+            out.write('\t'.join(flds)+'\n')
+    f.close()
+    out.close()
+
+import sys
+makeWindow(sys.argv[1],sys.argv[2],int(sys.argv[3]))
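+
+# usage, e.g.: python makewindow.py peaks.bed windows.bed 1000
+# (1000 yields intervals of +/-500 bp around each interval midpoint)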
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/makewindow.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/makewindow.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,18 @@
+<tool id="makewindow" name="make-window">
+  <description>around interval center </description>
+  <command interpreter="python"> makewindow.py $input $output $window </command>
+  <inputs>
+     <param name="input" type="data" format="interval" label="Input interval file"/>
+     <param name="window" type="integer" value="1000" label="window size (bps)" />
+  </inputs>
+  <outputs>
+    <data format="input" name="output" />
+  </outputs>
+  <help>
+
+**Description**
+
+For each interval in the input file, take the midpoint, then extend it by windowsize/2 bps on each side.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/meme.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/meme.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,349 @@
+<tool id="meme_meme" name="MEME" version="1.0.0">
+  <requirements><requirement type='package'>meme</requirement></requirements>
+  <description>motif discovery</description>
+  <command>meme "$input1" -o "${html_outfile.files_path}"
+  -nostatus
+
+  ##-p 8 ##number of processors
+
+  #if str( $options_type.options_type_selector ) == 'advanced':
+  -sf "${ str( $options_type.sf ).replace( ' ', '_' ) }"
+  -${options_type.alphabet_type.alphabet_type_selector}
+  -mod "${options_type.mod_type.mod_type_selector}"
+  -nmotifs "${options_type.nmotifs}"
+  -wnsites "${options_type.wnsites}"
+  -maxsize "${options_type.maxsize}"
+
+  #if $options_type.evt &lt; float('inf'):
+    -evt "${options_type.evt}"
+  #end if
+
+  #if str( $options_type.mod_type.mod_type_selector ) != 'oops':
+    #if str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'nsites':
+      -nsites "${options_type.mod_type.motif_occurrence_type.nsites}"
+    #elif str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'min_max_sites':
+      -minsites "${options_type.mod_type.motif_occurrence_type.minsites}" -maxsites "${options_type.mod_type.motif_occurrence_type.maxsites}"
+    #end if
+  #end if
+
+  #if str( $options_type.motif_width_type.motif_width_type_selector ) == 'exact':
+    -w "${options_type.motif_width_type.width}"
+  #else
+    -minw "${options_type.motif_width_type.minw}" -maxw "${options_type.motif_width_type.maxw}"
+  #end if
+
+  #if str( $options_type.motif_trim_type.motif_trim_type_selector ) == 'nomatrim':
+    -nomatrim
+  #else
+    -wg "${options_type.motif_trim_type.wg}" -ws "${options_type.motif_trim_type.ws}" ${options_type.motif_trim_type.noendgaps}
+  #end if
+
+  #if str( $options_type.bfile ) != 'None':
+    -bfile "${options_type.bfile}"
+  #end if
+
+  #if str( $options_type.pspfile ) != 'None':
+    -psp "${options_type.pspfile}"
+  #end if
+
+  #if str( $options_type.alphabet_type.alphabet_type_selector ) == "dna":
+    ${options_type.alphabet_type.revcomp} ${options_type.alphabet_type.pal}
+  #end if
+
+  -maxiter "${options_type.maxiter}" -distance "${options_type.distance}"
+
+  -prior "${options_type.alphabet_type.prior_type.prior_type_selector}"
+  #if str( $options_type.alphabet_type.prior_type.prior_type_selector ) != 'addone':
+    -b "${options_type.alphabet_type.prior_type.prior_b}"
+    #if str( $options_type.alphabet_type.prior_type.plib ) != 'None':
+      -plib "${options_type.alphabet_type.prior_type.plib}"
+    #end if
+  #end if
+
+  #if str( $options_type.alphabet_type.spmap_type.spmap_type_selector ) == 'cons':
+    -cons "${options_type.alphabet_type.spmap_type.cons}"
+  #else
+    -spmap "${options_type.alphabet_type.spmap_type.spmap_type_selector}"
+    -spfuzz "${options_type.alphabet_type.spmap_type.spfuzz}"
+  #end if
+
+  #if str( $options_type.branching_type.branching_type_selector ) == 'x_branch':
+    -x_branch -bfactor "${options_type.branching_type.bfactor}" -heapsize "${options_type.branching_type.heapsize}"
+  #end if
+
+  ##-maxsize "1000000" ##remove hardcoded maxsize? should increase number of processors instead
+
+  #end if
+
+  2&gt;&amp;1 || echo "Error running MEME."
+
+
+  &amp;&amp; mv ${html_outfile.files_path}/meme.html ${html_outfile}
+
+  &amp;&amp; mv ${html_outfile.files_path}/meme.txt ${txt_outfile}
+
+  &amp;&amp; mv ${html_outfile.files_path}/meme.xml ${xml_outfile}
+
+  </command>
+  <inputs>
+    <param format="fasta" name="input1" type="data" label="Sequences"/>
+
+      <conditional name="options_type">
+        <param name="options_type_selector" type="select" label="Options Configuration">
+          <option value="basic" selected="true">Basic</option>
+          <option value="advanced">Advanced</option>
+        </param>
+        <when value="basic">
+          <!-- do nothing here -->
+        </when>
+        <when value="advance
[... remainder of the advanced-options block truncated in the source dump ...]
+    <param name="maxw" type="integer" value="50" label="Max width of motif to search" />
+        </when>
+      </conditional>
+
+      <conditional name="motif_trim_type">
+        <param name="motif_trim_type_selector" type="select" label="Motif trim type">
+          <option value="nomatrim">No motif trim</option>
+          <option value="trim" selected="true">Trim motif</option>
+        </param>
+        <when value="nomatrim">
+          <!-- no values here -->
+        </when>
+        <when value="trim">
+          <param name="wg" type="integer" value="11" label="Gap cost" />
+          <param name="ws" type="integer" value="1" label="Space cost" />
+          <param name="noendgaps" label="Do not penalize endgaps" type="boolean" truevalue="-noendgaps" falsevalue="" checked="False"/>
+        </when>
+      </conditional>
+
+    <param name="bfile" type="data" format="txt" optional="True" label="Background Model" />
+    <param name="pspfile" type="data" format="txt" optional="True" label="Position-Specific Prior" />
+
+    <param name="maxiter" type="integer" value="50" label="Number of iterations of EM to run" />
+    <param name="distance" type="float" value="0.001" label="Convergence criterion" />
+
+      <conditional name="branching_type">
+        <param name="branching_type_selector" type="select" label="x-branching type">
+          <option value="x_branch">Perform x-branching</option>
+          <option value="no_x_branch" selected="true">No x-branching</option>
+        </param>
+        <when value="no_x_branch">
+          <!-- no values here -->
+        </when>
+        <when value="x_branch">
+          <param name="bfactor" type="integer" value="3" label="Number of iterations of branching" />
+          <param name="heapsize" type="integer" value="64" label="Maximum number of heaps to use" />
+        </when>
+      </conditional>
+
+    </when>
+  </conditional>
+
+  <param name="non_commercial_use" label="I certify that I am not using this tool for commercial purposes." type="boolean" truevalue="NON_COMMERCIAL_USE" falsevalue="COMMERCIAL_USE" checked="False">
+    <validator type="expression" message="This tool is only available for non-commercial use.">value == True</validator>
+  </param>
+
+  </inputs>
+  <outputs>
+    <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/>
+    <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (text)"/>
+    <data format="memexml" name="xml_outfile" label="${tool.name} on ${on_string} (xml)"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="meme/meme/meme_input_1.fasta" ftype="fasta" dbkey="hg19"/>
+      <param name="options_type_selector" value="basic"/>
+      <param name="non_commercial_use" value="True"/>
+      <output name="html_outfile" file="meme/meme/meme_output_html_1.html" lines_diff="12"/>
+      <output name="txt_outfile" file="meme/meme/meme_output_txt_1.txt" lines_diff="12"/>
+      <output name="xml_outfile" file="meme/meme/meme_output_xml_1.xml" lines_diff="8"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.**
+
+If you want to specify sequence weights, you must include them at the top of your input FASTA file.
+
+.. class:: infomark
+
+**To cite MEME:**
+Timothy L. Bailey and Charles Elkan, "Fitting a mixture model by expectation maximization to discover motifs in biopolymers", Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology, pp. 28-36, AAAI Press, Menlo Park, California, 1994.
+
+
+For detailed information on MEME, click here_. To view the license_.
+
+.. _here: http://meme.nbcr.net/meme/meme-intro.html
+.. _license: http://meme.nbcr.net/meme/COPYRIGHT.html
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/memelogo.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/memelogo.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,26 @@
+<tool id="memelogo" name="motif logo">
+  <description>of MEME motif</description>
+  <command>ceqlogo -i $input -o tmp.eps -t $title -x ''  
+    &amp;&amp; ps2pdf -dEPSCrop tmp.eps $output
+  </command>
+  <inputs>
+     <param name="input" type="data" format="txt" label="MEME motif file"/>
+     <param name="title" type='text' size="50" label="Title" value="motif1"/>    
+  </inputs>
+  <outputs>
+    <data format="pdf" name="output" />
+  </outputs>
+  <help>
+
+**Description**
+
+Generates a sequence logo from a MEME motif file. See details here:
+
+http://meme.sdsc.edu/meme/doc/ceqlogo.html
+
+**Example output**
+
+.. image:: ./static/images/memelogo.png
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/metaintv.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/metaintv.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,109 @@
+'''
+get binned score of intervals,allow extension
+'''
+
+import os,sys,numpy
+
+from resize import *
+
+from bx.bbi.bigwig_file import BigWigFile
+
+def binning(x,n):
+    # make n bin of x
+    y = numpy.zeros(n,dtype=float)
+    if len(x) == 0:
+        return y
+    step = float(len(x))/n
+    for k in range(n):
+        i = int(step*k)
+        j = int(step*(k+1)) + 1
+        y[k] = x[i:j].mean()
+        #print i,j,k,y[k]
+    return y
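+
+# note: j = int(step*(k+1)) + 1 makes adjacent bins share one element,
+# e.g. binning(numpy.arange(10), 5) averages x[0:3], x[2:5], x[4:7], ...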
+
+def getBinnedScore(bwfile,intvfile,nbin):
+    '''
+    get binned average and std
+    '''
+    fbw = open(bwfile)
+    bw = BigWigFile(file=fbw)
+    fin = open(intvfile)
+    avg = numpy.zeros(nbin)
+    sqr = numpy.zeros(nbin)
+    N = 0
+    for line in fin:
+        #chrom,start,end,name,score,strand
+        flds = line.strip().split('\t')
+        #get the score at base resolution as an array
+        scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
+        if scores is None:
+            print 'not found:\t',line
+            continue
+        N = N + 1
+        #print line,scores
+        # reverse if on minus strand
+        if flds[5] == '-':
+            scores = scores[::-1]
+        # no score = 0    
+        scores = numpy.nan_to_num(scores)
+        # bin the data
+        binned = binning(scores,nbin)
+        avg = avg + binned
+        sqr = sqr + binned**2
+    # compute avg and std
+    avg = avg / N
+    err = ((sqr/N-avg**2)**0.5)/(N**0.5)
+    return avg,err
+
+def getExtendedBinScore(bwfile,intvfile,nbins,exts):
+    '''
+    nbins: n1,n2,n3
+    exts: l1,l2,l3,l4
+    '''
+    # make left extension
+    resize(intvfile,intvfile+'.tmp','start-'+str(exts[0]),'start+'+str(exts[1]),'stranded')
+    # compute binned average
+    avg,err = getBinnedScore(bwfile,intvfile+'.tmp',nbins[0])
+    # make center region
+    resize(intvfile,intvfile+'.tmp','start+'+str(exts[1]),'end-'+str(exts[2]),'stranded')
+    # compute binned average
+    avg1,err1 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[1])    
+    avg = numpy.concatenate((avg,avg1))
+    err = numpy.concatenate((err,err1))
+    # make right region
+    resize(intvfile,intvfile+'.tmp','end-'+str(exts[2]),'end+'+str(exts[3]),'stranded')
+    # compute binned average
+    avg2,err2 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[2])    
+    avg = numpy.concatenate((avg,avg2))
+    err = numpy.concatenate((err,err2))
+    
+    return avg,err
+
+print sys.argv
+prog,bwfile,intvfile,nbin,outfile,outplot = sys.argv
+avg, err = getBinnedScore(bwfile,intvfile,int(nbin))
+out = open(outfile,'w')
+numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+out.close()
+
+# plot
+rscript = open("tmp.r","w")
+rscript.write("options(warn=-1)\n")
+rscript.write("x <- read.table('"+outfile+"')\n")
+rscript.write("pdf('"+outplot+"')\n")
+rscript.write("avg <- x[1,]\n")
+rscript.write("err <- x[2,]\n")
+rscript.write("print(x)\n")
+rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+rscript.write("xticks <- seq(ncol(x))\n")
+rscript.write("plot(xticks,avg,xlab='',ylab='average coverage',type='l',lwd=0,ylim=ylim)\n")   
+rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n")
+rscript.write("lines(xticks,avg,type='l',lwd=1)\n")   
+rscript.write("dev.off()\n")
+rscript.close()
+os.system("R --vanilla < tmp.r")
+os.system("rm tmp.r")
+        
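+
+# usage, e.g.: python metaintv.py signal.bw intervals.bed 20 out.txt out.pdf
+# (the BED file must carry strand in column 6)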
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/metaintv.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/metaintv.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,36 @@
+<tool id="metaintv" name="binned-average">
+  <description>from bigwig</description>
+  <command interpreter="python">binnedAverage.py $bwfile $intvfile $nbin $outfile $outplot </command>
+  <inputs>
+      <param name="intvfile" format="bed" type="data" label="BED file (require strand in column 6)"/>
+      <param name="bwfile" format="bigwig" type="data" label="BigWig file"/> 
+      <param name="nbin" type="integer" value="20" label="number of bins"/>
+                
+  </inputs>
+  <outputs>
+    <data format="tabular" name="outfile" label="${tool.name} on ${on_string}[data]"/>
+        <data format="pdf" name="outplot" label="${tool.name} on ${on_string}[plot]"/>
+  </outputs>
+  <help>
+
+.. class:: infomark
+
+Each interval is binned, and the average base-resolution score/coverage/density from the bigwig file is added as new columns appended to the end of the original file.
+
+**Example**
+
+If your original data has the following format:
+
++-----+-----+---+------+
+|chrom|start|end|other2|
++-----+-----+---+------+
+
+and you choose to divide each interval into 3 bins and return the mean scores of each bin, your output will look like this:
+
++-----+-----+---+------+-----+-----+-----+
+|chrom|start|end|other2|mean1|mean2|mean3|
++-----+-----+---+------+-----+-----+-----+
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/metaintv2.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/metaintv2.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,109 @@
+'''
+get binned score of intervals,allow extension
+'''
+
+import os,sys,numpy
+
+from resize import *
+
+from bx.bbi.bigwig_file import BigWigFile
+
+def binning(x,n):
+    # make n bin of x
+    y = numpy.zeros(n,dtype=float)
+    if len(x) == 0:
+        return y
+    step = float(len(x))/n
+    for k in range(n):
+        i = int(step*k)
+        j = int(step*(k+1)) + 1
+        y[k] = x[i:j].mean()
+        #print i,j,k,y[k]
+    return y
+
+def getBinnedScore(bwfile,intvfile,nbin):
+    '''
+    get binned average and std
+    '''
+    fbw = open(bwfile)
+    bw = BigWigFile(file=fbw)
+    fin = open(intvfile)
+    avg = numpy.zeros(nbin)
+    sqr = numpy.zeros(nbin)
+    N = 0
+    for line in fin:
+        #chrom,start,end,name,score,strand
+        flds = line.strip().split('\t')
+        #get the score at base resolution as an array
+        scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
+        if scores is None:
+            print 'not found:\t',line
+            continue
+        N = N + 1
+        #print line,scores
+        # reverse if on minus strand
+        if flds[5] == '-':
+            scores = scores[::-1]
+        # no score = 0    
+        scores = numpy.nan_to_num(scores)
+        # bin the data
+        binned = binning(scores,nbin)
+        avg = avg + binned
+        sqr = sqr + binned**2
+    # compute avg and std
+    avg = avg / N
+    err = ((sqr/N-avg**2)**0.5)/(N**0.5)
+    return avg,err
+
+def getExtendedBinScore(bwfile,intvfile,nbins,exts):
+    '''
+    nbins: n1,n2,n3
+    exts: l1,l2,l3,l4
+    '''
+    # make left extension
+    resize(intvfile,intvfile+'.tmp','start-'+str(exts[0]),'start+'+str(exts[1]),'stranded')
+    # compute binned average
+    avg,err = getBinnedScore(bwfile,intvfile+'.tmp',nbins[0])
+    # make center region
+    resize(intvfile,intvfile+'.tmp','start+'+str(exts[1]),'end-'+str(exts[2]),'stranded')
+    # compute binned average
+    avg1,err1 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[1])    
+    avg = numpy.concatenate((avg,avg1))
+    err = numpy.concatenate((err,err1))
+    # make right region
+    resize(intvfile,intvfile+'.tmp','end-'+str(exts[2]),'end+'+str(exts[3]),'stranded')
+    # compute binned average
+    avg2,err2 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[2])    
+    avg = numpy.concatenate((avg,avg2))
+    err = numpy.concatenate((err,err2))
+    
+    return avg,err
+
+print sys.argv
+prog,bwfile,intvfile,exts,nbins,outfile,outplot = sys.argv # sys.argv[0] is the script name
+avg, err = getExtendedBinScore(bwfile,intvfile,numpy.fromstring(nbins,sep=','),numpy.fromstring(exts,sep=','))
+out = open(outfile,'w')
+numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+out.close()
+
+# plot
+rscript = open("tmp.r","w")
+rscript.write("options(warn=-1)\n")
+rscript.write("x <- read.table('"+outfile+"')\n")
+rscript.write("pdf('"+outplot+"')\n")
+rscript.write("avg <- x[1,]\n")
+rscript.write("err <- x[2,]\n")
+rscript.write("print(x)\n")
+rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+rscript.write("xticks <- seq(ncol(x))\n")
+rscript.write("plot(xticks,avg,ylab='average coverage',type='l',lwd=0,ylim=ylim)\n")   
+rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n")
+rscript.write("lines(xticks,avg,type='l',lwd=1)\n")   
+rscript.write("dev.off()\n")
+rscript.close()
+os.system("R --vanilla < tmp.r")
+os.system("rm tmp.r")
+        
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/metaintv3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/metaintv3.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,109 @@
+'''
+get binned score of intervals,allow extension
+'''
+
+import os,sys,numpy
+
+from resize import *
+
+from bx.bbi.bigwig_file import BigWigFile
+
+def binning(x,n):
+    # make n bin of x
+    y = numpy.zeros(n,dtype=float)
+    if len(x) == 0:
+        return y
+    step = float(len(x))/n
+    for k in range(n):
+        i = int(step*k)
+        j = int(step*(k+1)) + 1
+        y[k] = x[i:j].mean()
+        #print i,j,k,y[k]
+    return y
+
+def getBinnedScore(bwfile,intvfile,nbin):
+    '''
+    get binned average and std
+    '''
+    fbw = open(bwfile)
+    bw = BigWigFile(file=fbw)
+    fin = open(intvfile)
+    avg = numpy.zeros(nbin)
+    sqr = numpy.zeros(nbin)
+    N = 0
+    for line in fin:
+        #chrom,start,end,name,score,strand
+        flds = line.strip().split('\t')
+        #get the score at base resolution as an array
+        scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
+        if scores is None:
+            print 'not found:\t',line
+            continue
+        N = N + 1
+        #print line,scores
+        # reverse if on minus strand
+        if flds[5] == '-':
+            scores = scores[::-1]
+        # no score = 0    
+        scores = numpy.nan_to_num(scores)
+        # bin the data
+        binned = binning(scores,nbin)
+        avg = avg + binned
+        sqr = sqr + binned**2
+    # compute avg and std
+    avg = avg / N
+    err = ((sqr/N-avg**2)**0.5)/(N**0.5)
+    return avg,err
+
+def getExtendedBinScore(bwfile,intvfile,nbins,exts):
+    '''
+    nbins: n1,n2,n3
+    exts: l1,l2,l3,l4
+    '''
+    # make left extension
+    resize(intvfile,intvfile+'.tmp','start-'+str(exts[0]),'start+'+str(exts[1]),'stranded')
+    # compute binned average
+    avg,err = getBinnedScore(bwfile,intvfile+'.tmp',nbins[0])
+    # make center region
+    resize(intvfile,intvfile+'.tmp','start+'+str(exts[1]),'end-'+str(exts[2]),'stranded')
+    # compute binned average
+    avg1,err1 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[1])    
+    avg = numpy.concatenate((avg,avg1))
+    err = numpy.concatenate((err,err1))
+    # make right region
+    resize(intvfile,intvfile+'.tmp','end-'+str(exts[2]),'end+'+str(exts[3]),'stranded')
+    # compute binned average
+    avg2,err2 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[2])    
+    avg = numpy.concatenate((avg,avg2))
+    err = numpy.concatenate((err,err2))
+    
+    return avg,err
+
+print sys.argv
+prog,bwfile,intvfile,exts,nbins,outfile,outplot = sys.argv # sys.argv[0] is the script name
+avg, err = getExtendedBinScore(bwfile,intvfile,numpy.fromstring(nbins,sep=','),numpy.fromstring(exts,sep=','))
+out = open(outfile,'w')
+numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+out.close()
+
+# plot
+rscript = open("tmp.r","w")
+rscript.write("options(warn=-1)\n")
+rscript.write("x <- read.table('"+outfile+"')\n")
+rscript.write("pdf('"+outplot+"')\n")
+rscript.write("avg <- x[1,]\n")
+rscript.write("err <- x[2,]\n")
+rscript.write("print(x)\n")
+rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+rscript.write("xticks <- seq(ncol(x))\n")
+rscript.write("plot(xticks,avg,ylab='average coverage',type='l',lwd=0,ylim=ylim)\n")   
+rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n")
+rscript.write("lines(xticks,avg,type='l',lwd=1)\n")   
+rscript.write("dev.off()\n")
+rscript.close()
+os.system("R --vanilla < tmp.r")
+os.system("rm tmp.r")
+        
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/metaintv_ext.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/metaintv_ext.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,128 @@
+'''
+get binned score of intervals,allow extension
+'''
+
+import os,sys,numpy
+import string, random
+
+from resize import *
+
+from bx.bbi.bigwig_file import BigWigFile
+
+def binning(x,n):
+    # make n bin of x
+    y = numpy.zeros(n,dtype=float)
+    if len(x) == 0:
+        return y
+    step = float(len(x))/n
+    for k in range(n):
+        i = int(step*k)
+        j = int(step*(k+1)) + 1
+        y[k] = x[i:j].mean()
+        #print i,j,k,y[k]
+    return y
+
+def getBinnedScore(bwfile,intvfile,nbin):
+    '''
+    get binned average and std
+    '''
+    fbw = open(bwfile)
+    bw = BigWigFile(file=fbw)
+    fin = open(intvfile)
+    avg = numpy.zeros(nbin)
+    sqr = numpy.zeros(nbin)
+    N = 0
+    for line in fin:
+        #print N
+        #chrom,start,end,name,score,strand
+        flds = line.strip().split('\t')
+        #get the score at base resolution as an array
+        scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
+        if scores is None:
+            print 'not found:\t',N,line
+            continue
+        N = N + 1
+        #print line,scores
+        # reverse if on minus strand
+        if flds[5] == '-':
+            scores = scores[::-1]
+        # no score = 0    
+        scores = numpy.nan_to_num(scores)
+        # bin the data
+        binned = binning(scores,nbin)
+        avg = avg + binned
+        sqr = sqr + binned**2
+    # compute avg and std
+    avg = avg / N
+    err = ((sqr/N-avg**2)**0.5)/(N**0.5)
+    return avg,err,N
+
+def getExtendedBinScore(bwfile,intvfile,nbins,exts):
+    '''
+    nbins: n1,n2,n3
+    exts: l1,l2,l3,l4
+    '''
+    avg = []
+    err = []
+    tmpfile = "".join(random.sample(string.letters+string.digits, 8))
+    if exts[0]>0 or exts[1]>0:
+        print 'make left extension'
+        resize(intvfile,tmpfile,'start-'+str(exts[0]),'start+'+str(exts[1]),'stranded')
+        print 'compute binned average'
+        avg,err,N = getBinnedScore(bwfile,tmpfile,nbins[0])
+        print 'regions used:',N
+    print 'make center region'
+    resize(intvfile,tmpfile,'start+'+str(exts[1]),'end-'+str(exts[2]),'stranded')
+    print 'compute binned average'
+    avg1,err1,N = getBinnedScore(bwfile,tmpfile,nbins[1])
+    print 'regions used:',N
+    avg = numpy.concatenate((avg,avg1))
+    err = numpy.concatenate((err,err1))
+    if exts[2]>0 or exts[3]>0:
+        print 'make right region'
+        resize(intvfile,tmpfile,'end-'+str(exts[2]),'end+'+str(exts[3]),'stranded')
+        print 'compute binned average'
+        avg2,err2,N = getBinnedScore(bwfile,tmpfile,nbins[2])
+        print 'regions used:',N
+        avg = numpy.concatenate((avg,avg2))
+        err = numpy.concatenate((err,err2))
+    os.system('rm '+tmpfile)
+    return avg,err
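+
+# region layout, given exts=(l1,l2,l3,l4) and nbins=(n1,n2,n3):
+#   [start-l1, start+l2] -> n1 bins; [start+l2, end-l3] -> n2 bins; [end-l3, end+l4] -> n3 bins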
+
+prog,bwfile,intvfile,exts,nbins,outfile,outplot = sys.argv
+nbins = numpy.fromstring(nbins,dtype=int,sep=',')
+exts = numpy.fromstring(exts,dtype=int,sep=',')
+avg, err = getExtendedBinScore(bwfile,intvfile,nbins,exts)
+print 'save data'
+out = open(outfile,'w')
+numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+out.close()
+
+print 'plot'
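+# bin indices marking where the interval body starts/ends on the x axis
+# (note: assumes exts[0]+exts[1] > 0 and exts[2]+exts[3] > 0, else division by zero)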
+start = exts[0]*nbins[0]/(exts[0]+exts[1])
+end = nbins[0]+nbins[1]+exts[2]*nbins[2]/(exts[2]+exts[3])
+#print start,end
+rscript = open("tmp.r","w")
+rscript.write("options(warn=-1)\n")
+rscript.write("x <- read.table('"+outfile+"')\n")
+rscript.write("pdf('"+outplot+"')\n")
+rscript.write("avg <- x[1,]\n")
+rscript.write("err <- x[2,]\n")
+#rscript.write("print(x)\n")
+rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+rscript.write("xticks <- seq(ncol(x))\n")
+#rscript.write("print(xticks)\n")
+rscript.write("plot(xticks,avg,xlab='',ylab='average coverage',type='l',lwd=0,ylim=ylim,xaxt='n')\n")
+rscript.write("axis(1, at=c(min(xticks),"+str(start)+","+str(end)+",max(xticks)),labels=c(-"+str(exts[0])+",0,0,"+str(exts[3])+"), las=2)\n")
+rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n")
+rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
+rscript.write("lines(c(min(xticks),max(xticks)),c(0,0),lwd=2)\n")
+rscript.write("lines(c("+str(start)+","+str(end)+"),c(0,0),lwd=10)\n")
+rscript.write("dev.off()\n")
+rscript.close()
+os.system("R --vanilla --slave < tmp.r")
+os.system("rm tmp.r")
+        
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/metaintv_ext.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/metaintv_ext.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+<tool id="metaintv_ext" name="aggregrate binned-average">
+  <description>from bigwig (allow extension)</description>
+  <command interpreter="python">metaintv_ext.py $bwfile $intvfile $exts $nbins $outfile $outplot > $outlog </command>
+  <inputs>
+      <param name="intvfile" format="interval" type="data" label="Interval file"/>
+      <param name="bwfile" format="bigwig" type="data" label="BigWig file"/> 
+      <param name="exts" type="text" size="80" value="100,50,50,100" label="extensions"/>
+      <param name="nbins" type="text" size="80" value="10,10,10" label="number of bins"/>
+                
+  </inputs>
+  <outputs>
+      <data format="txt" name="outlog" label="${tool.name} on ${on_string}[log]"/>
+    <data format="tabular" name="outfile" label="${tool.name} on ${on_string}[data]"/>
+        <data format="pdf" name="outplot" label="${tool.name} on ${on_string}[plot]"/>
+  </outputs>
+  <help>
+
+.. class:: infomark
+
+To be added
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/phastCons.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/phastCons.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,48 @@
+<tool id="getScore" name="conservation">
+  <description>phastCons or phyloP,vertebrate30way</description>
+  <command interpreter="python">getGenomicScore.py $input $output $score_type $score_path $nbin $strand $outplot $span</command>
+  <inputs>
+      <param name="input" format="interval" type="data" label="Interval file"/>
+      <param name="score_path" type="select" label="Select score" >
+      <option value="/Users/xuebing/galaxy-dist/tool-data/genome/mm8/phastcons" >mm8-phastCons17way</option>
+   <option value="/Users/xuebing/galaxy-dist/tool-data/genome/mm9/phastcon" selected="true">mm9-phastCons30way-vertebrate</option>
+          <option value="/Users/xuebing/galaxy-dist/tool-data/genome/mm9/phyloP30way">mm9-phyloP30way-vertebrate</option>
+          <option value="/Users/xuebing/galaxy-dist/tool-data/genome/hg18/phastCons28wayPlacMam">hg18-phastCons28wayPlacMam</option>                      </param>
+      <param name="score_type" type="select" label="Select score summary type" >
+   <option value="mean" selected="true">mean</option>
+   <option value="max">maximum</option>
+   <option value="min">minimum</option>
+   <option value="std">standard deviation</option>
+   <option value="coverage">coverage:fraction covered</option>
+      </param>
+      <param name="nbin" type="integer" value="1" label="number of bins"/> 
+       <param name="span" size="10" type="float" value="0.1" label="loess span: smoothing parameter" help="value less then 0.1 disables smoothing"/>         
+      <param name="strand" type="integer" value="0" label="Specify the strand column" help="leave 0 to ignore strand information. Only matters if using more than 1 bin"/>          
+  </inputs>
+  <outputs>
+     <data format="pdf" name="outplot" label="${tool.name} on ${on_string}[plot]"/>
+    <data format="interval" name="output" label="${tool.name} on ${on_string}[data]"/>
+  </outputs>
+  <help>
+
+.. class:: infomark
+
+The score for each interval is added as a new column appended at the end of the original file.
+
+**Example**
+
+If your original data has the following format:
+
++-----+-----+---+------+
+|chrom|start|end|other2|
++-----+-----+---+------+
+
+and you choose to return the mean of phastCons scores, your output will look like this:
+
++-----+-----+---+------+----+
+|chrom|start|end|other2|mean|
++-----+-----+---+------+----+
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/plotmatrix.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/plotmatrix.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,77 @@
+import sys,os
+
+infile = sys.argv[1]
+outfile = sys.argv[2]
+uselog = sys.argv[3]
+subset = sys.argv[4]
+reorder = sys.argv[5]
+color = sys.argv[6]
+scale = sys.argv[7] # rescale each row
+cols = sys.argv[8]
+rscript = open('tmp.r','w')
+
+rscript.write("x <- read.table('"+infile+"')\n")
+rscript.write("x <- x[,c("+cols+")]\n")
+rscript.write("nr <- nrow(x) \n")
+rscript.write("nc <- ncol(x)\n")
+rscript.write("rowsum <- apply(x,1,sum)\n")
+
+if subset =='subset':
+    rscript.write("if (nr*nc > 100000) {\n")
+    rscript.write("  nr2 <- as.integer(100000/nc)\n")
+    rscript.write("  subind <- sample(seq(nr),nr2)\n")
+    rscript.write("  x <- x[subind,]\n")
+    rscript.write("  rowsum <- rowsum[subind]\n")
+    rscript.write("}\n")
+
+rscript.write("pdf('"+outfile+"')\n")
+
+if uselog == 'uselog':
+    rscript.write("x <- -(log(as.matrix(x,nc=nc)))\n")
+else:
+    rscript.write("x <- -as.matrix(x,nc=nc)\n")
+if scale == 'scale':
+    rscript.write("x <- scale(x)\n")
+if reorder == 'average':
+    rscript.write("hc <- hclust(dist(x),method= 'average')\n")
+    rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'centroid':
+    rscript.write("hc <- hclust(dist(x),method= 'centroid')\n")
+    rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'complete':
+    rscript.write("hc <- hclust(dist(x),method= 'complete')\n")
+    rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'single':
+    rscript.write("hc <- hclust(dist(x),method= 'single')\n")
+    rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'median':
+    rscript.write("hc <- hclust(dist(x),method= 'median')\n")
+    rscript.write("x <- x[hc$order,]\n")    
+elif reorder == 'sort_by_total':
+    rscript.write("srt <- sort(rowsum,index.return=T)\n")
+    rscript.write("x <- x[srt$ix,]\n")
+elif reorder == 'sort_by_center':
+    rscript.write("srt <- sort(x[,as.integer(nc/2)],index.return=T)\n")
+    rscript.write("x <- x[srt$ix,]\n")
+if color == 'heat':
+    rscript.write("colormap = heat.colors(1000)\n")
+elif color == 'topo':
+    rscript.write("colormap = topo.colors(1000)\n")
+elif color == 'rainbow':
+    rscript.write("colormap = rainbow(1000)\n")
+elif color == 'terrain':
+    rscript.write("colormap = terrain.colors(1000)\n")
+else:
+    rscript.write("colormap = gray.colors(1000)\n")
+
+#rscript.write("qt <- quantile(as.vector(x),probs=c(0.1,0.9))\n")
+#rscript.write("breaks <- c(min(as.vector(x)),seq(qt[1],qt[2],length.out=99),max(as.vector(x)))\n")
+#rscript.write("image(t(x),col=colormap,breaks=breaks,axes=F)\n")
+rscript.write("image(t(x),col=colormap,axes=F)\n")
+rscript.write("dev.off()\n")
+
+rscript.close()
+
+os.system("R --slave < tmp.r")
+os.system("rm tmp.r")
+
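
The reorder branch in plotmatrix.py repeats the same two R lines for each hclust linkage. A behavior-preserving sketch that folds the five clustering cases into one branch, assuming the same rscript file object and reorder string as above:

    HCLUST_METHODS = ('average', 'centroid', 'complete', 'single', 'median')
    if reorder in HCLUST_METHODS:
        # one template covers all linkage methods
        rscript.write("hc <- hclust(dist(x),method='%s')\n" % reorder)
        rscript.write("x <- x[hc$order,]\n")
    elif reorder == 'sort_by_total':
        rscript.write("srt <- sort(rowsum,index.return=T)\n")
        rscript.write("x <- x[srt$ix,]\n")
    elif reorder == 'sort_by_center':
        rscript.write("srt <- sort(x[,as.integer(nc/2)],index.return=T)\n")
        rscript.write("x <- x[srt$ix,]\n")
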
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/plotmatrix.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/plotmatrix.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,43 @@
+<tool id="plotmatrix" name="matrix-visualization">
+  <description>with sorting and clustering</description>
+  <command interpreter="python"> plotmatrix.py $input $output $uselog $subset $reorder $color $scale $cols</command>
+  <inputs>
+    <param name="input" format="tabular" type="data" label="Data file"/>
+    <param name="cols" type="text" value="1,3,8:10" label="data columns" help="e.g.: column 1, 3, 8,9,10"/>
+    <param name="uselog" label="log transform the data" type="boolean" truevalue="uselog" falsevalue="none" checked="True"/>
+    <param name="subset" label="sample a subset if the data is too large" type="boolean" truevalue="subset" falsevalue="none" checked="True"/>
+    <param name="scale" label="normalize by row" type="boolean" truevalue="scale" falsevalue="none" checked="False"/>
+    <param name="reorder" type="select" label="reorder features (rows)">
+      <option value="none" selected="true">None</option>
+      <option value="sort_by_sum">Sort row by sum</option>
+      <option value="sort_by_center">Sort row by center </option>
+      <option value="average">Cluster rows (average)</option>    
+      <option value="median">Cluster rows (median) </option>    
+      <option value="centroid">Cluster rows (centroid)</option>    
+      <option value="complete">Cluster rows (complete)</option>    
+      <option value="single">Cluster rows (single)</option> 
+          </param>
+             
+    <param name="color" type="select" label="color scheme">
+    <option value="heat" selected="true">heat</option>
+    <option value="gray">gray</option>
+    <option value="rainbow">rainbow</option>    
+    <option value="topo">topo</option>    
+    <option value="terrain">terrain</option>    
+    </param>
+  </inputs>
+  <outputs>
+    <data format="pdf" name="output" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool generates a heatmap from the output of the 'align' tool. Each row shows the color-coded coverage of one feature; rows can be sorted or clustered according to the reorder option above.
+
+**Example**
+
+.. image:: ./static/operation_icons/heatmap.png
+
+  </help>
+</tool>
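
The cols text is pasted verbatim into R's c(...), so a spec such as '1,3,8:10' is expanded by R itself. For reference, an equivalent Python expansion of the same mini-syntax (illustrative only; the tool relies on R for this):

    def expand_cols(spec):
        '''expand "1,3,8:10" into [1, 3, 8, 9, 10] (R-style inclusive ranges)'''
        out = []
        for part in spec.split(','):
            if ':' in part:
                lo, hi = map(int, part.split(':'))
                out.extend(range(lo, hi + 1))
            else:
                out.append(int(part))
        return out

    print(expand_cols('1,3,8:10'))  # [1, 3, 8, 9, 10]
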
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/ptb-3t3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/ptb-3t3 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,2172 @@
[file content truncated in the changeset view: 2172 tab-delimited rows, each an interval id (e.g. chr16_57569675_57571675_15783_6_10) followed by a total count and ~100 per-bin coverage counts]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/ptb-ptb
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/ptb-ptb Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,14088 @@
[file content truncated in the changeset view: 14088 tab-delimited rows in the same interval-id / per-bin count format as ptb-3t3]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/r_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/r_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,2 @@
+#!/bin/sh
+infile=$1; shift; R --vanilla "$@" < "$infile"
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/r_wrapper_old.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/r_wrapper_old.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+### Run R providing the R script in $1 as standard input and passing 
+### the remaining arguments on the command line
+
+# Function that writes a message to stderr and exits
+function fail
+{
+    echo "$@" >&2
+    exit 1
+}
+
+# Ensure R executable is found
+which R > /dev/null || fail "'R' is required by this tool but was not found on path" 
+
+# Extract first argument
+infile=$1; shift
+
+# Ensure the file exists
+test -f $infile || fail "R input file '$infile' does not exist"
+
+# Invoke R passing file named by first argument to stdin
+R --vanilla  $* < $infile
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/random_interval.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/random_interval.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,96 @@
+'''
+simulate a random interval set that mimics the size and strand of a reference set 
+'''
+
+def inferSizeFromRefBed(filename):
+    '''
+    read reference interval set, get chrom size information
+    '''
+    chrSize = {}
+    f = open(filename)
+    for line in f:
+        flds = line.strip().split('\t')
+        if not chrSize.has_key(flds[0]):
+            chrSize[flds[0]] = int(flds[2])
+        elif chrSize[flds[0]] < int(flds[2]):
+            chrSize[flds[0]] = int(flds[2])
+    f.close()
+    return chrSize 
+
+def getChrSize(filename):
+    chrSize = {}
+    f = open(filename)
+    for line in f:
+        flds = line.strip().split()
+        if len(flds) >1:
+            chrSize[flds[0]] = int(flds[1])
+    f.close()
+    return chrSize
+    
+def makeWeightedChrom(chrSize):
+    '''
+    make a list of chr_id, the freq is proportional to its length
+    '''
+     
+    genome_len = 0
+    
+    for chrom in chrSize:
+        chrom_len = chrSize[chrom]
+        genome_len += chrom_len
+
+    weighted_chrom = []
+    for chrom in chrSize:
+        weight = int(round(1000*float(chrSize[chrom])/genome_len))
+        weighted_chrom += [chrom]*weight
+
+    return weighted_chrom            
+
+def randomIntervalWithinChrom(infile,outfile,chrSize):
+    '''place each interval at a random position
+    within its original chromosome'''
+    fin = open(infile)
+    fout = open(outfile,'w')
+    n = 0
+    for line in fin:
+        n = n + 1
+        flds = line.strip().split('\t')
+        interval_size = int(flds[2]) - int(flds[1])
+        flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
+        flds[2] = str(int(flds[1])+interval_size)
+        fout.write('\t'.join(flds)+'\n')
+    fin.close()
+    fout.close()   
+
+def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom):
+    '''place each interval at a random position on a chromosome
+    chosen with probability proportional to chromosome length'''
+    fin = open(infile)
+    fout = open(outfile,'w')
+    n = 0
+    for line in fin:
+        n = n + 1
+        flds = line.strip().split('\t')
+        interval_size = int(flds[2]) - int(flds[1])
+        # find a random chrom
+        flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
+        flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
+        flds[2] = str(int(flds[1])+interval_size)
+        fout.write('\t'.join(flds)+'\n')
+    fin.close()
+    fout.close()            
+
+import sys,random
+def main():
+    # python random_interval.py test100.bed testout.bed across human.hg18.genome 
+
+    reference_interval_file = sys.argv[1]
+    output_file = sys.argv[2]
+    across_or_within_chrom = sys.argv[3] # within or across 
+    chrom_size_file = sys.argv[4]
+    chrSize = getChrSize(chrom_size_file)
+    print chrSize.keys()
+    if across_or_within_chrom == 'within':            
+        randomIntervalWithinChrom(reference_interval_file,output_file,chrSize)
+    else:
+        randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize))   
+main() 
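
Note that makeWeightedChrom rounds each chromosome to parts-per-thousand of the genome, so a contig shorter than roughly 0.05% of the genome gets weight 0 and can never be chosen. A sketch of an exact alternative using cumulative lengths and bisection (not part of this changeset):

    import bisect, random

    def make_chrom_picker(chrSize):
        '''return a function that picks a chromosome with probability
        exactly proportional to its length'''
        chroms = sorted(chrSize)
        cum, total = [], 0
        for c in chroms:
            total += chrSize[c]
            cum.append(total)
        def pick():
            # a uniform position in [0, genome_len) falls in chromosome i
            # iff it lies at/after cum[i-1] and before cum[i]
            return chroms[bisect.bisect_right(cum, random.randrange(total))]
        return pick

    # usage with toy sizes: pick = make_chrom_picker({'chr1': 1000, 'chr2': 500})
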
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/random_interval.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/random_interval.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,45 @@
+<tool id="randominterval" name="shuffle intervals">
+  <description>weight chromosome by length</description>
+  <command interpreter="python">random_interval.py $input $output $within $genome </command>
+  <inputs>
+    <param name="input" format="interval" type="data" label="reference interval file to mimic"/>
+    <param name="within" label="randomize within chromosome" help="If checked, for each original interval will move it to a random position in the SAME chromosome. The default is to move it to any chromosome (chance proportional to chromosome size)" type="boolean" truevalue="within" falsevalue="across" checked="False"/>
+    <param name="genome" type="select" label="Select genome">
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="interval" name="output" />
+  </outputs>
+  <help>
+
+
+**What it does**
+
+This tool will generate a set of intervals randomly distributed in the genome, mimicking the size distribution of the reference set. The same number of intervals are generated.
+
+
+**How it works**
+
+For each interval in the reference set, the script picks a random position as the new start in the genome, and then picks the end such that the random interval has the same size as the original one. By default the interval may move to any chromosome, with probability proportional to chromosome length. Alternatively, it can pick a random position in the same chromosome, so that each chromosome in the randomized set has the same number of intervals as in the reference set. The chromosome size can either be learned from the reference set (chromosome size = max(interval end)) or read from a chromosome size file. When learning from the reference set, only regions spanned by reference intervals are used to generate random intervals; regions (possibly entire chromosomes) not covered by the reference set will not appear in the output.
+
+**Chromosome size file**
+
+Chromosome size files for hg18, hg19, mm8, and mm9 can be found in 'Shared Data'. To use those files, select the correct one and import it into the history; the file will then be listed in the drop-down menu of this tool. You can also make your own chromosome size file: each line specifies the size of a chromosome (tab-delimited):
+
+chr1 92394392
+
+chr2 232342342    
+
+
+You can use the following script from the UCSC genome browser to download chromosome size files for other genomes:
+  
+http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/fetchChromSizes
+
+
+  </help>
+  
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/removeDuplicate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/removeDuplicate.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,10 @@
+<tool id="removeDuplicate" name="remove duplicate">
+  <description>lines</description>
+  <command> cat $input | sort | uniq > $output </command>
+  <inputs>
+    <param name="input" format="txt" type="data" label="Original file"/>
+  </inputs>
+  <outputs>
+    <data format="input" name="output" />
+  </outputs>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/resize.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/resize.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,57 @@
+'''
+change start and end of interval files
+'''
+
+import sys
+
+def resize(infile,outfile,expr_start,expr_end,strand):
+    fin = open(infile)
+    fout = open(outfile,'w')
+    if expr_start[0:3] == 'end':
+        c1 = 2
+        n1 = int(expr_start[3:])
+    else:
+        c1 = 1
+        n1 = int(expr_start[5:])
+    if expr_end[0:3] == 'end':
+        c2 = 2
+        n2 = int(expr_end[3:])
+    else:
+        c2 = 1
+        n2 = int(expr_end[5:])
+    if strand == 'ignore':
+        for line in fin:
+            flds = line.strip().split('\t')
+            start = int(flds[c1]) + n1
+            if start >= 0:
+                end = int(flds[c2]) + n2
+                if end >= start:
+                    flds[1] = str(start)
+                    flds[2] = str(end)
+                    fout.write('\t'.join(flds)+'\n')
+    else:# upstream downstream
+       for line in fin:
+            flds = line.strip().split('\t')
+            if flds[5] == '+':
+                start = int(flds[c1]) + n1
+                if start >= 0:
+                    end = int(flds[c2]) + n2
+                    if end >= start: 
+                        flds[1] = str(start)
+                        flds[2] = str(end)
+                        fout.write('\t'.join(flds)+'\n')
+            else: # on the - strand
+                start = int(flds[3-c2]) - n2 # end=end+n2
+                if start >= 0:
+                    end = int(flds[3-c1]) - n1
+                    if end >= start:
+                        flds[1] = str(start)
+                        flds[2] = str(end)
+                        fout.write('\t'.join(flds)+'\n')
+    fin.close()
+    fout.close()
+
+if __name__ == "__main__":
+    resize(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5])
+    # python resize.py in.bed out.bed start-3 end+5 strand
+    # python resize.py <input.bed> <output.bed> expr_start expr_end strand(strand/ignore)
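
A worked check of the strand-aware branch: with expr_start='start-3' and expr_end='end+5' the parser yields c1=1, n1=-3, c2=2, n2=5, and for a '-' strand interval the 5' end is the BED end column, so each expression acts on the opposite column:

    # '+' strand chr1:100-200 with start-3/end+5 becomes 97-205 directly;
    # '-' strand: the same expressions act on the swapped columns
    flds = ['chr1', '100', '200', 'name', '0', '-']
    n1, n2 = -3, 5                  # parsed from 'start-3' and 'end+5'
    start = int(flds[1]) - n2       # end+5 moves the BED start: 100 - 5 = 95
    end = int(flds[2]) - n1         # start-3 moves the BED end: 200 + 3 = 203
    assert (start, end) == (95, 203)
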
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/resize.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/resize.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,32 @@
+<tool id="resize" name="resize">
+  <description>intervals</description>
+  <command interpreter="python">resize.py $infile  $outfile $expr_start $expr_end $strand </command>
+  <inputs>
+    <param name="infile" format="interval" type="data" label="Original file"/>
+    <param name="expr_start" size="20" type="text" value="start-0" label="start=" help="e.g. start+10, start-10, end-100"/>
+    <param name="expr_end" size="20" type="text" value="end+0" label="end=" help="e.g. end-100, start+10"/>
+    <param name="strand" label="Enforce strandness" type="boolean" truevalue="strand" falsevalue="ignore" checked="False"/>  
+  </inputs>
+  <outputs>
+    <data format="input" name="outfile" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool changes the start and end of each row in an interval file. When strandness is enforced, chromosome start and end are treated as the 5' and 3' ends for intervals on the '+' strand, and the opposite for intervals on the '-' strand. In an expression such as 'start=start-1000', 'start' and 'end' are interpreted as the 5' and 3' ends, respectively, and the operators '+' and '-' mean moving downstream and upstream, respectively. For example, when enforcing strandness,
+
+**start=start-1000**: extend 1000 bp on the 5' end (move the start upstream)
+
+**start=start+1000**: truncate 1000 bp on the 5' end (move the start downstream)
+
+**end=end+1000**: extend 1000 bp on the 3' end (move the end downstream)
+
+**end=start+1000**: move the end to 1000 bp downstream of the start (returns the first 1000 bp on the 5' end)
+
+**end=start+1**: take the single base at the 5' end of the interval
+
+**start=end-1**: take the single base at the 3' end of the interval
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/revcompl.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/revcompl.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,84 @@
+import sys
+
+compldna = {'A':'T',
+        'C':'G',
+        'G':'C',
+        'T':'A',
+        'U':'A',
+        'M':'K',
+        'K':'M',
+        'W':'W',
+        'S':'S',
+        'R':'Y',
+        'Y':'R',
+        'N':'N'}
+complrna = {'A':'U',
+        'C':'G',
+        'G':'C',
+        'T':'A',
+        'U':'A',
+        'M':'K',
+        'K':'M',
+        'W':'W',
+        'S':'S',
+        'R':'Y',
+        'Y':'R',
+        'N':'N'}
+def complement(seq,compl):  
+    complseq = [compl[base] for base in seq]  
+    return complseq
+    
+def reverse_complement(seq,compl):  
+    seq = list(seq)  
+    seq.reverse()  
+    return ''.join(complement(seq,compl)) 
+            
+def readFastaFile(infile,outfile,compl):
+
+    fin = open(infile)
+    out = open(outfile,'w')
+    
+    currSeq=''
+    currSeqname=None
+    for line in fin:
+        if '>' in line:
+            if  currSeqname !=None:
+                out.write(currSeqname+reverse_complement(currSeq,compl)+'\n')
+                currSeqname=None
+                currSeq=''
+            currSeqname=line
+        else:
+            currSeq=currSeq+line.strip().upper()
+
+    if currSeqname!=None:
+        out.write(currSeqname+reverse_complement(currSeq,compl)+'\n')
+    
+    fin.close()
+    out.close()
+
+def readrawseq(infile,outfile,compl):
+    '''
+    each line is a sequence
+    '''
+    fin = open(infile)
+    out = open(outfile,'w')
+    for line in fin:
+        out.write(reverse_complement(line.strip().upper(),compl)+'\n')
+    fin.close()
+    out.close()
+    
+def main():
+    seqfile = sys.argv[1]
+    outfile = sys.argv[2]
+    fasta = sys.argv[3]
+    rna = sys.argv[4]
+    if rna == 'rna':
+        compl = complrna
+    else:
+        compl = compldna
+    if fasta == 'fasta':
+        readFastaFile(seqfile,outfile,compl)
+    else:
+        readrawseq(seqfile,outfile,compl)
+
+main()
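
A small sanity check of the functions above, with values worked out by hand against the two lookup tables:

    # 'ACGTN' reversed is 'NTGCA'; complementing each base gives:
    print(reverse_complement('ACGTN', compldna))  # NACGT
    print(reverse_complement('ACGTN', complrna))  # NACGU  (A maps to U)
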
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/revcompl.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/revcompl.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,41 @@
+<tool id="revcompl" name="reverse complement">
+  <description>of DNA/RNA sequences</description>
+  <command interpreter="python">revcompl.py $input $output $fasta $rna </command>
+  <inputs>
+    <param name="input" format="txt" type="data" label="Original sequence file"/>
+    <param name="fasta" label="Check if input is fasta format" type="boolean" truevalue="fasta" falsevalue="txt" checked="False"/>
+    <param name="rna" label="Check if need to output as RNA sequences" type="boolean" truevalue="rna" falsevalue="dna" checked="False"/>
+  </inputs>
+  <outputs>
+    <data format="input" name="output" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool outputs the reverse complement of the DNA/RNA sequences in the input file. The input can be in fasta format or raw sequences (one sequence per line).
+
+Degenerate nucleotides are supported. Here is the match table:
+
+A to T/U
+
+C to G
+
+G to C
+
+T/U to A
+
+M to K
+
+W to W
+
+S to S
+
+R to Y
+
+Y to R
+
+N to N
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/sampline.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/sampline.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+
+"""
+Sampling random records from a file. Each record is defined by a fixed number of lines.
+
+Usage: sampline.py [options] 
+
+Options:
+  -h, --help            show this help message and exit
+  -r, --replacement     Sampling with replacement
+  -i INPUT, --input=INPUT
+                        Input file
+  -o OUTPUT, --output=OUTPUT
+                        Output file
+  -k NSAMPLE, --nSample=NSAMPLE
+                        (required) number of records to be sampled/output
+  -m RECSIZE, --recSize=RECSIZE
+                        (default=1) number of lines spanned by each record
+  -n NSKIP, --nSkip=NSKIP
+                        (default=0) number of comment lines to skip at the
+                        beginning
+
+example:
+    python sampline.py -i test10000.fastq -o out.txt --nSample=5 --recSize=4 --nSkip=0 --replacement
+"""
+
+import optparse, string, random,sys,math,itertools
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+
+    # Parse command line    
+    parser = optparse.OptionParser( usage="%prog [options] " )
+    parser.add_option( "-r", "--replacement",  action="store_true", dest="replacement",default=False,
+                       help="Sampling with replacement" )
+    parser.add_option( "-i", "--input",  dest="input",  default=None,
+                       help="Input file" )
+    parser.add_option( "-o", "--output", dest="output", default=None,
+                       help="Output file" )
+    parser.add_option("-k","--nSample", type='int',dest="nSample",default=None,
+                      help="(required) number of records to be sampled/output" )
+    parser.add_option("-m","--recSize", type='int',dest="recSize",default=1,
+                      help="(default=1) number of lines spanned by each record" )     
+    parser.add_option("-n","--nSkip", type='int',dest="nSkip",default=0,
+                      help="(default=0) number of comment lines to skip at the beginning")    
+    options, args = parser.parse_args()
+    #assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument"
+    
+    sampline(options.input,options.output,options.nSample,options.recSize,options.nSkip,options.replacement)
+
+def sample_wr(population, k):
+    "Chooses k random elements (with replacement) from a population"
+    n = len(population)
+    _random, _int = random.random, int  # speed hack
+    return [_int(_random() * n) for i in itertools.repeat(None, k)]
+
+# num of lines
+def readinput(filename):
+    try:
+        f = open (filename)
+    except:
+        print >> sys.stderr, "can't open file "+str(filename)
+        sys.exit(0)
+
+    nline = 0
+    for line in f:
+        nline = nline + 1
+    f.close()    
+    return nline
+
+def sampline(infile,outfile,nSample,recSize,nSkip,replacement):
+    # sample nSample records from file 
+    # each record contains recSize lines
+    # skip the top nSkip lines  
+    
+    nLine = readinput(infile)
+    print 'num of lines in input: ',nLine
+    print 'avoid sampling the first ',nSkip,' lines'
+    print 'lines per record: ',recSize
+
+    if (nLine-nSkip) % recSize:
+        print >> sys.stderr, "the number of lines is not dividable by record size!"
+        sys.exit(0)
+
+    nTotalRecords = (nLine-nSkip) / recSize
+    print "total number of records: ",nTotalRecords
+            
+    if replacement or nTotalRecords < nSample:
+        sel = sample_wr(range(nTotalRecords),nSample)
+    else:
+        sel = random.sample(range(nTotalRecords),nSample)
+    
+    #print len(sel), sorted(sel)
+
+    # output
+    try:
+        fout = open (outfile,'w')
+    except:
+        print >> sys.stderr, "can't open file "+str(outfile)
+        sys.exit(0)
+    fin = open(infile)
+    n = 0 # index of line
+    rec = "" # to store all content of a record
+    nrepeat = 0 # number of times a record is sampled
+    curr_rec = -1           
+    for line in fin:
+        if n < nSkip:
+            n = n + 1
+            fout.write(line)
+            continue
+        
+        if not (n-nSkip) % recSize:# a new record
+            # print the previous sampled record
+            for i in range(nrepeat):
+                fout.write(rec)
+            curr_rec = (n-nSkip)/recSize
+            nrepeat = sel.count(curr_rec)
+            if nrepeat: # sampled            
+                rec = line
+                #print curr_rec,nrepeat           
+        elif (n-nSkip)/recSize == curr_rec:
+            rec = rec + line
+        n = n + 1
+    # if the last record is selected
+    if curr_rec == nTotalRecords-1:
+        for i in range(nrepeat):
+            fout.write(rec)
+    fin.close()
+    fout.close()          
+        
+
+if __name__ == "__main__": main()
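
sampline.py makes two passes over the input, one to count records and one to emit the selections. When sampling without replacement, a single-pass alternative is reservoir sampling over whole records; a hedged sketch, not the method this tool uses:

    import random

    def reservoir_records(fileobj, k, rec_size=1):
        '''keep k records of rec_size lines each, chosen uniformly in one pass'''
        reservoir, rec, n = [], [], 0
        for line in fileobj:
            rec.append(line)
            if len(rec) == rec_size:
                if n < k:
                    reservoir.append(rec)       # fill phase
                else:
                    j = random.randint(0, n)    # replace with probability k/(n+1)
                    if j < k:
                        reservoir[j] = rec
                rec, n = [], n + 1
        # note: record order in the output is not preserved
        return reservoir
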
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/sampline.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/sampline.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,108 @@
+<tool id="sampline" name="sample">
+  <description>records from a file</description>
+  <command interpreter="python">sampline.py --input=$input --output=$out_file1 --nSample=$nSample --recSize=$recSize --nSkip=$nSkip $replacement</command>
+  <inputs>
+    <param name="input" format="txt" type="data" label="Original file"/>
+    <param name="nSample" size="10" type="integer" value="100" label="Number of records to sample"/>
+    <param name="recSize" size="10" type="integer" value="1" label="Number of lines per record"/>
+    <param name="nSkip" size="10" type="integer" value="0" label="Number of top lines to output directly (without sampling)"/>
+    <param name="replacement" label="Sampling with replacement" type="boolean" truevalue="--replacement" falsevalue="" checked="False"/>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <output name="out_file1" file="testmap.sampled"/>
+      <param name="input" value="test.map" ftype="TXT"/>
+      <param name="nSample" value="100"/>
+      <param name="recSize"  value="1" />
+      <param name="nSkip" value="0" />
+      <param name="replacement" value=""/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool selects random records from a file. Each record is defined by a fixed number of lines.  
+
+- When over-sampling (requesting more records than the file contains), sampling with replacement is used automatically.
+
+-----
+
+**Example 1: sampling from a BED file**
+
+parameters::
+
+    1 line per record, sampling 5 records, without replacement; line 1 (track name) is output directly
+
+Input::
+
+    track name=test.bed
+    chr1 148078400 148078582 CCDS993.1_cds_0_0_chr1_148078401_r 0 -
+    chr11 116124407 116124501 CCDS8374.1_cds_0_0_chr11_116124408_r 0 -
+    chr15 41826029 41826196 CCDS10101.1_cds_0_0_chr15_41826030_f 0 +
+    chr16 142908 143003 CCDS10397.1_cds_0_0_chr16_142909_f 0 +
+    chr2 220229609 220230869 CCDS2443.1_cds_0_0_chr2_220229610_r 0 -
+    chr20 33579500 33579527 CCDS13256.1_cds_0_0_chr20_33579501_r 0 -
+    chr20 33593260 33593348 CCDS13257.1_cds_0_0_chr20_33593261_f 0 +
+    chr5 131621326 131621419 CCDS4152.1_cds_0_0_chr5_131621327_f 0 +
+    chr7 113660517 113660685 CCDS5760.1_cds_0_0_chr7_113660518_f 0 +
+    chrX 152648964 152649196 CCDS14733.1_cds_0_0_chrX_152648965_r 0 -
+
+Output::
+
+    track name=test.bed
+    chr11 116124407 116124501 CCDS8374.1_cds_0_0_chr11_116124408_r 0 -
+    chr16 142908 143003 CCDS10397.1_cds_0_0_chr16_142909_f 0 +
+    chr20 33579500 33579527 CCDS13256.1_cds_0_0_chr20_33579501_r 0 -
+    chr20 33593260 33593348 CCDS13257.1_cds_0_0_chr20_33593261_f 0 +
+    chr5 131621326 131621419 CCDS4152.1_cds_0_0_chr5_131621327_f 0 +
+
+**Example 2: sampling reads from a fastq file**
+
+parameters::
+
+    4 lines per record, sampling 3 records, without replacement
+
+Input::
+
+    @SRR066787.2496 WICMT-SOLEXA:8:1:28:2047 length=36
+    NNANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+    +SRR066787.2496 WICMT-SOLEXA:8:1:28:2047 length=36
+    !!%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    @SRR066787.2497 WICMT-SOLEXA:8:1:28:463 length=36
+    GTGATTAAGAAGAGACTGGCATCACTAAGGTGACAT
+    +SRR066787.2497 WICMT-SOLEXA:8:1:28:463 length=36
+    @A=BBCBBAA@:@:@@@:,?AB:B?BB=*2:@=?AA
+    @SRR066787.2498 WICMT-SOLEXA:8:1:28:704 length=36
+    GAACCCAATTTTCAAAGAAGTGTGACTGCTTGTTTC
+    +SRR066787.2498 WICMT-SOLEXA:8:1:28:704 length=36
+    =?BAABBACCCCAA9>>A=>A?A;;@A>ABBABBB:
+    @SRR066787.2499 WICMT-SOLEXA:8:1:28:997 length=36
+    CGACTTCAGGCTCTCGCTAGCCTTCGCTTGACTGAC
+    +SRR066787.2499 WICMT-SOLEXA:8:1:28:997 length=36
+    BCCBCCB?A1ACAC>;@CCAAABB?8=BA>@?B?@:
+    @SRR066787.2500 WICMT-SOLEXA:8:1:28:582 length=36
+    TCTCTCTCTTTCTCTCTCTCTCTCTCTCTCTCTCTC
+    +SRR066787.2500 WICMT-SOLEXA:8:1:28:582 length=36
+    ?.?.=9C8CCC:BACBCBC?CCC@CBBBCBBACAC8
+
+Output::
+
+    @SRR066787.2497 WICMT-SOLEXA:8:1:28:463 length=36
+    GTGATTAAGAAGAGACTGGCATCACTAAGGTGACAT
+    +SRR066787.2497 WICMT-SOLEXA:8:1:28:463 length=36
+    @A=BBCBBAA@:@:@@@:,?AB:B?BB=*2:@=?AA
+    @SRR066787.2499 WICMT-SOLEXA:8:1:28:997 length=36
+    CGACTTCAGGCTCTCGCTAGCCTTCGCTTGACTGAC
+    +SRR066787.2499 WICMT-SOLEXA:8:1:28:997 length=36
+    BCCBCCB?A1ACAC>;@CCAAABB?8=BA>@?B?@:
+    @SRR066787.2500 WICMT-SOLEXA:8:1:28:582 length=36
+    TCTCTCTCTTTCTCTCTCTCTCTCTCTCTCTCTCTC
+    +SRR066787.2500 WICMT-SOLEXA:8:1:28:582 length=36
+    ?.?.=9C8CCC:BACBCBC?CCC@CBBBCBBACAC8
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/seq2meme.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/seq2meme.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,101 @@
+# -*- coding: iso-8859-1 -*-
+import random,sys,math
+
+#import pylab
+
+def readMotifFile(filename):
+
+    try:
+        f=open(filename)
+    except IOError:
+        print "could not open",filename,"Are you sure this file exists?"
+        sys.exit(1)
+    
+    seqs=[]
+    maxL = 0
+    for line in f:
+        if '>' in line or 'N' in line:
+            continue  # skip fasta headers and sequences containing N
+        else:
+            seq = line.strip().upper()
+            if maxL < len(seq):
+                maxL = len(seq)
+            seqs.append(seq)
+    f.close()
+
+    print len(seqs),'sequences loaded'
+    print 'max seq length:',maxL
+    # rebuild the list: deleting by index while iterating skips elements
+    # and can raise IndexError
+    seqs = [s for s in seqs if len(s) == maxL]
+    print len(seqs),'sequences with length = ',maxL
+    return seqs
+
+
+def createWeightMatrix(seqs,psuedocont):
+
+    motifWidth = len(seqs[0])
+    weightMatrix = []
+    for i in range(motifWidth):
+        weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont})
+
+    # count how many times each base appears at each position of the motif
+    # (added on top of the pseudocounts initialized above)
+    for seq in seqs:
+        for pos in range(motifWidth):
+            weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0
+    
+    # normalize each position to probabilities rather than counts
+    # (the pseudocounts are included in the column totals)
+    for pos in range(motifWidth):
+        totalCount = sum(weightMatrix[pos].values())
+        for letter in weightMatrix[pos].keys():
+            weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount
+    
+    #Return your weight matrix
+    return weightMatrix
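+
+# worked example (illustrative): with pseudocount 1.0 and seqs = ['ACG','ACG'],
+# position 0 starts at {'A':1.0,'C':1.0,'G':1.0,'T':1.0}, the two A's raise it
+# to {'A':3.0,'C':1.0,'G':1.0,'T':1.0}, and normalizing by the total 6.0 gives
+# P(A)=0.5 and P(C)=P(G)=P(T)=1/6.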
+
+def printMemeFormat(weightMatrix,motifName,filename,nSite,background):
+    f = open(filename,'w')
+    f.write('MEME version 4.4\n\n')
+    
+    f.write('ALPHABET= ACGT\n\n')
+
+    f.write('strands: + -\n\n')
+
+    f.write('Background letter frequencies:\n')
+    f.write(background+'\n\n')
+
+    f.write('MOTIF '+motifName+'\n\n') 
+
+    f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n')
+    for position in range(len(weightMatrix)):
+        probsThisPosition=weightMatrix[position]
+        f.write('  '+"%.6f" %(probsThisPosition['A'])+'\t  '+"%.6f" %(probsThisPosition['C'])+'\t  '+"%.6f" %(probsThisPosition['G'])+'\t  '+"%.6f" %(probsThisPosition['T'])+'\t\n')
+    f.write('\n\n')
+    f.close()
+    
+#get a six-decimal-place string representation of a float f (currently unused)
+def sixDecimal(f):
+    return "%.6f" %(f)
+
+def run():
+
+    #Get file name from command line
+    if len(sys.argv) < 6:
+        print "usage: python seq2meme.py motif_fasta outputfile motifName pseudocount background"
+        sys.exit(1)
+    else:
+        motifFile=sys.argv[1]
+        outFile=sys.argv[2]
+        motifName=sys.argv[3]
+        pseudocount = float(sys.argv[4])
+        background=' '.join(sys.argv[5].strip().split(','))
+                
+    motifs=readMotifFile(motifFile)
+
+    #Create weight matrix
+    motifWeightMatrix=createWeightMatrix(motifs,pseudocount)
+    printMemeFormat(motifWeightMatrix,motifName,outFile,len(motifs),background)
+
+run()
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/seq2meme.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/seq2meme.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,21 @@
+<tool id="seq2meme" name="create-motif-file">
+  <description>from fasta file</description>
+  <command interpreter="python">seq2meme.py $input $output $motifName $pseudocount $background</command>
+  <inputs>
+    <param name="input" type="data" format="txt" label="Sequence file" help="all sequences should be the same length"/>
+    <param name="motifName" size="20" type="text" value="motif1" label="Motif name (no spaces allowed)"/>
+    <param name="pseudocount" size="10" type="float" value="1.0" label="Pseudocount"/>
+    <param name="background" size="40" type="text" value="A,0.25,C,0.25,G,0.25,T,0.25" label="Background frequency"/>
+  </inputs>
+  <outputs>
+    <data format="txt" name="output" label="$motifName-meme"/>
+  </outputs>
+  <help>
+
+**Description**
+
+Generate a MEME-format motif file from a set of sequences. Input can be raw sequences (one sequence per line) or FASTA format (one identifier line followed by one sequence line). All sequences should have the same length; shorter sequences are discarded.
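+
+For illustration, the output follows MEME's minimal motif format; assuming a
+3-bp motif built from 10 sequences with the default background, it looks like::
+
+    MEME version 4.4
+
+    ALPHABET= ACGT
+
+    strands: + -
+
+    Background letter frequencies:
+    A 0.25 C 0.25 G 0.25 T 0.25
+
+    MOTIF motif1
+
+    letter-probability matrix: alength= 4 w= 3 nsites= 10 E= 0
+      0.500000    0.166667    0.166667    0.166667
+      0.166667    0.500000    0.166667    0.166667
+      0.166667    0.166667    0.500000    0.166667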
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/seqshuffle.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/seqshuffle.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,49 @@
+import sys
+
+from altschulEriksonDinuclShuffle import *
+            
+def readFastaFile(infile,outfile):
+
+    fin = open(infile)
+    out = open(outfile,'w')
+    
+    currSeq=''
+    currSeqname=None
+    for line in fin:
+        if '>' in line:
+            if  currSeqname !=None:
+                out.write(currSeqname+dinuclShuffle(currSeq)+'\n')
+                currSeqname=None
+                currSeq=''
+            currSeqname=line
+        else:
+            currSeq=currSeq+line.strip().upper()
+
+    if currSeqname!=None:
+        out.write(currSeqname+dinuclShuffle(currSeq)+'\n')
+    
+    fin.close()
+    out.close()
+
+def readrawseq(infile,outfile):
+    '''
+    each line is a sequence
+    '''
+    fin = open(infile)
+    out = open(outfile,'w')
+    for line in fin:
+        out.write(dinuclShuffle(line.strip().upper())+'\n')
+    fin.close()
+    out.close()
+    
+def main():
+    seqfile = sys.argv[1]
+    outfile = sys.argv[2]
+    fasta = sys.argv[3]
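+    # example invocation (hypothetical file names):
+    #   python seqshuffle.py input.fa output.fa fasta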
+
+    if fasta == 'fasta':
+        readFastaFile(seqfile,outfile)
+    else:
+        readrawseq(seqfile,outfile)
+
+main()
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/sequence.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/sequence.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,720 @@
(720-line Python module; the raw listing is garbled and truncated at this
point. The recoverable content defines an Alphabet class with optional
complement mapping, pre-defined alphabets (DNA, RNA, Extended DNA, Protein,
Extended Protein, TM Labels), a getAlphabet(name) lookup, a PWM class that can
write an EPS sequence logo from a template, and a __main__ block that builds a
PWM from strings read from a file and pretty-prints it.)
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/shuffleBed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/shuffleBed.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,107 @@
+'''
+simulate a random interval set that mimics the sizes of the intervals in a
+reference set
+'''
+
+import sys,random
+def inferSizeFromRefBed(filename,header):
+    '''
+    read reference interval set, get chrom size information
+    '''
+    chrSize = {}
+    f = open(filename)
+    if header:
+        header = f.readline()
+    for line in f:
+        flds = line.strip().split('\t')
+        if not chrSize.has_key(flds[0]):
+            chrSize[flds[0]] = int(flds[2])
+        elif chrSize[flds[0]] < int(flds[2]):
+            chrSize[flds[0]] = int(flds[2])
+    f.close()
+    return chrSize 
+
+def getChrSize(filename):
+    chrSize = {}
+    f = open(filename)
+    for line in f:
+        flds = line.strip().split('\t')
+        if len(flds) >1:
+            chrSize[flds[0]] = int(flds[1])
+    f.close()
+    return chrSize
+    
+def makeWeightedChrom(chrSize):
+    '''
+    make a list of chromosome ids in which each id appears with frequency
+    proportional to that chromosome's length
+    '''
+
+    genome_len = 0
+    
+    for chrom in chrSize:
+        chrom_len = chrSize[chrom]
+        genome_len += chrom_len
+
+    weighted_chrom = []
+    for chrom in chrSize:
+        weight = int(round(1000*float(chrSize[chrom])/genome_len))
+        weighted_chrom += [chrom]*weight
+
+    return weighted_chrom            
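+
+# worked example (hypothetical sizes): chrSize = {'chr1': 3000, 'chr2': 1000}
+# yields weighted_chrom = ['chr1']*750 + ['chr2']*250, so a uniform draw picks
+# chr1 three times as often as chr2.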
+
+def randomIntervalWithinChrom(infile,outfile,chrSize,header):
+    '''
+    place each interval at a random start position on its own chromosome
+    '''
+    fin = open(infile)
+    if header:
+        header = fin.readline()
+    fout = open(outfile,'w')
+    n = 0
+    for line in fin:
+        n = n + 1
+        flds = line.strip().split('\t')
+        interval_size = int(flds[2]) - int(flds[1])
+        rstart = random.randint(0,chrSize[flds[0]]-interval_size)
+        fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
+    fin.close()
+    fout.close()   
+
+def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header):
+    '''
+    place each interval at a random start position on a random chromosome,
+    with chromosomes sampled proportionally to their length
+    '''
+    fin = open(infile)
+    if header:
+        header = fin.readline()
+    fout = open(outfile,'w')
+    n = 0
+    for line in fin:
+        n = n + 1
+        flds = line.strip().split('\t')
+        interval_size = int(flds[2]) - int(flds[1])
+        # find a random chrom
+        flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
+        # random start in the chrom
+        rstart = random.randint(0,chrSize[flds[0]]-interval_size)
+        fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
+    fin.close()
+    fout.close()            
+
+def main():
+    # python random_interval.py test100.bed testout.bed across header human.hg18.genome 
+
+    reference_interval_file = sys.argv[1]
+    output_file = sys.argv[2]
+    across_or_within_chrom = sys.argv[3] # within or across 
+    if sys.argv[4] == 'header':
+        header = True 
+    else:
+        header = False
+    if len(sys.argv) == 6:
+        chrom_size_file = sys.argv[5]
+        chrSize = getChrSize(chrom_size_file)
+    else:
+        chrSize = inferSizeFromRefBed(reference_interval_file,header) 
+    if across_or_within_chrom == 'within':            
+        randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header)
+    else:
+        randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header)   
+main() 
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/shuffleBed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/shuffleBed.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,43 @@
+<tool id="shufflebed" name="shuffleBed">
+  <description>chromosome not weighted by length</description>
+  <command>shuffleBed -i $input -g $genome $chrom
+    #if $limit.limit_select=="include":
+    -incl $limit.limitfile
+    #else if $limit.limit_select=="exclude":
+    -excl $limit.limitfile
+    #end if
+    > $outfile</command>
+  <inputs>
+    <param name="input" format="bed,gff,vcf" type="data" label="Original intervals (BED/GFF/VCF)" />
+    <param name="genome" type="select" label="Select genome">
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option>
+    </param>
+    <param name="chrom" label="keep intervals on the same chromosome?" type="boolean" truevalue="-chrom" falsevalue="" checked="False"/>
+    <conditional name="limit">
+     <param name="limit_select" type="select" label="restrictions for the shuffling" help="Instead of randomly placing features anywhere in the genome, one can specify regions in which features should or should not be placed (e.g. genes.bed or repeats.bed).">
+ <option value="none" selected="true">None</option>
+ <option value="include">within specified regions</option>
+ <option value="exclude">outside specified regions</option>
+     </param>
+     <when value="include">
+     <param name="limitfile" type="data" format="interval" label="specify regions"/>
+     </when>
+     <when value="exclude">
+     <param name="limitfile" type="data" format="interval" label="specify regions"/>
+     </when>
+    </conditional>         
+  </inputs>
+  <outputs>
+    <data format="input" name="outfile" />
+  </outputs>
+  <help>
+  
+.. class:: infomark
+
+Each chromosome is chosen with equal probability, regardless of its length. Please use the 'random intervals' tool instead for general randomization.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/shuffleSequenceUsingAltschulErikson.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/shuffleSequenceUsingAltschulErikson.txt Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,61 @@
+#! /usr/bin/env python
+
+#shuffleSequenceUsingAltschulErikson.py
+#P. Clote, Oct 2003
+
+#------------------------------------------------------------------
+#Read an RNA sequence from a FASTA file and compute NUM shufflings of it
+#using the Altschul-Erikson dinucleotide shuffling method.
+#------------------------------------------------------------------
+
+PRINT   = 0
+LINELEN = 70
+
+import sys,string
+from altschulEriksonDinuclShuffle import dinuclShuffle
+
+
+def file2string(fileName):
+  """Concatenate the non-header lines of a FASTA file into one string."""
+  file = open(fileName,"r")
+  L = []
+  for line in file:
+    if line.startswith(">"):  # lines beginning with '>' are headers/comments; skip them
+      continue
+    L.append(line.strip())
+  file.close()
+  text = string.join(L,"")
+  return text
+
+
+def main(fileName,NUM):
+  seq = file2string(fileName)
+  for i in range(NUM):
+    shuffledSeq = dinuclShuffle(seq) 
+    sys.stdout.write(">%d\n" % (i+1))
+    sys.stdout.write("%s\n" % shuffledSeq)
+
+
+
+  
+if __name__ == '__main__':  
+  if len(sys.argv) < 3 :
+     print "Usage: %s RNAs.faa NUM" %  sys.argv[0]
+     text = """
+            1) RNA.faa is FASTA file of ONE RNA sequence
+            2) NUM is number of random sequences to generate by
+               shuffling the dinucleotides of RNAs input
+     Script to compute Altschul-Erikson dinucleotide shuffles.
+            """
+     print text
+     sys.exit(1)
+  fileName = sys.argv[1]
+  NUM      = int(sys.argv[2])
+  main(fileName,NUM)
+  
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/spatial_proximity.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/spatial_proximity.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,36 @@
+
+import os,sys
+
+file1 = sys.argv[1]
+file2 = sys.argv[2]
+genome = sys.argv[3]
+outplot = sys.argv[4]
+outlog = sys.argv[5]
+outbed = sys.argv[6]
+
+strandness = ''
+if len(sys.argv) > 7:   
+    strandness = sys.argv[7]
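+
+# example invocation (hypothetical file names):
+#   python spatial_proximity.py peaks.bed genes.bed mouse.mm9.genome out.pdf out.log out.bed -s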
+
+# real distance
+cmd = 'closestBed -a '+file1+' -b '+file2 + ' '+strandness + ' -d -t first > '+outbed
+os.system(cmd)
+# shuffle
+cmd = 'shuffleBed -chrom -g '+genome+' -i '+file1+'> shuffled.bed'
+os.system(cmd)
+# shuffled distance
+cmd = 'closestBed -a shuffled.bed -b '+file2 + ' '+strandness + ' -d -t first > shuffled.dist'
+os.system(cmd)
+
+
+# test in R
+r = open('tmp.r','w')
+r.write("options(warn=-1)\n")
+r.write("source('/Users/xuebing/galaxy-dist/tools/mytools/cdf.r')\n")
+r.write("x = read.table('"+outbed+"',sep='\t')\n")
+r.write("y = read.table('shuffled.dist',sep='\t')\n")
+r.write("pdf('"+outplot+"')\n")
+r.write("mycdf(list(log10(1+x[,ncol(x)]),log10(1+y[,ncol(y)])),'spatial distance',c('real','shuffled'),'topleft','log10 distance','')\n")
+r.write("dev.off()\n")
+r.close()
+os.system("R --vanilla < tmp.r >"+outlog)
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/spatial_proximity.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/spatial_proximity.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,32 @@
+<tool id="spatialproximity" name="spatial proximity">
+  <description>of two interval sets</description>
+  <command interpreter="python">spatial_proximity.py $inputa $inputb $genome  $outplot $outlog $outbed $strandness
+  </command>
+  <inputs>
+    <param name="inputa" format="interval" type="data" label="Interval set 1" />
+    <param name="inputb" format="interval" type="data" label="Interval set 2" />
+    <param name="genome" type="select" label="Select genome">
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option>
+     <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option>
+    </param>
+    <param name="strandness" type="select" label="Strand requirement" >
+        <option value="" selected="true"> none </option>
+        <option value="-s" > -s: closest feature on the same strand</option>
+        <option value="-S" > -S: closest feature on the opposite strand </option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="outbed" label="${tool.name} on ${on_string}: (bed)" />
+    <data format="pdf" name="outplot" label="${tool.name} on ${on_string}: (plot)" />
+      <data format="txt" name="outlog" label="${tool.name} on ${on_string}: (log)" />
+  </outputs>
+  <help>
+  
+.. class:: infomark
+
+For each feature in the first interval set, find the closest feature in the second set, then compare the distance distribution with that of a shuffled version of set 1.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesite.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesite.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+<tool id="splicesite" name="splice site score">
+  <description>using max entropy model</description>
+  <command interpreter="perl">$script $input > $out_file1 </command>
+  <inputs>
+    <param name="input" format="txt" type="data" label="Sequence file" help="fasta or raw sequence (one sequence per line)"/>
+    <param name="script" type="select" label="Select model">
+        <option value="splicesitescore/score5.pl" selected="true">5' splice site</option>
+        <option value="splicesitescore/score3.pl">3' splice site</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool computes splice site scores using a max entropy model. See more details here: 
+
+http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html
+
+-----
+
+**Example input for 5' splice site sequence**
+
+3 exonic and 6 intronic nucleotides flanking the junction::
+
+    CTGGTGAGT
+    AAGGTACAG
+
+or fasta format::
+
+    >seq1
+    CTGGTGAGT
+    >seq2
+    AAGGTACAG
+
+Output::
+
+    CTGGTGAGT 10.10
+    AAGGTACAG 8.04
+
+or fasta format::
+
+    >seq1   CTGGTGAGT 10.10
+    >seq2   AAGGTACAG 8.04
+
+
+-----
+
+**Example input for 3' splice site sequence**
+
+3 exonic and 20 intronic nucleotides flanking the junction::
+
+    CCTGCATCCTCTGTTCCCAGGTG
+    TTTCTTCCCTCCGGGAACAGTGG
+
+Output::
+
+    CCTGCATCCTCTGTTCCCAGGTG 10.47
+    TTTCTTCCCTCCGGGAACAGTGG 6.22
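+
+**Note on the reported score**
+
+As implemented by score5.pl / score3.pl in this commit, the value is a
+log-odds score::
+
+    score = log2( [ P(consensus bases | splice site) / P(consensus bases | background) ]
+                  * MaxEnt(remaining bases) )
+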
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/._me2x5
Binary file tools/mytools/splicesitescore/._me2x5 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/._score3.pl
Binary file tools/mytools/splicesitescore/._score3.pl has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/._score5.pl
Binary file tools/mytools/splicesitescore/._score5.pl has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/._splicemodels
Binary file tools/mytools/splicesitescore/._splicemodels has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/._test3
Binary file tools/mytools/splicesitescore/._test3 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/._test3.fa
Binary file tools/mytools/splicesitescore/._test3.fa has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/._test5
Binary file tools/mytools/splicesitescore/._test5 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/._test5.fa
Binary file tools/mytools/splicesitescore/._test5.fa has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/me2x5
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/me2x5 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
(data file: 16384 lines of floating-point maximum-entropy score values;
numeric table omitted from this listing)
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/score3.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/score3.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,166 @@
+use strict;
+
+
+my $inputfile = $ARGV[0];
+my $usemaxent = 1;
+
+
+my @metables = &makemaxentscores;
+
+open (FILE,"<$inputfile") || die "can't open!\n";
+
+while(<FILE>) {
+    chomp;
+    if (/^\s*$/) { #discard blank lines;
+ next;
+    } 
+    elsif (/^>/) { #print FASTA header followed by a tab; the score is appended on the same line;
+        print $_."\t";
+ next;
+    }
+    elsif (/[NQWERYUIOPLKJHFDSZXVBM]/) { #skip sequences containing non-ACGT characters;
+ next;
+    }
+    else {
+        $_ =~ s/\cM//g; #gets rid of carriage return
+ my $str = $_;
+ print $str."\t";
+ $str = uc($str);
+ if ($usemaxent) { 
+     print sprintf("%.2f",&log2(&scoreconsensus($str)*&maxentscore(&getrest($str),\@metables)))."\n";
+ }
+    }
+}
+
+sub hashseq{
+    #returns hash of sequence in base 4
+    # &hashseq('CAGAAGT') returns 4619
+    my $seq = shift;
+    $seq = uc($seq);
+    $seq =~ tr/ACGT/0123/;
+    my @seqa = split(//,$seq);
+    my $sum = 0;
+    my $len = length($seq);
+    my @four = (1,4,16,64,256,1024,4096,16384);
+    my $i=0;
+    while ($i<$len) {
+        $sum+= $seqa[$i] * $four[$len - $i -1] ;
+ $i++;
+    }
+    return $sum;
+}
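+# worked example: &hashseq('CAGAAGT') maps C,A,G,A,A,G,T to 1,0,2,0,0,2,3 and
+# computes 1*4096 + 0*1024 + 2*256 + 0*64 + 0*16 + 2*4 + 3*1 = 4619.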
+
+sub makemaxentscores{
+    my $dir = "/Users/xuebing/galaxy-dist/tools/mytools/splicesitescore/splicemodels/";
+    my @list = ('me2x3acc1','me2x3acc2','me2x3acc3','me2x3acc4',
+ 'me2x3acc5','me2x3acc6','me2x3acc7','me2x3acc8','me2x3acc9');
+    my @metables;
+    my $num = 0 ;
+    foreach my $file (@list) {
+ my $n = 0;
+ open (SCOREF,"<".$dir.$file) || die "Can't open $file!\n";
+ while(<SCOREF>) {
+     chomp;
+     $_=~ s/\s//;
+     $metables[$num]{$n} = $_;
+     $n++;
+ }
+ close(SCOREF);
+ #print STDERR $file."\t".$num."\t".$n."\n";
+ $num++;
+    }
+    return @metables;
+}
+sub makewmmscores{
+    my $dir = "/bionet/geneyeo_essentials/MaxEntropy/webserver/splicemodels/";
+    my @list = ('me1s0acc1','me1s0acc2','me1s0acc3','me1s0acc4',
+ 'me1s0acc5','me1s0acc6','me1s0acc7','me1s0acc8','me1s0acc9');
+    my @metables;
+    my $num = 0 ;
+    foreach my $file (@list) {
+ my $n = 0;
+ open (SCOREF,"<".$dir.$file) || die "Can't open $file!\n";
+ while(<SCOREF>) {
+     chomp;
+     $_=~ s/\s//;
+     $metables[$num]{$n} = $_;
+     $n++;
+ }
+ close(SCOREF);
+ #print STDERR $file."\t".$num."\t".$n."\n";
+ $num++;
+    }
+    return @metables;
+}
+sub makemmscores{
+    my $dir = "/bionet/geneyeo_essentials/MaxEntropy/webserver/splicemodels/";
+    my @list = ('me2s0acc1','me2s0acc2','me2s0acc3','me2s0acc4',
+ 'me2s0acc5','me2s0acc6','me2s0acc7','me2s0acc8','me2s0acc9');
+    my @metables;
+    my $num = 0 ;
+    foreach my $file (@list) {
+ my $n = 0;
+ open (SCOREF,"<".$dir.$file) || die "Can't open $file!\n";
+ while(<SCOREF>) {
+     chomp;
+     $_=~ s/\s//;
+     $metables[$num]{$n} = $_;
+     $n++;
+ }
+ close(SCOREF);
+ #print STDERR $file."\t".$num."\t".$n."\n";
+ $num++;
+    }
+    return @metables;
+}
+sub maxentscore{
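+    # maximum-entropy decomposition: multiply five overlapping sub-sequence
+    # table scores and divide by the four lower-order overlap tables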
+    my $seq = shift;
+    my $table_ref = shift;
+    my @metables = @$table_ref;
+    my @sc;
+    $sc[0] = $metables[0]{&hashseq(substr($seq,0,7))};
+    $sc[1] = $metables[1]{&hashseq(substr($seq,7,7))};
+    $sc[2] = $metables[2]{&hashseq(substr($seq,14,7))};
+    $sc[3] = $metables[3]{&hashseq(substr($seq,4,7))};
+    $sc[4] = $metables[4]{&hashseq(substr($seq,11,7))};
+    $sc[5] = $metables[5]{&hashseq(substr($seq,4,3))};
+    $sc[6] = $metables[6]{&hashseq(substr($seq,7,4))};
+    $sc[7] = $metables[7]{&hashseq(substr($seq,11,3))};
+    $sc[8] = $metables[8]{&hashseq(substr($seq,14,4))};
+    my $finalscore = $sc[0] * $sc[1] * $sc[2] * $sc[3] * $sc[4] / ($sc[5] * $sc[6] * $sc[7] * $sc[8]);
+    return $finalscore;
+}    
+    
+
+
+sub getrest{
+  my $seq = shift;
+  my $seq_noconsensus = substr($seq,0,18).substr($seq,20,3);
+  return $seq_noconsensus;
+}
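+# getrest drops the AG consensus dinucleotide (positions 18-19 of the 23-mer
+# acceptor sequence), which scoreconsensus scores separately, and returns the
+# remaining 21 bases.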
+
+sub scoreconsensus{
+  my $seq = shift;
+  my @seqa = split(//,uc($seq));
+  my %bgd; 
+  $bgd{'A'} = 0.27; 
+  $bgd{'C'} = 0.23; 
+  $bgd{'G'} = 0.23; 
+  $bgd{'T'} = 0.27;  
+  my %cons1;
+  $cons1{'A'} = 0.9903;
+  $cons1{'C'} = 0.0032;
+  $cons1{'G'} = 0.0034;
+  $cons1{'T'} = 0.0030;
+  my %cons2;
+  $cons2{'A'} = 0.0027; 
+  $cons2{'C'} = 0.0037; 
+  $cons2{'G'} = 0.9905; 
+  $cons2{'T'} = 0.0030;
+  my $addscore = $cons1{$seqa[18]} * $cons2{$seqa[19]}/ ($bgd{$seqa[18]} * $bgd{$seqa[19]}); 
+  return $addscore;
+}
+sub log2{
+      my ($val) = @_;
+    return log($val)/log(2);
+}
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/score5.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/score5.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,102 @@
+use strict;
+
+
+my $inputfile = $ARGV[0];
+my $usemaxent = 1;
+
+my $modelpath = "/Users/xuebing/galaxy-dist/tools/mytools/splicesitescore/";
+my %me2x5 = &makescorematrix($modelpath.'me2x5');
+my %seq = &makesequencematrix($modelpath.'splicemodels/splice5sequences');
+
+my %bgd;
+$bgd{'A'} = 0.27;
+$bgd{'C'} = 0.23;
+$bgd{'G'} = 0.23;
+$bgd{'T'} = 0.27; 
+
+
+
+open (FILE,"<$inputfile") || die "can't open!\n";
+
+while(<FILE>) {
+    chomp;
+    if (/^\s*$/) { #discard blank lines;
+ next;
+    } 
+    elsif (/^>/) { #print FASTA header followed by a tab; the score is appended on the same line;
+        print $_."\t";
+ next;
+    }
+    elsif (/[NQWERYUIOPLKJHFDSZXVBM]/) { #skip sequences containing non-ACGT characters;
+        next;
+    }
+    else {
+        $_ =~ s/\cM//g; #gets rid of carriage return
+ my $str = $_;
+ print $str."\t";
+ $str = uc($str);
+ if ($usemaxent) { 
+  print sprintf("%.2f",&log2(&scoreconsensus($str)*$me2x5{$seq{&getrest($str)}}))."\n";
+ }
+    }
+}
+
+  
+sub makesequencematrix{
+    my $file = shift;
+    my %matrix;my $n=0;
+    open(SCOREF, $file) || die "Can't open $file!\n";
+    while(<SCOREF>) { 
+ chomp;
+ $_=~ s/\s//;
+ $matrix{$_} = $n;
+ $n++;
+    }
+    close(SCOREF);
+    return %matrix;
+}
+sub makescorematrix{
+    my $file = shift;
+    my %matrix;my $n=0;
+    open(SCOREF, $file) || die "Can't open $file!\n";
+    while(<SCOREF>) { 
+ chomp;
+ $_=~ s/\s//;
+ $matrix{$n} = $_;
+ $n++;
+    }
+    close(SCOREF);
+    return %matrix;
+}
+
+sub getrest{
+  my $seq = shift;
+  my @seqa = split(//,uc($seq));
+  return $seqa[0].$seqa[1].$seqa[2].$seqa[5].$seqa[6].$seqa[7].$seqa[8];
+}
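+# worked example: for the 9-mer 'CTGGTGAGT', getrest drops the GT consensus at
+# positions 3-4 (scored separately by scoreconsensus) and returns 'CTGGAGT'.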
+sub scoreconsensus{
+  my $seq = shift;
+  my @seqa = split(//,uc($seq));
+  my %bgd; 
+  $bgd{'A'} = 0.27; 
+  $bgd{'C'} = 0.23; 
+  $bgd{'G'} = 0.23; 
+  $bgd{'T'} = 0.27;  
+  my %cons1;
+  $cons1{'A'} = 0.004;
+  $cons1{'C'} = 0.0032;
+  $cons1{'G'} = 0.9896;
+  $cons1{'T'} = 0.0032;
+  my %cons2;
+  $cons2{'A'} = 0.0034; 
+  $cons2{'C'} = 0.0039; 
+  $cons2{'G'} = 0.0042; 
+  $cons2{'T'} = 0.9884;
+  my $addscore = $cons1{$seqa[3]}*$cons2{$seqa[4]}/($bgd{$seqa[3]}*$bgd{$seqa[4]}); 
+  return $addscore;
+}
+
+sub log2{
+      my ($val) = @_;
+    return log($val)/log(2);
+}
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._hashseq.m
Binary file tools/mytools/splicesitescore/splicemodels/._hashseq.m has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._hashseq.m~
Binary file tools/mytools/splicesitescore/splicemodels/._hashseq.m~ has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me1s0acc1
Binary file tools/mytools/splicesitescore/splicemodels/._me1s0acc1 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me1s0acc2
Binary file tools/mytools/splicesitescore/splicemodels/._me1s0acc2 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me1s0acc3
Binary file tools/mytools/splicesitescore/splicemodels/._me1s0acc3 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me1s0acc4
Binary file tools/mytools/splicesitescore/splicemodels/._me1s0acc4 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me1s0acc5
Binary file tools/mytools/splicesitescore/splicemodels/._me1s0acc5 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me1s0acc6
Binary file tools/mytools/splicesitescore/splicemodels/._me1s0acc6 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me1s0acc7
Binary file tools/mytools/splicesitescore/splicemodels/._me1s0acc7 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me1s0acc8
Binary file tools/mytools/splicesitescore/splicemodels/._me1s0acc8 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me1s0acc9
Binary file tools/mytools/splicesitescore/splicemodels/._me1s0acc9 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2s0
Binary file tools/mytools/splicesitescore/splicemodels/._me2s0 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2s0acc1
Binary file tools/mytools/splicesitescore/splicemodels/._me2s0acc1 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2s0acc2
Binary file tools/mytools/splicesitescore/splicemodels/._me2s0acc2 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2s0acc3
Binary file tools/mytools/splicesitescore/splicemodels/._me2s0acc3 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2s0acc4
Binary file tools/mytools/splicesitescore/splicemodels/._me2s0acc4 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2s0acc5
Binary file tools/mytools/splicesitescore/splicemodels/._me2s0acc5 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2s0acc6
Binary file tools/mytools/splicesitescore/splicemodels/._me2s0acc6 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2s0acc7
Binary file tools/mytools/splicesitescore/splicemodels/._me2s0acc7 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2s0acc8
Binary file tools/mytools/splicesitescore/splicemodels/._me2s0acc8 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2s0acc9
Binary file tools/mytools/splicesitescore/splicemodels/._me2s0acc9 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2x3acc1
Binary file tools/mytools/splicesitescore/splicemodels/._me2x3acc1 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2x3acc2
Binary file tools/mytools/splicesitescore/splicemodels/._me2x3acc2 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2x3acc3
Binary file tools/mytools/splicesitescore/splicemodels/._me2x3acc3 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2x3acc4
Binary file tools/mytools/splicesitescore/splicemodels/._me2x3acc4 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2x3acc5
Binary file tools/mytools/splicesitescore/splicemodels/._me2x3acc5 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2x3acc6
Binary file tools/mytools/splicesitescore/splicemodels/._me2x3acc6 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2x3acc7
Binary file tools/mytools/splicesitescore/splicemodels/._me2x3acc7 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2x3acc8
Binary file tools/mytools/splicesitescore/splicemodels/._me2x3acc8 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2x3acc9
Binary file tools/mytools/splicesitescore/splicemodels/._me2x3acc9 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._me2x5
Binary file tools/mytools/splicesitescore/splicemodels/._me2x5 has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/._splice5sequences
Binary file tools/mytools/splicesitescore/splicemodels/._splice5sequences has changed
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/hashseq.m
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/hashseq.m Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,8 @@
+function [sum ] = hashseq(seq) 
+    sum = 0;
+    len = length(seq);
+    four = [1;4;16;64;256;1024;4096];
+    for i = 1:len
+        sum = sum + str2num(seq(i)) * four(len - i +1) ;
+    end
+    
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/hashseq.m~
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/hashseq.m~ Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,8 @@
+function [sum ] = hashseq(seq) 
+    sum = 0;
+    len = length(seq);
+    four = [1;4;16;64;256;1024;4096];
+    for i = 1:len
+        sum = sum + seq(i) * four(len - i - 1) ;
+    end
+    
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me1s0acc1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me1s0acc1 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
(data file: 16384 lines of floating-point maximum-entropy score values;
numeric table omitted from this listing)
b
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me1s0acc2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me1s0acc2 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
[16384 added lines of floating-point values, one per line; display truncated in the changeset view]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me1s0acc3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me1s0acc3 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
[16384 added lines of floating-point values, one per line; display truncated in the changeset view]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me1s0acc4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me1s0acc4 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
[16384 added lines of floating-point values, one per line; display truncated in the changeset view]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me1s0acc5
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me1s0acc5 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
[16384 added lines of floating-point values, one per line; display truncated in the changeset view]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me1s0acc6
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me1s0acc6 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+6.406100e-02
+2.401580e-01
+9.422911e-02
+3.099668e-01
+2.088148e-01
+7.828248e-01
+3.071515e-01
+1.010375e+00
+8.552043e-02
+3.206071e-01
+1.257944e-01
+4.138007e-01
+2.713819e-01
+1.017382e+00
+3.991832e-01
+1.313114e+00
+2.105356e-01
+7.892760e-01
+3.096827e-01
+1.018702e+00
+6.862670e-01
+2.572743e+00
+1.009449e+00
+3.320584e+00
+2.810617e-01
+1.053671e+00
+4.134215e-01
+1.359950e+00
+8.918929e-01
+3.343613e+00
+1.311910e+00
+4.315530e+00
+8.889828e-02
+3.332703e-01
+1.307629e-01
+4.301449e-01
+2.897750e-01
+1.086336e+00
+4.262380e-01
+1.402111e+00
+1.186778e-01
+4.449106e-01
+1.745665e-01
+5.742366e-01
+3.766002e-01
+1.411835e+00
+5.539516e-01
+1.822224e+00
+2.555576e-01
+9.580587e-01
+3.759068e-01
+1.236546e+00
+8.330217e-01
+3.122911e+00
+1.225315e+00
+4.030674e+00
+3.411653e-01
+1.278993e+00
+5.018296e-01
+1.650769e+00
+1.082620e+00
+4.058628e+00
+1.592455e+00
+5.238384e+00
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me1s0acc7
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me1s0acc7 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,256 @@
+9.223072e-03
+4.862164e-02
+1.352955e-02
+6.127398e-02
+4.275866e-02
+2.254126e-01
+6.272374e-02
+2.840695e-01
+1.488114e-02
+7.844954e-02
+2.182952e-02
+9.886370e-02
+6.804956e-02
+3.587397e-01
+9.982359e-02
+4.520910e-01
+3.420165e-02
+1.803022e-01
+5.017124e-02
+2.272205e-01
+1.585607e-01
+8.358908e-01
+2.325966e-01
+1.053407e+00
+5.518331e-02
+2.909121e-01
+8.094977e-02
+3.666134e-01
+2.523462e-01
+1.330304e+00
+3.701729e-01
+1.676476e+00
+1.344047e-02
+7.085465e-02
+1.971615e-02
+8.929246e-02
+6.231072e-02
+3.284860e-01
+9.140514e-02
+4.139647e-01
+2.168578e-02
+1.143218e-01
+3.181141e-02
+1.440707e-01
+9.916629e-02
+5.227790e-01
+1.454695e-01
+6.588167e-01
+5.314928e-02
+2.801893e-01
+7.796600e-02
+3.531002e-01
+2.464030e-01
+1.298973e+00
+3.614546e-01
+1.636992e+00
+8.575475e-02
+4.520768e-01
+1.257958e-01
+5.697164e-01
+3.921455e-01
+2.067289e+00
+5.752479e-01
+2.605240e+00
+3.575897e-02
+1.885120e-01
+5.245572e-02
+2.375667e-01
+1.657805e-01
+8.739520e-01
+2.431876e-01
+1.101372e+00
+5.769601e-02
+3.041584e-01
+8.463571e-02
+3.833066e-01
+2.638365e-01
+1.390877e+00
+3.870282e-01
+1.752812e+00
+1.326040e-01
+6.990537e-01
+1.945200e-01
+8.809616e-01
+6.147591e-01
+3.240851e+00
+9.018054e-01
+4.084186e+00
+2.139524e-01
+1.127902e+00
+3.138521e-01
+1.421405e+00
+9.783770e-01
+5.157750e+00
+1.435205e+00
+6.499901e+00
+5.211032e-02
+2.747121e-01
+7.644192e-02
+3.461978e-01
+2.415863e-01
+1.273580e+00
+3.543889e-01
+1.604992e+00
+8.407842e-02
+4.432396e-01
+1.233367e-01
+5.585796e-01
+3.844798e-01
+2.026878e+00
+5.640030e-01
+2.554313e+00
+2.060663e-01
+1.086328e+00
+3.022837e-01
+1.369012e+00
+9.553343e-01
+5.036275e+00
+1.401404e+00
+6.346816e+00
+3.324816e-01
+1.752757e+00
+4.877255e-01
+2.208860e+00
+1.520396e+00
+8.015133e+00
+2.230306e+00
+1.010083e+01
+1.224294e-02
+6.454163e-02
+1.795948e-02
+8.133666e-02
+5.675895e-02
+2.992185e-01
+8.326111e-02
+3.770812e-01
+1.975362e-02
+1.041359e-01
+2.897707e-02
+1.312342e-01
+9.033075e-02
+4.762003e-01
+1.325084e-01
+6.001173e-01
+4.540015e-02
+2.393378e-01
+6.659860e-02
+3.016184e-01
+2.104775e-01
+1.109583e+00
+3.087547e-01
+1.398319e+00
+7.325176e-02
+3.861643e-01
+1.074548e-01
+4.866520e-01
+3.349709e-01
+1.765880e+00
+4.913771e-01
+2.225397e+00
+1.784122e-02
+9.405429e-02
+2.617173e-02
+1.185291e-01
+8.271286e-02
+4.360408e-01
+1.213335e-01
+5.495074e-01
+2.878627e-02
+1.517537e-01
+4.222729e-02
+1.912431e-01
+1.316359e-01
+6.939503e-01
+1.931000e-01
+8.745303e-01
+7.055173e-02
+3.719305e-01
+1.034941e-01
+4.687143e-01
+3.270817e-01
+1.724290e+00
+4.798042e-01
+2.172985e+00
+1.138331e-01
+6.000985e-01
+1.669845e-01
+7.562562e-01
+5.205440e-01
+2.744173e+00
+7.635989e-01
+3.458263e+00
+4.748313e-02
+2.503187e-01
+6.965417e-02
+3.154567e-01
+2.201343e-01
+1.160491e+00
+3.229205e-01
+1.462475e+00
+7.661258e-02
+4.038817e-01
+1.123849e-01
+5.089798e-01
+3.503395e-01
+1.846899e+00
+5.139216e-01
+2.327499e+00
+1.760803e-01
+9.282496e-01
+2.582965e-01
+1.169799e+00
+8.163177e-01
+4.303416e+00
+1.197477e+00
+5.423251e+00
+2.841002e-01
+1.497703e+00
+4.167536e-01
+1.887435e+00
+1.299154e+00
+6.848801e+00
+1.905761e+00
+8.630998e+00
+6.919553e-02
+3.647809e-01
+1.015046e-01
+4.597043e-01
+3.207942e-01
+1.691144e+00
+4.705810e-01
+2.131214e+00
+1.116449e-01
+5.885629e-01
+1.637746e-01
+7.417188e-01
+5.105377e-01
+2.691422e+00
+7.489204e-01
+3.391785e+00
+2.736284e-01
+1.442498e+00
+4.013922e-01
+1.817865e+00
+1.268556e+00
+6.687499e+00
+1.860876e+00
+8.427721e+00
+4.414911e-01
+2.327426e+00
+6.476342e-01
+2.933070e+00
+2.018882e+00
+1.064302e+01
+2.961548e+00
+1.341255e+01
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me1s0acc8
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me1s0acc8 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+4.076347e-02
+2.064465e-01
+3.673459e-02
+2.000035e-01
+1.987658e-01
+1.006649e+00
+1.791207e-01
+9.752323e-01
+5.182724e-02
+2.624790e-01
+4.670487e-02
+2.542872e-01
+2.083726e-01
+1.055303e+00
+1.877780e-01
+1.022367e+00
+1.903918e-01
+9.642389e-01
+1.715743e-01
+9.341455e-01
+9.283650e-01
+4.701703e+00
+8.366096e-01
+4.554966e+00
+2.420667e-01
+1.225947e+00
+2.181419e-01
+1.187686e+00
+9.732346e-01
+4.928946e+00
+8.770445e-01
+4.775116e+00
+5.412731e-02
+2.741277e-01
+4.877761e-02
+2.655723e-01
+2.639290e-01
+1.336668e+00
+2.378434e-01
+1.294951e+00
+6.881821e-02
+3.485298e-01
+6.201653e-02
+3.376524e-01
+2.766851e-01
+1.401272e+00
+2.493388e-01
+1.357539e+00
+2.308990e-01
+1.169388e+00
+2.080780e-01
+1.132892e+00
+1.125881e+00
+5.702025e+00
+1.014604e+00
+5.524068e+00
+2.935682e-01
+1.486776e+00
+2.645532e-01
+1.440374e+00
+1.180297e+00
+5.977615e+00
+1.063642e+00
+5.791057e+00
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me1s0acc9
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me1s0acc9 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,256 @@
+1.531018e-02
+1.578276e-01
+3.286913e-04
+1.293530e-01
+2.554665e-02
+2.633520e-01
+5.484560e-04
+2.158392e-01
+1.542940e-02
+1.590566e-01
+3.312508e-04
+1.303603e-01
+1.997138e-02
+2.058783e-01
+4.287616e-04
+1.687347e-01
+6.142898e-02
+6.332511e-01
+1.318807e-03
+5.190027e-01
+1.025007e-01
+1.056646e+00
+2.200568e-03
+8.660106e-01
+6.190732e-02
+6.381821e-01
+1.329076e-03
+5.230441e-01
+8.013108e-02
+8.260448e-01
+1.720318e-03
+6.770134e-01
+1.265326e-02
+1.304383e-01
+2.716504e-04
+1.069052e-01
+2.111330e-02
+2.176500e-01
+4.532773e-04
+1.783826e-01
+1.275179e-02
+1.314540e-01
+2.737657e-04
+1.077376e-01
+1.650556e-02
+1.701503e-01
+3.543546e-04
+1.394526e-01
+9.517449e-02
+9.811223e-01
+2.043282e-03
+8.041125e-01
+1.588087e-01
+1.637106e+00
+3.409432e-03
+1.341746e+00
+9.591560e-02
+9.887621e-01
+2.059193e-03
+8.103740e-01
+1.241504e-01
+1.279826e+00
+2.665361e-03
+1.048925e+00
+9.537755e-02
+9.832156e-01
+2.047642e-03
+8.058281e-01
+1.591475e-01
+1.640599e+00
+3.416706e-03
+1.344609e+00
+9.612024e-02
+9.908717e-01
+2.063587e-03
+8.121030e-01
+1.244153e-01
+1.282556e+00
+2.671048e-03
+1.051163e+00
+3.826830e-01
+3.944952e+00
+8.215745e-03
+3.233221e+00
+6.385468e-01
+6.582567e+00
+1.370884e-02
+5.394969e+00
+3.856628e-01
+3.975671e+00
+8.279719e-03
+3.258397e+00
+4.991911e-01
+5.145995e+00
+1.071703e-02
+4.217577e+00
+7.882579e-02
+8.125890e-01
+1.692295e-03
+6.659852e-01
+1.315291e-01
+1.355890e+00
+2.823773e-03
+1.111266e+00
+7.943959e-02
+8.189165e-01
+1.705473e-03
+6.711711e-01
+1.028244e-01
+1.059982e+00
+2.207516e-03
+8.687449e-01
+5.929067e-01
+6.112079e+00
+1.272900e-02
+5.009364e+00
+9.893272e-01
+1.019865e+01
+2.123967e-02
+8.358651e+00
+5.975235e-01
+6.159672e+00
+1.282812e-02
+5.048371e+00
+7.734176e-01
+7.972905e+00
+1.660435e-02
+6.534469e+00
+1.437833e-02
+1.482215e-01
+3.086856e-04
+1.214800e-01
+2.399176e-02
+2.473231e-01
+5.150743e-04
+2.027021e-01
+1.449029e-02
+1.493756e-01
+3.110892e-04
+1.224259e-01
+1.875582e-02
+1.933476e-01
+4.026651e-04
+1.584647e-01
+5.769012e-02
+5.947083e-01
+1.238538e-03
+4.874136e-01
+9.626203e-02
+9.923334e-01
+2.066631e-03
+8.133010e-01
+5.813934e-02
+5.993392e-01
+1.248182e-03
+4.912091e-01
+7.525392e-02
+7.757677e-01
+1.615611e-03
+6.358071e-01
+1.188312e-02
+1.224992e-01
+2.551165e-04
+1.003984e-01
+1.982824e-02
+2.044028e-01
+4.256887e-04
+1.675253e-01
+1.197566e-02
+1.234531e-01
+2.571030e-04
+1.011802e-01
+1.550095e-02
+1.597942e-01
+3.327868e-04
+1.309648e-01
+8.938171e-02
+9.214064e-01
+1.918918e-03
+7.551703e-01
+1.491428e-01
+1.537464e+00
+3.201917e-03
+1.260081e+00
+9.007771e-02
+9.285813e-01
+1.933861e-03
+7.610507e-01
+1.165940e-01
+1.201929e+00
+2.503134e-03
+9.850825e-01
+1.024060e-01
+1.055670e+00
+2.198535e-03
+8.652105e-01
+1.708752e-01
+1.761496e+00
+3.668487e-03
+1.443695e+00
+1.032034e-01
+1.063890e+00
+2.215654e-03
+8.719477e-01
+1.335836e-01
+1.377069e+00
+2.867880e-03
+1.128624e+00
+4.108833e-01
+4.235660e+00
+8.821172e-03
+3.471480e+00
+6.856020e-01
+7.067644e+00
+1.471905e-02
+5.792531e+00
+4.140828e-01
+4.268642e+00
+8.889861e-03
+3.498512e+00
+5.359770e-01
+5.525209e+00
+1.150678e-02
+4.528375e+00
+8.463455e-02
+8.724695e-01
+1.817002e-03
+7.150624e-01
+1.412217e-01
+1.455807e+00
+3.031860e-03
+1.193157e+00
+8.529359e-02
+8.792633e-01
+1.831151e-03
+7.206305e-01
+1.104016e-01
+1.138094e+00
+2.370190e-03
+9.327637e-01
+6.365986e-01
+6.562484e+00
+1.366701e-02
+5.378510e+00
+1.062232e+00
+1.095020e+01
+2.280485e-02
+8.974609e+00
+6.415557e-01
+6.613585e+00
+1.377343e-02
+5.420391e+00
+8.304115e-01
+8.560437e+00
+1.782794e-02
+7.016001e+00
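Note: the splicemodels tables added in this changeset all have 4^k rows (64 = 4^3, 256 = 4^4, 16384 = 4^7), which is consistent with one score per DNA k-mer under a base-4 index. A minimal Python sketch of that assumed lookup follows; the helper names and the A=0, C=1, G=2, T=3 encoding are illustrative assumptions, not documented in the changeset.

    def load_table(path):
        # One floating-point score per line, as in the diffs above.
        with open(path) as fh:
            return [float(line) for line in fh if line.strip()]

    BASE4 = {"A": 0, "C": 1, "G": 2, "T": 3}  # assumed base encoding

    def kmer_index(kmer):
        # Base-4 encode a DNA k-mer: "ACG" -> 0*16 + 1*4 + 2 = 6.
        idx = 0
        for base in kmer.upper():
            idx = idx * 4 + BASE4[base]
        return idx

    # Hypothetical usage: a 64-row table such as me1s0acc6 would cover all 3-mers.
    # table = load_table("tools/mytools/splicesitescore/splicemodels/me1s0acc6")
    # score = table[kmer_index("ACG")]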
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2s0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2s0 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
[16384 added lines of floating-point values, one per line; display truncated in the changeset view]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2s0acc1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2s0acc1 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
[16384 added lines of floating-point values, one per line; display truncated in the changeset view]
96720e+00\n+6.312003e+00\n+2.716787e-01\n+6.137372e-01\n+8.903005e-02\n+9.264629e-01\n+6.000530e-01\n+2.527865e+00\n+1.305358e+00\n+3.089731e+00\n+2.887166e-01\n+1.001966e+00\n+3.499406e-01\n+1.530210e+00\n+8.586369e-01\n+3.021970e+00\n+1.726999e+00\n+3.306566e+00\n+1.292269e+00\n+2.919307e+00\n+4.234810e-01\n+4.406821e+00\n+1.570467e+00\n+6.615962e+00\n+3.416401e+00\n+8.086485e+00\n+1.196015e+00\n+4.150669e+00\n+1.449637e+00\n+6.338933e+00\n+2.251723e+00\n+7.924931e+00\n+4.528949e+00\n+8.671267e+00\n+6.358805e-01\n+1.436489e+00\n+2.083802e-01\n+2.168443e+00\n+8.296671e-01\n+3.495168e+00\n+1.804862e+00\n+4.272036e+00\n+1.697632e-01\n+5.891489e-01\n+2.057625e-01\n+8.997525e-01\n+1.328867e+00\n+4.676944e+00\n+2.672786e+00\n+5.117398e+00\n+1.073865e+00\n+2.425920e+00\n+3.519092e-01\n+3.662031e+00\n+1.517714e+00\n+6.393730e+00\n+3.301644e+00\n+7.814857e+00\n+9.738005e-01\n+3.379492e+00\n+1.180300e+00\n+5.161185e+00\n+2.524813e+00\n+8.886071e+00\n+5.078223e+00\n+9.722923e+00\n+5.732817e-01\n+1.295075e+00\n+1.878664e-01\n+1.954972e+00\n+1.266199e+00\n+5.334164e+00\n+2.754496e+00\n+6.519782e+00\n+6.092340e-01\n+2.114295e+00\n+7.384255e-01\n+3.228967e+00\n+1.811849e+00\n+6.376796e+00\n+3.644219e+00\n+6.977335e+00\n+1.613968e+00\n+3.646041e+00\n+5.289027e-01\n+5.503858e+00\n+1.961420e+00\n+8.262945e+00\n+4.266883e+00\n+1.009954e+01\n+1.493752e+00\n+5.183940e+00\n+1.810511e+00\n+7.916952e+00\n+2.812268e+00\n+9.897769e+00\n+5.656390e+00\n+1.082990e+01\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2s0acc2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2s0acc2 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
[16384 lines of floating-point values; hunk content truncated in the source dump]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2s0acc3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2s0acc3 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
[16384 lines of floating-point values; hunk content truncated in the source dump]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2s0acc4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2s0acc4 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
[16384 lines of floating-point values; hunk content truncated in the source dump]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2s0acc5
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2s0acc5 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
[16384 lines of floating-point values; hunk content truncated in the source dump]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2s0acc6
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2s0acc6 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+1.589192e-01
+3.590071e-01
+5.207835e-02
+5.419365e-01
+2.073508e-01
+8.735151e-01
+4.510724e-01
+1.067669e+00
+4.242720e-02
+1.472399e-01
+5.142409e-02
+2.248658e-01
+3.321108e-01
+1.168863e+00
+6.679833e-01
+1.278941e+00
+3.490647e-01
+7.885560e-01
+1.143896e-01
+1.190359e+00
+4.933393e-01
+2.078310e+00
+1.073214e+00
+2.540251e+00
+3.165376e-01
+1.098516e+00
+3.836608e-01
+1.677661e+00
+8.207010e-01
+2.888455e+00
+1.650697e+00
+3.160476e+00
+1.281749e-01
+2.895540e-01
+4.200333e-02
+4.370941e-01
+2.830976e-01
+1.192617e+00
+6.158524e-01
+1.457696e+00
+1.362129e-01
+4.727150e-01
+1.650975e-01
+7.219335e-01
+4.050944e-01
+1.425728e+00
+8.147770e-01
+1.559997e+00
+5.143570e-01
+1.161960e+00
+1.685565e-01
+1.754028e+00
+6.250866e-01
+2.633327e+00
+1.359817e+00
+3.218630e+00
+4.760453e-01
+1.652074e+00
+5.769930e-01
+2.523059e+00
+8.962432e-01
+3.154325e+00
+1.802638e+00
+3.451386e+00
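Note: the splicemodels files added in this changeset are flat score tables, one floating-point value per line, and the line counts are powers of four (64 = 4^3, 256 = 4^4, 16384 = 4^7), consistent with MaxEntScan-style maximum-entropy splice-site models indexed by k-mer. A minimal sketch of how such a table could be loaded and indexed, assuming one value per line and a most-significant-first A/C/G/T ordering (that ordering convention is an assumption, not something this changeset confirms):

    # Sketch only: assumes each splicemodels file stores 4**k scores,
    # one per line, ordered as a base-4 number with A,C,G,T -> 0..3.
    # The ordering convention is an assumption, not confirmed here.

    def load_table(path):
        """Read one floating-point score per non-empty line."""
        with open(path) as fh:
            return [float(line) for line in fh if line.strip()]

    def kmer_index(kmer, bases="ACGT"):
        """Map a k-mer to its row: interpret it as a base-4 integer."""
        idx = 0
        for b in kmer.upper():
            idx = idx * 4 + bases.index(b)
        return idx

    table = load_table("tools/mytools/splicesitescore/splicemodels/me2s0acc6")
    assert len(table) == 64          # 4**3 entries, matching the 64-line hunk above
    print(table[kmer_index("ACG")])  # score for one 3-mer

Under the same convention the 256-line files would hold 4-mer scores and the 16384-line files 7-mer scores.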
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2s0acc7
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2s0acc7 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,256 @@
+6.828528e-02
+1.705648e-01
+1.917348e-03
+3.224259e-01
+8.211593e-02
+4.151202e-01
+1.492299e-01
+4.290217e-01
+1.428270e-03
+6.253822e-03
+1.984129e-03
+1.522456e-02
+1.922764e-01
+9.270963e-01
+2.955480e-01
+9.114981e-01
+8.154603e-02
+2.036879e-01
+2.289689e-03
+3.850398e-01
+1.768345e-01
+8.939504e-01
+3.213625e-01
+9.238868e-01
+6.974072e-02
+3.053666e-01
+9.688263e-02
+7.433969e-01
+2.974552e-01
+1.434235e+00
+4.572182e-01
+1.410104e+00
+3.163415e-03
+7.901665e-03
+8.882392e-05
+1.493685e-02
+5.913344e-03
+2.989369e-02
+1.074636e-02
+3.089477e-02
+2.129370e-03
+9.323654e-03
+2.958084e-03
+2.269788e-02
+1.671677e-02
+8.060297e-02
+2.569533e-02
+7.924684e-02
+1.267251e-01
+3.165375e-01
+3.558250e-03
+5.983641e-01
+3.218079e-01
+1.626834e+00
+5.848238e-01
+1.681313e+00
+1.340971e-01
+5.871576e-01
+1.862855e-01
+1.429400e+00
+4.834615e-01
+2.331099e+00
+7.431284e-01
+2.291878e+00
+1.333481e-01
+3.330806e-01
+3.744214e-03
+6.296362e-01
+1.603567e-01
+8.106505e-01
+2.914174e-01
+8.377974e-01
+2.789139e-03
+1.221252e-02
+3.874625e-03
+2.973066e-02
+3.754792e-01
+1.810442e+00
+5.771488e-01
+1.779982e+00
+2.362435e-01
+5.900954e-01
+6.633360e-03
+1.115482e+00
+5.122995e-01
+2.589825e+00
+9.310056e-01
+2.676553e+00
+2.020428e-01
+8.846644e-01
+2.806745e-01
+2.153663e+00
+8.617446e-01
+4.155061e+00
+1.324587e+00
+4.085153e+00
+1.013966e-01
+2.532711e-01
+2.847062e-03
+4.787689e-01
+1.895397e-01
+9.581788e-01
+3.444518e-01
+9.902661e-01
+6.825241e-02
+2.988499e-01
+9.481510e-02
+7.275324e-01
+5.358204e-01
+2.583557e+00
+8.236092e-01
+2.540089e+00
+3.354405e-01
+8.378723e-01
+9.418661e-03
+1.583865e+00
+8.518231e-01
+4.306217e+00
+1.548024e+00
+4.450422e+00
+3.549541e-01
+1.554202e+00
+4.930963e-01
+3.783612e+00
+1.279719e+00
+6.170401e+00
+1.967055e+00
+6.066585e+00
+4.308947e-02
+1.076301e-01
+1.209887e-03
+2.034576e-01
+5.181691e-02
+2.619497e-01
+9.416723e-02
+2.707218e-01
+9.012691e-04
+3.946295e-03
+1.252028e-03
+9.607023e-03
+1.213305e-01
+5.850176e-01
+1.864971e-01
+5.751748e-01
+7.614078e-02
+1.901865e-01
+2.137918e-03
+3.595175e-01
+1.651131e-01
+8.346952e-01
+3.000611e-01
+8.626472e-01
+6.511797e-02
+2.851255e-01
+9.046079e-02
+6.941211e-01
+2.777384e-01
+1.339167e+00
+4.269116e-01
+1.316636e+00
+3.577687e-02
+8.936443e-02
+1.004560e-03
+1.689293e-01
+6.687737e-02
+3.380848e-01
+1.215367e-01
+3.494065e-01
+2.408225e-02
+1.054465e-01
+3.345466e-02
+2.567033e-01
+1.890594e-01
+9.115849e-01
+2.906032e-01
+8.962477e-01
+1.777781e-01
+4.440590e-01
+4.991741e-03
+8.394233e-01
+4.514527e-01
+2.282226e+00
+8.204282e-01
+2.358653e+00
+1.881200e-01
+8.237022e-01
+2.613332e-01
+2.005254e+00
+6.782306e-01
+3.270214e+00
+1.042508e+00
+3.215194e+00
+2.101708e-01
+5.249704e-01
+5.901279e-03
+9.923735e-01
+2.527393e-01
+1.277671e+00
+4.593048e-01
+1.320458e+00
+4.395980e-03
+1.924823e-02
+6.106821e-03
+4.685868e-02
+5.917951e-01
+2.853449e+00
+9.096479e-01
+2.805440e+00
+3.142832e-01
+7.850251e-01
+8.824598e-03
+1.483966e+00
+6.815305e-01
+3.445338e+00
+1.238550e+00
+3.560714e+00
+2.687848e-01
+1.176901e+00
+3.733914e-01
+2.865096e+00
+1.146410e+00
+5.527627e+00
+1.762146e+00
+5.434625e+00
+2.094826e-01
+5.232513e-01
+5.881955e-03
+9.891240e-01
+3.915839e-01
+1.979572e+00
+7.116283e-01
+2.045864e+00
+1.410077e-01
+6.174160e-01
+1.958855e-01
+1.503063e+00
+1.106991e+00
+5.337561e+00
+1.701555e+00
+5.247757e+00
+3.890158e-01
+9.716943e-01
+1.092298e-02
+1.836835e+00
+9.878733e-01
+4.993991e+00
+1.795269e+00
+5.161228e+00
+4.116462e-01
+1.802433e+00
+5.718519e-01
+4.387917e+00
+1.484111e+00
+7.155916e+00
+2.281226e+00
+7.035519e+00
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2s0acc8
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2s0acc8 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+1.267734e-01
+4.240658e-01
+2.154088e-03
+4.584250e-01
+1.690069e-01
+8.047111e-01
+1.651846e-01
+7.078121e-01
+1.269869e-03
+7.875981e-03
+1.712088e-03
+9.977617e-03
+3.575112e-01
+1.400080e+00
+4.402260e-01
+1.191661e+00
+3.389709e-01
+1.133881e+00
+5.759669e-03
+1.225751e+00
+6.803077e-01
+3.239223e+00
+6.649215e-01
+2.849173e+00
+2.747966e-01
+1.704343e+00
+3.704917e-01
+2.159132e+00
+8.937407e-01
+3.500054e+00
+1.100519e+00
+2.979030e+00
+7.791715e-02
+2.606382e-01
+1.323940e-03
+2.817559e-01
+2.191988e-01
+1.043695e+00
+2.142413e-01
+9.180192e-01
+5.948891e-02
+3.689621e-01
+8.020532e-02
+4.674164e-01
+4.454152e-01
+1.744329e+00
+5.484678e-01
+1.484665e+00
+4.303659e-01
+1.439603e+00
+7.312620e-03
+1.556244e+00
+1.085724e+00
+5.169576e+00
+1.061169e+00
+4.547083e+00
+3.325904e-01
+2.062792e+00
+4.484116e-01
+2.613229e+00
+1.032808e+00
+4.044666e+00
+1.271761e+00
+3.442570e+00
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2s0acc9
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2s0acc9 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,256 @@
+9.224099e-02
+1.028219e+00
+7.231517e-04
+5.986320e-01
+6.014413e-02
+5.839267e-01
+5.788058e-03
+6.957871e-01
+1.163810e-02
+1.844614e-01
+1.113873e-04
+8.862093e-02
+6.733367e-02
+4.403950e-01
+1.522921e-03
+4.624472e-01
+8.888472e-02
+9.908067e-01
+6.968392e-04
+5.768503e-01
+7.263309e-02
+7.051793e-01
+6.989951e-03
+8.402676e-01
+1.059819e-01
+1.679790e+00
+1.014344e-03
+8.070230e-01
+8.277231e-02
+5.413712e-01
+1.872105e-03
+5.684797e-01
+2.189314e-03
+2.440450e-02
+1.716381e-05
+1.420837e-02
+1.835935e-03
+1.782471e-02
+1.766839e-04
+2.123931e-02
+1.684835e-03
+2.670427e-02
+1.612542e-05
+1.282956e-02
+1.790712e-03
+1.171213e-02
+4.050147e-05
+1.229860e-02
+9.966647e-02
+1.110992e+00
+7.813661e-04
+6.468224e-01
+1.127289e-01
+1.094461e+00
+1.084863e-02
+1.304123e+00
+1.088336e-01
+1.724989e+00
+1.041637e-03
+8.287377e-01
+2.258534e-01
+1.477191e+00
+5.108244e-03
+1.551160e+00
+1.599379e-01
+1.782844e+00
+1.253883e-03
+1.037976e+00
+1.042848e-01
+1.012479e+00
+1.003599e-02
+1.206435e+00
+2.017947e-02
+3.198402e-01
+1.931360e-04
+1.536611e-01
+1.167508e-01
+7.636070e-01
+2.640614e-03
+8.018438e-01
+6.193526e-01
+6.903984e+00
+4.855606e-03
+4.019518e+00
+5.061105e-01
+4.913720e+00
+4.870628e-02
+5.855021e+00
+7.384866e-01
+1.170485e+01
+7.067994e-03
+5.623371e+00
+5.767610e-01
+3.772298e+00
+1.304491e-02
+3.961191e+00
+1.349645e-01
+1.504463e+00
+1.058096e-03
+8.759023e-01
+1.131798e-01
+1.098839e+00
+1.089202e-02
+1.309339e+00
+1.038649e-01
+1.646237e+00
+9.940828e-04
+7.909029e-01
+1.103919e-01
+7.220170e-01
+2.496792e-03
+7.581712e-01
+3.458722e-01
+3.855471e+00
+2.711572e-03
+2.244666e+00
+3.912028e-01
+3.798105e+00
+3.764797e-02
+4.525693e+00
+3.776848e-01
+5.986222e+00
+3.614790e-03
+2.875966e+00
+7.837783e-01
+5.126292e+00
+1.772712e-02
+5.382985e+00
+5.182960e-02
+5.777496e-01
+4.063342e-04
+3.363674e-01
+3.379459e-02
+3.281046e-01
+3.252272e-03
+3.909582e-01
+6.539373e-03
+1.036476e-01
+6.258780e-05
+4.979552e-02
+3.783434e-02
+2.474550e-01
+8.557192e-04
+2.598461e-01
+6.698434e-02
+7.466810e-01
+5.251444e-04
+4.347197e-01
+5.473696e-02
+5.314296e-01
+5.267691e-03
+6.332334e-01
+7.986894e-02
+1.265905e+00
+7.644190e-04
+6.081799e-01
+6.237797e-02
+4.079823e-01
+1.410835e-03
+4.284115e-01
+4.051775e-02
+4.516553e-01
+3.176514e-04
+2.629549e-01
+3.397775e-02
+3.298828e-01
+3.269898e-03
+3.930770e-01
+3.118133e-02
+4.942173e-01
+2.984340e-04
+2.374372e-01
+3.314080e-02
+2.167570e-01
+7.495628e-04
+2.276108e-01
+7.609032e-02
+8.481862e-01
+5.965335e-04
+4.938163e-01
+8.606284e-02
+8.355659e-01
+8.282382e-03
+9.956319e-01
+8.308895e-02
+1.316942e+00
+7.952375e-04
+6.326994e-01
+1.724277e-01
+1.127761e+00
+3.899887e-03
+1.184232e+00
+2.120910e-01
+2.364199e+00
+1.662753e-03
+1.376443e+00
+1.382902e-01
+1.342631e+00
+1.330856e-02
+1.599833e+00
+2.675965e-02
+4.241346e-01
+2.561144e-04
+2.037673e-01
+1.548212e-01
+1.012606e+00
+3.501673e-03
+1.063311e+00
+3.739271e-01
+4.168201e+00
+2.931517e-03
+2.426738e+00
+3.055584e-01
+2.966602e+00
+2.940586e-02
+3.534902e+00
+4.458528e-01
+7.066670e+00
+4.267221e-03
+3.395046e+00
+3.482129e-01
+2.277482e+00
+7.875713e-03
+2.391524e+00
+1.527389e-01
+1.702595e+00
+1.197443e-03
+9.912554e-01
+1.280852e-01
+1.243552e+00
+1.232646e-02
+1.481774e+00
+1.175436e-01
+1.863040e+00
+1.125000e-03
+8.950618e-01
+1.249301e-01
+8.171039e-01
+2.825610e-03
+8.580195e-01
+5.653036e-01
+6.301495e+00
+4.431873e-03
+3.668747e+00
+6.393933e-01
+6.207734e+00
+6.153294e-02
+7.396924e+00
+6.172991e-01
+9.784056e+00
+5.908119e-03
+4.700562e+00
+1.281030e+00
+8.378561e+00
+2.897373e-02
+8.798108e+00
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2x3acc1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2x3acc1 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
[16384 lines of floating-point values; hunk content truncated in the source dump]
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2x3acc2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2x3acc2 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
+2.873860e-02
+5.142776e-02
+3.126138e-04
+7.332443e-02
[... 16384 values total, one per line; middle of the file elided in this changeset view ...]
+3.620580e+00
+1.474253e+01
+6.103666e+00
+1.136955e+01
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2x3acc3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2x3acc3 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
+3.634988e-02
+4.407944e-02
+1.738557e-02
+4.696201e-02
[... 16384 values total, one per line; middle of the file elided in this changeset view ...]
+3.326151e+00
+4.743382e+00
+3.673952e+00
+5.106563e+00
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2x3acc4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2x3acc4 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
+3.474812e-02
+5.353605e-02
+9.026246e-04
+1.433667e-01
[... 16384 values total, one per line; middle of the file elided in this changeset view ...]
+3.233625e+00
+9.707871e+00
+3.956386e+00
+1.163306e+01
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2x3acc5
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2x3acc5 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
+8.602152e-03
+1.205619e-01
+3.959228e-06
+8.884085e-02
[... 16384 values total, one per line; middle of the file elided in this changeset view ...]
+4.110108e+00
+1.803358e+01
+8.421995e-02
+1.415744e+01
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2x3acc6
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2x3acc6 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+1.985605e-01
+3.376098e-01
+5.320678e-02
+4.945075e-01
+2.810407e-01
+8.894465e-01
+4.978347e-01
+1.051663e+00
+5.483311e-02
+1.467011e-01
+5.979134e-02
+2.049196e-01
+4.315946e-01
+1.165403e+00
+7.294928e-01
+1.225165e+00
+3.186824e-01
+8.057160e-01
+1.252956e-01
+1.247968e+00
+4.256388e-01
+2.003059e+00
+1.106270e+00
+2.504460e+00
+2.778753e-01
+1.105458e+00
+4.445793e-01
+1.632886e+00
+7.035153e-01
+2.824722e+00
+1.744707e+00
+3.140203e+00
+1.214590e-01
+3.554377e-01
+5.257908e-02
+4.166597e-01
+2.607728e-01
+1.420445e+00
+7.462553e-01
+1.344131e+00
+1.220754e-01
+5.621219e-01
+2.150468e-01
+6.284064e-01
+3.554103e-01
+1.651741e+00
+9.704751e-01
+1.389698e+00
+5.343671e-01
+9.957698e-01
+1.315837e-01
+1.612177e+00
+6.963196e-01
+2.415218e+00
+1.133477e+00
+3.156525e+00
+5.183706e-01
+1.519946e+00
+5.194265e-01
+2.346789e+00
+1.001111e+00
+2.962646e+00
+1.554947e+00
+3.442660e+00
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2x3acc7
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2x3acc7 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,256 @@
+8.110802e-02
+1.675725e-01
+2.337159e-03
+3.635438e-01
+9.627507e-02
+3.979178e-01
+1.866689e-01
+4.659507e-01
+1.414586e-03
+4.989754e-03
+1.894506e-03
+1.278680e-02
+1.911438e-01
+7.818384e-01
+3.128548e-01
+8.593617e-01
+1.064861e-01
+2.775265e-01
+2.589074e-03
+5.302543e-01
+1.781239e-01
+9.286975e-01
+2.914119e-01
+9.577378e-01
+7.028113e-02
+3.127240e-01
+7.942046e-02
+7.057803e-01
+2.915112e-01
+1.504128e+00
+4.025917e-01
+1.456027e+00
+3.287458e-03
+1.023282e-02
+1.506073e-04
+1.999742e-02
+5.448594e-03
+3.392814e-02
+1.679593e-02
+3.578752e-02
+1.595558e-03
+8.479273e-03
+3.397353e-03
+1.957339e-02
+1.174133e-02
+7.235531e-02
+3.055353e-02
+7.163979e-02
+1.362156e-01
+3.001117e-01
+3.884883e-03
+6.649416e-01
+3.462183e-01
+1.525973e+00
+6.644083e-01
+1.824904e+00
+1.150292e-01
+4.326883e-01
+1.524760e-01
+1.132412e+00
+4.492387e-01
+1.959528e+00
+7.277574e-01
+2.199667e+00
+1.572310e-01
+3.138854e-01
+4.442683e-03
+5.620031e-01
+2.298633e-01
+9.180011e-01
+4.370293e-01
+8.871632e-01
+3.916645e-03
+1.334928e-02
+5.143557e-03
+2.823282e-02
+4.650155e-01
+1.837885e+00
+7.463331e-01
+1.667212e+00
+2.676232e-01
+6.739527e-01
+6.380548e-03
+1.062730e+00
+5.513591e-01
+2.777670e+00
+8.845093e-01
+2.364105e+00
+2.522784e-01
+1.084667e+00
+2.795481e-01
+2.020313e+00
+9.194308e-01
+4.583978e+00
+1.245120e+00
+3.662190e+00
+9.498650e-02
+2.856873e-01
+4.267076e-03
+4.607692e-01
+1.938955e-01
+1.166641e+00
+5.860975e-01
+1.015600e+00
+6.584533e-02
+3.381153e-01
+1.374788e-01
+6.441486e-01
+4.257475e-01
+2.535123e+00
+1.086372e+00
+2.071556e+00
+3.268281e-01
+6.957764e-01
+9.140151e-03
+1.272283e+00
+1.023115e+00
+4.357276e+00
+1.925271e+00
+4.300530e+00
+3.941952e-01
+1.432757e+00
+5.123745e-01
+3.094677e+00
+1.352704e+00
+5.701262e+00
+2.148795e+00
+5.281901e+00
+5.710767e-02
+1.118927e-01
+1.335981e-03
+2.570862e-01
+6.180545e-02
+2.422561e-01
+9.728962e-02
+3.004307e-01
+1.611899e-03
+5.392075e-03
+1.752613e-03
+1.463395e-02
+1.516664e-01
+5.883214e-01
+2.015364e-01
+6.848520e-01
+1.059198e-01
+2.617924e-01
+2.090788e-03
+5.297362e-01
+1.615431e-01
+7.987466e-01
+2.145632e-01
+8.723775e-01
+1.131359e-01
+4.774099e-01
+1.037949e-01
+1.141098e+00
+3.267668e-01
+1.598953e+00
+3.663777e-01
+1.639244e+00
+3.818083e-02
+1.127065e-01
+1.420080e-03
+2.332656e-01
+5.769683e-02
+3.407187e-01
+1.443953e-01
+3.806191e-01
+2.998995e-02
+1.511437e-01
+5.184242e-02
+3.695053e-01
+1.536743e-01
+8.980957e-01
+3.246585e-01
+9.417369e-01
+1.874011e-01
+3.915586e-01
+4.339153e-03
+9.187987e-01
+4.342879e-01
+1.815277e+00
+6.766192e-01
+2.299108e+00
+2.561128e-01
+9.136226e-01
+2.756172e-01
+2.532323e+00
+6.965002e-01
+2.881136e+00
+9.160353e-01
+3.425251e+00
+2.329398e-01
+4.096184e-01
+5.812680e-03
+8.751473e-01
+3.067612e-01
+1.079138e+00
+5.150708e-01
+1.244432e+00
+5.656245e-03
+1.698145e-02
+6.559991e-03
+4.285537e-02
+6.772463e-01
+2.357764e+00
+9.599255e-01
+2.552155e+00
+3.880689e-01
+8.608298e-01
+8.170875e-03
+1.619737e+00
+7.201862e-01
+3.195905e+00
+1.020324e+00
+3.245741e+00
+3.565937e-01
+1.350495e+00
+3.489600e-01
+3.001574e+00
+1.310623e+00
+5.755779e+00
+1.567457e+00
+5.487020e+00
+1.975202e-01
+5.232911e-01
+7.836200e-03
+1.007094e+00
+3.631972e-01
+1.924930e+00
+9.695502e-01
+1.999559e+00
+1.334698e-01
+6.037064e-01
+2.461044e-01
+1.372401e+00
+8.703127e-01
+4.564836e+00
+1.961226e+00
+4.450991e+00
+3.856156e-01
+7.231157e-01
+9.523887e-03
+1.577815e+00
+1.087389e+00
+4.079232e+00
+1.807083e+00
+4.804181e+00
+4.533724e-01
+1.451507e+00
+5.204235e-01
+3.741071e+00
+1.568960e+00
+5.824824e+00
+2.201048e+00
+6.439262e+00
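The me2x3acc* files in this changeset come in three sizes -- 16384 (= 4^7), 256 (= 4^4), and 64 (= 4^3) lines -- consistent with flat lookup tables holding one score per k-mer, with rows ordered lexicographically under A < C < G < T (the ordering that splice5sequences, further down, makes explicit). Below is a minimal Python sketch of that row-indexing scheme; the table layout is an inference from the line counts, not something the changeset itself states, and kmer_row is a hypothetical helper name.

# Sketch only: assumes each me2x3acc* file is a flat table with one
# score per k-mer, rows in lexicographic order with A < C < G < T.
BASE4 = {"A": 0, "C": 1, "G": 2, "T": 3}

def kmer_row(kmer):
    """Row index of `kmer` in a lexicographically ordered 4**k table."""
    row = 0
    for base in kmer:
        row = row * 4 + BASE4[base]
    return row

# kmer_row("AAA") == 0 and kmer_row("TTT") == 63: the first and last rows
# of a 64-line table such as me2x3acc6 or me2x3acc8.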
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2x3acc8
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2x3acc8 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+1.344438e-01
+4.092514e-01
+2.027230e-03
+4.742694e-01
+1.843364e-01
+8.053008e-01
+1.613754e-01
+7.516279e-01
+1.442629e-03
+7.758774e-03
+1.693947e-03
+1.052480e-02
+3.860067e-01
+1.386879e+00
+4.092704e-01
+1.281481e+00
+3.728892e-01
+1.094436e+00
+5.800326e-03
+1.222804e+00
+7.188373e-01
+3.027880e+00
+6.491829e-01
+2.724678e+00
+3.086292e-01
+1.600430e+00
+3.738459e-01
+2.093096e+00
+9.754733e-01
+3.379250e+00
+1.066943e+00
+3.010408e+00
+8.976466e-02
+2.908347e-01
+1.697998e-03
+2.794937e-01
+2.406487e-01
+1.118979e+00
+2.642890e-01
+8.660795e-01
+6.801449e-02
+3.893422e-01
+1.001881e-01
+4.379688e-01
+4.845988e-01
+1.853180e+00
+6.445655e-01
+1.419979e+00
+3.795467e-01
+1.332811e+00
+7.810138e-03
+1.355863e+00
+1.060978e+00
+5.346963e+00
+1.267546e+00
+4.380911e+00
+3.114958e-01
+1.932612e+00
+4.991472e-01
+2.301325e+00
+9.560935e-01
+3.962763e+00
+1.383397e+00
+3.214281e+00
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2x3acc9
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2x3acc9 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,256 @@
+8.086417e-02
+1.061148e+00
+1.053867e-05
+6.326468e-01
+4.741765e-02
+5.760009e-01
+7.648932e-05
+6.611140e-01
+9.048826e-03
+1.674918e-01
+1.034585e-06
+8.048592e-02
+5.156279e-02
+4.727216e-01
+1.917330e-05
+4.456379e-01
+9.107219e-02
+8.931797e-01
+6.126286e-05
+7.456448e-01
+6.486060e-02
+5.888404e-01
+5.400372e-04
+9.463644e-01
+8.887271e-02
+1.229428e+00
+5.244745e-05
+8.272508e-01
+8.664589e-02
+5.936773e-01
+1.662993e-04
+7.836727e-01
+1.796546e-03
+2.729718e-02
+7.391905e-06
+1.943716e-02
+1.260740e-03
+1.773244e-02
+6.420588e-05
+2.430810e-02
+1.179467e-03
+2.527825e-02
+4.257439e-06
+1.450784e-02
+1.114005e-03
+1.182541e-02
+1.307785e-05
+1.331442e-02
+1.013492e-01
+9.834073e-01
+1.374615e-04
+7.831973e-01
+1.076459e-01
+9.668837e-01
+1.807132e-03
+1.482449e+00
+1.058331e-01
+1.448493e+00
+1.259293e-04
+9.298115e-01
+1.918856e-01
+1.300784e+00
+7.425650e-04
+1.638076e+00
+1.547885e-01
+2.479552e+00
+1.047483e-04
+1.241938e+00
+9.241049e-02
+1.370311e+00
+7.740353e-04
+1.321338e+00
+1.806222e-02
+4.081195e-01
+1.072319e-05
+1.647613e-01
+6.649126e-02
+7.441293e-01
+1.283819e-04
+5.893407e-01
+6.104403e-01
+7.308212e+00
+2.132227e-03
+5.125611e+00
+4.426262e-01
+4.905333e+00
+1.913633e-02
+6.623243e+00
+6.211868e-01
+1.048991e+01
+1.903518e-03
+5.929898e+00
+3.912469e-01
+3.272414e+00
+3.899170e-03
+3.629058e+00
+9.900491e-02
+1.836332e+00
+2.115211e-03
+1.098519e+00
+7.073633e-02
+1.214508e+00
+1.870556e-02
+1.398699e+00
+6.777986e-02
+1.773278e+00
+1.270405e-03
+8.550152e-01
+4.135722e-02
+5.359140e-01
+2.521037e-03
+5.069236e-01
+3.395003e-01
+4.021317e+00
+2.391003e-03
+2.690590e+00
+3.671269e-01
+4.025388e+00
+3.200275e-02
+5.185070e+00
+3.696900e-01
+6.176573e+00
+2.284137e-03
+3.330946e+00
+4.330201e-01
+3.583319e+00
+8.701194e-03
+3.791018e+00
+5.405016e-02
+6.133016e-01
+7.120181e-05
+3.934640e-01
+3.040503e-02
+3.193640e-01
+4.957591e-04
+3.944433e-01
+8.782952e-03
+1.405722e-01
+1.015030e-05
+7.268939e-02
+3.611381e-02
+2.862858e-01
+1.357370e-04
+2.904168e-01
+7.808540e-02
+6.621867e-01
+5.309405e-04
+5.948657e-01
+5.334940e-02
+4.187971e-01
+4.489901e-03
+7.242855e-01
+1.106522e-01
+1.323586e+00
+6.600555e-04
+9.583663e-01
+7.784452e-02
+4.611986e-01
+1.510202e-03
+6.551151e-01
+3.348664e-02
+4.399554e-01
+1.392689e-03
+3.371079e-01
+2.254360e-02
+2.741724e-01
+1.160479e-02
+4.044378e-01
+3.192468e-02
+5.916233e-01
+1.164806e-03
+3.653813e-01
+2.175788e-02
+1.997115e-01
+2.581845e-03
+2.419660e-01
+7.915479e-02
+6.641221e-01
+1.085183e-03
+5.691557e-01
+8.065272e-02
+6.264022e-01
+1.368598e-02
+1.033485e+00
+1.200289e-01
+1.420490e+00
+1.443630e-03
+9.812105e-01
+1.570346e-01
+9.204826e-01
+6.142597e-03
+1.247354e+00
+2.604655e-01
+2.581281e+00
+2.828658e-04
+1.096237e+00
+1.542169e-01
+1.414753e+00
+2.072972e-03
+1.156691e+00
+2.795462e-02
+3.907692e-01
+2.663350e-05
+1.337613e-01
+1.801387e-01
+1.247216e+00
+5.581723e-04
+8.375329e-01
+5.153107e-01
+3.816698e+00
+2.888560e-03
+2.269680e+00
+3.705631e-01
+2.540647e+00
+2.571021e-02
+2.908632e+00
+4.823020e-01
+5.038710e+00
+2.371788e-03
+2.415110e+00
+5.317506e-01
+2.751541e+00
+8.504547e-03
+2.587283e+00
+1.706669e-01
+1.958368e+00
+5.851513e-03
+9.933285e-01
+1.209301e-01
+1.284526e+00
+5.131973e-02
+1.254322e+00
+1.074643e-01
+1.739365e+00
+3.232418e-03
+7.110992e-01
+1.147823e-01
+9.201727e-01
+1.122858e-02
+7.380046e-01
+7.702898e-01
+5.644597e+00
+8.705947e-03
+3.202243e+00
+8.260937e-01
+5.603657e+00
+1.155640e-01
+6.120130e+00
+7.714763e-01
+7.974125e+00
+7.649428e-03
+3.646240e+00
+1.581806e+00
+8.098064e+00
+5.100889e-02
+7.264300e+00
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/me2x5
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/me2x5 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
+   1.4359596e-01
+   2.0415665e-01
+   1.9297912e-01
+   1.0617628e+00
[... 16384 values total, one per line; middle of the file elided in this changeset view ...]
+   1.1895429e-10
+   1.9899018e-10
+   3.3948422e-10
+   1.0159160e-08
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/splicemodels/splice5sequences
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/splicemodels/splice5sequences Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,16384 @@
+AAAAAAA
+AAAAAAC
+AAAAAAG
+AAAAAAT
[... apparently all 16384 (= 4^7) possible 7-mers, one per line, in lexicographic A < C < G < T order; remainder elided in this changeset view ...]
C\n+TTTCATG\n+TTTCATT\n+TTTCCAA\n+TTTCCAC\n+TTTCCAG\n+TTTCCAT\n+TTTCCCA\n+TTTCCCC\n+TTTCCCG\n+TTTCCCT\n+TTTCCGA\n+TTTCCGC\n+TTTCCGG\n+TTTCCGT\n+TTTCCTA\n+TTTCCTC\n+TTTCCTG\n+TTTCCTT\n+TTTCGAA\n+TTTCGAC\n+TTTCGAG\n+TTTCGAT\n+TTTCGCA\n+TTTCGCC\n+TTTCGCG\n+TTTCGCT\n+TTTCGGA\n+TTTCGGC\n+TTTCGGG\n+TTTCGGT\n+TTTCGTA\n+TTTCGTC\n+TTTCGTG\n+TTTCGTT\n+TTTCTAA\n+TTTCTAC\n+TTTCTAG\n+TTTCTAT\n+TTTCTCA\n+TTTCTCC\n+TTTCTCG\n+TTTCTCT\n+TTTCTGA\n+TTTCTGC\n+TTTCTGG\n+TTTCTGT\n+TTTCTTA\n+TTTCTTC\n+TTTCTTG\n+TTTCTTT\n+TTTGAAA\n+TTTGAAC\n+TTTGAAG\n+TTTGAAT\n+TTTGACA\n+TTTGACC\n+TTTGACG\n+TTTGACT\n+TTTGAGA\n+TTTGAGC\n+TTTGAGG\n+TTTGAGT\n+TTTGATA\n+TTTGATC\n+TTTGATG\n+TTTGATT\n+TTTGCAA\n+TTTGCAC\n+TTTGCAG\n+TTTGCAT\n+TTTGCCA\n+TTTGCCC\n+TTTGCCG\n+TTTGCCT\n+TTTGCGA\n+TTTGCGC\n+TTTGCGG\n+TTTGCGT\n+TTTGCTA\n+TTTGCTC\n+TTTGCTG\n+TTTGCTT\n+TTTGGAA\n+TTTGGAC\n+TTTGGAG\n+TTTGGAT\n+TTTGGCA\n+TTTGGCC\n+TTTGGCG\n+TTTGGCT\n+TTTGGGA\n+TTTGGGC\n+TTTGGGG\n+TTTGGGT\n+TTTGGTA\n+TTTGGTC\n+TTTGGTG\n+TTTGGTT\n+TTTGTAA\n+TTTGTAC\n+TTTGTAG\n+TTTGTAT\n+TTTGTCA\n+TTTGTCC\n+TTTGTCG\n+TTTGTCT\n+TTTGTGA\n+TTTGTGC\n+TTTGTGG\n+TTTGTGT\n+TTTGTTA\n+TTTGTTC\n+TTTGTTG\n+TTTGTTT\n+TTTTAAA\n+TTTTAAC\n+TTTTAAG\n+TTTTAAT\n+TTTTACA\n+TTTTACC\n+TTTTACG\n+TTTTACT\n+TTTTAGA\n+TTTTAGC\n+TTTTAGG\n+TTTTAGT\n+TTTTATA\n+TTTTATC\n+TTTTATG\n+TTTTATT\n+TTTTCAA\n+TTTTCAC\n+TTTTCAG\n+TTTTCAT\n+TTTTCCA\n+TTTTCCC\n+TTTTCCG\n+TTTTCCT\n+TTTTCGA\n+TTTTCGC\n+TTTTCGG\n+TTTTCGT\n+TTTTCTA\n+TTTTCTC\n+TTTTCTG\n+TTTTCTT\n+TTTTGAA\n+TTTTGAC\n+TTTTGAG\n+TTTTGAT\n+TTTTGCA\n+TTTTGCC\n+TTTTGCG\n+TTTTGCT\n+TTTTGGA\n+TTTTGGC\n+TTTTGGG\n+TTTTGGT\n+TTTTGTA\n+TTTTGTC\n+TTTTGTG\n+TTTTGTT\n+TTTTTAA\n+TTTTTAC\n+TTTTTAG\n+TTTTTAT\n+TTTTTCA\n+TTTTTCC\n+TTTTTCG\n+TTTTTCT\n+TTTTTGA\n+TTTTTGC\n+TTTTTGG\n+TTTTTGT\n+TTTTTTA\n+TTTTTTC\n+TTTTTTG\n+TTTTTTT\n'
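The file above appears to enumerate all 16,384 (4^7) DNA 7-mers in lexicographic order. As an illustration only (the output path is hypothetical, not part of the changeset), a short Python sketch that regenerates such a list:

    # Regenerate an exhaustive, lexicographically ordered list of DNA 7-mers.
    # itertools.product over "ACGT" yields A < C < G < T order, matching the file.
    from itertools import product

    with open("7mers.txt", "w") as out:  # hypothetical output path
        for kmer in product("ACGT", repeat=7):
            out.write("".join(kmer) + "\n")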
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/test3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/test3 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,1 @@
+ctctactactatctatctagatc
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/test3.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/test3.fa Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,4 @@
+>1
+ctctactactatctatctagatc
+>2
+gatgctagcta
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/test5
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/test5 Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,2 @@
+acggtaagt
+caggtaagt
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/splicesitescore/test5.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/splicesitescore/test5.fa Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,4 @@
+>1
+acggtaagt
+>2
+caggtaagt
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/stats.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/stats.txt Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,36 @@
+#! /usr/bin/python
+# stats.py
+# P. Clote
+
+import sys,os,tempfile,string,math
+
+
+def getStats(L):
+  #L is list
+  if L==[]:
+    return (0,0,0,0)
+  max = -sys.maxint
+  min = sys.maxint
+  
+  count = 0
+  sum   = 0.0
+  sumSquares = 0.0
+  for value in L:
+    x = float(value)
+    if x<min: min=x
+    if x>max: max=x
+    count = count+1
+    sum = sum+x
+    sumSquares = sumSquares+x**2
+  mean = sum/count
+  variance = sumSquares/count - mean**2
+  stdev    = math.sqrt(variance)
+  #print "Mean:%f\tStDev:%f\tMax:%f\tMin:%f" % (mean,stdev,max,min)
+  return (mean,stdev,max,min)
+
+
+if __name__ == '__main__':
+  L = sys.argv[1:] 
+  (mean,stdev,max,min) = getStats(L)
+  print "Mean:%f\tStDev:%f\tMax:%f\tMin:%f" % (mean,stdev,max,min)
+
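The script above is Python 2 of its era (print statement, sys.maxint) and computes the variance as E[x^2] - E[x]^2, which floating-point rounding can push fractionally below zero and so crash math.sqrt. A hedged Python 3 sketch of the same statistics with that edge case clamped (an illustration, not the committed code):

    import math
    import sys

    def get_stats(values):
        """Return (mean, stdev, max, min) for a sequence of numbers; zeros if empty."""
        xs = [float(v) for v in values]
        if not xs:
            return (0.0, 0.0, 0.0, 0.0)
        n = len(xs)
        mean = sum(xs) / n
        # Clamp at zero: E[x^2] - E[x]^2 can round to a tiny negative value.
        variance = max(sum(x * x for x in xs) / n - mean ** 2, 0.0)
        return (mean, math.sqrt(variance), max(xs), min(xs))

    if __name__ == "__main__":
        mean, stdev, hi, lo = get_stats(sys.argv[1:])
        print("Mean:%f\tStDev:%f\tMax:%f\tMin:%f" % (mean, stdev, hi, lo))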
diff -r 000000000000 -r 9071e359b9a3 tools/mytools/venn.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/mytools/venn.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,82 @@
+<tool id="venn" name="Venn Diagram">
+  <description>from summary counts</description>
+  <command interpreter="python">$script_file $labels $counts $output $script</command>
+  <inputs>   
+      <param name="labels" type="text" value="A,B" size="50" label="Set Labels"/>
+    <param name="counts" type="text" value="30,10,20" size="80" label="Counts in each region" help="region order: two sets: A, B, AB; three sets: A,B,C,AB,BC,AC,ABC"/>
+  </inputs>
+  <configfiles>
+    <configfile name="script_file">
+import os
+labels = '${labels}'.replace(' ','_').split(',')
+counts = '${counts}'.replace(' ','').split(',')
+counts = map(int,counts)
+rscript = open('${script}','w')
+rscript.write("options(warn=-1)\n")
+rscript.write("pdf('"+"${output}"+"')\n")
+rscript.write("library(grid)\n")
+rscript.write("library(VennDiagram)\n")
+if len(labels)==2:
+    for i in range(2):
+        counts[i+1] = counts[i+1]+counts[i]
+    rscript.write("venn =venn.diagram(\n\tx=list(\n\t\t"+labels[0]+"=c(1:"+str(counts[0])+","+str(counts[1]+1)+":"+str(counts[2])+"),\n\t\t"+labels[1]+"="+str(counts[0]+1)+":"+str(counts[2])+"),\n\tfilename=NULL,\n\tfill=c('red','blue'),\n\tcol='transparent',\n\talpha=0.5,\n\tlabel.col='black',\n\tcex=2,\n\tlwd=0,\n\tfontfamily='serif',\n\tfontface='bold',\n\tcat.col = c('red', 'blue'),\n\tcat.cex=2,\n\tcat.fontfamily='serif',\n\tcat.fontface='bold')\n")
+else:
+    for i in range(6):
+        counts[i+1] = counts[i+1]+counts[i]
+    rscript.write("venn =venn.diagram(\n\tx=list(\n\t\t"+labels[0]+"=c(1:"+str(counts[0])+","+str(counts[2]+1)+":"+str(counts[3])+","+str(counts[4]+1)+":"+str(counts[6])+"),\n\t\t"+labels[1]+"=c("+str(counts[0]+1)+":"+str(counts[1])+","+str(counts[2]+1)+":"+str(counts[4])+","+str(counts[5]+1)+":"+str(counts[6])+"),\n\t\t"+labels[2]+"=c("+str(counts[1]+1)+":"+str(counts[2])+","+str(counts[3]+1)+":"+str(counts[6])+")),\n\tfilename=NULL,\n\tfill=c('red','blue','green'),\n\tcol='transparent',\n\talpha=0.5,\n\tlabel.col='black',\n\tcex=2,\n\tlwd=0,\n\tfontfamily='serif',\n\tfontface='bold',\n\tcat.col = c('red', 'blue','green'),\n\tcat.cex=2,\n\tcat.fontfamily='serif',\n\tcat.fontface='bold')\n")
+rscript.write("grid.draw(venn)\n")
+rscript.write("dev.off()\n")
+rscript.close()
+os.system("cat "+"${script}"+" | R --vanilla --slave")    
+    </configfile>
+  </configfiles>
+
+  <outputs>
+   <data format="txt" name="script" label="${tool.name} on ${on_string}: (script)" />
+    <data format="pdf" name="output" label="${tool.name} on ${on_string}: (plot)" />
+   
+  </outputs>
+
+<help>
+.. class:: infomark
+
+This is a wrapper for the R package VennDiagram. It plots two-set or three-set Venn diagrams from summary counts. The R script used to generate the plot is also included in the output.
+
+Input: labels for sets and counts for each region in the diagram.
+
+A: A-only
+
+B: B-only
+
+C: C-only
+
+AB: in A and B but not C
+
+BC: in B and C but not A
+
+AC: in A and C but not B
+
+ABC: in A, B, and C 
+
+-----
+
+**Example**
+
+Labels: X,Y
+
+Counts: 30,10,20
+
+
+.. image:: ./static/images/venn2.png
+
+
+Labels: A,B,C
+
+Counts: 10,20,30,40,50,60,70
+
+
+.. image:: ./static/images/venn3.png
+
+
+</help>
+</tool>
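The configfile above converts the per-region counts to cumulative offsets and then expresses each set as ranges of integer IDs, so venn.diagram recovers exactly the requested region sizes from synthetic elements. A minimal sketch of that mapping for the two-set help example (labels X,Y; counts 30,10,20), written in Python for illustration:

    # Two-set case: counts are (A-only, B-only, AB); cumulative offsets carve
    # the integers 1..60 into the three disjoint regions.
    counts = [30, 10, 20]
    for i in range(2):
        counts[i + 1] += counts[i]      # -> [30, 40, 60]

    set_a = set(range(1, counts[0] + 1)) | set(range(counts[1] + 1, counts[2] + 1))
    set_b = set(range(counts[0] + 1, counts[2] + 1))

    assert len(set_a - set_b) == 30     # A-only region
    assert len(set_b - set_a) == 10     # B-only region
    assert len(set_a & set_b) == 20     # shared AB region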
diff -r 000000000000 -r 9071e359b9a3 tools/ncbi_blast_plus/blastxml_to_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,254 @@
[blastxml_to_tabular.py (254 lines; content garbled and truncated in the changeset rendering): converts BLAST XML to tabular output. It takes three command line arguments (input BLAST XML file, output tabular file, and "std" for the standard 12 columns or "ext" for the extended 24 columns offered in the BLAST+ wrappers), streams the XML with ElementTree's iterparse, recomputes fields such as the percentage identity and gap openings that the XML does not give directly, cross-checks the mismatch and identity counts against the aligned sequences (allowing for XXXX low-complexity masking, which BLAST 2.2.24+/2.2.25+ handle inconsistently), and formats e-values and bit scores to match BLAST+'s own tabular output, which may still differ by stray whitespace.]
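Although garbled above, the converter's streaming approach is still visible: it walks the XML with ElementTree's iterparse and clears each element after writing its rows, so memory use stays flat on large BLAST runs. A minimal Python 3 sketch of that pattern (tag names follow the BLAST XML schema; the input path and the printed fields are illustrative):

    import xml.etree.ElementTree as ElementTree

    context = iter(ElementTree.iterparse("blast_output.xml", events=("start", "end")))
    _, root = next(context)                 # first "start" event yields the root

    for event, elem in context:
        if event == "end" and elem.tag == "Iteration":
            qseqid = elem.findtext("Iteration_query-def")
            for hit in elem.findall("Iteration_hits/Hit"):
                sseqid = hit.findtext("Hit_id")
                for hsp in hit.findall("Hit_hsps/Hsp"):
                    print(qseqid, sseqid, hsp.findtext("Hsp_evalue"), sep="\t")
            # Clearing stops ElementTree from holding the whole document in memory.
            elem.clear()
            root.clear()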
diff -r 000000000000 -r 9071e359b9a3 tools/ncbi_blast_plus/blastxml_to_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,127 @@
+<tool id="blastxml_to_tabular" name="BLAST XML to tabular" version="0.0.8">
+    <description>Convert BLAST XML output to tabular</description>
+    <command interpreter="python">
+      blastxml_to_tabular.py $blastxml_file $tabular_file $out_format
+    </command>
+    <inputs>
+        <param name="blastxml_file" type="data" format="blastxml" label="BLAST results as XML"/> 
+        <param name="out_format" type="select" label="Output format">
+            <option value="std" selected="True">Tabular (standard 12 columns)</option>
+            <option value="ext">Tabular (extended 24 columns)</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="tabular_file" format="tabular" label="BLAST results as tabular" />
+    </outputs>
+    <requirements>
+    </requirements>
+    <tests>
+        <test>
+            <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <!-- Note this has some white space differences from the actual blastp output blast_four_human_vs_rhodopsin.tabular -->
+            <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_converted.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
+            <param name="out_format" value="ext" />
+            <!-- Note this has some white space differences from the actual blastp output blast_four_human_vs_rhodopsin_22c.tabular -->
+            <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_converted_ext.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastp_sample.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <!-- Note this has some white space differences from the actual blastp output -->
+            <output name="tabular_file" file="blastp_sample_converted.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastx_rhodopsin_vs_four_human.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <!-- Note this has some white space differences from the actual blastx output -->
+            <output name="tabular_file" file="blastx_rhodopsin_vs_four_human_converted.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastx_rhodopsin_vs_four_human.xml" ftype="blastxml" />
+            <param name="out_format" value="ext" />
+            <!-- Note this has some white space and XXXX masking differences from the actual blastx output -->
+            <output name="tabular_file" file="blastx_rhodopsin_vs_four_human_converted_ext.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastx_sample.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <!-- Note this has some white space differences from the actual blastx output -->
+            <output name="tabular_file" file="blastx_sample_converted.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastp_human_vs_pdb_seg_no.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <!-- Note this has some white space differences from the actual blastp output -->
+            <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_std.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastp_human_vs_pdb_seg_no.xml" ftype="blastxml" />
+            <param name="out_format" value="ext" />
+            <!-- Note this has some white space differences from the actual blastp output -->
+            <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_ext.tabular" ftype="tabular" />
+        </test>
+    </tests>
+    <help>
+    
+**What it does**
+
+NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of
+formats including tabular and a more detailed XML format. A complex workflow
+may need both the XML and the tabular output - but running BLAST twice is
+slow and wasteful.
+
+This tool takes the BLAST XML output and by default converts it into the
+standard 12 column tabular equivalent:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+     1 qseqid    Query Seq-id (ID of your sequence)
+     2 sseqid    Subject Seq-id (ID of the database hit)
+     3 pident    Percentage of identical matches
+     4 length    Alignment length
+     5 mismatch  Number of mismatches
+     6 gapopen   Number of gap openings
+     7 qstart    Start of alignment in query
+     8 qend      End of alignment in query
+     9 sstart    Start of alignment in subject (database hit)
+    10 send      End of alignment in subject (database hit)
+    11 evalue    Expectation value (E-value)
+    12 bitscore  Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 24 column tabular
+BLAST output.
+
+====== ============= ===========================================
+Column NCBI name     Description
+------ ------------- -------------------------------------------
+    13 sallseqid     All subject Seq-id(s), separated by a ';'
+    14 score         Raw score
+    15 nident        Number of identical matches
+    16 positive      Number of positive-scoring matches
+    17 gaps          Total number of gaps
+    18 ppos          Percentage of positive-scoring matches
+    19 qframe        Query frame
+    20 sframe        Subject frame
+    21 qseq          Aligned part of query sequence
+    22 sseq          Aligned part of subject sequence
+    23 qlen          Query sequence length
+    24 slen          Subject sequence length
+====== ============= ===========================================
+
+Beware that the XML file (and thus the conversion) and the tabular output
+direct from BLAST+ may differ in the presence of XXXX masking on regions
+of low complexity (columns 21 and 22), and thus also in calculated figures
+like the percentage identity (column 3).
+
+    </help>
+</tool>
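Because the 12 standard columns always appear in the order tabulated above, downstream code can address them by name. A hedged Python sketch that reads a converted tabular file into per-hit dicts (the input path and the e-value threshold are illustrative):

    import csv

    STD_COLUMNS = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
                   "qstart", "qend", "sstart", "send", "evalue", "bitscore"]

    with open("blast_hits.tabular", newline="") as handle:
        for row in csv.reader(handle, delimiter="\t"):
            hit = dict(zip(STD_COLUMNS, row))   # extended files add columns 13-24
            if float(hit["evalue"]) < 1e-10:
                print(hit["qseqid"], hit["sseqid"], hit["bitscore"])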
diff -r 000000000000 -r 9071e359b9a3 tools/ncbi_blast_plus/hide_stderr.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/hide_stderr.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+"""A simple script to redirect stderr to stdout when the return code is zero.
+
+See https://bitbucket.org/galaxy/galaxy-central/issue/325/
+
+Currently Galaxy ignores the return code from command line tools (even if it
+is non-zero, which by convention indicates an error) and treats any output on
+stderr as an error (even though by convention stderr is used for errors or
+warnings).
+
+This script runs the given command line, capturing all stdout and stderr in
+memory, and gets the return code. For a zero return code, any stderr (which
+should be warnings only) is added to the stdout. That way Galaxy believes
+everything is fine. For a non-zero return code, we output stdout as is, and
+any stderr, plus the return code to ensure there is some output on stderr.
+That way Galaxy treats this as an error.
+
+Once issue 325 is fixed, this script will not be needed.
+"""
+import sys
+import subprocess
+
+#Avoid using shell=True when we call subprocess, to ensure that if the
+#Python script is killed, the BLAST child process is killed too.
+try:
+    words = []
+    for w in sys.argv[1:]:
+       if " " in w:
+           words.append('"%s"' % w)
+       else:
+           words.append(w)
+    cmd = " ".join(words)
+    child = subprocess.Popen(sys.argv[1:],
+                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+except Exception, err:
+    sys.stderr.write("Error invoking command:\n%s\n\n%s\n" % (cmd, err))
+    sys.exit(1)
+#Use .communicate() rather than .wait(), which can deadlock once the PIPE buffers fill.
+stdout, stderr = child.communicate()
+return_code = child.returncode
+
+if return_code:
+    sys.stdout.write(stdout)
+    sys.stderr.write(stderr)
+    sys.stderr.write("Return error code %i from command:\n" % return_code)
+    sys.stderr.write("%s\n" % cmd)
+else:
+    sys.stdout.write(stdout)
+    sys.stdout.write(stderr)
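The script above is Python 2 (note the `except Exception, err` syntax). A hedged Python 3 sketch of the same workaround using subprocess.run, for illustration: on success any stderr is folded into stdout, on failure stdout, stderr and the return code are surfaced so Galaxy flags the job:

    import subprocess
    import sys

    # A list argument means shell=False, so killing this script kills the child too.
    result = subprocess.run(sys.argv[1:], capture_output=True, text=True)

    sys.stdout.write(result.stdout)
    if result.returncode == 0:
        # Warnings only: route stderr to stdout so Galaxy does not flag an error.
        sys.stdout.write(result.stderr)
    else:
        sys.stderr.write(result.stderr)
        sys.stderr.write("Return error code %i from command:\n%s\n"
                         % (result.returncode, " ".join(result.args)))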
diff -r 000000000000 -r 9071e359b9a3 tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,209 @@
[ncbi_blastn_wrapper.xml (209 lines; content garbled and truncated in the changeset rendering): Galaxy tool "NCBI BLAST+ blastn", version 0.0.11. Runs blastn through hide_stderr.py against either a BLAST database (blastdb.loc) or a FASTA file, with a choice of task (megablast, blastn, blastn-short, dc-megablast; vecscreen is commented out as unsupported), an e-value cutoff, output formats covering tabular (standard 12 or extended 24 columns), BLAST XML, and pairwise or query-anchored text/HTML, and -num_threads 8. Advanced options cover query filtering, strand, max_target_seqs, word_size, ungapped alignment and parse_deflines. The help section repeats the overnight-processing warning and the 12- and 24-column tables shown above, and cites Zhang et al. 2000.]
diff -r 000000000000 -r 9071e359b9a3 tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,276 @@
[ncbi_blastp_wrapper.xml (276 lines; content garbled and truncated in the changeset rendering): Galaxy tool "NCBI BLAST+ blastp", version 0.0.11. Runs blastp through hide_stderr.py against either a protein BLAST database (blastdb_p.loc) or a FASTA file, with task blastp or blastp-short, an e-value cutoff, the same output-format choices as the blastn wrapper, and -num_threads 8. Advanced options cover query filtering, scoring matrix, max_target_seqs, word_size and parse_deflines; ungapped alignment is disabled for now per the in-file comments. The help mirrors the column tables above and cites Altschul et al. 1997 and Schaffer et al. 2001.]
diff -r 000000000000 -r 9071e359b9a3 tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,240 @@
[ncbi_blastx_wrapper.xml (240 lines; content garbled and truncated in the changeset rendering): Galaxy tool "NCBI BLAST+ blastx", version 0.0.11. Searches a protein database (blastdb_p.loc) or a protein FASTA file with a translated nucleotide query through hide_stderr.py, with an e-value cutoff, the same output-format choices as the other wrappers, and -num_threads 8. Advanced options cover query filtering, strand, scoring matrix, max_target_seqs, word_size, ungapped alignment and parse_deflines. The help mirrors the column tables above and cites Altschul et al. 1997.]
diff -r 000000000000 -r 9071e359b9a3 tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,286 @@
[ncbi_tblastn_wrapper.xml (286 lines; content garbled and truncated in the changeset rendering): Galaxy tool "NCBI BLAST+ tblastn", version 0.0.11. Searches a translated nucleotide database (blastdb.loc) or a nucleotide FASTA file with a protein query through hide_stderr.py, with an e-value cutoff, the same output-format choices as the other wrappers, and -num_threads 8. Advanced options cover query filtering, scoring matrix, max_target_seqs, word_size and parse_deflines; ungapped alignment is disabled for now per the in-file comments. The help mirrors the column tables above and cites Altschul et al. 1997.]
b
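The extended tabular format described in the help text above appends columns 13-24 after the standard 12, so a single parser (and hence a single workflow filter) can accept either width. A minimal Python sketch of that idea (illustrative only, not part of this changeset)::

    import csv

    # NCBI names of the standard 12 columns, as listed in the help table.
    STD_FIELDS = [ "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
                   "qstart", "qend", "sstart", "send", "evalue", "bitscore" ]

    def parse_blast_tabular( path ):
        # Works for both the 12 and 24 column variants because the extended
        # columns are strictly appended after the standard twelve.
        with open( path ) as handle:
            for row in csv.reader( handle, delimiter="\t" ):
                hit = dict( zip( STD_FIELDS, row ) )
                hit[ "extra" ] = row[ 12: ]
                yield hit

    # e.g. keep hits below an E-value threshold, regardless of column count
    # ("blast.tab" is a hypothetical output file):
    # good = [ h for h in parse_blast_tabular( "blast.tab" )
    #          if float( h[ "evalue" ] ) < 1e-10 ]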
diff -r 000000000000 -r 9071e359b9a3 tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,206 @@
+<tool id="ncbi_tblastx_wrapper" name="NCBI BLAST+ tblastx" version="0.0.11">
+    <description>Search translated nucleotide database with translated nucleotide query sequence(s)</description>
+    <version_command>tblastx -version</version_command>
+    <command interpreter="python">hide_stderr.py
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
+tblastx
+-query "$query"
+#if $db_opts.db_opts_selector == "db":
+  -db "${db_opts.database.fields.path}"
+#else:
+  -subject "$db_opts.subject"
+#end if
+-evalue $evalue_cutoff
+-out $output1
+##Set the extended list here so if/when we add things, saved workflows are not affected
+#if str($out_format)=="ext":
+    -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"
+#else:
+    -outfmt $out_format
+#end if
+-num_threads 8
+#if $adv_opts.adv_opts_selector=="advanced":
+$adv_opts.filter_query
+$adv_opts.strand
+-matrix $adv_opts.matrix
+## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
+## Note -max_target_seqs overrides -num_descriptions and -num_alignments
+#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
+-max_target_seqs $adv_opts.max_hits
+#end if
+#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):
+-word_size $adv_opts.word_size
+#end if
+$adv_opts.parse_deflines
+## End of advanced options:
+#end if
+    </command>
+    <inputs>
+        <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/>
+        <conditional name="db_opts">
+            <param name="db_opts_selector" type="select" label="Subject database/sequences">
+              <option value="db" selected="True">BLAST Database</option>
+              <option value="file">FASTA file</option>
+            </param>
+            <when value="db">
+                <param name="database" type="select" label="Nucleotide BLAST database">
+                    <options from_file="blastdb.loc">
+                      <column name="value" index="0"/>
+                      <column name="name" index="1"/>
+                      <column name="path" index="2"/>
+                    </options>
+                </param>
+                <param name="subject" type="hidden" value="" />
+            </when>
+            <when value="file">
+                <param name="database" type="hidden" value="" />
+                <param name="subject" type="data" format="fasta" label="Nucleotide FASTA file to use as database"/>
+            </when>
+        </conditional>
+        <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" />
+        <param name="out_format" type="select" label="Output format">
+            <option value="6" selected="True">Tabular (standard 12 columns)</option>
+            <option value="ext">Tabular (extended 24 columns)</option>
+            <option value="5">BLAST XML</option>
+            <option value="0">Pairwise text</option>
+            <option value="0 -html">Pairwise HTML</option>
+            <option value="2">Query-anchored text</option>
+            <option value="2 -html">Query-anchored HTML</option>
+            <option value="4">Flat query-anchored text</option>
+            <option value="4 -html">Flat query-anchored HTML</option>
+            <!--
+            <option value="-outfmt 11">BLAST archive format (ASN.1)</option>
+            -->
+        </param>
+        <conditional name="adv_opts">
+            <param name="adv_opts_selector" type="select" label="Advanced Options">
+              <option value="basic" selected="True">Hide Advanced Options</option>
+              <option value="advanced">Show Advanced Options</option>
+            </param>
+            <when value="basic" />
+            <when value="advanced">
+                <!-- Could use a select (yes, no
[...]
+.../>
+                <when input="out_format" value="0 -html" format="html"/>
+                <when input="out_format" value="2" format="txt"/>
+                <when input="out_format" value="2 -html" format="html"/>
+                <when input="out_format" value="4" format="txt"/>
+                <when input="out_format" value="4 -html" format="html"/>
+                <when input="out_format" value="5" format="blastxml"/>
+            </change_format>
+        </data>
+    </outputs>
+    <requirements>
+        <requirement type="binary">tblastx</requirement>
+    </requirements>
+    <help>
+
+.. class:: warningmark
+
+**Note**. Database searches may take a substantial amount of time.
+For large input datasets it is advisable to allow overnight processing.
+
+-----
+
+**What it does**
+
+Search a *translated nucleotide database* using a *translated nucleotide query*,
+using the NCBI BLAST+ tblastx command line tool.
+
+-----
+
+**Output format**
+
+Because Galaxy focuses on processing tabular data, the default output of this
+tool is tabular. The standard BLAST+ tabular output contains 12 columns:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+     1 qseqid    Query Seq-id (ID of your sequence)
+     2 sseqid    Subject Seq-id (ID of the database hit)
+     3 pident    Percentage of identical matches
+     4 length    Alignment length
+     5 mismatch  Number of mismatches
+     6 gapopen   Number of gap openings
+     7 qstart    Start of alignment in query
+     8 qend      End of alignment in query
+     9 sstart    Start of alignment in subject (database hit)
+    10 send      End of alignment in subject (database hit)
+    11 evalue    Expectation value (E-value)
+    12 bitscore  Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 24 column tabular
+BLAST output.
+
+====== ============= ===========================================
+Column NCBI name     Description
+------ ------------- -------------------------------------------
+    13 sallseqid     All subject Seq-id(s), separated by a ';'
+    14 score         Raw score
+    15 nident        Number of identical matches
+    16 positive      Number of positive-scoring matches
+    17 gaps          Total number of gaps
+    18 ppos          Percentage of positive-scoring matches
+    19 qframe        Query frame
+    20 sframe        Subject frame
+    21 qseq          Aligned part of query sequence
+    22 sseq          Aligned part of subject sequence
+    23 qlen          Query sequence length
+    24 slen          Subject sequence length
+====== ============= ===========================================
+
+The third option is BLAST XML output, which is designed to be parsed by
+another program, and is understood by some Galaxy tools.
+
+You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).
+The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.
+The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
+The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
+and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).
+
+-------
+
+**References**
+
+Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/basecoverage.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/basecoverage.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,41 @@
+<tool id="gops_basecoverage_1" name="Base Coverage">
+  <description>of all intervals</description>
+  <command interpreter="python">gops_basecoverage.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol}</command>
+  <inputs>
+    <param format="interval" name="input1" type="data">
+      <label>Compute coverage for</label>
+    </param>
+   </inputs>
+  <outputs>
+    <data format="txt" name="output" />
+  </outputs>
+  <code file="operation_filter.py"/>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" />
+      <output name="output" file="gops_basecoverage_out.txt" />     
+    </test>
+    <test>
+      <param name="input1" value="gops_bigint.interval" />
+      <output name="output" file="gops_basecoverage_out2.txt" />     
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns.
+
+This operation counts the total bases covered by a set of intervals.  Bases that are covered by more than one interval are **not** counted more than once towards the total.
+
+-----
+
+**Screencasts!**
+
+See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
+
+.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
+
+
+</help>
+</tool>
\ No newline at end of file
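The help above notes that bases covered by more than one interval are counted only once; in other words, overlapping intervals are effectively merged before their lengths are summed. A minimal sketch of that merge-then-count idea (illustrative only, not part of this changeset; assumes 0-based, half-open coordinates as in BED)::

    def base_coverage( intervals ):
        # intervals: iterable of (chrom, start, end) tuples
        total = 0
        by_chrom = {}
        for chrom, start, end in intervals:
            by_chrom.setdefault( chrom, [] ).append( ( start, end ) )
        for spans in by_chrom.values():
            spans.sort()
            cur_start, cur_end = spans[0]
            for start, end in spans[1:]:
                if start > cur_end:
                    # gap: close the current merged block
                    total += cur_end - cur_start
                    cur_start, cur_end = start, end
                else:
                    # overlap or adjacency: extend the block
                    cur_end = max( cur_end, end )
            total += cur_end - cur_start
        return total

    # the overlapping bases 5-10 are counted only once:
    assert base_coverage( [ ( "chr1", 0, 10 ), ( "chr1", 5, 20 ) ] ) == 20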
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/cluster.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/cluster.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,89 @@
+<tool id="gops_cluster_1" name="Cluster">
+  <description>the intervals of a dataset</description>
+  <command interpreter="python">gops_cluster.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -d $distance -m $minregions -o $returntype</command>
+  <inputs>
+    <param format="interval" name="input1" type="data">
+      <label>Cluster intervals of</label>
+    </param>
+    <param name="distance" size="5" type="integer" value="1" help="(bp)">
+      <label>max distance between intervals</label>
+    </param>
+    <param name="minregions" size="5" type="integer" value="2">
+      <label>min number of intervals per cluster</label>
+    </param>
+ <param name="returntype" type="select" label="Return type">
+ <option value="1">Merge clusters into single intervals</option>
+ <option value="2">Find cluster intervals; preserve comments and order</option>
+ <option value="3">Find cluster intervals; output grouped by clusters</option>
+ <option value="4">Find the smallest interval in each cluster</option>
+ <option value="5">Find the largest interval in each cluster</option>
+ </param>
+   </inputs>
+  <outputs>
+    <data format="input" name="output" metadata_source="input1" />
+  </outputs>
+  <code file="operation_filter.py">
+    <hook exec_after_process="exec_after_cluster" />
+  </code>
+  <tests>
+    <test>
+      <param name="input1" value="5.bed" />
+      <param name="distance" value="1" />
+      <param name="minregions" value="2" />
+      <param name="returntype" value="1" />
+      <output name="output" file="gops-cluster-1.bed" />     
+    </test>
+    <test>
+      <param name="input1" value="gops_cluster_bigint.bed" />
+      <param name="distance" value="1" />
+      <param name="minregions" value="2" />
+      <param name="returntype" value="1" />
+      <output name="output" file="gops-cluster-1.bed" />     
+    </test>
+    <test>
+      <param name="input1" value="5.bed" />
+      <param name="distance" value="1" />
+      <param name="minregions" value="2" />
+      <param name="returntype" value="2" />
+      <output name="output" file="gops-cluster-2.bed" />     
+    </test>    
+    <test>
+      <param name="input1" value="5.bed" />
+      <param name="distance" value="1" />
+      <param name="minregions" value="2" />
+      <param name="returntype" value="3" />
+      <output name="output" file="gops-cluster-3.bed" />     
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns.
+
+-----
+
+**Screencasts!**
+
+See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
+
+.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
+
+-----
+
+**Syntax**
+
+- **Maximum distance** is the greatest distance in base pairs allowed between intervals that will be considered &quot;clustered&quot;.  **Negative** values for distance are allowed, and are useful for clustering intervals that overlap.
+- **Minimum intervals per cluster** allows a threshold to be set on the minimum number of intervals to be considered a cluster.  Any area with fewer intervals than this minimum will not be included in the output.
+- **Merge clusters into single intervals** outputs intervals that span the entire cluster.
+- **Find cluster intervals; preserve comments and order** filters out non-cluster intervals while maintaining the original ordering and comments in the file.
+- **Find cluster intervals; output grouped by clusters** filters out non-cluster intervals, but outputs the cluster intervals so that they are grouped together. Comments and original ordering in the file are lost.
+
+-----
+
+**Example**
+
+.. image:: ./static/operation_icons/gops_cluster.gif
+
+</help>
+</tool>
\ No newline at end of file
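The clustering described above groups intervals whose gaps are at most the maximum distance apart, then discards groups smaller than the minimum count. A minimal single-chromosome sketch of return type 1 (illustrative only, not part of this changeset)::

    def cluster( intervals, distance=1, minregions=2 ):
        # intervals: (start, end) pairs on one chromosome; with a negative
        # distance, intervals must overlap to be grouped
        clusters = []
        group, max_end = [], None
        for start, end in sorted( intervals ):
            if group and start - max_end > distance:
                # too far from the current group: close it
                if len( group ) >= minregions:
                    clusters.append( ( group[0][0], max_end ) )
                group, max_end = [], None
            group.append( ( start, end ) )
            max_end = end if max_end is None else max( max_end, end )
        if group and len( group ) >= minregions:
            clusters.append( ( group[0][0], max_end ) )
        return clusters

    # (100, 110) is dropped: it stands alone, below the 2-interval minimum
    assert cluster( [ (0, 10), (12, 20), (100, 110) ], distance=5 ) == [ (0, 20) ]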
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/column_join.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/column_join.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,290 @@
+#!/usr/bin/env python
+
+"""
+This tool joins two or more tab-delimited files on a shared set of 'hinge' columns, placing the selected columns from each input side by side. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
+
+usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge1[,hinge2[,hinge3[,...]]] -f <fill_options_file> [other_input1 [other_input2 [other_input3 ...]]]
+    -o, output=0: the output pileup
+    -1, input1=1: the pileup file to start with
+    -2, input2=2: the second pileup file to join
+    -g, hinge=h: the columns to be used for matching
+    -c, columns=c: the columns that should appear in the output
+    -f, fill_options_file=f: the file specifying the fill value to use
+    other_inputs: the other input files to join
+"""
+
+import optparse, os, re, struct, sys, tempfile
+
+try:
+    simplejson_exception = None
+    from galaxy import eggs
+    from galaxy.util.bunch import Bunch
+    from galaxy.util import stringify_dictionary_keys
+    import pkg_resources
+    pkg_resources.require("simplejson")
+    import simplejson
+except Exception, e:
+    simplejson_exception = e
+    simplejson = None
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def split_nums( text ):
+    """
+    Splits a string into pieces of numbers and non-numbers, like 'abc23B3' --> [ 'abc', 23, 'B', 3 ]
+    """
+    split_t = []
+    c = ''
+    n = ''
+    for ch in text:
+        try:
+            v = int( ch )
+            n += ch
+            if c:
+                split_t.append( ''.join( c ) )
+                c = ''
+        except ValueError:
+            c += ch
+            if n:
+                split_t.append( int( ''.join( n ) ) )
+                n = ''
+    if c:
+        split_t.append( ''.join( c ) )
+    if n:
+        split_t.append( int( ''.join( n ) ) )
+    return split_t
+
+def hinge_compare( hinge1, hinge2 ):
+    """
+    Compares items like 'chr10' and 'chrM' or 'scaffold2' and 'scaffold10' so that
+    the first part is handled as text but the last part as a number
+    """
+    split_hinge1 = hinge1.split( '\t' )
+    split_hinge2 = hinge2.split( '\t' )
+    # quick check if either hinge is empty
+    if not ''.join( split_hinge2 ):
+        if ''.join( split_hinge1 ):
+            return 1
+        elif not ''.join( split_hinge1 ):
+            return 0
+    else:
+        if not ''.join( split_hinge1 ):
+            return -1
+    # go through all parts of the hinges and compare
+    for i, sh1 in enumerate( split_hinge1 ):
+        # if these hinge segments are the same, just move on to the next ones
+        if sh1 == split_hinge2[ i ]:
+            continue
+        # check all parts of each hinge
+        h1 = split_nums( sh1 )
+        h2 = split_nums( split_hinge2[ i ] )
+        for j, h in enumerate( h1 ):
+            # if second hinge has no more parts, first is considered larger
+            if j > 0 and len( h2 ) <= j:
+                return 1
+            # if these two parts are the same, move on to next
+            if h == h2[ j ]:
+                continue
+            # do actual comparison, depending on whether letter or number
+            if type( h ) == int:
+                if type( h2[ j ] ) == int:
+                    if h > h2[ j ]:
+                        return 1
+                    elif h < h2[ j ]:
+                        return -1
+                # numbers are less than letters
+                elif type( h2[ j ] ) == str:
+                    return -1
+            elif type( h ) == str:
+                if type( h2[ j ] ) == str:
+                    if h > h2[ j ]:
+                        return 1
+                    elif h < h2[ j ]:
+                        return -1
+                # numbers are less than letters
+                elif type( h2[ j ] ) == int:
+                    return 1
+    # if all else has failed, just do basic string comparison
+    if hinge
[...]
+...cal alignment (account for "missing" files)
+        # write output for leading and trailing empty columns
+        # columns missing from actual file handled further below
+        current_data = []
+        if current != old_current:
+            # fill trailing empty columns with appropriate fill value
+            if not first_line:
+                if last_loc < len( inputs ) - 1:
+                    if not fill_empty:
+                        filler = [ '' for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ]
+                    else:
+                        filler = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ]
+                    fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) )
+                # insert line break before current line
+                fout.write( '\n' )
+            # fill leading empty columns with appropriate fill value
+            if loc > 0:
+                if not fill_empty:
+                    current_data = [ '' for col in range( loc * len( cols ) ) ]
+                else:
+                    current_data = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( loc * len( cols ) ) ]
+        else:
+            if loc - last_loc > 1:
+                if not fill_empty:
+                    current_data = [ '' for col in range( ( loc - last_loc - 1 ) * len( cols ) ) ]
+                else:
+                    current_data = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( ( loc - last_loc - 1 ) * len( cols ) ) ]
+        # now output actual data
+        split_line = current_lines[ loc ].split( delimiter )
+        # fill empties within actual line if appropriate
+        if fill_empty:
+            new_split_line = split_line[:]
+            split_line = []
+            for i, item in enumerate( new_split_line ):
+                col = i + 1
+                if not item:
+                    try:
+                        split_line.append( fill_empty[ i + 1 ] )
+                    except KeyError:
+                        split_line.append( item )
+                else:
+                    split_line.append( item )
+        # add actual data to be output below
+        if ''.join( split_line ):
+            for col in cols:
+                if col > hinge:
+                    # if this column doesn't exist, add the appropriate filler or empty column
+                    try:
+                        new_item = split_line[ col - 1 ]
+                    except IndexError:
+                        if fill_empty:
+                            new_item = fill_empty[ col ]
+                        else:
+                            new_item = ''
+                    current_data.append( new_item )
+            # grab next line for selected file
+            current_lines[ loc ] = tmp_input_files[ loc ].readline().rstrip( '\r\n' )
+            # write relevant data to file
+            if current == old_current and current_data:
+                fout.write( '%s%s' % ( delimiter, delimiter.join( current_data ) ) )
+            elif current_data:
+                fout.write( '%s%s%s' % ( current, delimiter, delimiter.join( current_data ) ) )
+            last_lines = ''.join( current_lines )
+        else:
+            last_lines = None
+        last_loc = loc
+        old_current = current
+        first_line = False
+    # fill trailing empty columns for final line
+    if last_loc < len( inputs ) - 1:
+        if not fill_empty:
+            filler = [ '' for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ]
+        else:
+            filler = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ]
+        fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) )
+    fout.write( '\n' )
+    fout.close()
+    for f in tmp_input_files:
+        os.unlink( f.name )
+
+if __name__ == "__main__" : __main__()
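The split_nums()/hinge_compare() pair above implements a natural ordering in which 'chr2' sorts before 'chr10' and, per the code comments, numbers sort before letters. The same ordering can be expressed as a sort key; a minimal sketch (illustrative only, not part of this changeset)::

    import re

    def natural_key( text ):
        # split into digit and non-digit runs; tag numbers with 0 and text
        # with 1 so that numbers compare as less than letters
        return [ ( 0, int( part ) ) if part.isdigit() else ( 1, part )
                 for part in re.split( r"(\d+)", text ) if part ]

    print( sorted( [ "chr10", "chr2", "chrM" ], key=natural_key ) )
    # ['chr2', 'chr10', 'chrM']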
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/column_join.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/column_join.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,260 @@
+<tool id="column_join" name="Column Join" version="1.1.0">
+  <description></description>
+  <command interpreter="python">
+    column_join.py
+        --output=$output
+        --input1=$input1
+        --input2=$input2
+        --hinge=$hinge
+        --columns=$columns
+        #if $fill_empty_columns.fill_empty_columns_switch == "fill_empty":
+            --fill_options_file=$fill_options_file
+        #end if
+        #for $f in $file_chooser:
+            ${f.input}
+        #end for
+  </command>
+  <inputs>
+    <param name="input1" type="data" format="tabular" label="Choose the first file for the join" />
+    <param name="hinge" type="data_column" data_ref="input1" multiple="false" numerical="false" label="Use this column and the columns to its left as the 'hinge' (matching data for each join)" help="All columns to the left of the selected column (plus the selected column) will be used. Select 2 for pileup" />
+    <param name="columns" type="data_column" data_ref="input1" multiple="true" numerical="false" label="Include these columns" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" />
+    <conditional name="fill_empty_columns">
+      <param name="fill_empty_columns_switch" type="select" label="Fill empty columns">
+        <option value="no_fill" selected="True">No</option>
+        <option value="fill_empty">Yes</option>
+      </param>
+      <when value="no_fill" />
+      <when value="fill_empty">
+        <conditional name="do_fill_empty_columns">
+          <param name="column_fill_type" type="select" label="Fill Columns by">
+            <option value="single_fill_value" selected="True">Single fill value</option>
+            <option value="fill_value_by_column">Values by column</option>
+          </param>
+          <when value="single_fill_value">
+            <param type="text" name="fill_value" label="Fill value" value="." />
+          </when>
+          <when value="fill_value_by_column">
+            <repeat name="column_fill" title="Fill Column">
+              <param name="column_number" label="Column" type="data_column" data_ref="input1" />
+              <param type="text" name="fill_value" value="." />
+            </repeat>
+          </when>
+        </conditional>
+      </when>
+    </conditional>
+    <param name="input2" type="data" format="tabular" label="Choose the second file for the join" />
+    <repeat name="file_chooser" title="Additional Input">
+      <param name="input" label="Additional input file" type="data" format="tabular" />
+    </repeat>
+  </inputs>
+  <configfiles>
+    <configfile name="fill_options_file">&lt;%
+import simplejson
+%&gt;
+#set $__fill_options = {}
+#if $fill_empty_columns['fill_empty_columns_switch'] == 'fill_empty':
+    #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'single_fill_value':
+        #set $__start_fill = $fill_empty_columns['do_fill_empty_columns']['fill_value'].value
+    #else:
+        #set $__start_fill = ""
+    #end if
+    #set $__fill_options['file1_columns'] = [ __start_fill for i in range( int( $input1.metadata.columns ) ) ]
+    #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'fill_value_by_column':
+        #for column_fill in $fill_empty_columns['do_fill_empty_columns']['column_fill']:
+            #set $__fill_options['file1_columns'][ int( column_fill['column_number'].value ) - 1 ] = column_fill['fill_value'].value
+        #end for
+    #end if
+#end if
+${simplejson.dumps( __fill_options )}
+    </configfile>
+  </configfiles>
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="column_join_in1.pileup" ftype="pileup" />
+      <param name="hinge" value="2" />
+      <param name="columns" value="1,2,3,4,5,7" />
+      <param name="fill_empty_columns_switch" value="fill_empty" />
+      <param name="column_fill_type" value="single_fill_value" />
+      <para
[...]
+...e hinge altogether and a file having the hinge but missing the column (in both cases the column would be empty or filled). There is an example of this below.
+
+-----
+
+**General Example**
+
+Given the following files::
+
+  FILE 1
+  chr2    1    T    6    .C...,     I$$III
+  chr2    2    G    6    ..N..,     III@II
+  chr2    3    C    7    ..C...,    I$IIIII
+  chr2    4    G    7    .G....,    I#IIIII
+  chr2    5    G    7    ...N..,    IIII#BI
+  chr2    6    A    7    ..T...,    I$IDIII
+  chr1    1    C    1    ^:.        I
+  chr1    2    G    2    .^:.       $I
+  chr1    3    A    2    ..         I%
+  chr1    4    C    2    ..         I$
+  chr1    5    T    3    ..^:.      I#I
+  chr1    6    G    3    ..^:,      I#I
+
+  FILE 2
+  chr1    3    T    1    ^:.        I
+  chr1    4    G    2    .^:.       $I
+  chr1    5    T    2    ..         I%
+  chr1    6    C    3    ..^:.      III
+  chr1    7    G    3    ..^:.      I#I
+  chr1    8    T    4    ...^:,     I#II
+  chr2    77   C    6    .G...,     I$$III
+  chr2    78   G    6    ..N..,     III@II
+  chr2    79   T    7    ..N...,    I$IIIII
+  chr2    80   C    7    .G....,    I#IIIII
+  chr2    81   G    7    ...A..,    IIII#BI
+  chr2    82   A    8    ...G...,   I$IDIIII
+  chr2    83   T    8    .A.....N   IIIIIIII
+  chr2    84   A    9    ......T.   I$IIIIIII
+
+  FILE 3
+  chr1    1    A    1    .          I
+  chr1    2    T    2    G.         I$
+  chr1    3    C    2    .,         I@
+  chr1    4    C    3    ..N        III
+  chr1    42   C    5    ...N^:.    III@I
+  chr1    43   C    5    .N..^:.    IIIII
+  chr1    44   T    5    .A..,      IA@II
+  chr1    45   A    6    .N...^:.   IIIII$
+  chr1    46   G    6    .GN..^:.   I@IIII
+  chr1    47   A    7    ....^:..,  IIIII$I
+  chr2    73   T    5    .N..,      II$II
+  chr2    74   A    5    ....,      IIIII
+  chr2    75   T    5    ....,      IIIII
+  chr2    76   T    5    ....,      IIIII
+  chr2    77   C    5    ....,      IIIBI
+  chr2    78   T    5    ....,      IDIII
+
+To join on columns 3 and 4 combining on columns 1 and 2, columns 1-4 should be selected for the 'Include these columns' option, and column 2 selected for the 'hinge'. With these settings, the following would be output::
+
+  chr1    1    C    1              A    1
+  chr1    2    G    2              T    2
+  chr1    3    A    2    T    1    C    2
+  chr1    4    C    2    G    2    C    3
+  chr1    5    T    3    T    2
+  chr1    6    G    3    C    3
+  chr1    7              G    3
+  chr1    8              T    4
+  chr1    42                       C    5
+  chr1    43                       C    5
+  chr1    44                       T    5
+  chr1    45                       A    6
+  chr1    46                       G    6
+  chr1    47                       A    7
+  chr2    1    T    6
+  chr2    2    G    6
+  chr2    3    C    7
+  chr2    4    G    7
+  chr2    5    G    7
+  chr2    6    A    7
+  chr2    73                       T    5
+  chr2    74                       A    5
+  chr2    75                       T    5
+  chr2    76                       T    5
+  chr2    77             C    6    C    5
+  chr2    78             G    6    T    5
+  chr2    79             T    7
+  chr2    80             C    7
+  chr2    81             G    7
+  chr2    82             A    8
+  chr2    83             T    8
+  chr2    84             A    9
+
+**Example with missing columns**
+
+Given the following input files::
+
+  FILE 1
+  1   A
+  2   B   b
+  4   C   c
+  5   D
+  6   E   e
+
+  FILE 2
+  1   M   m
+  2   N
+  3   O   o
+  4   P   p
+  5   Q
+  7   R   r
+
+if we join only column 3 using column 1 as the hinge and with a fill value of '0', this is what will be output::
+
+  1   0   m
+  2   b   0
+  3   0   o
+  4   c   p
+  5   0   0
+  6   e   0
+  7   0   r
+
+Row 5 appears in both files with the missing column, so it has nothing but fill values in the output file.
+
+  </help>
+</tool>
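The 'Example with missing columns' above reduces to a small recipe: take the union of the hinge keys, then look each key up in every input, substituting the fill value where the key or the column is missing. A minimal sketch (illustrative only, not part of this changeset; each input is modelled as a dict from hinge key to the value of the joined column, with '' for a present row whose column is empty)::

    def column_join( files, fill="0" ):
        keys = sorted( set().union( *files ) )
        # a missing key or an empty value both become the fill value
        return [ [ key ] + [ f.get( key ) or fill for f in files ] for key in keys ]

    file1 = { 1: "", 2: "b", 4: "c", 5: "", 6: "e" }
    file2 = { 1: "m", 2: "", 3: "o", 4: "p", 5: "", 7: "r" }
    for row in column_join( [ file1, file2 ] ):
        print( row )
    # [1, '0', 'm'], [2, 'b', '0'], [3, '0', 'o'], [4, 'c', 'p'],
    # [5, '0', '0'], [6, 'e', '0'], [7, '0', 'r'] -- as in the help example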
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/complement.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/complement.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,61 @@
+<tool id="gops_complement_1" name="Complement">
+  <description>intervals of a dataset</description>
+  <command interpreter="python">gops_complement.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -l ${chromInfo} $allchroms</command>
+  <inputs>
+    <param format="interval" name="input1" type="data">
+      <label>Complement regions of</label>
+    </param>
+    <param name="allchroms" type="boolean" truevalue="--all" falsevalue="" label="Genome-wide complement">
+    </param>
+   </inputs>
+  <outputs>
+    <data format="input" name="output" metadata_source="input1" />
+  </outputs>
+  <code file="operation_filter.py"/>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" dbkey="?" />
+      <param name="allchroms" value="true" />
+      <output name="output" file="gops_complement_out.bed" />
+    </test>
+    <test>
+      <param name="input1" value="2_mod.bed" ftype="interval" dbkey="?" />
+      <param name="allchroms" value="true" />
+      <output name="output" file="gops_complement_out_diffCols.dat" />
+    </test>
+    <test>
+      <param name="input1" value="gops_bigint.interval" dbkey="?" />
+      <param name="allchroms" value="true" />
+      <output name="output" file="gops_complement_out2.bed" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns.
+
+This operation complements the regions of a set of intervals.  Regions are returned that represent the empty space in the input intervals.
+
+-----
+
+**Screencasts!**
+
+See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
+
+.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
+
+-----
+
+**Syntax**
+
+- **Genome-wide complement** will complement all chromosomes of the genome.  Leaving this option unchecked will only complement chromosomes present in the dataset.
+
+-----
+
+**Example**
+
+.. image:: ./static/operation_icons/gops_complement.gif
+
+</help>
+</tool>
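Complementing a dataset amounts to walking the sorted, merged intervals of each chromosome and emitting the gaps between them, plus the flanks out to the chromosome ends. A minimal single-chromosome sketch (illustrative only, not part of this changeset; assumes 0-based, half-open coordinates)::

    def complement( intervals, chrom_length ):
        # intervals: sorted, non-overlapping (start, end) pairs
        gaps, cursor = [], 0
        for start, end in intervals:
            if start > cursor:
                gaps.append( ( cursor, start ) )
            cursor = max( cursor, end )
        if cursor < chrom_length:
            gaps.append( ( cursor, chrom_length ) )
        return gaps

    assert complement( [ (10, 20), (30, 40) ], 50 ) == [ (0, 10), (20, 30), (40, 50) ]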
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/concat.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/concat.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+<tool id="gops_concat_1" name="Concatenate">
+  <description>two datasets into one dataset</description>
+  <command interpreter="python">gops_concat.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} $sameformat</command>
+  <inputs>
+    <param format="interval" name="input1" type="data" help="First dataset">
+      <label>Concatenate</label>
+    </param>
+    <param format="interval" name="input2" type="data" help="Second dataset">
+      <label>with</label>
+    </param>
+    <param name="sameformat" type="boolean" truevalue="--sameformat" falsevalue="" label="Both datasets are same filetype?" checked="true" help="If unchecked Second dataset will be forced into format of First dataset">
+    </param>
+   </inputs>
+  <outputs>
+    <data format="input" name="output" metadata_source="input1" />
+  </outputs>
+  <code file="operation_filter.py"/>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="sameformat" value="true" />
+      <output name="output" file="gops_concat_out1.bed" />     
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="1.interval" />
+      <param name="sameformat" value="false" />
+      <output name="output" file="gops_concat_out2.bed" />     
+    </test>   
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns.
+
+-----
+
+**Screencasts!**
+
+See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
+
+.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
+
+-----
+
+**Syntax**
+
+- **Both datasets are exactly the same filetype** will preserve all extra fields in both files.  Leaving this unchecked will force the second dataset to use the same column assignments for chrom, start, end and strand, but will fill extra fields with a period (.).  In both cases, the output fields are truncated or padded with fields of periods to maintain a truly tabular output.
+
+-----
+
+**Example**
+
+.. image:: ./static/operation_icons/gops_concatenate.gif
+
+</help>
+</tool>
\ No newline at end of file
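The column handling described above, when the datasets are not the same filetype, comes down to padding or truncating each row of the second dataset to the first dataset's width. A minimal sketch (illustrative only, not part of this changeset)::

    def normalize_row( fields, width, pad="." ):
        # truncate to the target width, then pad short rows with periods
        fields = fields[ :width ]
        return fields + [ pad ] * ( width - len( fields ) )

    assert normalize_row( [ "chr1", "10", "100" ], 5 ) == [ "chr1", "10", "100", ".", "." ]
    assert normalize_row( [ "chr1", "10", "100", "x", "y", "z" ], 5 ) == [ "chr1", "10", "100", "x", "y" ]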
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/coverage.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/coverage.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,91 @@
+<tool id="gops_coverage_1" name="Coverage">
+  <description>of a set of intervals on second set of intervals</description>
+  <command interpreter="python">gops_coverage.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol}</command>
+  <inputs>
+    <param format="interval" name="input1" type="data" help="First dataset">
+      <label>What portion of</label>
+    </param>
+    <param format="interval" name="input2" type="data" help="Second dataset">
+      <label>is covered by</label>
+    </param>
+   </inputs>
+  <outputs>
+    <data format="interval" name="output" metadata_source="input1" />
+  </outputs>
+  <code file="operation_filter.py"/>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <output name="output" file="gops_coverage_out.interval" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2_mod.bed" ftype="interval"/>
+      <output name="output" file="gops_coverage_out_diffCols.interval" />
+    </test>
+    <test>
+      <param name="input1" value="gops_bigint.interval" />
+      <param name="input2" value="gops_bigint2.interval" />
+      <output name="output" file="gops_coverage_out2.interval" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns.
+
+Find the coverage of intervals in the first dataset on intervals in the second dataset.  The coverage is added as two columns, the first being bases covered, and the second being the fraction of bases covered by that interval.
+
+-----
+
+**Screencasts!**
+
+See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
+
+.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
+
+-----
+
+**Example**
+
+
+    if the **First dataset** consists of genes::
+
+      chr11 5203271 5204877 NM_000518 0 -
+      chr11 5210634 5212434 NM_000519 0 -
+      chr11 5226077 5227663 NM_000559 0 -
+      chr11 5226079 5232587 BC020719  0 -
+      chr11 5230996 5232587 NM_000184 0 -
+
+    and the **Second dataset** consists of repeats::
+
+       chr11      5203895 5203991 L1MA6     500 +
+       chr11      5204163 5204239 A-rich    219 +
+       chr11      5211034 5211167 (CATATA)n 245 +
+       chr11      5211642 5211673 AT_rich    24 +
+       chr11      5226551 5226606 (CA)n     303 +
+       chr11      5228782 5228825 (TTTTTG)n 208 +
+       chr11      5229045 5229121 L1PA11    440 +
+       chr11      5229133 5229319 MER41A   1106 +
+       chr11      5229374 5229485 L2        244 -
+       chr11      5229751 5230083 MLT1A     913 -
+       chr11      5231469 5231526 (CA)n     330 +
+
+    the result is the coverage density of repeats in the genes::
+
+       chr11 5203271 5204877 NM_000518 0 - 172   0.107098
+       chr11 5210634 5212434 NM_000519 0 - 164   0.091111
+       chr11 5226077 5227663 NM_000559 0 -  55   0.034678
+       chr11 5226079 5232587 BC020719  0 - 860   0.132145
+       chr11 5230996 5232587 NM_000184 0 -  57   0.035827
+
+    For example, the following line of output::
+
+      chr11 5203271 5204877 NM_000518 0 - 172   0.107098
+
+    implies that 172 nucleotides, accounting for 10.7% of this interval (chr11:5203271-5204877), overlap with repetitive elements.
+
+</help>
+</tool>
\ No newline at end of file
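The two appended columns are a straightforward clipped-overlap computation: walk the covering intervals in order, count the bases that fall inside the target interval without double-counting overlaps, and divide by the interval's length. A minimal sketch (illustrative only, not part of this changeset)::

    def coverage( interval, features ):
        # interval: (start, end); features: iterable of (start, end) pairs
        start, end = interval
        covered, cursor = 0, start
        for f_start, f_end in sorted( features ):
            lo, hi = max( f_start, cursor ), min( f_end, end )
            if lo < hi:
                covered += hi - lo
                cursor = hi   # never count the same base twice
        return covered, covered / float( end - start )

    # two overlapping repeats covering bases 0-30 of a 100 bp interval:
    assert coverage( (0, 100), [ (0, 20), (10, 30) ] ) == ( 30, 0.3 )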
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/flanking_features.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/flanking_features.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+#By: Guruprasad Ananda
+"""
+Fetch closest up/downstream interval from features corresponding to every interval in primary
+
+usage: %prog primary_file features_file out_file direction
+    -1, --cols1=N,N,N,N: Columns for start, end, strand in first file
+    -2, --cols2=N,N,N,N: Columns for start, end, strand in second file
+    -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval
+    -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, traceback, fileinput
+from warnings import warn
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+from bx.intervals.io import *
+from bx.intervals.operations import quicksect
+from galaxy.datatypes.util.gff_util import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
+    # direction=1 for +ve strand upstream and -ve strand downstream cases; it is 0 for +ve strand downstream and -ve strand upstream cases
+    # threshold_up is equal to the interval start for +ve strand, and the interval end for -ve strand
+    # threshold_down is equal to the interval end for +ve strand, and the interval start for -ve strand
+    if direction == 1:
+        if node.maxend <= threshold_up:
+            if node.end == node.maxend:
+                report_func_up(node)
+            elif node.right and node.left:
+                if node.right.maxend == node.maxend:
+                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
+                elif node.left.maxend == node.maxend:
+                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
+            elif node.right and node.right.maxend == node.maxend:
+                get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
+            elif node.left and node.left.maxend == node.maxend:
+                get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
+        elif node.minend <= threshold_up:
+            if node.end <= threshold_up:
+                report_func_up(node)
+            if node.left and node.right:
+                if node.right.minend <= threshold_up:
+                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
+                if node.left.minend <= threshold_up:
+                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
+            elif node.left:
+                if node.left.minend <= threshold_up:
+                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
+            elif node.right:
+                if node.right.minend <= threshold_up:
+                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
+    elif direction == 0:
+        if node.start > threshold_down:
+            report_func_down(node)
+            if node.left:
+                get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
+        else:
+            if node.right:
+                get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
+
+def proximal_region_finder(readers, region, comments=True):
+    """
+    Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ].
+    Intervals are GenomicInterval objects.
+    """
+    primary = readers[0]
+    features =
[...]
+...t_up[res_ind].end)) <= abs(end - int(result_down[-1].start)):
+                            iter_val = [ interval, result_up[res_ind].other ]
+                        else:
+                            # The last element of result_down will be the closest element to the given interval
+                            iter_val = [ interval, result_down[-1].other ]
+                    elif result_up:
+                        iter_val = [ interval, result_up[res_ind].other ]
+                    elif result_down:
+                        # The last element of result_down will be the closest element to the given interval
+                        iter_val = [ interval, result_down[-1].other ]
+                    yield iter_val
+
+def main():
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
+        in1_gff_format = bool( options.gff1 )
+        in2_gff_format = bool( options.gff2 )
+        in_fname, in2_fname, out_fname, direction = args
+    except:
+        doc_optparse.exception()
+
+    # Set readers to handle either GFF or default format.
+    if in1_gff_format:
+        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
+    else:
+        in1_reader_wrapper = NiceReaderWrapper
+    if in2_gff_format:
+        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
+    else:
+        in2_reader_wrapper = NiceReaderWrapper
+
+    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            strand_col=strand_col_1,
+                            fix_strand=True )
+    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
+                            chrom_col=chr_col_2,
+                            start_col=start_col_2,
+                            end_col=end_col_2,
+                            strand_col=strand_col_2,
+                            fix_strand=True )
+
+    # Find flanking features.
+    out_file = open( out_fname, "w" )
+    try:
+        for result in proximal_region_finder([g1,g2], direction):
+            if type( result ) is list:
+                line, closest_feature = result
+                # Need to join outputs differently depending on file types.
+                if in1_gff_format:
+                    # Output is GFF with added attribute 'closest feature.'
+
+                    # Intervals are in BED coordinates; need to convert to GFF.
+                    line = convert_bed_coords_to_gff( line )
+                    closest_feature = convert_bed_coords_to_gff( closest_feature )
+
+                    # Replace double quotes with single quotes in closest feature's attributes.
+                    out_file.write( "%s closest_feature \"%s\" \n" %
+                                    ( "\t".join( line.fields ), \
+                                      "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
+                                     ) )
+                else:
+                    # Output is BED + closest feature fields.
+                    output_line_fields = []
+                    output_line_fields.extend( line.fields )
+                    output_line_fields.extend( closest_feature.fields )
+                    out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) )
+            else:
+                out_file.write( "%s\n" % result )
+    except ParseError, exc:
+        fail( "Invalid file format: %s" % str( exc ) )
+
+    print "Direction: %s" %(direction)
+    if g1.skipped > 0:
+        print skipped( g1, filedesc=" of 1st dataset" )
+    if g2.skipped > 0:
+        print skipped( g2, filedesc=" of 2nd dataset" )
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/flanking_features.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/flanking_features.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,127 @@
+<tool id="flanking_features_1" name="Fetch closest non-overlapping feature" version="4.0.1">
+  <description>for every interval</description>
+  <command interpreter="python">
+      flanking_features.py $input1 $input2 $out_file1 $direction
+      
+      #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+          -1 1,4,5,7 --gff1
+      #else:
+          -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol}
+      #end if
+          
+      #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+          -2 1,4,5,7 --gff2
+      #else:
+          -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol}
+      #end if
+  </command>
+  <inputs>
+    <param format="interval,gff" name="input1" type="data" label="For every interval in"/>
+    <param format="interval,gff" name="input2" type="data" label="Fetch closest feature(s) from"/>
+    <param name="direction" type="select" label="Located">
+      <option value="Either">Either Upstream or Downstream</option>
+      <option value="Both">Both Upstream and Downstream</option>
+      <option value="Upstream">Upstream</option>
+      <option value="Downstream">Downstream</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="4_windows.bed"/>
+      <param name="input2" value="4_windows_2.bed"/>
+      <param name="direction" value="Either"/>
+      <output name="out_file1" file="closest_features_either.interval"/>
+    </test>
+    <test>
+      <param name="input1" value="4_windows.bed"/>
+      <param name="input2" value="4_windows_2.bed"/>
+      <param name="direction" value="Both"/>
+      <output name="out_file1" file="closest_features.interval"/>
+    </test>
+    <test>
+      <param name="input1" value="4_windows.bed"/>
+      <param name="input2" value="4_windows_2.bed"/>
+      <param name="direction" value="Upstream"/>
+      <output name="out_file1" file="closest_features_up.interval"/>
+    </test>
+    <test>
+      <param name="input1" value="4_windows.bed"/>
+      <param name="input2" value="4_windows_2.bed"/>
+      <param name="direction" value="Downstream"/>
+      <output name="out_file1" file="closest_features_down.interval"/>
+    </test>
+    <test>
+      <param name="input1" value="4_windows.bed"/>
+      <param name="input2" value="4_windows_3.bed"/>
+      <param name="direction" value="Both"/>
+      <output name="out_file1" file="closest_features_both.interval"/>
+    </test>
+    <!-- Tests for GFF functionality. -->
+
+    <test>
+      <param name="input1" value="4_windows.bed"/>
+      <param name="input2" value="4_windows_2.gff"/>
+      <param name="direction" value="Either"/>
+      <output name="out_file1" file="closest_features_both.gff"/>
+    </test>
+    <test>
+      <param name="input1" value="4_windows.gff"/>
+      <param name="input2" value="4_windows_2.gff"/>
+      <param name="direction" value="Either"/>
+      <output name="out_file1" file="closest_features_both2.gff"/>
+    </test>
+    
+  </tests>
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+For every interval in the **interval** dataset, this tool fetches the **closest non-overlapping** upstream and / or downstream features from the **features** dataset.
+
+-----
+
+.. class:: warningmark
+
+**Note:** 
+
+Every line should contain at least 3 columns: chromosome number, start and stop coordinates. If any of these columns is missing or if start and stop coordinates are not numerical, the lines will be treated as invalid and skipped. The number of skipped lines is documented in the resulting history item as a "data issue".
+
+If the strand column is missing from your input interval dataset, the intervals will be considered to be on positive strand. You can add a strand column to your input dataset by using the *Text Manipulation->Add column* tool.
+
+For GFF files, features are added as a GTF-style attribute at the end of the line.
+
+-----
+
+**Example**
+
+If the **intervals** are::
+
+   chr1 10   100  Query1.1
+   chr1 500  1000 Query1.2
+   chr1 1100 1250 Query1.3
+
+and the **features** are::
+
+   chr1 120  180  Query2.1
+   chr1 140  200  Query2.2
+   chr1 580  1050 Query2.3
+   chr1 2000 2204 Query2.4
+   chr1 2500 3000 Query2.5
+
+Running this tool for **Both Upstream and Downstream** will return::
+
+   chr1 10   100  Query1.1 chr1 120  180  Query2.1
+   chr1 500  1000 Query1.2 chr1 140  200  Query2.2
+   chr1 500  1000 Query1.2 chr1 2000 2204 Query2.4
+   chr1 1100 1250 Query1.3 chr1 580  1050 Query2.3
+   chr1 1100 1250 Query1.3 chr1 2000 2204 Query2.4
+
+</help>  
+
+
+</tool>
\ No newline at end of file
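
The help text above walks through the closest-feature lookup by example. As a rough illustration only (modern Python; the `closest_flanks` helper below is hypothetical and not part of this changeset, which delegates the real work to flanking_features.py and bx-python), the selection rule reduces to: among same-chromosome, non-overlapping features, keep the one with the largest end at or before the interval's start and the one with the smallest start at or after its end::

    def closest_flanks(query, features):
        """Closest non-overlapping upstream/downstream features for one
        (chrom, start, end, name) query; None where no candidate exists."""
        chrom, start, end = query[:3]
        up = down = None
        for f in features:
            if f[0] != chrom or (f[1] < end and f[2] > start):
                continue                      # other chromosome, or overlaps
            if f[2] <= start and (up is None or f[2] > up[2]):
                up = f                        # nearest feature ending before us
            elif f[1] >= end and (down is None or f[1] < down[1]):
                down = f                      # nearest feature starting after us
        return up, down

    feats = [("chr1", 120, 180, "Query2.1"), ("chr1", 140, 200, "Query2.2"),
             ("chr1", 580, 1050, "Query2.3"), ("chr1", 2000, 2204, "Query2.4"),
             ("chr1", 2500, 3000, "Query2.5")]
    print(closest_flanks(("chr1", 500, 1000, "Query1.2"), feats))
    # (('chr1', 140, 200, 'Query2.2'), ('chr1', 2000, 2204, 'Query2.4'))

This reproduces the Query1.2 rows of the worked example in the help: Query2.3 is skipped because it overlaps, Query2.2 wins upstream over Query2.1, and Query2.4 wins downstream over Query2.5.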
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/get_flanks.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/get_flanks.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+#Done by: Guru
+
+"""
+Get Flanking regions.
+
+usage: %prog input out_file size direction region
+   -l, --cols=N,N,N,N: Columns for chrom, start, end, strand in file
+   -o, --off=N: Offset
+"""
+
+import sys, re, os
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():
+    try:
+        if int( sys.argv[3] ) < 0:
+            raise Exception
+    except:
+        stop_err( "Length of flanking region(s) must be a non-negative integer." )
+
+    # Parsing Command Line here
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols )
+        inp_file, out_file, size, direction, region = args
+        if strand_col_1 <= 0:
+            strand = "+"        #if strand is not defined, default it to +
+    except:
+        stop_err( "Metadata issue, correct the metadata attributes by clicking on the pencil icon in the history item." )
+    try:
+        offset = int(options.off)
+        size = int(size)
+    except:
+        stop_err( "Invalid offset or length entered. Try again by entering valid integer values." )
+
+    fo = open(out_file,'w')
+
+    skipped_lines = 0
+    first_invalid_line = 0
+    invalid_line = None
+    elems = []
+    j=0
+    for i, line in enumerate( file( inp_file ) ):
+        line = line.strip()
+        if line and (not line.startswith( '#' )) and line != '':
+            j+=1
+            try:
+                elems = line.split('\t')
+                #if the start and/or end columns are not numbers, skip that line.
+                assert int(elems[start_col_1])
+                assert int(elems[end_col_1])
+                if strand_col_1 != -1:
+                    strand = elems[strand_col_1]
+                #if the strand value is not + or -, skip that line.
+                assert strand in ['+', '-']
+                if direction == 'Upstream':
+                    if strand == '+':
+                        if region == 'end':
+                            elems[end_col_1] = str(int(elems[end_col_1]) + offset)
+                            elems[start_col_1] = str( int(elems[end_col_1]) - size )
+                        else:
+                            elems[end_col_1] = str(int(elems[start_col_1]) + offset)
+                            elems[start_col_1] = str( int(elems[end_col_1]) - size )
+                    elif strand == '-':
+                        if region == 'end':
+                            elems[start_col_1] = str(int(elems[start_col_1]) - offset)
+                            elems[end_col_1] = str(int(elems[start_col_1]) + size)
+                        else:
+                            elems[start_col_1] = str(int(elems[end_col_1]) - offset)
+                            elems[end_col_1] = str(int(elems[start_col_1]) + size)
+                    assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
+                    fo.write( "%s\n" % '\t'.join( elems ) )
+
+                elif direction == 'Downstream':
+                    if strand == '-':
+                        if region == 'start':
+                           elems[end_col_1] = str(int(elems[end_col_1]) - offset)
+                           elems[start_col_1] = str( int(elems[end_col_1]) - size )
+                        else:
+                           elems[end_col_1] = str(int(elems[start_col_1]) - offset)
+                           elems[start_col_1] = str( int(elems[end_col_1]) - size )
+                    elif strand == '+':
+                        if region == 'start':
+                            elems[start_col_1] = str(int(elems[start_col_1]) + offset)
[... truncated in the source diff view ...]
+                            fo.write( "%s\n" % '\t'.join( elems ) )
+                        else:
+                            start1 = str(int(elems[end_col_1]) - offset)
+                            end1 = str(int(start1) + size)
+                            start2 = str(int(elems[start_col_1]) - offset)
+                            end2 = str(int(start2) - size)
+                            elems[start_col_1]=start1
+                            elems[end_col_1]=end1
+                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
+                            fo.write( "%s\n" % '\t'.join( elems ) )
+                            elems[start_col_1]=end2
+                            elems[end_col_1]=start2
+                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
+                            fo.write( "%s\n" % '\t'.join( elems ) )
+                    elif strand == '+':
+                        if region == 'start':
+                            start = str(int(elems[start_col_1]) + offset)
+                            end1 = str(int(start) - size)
+                            end2 = str(int(start) + size)
+                            elems[start_col_1]=end1
+                            elems[end_col_1]=start
+                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
+                            fo.write( "%s\n" % '\t'.join( elems ) )
+                            elems[start_col_1]=start
+                            elems[end_col_1]=end2
+                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
+                            fo.write( "%s\n" % '\t'.join( elems ) )
+                        elif region == 'end':
+                            start = str(int(elems[end_col_1]) + offset)
+                            end1 = str(int(start) - size)
+                            end2 = str(int(start) + size)
+                            elems[start_col_1]=end1
+                            elems[end_col_1]=start
+                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
+                            fo.write( "%s\n" % '\t'.join( elems ) )
+                            elems[start_col_1]=start
+                            elems[end_col_1]=end2
+                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
+                            fo.write( "%s\n" % '\t'.join( elems ) )
+                        else:
+                            start1 = str(int(elems[start_col_1]) + offset)
+                            end1 = str(int(start1) - size)
+                            start2 = str(int(elems[end_col_1]) + offset)
+                            end2 = str(int(start2) + size)
+                            elems[start_col_1]=end1
+                            elems[end_col_1]=start1
+                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
+                            fo.write( "%s\n" % '\t'.join( elems ) )
+                            elems[start_col_1]=start2
+                            elems[end_col_1]=end2
+                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
+                            fo.write( "%s\n" % '\t'.join( elems ) )
+            except:
+                skipped_lines += 1
+                if not invalid_line:
+                    first_invalid_line = i + 1
+                    invalid_line = line
+    fo.close()
+
+    if skipped_lines == j:
+        stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." )
+    if skipped_lines > 0:
+        print 'Skipped %d invalid lines starting with #%dL "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+    print 'Location: %s, Region: %s, Flank-length: %d, Offset: %d ' %( direction, region, size, offset )
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/get_flanks.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/get_flanks.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,78 @@
+<tool id="get_flanks1" name="Get flanks">
+  <description>returns flanking region(s) for every gene</description>
+  <command interpreter="python">get_flanks.py $input $out_file1 $size $direction $region -o $offset -l ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol}</command>
+  <inputs>
+    <param format="interval" name="input" type="data" label="Select data"/>
+    <param name="region" type="select" label="Region">
+      <option value="whole" selected="true">Whole feature</option>
+      <option value="start">Around Start</option>
+      <option value="end">Around End</option>
+    </param>
+    <param name="direction" type="select" label="Location of the flanking region/s">
+      <option value="Upstream">Upstream</option>
+      <option value="Downstream">Downstream</option>
+      <option value="Both">Both</option>
+    </param>
+    <param name="offset" size="10" type="integer" value="0" label="Offset" help="Use positive values to offset co-ordinates in the direction of transcription and negative values to offset in the opposite direction."/>
+    <param name="size" size="10" type="integer" value="50" label="Length of the flanking region(s)" help="Use non-negative value for length"/>
+    
+    
+  </inputs>
+  <outputs>
+    <data format="interval" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="flanks_inp.bed"/>
+      <param name="offset" value="-500"/>
+      <param name="size" value="1000"/>
+      <param name="direction" value="Both"/>
+      <param name="region" value="whole"/>
+      <output name="out_file1" file="flanks_out1.bed"/>
+    </test>
+    <test>
+      <param name="input" value="flanks_inp.bed"/>
+      <param name="offset" value="200"/>
+      <param name="size" value="1000"/>
+      <param name="direction" value="Downstream"/>
+      <param name="region" value="start" />
+      <output name="out_file1" file="flanks_out2.bed"/>
+    </test>
+  </tests>
+ <help> 
+
+This tool finds the upstream and/or downstream flanking region(s) of all the selected regions in the input file. 
+
+**Note:** Every line should contain at least 3 columns: chromosome, start, and stop co-ordinates. If any of these columns is missing, or if the start or stop co-ordinate is not numerical, the line is treated as invalid and skipped. The number of skipped lines is documented in the resulting history item as a "Data issue".
+
+-----
+
+
+**Example 1**
+
+- For the following query::
+
+   chr22  1000  7000  NM_174568 0 +
+
+- running get flanks with Region: Around start, Offset: -200, Flank-length: 300 and Location: Upstream will return **(Red: Query positive strand; Blue: Flanks output)**::
+
+   chr22  500  800  NM_174568 0 +
+
+.. image:: ./static/operation_icons/flanks_ex1.gif
+
+**Example 2**
+
+- For the following query::
+
+   chr22  1000  7000  NM_028946 0 -
+
+- running get flanks with Region: Whole, Offset: 200, Flank-length: 300 and Location: Downstream will return **(Orange: Query negative strand; Magenta: Flanks output)**::
+
+   chr22  500  800  NM_028946 0 -
+
+.. image:: ./static/operation_icons/flanks_ex2.gif
+
+</help>  
+
+
+</tool>
\ No newline at end of file
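
The arithmetic behind Examples 1 and 2 can be checked in a few lines. This is a hedged sketch in modern Python covering only those two branches of get_flanks.py; the `flank` helper is hypothetical, not part of the changeset::

    def flank(start, end, strand, direction, region, size, offset):
        """Flank co-ordinates for two of the get_flanks.py branches."""
        if direction == "Upstream" and strand == "+" and region == "start":
            new_end = start + offset          # offset shifts with transcription
            return new_end - size, new_end
        if direction == "Downstream" and strand == "-" and region == "whole":
            new_end = start - offset          # '-' strand: downstream runs leftward
            return new_end - size, new_end
        raise NotImplementedError("other branches omitted from this sketch")

    print(flank(1000, 7000, "+", "Upstream", "start", 300, -200))   # (500, 800)
    print(flank(1000, 7000, "-", "Downstream", "whole", 300, 200))  # (500, 800)

Both calls reproduce the `chr22 500 800` outputs shown in the examples above.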
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/gops_basecoverage.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/gops_basecoverage.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+"""
+Count total base coverage.
+
+usage: %prog in_file out_file
+    -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in file
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, traceback, fileinput
+from warnings import warn
+from bx.intervals import *
+from bx.intervals.io import *
+from bx.intervals.operations.base_coverage import *
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+    upstream_pad = 0
+    downstream_pad = 0
+
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        in_fname, out_fname = args
+    except:
+        doc_optparse.exception()
+        
+    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            strand_col = strand_col_1,
+                            fix_strand=True )
+    
+    try:
+        bases = base_coverage(g1)
+    except ParseError, exc:
+        fail( "Invalid file format: %s" % str( exc ) )
+    out_file = open( out_fname, "w" )
+    out_file.write( "%s\n" % str( bases ) )
+    out_file.close()
+    if g1.skipped > 0:
+        print skipped( g1, filedesc="" )
+
+if __name__ == "__main__":
+    main()
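
bx-python's base_coverage() does the counting here; conceptually the result is just the size of the union of all intervals. A standalone sketch of that idea (the `total_base_coverage` helper is hypothetical, not the bx implementation)::

    def total_base_coverage(intervals):
        """Count bases covered by the union of (chrom, start, end) intervals."""
        total = 0
        by_chrom = {}
        for chrom, start, end in intervals:
            by_chrom.setdefault(chrom, []).append((start, end))
        for spans in by_chrom.values():
            spans.sort()
            cur_start, cur_end = spans[0]
            for start, end in spans[1:]:
                if start > cur_end:           # disjoint: bank the current run
                    total += cur_end - cur_start
                    cur_start, cur_end = start, end
                else:                         # overlapping/adjacent: extend it
                    cur_end = max(cur_end, end)
            total += cur_end - cur_start
        return total

    print(total_base_coverage([("chr1", 10, 100), ("chr1", 50, 150)]))  # 140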
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/gops_cluster.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/gops_cluster.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+"""
+Cluster regions of intervals.
+
+usage: %prog in_file out_file
+    -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in file
+    -d, --distance=N: Maximum distance between clustered intervals
+    -v, --overlap=N: Minimum overlap required (negative distance)
+    -m, --minregions=N: Minimum regions per cluster
+    -o, --output=N: 1) merged 2) filtered 3) clustered 4) minimum 5) maximum
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, traceback, fileinput
+from warnings import warn
+from bx.intervals import *
+from bx.intervals.io import *
+from bx.intervals.operations.find_clusters import *
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+    distance = 0
+    minregions = 2
+    output = 1
+    upstream_pad = 0
+    downstream_pad = 0
+
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        if options.distance: distance = int( options.distance )
+        if options.overlap: distance = -1 * int( options.overlap )
+        if options.output: output = int( options.output )
+        if options.minregions: minregions = int( options.minregions )
+        in_fname, out_fname = args
+    except:
+        doc_optparse.exception()
+
+    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            strand_col=strand_col_1,
+                            fix_strand=True )
+
+    # Get the cluster tree
+    try:
+        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions)
+    except ParseError, exc:
+        fail( "Invalid file format: %s" % str( exc ) )
+
+    f1 = open( in_fname, "r" )
+    out_file = open( out_fname, "w" )
+    
+    # If "merge"
+    if output == 1:
+        fields = ["."  for x in range(max(g1.chrom_col, g1.start_col, g1.end_col)+1)]
+        for chrom, tree in clusters.items():
+            for start, end, lines in tree.getregions():
+                fields[g1.chrom_col] = chrom
+                fields[g1.start_col] = str(start)
+                fields[g1.end_col] = str(end)
+                out_file.write( "%s\n" % "\t".join( fields ) )
+
+    # If "filtered" we preserve order of file and comments, etc.
+    if output == 2:
+        linenums = dict()
+        for chrom, tree in clusters.items():
+            for linenum in tree.getlines():
+                linenums[linenum] = 0
+        linenum = -1
+        f1.seek(0)
+        for line in f1.readlines():
+            linenum += 1
+            if linenum in linenums or linenum in extra:
+                out_file.write( "%s\n" % line.rstrip( "\n\r" ) )
+
+    # If "clustered" we output original intervals, but near each other (i.e. clustered)
+    if output == 3:
+        linenums = list()
+        f1.seek(0)
+        fileLines = f1.readlines()
+        for chrom, tree in clusters.items():
+            for linenum in tree.getlines():
+                out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) )
+
+    # If "minimum" we output the smallest interval in each cluster
+    if output == 4 or output == 5:
+        linenums = list()
+        f1.seek(0)
+        fileLines = f1.readlines()
+        for chrom, tree in clusters.items():
+            regions = tree.getregions()
+            for start, end, lines in tree.getregions():
+                outsize = -1
+                outinterval = None
+                for line in lines:
+                    # three nested for loops?
+                    # should only execute this code once per line
+                    fileline = fileLines[line].rstrip("\n\r")
+                    try:
+                        cluster_interval = GenomicInterval( g1, fileline.split("\t"), 
+                                                            g1.chrom_col, 
+                                                            g1.start_col,
+                                                            g1.end_col, 
+                                                            g1.strand_col, 
+                                                            g1.default_strand,
+                                                            g1.fix_strand )
+                    except Exception, exc:
+                        print >> sys.stderr, str( exc )
+                        f1.close()
+                        sys.exit()
+                    interval_size = cluster_interval.end - cluster_interval.start
+                    if outsize == -1 or \
+                       ( outsize > interval_size and output == 4 ) or \
+                       ( outsize < interval_size and output == 5 ) :
+                        outinterval = cluster_interval
+                        outsize = interval_size
+                out_file.write( "%s\n" % outinterval )
+
+    f1.close()
+    out_file.close()
+    
+    if g1.skipped > 0:
+        print skipped( g1, filedesc="" )
+
+if __name__ == "__main__":
+    main()
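
The clustering itself is bx-python's find_clusters(). As a sketch of the rule the options map onto (a gap of at most --distance joins intervals into one cluster, --overlap is simply a negative distance, and clusters with fewer than --minregions members are dropped), assuming a single chromosome and ignoring the five output modes; the `cluster` helper is hypothetical::

    def cluster(intervals, distance=0, minregions=2):
        """Group sorted (start, end) intervals whose gap <= distance;
        return merged (start, end, members) for qualifying clusters."""
        clusters = []
        for start, end in sorted(intervals):
            if clusters and start - clusters[-1][1] <= distance:
                run_start, run_end, members = clusters[-1]
                clusters[-1] = (run_start, max(run_end, end), members + 1)
            else:
                clusters.append((start, end, 1))
        return [c for c in clusters if c[2] >= minregions]

    print(cluster([(10, 100), (90, 200), (500, 600)]))  # [(10, 200, 2)]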
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/gops_complement.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/gops_complement.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+"""
+Complement regions.
+
+usage: %prog in_file out_file
+    -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in file
+    -l, --lengths=N: Filename of .len file for species (chromosome lengths)
+    -a, --all: Complement all chromosomes (Genome-wide complement)
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, traceback, fileinput
+from warnings import warn
+from bx.intervals import *
+from bx.intervals.io import *
+from bx.intervals.operations.complement import complement
+from bx.intervals.operations.subtract import subtract
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+    allchroms = False
+    upstream_pad = 0
+    downstream_pad = 0
+
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        lengths = options.lengths
+        if options.all: allchroms = True
+        in_fname, out_fname = args
+    except:
+        doc_optparse.exception()
+
+    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            strand_col=strand_col_1,
+                            fix_strand=True )
+
+    lens = dict()
+    chroms = list()
+    # dbfile is used to determine the length of each chromosome.  The lengths
+    # are added to the lens dict and passed to the complement operation code in bx.
+    dbfile = fileinput.FileInput( lengths )
+    
+    if dbfile:
+        if not allchroms:
+            try:
+                for line in dbfile:
+                    fields = line.split("\t")
+                    lens[fields[0]] = int(fields[1])
+            except:
+                # assume LEN doesn't exist or is corrupt somehow
+                pass
+        elif allchroms:
+            try:
+                for line in dbfile:
+                    fields = line.split("\t")
+                    end = int(fields[1])
+                    chroms.append("\t".join([fields[0],"0",str(end)]))
+            except:
+                pass
+
+    # Safety...if the dbfile didn't exist and we're on allchroms, then
+    # default to generic complement
+    if allchroms and len(chroms) == 0:
+        allchroms = False
+
+    if allchroms:
+        chromReader = GenomicIntervalReader(chroms)
+        generator = subtract([chromReader, g1])
+    else:
+        generator = complement(g1, lens)
+
+    out_file = open( out_fname, "w" )
+
+    try:
+        for interval in generator:
+            if type( interval ) is GenomicInterval:
+                out_file.write( "%s\n" % "\t".join( interval ) )
+            else:
+                out_file.write( "%s\n" % interval )
+    except ParseError, exc:
+        out_file.close()
+        fail( "Invalid file format: %s" % str( exc ) )
+
+    out_file.close()
+
+    if g1.skipped > 0:
+        print skipped( g1, filedesc="" )
+
+if __name__ == "__main__":
+    main()
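
Given a .len file of chrom<TAB>length lines, the complement is just the gaps left uncovered up to each chromosome's length. A single-chromosome sketch (the `complement_gaps` helper is hypothetical, standing in for bx-python's complement())::

    def complement_gaps(intervals, chrom_len):
        """Return (start, end) gaps not covered by the given intervals
        on one chromosome of length chrom_len."""
        gaps, cursor = [], 0
        for start, end in sorted(intervals):
            if start > cursor:
                gaps.append((cursor, start))
            cursor = max(cursor, end)
        if cursor < chrom_len:
            gaps.append((cursor, chrom_len))
        return gaps

    print(complement_gaps([(100, 200), (150, 400)], 1000))
    # [(0, 100), (400, 1000)]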
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/gops_concat.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/gops_concat.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+"""
+Concatenate two bed files.  The concatenated files are returned in the
+same format as the first.  If --sameformat is specified, then all
+columns will be treated as the same, and all fields will be saved,
+although the output will be trimmed to match the primary input.  In
+addition, if --sameformat is specified, missing fields will be padded
+with a period(.).
+
+usage: %prog in_file_1 in_file_2 out_file
+    -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in first file
+    -2, --cols2=N,N,N,N: Columns for chrom, start, end, strand in second file
+    -s, --sameformat: All files are precisely the same format.
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, traceback, fileinput
+from warnings import warn
+from bx.intervals import *
+from bx.intervals.io import *
+from bx.intervals.operations.concat import *
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+    sameformat=False
+    upstream_pad = 0
+    downstream_pad = 0
+
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
+        if options.sameformat: sameformat = True
+        in_file_1, in_file_2, out_fname = args
+    except:
+        doc_optparse.exception()
+
+    g1 = NiceReaderWrapper( fileinput.FileInput( in_file_1 ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            fix_strand=True )
+
+    g2 = NiceReaderWrapper( fileinput.FileInput( in_file_2 ),
+                            chrom_col=chr_col_2,
+                            start_col=start_col_2,
+                            end_col=end_col_2,
+                            strand_col=strand_col_2,
+                            fix_strand=True )
+
+    if strand_col_1 >= 0:
+        g1.strand_col = strand_col_1
+
+    out_file = open( out_fname, "w" )
+
+    try:
+        for line in concat( [g1, g2], sameformat=sameformat ):
+            if type( line ) is GenomicInterval:
+                out_file.write( "%s\n" % "\t".join( line.fields ) )
+            else:
+                out_file.write( "%s\n" % line )
+    except ParseError, exc:
+        out_file.close()
+        fail( "Invalid file format: %s" % str( exc ) )
+
+    out_file.close()
+
+    if g1.skipped > 0:
+        print skipped( g1, filedesc=" of 1st dataset" )
+    if g2.skipped > 0:
+        print skipped( g2, filedesc=" of 2nd dataset" )
+        
+if __name__ == "__main__":
+    main()
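
The --sameformat behaviour described in the docstring (trim every row to the first file's width, pad missing fields with a period) is easy to picture on plain row lists. A loose sketch with a hypothetical `concat_rows` helper::

    def concat_rows(rows1, rows2):
        """Concatenate two lists of column lists, trimming or padding
        each row with "." to match the width of the first file's rows."""
        width = len(rows1[0])
        out = []
        for row in rows1 + rows2:
            out.append(row[:width] + ["."] * (width - len(row)))
        return out

    print(concat_rows([["chr1", "10", "100", "a"]], [["chr1", "20", "80"]]))
    # [['chr1', '10', '100', 'a'], ['chr1', '20', '80', '.']]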
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/gops_coverage.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/gops_coverage.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+"""
+Calculate coverage of one query on another, and append the coverage to
+the last two columns as bases covered and percent coverage.
+
+usage: %prog bed_file_1 bed_file_2 out_file
+    -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in first file
+    -2, --cols2=N,N,N,N: Columns for chrom, start, end, strand in second file
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, traceback, fileinput
+from warnings import warn
+from bx.intervals import *
+from bx.intervals.io import *
+from bx.intervals.operations.coverage import *
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+    upstream_pad = 0
+    downstream_pad = 0
+
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )      
+        in_fname, in2_fname, out_fname = args
+    except:
+        doc_optparse.exception()
+
+    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            strand_col=strand_col_1,
+                            fix_strand=True )
+    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
+                            chrom_col=chr_col_2,
+                            start_col=start_col_2,
+                            end_col=end_col_2,
+                            strand_col=strand_col_2,
+                            fix_strand=True )
+
+    out_file = open( out_fname, "w" )
+
+    try:
+        for line in coverage( [g1,g2] ):
+            if type( line ) is GenomicInterval:
+                out_file.write( "%s\n" % "\t".join( line.fields ) )
+            else:
+                out_file.write( "%s\n" % line )
+    except ParseError, exc:
+        out_file.close()
+        fail( "Invalid file format: %s" % str( exc ) )
+
+    out_file.close()
+
+    if g1.skipped > 0:
+        print skipped( g1, filedesc=" of 1st dataset" )
+    if g2.skipped > 0:
+        print skipped( g2, filedesc=" of 2nd dataset" )
+
+if __name__ == "__main__":
+    main()
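
Per interval, the two appended columns are the number of covered bases and the covered fraction. A brute-force sketch for a single interval (the `coverage_row` helper is hypothetical, and the per-base set is only sensible for toy inputs; the real tool streams bx-python's coverage())::

    def coverage_row(interval, features):
        """Append bases covered and fraction covered to one
        (chrom, start, end) interval."""
        chrom, start, end = interval
        covered = set()
        for f_chrom, f_start, f_end in features:
            if f_chrom == chrom:
                covered.update(range(max(start, f_start), min(end, f_end)))
        bases = len(covered)
        return (chrom, start, end, bases, bases / (end - start))

    print(coverage_row(("chr1", 0, 100), [("chr1", 50, 120), ("chr1", 90, 95)]))
    # ('chr1', 0, 100, 50, 0.5)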
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/gops_intersect.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/gops_intersect.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+"""
+Find regions of first interval file that overlap regions in a second interval file.
+Interval files can either be BED or GFF format.
+
+usage: %prog interval_file_1 interval_file_2 out_file
+    -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in first file
+    -2, --cols2=N,N,N,N: Columns for chrom, start, end, strand in second file
+    -m, --mincols=N: Require this much overlap (default 1bp)
+    -p, --pieces: just print pieces of second set (after padding)
+    -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval
+    -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, traceback, fileinput
+from warnings import warn
+from bx.intervals import *
+from bx.intervals.io import *
+from bx.intervals.operations.intersect import *
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+from galaxy.datatypes.util.gff_util import GFFFeature, GFFReaderWrapper, convert_bed_coords_to_gff
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+    mincols = 1
+    upstream_pad = 0
+    downstream_pad = 0
+
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )      
+        if options.mincols: mincols = int( options.mincols )
+        pieces = bool( options.pieces )
+        in1_gff_format = bool( options.gff1 )
+        in2_gff_format = bool( options.gff2 )
+        in_fname, in2_fname, out_fname = args
+    except:
+        doc_optparse.exception()
+        
+    # Set readers to handle either GFF or default format.
+    if in1_gff_format:
+        in1_reader_wrapper = GFFReaderWrapper
+    else:
+        in1_reader_wrapper = NiceReaderWrapper
+    if in2_gff_format:
+        in2_reader_wrapper = GFFReaderWrapper
+    else:
+        in2_reader_wrapper = NiceReaderWrapper
+        
+    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            strand_col=strand_col_1,
+                            fix_strand=True )
+    if in1_gff_format:
+        # Intersect requires coordinates in BED format.
+        g1.convert_to_bed_coord=True
+    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
+                            chrom_col=chr_col_2,
+                            start_col=start_col_2,
+                            end_col=end_col_2,
+                            strand_col=strand_col_2,
+                            fix_strand=True )
+    if in2_gff_format:
+        # Intersect requires coordinates in BED format.
+        g2.convert_to_bed_coord=True
+        
+    out_file = open( out_fname, "w" )
+    try:
+        for feature in intersect( [g1,g2], pieces=pieces, mincols=mincols ):
+            if isinstance( feature, GFFFeature ):
+                # Convert back to GFF coordinates since reader converted automatically.
+                convert_bed_coords_to_gff( feature )
+                for interval in feature.intervals:
+                    out_file.write( "%s\n" % "\t".join( interval.fields ) )
+            elif isinstance( feature, GenomicInterval ):
+                out_file.write( "%s\n" % "\t".join( feature.fields ) )
+            else:
+                out_file.write( "%s\n" % feature )
+    except ParseError, e:
+        out_file.close()
+        fail( "Invalid file format: %s" % str( e ) )
+
+    out_file.close()
+
+    if g1.skipped > 0:
+        print skipped( g1, filedesc=" of 1st dataset" )
+    if g2.skipped > 0:
+        print skipped( g2, filedesc=" of 2nd dataset" )
+
+if __name__ == "__main__":
+    main()
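
Stripped of the GFF handling, whole-interval intersection reduces to an overlap test against --mincols. A quadratic single-chromosome sketch (the `intersect_intervals` helper is hypothetical; the real code relies on bx-python's readers)::

    def intersect_intervals(queries, features, mincols=1):
        """Yield (start, end) queries overlapping any feature by at
        least mincols bases."""
        for q_start, q_end in queries:
            for f_start, f_end in features:
                if min(q_end, f_end) - max(q_start, f_start) >= mincols:
                    yield (q_start, q_end)
                    break

    print(list(intersect_intervals([(10, 100), (200, 300)], [(90, 250)],
                                   mincols=10)))
    # [(10, 100), (200, 300)]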
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/gops_join.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/gops_join.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+"""
+Join two sets of intervals using their overlap as the key.
+
+usage: %prog bed_file_1 bed_file_2 out_file
+    -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in first file
+    -2, --cols2=N,N,N,N: Columns for chrom, start, end, strand in second file
+    -m, --mincols=N: Require this much overlap (default 1bp)
+    -f, --fill=N: none, right, left, both
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, traceback, fileinput
+from warnings import warn
+from bx.intervals import *
+from bx.intervals.io import *
+from bx.intervals.operations.join import *
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+    mincols = 1
+    upstream_pad = 0
+    downstream_pad = 0
+    leftfill = False
+    rightfill = False
+    
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )      
+        if options.mincols: mincols = int( options.mincols )
+        if options.fill:
+            if options.fill == "both":
+                rightfill = leftfill = True
+            else:
+                rightfill = options.fill == "right"
+                leftfill = options.fill == "left"
+        in_fname, in2_fname, out_fname = args
+    except:
+        doc_optparse.exception()
+
+    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            strand_col=strand_col_1,
+                            fix_strand=True )
+    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
+                            chrom_col=chr_col_2,
+                            start_col=start_col_2,
+                            end_col=end_col_2,
+                            strand_col=strand_col_2,
+                            fix_strand=True )
+
+    out_file = open( out_fname, "w" )
+
+    try:
+        for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill):
+            if type( outfields ) is list:
+                out_file.write( "%s\n" % "\t".join( outfields ) )
+            else:
+                out_file.write( "%s\n" % outfields )
+    except ParseError, exc:
+        out_file.close()
+        fail( "Invalid file format: %s" % str( exc ) )
+    except MemoryError:
+        out_file.close()
+        fail( "Input datasets were too large to complete the join operation." )
+
+    out_file.close()
+
+    if g1.skipped > 0:
+        print skipped( g1, filedesc=" of 1st dataset" )
+    if g2.skipped > 0:
+        print skipped( g2, filedesc=" of 2nd dataset" )
+
+if __name__ == "__main__":
+    main()
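
The join plus its --fill modes can be sketched over (start, end) tuples: matched pairs are emitted side by side, and unmatched rows from either side are padded with "." when the corresponding fill is enabled. The `join_rows` helper is hypothetical (single chromosome, quadratic scan)::

    def join_rows(rows1, rows2, mincols=1, leftfill=False, rightfill=False):
        """Overlap-keyed join; '.' fills stand in for missing columns."""
        null = (".", ".")
        matched2, out = set(), []
        for r1 in rows1:
            hit = False
            for j, r2 in enumerate(rows2):
                if min(r1[1], r2[1]) - max(r1[0], r2[0]) >= mincols:
                    out.append(r1 + r2)
                    matched2.add(j)
                    hit = True
            if not hit and rightfill:
                out.append(r1 + null)         # keep unmatched first-dataset rows
        if leftfill:
            out += [null + r2 for j, r2 in enumerate(rows2) if j not in matched2]
        return out

    print(join_rows([(10, 100), (500, 1000)], [(20, 80)], rightfill=True))
    # [(10, 100, 20, 80), (500, 1000, '.', '.')]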
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/gops_merge.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/gops_merge.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+"""
+Merge overlapping regions.
+
+usage: %prog in_file out_file
+    -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in file
+    -m, --mincols=N: Require this much overlap (default 1bp)
+    -3, --threecol: Output 3 column bed
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, traceback, fileinput
+from warnings import warn
+from bx.intervals import *
+from bx.intervals.io import *
+from bx.intervals.operations.merge import *
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+    mincols = 1
+    upstream_pad = 0
+    downstream_pad = 0
+
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        if options.mincols: mincols = int( options.mincols )
+        in_fname, out_fname = args
+    except:
+        doc_optparse.exception()
+
+    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            strand_col = strand_col_1,
+                            fix_strand=True )
+
+    out_file = open( out_fname, "w" )
+
+    try:
+        for line in merge(g1,mincols=mincols):
+            if options.threecol:
+                if type( line ) is GenomicInterval:
+                    out_file.write( "%s\t%s\t%s\n" % ( line.chrom, str( line.start ), str( line.end ) ) )
+                elif type( line ) is list:
+                    out_file.write( "%s\t%s\t%s\n" % ( line[chr_col_1], str( line[start_col_1] ), str( line[end_col_1] ) ) )
+                else:
+                    out_file.write( "%s\n" % line )
+            else:
+                if type( line ) is GenomicInterval:
+                    out_file.write( "%s\n" % "\t".join( line.fields ) )
+                elif type( line ) is list:
+                    out_file.write( "%s\n" % "\t".join( line ) )
+                else:
+                    out_file.write( "%s\n" % line )
+    except ParseError, exc:
+        out_file.close()
+        fail( "Invalid file format: %s" % str( exc ) )
+
+    out_file.close()
+
+    if g1.skipped > 0:
+        print skipped( g1, filedesc=" of 1st dataset" )
+
+if __name__ == "__main__":
+    main()
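
Merging reduces to one sorted sweep: an interval that overlaps the current run by at least --mincols bases extends it, anything else starts a new run. A single-chromosome sketch (the `merge_intervals` helper is hypothetical, standing in for bx-python's merge())::

    def merge_intervals(intervals, mincols=1):
        """Merge (start, end) intervals overlapping by >= mincols bases."""
        merged = []
        for start, end in sorted(intervals):
            if merged and min(merged[-1][1], end) - start >= mincols:
                merged[-1] = (merged[-1][0], max(merged[-1][1], end))
            else:
                merged.append((start, end))
        return merged

    print(merge_intervals([(10, 100), (90, 200), (300, 400)]))
    # [(10, 200), (300, 400)]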
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/gops_subtract.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/gops_subtract.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+"""
+Find regions of first interval file that do not overlap regions in a second
+interval file. Interval files can either be BED or GFF format.
+
+usage: %prog interval_file_1 interval_file_2 out_file
+    -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in first file
+    -2, --cols2=N,N,N,N: Columns for chrom, start, end, strand in second file
+    -m, --mincols=N: Require this much overlap (default 1bp)
+    -p, --pieces: just print pieces of second set (after padding)
+    -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval
+    -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, traceback, fileinput
+from warnings import warn
+from bx.intervals import *
+from bx.intervals.io import *
+from bx.intervals.operations.subtract import *
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+from galaxy.datatypes.util.gff_util import GFFFeature, GFFReaderWrapper, convert_bed_coords_to_gff
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+    mincols = 1
+    upstream_pad = 0
+    downstream_pad = 0
+
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )      
+        if options.mincols: mincols = int( options.mincols )
+        pieces = bool( options.pieces )
+        in1_gff_format = bool( options.gff1 )
+        in2_gff_format = bool( options.gff2 )
+        in_fname, in2_fname, out_fname = args
+    except:
+        doc_optparse.exception()
+
+    # Set readers to handle either GFF or default format.
+    if in1_gff_format:
+        in1_reader_wrapper = GFFReaderWrapper
+    else:
+        in1_reader_wrapper = NiceReaderWrapper
+    if in2_gff_format:
+        in2_reader_wrapper = GFFReaderWrapper
+    else:
+        in2_reader_wrapper = NiceReaderWrapper
+        
+    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            strand_col=strand_col_1,
+                            fix_strand=True )
+    if in1_gff_format:
+        # Subtract requires coordinates in BED format.
+        g1.convert_to_bed_coord=True
+        
+    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
+                            chrom_col=chr_col_2,
+                            start_col=start_col_2,
+                            end_col=end_col_2,
+                            strand_col=strand_col_2,
+                            fix_strand=True )
+    if in2_gff_format:
+        # Subtract requires coordinates in BED format.
+        g2.convert_to_bed_coord=True
+        
+    out_file = open( out_fname, "w" )
+    try:
+        for feature in subtract( [g1,g2], pieces=pieces, mincols=mincols ):
+            if isinstance( feature, GFFFeature ):
+                # Convert back to GFF coordinates since reader converted automatically.
+                convert_bed_coords_to_gff( feature )
+                for interval in feature.intervals:
+                    out_file.write( "%s\n" % "\t".join( interval.fields ) )
+            elif isinstance( feature, GenomicInterval ):
+                out_file.write( "%s\n" % "\t".join( feature.fields ) )
+            else:
+                out_file.write( "%s\n" % feature )
+    except ParseError, exc:
+        out_file.close()
+        fail( "Invalid file format: %s" % str( exc ) )
+
+    out_file.close()
+
+    if g1.skipped > 0:
+        print skipped( g1, filedesc=" of 2nd dataset" )
+    if g2.skipped > 0:
+        print skipped( g2, filedesc=" of 1st dataset" )
+
+if __name__ == "__main__":
+    main()
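
In the spirit of the --pieces option, subtraction keeps whatever is left of each first-dataset interval once the overlapping spans are cut away. A single-interval, single-chromosome sketch (the `subtract_pieces` helper is hypothetical)::

    def subtract_pieces(interval, features):
        """Return the (start, end) pieces of one interval that survive
        removal of the given feature spans."""
        pieces = [interval]
        for f_start, f_end in sorted(features):
            next_pieces = []
            for start, end in pieces:
                if f_start > start:           # keep the left remainder
                    next_pieces.append((start, min(end, f_start)))
                if f_end < end:               # keep the right remainder
                    next_pieces.append((max(start, f_end), end))
            pieces = next_pieces              # fully covered pieces drop out
        return pieces

    print(subtract_pieces((0, 100), [(20, 40), (60, 70)]))
    # [(0, 20), (40, 60), (70, 100)]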
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/intersect.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/intersect.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,137 @@
+<tool id="gops_intersect_1" name="Intersect">
+  <description>the intervals of two datasets</description>
+  <command interpreter="python">gops_intersect.py 
+      $input1 $input2 $output
+
+      #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+        -1 1,4,5,7 --gff1
+      #else:
+        -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol}
+      #end if
+
+      #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+        -2 1,4,5,7 --gff2
+      #else:
+          -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} 
+      #end if
+
+      -m $min $returntype
+  </command>
+  <inputs>
+      <param name="returntype" type="select" label="Return" help="(see figure below)">
+          <option value="">Overlapping Intervals</option>
+          <option value="-p">Overlapping pieces of Intervals</option>
+      </param>
+      <param format="interval,gff" name="input1" type="data" help="First dataset">
+          <label>of</label>
+      </param>
+      <param format="interval,gff" name="input2" type="data" help="Second dataset">
+          <label>that intersect</label>
+      </param>
+      <param name="min" size="4" type="integer" value="1" min="1" help="(bp)">
+          <label>for at least</label>
+      </param>
+  </inputs>
+  <outputs>
+      <data format="input" name="output" metadata_source="input1"/>
+  </outputs>
+  <code file="operation_filter.py"/>
+  <trackster_conf/>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="min" value="1" />
+      <param name="returntype" value="" />
+      <output name="output" file="gops_intersect_out.bed" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2_mod.bed" ftype="interval"/>
+      <param name="min" value="1" />
+      <param name="returntype" value="" />
+      <output name="output" file="gops_intersect_diffCols.bed" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2_mod.bed" ftype="interval"/>
+      <param name="min" value="1" />
+      <param name="returntype" value="Overlapping pieces of Intervals" />
+      <output name="output" file="gops_intersect_p_diffCols.bed" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="min" value="10" />
+      <param name="returntype" value="Overlapping pieces of Intervals" />
+      <output name="output" file="gops_intersect_p_out.bed" />     
+    </test>
+    <test>
+      <param name="input1" value="gops_bigint.interval" ftype="interval" />
+      <param name="input2" value="gops_bigint2.interval" ftype="interval" />
+      <param name="min" value="1" />
+      <param name="returntype" value="" />
+      <output name="output" file="gops_intersect_bigint_out.interval" />     
+    </test>
+    <test>
+      <param name="input1" value="gops_bigint2.interval" ftype="interval" />
+      <param name="input2" value="gops_bigint.interval" ftype="interval" />
+      <param name="min" value="1" />
+      <param name="returntype" value="" />
+      <output name="output" file="gops_intersect_bigint_out.interval" />     
+    </test>
+    <test>
+      <param name="input1" value="12.bed" ftype="bed" />
+      <param name="input2" value="1.bed" ftype="bed" />
+      <param name="min" value="1" />
+      <param name="returntype" value="" />
+      <output name="output" file="gops_intersect_no_strand_out.bed" />     
+    </test>
+    <!-- Intersect two GFF files. -->
+    <test>
+        <param name="input1" value="gops_subtract_in1.gff" />
+        <param name="input2" value="gops_subtract_in2.gff" />
+        <param name="min" value="1" />
+        <param name="returntype" value="" />
+        <output name="output" file="gops_intersect_out2.gff" />        
+    </test>
+    <!-- Intersect GFF file and bed file. -->
+    <test>
+        <param name="input1" value="gops_subtract_in1.gff" />
+        <param name="input2" value="gops_subtract_in2.bed" />
+        <param name="min" value="1" />
+        <param name="returntype" value="" />
+        <output name="output" file="gops_intersect_out2.gff" />        
+    </test>
+    
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns.
+
+-----
+
+**Screencasts!**
+
+See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
+
+.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
+
+-----
+
+**Syntax**
+
+- **Where overlap is at least** sets the minimum length (in base pairs) of overlap between elements of the two datasets
+- **Overlapping Intervals** returns entire intervals from the first dataset that overlap the second dataset.  The returned intervals are completely unchanged, and this option only filters out intervals that do not overlap with the second dataset.
+- **Overlapping pieces of Intervals** returns intervals that indicate the exact base pair overlap between the first dataset and the second dataset.  The intervals returned are from the first dataset, and all fields besides start and end are guaranteed to remain unchanged.
+
+-----
+
+**Example**
+
+.. image:: ./static/operation_icons/gops_intersect.gif
+
+</help>
+</tool>
\ No newline at end of file
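
The distinction between the two return modes is worth pinning down in code: "Overlapping Intervals" returns first-dataset intervals unchanged, while "Overlapping pieces of Intervals" returns the overlap spans themselves. A sketch of the pieces mode (the `overlap_pieces` helper is hypothetical, single chromosome)::

    def overlap_pieces(queries, features, mincols=1):
        """Return the exact (start, end) overlap span for every
        query/feature pair overlapping by >= mincols bases."""
        pieces = []
        for q_start, q_end in queries:
            for f_start, f_end in features:
                start, end = max(q_start, f_start), min(q_end, f_end)
                if end - start >= mincols:
                    pieces.append((start, end))
        return pieces

    print(overlap_pieces([(10, 100)], [(90, 250)]))  # [(90, 100)]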
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/join.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/join.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,139 @@
+<tool id="gops_join_1" name="Join">
+  <description>the intervals of two datasets side-by-side</description>
+  <command interpreter="python">gops_join.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} -m $min -f $fill</command>
+  <inputs>
+    <param format="interval" name="input1" type="data" help="First dataset">
+      <label>Join</label>
+    </param>
+    <param format="interval" name="input2" type="data" help="Second dataset">
+      <label>with</label>
+    </param>
+    <param name="min" size="4" type="integer" value="1" help="(bp)">
+      <label>with min overlap</label>
+    </param>
+  <param name="fill" type="select" label="Return">
+    <option value="none">Only records that are joined (INNER JOIN)</option>
+    <option value="right">All records of first dataset (fill null with ".")</option>
+    <option value="left">All records of second dataset (fill null with ".")</option>
+    <option value="both">All records of both datasets (fill nulls with ".")</option>
+  </param>
+   </inputs>
+  <outputs>
+    <data format="interval" name="output" metadata_source="input1" />
+  </outputs>
+  <code file="operation_filter.py"/>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="min" value="1" />
+      <param name="fill" value="none" />
+      <output name="output" file="gops-join-none.dat" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="min" value="1" />
+      <param name="fill" value="right" />
+      <output name="output" file="gops-join-right.dat" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="min" value="1" />
+      <param name="fill" value="left" />
+      <output name="output" file="gops-join-left.dat" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="min" value="1" />
+      <param name="fill" value="both" />
+      <output name="output" file="gops-join-both.dat" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="min" value="500" />
+      <param name="fill" value="none" />
+      <output name="output" file="gops-join-none-500.dat" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="min" value="100" />
+      <param name="fill" value="both" />
+      <output name="output" file="gops-join-both-100.dat" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns.
+
+-----
+
+**Screencasts!**
+
+See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
+
+.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
+
+-----
+
+**Syntax**
+
+- **Where overlap** specifies the minimum overlap between intervals that allows them to be joined.
+- **Return only records that are joined** returns only the records of the first dataset that join to a record in the second dataset.  This is analogous to an INNER JOIN.
+- **Return all records of first dataset (fill null with &quot;.&quot;)** returns all intervals of the first dataset, and any intervals that do not join an interval from the second dataset are filled in with a period(.).  This is analogous to a LEFT JOIN.
+- **Return all records of second dataset (fill null with &quot;.&quot;)** returns all intervals of the second dataset, and any intervals that do not join an interval from the first dataset are filled in with a period(.).  **Note that this may produce an invalid interval file, since a period(.) is not a valid chrom, start, end or strand.**
+- **Return all records of both datasets (fill nulls with &quot;.&quot;)** returns all records from both datasets, and fills on either the right or left with periods.  **Note that this may produce an invalid interval file, since a period(.) is not a valid chrom, start, end or strand.**
+
+-----
+
+**Example**
+
+If **First dataset** is::
+
+   chr1 10   100  Query1.1
+   chr1 500  1000 Query1.2
+   chr1 1100 1250 Query1.3
+
+and **Second dataset** is::
+
+   chr1 20   80   Query2.1
+   chr1 2000 2204 Query2.2
+   chr1 2500 3000 Query2.3
+
+
+The four return options will generate:
+
+
+- **Return only records that are joined**::
+
+   chr1 10 100 Query1.1 chr1 20 80 Query2.1
+
+- **Return all records of first dataset**::
+
+   chr1 10   100  Query1.1 chr1 20 80 Query2.1
+   chr1 500  1000 Query1.2 .    .  .  .
+   chr1 1100 1250 Query1.3 .    .  .  .
+
+- **Return all records of second dataset**::
+
+   chr1 10 100 Query1.1 chr1 20   80   Query2.1
+   .    .  .   .        chr1 2000 2204 Query2.2
+   .    .  .   .        chr1 2500 3000 Query2.3
+
+- **Return all records of both datasets**::
+
+   chr1 10   100  Query1.1 chr1 20   80   Query2.1
+   chr1 500  1000 Query1.2 .    .    .    .
+   chr1 1100 1250 Query1.3 .    .    .    .
+   .    .    .    .        chr1 2000 2204 Query2.2
+   .    .    .    .        chr1 2500 3000 Query2.3
+   
+
+</help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/merge.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/merge.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,58 @@
+<tool id="gops_merge_1" name="Merge">
+  <description>the overlapping intervals of a dataset</description>
+  <command interpreter="python">gops_merge.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} $returntype</command>
+  <inputs>
+    <param format="interval" name="input1" type="data">
+      <label>Merge overlapping regions of</label>
+    </param>
+    <param name="returntype" type="boolean" truevalue="-3" falsevalue="">
+      <label>Output 3 column bed</label>
+    </param>
+   </inputs>
+  <outputs>
+    <data format="input" name="output" metadata_source="input1" />
+  </outputs>
+  <code file="operation_filter.py">
+    <hook exec_after_process="exec_after_merge" />
+  </code>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" />
+      <output name="output" file="gops-merge.dat" />
+      <param name="returntype" value="true" />
+    </test>
+    <test>
+      <param name="input1" value="2_mod.bed" ftype="interval"/>
+      <output name="output" file="gops_merge_diffCols.dat" />
+      <param name="returntype" value="true" />
+    </test>
+    <test>
+      <param name="input1" value="gops_bigint.interval" />
+      <output name="output" file="gops_merge_out2.bed" />
+      <param name="returntype" value="true" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns.
+
+-----
+
+**Screencasts!**
+
+See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
+
+.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
+
+-----
+
+This operation merges all overlapping intervals into single intervals.
+
+**Example**
+
+.. image:: ./static/operation_icons/gops_merge.gif
+
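+The tool delegates the work to ``gops_merge.py``; as a rough
+illustration only (not the tool's code), merging can be sketched in
+Python as::
+
+   def merge(intervals):
+       # intervals: (chrom, start, end) tuples; overlaps are collapsed
+       merged = []
+       for chrom, start, end in sorted(intervals):
+           if merged and merged[-1][0] == chrom and start <= merged[-1][2]:
+               merged[-1][2] = max(merged[-1][2], end)  # extend previous
+           else:
+               merged.append([chrom, start, end])
+       return merged
+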
+</help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/operation_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/operation_filter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,96 @@
+# runs after the job (and after the default post-filter)
+import os
+from galaxy import eggs
+from galaxy import jobs
+from galaxy.tools.parameters import DataToolParameter
+# Older py compatibility
+try:
+    set()
+except:
+    from sets import Set as set
+
+#def exec_before_process(app, inp_data, out_data, param_dict, tool=None):
+#    """Sets the name of the data"""
+#    dbkeys = sets.Set( [data.dbkey for data in inp_data.values() ] ) 
+#    if len(dbkeys) != 1:
+#        raise Exception, '<p><font color="yellow">Both Queries must be from the same genome build</font></p>'
+
+def validate_input( trans, error_map, param_values, page_param_map ):
+    dbkeys = set()
+    data_param_names = set()
+    data_params = 0
+    for name, param in page_param_map.iteritems():
+        if isinstance( param, DataToolParameter ):
+            # for each dataset parameter
+            if param_values.get(name, None) != None:
+                dbkeys.add( param_values[name].dbkey )
+                data_params += 1
+                # check meta data
+                try:
+                    param = param_values[name]
+                    if isinstance( param.datatype, trans.app.datatypes_registry.get_datatype_by_extension( 'gff' ).__class__ ):
+                        # TODO: currently cannot validate GFF inputs b/c they are not derived from interval.
+                        pass
+                    else: # Validate interval datatype.
+                        startCol = int( param.metadata.startCol )
+                        endCol = int( param.metadata.endCol )
+                        chromCol = int( param.metadata.chromCol )
+                        if param.metadata.strandCol is not None:
+                            strandCol = int ( param.metadata.strandCol )
+                        else:
+                            strandCol = 0
+                except:
+                    error_msg = "The attributes of this dataset are not properly set. " + \
+                    "Click the pencil icon in the history item to set the chrom, start, end and strand columns."
+                    error_map[name] = error_msg
+            data_param_names.add( name )
+    if len( dbkeys ) > 1:
+        for name in data_param_names:
+            error_map[name] = "All datasets must belong to the same genomic build; " \
+                "this dataset is linked to build '%s'" % param_values[name].dbkey
+    if data_params != len(data_param_names):
+        for name in data_param_names:
+            error_map[name] = "A dataset of the appropriate type is required"
+
+# Commented out by INS, 5/30/2007.  What is the PURPOSE of this?
+def exec_after_process(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None):
+    """Verify the output data after each run"""
+    items = out_data.items()
+
+    for name, data in items:
+        try:
+            if stderr and len( stderr ) > 0:
+                raise Exception( stderr )
+
+        except Exception, exc:
+            data.blurb = jobs.JOB_ERROR
+            data.state = jobs.JOB_ERROR
+
+## def exec_after_process(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None):
+##     pass
+
+
+def exec_after_merge(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None):
+    exec_after_process(
+        app, inp_data, out_data, param_dict, tool=tool, stdout=stdout, stderr=stderr)
+
+    # reset column metadata; merge output is 3-column BED when returntype is set
+    items = out_data.items()
+    for name, data in items:
+        if param_dict['returntype'] == True:
+            data.metadata.chromCol = 1
+            data.metadata.startCol = 2
+            data.metadata.endCol = 3
+        # merge always clobbers strand
+        data.metadata.strandCol = None
+            
+
+def exec_after_cluster(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None):
+    exec_after_process(
+        app, inp_data, out_data, param_dict, tool=tool, stdout=stdout, stderr=stderr)
+
+    # strip strand column if clusters were merged
+    if param_dict["returntype"] == '1':
+        items = out_data.items()
+        for name, data in items:
+            data.metadata.strandCol = None
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/subtract.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/subtract.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,118 @@
+<tool id="gops_subtract_1" name="Subtract">
+  <description>the intervals of two datasets</description>
+  <command interpreter="python">gops_subtract.py 
+      $input1 $input2 $output
+
+      #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+        -1 1,4,5,7 --gff1
+      #else:
+        -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol}
+      #end if
+
+      #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+        -2 1,4,5,7 --gff2
+      #else:
+          -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} 
+      #end if
+
+      -m $min $returntype
+  </command>
+  <inputs>
+    <param format="interval,gff" name="input2" type="data" help="Second dataset">
+      <label>Subtract</label>
+    </param>
+
+    <param format="interval,gff" name="input1" type="data" help="First dataset">
+      <label>from</label>
+    </param>
+
+    <param name="returntype" type="select" label="Return" help="of the first dataset (see figure below)">
+      <option value="">Intervals with no overlap</option>
+      <option value="-p">Non-overlapping pieces of intervals</option>
+    </param>
+    
+    <param name="min" size="4" type="integer" value="1" min="1" help="(bp)">
+      <label>where minimal overlap is</label>
+    </param>
+   </inputs>
+  <outputs>
+    <data format="input" name="output" metadata_source="input1"/>
+  </outputs>
+  <code file="operation_filter.py"/>
+  <trackster_conf/>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="min" value="1" />
+      <param name="returntype" value="" />
+      <output name="output" file="gops-subtract.dat" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2_mod.bed" ftype="interval"/>
+      <param name="min" value="1" />
+      <param name="returntype" value="" />
+      <output name="output" file="gops_subtract_diffCols.dat" />
+    </test>
+    <test>
+      <param name="input1" value="gops_subtract_bigint.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="min" value="1" />
+      <param name="returntype" value="" />
+      <output name="output" file="gops-subtract.dat" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <param name="min" value="10" />
+      <param name="returntype" value="Non-overlapping pieces of intervals" />
+      <output name="output" file="gops-subtract-p.dat" />     
+    </test>
+    <!-- Subtract two GFF files. -->
+    <test>
+        <param name="input1" value="gops_subtract_in1.gff" />
+        <param name="input2" value="gops_subtract_in2.gff" />
+        <param name="min" value="1" />
+        <param name="returntype" value="" />
+        <output name="output" file="gops_subtract_out1.gff" />
+    </test>
+    <!-- Subtract BED file from GFF file. -->
+    <test>
+        <param name="input1" value="gops_subtract_in1.gff" />
+        <param name="input2" value="gops_subtract_in2.bed" />
+        <param name="min" value="1" />
+        <param name="returntype" value="" />
+        <output name="output" file="gops_subtract_out1.gff" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns.
+
+-----
+
+**Screencasts!**
+
+See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
+
+.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
+
+-----
+
+**Syntax**
+
+- **Where overlap is at least** sets the minimum length (in base pairs) of overlap between elements of the two datasets.
+- **Intervals with no overlap** returns entire intervals from the first dataset that do not overlap the second dataset.  The returned intervals are completely unchanged, and this option only filters out intervals that overlap with the second dataset.
+- **Non-overlapping pieces of intervals** returns intervals from the first dataset that have the intervals from the second dataset removed.  Any overlapping base pairs are removed from the range of the interval.  All fields besides start and end are guaranteed to remain unchanged.
+
+-----
+
+**Example**
+
+.. image:: ./static/operation_icons/gops_subtract.gif
+
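+As a rough illustration of the two modes (the tool delegates to
+``gops_subtract.py``, whose source is not part of this changeset), a
+naive Python sketch over (chrom, start, end) tuples::
+
+   def subtract(first, second, pieces=False, min_overlap=1):
+       out = []
+       for a in first:
+           hits = sorted(b for b in second if b[0] == a[0] and
+                         min(a[2], b[2]) - max(a[1], b[1]) >= min_overlap)
+           if not hits:
+               out.append(a)  # no overlap: interval kept unchanged
+           elif pieces:
+               start = a[1]
+               for b in hits:
+                   if b[1] > start:
+                       out.append((a[0], start, b[1]))  # piece before hit
+                   start = max(start, b[2])
+               if start < a[2]:
+                   out.append((a[0], start, a[2]))      # trailing piece
+       return out
+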
+</help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/subtract_query.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/subtract_query.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+# Greg Von Kuster
+
+"""
+Subtract an entire query from another query
+usage: %prog in_file_1 in_file_2 begin_col end_col output 
+"""
+import sys, re
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+# Older py compatibility
+try:
+    set()
+except:
+    from sets import Set as set
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def get_lines(fname, begin_col='', end_col=''):
+    lines = set([])
+    i = 0
+    for i, line in enumerate(file(fname)):
+        line = line.rstrip('\r\n')
+        if line and not line.startswith('#'):
+            if begin_col and end_col:
+                """Both begin_col and end_col must be integers at this point."""
+                try:
+                    line = line.split('\t')
+                    line = '\t'.join([line[j] for j in range(begin_col-1, end_col)])
+                    lines.add( line )
+                except: pass
+            else:
+                lines.add( line )
+    if i: return (i+1, lines)
+    else: return (i, lines)
+
+def main():
+    
+    # Parsing Command Line here
+    options, args = doc_optparse.parse( __doc__ )
+
+    try:
+        inp1_file, inp2_file, begin_col, end_col, out_file = args
+    except:
+        doc_optparse.exception()
+    
+    begin_col = begin_col.strip()
+    end_col = end_col.strip()
+    
+    if begin_col != 'None' or end_col != 'None':
+        """
+        The user selected columns for restriction.  We'll allow default
+        values for both begin_col and end_col as long as the user selected
+        at least one of them for restriction.
+        """
+        if begin_col == 'None':
+            begin_col = end_col
+        elif end_col == 'None':
+            end_col = begin_col
+        begin_col = int(begin_col)
+        end_col = int(end_col)
+        """Make sure that begin_col <= end_col (switch if not)"""
+        if begin_col > end_col:
+            tmp_col = end_col
+            end_col = begin_col
+            begin_col = tmp_col
+    else:
+        begin_col = end_col = ''
+
+    try:
+        fo = open(out_file,'w')
+    except:
+        print >> sys.stderr, "Unable to open output file"
+        sys.exit()
+
+    """
+    len1 is the number of lines in inp1_file
+    lines1 is the set of unique lines in inp1_file
+    diff1 is the number of duplicate lines removed from inp1_file
+    """
+    len1, lines1 = get_lines(inp1_file, begin_col, end_col)
+    diff1 = len1 - len(lines1)
+    len2, lines2 = get_lines(inp2_file, begin_col, end_col)
+    
+    lines1.difference_update(lines2)
+    """lines1 is now the set of unique lines in inp1_file - the set of unique lines in inp2_file"""
+
+    for line in lines1:
+        print >> fo, line
+
+    fo.close()
+    
+    info_msg = 'Subtracted %d lines. ' %((len1 - diff1) - len(lines1))
+    
+    if begin_col and end_col:
+        info_msg += 'Restricted to columns c' + str(begin_col) + ' thru c' + str(end_col) + '. '
+
+    if diff1 > 0:
+        info_msg += 'Eliminated %d duplicate/blank/comment/invalid lines from first query.' %diff1
+    
+    print info_msg
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/subtract_query.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/subtract_query.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,110 @@
+<tool id="subtract_query1" name="Subtract Whole Dataset">
+  <description>from another dataset</description>
+  <command interpreter="python">subtract_query.py $input1 $input2 $begin_col $end_col $output</command>
+  <inputs>
+    <param format="txt" name="input2" type="data" label="Subtract" help="Second dataset" />
+    <param format="txt" name="input1" type="data" label="from" help="First dataset" />
+    <param name="begin_col" type="data_column" data_ref="input1" force_select="False" label="Restrict subtraction between 'begin column'" />
+    <param name="end_col" type="data_column" data_ref="input1" force_select="False" label="and 'end column'" help="Specifying columns for restricting subtraction is available only for tabular formatted datasets" />
+  </inputs>
+  <outputs>
+    <data format="input" name="output" metadata_source="input1" />
+  </outputs>
+  <tests>
+   <!-- Subtract 2 non-tabular files with no column restrictions. -->
+   <!-- Cannot figure out why this test won't pass, it works in real time...
+    <test>
+      <param name="input1" value="1.txt" />
+      <param name="input2" value="2.txt" />
+      <param name="begin_col" value="None" />
+      <param name="end_col" value="None" />
+      <output name="output" file="subtract-query-1.dat" />
+    </test>
+    -->
+   <!-- Subtract 2 tabular files with no column restrictions. -->
+    <test>
+      <param name="input1" value="eq-showbeginning.dat" />
+      <param name="input2" value="eq-showtail.dat" />
+      <param name="begin_col" value="None" />
+      <param name="end_col" value="None" />
+      <output name="output" file="subtract-query-2.dat" />
+    </test>
+   <!-- Subtract 2 tabular files with column restrictions. -->
+    <test>
+      <param name="input1" value="eq-showbeginning.dat" />
+      <param name="input2" value="eq-removebeginning.dat" />
+      <param name="begin_col" value="c1" />
+      <param name="end_col" value="c3" />
+      <output name="output" file="subtract-query-3.dat" />
+    </test>
+   <!-- Subtract a non-tabular file from a tabular file with no column restrictions. -->
+    <test>
+      <param name="input1" value="eq-showbeginning.dat" />
+      <param name="input2" value="2.txt" />
+      <param name="begin_col" value="None" />
+      <param name="end_col" value="None" />
+      <output name="output" file="subtract-query-4.dat" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** This tool complements the tool in the **Operate on Genomic Intervals** tool set which subtracts the intervals of two datasets.
+
+
+-----
+
+**Syntax**
+
+This tool subtracts an entire dataset from another dataset.  
+
+- Any text format is valid.
+- If both dataset formats are tabular, you may restrict the subtraction to specific columns **contained in both datasets** and the resulting dataset will include only the columns specified. 
+- The begin column must be less than or equal to the end column.  If it is not, the begin column is switched with the end column.
+- If the begin column is specified but the end column is not, the end column defaults to the begin column (and vice versa).
+- All blank and comment lines are skipped and not included in the resulting dataset (comment lines are lines beginning with a # character).
+- Duplicate lines are eliminated from both datasets prior to subtraction.  If any duplicate lines were eliminated from the first dataset, the number is displayed in the resulting history item.
+
+-----
+
+**Example**
+
+If this is the **First dataset**::
+
+  chr1            4225    19670
+  chr10           6       8
+  chr1            24417   24420
+  chr6_hla_hap2   0       150
+  chr2            1       5
+  chr10           2       10
+  chr1            30      55
+  chrY            1       20
+  chr1            1225979 42287290
+  chr10           7       8 
+
+and this is the **Second dataset**::
+
+  chr1            4225    19670
+  chr10           6       8
+  chr1            24417   24420
+  chr6_hla_hap2   0       150
+  chr2            1       5
+  chr1            30      55
+  chrY            1       20
+  chr1            1225979 42287290
+
+Subtracting the **Second dataset** from the **First dataset** (including all columns) will yield::
+
+  chr10           7       8 
+  chr10           2       10
+
+Conversely, subtracting the **First dataset** from the **Second dataset** (including all columns) will result in an empty dataset.
+
+Subtracting the **Second dataset** from the **First dataset** (restricting to columns c1 and c2) will yield::
+
+  chr10           7
+  chr10           2
+
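+The heart of ``subtract_query.py`` above is a set difference over
+(optionally column-restricted) lines; a condensed sketch of that idea::
+
+  def restrict(line, begin_col, end_col):
+      # keep only columns begin_col..end_col (1-based, inclusive)
+      return '\t'.join(line.split('\t')[begin_col - 1:end_col])
+
+  def subtract_lines(first, second, begin_col=None, end_col=None):
+      if begin_col and end_col:
+          first = [restrict(l, begin_col, end_col) for l in first]
+          second = [restrict(l, begin_col, end_col) for l in second]
+      # duplicates collapse and line order is lost, as in the tool
+      return set(first) - set(second)
+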
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/tables_arithmetic_operations.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/tables_arithmetic_operations.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,117 @@
+# A program to implement arithmetic operations on data in tabular files. The program takes three inputs:
+# The first input is a TABULAR format file containing numbers only.
+# The second input is a TABULAR format file containing numbers only.
+# The two files must have the same number of columns and the same number of rows.
+# The third input is an arithmetic operation: +, -, *, or / for addition, subtraction, multiplication, or division, respectively.
+# The output file is a TABULAR format file containing the result of applying the arithmetic operation to both input files.
+# The output file has the same number of columns and the same number of rows as each of the two input files.
+# Note: in case of division, none of the values in the second input file may be 0.
+
+use strict;
+use warnings;
+
+#variables to handle information of the first input tabular file
+my $lineData1 = "";
+my @lineDataArray1 = ();
+my $lineArraySize = 0;
+my $lineCounter1 = 0;
+
+#variables to handle information of the second input tabular file
+my $lineData2= "";
+my @lineDataArray2 = ();
+my $lineCounter2 = 0;
+
+my $result = 0;
+
+# check to make sure having the correct number of arguments
+my $usage = "usage: tables_arithmetic_operations.pl [TABULAR.in] [TABULAR.in] [ArithmeticOperation] [TABULAR.out] \n";
+die $usage unless @ARGV == 4;
+
+#variables to store the names of input and output files
+my $inputTabularFile1 = $ARGV[0];
+my $inputTabularFile2 = $ARGV[1];
+my $arithmeticOperation = $ARGV[2];
+my $outputTabularFile = $ARGV[3];
+
+#open the input and output files
+open (INPUT1, "<", $inputTabularFile1) || die("Could not open file $inputTabularFile1 \n"); 
+open (INPUT2, "<", $inputTabularFile2) || die("Could not open file $inputTabularFile2 \n"); 
+open (OUTPUT, ">", $outputTabularFile) || die("Could not open file $outputTabularFile \n");
+
+#store the first input file in the array @tabularData1
+my @tabularData1 = <INPUT1>;
+
+#store the second input file in the array @tabularData2
+my @tabularData2 = <INPUT2>;
+
+#reset the $lineCounter1 to 0
+$lineCounter1 = 0;
+
+#iterate through the lines of the first input file 
+INDEL1:
+foreach $lineData1 (@tabularData1){
+ chomp ($lineData1);
+ $lineCounter1++;
+
+ #reset the $lineCounter2 to 0
+ $lineCounter2 = 0;
+
+ #iterate through the lines of the second input file 
+ foreach $lineData2 (@tabularData2){
+ chomp ($lineData2);
+ $lineCounter2++;
+
+ #process the pair of lines whose line numbers match in the two input files
+ if ($lineCounter1 == $lineCounter2){
+
+ @lineDataArray1 = split(/\t/, $lineData1);
+ @lineDataArray2 = split(/\t/, $lineData2);
+
+ $lineArraySize = @lineDataArray1;
+
+ for (my $index = 0; $index < $lineArraySize; $index++){
+
+ if ($arithmeticOperation eq "Addition"){
+ #compute the addition of both values
+ $result = $lineDataArray1[$index] + $lineDataArray2[$index];
+ }
+
+ if ($arithmeticOperation eq "Subtraction"){
+ #compute the subtraction of both values
+ $result = $lineDataArray1[$index] - $lineDataArray2[$index];
+ }
+
+ if ($arithmeticOperation eq "Multiplication"){
+ #compute the multiplication of both values
+ $result = $lineDataArray1[$index] * $lineDataArray2[$index];
+ }
+
+ if ($arithmeticOperation eq "Division"){
+
+ #check if the denominator is 0
+ if ($lineDataArray2[$index] != 0){
+ #compute the division of both values
+ $result = $lineDataArray1[$index] / $lineDataArray2[$index];
+ }
+ else{
+ die("A denominator could not be zero \n"); 
+ }
+ }
+
+ #store the result in the output file
+ if ($index < $lineArraySize - 1){
+ print OUTPUT $result . "\t";
+ }
+ else{
+ print OUTPUT $result . "\n";
+ }
+ }
+ next INDEL1;
+ }
+ }
+}  
+
+#close the input and output files
+close(OUTPUT);
+close(INPUT2);
+close(INPUT1);
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/new_operations/tables_arithmetic_operations.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/new_operations/tables_arithmetic_operations.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,105 @@
+<tool id="tables_arithmetic_operations" name="Arithmetic Operations " version="1.0.0">
+  <description>on tables</description>
+  
+  <command interpreter="perl">
+   tables_arithmetic_operations.pl $inputFile1 $inputFile2 $inputArithmeticOperation3 $outputFile1
+  </command>
+
+  <inputs>
+   <param format="tabular" name="inputFile1" type="data" label="Select the first input tabular file"/>
+   <param format="tabular" name="inputFile2" type="data" label="Select the second input tabular file"/>
+  
+    <param name="inputArithmeticOperation3" type="select" label="Choose the arithmetic operation:">
+     <option value="Addition">Addition</option>
+       <option value="Subtraction">Subtraction</option>
+       <option value="Multiplication">Multiplication</option>
+       <option value="Division">Division</option>
+    </param>
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/>
+  </outputs>
+  
+  <tests>
+   <test>
+   <param name="inputFile1" value="numericalTable1.tabular" />
+   <param name="inputFile2" value="numericalTable1.tabular" />
+     <param name="inputArithmeticOperation3" value="Addition" />
+     <output name="outputFile1" file="table_addition_result.tabular" />
+   </test>
+  
+   <test>
+   <param name="inputFile1" value="numericalTable1.tabular" />
+   <param name="inputFile2" value="numericalTable1.tabular" />
+     <param name="inputArithmeticOperation3" value="Subtraction" />
+     <output name="outputFile1" file="table_subtraction_result.tabular" />
+   </test>
+  
+   <test>
+   <param name="inputFile1" value="numericalTable1.tabular" />
+   <param name="inputFile2" value="numericalTable1.tabular" />
+     <param name="inputArithmeticOperation3" value="Multiplication" />
+     <output name="outputFile1" file="table_multiplication_result.tabular" />
+   </test>
+  
+   <test>
+   <param name="inputFile1" value="numericalTable1.tabular" />
+   <param name="inputFile2" value="numericalTable1.tabular" />
+     <param name="inputArithmeticOperation3" value="Division" />
+     <output name="outputFile1" file="table_division_result.tabular" />
+   </test>
+  
+  </tests>
+  
+  
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This program implements arithmetic operations on data in tabular files. The program takes three inputs:
+
+- The first input is a TABULAR format file containing numbers only.
+- The second input is a TABULAR format file containing numbers only.
+- The third input is an arithmetic operation: +, -, x, or / for addition, subtraction, multiplication, or division, respectively. 
+- The output file is a TABULAR format file containing the result of implementing the arithmetic operation on both input files.
+
+
+Notes: 
+
+- The two files must have the same number of columns and the same number of rows.
+- The output file has the same number of columns and the same number of rows as each of the two input files.
+- In case of division, none of the values in the second input file may be 0; otherwise the program will stop and report an error.
+
+**Example**
+
+Let us have the first input file as follows::
+
+ 5 4 0
+ 10 11 12
+ 1 3 1
+ 1 2 1
+ 2 0 4
+
+And the second input file as follows::
+
+ 5 4 4
+ 2 5 8
+ 1 2 1
+ 3 2 5
+ 2 4 4
+
+Running the program and choosing "Addition" as an arithmetic operation will give the following output::
+
+ 10 8 4
+ 12 16 20
+ 2 5 2
+ 4 4 6
+ 4 4 8
+
+
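+The same element-wise operation, sketched in Python (equivalent in
+spirit to the Perl script above; assumes two equal-sized lists of
+numeric rows)::
+
+ import operator
+
+ OPS = {'Addition': operator.add, 'Subtraction': operator.sub,
+        'Multiplication': operator.mul, 'Division': operator.truediv}
+
+ def table_op(table1, table2, op_name):
+     # element-wise op; division by zero raises, like the tool's abort
+     return [[OPS[op_name](x, y) for x, y in zip(row1, row2)]
+             for row1, row2 in zip(table1, table2)]
+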
+  </help>  
+  
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/next_gen_conversion/bwa_solid2fastq_modified.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/bwa_solid2fastq_modified.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,89 @@
+#!/usr/bin/perl -w
+
+# Author: lh3
+# Note: Ideally, this script should be written in C. It is a bit slow at present.
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+my %opts;
+my $version = '0.1.3';
+my $usage = qq{
+Usage: solid2fastq.pl <paired> <outfile1> <outfile2> <F3.csfasta> <F3.qual> <R3.csfasta> <R3.qual> 
+
+Note: <in.title> is the string shown in the `# Title:' line of a
+      ".csfasta" read file. Then <in.title>F3.csfasta is the read
+      sequence file and <in.title>F3_QV.qual is the quality file. If
+      <in.title>R3.csfasta is present, this script assumes reads are
+      paired; otherwise reads will be regarded as single-end.
+
+      The read name will be <out.prefix>:panel_x_y/[12] with `1' for the
+      R3 tag and `2' for F3. Usually you will want to use a short
+      <out.prefix> to save disk space. A long <out.prefix> also causes
+      trouble for maq.
+
+};
+
+getopts('', \%opts);
+die($usage) if (@ARGV != 7);
+my ($is_paired,$outfile1,$outfile2,$f3reads,$f3qual,$r3reads,$r3qual) = @ARGV;
+my (@fhr, @fhw);
+my $fn = '';
+my @fn_suff = ($f3reads,$f3qual,$r3reads,$r3qual);
+if ($is_paired eq "yes") { # paired end
+  for (0 .. 3) {
+ $fn = $fn_suff[$_];
+ $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
+ open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
+  }
+  open($fhw[0], "|gzip >$outfile2") || die;
+  open($fhw[1], "|gzip >$outfile1") || die;
+  my (@df, @dr);
+  @df = &read1(1); @dr = &read1(2);
+  while (@df && @dr) {
+ if ($df[0] eq $dr[0]) { # mate pair
+   print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1];
+   @df = &read1(1); @dr = &read1(2);
+ }
+  }
+  close($fhr[$_]) for (0 .. $#fhr);
+  close($fhw[$_]) for (0 .. $#fhw);
+} else { # single end
+  for (0 .. 1) {
+ my $fn = "$fn_suff[$_]";
+ $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
+ open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
+  }
+  open($fhw[2], "|gzip >$outfile1") || die;
+  my @df;
+  while (@df = &read1(1, $fhr[0], $fhr[1])) {
+ print {$fhw[2]} $df[1];
+  }
+  close($fhr[$_]) for (0 .. $#fhr);
+  close($fhw[2]);
+}
+
+sub read1 {
+  my $i = shift(@_);
+  my $j = ($i-1)<<1;
+  my ($key, $seq);
+  my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]);
+  while (<$fhs>) {
+ my $t = <$fhq>;
+ if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) {
+   $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines
+   #print $key;
+   die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t);
+   my $name = "$1_$2_$3/$i";
+   $_ = substr(<$fhs>, 2);
+   tr/0123./ACGTN/;
+   my $s = $_;
+   $_ = <$fhq>;
+   s/^(\d+)\s*//;
+   s/(\d+)\s*/chr($1+33)/eg;
+   $seq = qq/\@$name\n$s+\n$_\n/;
+   last;
+ } 
+  }
+  return defined($seq)? ($key, $seq) : ();
+}
diff -r 000000000000 -r 9071e359b9a3 tools/next_gen_conversion/fastq_conversions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/fastq_conversions.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+"""
+Performs various conversions around Sanger FASTQ data
+
+usage: %prog [options]
+   -c, --command=c: Command to run
+   -i, --input=i: Input file to be converted
+   -o, --outputFastqsanger=o: FASTQ Sanger converted output file for sol2std
+   -s, --outputFastqsolexa=s: FASTQ Solexa converted output file 
+   -f, --outputFasta=f: FASTA converted output file
+
+usage: %prog command input_file output_file
+"""
+
+import os, sys, tempfile
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+
+    cmd = "fq_all2std.pl %s %s > %s"
+    if options.command == 'sol2std':
+        cmd = cmd % (options.command, options.input, options.outputFastqsanger)
+    elif options.command == 'std2sol':
+        cmd = cmd % (options.command, options.input, options.outputFastqsolexa)
+    elif options.command == 'fq2fa':
+        cmd = cmd % (options.command, options.input, options.outputFasta)
+    try:
+        os.system(cmd)
+    except Exception, eq:
+        stop_err("Error converting data format.\n" + str(eq))        
+
+if __name__=="__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/next_gen_conversion/fastq_conversions.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/fastq_conversions.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,133 @@
+<tool id="fastq_conversions" name="FASTQ Conversions" version="1.0.0">
+  <description>converts between FASTQ data and other data formats</description>
+  <command interpreter="python">
+    fastq_conversions.py 
+    --command=$conversionType.type
+    --input=$input
+    #if $conversionType.type == "sol2std":
+     --outputFastqsanger=$outputFastqsanger
+    #else:
+     --outputFastqsanger="None"
+    #end if
+    #if $conversionType.type == "std2sol":
+     --outputFastqsolexa=$outputFastqsolexa
+    #else:
+     --outputFastqsolexa="None"
+    #end if
+    #if $conversionType.type == "fq2fa":
+     --outputFasta=$outputFasta
+    #else:
+     --outputFasta="None"
+    #end if
+  </command>
+  <inputs>
+    <conditional name="conversionType">
+      <param name="type" type="select" label="What type of conversion do you want to do?">
+        <option value="sol2std">Solexa/Illumina FASTQ to standard Sanger FASTQ</option>
+        <option value="std2sol">Standard Sanger FASTQ to Solexa/Illumina FASTQ</option>
+        <option value="fq2fa">Various FASTQ to FASTA</option>
+      </param>
+      <when value="sol2std">
+        <param name="input" type="data" format="fastqsolexa" label="File to convert" />
+      </when>
+      <when value="std2sol">
+        <param name="input" type="data" format="fastqsanger" label="File to convert" />
+      </when>
+      <when value="fq2fa">
+        <param name="input" type="data" format="fastqsolexa, fastqsanger" label="File to convert" />
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data name="outputFastqsanger" format="fastqsanger">
+      <filter>conversionType['type'] == 'sol2std'</filter>
+    </data>
+    <data name="outputFastqsolexa" format="fastqsolexa">
+      <filter>conversionType['type'] == 'std2sol'</filter>
+    </data>
+    <data name="outputFasta" format="fasta">
+      <filter>conversionType['type'] == 'fq2fa'</filter>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="type" value="sol2std" />
+      <param name="input" value="fastq_conv_in1.fastq" ftype="fastqsolexa" />
+      <output name="outputFastqsanger" file="fastq_conv_out1.fastqsanger" />
+    </test>
+    <test>
+      <param name="type" value="std2sol" />
+      <param name="input" value="1.fastqsanger" ftype="fastqsanger" />
+      <output name="outputFastqsolexa" file="fastq_conv_out2.fastqsolexa" />
+    </test>
+    <test>
+      <param name="type" value="fq2fa" />
+      <param name="input" value="1.fastqsanger" ftype="fastqsanger" />
+      <output name="outputFasta" file="fastq_conv_out4.fasta" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool offers several conversions options relating to the FASTQ format. 
+
+-----
+
+**Examples**
+
+- Converting the Solexa/Illumina FASTQ data::
+
+ @081017-and-081020:1:1:1715:1759
+ GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC
+ +
+ II#IIIIIII$5+.(9IIIIIII$%*$G$A31I&amp;&amp;B
+
+- will produce the following Sanger FASTQ data::
+
+ @081017-and-081020:1:1:1715:1759
+ GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC
+ +
+ ++!+++++++!!!!!"+++++++!!!!)!%!!+!!%!
+
+- Converting standard Sanger FASTQ::
+    
+    @1831_573_1004/1
+ AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG
+ +
+ >&lt;C&amp;&amp;9952+C>5&lt;.?&lt;79,=42&lt;292:&lt;(9/-7
+ @1831_573_1050/1
+ TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT
+ +
+ ;@@17?@=>7??@A8?==@4A?A4)&amp;+.'&amp;+'1,
+
+- will produce the following Solexa/Illumina FASTQ data::
+
+ @1831_573_1004/1
+ AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG
+ +
+ ][bEEXXTQJb]T[M^[VXK\SQ[QXQY[GXNLV
+ @1831_573_1050/1
+ TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT
+ +
+ Z__PV^_\]V^^_`W^\\_S`^`SHEJMFEJFPK
+
+- Converting the Sanger FASTQ data::
+
+    @1831_573_1004/1
+ AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG
+ +
+ >&lt;C&amp;&amp;9952+C>5&lt;.?&lt;79,=42&lt;292:&lt;(9/-7
+ @1831_573_1050/1
+ TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT
+ +
+ ;@@17?@=>7??@A8?==@4A?A4)&amp;+.'&amp;+'1,
+
+- will produce the following FASTA data::
+
+ >1831_573_1004/1
+ AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG
+ >1831_573_1050/1
+ TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT
+
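+The actual conversion is performed by ``fq_all2std.pl`` (not included
+in this changeset); the per-character quality arithmetic it relies on
+can be sketched as follows (Sanger = Phred score + 33, Solexa/Illumina
+1.0 = log-odds score + 64; an illustration, not the script's code)::
+
+ import math
+
+ def sol2std_char(c):
+     # Solexa log-odds quality -> Phred quality at offset 33
+     sq = ord(c) - 64
+     phred = 10.0 * math.log10(10.0 ** (sq / 10.0) + 1)
+     return chr(int(round(phred)) + 33)
+
+ def std2sol_char(c):
+     # Phred quality (offset 33) -> Solexa log-odds (offset 64)
+     q = ord(c) - 33
+     if q <= 0:
+         return chr(-5 + 64)  # -5 is the Solexa floor
+     sq = 10.0 * math.log10(10.0 ** (q / 10.0) - 1)
+     return chr(int(round(sq)) + 64)
+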
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/next_gen_conversion/fastq_gen_conv.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/fastq_gen_conv.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,172 @@
+"""
+Converts any type of FASTQ file to Sanger type and makes small adjustments if necessary.
+
+usage: %prog [options]
+   -i, --input=i: Input FASTQ candidate file
+   -r, --origType=r: Original type
+   -a, --allOrNot=a: Whether or not to check all blocks
+   -b, --blocks=b: Number of blocks to check
+   -o, --output=o: Output file
+
+usage: %prog input_file output_file
+"""
+
+import math, sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+    
+def all_bases_valid(seq):
+    """Confirm that the sequence contains only bases"""
+    valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N']
+    for base in seq:
+        if base not in valid_bases:
+            return False
+    return True
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    orig_type = options.origType
+    if orig_type == 'sanger' and options.allOrNot == 'not':
+        max_blocks = int(options.blocks)
+    else:
+        max_blocks = -1
+    fin = file(options.input, 'r')
+    fout = file(options.output, 'w')
+    range_min = 1000
+    range_max = -5
+    block_num = 0
+    bad_blocks = 0
+    base_len = -1
+    line_count = 0
+    lines = []
+    line = fin.readline()
+    while line:
+        if line.strip() and max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and block_num >= max_blocks:
+            fout.write(line)
+            if line_count % 4 == 0:
+                block_num += 1
+            line_count += 1
+        elif line.strip():
+            # the line that starts a block, with a name
+            if line_count % 4 == 0 and line.startswith('@'):
+                lines.append(line)
+            else:
+                # if we expect a sequence of bases
+                if line_count % 4 == 1 and all_bases_valid(line.strip()):
+                    lines.append(line)
+                    base_len = len(line.strip())
+                # if we expect the second name line
+                elif line_count % 4 == 2 and line.startswith('+'):
+                    lines.append(line)
+                # if we expect a sequence of qualities and it's the expected length
+                elif line_count % 4 == 3:
+                    split_line = line.strip().split()
+                    # decimal qualities
+                    if len(split_line) == base_len:
+                        # convert
+                        phred_list = []
+                        for ch in split_line:
+                            int_ch = int(ch)
+                            if int_ch < range_min:
+                                range_min = int_ch
+                            if int_ch > range_max:
+                                range_max = int_ch
+                            if int_ch >= 0 and int_ch <= 93:
+                                phred_list.append(chr(int_ch + 33))
+                        # make sure we haven't lost any quality values
+                        if len(phred_list) == base_len:
+                            # print first three lines
+                            for l in lines:
+                                fout.write(l)
+                            # print converted quality line
+                            fout.write(''.join(phred_list))
+                            # reset
+                            lines = []
+                            base_len = -1
+                        # abort if so
+                        else:
+                            bad_blocks += 1
+                            lines = []
+                            base_len = -1
+                    # ascii qualities
+                    elif len(split_line[0]) == base_len:
+                        qualities = []
+                        # print converted quality line
+                        if orig_type == 'illumina':
+                            for c in line.strip():
+                                if ord(c) - 64 < range_min:
+                                    range_min = ord(c) - 64
+                                if ord(c) - 64 > range_max:
+                                    range_max = ord(c) - 64
+                                if ord(c) < 64 or ord(c) > 126:
+                                    bad_blocks += 1
+                                    base_len = -1
+                                    lines = []
+                                    break
+                                else:
+                                    qualities.append( chr( ord(c) - 31 ) )
+                            quals = ''.join(qualities)
+                        elif orig_type == 'solexa':
+                            for c in line.strip():
+                                if ord(c) - 64 < range_min:
+                                    range_min = ord(c) - 64
+                                if ord(c) - 64 > range_max:
+                                    range_max = ord(c) - 64
+                                if ord(c) < 59 or ord(c) > 126:
+                                    bad_blocks += 1
+                                    base_len = -1
+                                    lines = []
+                                    break
+                                else:
+                                    p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) )
+                                    qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) )
+                            quals = ''.join(qualities)
+                        else:  # 'sanger'
+                            for c in line.strip():
+                                if ord(c) - 33 < range_min:
+                                    range_min = ord(c) - 33
+                                if ord(c) - 33 > range_max:
+                                    range_max = ord(c) - 33
+                                if ord(c) < 33 or ord(c) > 126:
+                                    bad_blocks += 1
+                                    base_len = -1
+                                    lines = []
+                                    break
+                                else:
+                                    qualities.append(c)
+                            quals = ''.join(qualities)
+                        # make sure we don't have bad qualities
+                        if len(quals) == base_len:
+                            # print first three lines
+                            for l in lines:
+                                fout.write(l)
+                            # print out quality line
+                            fout.write(quals+'\n')
+                        # reset
+                        lines = []
+                        base_len = -1
+                    else:
+                        bad_blocks += 1
+                        base_len = -1
+                        lines = []
+                    # mark the successful end of a block
+                    block_num += 1
+            line_count += 1
+        line = fin.readline()
+    fout.close()
+    fin.close()
+    if range_min != 1000 and range_max != -5:
+        outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max)
+    else:
+        outmsg = ''
+    if bad_blocks > 0:
+        outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks)
+    sys.stdout.write(outmsg)
+
+if __name__=="__main__": __main__() 
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/next_gen_conversion/fastq_gen_conv.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/fastq_gen_conv.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,106 @@
+<tool id="fastq_gen_conv" name="FASTQ Groomer" version="1.0.0">
+  <description>converts any FASTQ to Sanger</description>
+  <command interpreter="python">
+    fastq_gen_conv.py 
+     --input=$input 
+     --origType=$origTypeChoice.origType
+     #if $origTypeChoice.origType == "sanger":
+      --allOrNot=$origTypeChoice.howManyBlocks.allOrNot 
+      #if $origTypeChoice.howManyBlocks.allOrNot == "not":
+       --blocks=$origTypeChoice.howManyBlocks.blocks
+      #else:
+       --blocks="None"
+      #end if
+     #else:
+      --allOrNot="None"
+      --blocks="None"
+     #end if
+     --output=$output
+  </command>
+  <inputs>
+    <param name="input" type="data" format="fastq" label="Groom this dataset" />
+    <conditional name="origTypeChoice">
+      <param name="origType" type="select" label="How do you think quality values are scaled?" help="See below for explanation">
+        <option value="solexa">Solexa/Illumina 1.0</option>
+        <option value="illumina">Illumina 1.3+</option>
+        <option value="sanger">Sanger (validation only)</option>
+      </param>
+      <when value="solexa" />
+      <when value="illumina" />
+      <when value="sanger">
+        <conditional name="howManyBlocks">
+          <param name="allOrNot" type="select" label="Since your fastq is already in Sanger format you can check it for consistency">
+            <option value="all">Check all (may take a while)</option> 
+            <option selected="true" value="not">Check selected number of blocks</option>
+          </param>
+          <when value="all" />
+          <when value="not">
+            <param name="blocks" type="integer" value="1000" label="How many blocks (four lines each) do you want to check?" />
+          </when>
+        </conditional>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data name="output" format="fastqsanger"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="fastq_gen_conv_in1.fastq" ftype="fastq" />
+      <param name="origType" value="solexa" />
+      <output name="output" format="fastqsanger" file="fastq_gen_conv_out1.fastqsanger" />
+    </test>
+    <test>
+      <param name="input" value="fastq_gen_conv_in2.fastq" ftype="fastq" />
+      <param name="origType" value="sanger" />
+      <param name="allOrNot" value="not" />
+      <param name="blocks" value="3" />
+      <output name="output" format="fastqsanger" file="fastq_gen_conv_out2.fastqsanger" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+The Galaxy pipeline for mapping Illumina data requires data to be in fastq format with quality values conforming to the so-called "Sanger" format. Unfortunately there are many other types of fastq. Thus the main objective of this tool is to "groom" multiple types of fastq into Sanger-conforming fastq that can be used in downstream applications such as mapping.
+
+.. class:: infomark
+
+**TIP**: If the input dataset is already in Sanger format the tool does not perform conversion. However validation (described below) is still performed.
+
+-----
+
+**Types of fastq datasets**
+
+A good description of fastq datasets can be found `here`__, while a description of Galaxy's fastq "logic" can be found `here`__. Because the ranges of quality values within different types of fastq datasets overlap, it is very difficult to detect them automatically. This tool supports conversion of two commonly found types (Solexa/Illumina 1.0 and Illumina 1.3+) into fastq Sanger.
+
+ .. __: http://en.wikipedia.org/wiki/FASTQ_format
+ .. __: http://wiki.g2.bx.psu.edu/Admin/NGS%20Local%20Setup
+
+.. class:: warningmark
+
+**NOTE** that there is also a type of fastq format where quality values are represented by a list of space-delimited integers (e.g., 40 40 20 15 -5 20 ...). This tool **does not** handle such fastq. If you have such a dataset, it needs to be converted into ASCII-type fastq (where quality values are encoded by characters) by the "Numeric-to-ASCII" utility before it can be accepted by this tool.
+
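+For reference, that numeric-to-ASCII re-encoding amounts to writing
+each integer quality Q as the character chr(Q + 33); a minimal,
+hypothetical sketch (clamping negative values to 0 is an assumption of
+this sketch, not necessarily the utility's behavior)::
+
+ def numeric_to_ascii(qual_line):
+     # "40 40 20 15 -5 20" -> ASCII quality string at Sanger offset 33
+     return ''.join([chr(max(int(q), 0) + 33) for q in qual_line.split()])
+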
+-----
+
+**Validation**
+
+In addition to converting quality values to Sanger format, the tool also checks the input dataset for consistency. Specifically, it performs the following checks:
+
+- skips empty lines
+- checks that blocks are properly formed by making sure that:
+
+  #. there are four lines per block
+  #. the first line starts with "@"
+  #. the third line starts with "+"
+  #. lengths of second line (sequences) and the fourth line (quality string) are identical
+  
+- checks that quality values are within range for the chosen fastq format (e.g., the format provided by the user in the **How do you think quality values are scaled?** drop down).
+
+To see exactly what the tool does you can take a look at its source code `here`__.
+
+ .. __: http://bitbucket.org/galaxy/galaxy-central/src/tip/tools/next_gen_conversion/fastq_gen_conv.py
+
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/next_gen_conversion/solid2fastq.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/solid2fastq.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+
+import sys
+import string
+import optparse
+import tempfile
+import sqlite3
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+    
+def solid2sanger( quality_string, min_qual = 0 ):
+    sanger = ""
+    quality_string = quality_string.rstrip( " " )
+    for qv in quality_string.split(" "):
+        try:
+            if int( qv ) < 0:
+                qv = '0'
+            if int( qv ) < min_qual:
+                return False
+            sanger += chr( int( qv ) + 33 )
+        except:
+            pass    
+    return sanger
+
+def Translator(frm='', to='', delete='', keep=None):
+    allchars = string.maketrans('','')
+    if len(to) == 1:
+        to = to * len(frm)
+    trans = string.maketrans(frm, to)
+    if keep is not None:
+        delete = allchars.translate(allchars, keep.translate(allchars, delete))
+    def callable(s):
+        return s.translate(trans, delete)
+    return callable
+    
+def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ):
+
+    # Reads from two files f_reads (reads) and f_qual (quality values) and produces output in three formats depending on the out parameter,
+    # which can have three values: fastq, txt, and db
+    # fastq = fastq format
+    # txt = space delimited format with defline, reads, and qvs
+    # db = dump data into an sqlite3 db. 
+    # IMPORTANT! If out = 'db', two options must be provided:
+    #   1. f_out must be a db connection object initialized with sqlite3.connect()
+    #   2. table_name must be provided
+    
+    if out == 'db':
+        cursor = f_out.cursor()
+        sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name
+        cursor.execute(sql)
+    
+    lines = []
+    line = " "
+    while line:
+        for f in [ f_reads, f_qual ]:
+            line = f.readline().rstrip( '\n\r' )
+            while line.startswith( '#' ):
+                line = f.readline().rstrip( '\n\r' )
+            lines.append( line )
+    
+            
+        if lines[0].startswith( '>' ) and lines[1].startswith( '>' ):
+            
+            if lines[0] != lines[1]:
+                stop_err('Reads and quality score files are out of sync and likely corrupted. Please check your input data')
+            
+            defline = lines[0][1:]
+            if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ):
+                defline = defline[ : len( defline )-3 ]
+                
+        elif ( not lines[0].startswith( '>' ) and not lines[1].startswith( '>' ) and len( lines[0] ) > 0 and len( lines[1] ) > 0 ):
+
+            if trim_first_base:
+                lines[0] = lines[0][1:]
+            if double_encode:
+                de = Translator(frm="0123.", to="ACGTN")
+                lines[0] = de(lines[0])
+            qual = solid2sanger( lines[1], int( min_qual ) )
+            if qual:
+                if out == 'fastq':
+                    f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) ) 
+                if out == 'txt':
+                    f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) )
+                if out == 'db':
+                    cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) )     
+        lines = []
+
+def main():
+
+    usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]"
+    parser = optparse.OptionParser(usage=usage)
+    
+        
+    parser.add_option(
+        '--fr','--f_reads',
+        metavar="F3_CSFASTA_FILE",
+        dest='fr',
+        help='Name of F3 file with color space reads')
+        
+    parser.add_option(
+        '--fq','--f_qual',
+        metavar="F3_QUAL_FILE",
+        dest='fq',
+        help='Name of F3 file with color quality values')
+        
+    parser.add_option(
+        '--fout','--f3_fastq_output',
+        metavar="F3_OUTPUT",
+        dest='fout',
+        help='Name for F3 output file')
+                
+    parser.add_option(
+        '--rr','--r_reads',
+        metavar="R3_CSFASTA_FILE",
+        dest='rr',
+        default = False,
+        help='Name of R3 file with color space reads')
+        
+    parser.add_option(
+        '--rq','--r_qual',
+        metavar="R3_QUAL_FILE",
+        dest='rq',
+        default = False,
+        help='Name of R3 file with color quality values')
+        
+    parser.add_option(
+        '--rout',
+        metavar="R3_OUTPUT",
+        dest='rout',
+        help='Name for R3 output file')
+    
+    parser.add_option(
+        '-q','--min_qual',
+        dest='min_qual',
+        default = '-1000',
+        help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000')
+        
+    parser.add_option(
+        '-t','--trim_name',
+        dest='trim_name',
+        action='store_true',
+        default = False,
+        help='Trim _R3 and _F3 off read names. Default is False')
+    
+    parser.add_option(
+        '-f','--trim_first_base',
+        dest='trim_first_base',
+        action='store_true',
+        default = False,
+        help='Remove the first base of reads in color-space. Default is False')
+        
+    parser.add_option(
+        '-d','--double_encode',
+        dest='de',
+        action='store_true',
+        default = False,
+        help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False')
+    
+    options, args = parser.parse_args()
+    
+    if not ( options.fout and options.fr and options.fq ):
+        parser.error("""
+        One or more of the three required parameters is missing:
+        (1) --fr F3.csfasta file
+        (2) --fq F3.qual file
+        (3) --fout name of output file
+        Use --help for more info
+        """)
+
+    fr =  open ( options.fr , 'r' ) 
+    fq =  open ( options.fq , 'r' ) 
+    f_out = open ( options.fout , 'w' )
+    
+    if options.rr and options.rq:
+        rr =  open ( options.rr , 'r' ) 
+        rq =  open ( options.rq , 'r' ) 
+        if not options.rout:
+            parser.error("Provide the name for f3 output using --rout option. Use --help for more info")
+        r_out = open ( options.rout, 'w' )
+    
+        db = tempfile.NamedTemporaryFile()
+        
+        try:
+            con = sqlite3.connect(db.name)
+            cur = con.cursor()
+        except:
+            stop_err('Cannot connect to %s\n' % db.name)
+    
+         
+        merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" )
+        merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" )
+        cur.execute('create index f3_name on f3( name )')
+        cur.execute('create index r3_name on r3( name )')
+         
+        cur.execute('select * from f3,r3 where f3.name = r3.name')
+        for item in cur:
+            f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) )
+            r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) )
+        
+            
+    else:
+        merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual )
+        
+    
+      
+    f_out.close()
+
+if __name__ == "__main__":
+    main()
+    
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/next_gen_conversion/solid2fastq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/solid2fastq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,160 @@
+<tool id="solid2fastq" name="Convert">
+  <description>SOLiD output to fastq</description>
+  <command interpreter="python">
+    #if   $is_run.paired == "no"    #solid2fastq.py --fr=$input1 --fq=$input2 --fout=$out_file1 -q $qual $trim_name $trim_first_base $double_encode
+    #elif $is_run.paired == "yes"   #solid2fastq.py --fr=$input1 --fq=$input2 --fout=$out_file1 --rr=$input3 --rq=$input4 --rout=$out_file2 -q $qual $trim_name $trim_first_base $double_encode
+    #end if#
+  </command>
+  <inputs>
+    <param name="input1" type="data" format="csfasta" label="Select reads"/>
+    <param name="input2" type="data" format="qualsolid" label="Select qualities"/>
+    <conditional name="is_run">
+        <param name="paired" type="select" label="Is this a mate-pair run?">
+            <option value="no" selected="true">No</option>
+            <option value="yes">Yes</option>
+        </param>
+        <when value="yes">
+            <param name="input3" type="data" format="csfasta" label="Select Reverse reads"/>
+            <param name="input4" type="data" format="qualsolid" label="Select Reverse qualities"/>
+        </when>
+        <when value="no">
+        </when>
+    </conditional>
+    <param name="qual" label="Remove reads containing color qualities below this value" type="integer" value="0"/>
+    <param name="trim_name" type="select" label="Trim trailing &quot;_F3&quot; and &quot;_R3&quot; ?">
+        <option value="-t" selected="true">Yes</option>
+        <option value="">No</option>
+    </param>
+    <param name="trim_first_base" type="select" label="Trim first base?">
+        <option value="-f">Yes (BWA)</option>
+        <option value="" selected="true">No (bowtie)</option>
+    </param>
+    <param name="double_encode" type="select" label="Double encode?">
+        <option value="-d">Yes (BWA)</option>
+        <option value="" selected="true">No (bowtie)</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="fastqcssanger" name="out_file1"/>
+    <data format="fastqcssanger" name="out_file2">
+        <filter>is_run['paired'] == 'yes'</filter>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="fr.csfasta" ftype="csfasta"/>
+      <param name="input2" value="fr.qualsolid" ftype="qualsolid" />
+      <param name="paired" value="no"/>
+      <param name="qual" value="0" />
+      <param name="trim_first_base" value="No" />
+      <param name="trim_name" value="No" />
+      <param name="double_encode" value="No"/>
+      <output name="out_file1" file="solid2fastq_out_1.fastq"/>
+    </test>
+    <test>
+      <param name="input1" value="fr.csfasta" ftype="csfasta"/>
+      <param name="input2" value="fr.qualsolid" ftype="qualsolid" />
+      <param name="paired" value="yes"/>
+      <param name="input3" value="rr.csfasta" ftype="csfasta"/>
+      <param name="input4" value="rr.qualsolid" ftype="qualsolid" />
+      <param name="qual" value="0" />
+      <param name="trim_first_base" value="No" />
+      <param name="trim_name" value="Yes" />
+      <param name="double_encode" value="No"/>
+      <output name="out_file1" file="solid2fastq_out_2.fastq"/>
+      <output name="out_file2" file="solid2fastq_out_3.fastq"/>
+    </test>
+ </tests>
+<help>
+
+**What it does**
+
+Converts the output of SOLiD instruments (versions 3.5 and earlier) to FASTQ format suitable for the bowtie, BWA, and PerM mappers.
+
+--------
+
+**Input datasets**
+
+Below are examples of forward (F3) reads and quality scores:
+
+Reads::
+
+    >1831_573_1004_F3
+    T00030133312212111300011021310132222
+    >1831_573_1567_F3
+    T03330322230322112131010221102122113
+
+Quality scores::
+
+    >1831_573_1004_F3
+    4 29 34 34 32 32 24 24 20 17 10 34 29 20 34 13 30 34 22 24 11 28 19 17 34 17 24 17 25 34 7 24 14 12 22
+    >1831_573_1567_F3
+    8 26 31 31 16 22 30 31 28 29 22 30 30 31 32 23 30 28 28 31 19 32 30 32 19 8 32 10 13 6 32 10 6 16 11
+
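+Each numeric quality above is written to the output as a single Sanger FASTQ character whose ASCII code is quality + 33 (so a quality of 4 becomes '%'). A minimal Python sketch of the conversion (a hypothetical helper, not part of this tool's code)::
+
+    def qual_to_sanger(qual_line):
+        # '4 29 34 34 32' -> '%>CCA'
+        return ''.join(chr(int(q) + 33) for q in qual_line.split())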
+
+**Mate pairs**
+
+If your data is from a mate-paired run, you will have additional read and quality datasets that look similar to the ones above, with one exception: the read names will end with &quot;_R3&quot;.
+In this case choose **Yes** from the *Is this a mate-pair run?* drop down and you will be able to select the R3 reads. When processing mate pairs this tool generates two output files: one for F3 reads and the other for R3 reads.
+The reads are guaranteed to be paired -- mated reads will be in the same position in the F3 and R3 fastq files. However, because pairing is verified, it may take a while to process an entire SOLiD run (several hours).
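+
+Pairing works as sketched below (and as in the script above): both mate sets are loaded into per-mate tables of a temporary SQLite database, the name columns are indexed, and only reads whose names appear in both tables are written out. A simplified sketch, not the exact code::
+
+    import sqlite3
+    con = sqlite3.connect(':memory:')
+    con.execute('create table f3 (name text, read text, qual text)')
+    con.execute('create table r3 (name text, read text, qual text)')
+    # ... load both mate sets, then emit only names present in both:
+    for row in con.execute('select * from f3, r3 where f3.name = r3.name'):
+        pass  # write one record to the F3 output and one to the R3 output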
+
+------
+
+**Explanation of parameters**
+
+**Remove reads containing color qualities below this value** - any read that contains at least one color call with a quality lower than the specified value **will not** be reported.
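+
+For example, with a threshold of 9 the filter amounts to (hypothetical values)::
+
+    quals = [4, 29, 34, 34, 32]
+    keep = min(quals) >= 9    # False: the read is not reported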
+
+**Trim trailing &quot;_F3&quot; and &quot;_R3&quot;?** - does just that. Not necessary for bowtie. Required for BWA.
+
+**Trim first base?** - SOLiD reads contain an adapter base such as the first T in this read::
+
+    >1831_573_1004_F3
+    T00030133312212111300011021310132222
+  
+This option removes the adapter base, leaving only color calls. Not necessary for bowtie. Required for BWA.
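+
+For example (a sketch)::
+
+    'T00030133312212111300011021310132222'[1:]
+    # -> '00030133312212111300011021310132222'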
+
+**Double encode?** - converts color calls (0123.) to pseudo-nucleotides (ACGTN). Not necessary for bowtie. Required for BWA.
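+
+One way to express this mapping in Python 2 (a sketch, not necessarily the code used above)::
+
+    import string
+    table = string.maketrans('0123.', 'ACGTN')
+    'T' + '0003013331'.translate(table)   # -> 'TAAATACTTTC'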
+
+------
+
+**Examples of output**
+
+When all parameters are left &quot;as-is&quot; you will get this (using reads and qualities shown above)::
+
+ @1831_573_1004
+ T00030133312212111300011021310132222
+ +
+ %>CCAA9952+C>5C.?C79,=42C292:C(9/-7
+ @1831_573_1567
+ T03330322230322112131010221102122113
+ +
+ );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1,
+
+Setting *Trim first base?* to **Yes** will produce this::
+
+ @1831_573_1004
+ 00030133312212111300011021310132222
+ +
+ %>CCAA9952+C>5C.?C79,=42C292:C(9/-7
+ @1831_573_1567
+ 03330322230322112131010221102122113
+ +
+ );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1,
+
+Finally, setting *Double encode?* to **Yes** will yield::
+
+ @1831_573_1004
+ TAAATACTTTCGGCGCCCTAAACCAGCTCACTGGGG
+ +
+ %>CCAA9952+C>5C.?C79,=42C292:C(9/-7
+ @1831_573_1567
+ TATTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT
+ +
+ );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1,
+
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/next_gen_conversion/solid_to_fastq.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/solid_to_fastq.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+"""
+Converts SOLiD data to Sanger FASTQ format.
+
+usage: %prog [options]
+   -i, --input1=i: Forward reads file
+   -q, --input2=q: Forward qual file
+   -I, --input3=I: Reverse reads file
+   -Q, --input4=Q: Reverse qual file
+   -o, --output1=o: Forward output
+   -r, --output2=r: Reverse output
+
+usage: %prog forward_reads_file forward_qual_file reverse_reads_file(or_None) reverse_qual_file(or_None) output_file output_id output_dir
+"""
+
+import os, sys, tempfile
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+    
+def replaceNeg1(fin, fout):
+    # Copy fin to fout, replacing every '-1' quality with '1', then rewind
+    # fout so the caller can read it back from the beginning.
+    line = fin.readline()
+    while line.strip():
+        fout.write(line.replace('-1', '1'))
+        line = fin.readline()
+    fout.seek(0)
+    return fout
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    # common temp file setup
+    tmpf = tempfile.NamedTemporaryFile()    #forward reads
+    tmpqf = tempfile.NamedTemporaryFile()
+    tmpqf = replaceNeg1(open(options.input2, 'r'), tmpqf)
+    # if paired-end data (have reverse input files)
+    if options.input3 != "None" and options.input4 != "None":
+        tmpr = tempfile.NamedTemporaryFile()    #reverse reads
+        # replace the -1 in the qualities file 
+        tmpqr = tempfile.NamedTemporaryFile()
+        tmpqr = replaceNeg1(open(options.input4, 'r'), tmpqr)
+        cmd1 = "%s/bwa_solid2fastq_modified.pl 'yes' %s %s %s %s %s %s 2>&1" %(os.path.split(sys.argv[0])[0], tmpf.name, tmpr.name, options.input1, tmpqf.name, options.input3, tmpqr.name)
+        try:
+            os.system(cmd1)
+            os.system('gunzip -c %s >> %s' %(tmpf.name,options.output1))
+            os.system('gunzip -c %s >> %s' %(tmpr.name,options.output2))
+        except Exception, eq:
+            stop_err("Error converting data to fastq format.\n" + str(eq))
+        tmpr.close()
+        tmpqr.close()
+    # if single-end data
+    else:
+        cmd1 = "%s/bwa_solid2fastq_modified.pl 'no' %s %s %s %s %s %s 2>&1" % (os.path.split(sys.argv[0])[0], tmpf.name, None, options.input1, tmpqf.name, None, None)
+        try:
+            os.system(cmd1)
+            os.system('gunzip -c %s >> %s' % (tmpf.name, options.output1))
+        except Exception, eq:
+            stop_err("Error converting data to fastq format.\n" + str(eq))
+    tmpqf.close()
+    tmpf.close()
+    sys.stdout.write('converted SOLiD data')
+
+if __name__=="__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/next_gen_conversion/solid_to_fastq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/solid_to_fastq.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,101 @@
+<tool id="solid_to_fastq" name="SOLiD-to-FASTQ" version="1.0.0">
+  <description>converts SOLiD data to FASTQ data</description>
+  <command interpreter="python">
+    solid_to_fastq.py 
+    --input1=$input1 
+    --input2=$input2
+    #if $paired.pairedSingle == "single":
+     --input3="None"
+     --input4="None"
+    #else:
+     --input3=$input3
+     --input4=$input4
+    #end if
+    --output1=$output1
+    #if $paired.pairedSingle == "single":
+     --output2="None"
+    #else:
+     --output2=$output2
+    #end if
+  </command>
+  <inputs>
+    <conditional name="paired">
+      <param name="pairedSingle" type="select" label="Is this library mate-paired?">
+        <option value="single">Single</option>
+        <option value="paired">Paired</option>
+      </param>
+      <when value="single">
+        <param name="input1" type="data" format="csfasta" label="F3 read file" />
+        <param name="input2" type="data" format="qualsolid" label="F3 qual file" />
+      </when>
+      <when value="paired">
+        <param name="input1" type="data" format="csfasta" label="F3 read file" />
+        <param name="input2" type="data" format="qualsolid" label="F3 qual file" />
+        <param name="input3" type="data" format="csfasta" label="R3 read file" />
+        <param name="input4" type="data" format="qualsolid" label="R3 qual file" />      
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <!-- Variable number of outputs. Either one (for single-end) or two (for paired-end) -->
+    <data name="output1" format="fastqsanger"/>
+    <data name="output2" format="fastqsanger">
+      <filter>paired['pairedSingle'] == 'paired'</filter>
+    </data>    
+  </outputs>
+  <tests>
+    <test>
+      <param name="pairedSingle" value="single" />
+      <param name="input1" value="s2fq_phiX.csfasta" ftype="csfasta" />
+      <param name="input2" value="s2fq_phiX.qualsolid" ftype="qualsolid" />
+      <output name="output1" file="s2fq_out1.fastqsanger" />
+    </test>
+    <test>
+      <param name="pairedSingle" value="paired" />
+      <param name="input1" value="s2fq_paired_F3.csfasta" ftype="csfasta" />
+      <param name="input2" value="s2fq_paired_F3_QV.qualsolid" ftype="qualsolid" />
+      <param name="input3" value="s2fq_paired_R3.csfasta" ftype="csfasta" />
+      <param name="input4" value="s2fq_paired_R3_QV.qualsolid" ftype="qualsolid" />
+      <output name="output1" file="s2fq_out2.fastqsanger" />
+      <!-- testing framework does not deal with multiple outputs yet
+      <output name="output2" file="s2fq_out3.fastqsanger" />
+      -->
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool takes read and quality files and converts them to FASTQ data (Sanger variant). Any -1 qualities are converted to 1 before being converted to FASTQ. Note that it also double encodes the color-space sequences as nucleotides (base space).
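+
+For instance, the -1 replacement is a plain text substitution applied to each quality line (as in replaceNeg1 above)::
+
+    '4 -1 34 34 32'.replace('-1', '1')   # -> '4 1 34 34 32'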
+
+-----
+
+**Example**
+
+- Converting the following sequences::
+
+    >1831_573_1004_F3
+    T00030133312212111300011021310132222
+    >1831_573_1567_F3
+    T03330322230322112131010221102122113
+
+- and quality scores::
+
+    >1831_573_1004_F3
+    4 29 34 34 32 32 24 24 20 17 10 34 29 20 34 13 30 34 22 24 11 28 19 17 34 17 24 17 25 34 7 24 14 12 22
+    >1831_573_1567_F3
+    8 26 31 31 16 22 30 31 28 29 22 30 30 31 32 23 30 28 28 31 19 32 30 32 19 8 32 10 13 6 32 10 6 16 11
+
+- will produce the following Sanger FASTQ data::
+
+    @1831_573_1004/1
+    AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG
+    +
+    >CCAA9952+C>5C.?C79,=42C292:C(9/-7
+    @1831_573_1567/1
+    TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT
+    +
+    ;@@17?@=>7??@A8?==@4A?A4)A+.'A+'1,
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/cuffcompare_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/cuffcompare_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+# Copied from sam_to_bam.py:
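+# Each useful line of the pointer (.loc) file is tab-separated: index <dbkey> <path>,
+# e.g. (hypothetical path):  index   equCab2   /galaxy/data/equCab2/sam_index/equCab2.fa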
+def check_seq_file( dbkey, cached_seqs_pointer_file ):
+    seq_path = ''
+    for line in open( cached_seqs_pointer_file ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ) and line.startswith( 'index' ):
+            fields = line.split( '\t' )
+            if len( fields ) < 3:
+                continue
+            if fields[1] == dbkey:
+                seq_path = fields[2].strip()
+                break
+    return seq_path
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-r', dest='ref_annotation', help='An optional "reference" annotation GTF. Each sample is matched against this file, and sample isoforms are tagged as overlapping, matching, or novel where appropriate. See the refmap and tmap output file descriptions below.' )
+    parser.add_option( '-R', action="store_true", dest='ignore_nonoverlap', help='If -r was specified, this option causes cuffcompare to ignore reference transcripts that are not overlapped by any transcript in one of cuff1.gtf,...,cuffN.gtf. Useful for ignoring annotated transcripts that are not present in your RNA-Seq samples and thus adjusting the "sensitivity" calculation in the accuracy report written in the transcripts accuracy file' )
+    parser.add_option( '-s', dest='use_seq_data', action="store_true", help='Causes cuffcompare to look into <seq_dir> for fasta files with the underlying genomic sequences (one file per contig) against which your reads were aligned for some optional classification functions. For example, Cufflinks transcripts consisting mostly of lower-case bases are classified as repeats. Note that <seq_dir> must contain one fasta file per reference chromosome, and each file must be named after the chromosome, and have a .fa or .fasta extension.')
+    
+    # Wrapper / Galaxy options.
+    parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' )
+    parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' )
+    parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' )
+    
+    # Outputs.
+    parser.add_option( '', '--combined-transcripts', dest='combined_transcripts' )
+    
+    (options, args) = parser.parse_args()
+    
+    # output version # of tool
+    try:
+        tmp = tempfile.NamedTemporaryFile().name
+        tmp_stdout = open( tmp, 'wb' )
+        proc = subprocess.Popen( args='cuffcompare 2>&1', shell=True, stdout=tmp_stdout )
+        tmp_stdout.close()
+        returncode = proc.wait()
+        stdout = None
+        for line in open( tmp_stdout.name, 'rb' ):
+            if line.lower().find( 'cuffcompare v' ) >= 0:
+                stdout = line.strip()
+                break
+        if stdout:
+            sys.stdout.write( '%s\n' % stdout )
+        else:
+            raise Exception
+    except:
+        sys.stdout.write( 'Could not determine Cuffcompare version\n' )
+        
+    # Set/link to sequence file.
+    if options.use_seq_data:
+        cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' )
+        if not os.path.exists( cached_seqs_pointer_file ):
+            stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file )
+        # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa,
+        # and the equCab2.fa file will contain fasta sequences.
+        seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file )
+        if options.ref_file != 'None':
+            # Create symbolic link to ref_file so that index will be created in working directory.
+            seq_path = "ref.fa"
+            os.symlink( options.ref_file, seq_path  )
+    
+    # Build command.
+    
+    # Base.
+    cmd = "cuffcompare -o cc_output "
+    
+    # Add options.
+    if options.ref_annotation:
+        cmd += " -r %s " % options.ref_annotation
+    if options.ignore_nonoverlap:
+        cmd += " -R "
+    if options.use_seq_data:
+        cmd += " -s %s " % seq_path
+        
+    # Add input files.
+        
+    # Need to symlink inputs so that output files are written to temp directory.
+    for i, arg in enumerate( args ):
+        input_file_name = "./input%i" % ( i+1 )
+        os.symlink( arg, input_file_name )
+        cmd += "%s " % input_file_name
+
+    # Debugging.
+    print cmd
+    
+    # Run command.
+    try:        
+        tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
+        tmp_stderr = open( tmp_name, 'wb' )
+        proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        
+        # Get stderr, allowing for case where it's very large.
+        tmp_stderr = open( tmp_name, 'rb' )
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        
+        # Error checking.
+        if returncode != 0:
+            raise Exception, stderr
+            
+        # Copy outputs.
+        shutil.copyfile( "cc_output.combined.gtf" , options.combined_transcripts )    
+            
+        # check that there are results in the output file
+        cc_output_fname = "cc_output.stats"
+        if len( open( cc_output_fname, 'rb' ).read().strip() ) == 0:
+            raise Exception, 'The main output file is empty, there may be an error with your input file or settings.'
+    except Exception, e:
+        stop_err( 'Error running cuffcompare. ' + str( e ) )
+        
+if __name__=="__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/cuffcompare_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/cuffcompare_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,217 @@
+<tool id="cuffcompare" name="Cuffcompare" version="0.0.5">
+    <!-- Wrapper supports Cuffcompare versions v1.0.0-v1.0.3 -->
+    <description>compare assembled transcripts to a reference annotation and track Cufflinks transcripts across multiple experiments</description>
+    <requirements>
+        <requirement type="package">cufflinks</requirement>
+    </requirements>
+    <command interpreter="python">
+        cuffcompare_wrapper.py
+
+            ## Use annotation reference?
+            #if $annotation.use_ref_annotation == "Yes":
+                -r $annotation.reference_annotation
+                #if $annotation.ignore_nonoverlapping_reference:
+                    -R
+                #end if
+            #end if
+
+            ## Use sequence data?
+            #if $seq_data.use_seq_data == "Yes":
+                -s
+                #if $seq_data.seq_source.index_source == "history":
+                    --ref_file=$seq_data.seq_source.ref_file
+                #else:
+                    --ref_file="None"
+                #end if
+                --dbkey=${first_input.metadata.dbkey}
+                --index_dir=${GALAXY_DATA_INDEX_DIR}
+            #end if
+
+            ## Outputs.
+            --combined-transcripts=${transcripts_combined}
+
+            ## Inputs.
+            ${first_input}
+            #for $input_file in $input_files:
+              ${input_file.additional_input}
+            #end for
+
+    </command>
+    <inputs>
+        <param format="gtf" name="first_input" type="data" label="GTF file produced by Cufflinks" help=""/>
+        <repeat name="input_files" title="Additional GTF Input Files">
+            <param format="gtf" name="additional_input" type="data" label="GTF file produced by Cufflinks" help=""/>
+        </repeat>
+        <conditional name="annotation">
+            <param name="use_ref_annotation" type="select" label="Use Reference Annotation">
+                <option value="No">No</option>
+                <option value="Yes">Yes</option>
+            </param>
+            <when value="Yes">
+                <param format="gtf" name="reference_annotation" type="data" label="Reference Annotation" help="Make sure your annotation file is in GTF format and that Galaxy knows that your file is GTF--not GFF."/>
+                <param name="ignore_nonoverlapping_reference" type="boolean" label="Ignore reference transcripts that are not overlapped by any transcript in input files"/>
+            </when>
+            <when value="No">
+            </when>
+        </conditional>
+        <conditional name="seq_data">
+            <param name="use_seq_data" type="select" label="Use Sequence Data" help="Use sequence data for some optional classification functions, including the addition of the p_id attribute required by Cuffdiff.">
+                <option value="Yes">Yes</option>
+                <option value="No">No</option>
+            </param>
+            <when value="No"></when>
+            <when value="Yes">
+                <conditional name="seq_source">
+                  <param name="index_source" type="select" label="Choose the source for the reference list">
+                    <option value="cached">Locally cached</option>
+                    <option value="history">History</option>
+                  </param>
+                  <when value="cached"></when>
+                  <when value="history">
+                      <param name="ref_file" type="data" format="fasta" label="Using reference file" />
+                  </when>
+                </conditional>
+            </when>
+        </conditional>
+    </inputs>
+
+    <outputs>
+        <data format="txt" name="transcripts_accuracy" label="${tool.name} on ${on_string}: transcript accuracy"
+            from_work_dir="cc_output.stats" />
+        <data format="tabular" name="input1_tmap" label="${tool.name} on ${on_string}: ${fir
[...]
+th of the first and last exons, since these lengths will naturally vary from sample to sample due to the random nature of sequencing.
+If you ran cuffcompare with the -r option, the first and second columns contain the closest matching reference transcript to the one described by each row.
+
+Here's an example of a line from the tracking file::
+
+  TCONS_00000045 XLOC_000023 Tcea|uc007afj.1   j   \
+     q1:exp.115|exp.115.0|100|3.061355|0.350242|0.350207 \
+     q2:60hr.292|60hr.292.0|100|4.094084|0.000000|0.000000
+
+In this example, a transcript present in the two input files, called exp.115.0 in the first and 60hr.292.0 in the second, doesn't match any reference transcript exactly, but shares exons with uc007afj.1, an isoform of the gene Tcea, as indicated by the class code j. The first five columns are as follows::
+
+  Column number   Column name               Example          Description
+  -----------------------------------------------------------------------
+  1               Cufflinks transfrag id    TCONS_00000045   A unique internal id for the transfrag
+  2               Cufflinks locus id        XLOC_000023      A unique internal id for the locus
+  3               Reference gene id         Tcea             The gene_name attribute of the reference GTF record for this transcript, or '-' if no reference transcript overlaps this Cufflinks transcript
+  4               Reference transcript id   uc007afj.1       The transcript_id attribute of the reference GTF record for this transcript, or '-' if no reference transcript overlaps this Cufflinks transcript
+  5               Class code                c                The type of match between the Cufflinks transcripts in column 6 and the reference transcript. See class codes
+
+Each of the columns after the fifth has the following format:
+  qJ:gene_id|transcript_id|FMI|FPKM|conf_lo|conf_hi
+
+A transcript need not be present in all samples to be reported in the tracking file. A sample not containing a transcript will have a "-" in its entry in the row for that transcript.
+
+Class Codes
+
+If you ran cuffcompare with the -r option, tracking rows will contain the following values. If you did not use -r, the rows will all contain "-" in their class code column::
+
+  Priority   Code   Description
+  --------------------------------
+  1          =      Match
+  2          c      Contained
+  3          j      New isoform
+  4          e      A single exon transcript overlapping a reference exon and at least 10 bp of a reference intron, indicating a possible pre-mRNA fragment.
+  5          i      A single exon transcript falling entirely within a reference intron
+  6          r      Repeat. Currently determined by looking at the reference sequence and applied to transcripts where at least 50% of the bases are lower case
+  7          p      Possible polymerase run-on fragment
+  8          u      Unknown, intergenic transcript
+  9          o      Unknown, generic overlap with reference
+  10         .      (.tracking file only, indicates multiple classifications)
+
+-------
+
+**Settings**
+
+All of the options have a default value. You can change any of them. Most of the options in Cuffcompare have been implemented here.
+
+------
+
+**Cuffcompare parameter list**
+
+This is a list of implemented Cuffcompare options::
+
+  -r    An optional "reference" annotation GTF. Each sample is matched against this file, and sample isoforms are tagged as overlapping, matching, or novel where appropriate. See the refmap and tmap output file descriptions below.
+  -R    If -r was specified, this option causes cuffcompare to ignore reference transcripts that are not overlapped by any transcript in one of cuff1.gtf,...,cuffN.gtf. Useful for ignoring annotated transcripts that are not present in your RNA-Seq samples and thus adjusting the "sensitivity" calculation in the accuracy report written in the transcripts_accuracy file
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/cuffdiff_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/cuffdiff_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,233 @@
+#!/usr/bin/env python
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def group_callback( option, op_str, value, parser ):
+    groups = []
+    flist = []
+    for arg in parser.rargs:
+        arg = arg.strip()
+        if arg[0] == "-":
+            break
+        elif arg[0] == ",":
+            groups.append(flist)
+            flist = []
+        else:
+            flist.append(arg)
+    groups.append(flist)
+
+    setattr(parser.values, option.dest, groups)
+
+def label_callback( option, op_str, value, parser ):
+    labels = []
+    for arg in parser.rargs:
+        arg = arg.strip()
+        if arg[0] == "-":
+            break
+        else:
+            labels.append(arg)
+
+    setattr(parser.values, option.dest, labels)
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+# Copied from sam_to_bam.py:
+def check_seq_file( dbkey, cached_seqs_pointer_file ):
+    seq_path = ''
+    for line in open( cached_seqs_pointer_file ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ) and line.startswith( 'index' ):
+            fields = line.split( '\t' )
+            if len( fields ) < 3:
+                continue
+            if fields[1] == dbkey:
+                seq_path = fields[2].strip()
+                break
+    return seq_path
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+
+    # Cuffdiff options.
+    parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' )
+    parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
+    parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \
+                                                                                For example, for paired end runs with fragments selected at 300bp, \
+                                                                                where each end is 50bp, you should set -r to be 200. The default is 45bp.')
+    parser.add_option( '-c', '--min-alignment-count', dest='min_alignment_count', help='The minimum number of alignments in a locus needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not significant, and the locus\' observed changes don\'t contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).' )
+    parser.add_option( '--FDR', dest='FDR', help='The allowed false discovery rate. The default is 0.05.' )
+
+    # Advanced Options:
+    parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' )
+    parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' )
+
+    # Wrapper / Galaxy options.
+    parser.add_option( '-f', '--files', dest='groups', action="callback", callback=group_callback, help="Groups to be processed, groups are separated by spaces, replicates in a group comma separated. group1_rep1,group1_rep2 group2_rep1,group2_rep2 ... groupN_rep1,groupN_rep2" )
+    parser.add_option( '-A', '--inputA', dest='inputA', help='A transcript GTF file produced by cufflinks, cuffcompare, or other source.')
+    parser.add_option( '-1', '--input1', dest='input1', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of thes
[...]
+    cmd += ( " -c %i" % int ( options.min_alignment_count ) )
+    if options.FDR:
+        cmd += ( " --FDR %f" % float( options.FDR ) )
+    if options.num_importance_samples:
+        cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) )
+    if options.max_mle_iterations:
+        cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) )
+    if options.do_normalization:
+        cmd += ( " -N" )
+    if options.do_bias_correction:
+        cmd += ( " -b %s" % seq_path )
+
+    # Add inputs.
+    # For replicate analysis: group1_rep1,group1_rep2 groupN_rep1,groupN_rep2
+    if options.groups:
+        cmd += " --labels "
+        for label in options.labels:
+            cmd += label + ","
+        cmd = cmd[:-1]
+
+        cmd += " " + options.inputA + " "
+
+        for group in options.groups:
+            for filename in group:
+                cmd += filename + ","
+            cmd = cmd[:-1] + " "
+    else:
+        cmd += " " + options.inputA + " " + options.input1 + " " + options.input2
+
+    # Debugging.
+    print cmd
+
+    # Run command.
+    try:
+        tmp_name = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name
+        tmp_stderr = open( tmp_name, 'wb' )
+        proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_output_dir, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+
+        # Get stderr, allowing for case where it's very large.
+        tmp_stderr = open( tmp_name, 'rb' )
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+
+        # Error checking.
+        if returncode != 0:
+            raise Exception, stderr
+
+        # check that there are results in the output file
+        if len( open( os.path.join( tmp_output_dir, "isoforms.fpkm_tracking" ), 'rb' ).read().strip() ) == 0:
+            raise Exception, 'The main output file is empty, there may be an error with your input file or settings.'
+    except Exception, e:
+        stop_err( 'Error running cuffdiff. ' + str( e ) )
+
+    # Copy output files from tmp directory to specified files.
+    try:
+        try:
+            shutil.copyfile( os.path.join( tmp_output_dir, "isoforms.fpkm_tracking" ), options.isoforms_fpkm_tracking_output )
+            shutil.copyfile( os.path.join( tmp_output_dir, "genes.fpkm_tracking" ), options.genes_fpkm_tracking_output )
+            shutil.copyfile( os.path.join( tmp_output_dir, "cds.fpkm_tracking" ), options.cds_fpkm_tracking_output )
+            shutil.copyfile( os.path.join( tmp_output_dir, "tss_groups.fpkm_tracking" ), options.tss_groups_fpkm_tracking_output )
+            shutil.copyfile( os.path.join( tmp_output_dir, "isoform_exp.diff" ), options.isoforms_exp_output )
+            shutil.copyfile( os.path.join( tmp_output_dir, "gene_exp.diff" ), options.genes_exp_output )
+            shutil.copyfile( os.path.join( tmp_output_dir, "tss_group_exp.diff" ), options.tss_groups_exp_output )
+            shutil.copyfile( os.path.join( tmp_output_dir, "splicing.diff" ), options.splicing_diff_output )
+            shutil.copyfile( os.path.join( tmp_output_dir, "cds.diff" ), options.cds_diff_output )
+            shutil.copyfile( os.path.join( tmp_output_dir, "cds_exp.diff" ), options.cds_exp_fpkm_tracking_output )
+            shutil.copyfile( os.path.join( tmp_output_dir, "promoters.diff" ), options.promoters_diff_output )
+        except Exception, e:
+            stop_err( 'Error in cuffdiff:\n' + str( e ) )
+    finally:
+        # Clean up temp dirs
+        if os.path.exists( tmp_output_dir ):
+            shutil.rmtree( tmp_output_dir )
+
+if __name__=="__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/cuffdiff_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/cuffdiff_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,238 @@
+<tool id="cuffdiff" name="Cuffdiff" version="0.0.5">
+    <!-- Wrapper supports Cuffdiff versions v1.0.0-v1.0.3 -->
+    <description>find significant changes in transcript expression, splicing, and promoter use</description>
+    <requirements>
+        <requirement type="package">cufflinks</requirement>
+    </requirements>
+    <command interpreter="python">
+        cuffdiff_wrapper.py
+            --FDR=$fdr
+            --num-threads="4"
+            --min-alignment-count=$min_alignment_count
+
+            --isoforms_fpkm_tracking_output=$isoforms_fpkm_tracking
+            --genes_fpkm_tracking_output=$genes_fpkm_tracking
+            --cds_fpkm_tracking_output=$cds_fpkm_tracking
+            --tss_groups_fpkm_tracking_output=$tss_groups_fpkm_tracking
+            --isoforms_exp_output=$isoforms_exp
+            --genes_exp_output=$genes_exp
+            --tss_groups_exp_output=$tss_groups_exp
+            --cds_exp_fpkm_tracking_output=$cds_exp_fpkm_tracking
+            --splicing_diff_output=$splicing_diff
+            --cds_diff_output=$cds_diff
+            --promoters_diff_output=$promoters_diff
+
+            ## Set paired-end data parameters?
+            #if $singlePaired.sPaired == "Yes":
+                -m $singlePaired.mean_inner_distance
+                -s $singlePaired.inner_distance_std_dev
+            #end if
+
+            ## Normalization?
+            #if str($do_normalization) == "Yes":
+            -N
+            #end if
+
+            ## Bias correction?
+            #if $bias_correction.do_bias_correction == "Yes":
+                -b
+                #if $bias_correction.seq_source.index_source == "history":
+                    --ref_file=$bias_correction.seq_source.ref_file
+                #else:
+                    --ref_file="None"
+                #end if
+                --dbkey=${gtf_input.metadata.dbkey}
+                --index_dir=${GALAXY_DATA_INDEX_DIR}
+            #end if
+
+            ## Inputs.
+            --inputA=$gtf_input
+            #if $group_analysis.do_groups == "No":
+                --input1=$aligned_reads1
+                --input2=$aligned_reads2
+            #else:
+                ## Replicates.
+                --labels
+                #for $group in $group_analysis.groups
+                    ${group.group}
+                #end for
+                --files
+                #for $group in $group_analysis.groups
+                    #for $file in $group.files:
+                        ${file.file}
+                    #end for
+                    ,
+                #end for
+            #end if
+
+    </command>
+    <inputs>
+        <param format="gtf" name="gtf_input" type="data" label="Transcripts" help="A transcript GTF file produced by cufflinks, cuffcompare, or other source."/>
+        <conditional name="group_analysis">
+            <param name="do_groups" type="select" label="Perform replicate analysis" help="Perform cuffdiff with replicates in each group.">
+                <option value="No">No</option>
+                <option value="Yes">Yes</option>
+            </param>
+            <when value="Yes">
+                <repeat name="groups" title="Group">
+                    <param name="group" title="Group name" type="text" label="Group name (no spaces or commas)"/>
+                    <repeat name="files" title="Replicate">
+                        <param name="file" label="Add file" type="data" format="sam,bam"/>
+                    </repeat>
+                </repeat>
+            </when>
+            <when value="No">
+                <param format="sam,bam" name="aligned_reads1" type="data" label="SAM or BAM file of aligned RNA-Seq reads" help=""/>
+                <param format="sam,bam" name="aligned_reads2" type="data" label="SAM or BAM file of aligned RNA-Seq reads" help=""/>
+            </when>
+        </conditional>
+
+        <param name="fdr" type="float" value
[...]
+isco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
+
+.. __: http://cufflinks.cbcb.umd.edu/manual.html#cuffdiff
+
+------
+
+**Input format**
+
+Cuffdiff takes Cufflinks or Cuffcompare GTF files as input along with two SAM files containing the fragment alignments for two or more samples.
+
+------
+
+**Outputs**
+
+Cuffdiff produces many output files:
+
+1. Transcript FPKM expression tracking.
+2. Gene FPKM expression tracking; tracks the summed FPKM of transcripts sharing each gene_id
+3. Primary transcript FPKM tracking; tracks the summed FPKM of transcripts sharing each tss_id
+4. Coding sequence FPKM tracking; tracks the summed FPKM of transcripts sharing each p_id, independent of tss_id
+5. Transcript differential FPKM.
+6. Gene differential FPKM. Tests differences in the summed FPKM of transcripts sharing each gene_id
+7. Primary transcript differential FPKM. Tests differences in the summed FPKM of transcripts sharing each tss_id
+8. Coding sequence differential FPKM. Tests differences in the summed FPKM of transcripts sharing each p_id independent of tss_id
+9. Differential splicing tests: this tab delimited file lists, for each primary transcript, the amount of overloading detected among its isoforms, i.e. how much differential splicing exists between isoforms processed from a single primary transcript. Only primary transcripts from which two or more isoforms are spliced are listed in this file.
+10. Differential promoter tests: this tab delimited file lists, for each gene, the amount of overloading detected among its primary transcripts, i.e. how much differential promoter use exists between samples. Only genes producing two or more distinct primary transcripts (i.e. multi-promoter genes) are listed here.
+11. Differential CDS tests: this tab delimited file lists, for each gene, the amount of overloading detected among its coding sequences, i.e. how much differential CDS output exists between samples. Only genes producing two or more distinct CDS (i.e. multi-protein genes) are listed here.
+
+-------
+
+**Settings**
+
+All of the options have a default value. You can change any of them. Most of the options in Cuffdiff have been implemented here.
+
+------
+
+**Cuffdiff parameter list**
+
+This is a list of implemented Cuffdiff options::
+
+  -m INT                         This is the expected (mean) inner distance between mate pairs. For example, for paired end runs with fragments selected at 300bp, where each end is 50bp, you should set -r to be 200. The default is 45bp.
+  -s INT                         The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.
+  -c INT                         The minimum number of alignments in a locus needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not significant, and the locus' observed changes don't contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).
+  --FDR FLOAT                    The allowed false discovery rate. The default is 0.05.
+  --num-importance-samples INT   Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000
+  --max-mle-iterations INT       Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000
+  -N                             With this option, Cufflinks excludes the contribution of the top 25 percent most highly expressed genes from the number of mapped fragments used in the FPKM denominator. This can improve robustness of differential expression calls for less abundant genes and transcripts.
+
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/cufflinks_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/cufflinks_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+# Copied from sam_to_bam.py:
+def check_seq_file( dbkey, cached_seqs_pointer_file ):
+    seq_path = ''
+    for line in open( cached_seqs_pointer_file ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ) and line.startswith( 'index' ):
+            fields = line.split( '\t' )
+            if len( fields ) < 3:
+                continue
+            if fields[1] == dbkey:
+                seq_path = fields[2].strip()
+                break
+    return seq_path
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-1', '--input', dest='input', help='A file of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
+    parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' )
+    parser.add_option( '-I', '--max-intron-length', dest='max_intron_len', help='The maximum intron length. Cufflinks will not report transcripts with introns longer than this, and will ignore SAM alignments with REF_SKIP CIGAR operations longer than this. The default is 300,000.' )
+    parser.add_option( '-F', '--min-isoform-fraction', dest='min_isoform_fraction', help='After calculating isoform abundance for a gene, Cufflinks filters out transcripts that it believes are very low abundance, because isoforms expressed at extremely low levels often cannot reliably be assembled, and may even be artifacts of incompletely spliced precursors of processed transcripts. This parameter is also used to filter out introns that have far fewer spliced alignments supporting them. The default is 0.05, or 5% of the most abundant isoform (the major isoform) of the gene.' )
+    parser.add_option( '-j', '--pre-mrna-fraction', dest='pre_mrna_fraction', help='Some RNA-Seq protocols produce a significant amount of reads that originate from incompletely spliced transcripts, and these reads can confound the assembly of fully spliced mRNAs. Cufflinks uses this parameter to filter out alignments that lie within the intronic intervals implied by the spliced alignments. The minimum depth of coverage in the intronic region covered by the alignment is divided by the number of spliced reads, and if the result is lower than this parameter value, the intronic alignments are ignored. The default is 5%.' )
+    parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
+    parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \
+                                                                                For example, for paired end runs with fragments selected at 300bp, \
+                                                                                where each end is 50bp, you should set -r to be 200. The default is 45bp.')
+    parser.add_option( '-G', '--GTF', dest='GTF', help='Tells Cufflinks to use the supplied reference annotation to estimate isoform expression. It will not assemble novel transcripts, and the program will ignore alignments not structurally compatible with any reference transcript.' )
+    parser.add_option( '-g', '--GTF-guide', dest='GTFguide', help='use reference transcript annotation to guide assembly' )
+
+    # Normalization options.
+    parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" )
+
+    # Wrapper / Galaxy options.
+    parser.add_
[...]
+    # output version # of tool
+    try:
+        tmp = tempfile.NamedTemporaryFile().name
+        tmp_stdout = open( tmp, 'wb' )
+        proc = subprocess.Popen( args='cufflinks --no-update-check 2>&1', shell=True, stdout=tmp_stdout )
+        tmp_stdout.close()
+        returncode = proc.wait()
+        stdout = None
+        for line in open( tmp_stdout.name, 'rb' ):
+            if line.lower().find( 'cufflinks v' ) >= 0:
+                stdout = line.strip()
+                break
+        if stdout:
+            sys.stdout.write( '%s\n' % stdout )
+        else:
+            raise Exception
+    except:
+        sys.stdout.write( 'Could not determine Cufflinks version\n' )
+
+    # If doing bias correction, set/link to sequence file.
+    if options.do_bias_correction:
+        cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' )
+        if not os.path.exists( cached_seqs_pointer_file ):
+            stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file )
+        # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa,
+        # and the equCab2.fa file will contain fasta sequences.
+        seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file )
+        if options.ref_file != 'None':
+            # Create symbolic link to ref_file so that index will be created in working directory.
+            seq_path = "ref.fa"
+            os.symlink( options.ref_file, seq_path )
+
+    # Build command.
+
+    # Base; always use quiet mode to avoid problems with storing log output.
+    cmd = "cufflinks -q --no-update-check"
+
+    # Add options.
+    if options.inner_dist_std_dev:
+        cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) )
+    if options.max_intron_len:
+        cmd += ( " -I %i" % int ( options.max_intron_len ) )
+    if options.min_isoform_fraction:
+        cmd += ( " -F %f" % float ( options.min_isoform_fraction ) )
+    if options.pre_mrna_fraction:
+        cmd += ( " -j %f" % float ( options.pre_mrna_fraction ) )
+    if options.num_threads:
+        cmd += ( " -p %i" % int ( options.num_threads ) )
+    if options.inner_mean_dist:
+        cmd += ( " -m %i" % int ( options.inner_mean_dist ) )
+    if options.GTF:
+        cmd += ( " -G %s" % options.GTF )
+    if options.GTFguide:
+        cmd += ( " -g %s" % options.GTFguide )
+    if options.num_importance_samples:
+        cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) )
+    if options.max_mle_iterations:
+        cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) )
+    if options.do_normalization:
+        cmd += ( " -N" )
+    if options.do_bias_correction:
+        cmd += ( " -b %s" % seq_path )
+
+    # Debugging.
+    print cmd
+
+    # Add input files.
+    cmd += " " + options.input
+
+    # Run command.
+    try:
+        tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
+        tmp_stderr = open( tmp_name, 'wb' )
+        proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+
+        # Get stderr, allowing for case where it's very large.
+        tmp_stderr = open( tmp_name, 'rb' )
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+
+        # Copy outputs.
+        shutil.copyfile( "transcripts.gtf" , options.assembled_isoforms_output_file )
+
+        # Error checking.
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        stop_err( 'Error running cufflinks. ' + str( e ) )
+
+if __name__=="__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/cufflinks_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/cufflinks_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,222 @@
+<tool id="cufflinks" name="Cufflinks" version="0.0.5">
+    <!-- Wrapper supports Cufflinks versions v1.0.0-v1.0.3 -->
+    <description>transcript assembly and FPKM (RPKM) estimates for RNA-Seq data</description>
+    <requirements>
+        <requirement type="package">cufflinks</requirement>
+    </requirements>
+    <command interpreter="python">
+        cufflinks_wrapper.py
+            --input=$input
+            --assembled-isoforms-output=$assembled_isoforms
+            --num-threads="4"
+            -I $max_intron_len
+            -F $min_isoform_fraction
+            -j $pre_mrna_fraction
+
+            ## Include reference annotation?
+            #if $reference_annotation.use_ref == "Use reference annotation":
+                -G $reference_annotation.reference_annotation_file
+            #end if
+            #if $reference_annotation.use_ref == "Use reference annotation guide":
+                -g $reference_annotation.reference_annotation_guide_file
+            #end if
+
+            ## Set paired-end parameters?
+            #if $singlePaired.sPaired == "Yes":
+                -m $singlePaired.mean_inner_distance
+                -s $singlePaired.inner_distance_std_dev
+            #end if
+
+            ## Normalization?
+            #if str($do_normalization) == "Yes":
+            -N
+            #end if
+
+            ## Bias correction?
+            #if $bias_correction.do_bias_correction == "Yes":
+                -b
+                #if $bias_correction.seq_source.index_source == "history":
+                    --ref_file=$bias_correction.seq_source.ref_file
+                #else:
+                    --ref_file="None"
+                #end if
+                --dbkey=${input.metadata.dbkey}
+                --index_dir=${GALAXY_DATA_INDEX_DIR}
+            #end if
+    </command>
+    <inputs>
+        <param format="sam,bam" name="input" type="data" label="SAM or BAM file of aligned RNA-Seq reads" help=""/>
+        <param name="max_intron_len" type="integer" value="300000" min="1" max="600000" label="Max Intron Length" help=""/>
+        <param name="min_isoform_fraction" type="float" value="0.05" min="0" max="1" label="Min Isoform Fraction" help=""/>
+        <param name="pre_mrna_fraction" type="float" value="0.05" min="0" max="1" label="Pre MRNA Fraction" help=""/>
+        <param name="do_normalization" type="select" label="Perform quartile normalization" help="Removes top 25% of genes from FPKM denominator to improve accuracy of differential expression calls for low abundance transcripts.">
+            <option value="No">No</option>
+            <option value="Yes">Yes</option>
+        </param>
+        <conditional name="reference_annotation">
+            <param name="use_ref" type="select" label="Use Reference Annotation">
+                <option value="No">No</option>
+                <option value="Use reference annotation">Use reference annotation</option>
+                <option value="Use reference annotation guide">Use reference annotation as guide</option>
+            </param>
+            <when value="No"></when>
+            <when value="Use reference annotation">
+                <param format="gff3,gtf" name="reference_annotation_file" type="data" label="Reference Annotation" help="Make sure your annotation file is in GTF format and that Galaxy knows that your file is GTF--not GFF."/>
+            </when>
+            <when value="Use reference annotation guide">
+                <param format="gff3,gtf" name="reference_annotation_guide_file" type="data" label="Reference Annotation" help="Make sure your annotation file is in GTF format and that Galaxy knows that your file is GTF--not GFF."/>
+            </when>
+        </conditional>
+        <conditional name="bias_correction">
+            <param name="do_bias_correction" type="select" label="Perform Bias Correction" help="Bias detection and correction can significantly impr
[...]
+       .           Cufflinks does not predict where the start and stop codons (if any) are located within each transcript, so this field is not used.
+  8               attributes    See below
+
+Each GTF record is decorated with the following attributes::
+
+  Attribute       Example       Description
+  -----------------------------------------
+  gene_id         CUFF.1        Cufflinks gene id
+  transcript_id   CUFF.1.1      Cufflinks transcript id
+  FPKM            101.267       Isoform-level relative abundance in Reads Per Kilobase of exon model per Million mapped reads
+  frac            0.7647        Reserved. Please ignore, as this attribute may be deprecated in the future
+  conf_lo         0.07          Lower bound of the 95% confidence interval of the abundance of this isoform, as a fraction of the isoform abundance. That is, lower bound = FPKM * (1.0 - conf_lo)
+  conf_hi         0.1102        Upper bound of the 95% confidence interval of the abundance of this isoform, as a fraction of the isoform abundance. That is, upper bound = FPKM * (1.0 + conf_lo)
+  cov             100.765       Estimate for the absolute depth of read coverage across the whole transcript
+
+Transcripts only:
+  This file is simply a tab delimited file containing one row per transcript and with columns containing the attributes above. There are a few additional attributes not in the table above, but these are reserved for debugging, and may change or disappear in the future.
+
+Genes only:
+This file contains gene-level coordinates and expression values.
+
+-------
+
+**Cufflinks settings**
+
+All of the options have a default value. You can change any of them. Most of the options in Cufflinks have been implemented here.
+
+------
+
+**Cufflinks parameter list**
+
+This is a list of implemented Cufflinks options::
+
+  -m INT    This is the expected (mean) inner distance between mate pairs. For example, for paired end runs with fragments selected at 300bp, where each end is 50bp, you should set -r to be 200. The default is 45bp.
+  -s INT    The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.
+  -I INT    The maximum intron length. Cufflinks will not report transcripts with introns longer than this, and will ignore SAM alignments with REF_SKIP CIGAR operations longer than this. The default is 300,000.
+  -F        After calculating isoform abundance for a gene, Cufflinks filters out transcripts that it believes are very low abundance, because isoforms expressed at extremely low levels often cannot reliably be assembled, and may even be artifacts of incompletely spliced precursors of processed transcripts. This parameter is also used to filter out introns that have far fewer spliced alignments supporting them. The default is 0.05, or 5% of the most abundant isoform (the major isoform) of the gene.
+  -j        Some RNA-Seq protocols produce a significant amount of reads that originate from incompletely spliced transcripts, and these reads can confound the assembly of fully spliced mRNAs. Cufflinks uses this parameter to filter out alignments that lie within the intronic intervals implied by the spliced alignments. The minimum depth of coverage in the intronic region covered by the alignment is divided by the number of spliced reads, and if the result is lower than this parameter value, the intronic alignments are ignored. The default is 5%.
+  -G        Tells Cufflinks to use the supplied reference annotation to estimate isoform expression. It will not assemble novel transcripts, and the program will ignore alignments not structurally compatible with any reference transcript.
+  -N        With this option, Cufflinks excludes the contribution of the top 25 percent most highly expressed genes from the number of mapped fragments used in the FPKM denominator. This can improve robustness of differential expression calls for less abundant genes and transcripts.
+    </help>
+</tool>
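The conf_lo/conf_hi attributes documented in the help above are fractions of the FPKM value, so turning them into absolute interval bounds is a one-liner. A minimal sketch, not part of the changeset (the function name is illustrative)::

    def fpkm_confidence_interval( fpkm, conf_lo, conf_hi ):
        # lower bound = FPKM * (1.0 - conf_lo), upper bound = FPKM * (1.0 + conf_hi)
        return fpkm * ( 1.0 - conf_lo ), fpkm * ( 1.0 + conf_hi )

    # Using the example row above: FPKM=101.267, conf_lo=0.07, conf_hi=0.1102
    # gives a 95% interval of roughly (94.18, 112.43).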
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/filter_transcripts_via_tracking.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/filter_transcripts_via_tracking.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+import os, sys, tempfile
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def __main__():
+    """
+    Utility script for analyzing Cufflinks data: uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts
+    produced by cufflinks). Filtering is done by extracting transcript IDs from the tracking file and then filtering the GTF so that the output GTF contains only
+    transcripts found in the tracking file. Because a tracking file has multiple samples, a sample number is used to filter transcripts for
+    a particular sample.
+    """
+    # Read params.
+    tracking_file_name = sys.argv[1]
+    transcripts_file_name = sys.argv[2]
+    output_file_name = sys.argv[3]
+    sample_number = int( sys.argv[4] )
+
+    # Open output file; the input files are opened by name where they are read below.
+    output_file = open( output_file_name, 'w' )
+    
+    # Read transcript IDs from tracking file.
+    transcript_ids = {}
+    for i, line in enumerate( open( tracking_file_name ) ):
+        # Split line into elements. Line format is 
+        # [Transfrag ID] [Locus ID] [Ref Gene ID] [Ref Transcript ID] [Class code] [qJ:<gene_id>|<transcript_id>|<FMI>|<FPKM>|<conf_lo>|<conf_hi>]
+        line = line.rstrip( '\r\n' )
+        elems = line.split( '\t' )
+        
+        # Get transcript info.
+        if sample_number == 1:
+            transcript_info = elems[4]
+        elif sample_number == 2:
+            transcript_info = elems[5]
+        else:
+            # Guard against an unsupported sample number, which would otherwise
+            # leave transcript_info undefined below.
+            sys.exit( "Sample number must be 1 or 2." )
+        if not transcript_info.startswith('q'):
+            # No transcript for this sample.
+            continue
+        
+        # Get and store transcript id.
+        transcript_id = transcript_info.split('|')[1]
+        transcript_id = transcript_id.strip('"')
+        transcript_ids[transcript_id] = ""
+        
+    # Filter transcripts file using transcript_ids
+    for i, line in enumerate( open( transcripts_file_name ) ):
+        # GTF fields: seqname, source, feature, start, end, score, strand, frame, attributes.
+        elems = line.split( '\t' )
+        
+        # Get attributes.
+        attributes_list = elems[8].split(";")
+        attributes = {}
+        for name_value_pair in attributes_list:
+            pair = name_value_pair.strip().split(" ")
+            name = pair[0].strip()
+            if name == '':
+                continue
+            # Need to strip double quote from values
+            value = pair[1].strip(" \"")
+            attributes[name] = value
+            
+        # Get element's transcript id.
+        transcript_id = attributes['transcript_id']
+        if transcript_id in transcript_ids:
+            output_file.write(line)
+        
+    # Clean up.
+    output_file.close()
+    
+if __name__ == "__main__": __main__()
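For reference, a sketch of the per-line extraction the script above performs for sample 1, using a made-up cuffcompare tracking line (field layout as in the script's comment)::

    line = "TCONS_00000001\tXLOC_000001\tGene1|NM_0017\t=\tq1:Gene1|CUFF.1.1|100|8.61|7.1|10.2\t-"
    elems = line.split( '\t' )
    transcript_info = elems[4]                    # sample 1 is the fifth column
    if transcript_info.startswith( 'q' ):         # 'q' marks a transcript present in this sample
        transcript_id = transcript_info.split( '|' )[1].strip( '"' )   # -> CUFF.1.1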
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/filter_transcripts_via_tracking.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/filter_transcripts_via_tracking.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,32 @@
+<tool id="filter_combined_via_tracking" name="Filter Combined Transcripts" version="0.1">
+    <description>using tracking file</description>
+    <command interpreter="python">
+        filter_transcripts_via_tracking.py 
+            $tracking_file
+            $transcripts_file
+            $filtered_transcripts
+            $sample_num
+    </command>
+    <inputs>
+        <param format="gtf" name="transcripts_file" type="data" label="Cufflinks assembled transcripts" help=""/>
+        <param format="tabular" name="tracking_file" type="data" label="Cuffcompare tracking file" help=""/>
+        <param name="sample_num" type="select" label="Sample Number">
+            <option value="1">1</option>
+            <option value="2">2</option>
+        </param>
+    </inputs>
+
+    <outputs>
+        <data format="gtf" name="filtered_transcripts"/>
+    </outputs>
+
+    <tests>
+    </tests>
+
+    <help>
+        Uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts produced by
+        cufflinks). Filtering is done by extracting transcript IDs from the tracking file and then filtering the
+        GTF so that the output GTF contains only transcripts found in the tracking file. Because a tracking file has multiple
+        samples, a sample number is used to filter transcripts for a particular sample.
+    </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/tophat_color_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/tophat_color_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,683 @@
+<tool id="tophat_color" name="Tophat for SOLiD" version="1.0.0">
+    <description>Find splice junctions using RNA-seq data</description>
+    <requirements>
+        <requirement type="package">tophat</requirement>
+    </requirements>
+    <command interpreter="python">
+        tophat_wrapper.py
+            ## Change this to accommodate the number of threads you have available.
+            --num-threads="4"
+
+            ## base- or color-space
+            --color-space
+
+            ## Provide outputs.
+            --junctions-output=$junctions
+            --hits-output=$accepted_hits
+
+            ## Handle reference file.
+            #if $refGenomeSource.genomeSource == "history":
+                --own-file=$refGenomeSource.ownFile
+            #else:
+                --indexes-path="${ filter( lambda x: str( x[0] ) == str( $refGenomeSource.index ), $__app__.tool_data_tables[ 'tophat_indexes_color' ].get_fields() )[0][-1] }"
+            #end if
+
+            ## Are reads single-end or paired?
+            --single-paired=$singlePaired.sPaired
+
+            ## First input file always required.
+            --input1=$input1
+
+            ## Set params based on whether reads are single-end or paired.
+            #if $singlePaired.sPaired == "single":
+                --settings=$singlePaired.sParams.sSettingsType
+                #if $singlePaired.sParams.sSettingsType == "full":
+                    -a $singlePaired.sParams.anchor_length
+                    -m $singlePaired.sParams.splice_mismatches
+                    -i $singlePaired.sParams.min_intron_length
+                    -I $singlePaired.sParams.max_intron_length
+                    -F $singlePaired.sParams.junction_filter
+                    -g $singlePaired.sParams.max_multihits
+                    --min-segment-intron $singlePaired.sParams.min_segment_intron
+                    --max-segment-intron $singlePaired.sParams.max_segment_intron
+                    --seg-mismatches=$singlePaired.sParams.seg_mismatches
+                    --seg-length=$singlePaired.sParams.seg_length
+                    --library-type=$singlePaired.sParams.library_type
+
+                    ## Indel search.
+                    #if $singlePaired.sParams.indel_search.allow_indel_search == "Yes":
+                        --allow-indels
+                        --max-insertion-length $singlePaired.sParams.indel_search.max_insertion_length
+                        --max-deletion-length $singlePaired.sParams.indel_search.max_deletion_length
+                    #end if
+
+                    ## Supplying junctions parameters.
+                    #if $singlePaired.sParams.own_junctions.use_junctions == "Yes":
+                        #if $singlePaired.sParams.own_junctions.gene_model_ann.use_annotations == "Yes":
+                            -G $singlePaired.sParams.own_junctions.gene_model_ann.gene_annotation_model
+                        #end if
+                        #if $singlePaired.sParams.own_junctions.raw_juncs.use_juncs == "Yes":
+                            -j $singlePaired.sParams.own_junctions.raw_juncs.raw_juncs
+                        #end if
+                        ## TODO: No idea why a string cast is necessary, but it is:
+                        #if str($singlePaired.sParams.own_junctions.no_novel_juncs) == "Yes":
+                            --no-novel-juncs
+                        #end if
+                    #end if
+
+                    #if $singlePaired.sParams.closure_search.use_search == "Yes":
+                        --closure-search
+                        --min-closure-exon $singlePaired.sParams.closure_search.min_closure_exon
+                        --min-closure-intron $singlePaired.sParams.closure_search.min_closure_intron
+                        --max-closure-intron $singlePaired.sParams.closure_search.max_closure_intron
+                    #else:
+                        --no-closu
[...]
+                                    when such a pair is supported by a split segment alignment of a long read. The default is 500000.
+  -F/--min-isoform-fraction 0.0-1.0 TopHat filters out junctions supported by too few alignments. Suppose a junction spanning two exons is supported by S reads. Let the average depth of coverage of
+                                    exon A be D, and assume that it is higher than that of exon B. If S / D is less than the minimum isoform fraction, the junction is not reported. A value of zero disables the
+                                    filter. The default is 0.15.
+  -g/--max-multihits INT            Instructs TopHat to allow up to this many alignments to the reference for a given read, and suppresses all alignments for reads with more than this many
+                                    alignments. The default is 40.
+  -G/--GTF [GTF 2.2 file]           Supply TopHat with a list of gene model annotations. TopHat will use the exon records in this file to build a set of known splice junctions for each gene, and will attempt to align reads to these junctions even if they would not normally be covered by the initial mapping.
+  -j/--raw-juncs [juncs file]       Supply TopHat with a list of raw junctions. Junctions are specified one per line, in a tab-delimited format. Records look like: [chrom] [left] [right] [+/-], left and right are zero-based coordinates, and specify the last character of the left sequence to be spliced to the first character of the right sequence, inclusive.
+  --no-novel-juncs                  Only look for junctions indicated in the supplied GFF file. (ignored without -G)
+  --no-closure-search               Disables the mate pair closure-based search for junctions. Currently has no effect - closure search is off by default.
+  --closure-search                  Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (about or less than 50bp)
+  --no-coverage-search              Disables the coverage based search for junctions.
+  --coverage-search                 Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.
+  --microexon-search                With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.
+  --butterfly-search                TopHat will use a slower but potentially more sensitive algorithm to find junctions in addition to its standard search. Consider using this if you expect that your experiment produced a lot of reads from pre-mRNA that fall within the introns of your transcripts.
+  --segment-mismatches              Read segments are mapped independently, allowing up to this many mismatches in each segment alignment. The default is 2.
+  --segment-length                  Each read is cut up into segments, each at least this long. These segments are mapped independently. The default is 25.
+  --min-closure-exon                During closure search for paired end reads, exonic hops in the potential splice graph must be at least this long. The default is 50.
+  --min-closure-intron              The minimum intron length that may be found during closure search. The default is 50.
+  --max-closure-intron              The maximum intron length that may be found during closure search. The default is 5000.
+  --min-coverage-intron             The minimum intron length that may be found during coverage search. The default is 50.
+  --max-coverage-intron             The maximum intron length that may be found during coverage search. The default is 20000.
+  --min-segment-intron              The minimum intron length that may be found during split-segment search. The default is 50.
+  --max-segment-intron              The maximum intron length that may be found during split-segment search. The default is 500000.
+    </help>
+</tool>
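A small numeric sketch (values invented) of the -F/--min-isoform-fraction junction filter described in the help above::

    S = 12                        # reads spanning the junction
    D = 100.0                     # average depth of the more deeply covered exon
    min_isoform_fraction = 0.15   # the default
    keep_junction = ( S / D ) >= min_isoform_fraction   # 0.12 < 0.15, so this junction is filtered out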
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/tophat_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/tophat_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,236 @@
+#!/usr/bin/env python
+
+import optparse, os, shutil, subprocess, sys, tempfile, fileinput
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
+    parser.add_option( '-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data' )
+    parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', help='Junctions output file; format is BED.' )
+    parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', help='Accepted hits output file; format is BAM.' )
+    parser.add_option( '', '--own-file', dest='own_file', help='' )
+    parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
+    parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \
+                                                                                For example, for paired end runs with fragments selected at 300bp, \
+                                                                                where each end is 50bp, you should set -r to be 200. There is no default, \
+                                                                                and this parameter is required for paired end runs.')
+    parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between mate pairs.' )
+    parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length',
+                        help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' )
+    parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' )
+    parser.add_option( '-i', '--min-intron-length', dest='min_intron_length',
+                        help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' )
+    parser.add_option( '-I', '--max-intron-length', dest='max_intron_length',
+                        help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' )
+    parser.add_option( '-F', '--junction_filter', dest='junction_filter', help='Filter out junctions supported by too few alignments (number of reads divided by average depth of coverage)' )
+    parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' )
+    parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' )
+    parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' )
+    parser.add_option( '', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol.' )
+    parser.add_option( '', '--allow-indels', action="store_true", help='Allow indel search. Indel search is disabled by default.' )
+    parser.add_option( '', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.' )
+    parser.add_option( '', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.' )
+
+    # Options for supplying own
[...]
+            if options.gene_model_annotations:
+                opts += ' -G %s' % options.gene_model_annotations
+            if options.raw_juncs:
+                opts += ' -j %s' % options.raw_juncs
+            if options.no_novel_juncs:
+                opts += ' --no-novel-juncs'
+            if options.library_type:
+                opts += ' --library-type %s' % options.library_type
+            if options.allow_indels:
+                # Max options do not work for Tophat v1.2.0, despite documentation to the contrary.
+                opts += ' --allow-indels'
+                #opts += ' --max-insertion-length %i --max-deletion-length %i' % ( int( options.max_insertion_length ), int( options.max_deletion_length ) )
+                # need to warn user of this fact
+                sys.stdout.write( "Max insertion length and max deletion length options don't work in Tophat v1.2.0\n" )
+
+            # Search type options.
+            if options.coverage_search:
+                opts += ' --coverage-search --min-coverage-intron %s --max-coverage-intron %s' % ( options.min_coverage_intron, options.max_coverage_intron )
+            else:
+                opts += ' --no-coverage-search'
+            if options.closure_search:
+                opts += ' --closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s' % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron )
+            else:
+                opts += ' --no-closure-search'
+            if options.microexon_search:
+                opts += ' --microexon-search'
+            if options.single_paired == 'paired':
+                opts += ' --mate-std-dev %s' % options.mate_std_dev
+            if options.seg_mismatches:
+                opts += ' --segment-mismatches %d' % int( options.seg_mismatches )
+            if options.seg_length:
+                opts += ' --segment-length %d' % int( options.seg_length )
+            if options.min_segment_intron:
+                opts += ' --min-segment-intron %d' % int( options.min_segment_intron )
+            if options.max_segment_intron:
+                opts += ' --max-segment-intron %d' % int( options.max_segment_intron )
+            cmd = cmd % ( opts, index_path, reads )
+        except Exception, e:
+            # Clean up temp dirs
+            if os.path.exists( tmp_index_dir ):
+                shutil.rmtree( tmp_index_dir )
+            stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) )
+    #print cmd
+
+    # Run
+    try:
+        tmp_out = tempfile.NamedTemporaryFile().name
+        tmp_stdout = open( tmp_out, 'wb' )
+        tmp_err = tempfile.NamedTemporaryFile().name
+        tmp_stderr = open( tmp_err, 'wb' )
+        proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open( tmp_err, 'rb' )
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stdout.close()
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+
+        # Copy output files from tmp directory to specified files.
+        shutil.copyfile( os.path.join( "tophat_out", "junctions.bed" ), options.junctions_output_file )
+        shutil.copyfile( os.path.join( "tophat_out", "accepted_hits.bam" ), options.accepted_hits_output_file )
+
+        # TODO: look for errors in program output.
+    except Exception, e:
+        stop_err( 'Error in tophat:\n' + str( e ) )
+
+    # Clean up temp dirs
+    if os.path.exists( tmp_index_dir ):
+        shutil.rmtree( tmp_index_dir )
+
+if __name__=="__main__": __main__()
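The -r/--mate-inner-dist help above implies a simple calculation for paired-end runs; a sketch with the numbers from the help text::

    fragment_length = 300                                  # size-selected fragment
    read_length = 50                                       # each mate
    mate_inner_dist = fragment_length - 2 * read_length    # = 200, the value to pass as -r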
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/tophat_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/tophat_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,680 @@
+<tool id="tophat" name="Tophat for Illumina" version="1.5.0">
+    <description>Find splice junctions using RNA-seq data</description>
+    <version_command>tophat --version</version_command>
+    <requirements>
+        <requirement type="package">tophat</requirement>
+    </requirements>
+    <command interpreter="python">
+        tophat_wrapper.py
+            ## Change this to accommodate the number of threads you have available.
+            --num-threads="4"
+
+            ## Provide outputs.
+            --junctions-output=$junctions
+            --hits-output=$accepted_hits
+
+            ## Handle reference file.
+            #if $refGenomeSource.genomeSource == "history":
+                --own-file=$refGenomeSource.ownFile
+            #else:
+                --indexes-path="${ filter( lambda x: str( x[0] ) == str( $refGenomeSource.index ), $__app__.tool_data_tables[ 'tophat_indexes' ].get_fields() )[0][-1] }"
+            #end if
+
+            ## Are reads single-end or paired?
+            --single-paired=$singlePaired.sPaired
+
+            ## First input file always required.
+            --input1=$input1
+
+            ## Set params based on whether reads are single-end or paired.
+            #if $singlePaired.sPaired == "single":
+                --settings=$singlePaired.sParams.sSettingsType
+                #if $singlePaired.sParams.sSettingsType == "full":
+                    -a $singlePaired.sParams.anchor_length
+                    -m $singlePaired.sParams.splice_mismatches
+                    -i $singlePaired.sParams.min_intron_length
+                    -I $singlePaired.sParams.max_intron_length
+                    -F $singlePaired.sParams.junction_filter
+                    -g $singlePaired.sParams.max_multihits
+                    --min-segment-intron $singlePaired.sParams.min_segment_intron
+                    --max-segment-intron $singlePaired.sParams.max_segment_intron
+                    --seg-mismatches=$singlePaired.sParams.seg_mismatches
+                    --seg-length=$singlePaired.sParams.seg_length
+                    --library-type=$singlePaired.sParams.library_type
+
+                    ## Indel search.
+                    #if $singlePaired.sParams.indel_search.allow_indel_search == "Yes":
+                        --allow-indels
+                        --max-insertion-length $singlePaired.sParams.indel_search.max_insertion_length
+                        --max-deletion-length $singlePaired.sParams.indel_search.max_deletion_length
+                    #end if
+
+                    ## Supplying junctions parameters.
+                    #if $singlePaired.sParams.own_junctions.use_junctions == "Yes":
+                        #if $singlePaired.sParams.own_junctions.gene_model_ann.use_annotations == "Yes":
+                            -G $singlePaired.sParams.own_junctions.gene_model_ann.gene_annotation_model
+                        #end if
+                        #if $singlePaired.sParams.own_junctions.raw_juncs.use_juncs == "Yes":
+                            -j $singlePaired.sParams.own_junctions.raw_juncs.raw_juncs
+                        #end if
+                        ## TODO: No idea why a string cast is necessary, but it is:
+                        #if str($singlePaired.sParams.own_junctions.no_novel_juncs) == "Yes":
+                            --no-novel-juncs
+                        #end if
+                    #end if
+
+                    #if $singlePaired.sParams.closure_search.use_search == "Yes":
+                        --closure-search
+                        --min-closure-exon $singlePaired.sParams.closure_search.min_closure_exon
+                        --min-closure-intron $singlePaired.sParams.closure_search.min_closure_intron
+                        --max-closure-intron $singlePaired.sParams.closure_search.max_closure_intron
+                    #else:
+                        --no-closure-search
[...]
+                                    when such a pair is supported by a split segment alignment of a long read. The default is 500000.
+  -F/--min-isoform-fraction 0.0-1.0 TopHat filters out junctions supported by too few alignments. Suppose a junction spanning two exons is supported by S reads. Let the average depth of coverage of
+                                    exon A be D, and assume that it is higher than that of exon B. If S / D is less than the minimum isoform fraction, the junction is not reported. A value of zero disables the
+                                    filter. The default is 0.15.
+  -g/--max-multihits INT            Instructs TopHat to allow up to this many alignments to the reference for a given read, and suppresses all alignments for reads with more than this many
+                                    alignments. The default is 40.
+  -G/--GTF [GTF 2.2 file]           Supply TopHat with a list of gene model annotations. TopHat will use the exon records in this file to build a set of known splice junctions for each gene, and will attempt to align reads to these junctions even if they would not normally be covered by the initial mapping.
+  -j/--raw-juncs [juncs file]       Supply TopHat with a list of raw junctions. Junctions are specified one per line, in a tab-delimited format. Records look like: [chrom] [left] [right] [+/-], left and right are zero-based coordinates, and specify the last character of the left sequence to be spliced to the first character of the right sequence, inclusive.
+  --no-novel-juncs                  Only look for junctions indicated in the supplied GFF file. (ignored without -G)
+  --no-closure-search               Disables the mate pair closure-based search for junctions. Currently has no effect - closure search is off by default.
+  --closure-search                  Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (about or less than 50bp)
+  --no-coverage-search              Disables the coverage based search for junctions.
+  --coverage-search                 Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.
+  --microexon-search                With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.
+  --butterfly-search                TopHat will use a slower but potentially more sensitive algorithm to find junctions in addition to its standard search. Consider using this if you expect that your experiment produced a lot of reads from pre-mRNA that fall within the introns of your transcripts.
+  --segment-mismatches              Read segments are mapped independently, allowing up to this many mismatches in each segment alignment. The default is 2.
+  --segment-length                  Each read is cut up into segments, each at least this long. These segments are mapped independently. The default is 25.
+  --min-closure-exon                During closure search for paired end reads, exonic hops in the potential splice graph must be at least this long. The default is 50.
+  --min-closure-intron              The minimum intron length that may be found during closure search. The default is 50.
+  --max-closure-intron              The maximum intron length that may be found during closure search. The default is 5000.
+  --min-coverage-intron             The minimum intron length that may be found during coverage search. The default is 50.
+  --max-coverage-intron             The maximum intron length that may be found during coverage search. The default is 20000.
+  --min-segment-intron              The minimum intron length that may be found during split-segment search. The default is 50.
+  --max-segment-intron              The maximum intron length that may be found during split-segment search. The default is 500000.
+    </help>
+</tool>
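A sketch (coordinates invented) of a single record in the -j/--raw-juncs format described in the help above::

    # [chrom] [left] [right] [+/-]: zero-based coordinates; left is the last base of
    # the upstream sequence, right the first base of the downstream sequence.
    juncs_line = "%s\t%d\t%d\t%s" % ( "chr1", 1000, 2000, "+" )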
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_rna/trinity_all.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/trinity_all.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,89 @@
+<tool id="trinity_all" name="Trinity" version="0.0.1">
+    <!-- Run all steps of Trinity (Inchworm, Chrysalis, and Butterfly) in a single step. Wrapper status is alpha. -->
+    <description>De novo assembly of RNA-Seq data</description>
+    <requirements>
+        <requirement type="package">trinity</requirement>
+    </requirements>
+    <command>
+        Trinity.pl 
+        
+        ## Additional parameters.
+        #if $additional_params.use_additional == "yes":
+            --min_contig_length $additional_params.min_contig_length
+        #end if
+        
+        ## Inputs.
+        #if $inputs.paired_or_single == "paired":
+            --left $inputs.left_input --right $inputs.right_input
+            #if $inputs.left_input.ext == 'fa':
+                --seqType fa
+            #else:
+                --seqType fq
+            #end if
+            #if $inputs.library_type != 'None':
+                --SS_lib_type $inputs.library_type
+            #end if
+        #else:
+            --single $inputs.input
+            #if $inputs.input.ext == 'fa':
+                --seqType fa
+            #else:
+                --seqType fq
+            #end if
+            #if $inputs.library_type != 'None':
+                --SS_lib_type $inputs.library_type
+            #end if
+        #end if
+        
+        ## CPU and butterfly options.
+        --CPU 4 --run_butterfly --bfly_opts "-V 10 --stderr" > $trinity_log 2>&amp;1 
+    </command>
+    <inputs>
+        <conditional name="inputs">
+            <param name="paired_or_single" type="select" label="Paired or Single-end data?">
+                <option value="paired">Paired</option>
+                <option value="single">Single</option>
+            </param>
+            <when value="paired">
+                <param format="fasta,fastq" name="left_input" type="data" label="Left/Forward strand reads" help=""/>
+                <param format="fasta,fastq" name="right_input" type="data" label="Right/Reverse strand reads" help=""/>
+                <param name="library_type" type="select" label="Strand-specific Library Type">
+                    <option value="None">None</option>
+                    <option value="FR">FR</option>
+                    <option value="RF">RF</option>
+                </param>
+                <param name="paired_fragment_length" type="integer" value="300" min="1" label="Paired Fragment Length" help="Maximum length expected between fragment pairs"/>
+            </when>
+            <when value="single">
+                <param format="fasta,fastq" name="input" type="data" label="Single-end reads" help=""/>
+                <param name="library_type" type="select" label="Strand-specific Library Type">
+                    <option value="None">None</option>
+                    <option value="F">F</option>
+                    <option value="R">R</option>
+                </param>
+            </when>
+        </conditional>
+        <conditional name="additional_params">
+            <param name="use_additional" type="select" label="Use Additional Params?">
+                <option value="no">No</option>
+                <option value="yes">Yes</option>
+            </param>
+            <when value="no">
+            </when>
+            <when value="yes">            
+                <param name="min_contig_length" type="integer" value="200" min="1" label="Minimum Contig Length" help=""/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format="txt" name="trinity_log" label="${tool.name} on ${on_string}: log" />
+        <data format="fasta" name="assembled_transcripts" label="${tool.name} on ${on_string}: Assembled Transcripts" from_work_dir="trinity_out_dir/Trinity.fasta"/>
+    </outputs>
+    <tests>
+    </tests>
+    <help>
+        Trinity is a de novo transcript assembler that uses RNA-seq data as input. This tool runs all Trinity_ commands--Inchworm, Chrysalis, and Butterfly--in a single pass.
+        
+        .. _Trinity: http://trinityrnaseq.sourceforge.net
+    </help>
+</tool>
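For reference, with paired fastq inputs, no strand-specific library type, and no additional params, the command template above renders along these lines (file names illustrative)::

    Trinity.pl --left left.fq --right right.fq --seqType fq --CPU 4 --run_butterfly --bfly_opts "-V 10 --stderr" > trinity.log 2>&1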
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_simulation/ngs_simulation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_simulation/ngs_simulation.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,280 @@
+#!/usr/bin/env python
+
+"""
+Runs Ben's simulation.
+
+usage: %prog [options]
+   -i, --input=i: Input genome (FASTA format)
+   -g, --genome=g: If built-in, the genome being used
+   -l, --read_len=l: Read length
+   -c, --avg_coverage=c: Average coverage
+   -e, --error_rate=e: Error rate (0-1)
+   -n, --num_sims=n: Number of simulations to run
+   -p, --polymorphism=p: Frequency/ies for minor allele (comma-separated list of 0-1)
+   -d, --detection_thresh=d: Detection thresholds (comma-separated list of 0-1)
+   -x, --output_png=x: Plot output
+   -s, --summary_out=s: Whether or not to output a file with summary of all simulations
+   -m, --output_summary=m: File name for output summary of all simulations
+   -f, --new_file_path=f: Directory for summary output files
+
+"""
+# removed output of all simulation results on request (not working)
+#   -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds)
+#   -o, --output=o: Base name for summary output for each run
+
+from rpy import *
+import os
+import random, sys, tempfile
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    # validate parameters
+    error = ''
+    try:
+        read_len = int( options.read_len )
+        if read_len <= 0:
+            raise Exception, ' greater than 0'
+    except Exception, e:
+        error = ': %s' % str( e )
+    if error:
+        stop_err( 'Make sure your read length is an integer value%s' % error )
+    error = ''
+    try:
+        avg_coverage = int( options.avg_coverage )
+        if avg_coverage <= 0:
+            raise Exception, ' greater than 0'
+    except Exception, e:
+        error = ': %s' % str( e )
+    if error:
+        stop_err( 'Make sure your average coverage is an integer value%s' % error )
+    error = ''
+    try:
+        error_rate = float( options.error_rate )
+        if error_rate >= 1.0:
+            error_rate = 10 ** ( -error_rate / 10.0 )
+        elif error_rate < 0:
+            raise Exception, ' between 0 and 1'
+    except Exception, e:
+        error = ': %s' % str( e )
+    if error:
+        stop_err( 'Make sure the error rate is a decimal value%s or the quality score is at least 1' % error )
+    try:
+        num_sims = int( options.num_sims )
+    except TypeError, e:
+        stop_err( 'Make sure the number of simulations is an integer value: %s' % str( e ) )
+    if len( options.polymorphism ) > 0:
+        polymorphisms = [ float( p ) for p in options.polymorphism.split( ',' ) ]
+    else:
+        stop_err( 'Select at least one polymorphism value to use' )
+    if len( options.detection_thresh ) > 0:
+        detection_threshes = [ float( dt ) for dt in options.detection_thresh.split( ',' ) ]
+    else:
+        stop_err( 'Select at least one detection threshold to use' )
+
+    # mutation dictionaries
+    hp_dict = { 'A':'G', 'G':'A', 'C':'T', 'T':'C', 'N':'N' } # heteroplasmy dictionary
+    mt_dict = { 'A':'C', 'C':'A', 'G':'T', 'T':'G', 'N':'N' } # misread dictionary
+
+    # read fasta file to seq string
+    all_lines = open( options.input, 'rb' ).readlines()
+    seq = ''
+    for line in all_lines:
+        line = line.rstrip()
+        if line.startswith('>'):
+            pass
+        else:
+            seq += line.upper()
+    seq_len = len( seq )
+
+    # output file name template
+# removed output of all simulation results on request (not working)
+#    if options.sim_results == "true":
+#        out_name_template = os.path.join( options.new_file_path, 'primary_output%s_' + options.output + '_visible_tabular' )
+#    else:
+#        out_name_template = tempfile.NamedTemporaryFile().name + '_%s'
+    out_name_template = tempfile.NamedTempora
[...]
count += 1
+            # close output up
+            output.close()
+
+    # Parameters (heteroplasmy, error threshold, colours)
+    r( '''
+    het=c(%s)
+    err=c(%s)
+    grade = (0:32)/32
+    hues = rev(gray(grade))
+    ''' % ( ','.join( [ str( p ) for p in polymorphisms ] ), ','.join( [ str( d ) for d in detection_threshes ] ) ) )
+
+    # Suppress warnings
+    r( 'options(warn=-1)' )
+
+    # Create allsum (for FP) and allneg (for FN) objects
+    r( 'allsum <- data.frame()' )
+    for polymorphism in polymorphisms:
+        for detection_thresh in detection_threshes:
+            output = outputs[ polymorphism ][ detection_thresh ]
+            cmd = '''
+                  ngsum = read.delim('%s', header=T)
+                  ngsum$fprate <- ngsum$FP/%s
+                  ngsum$hetcol <- %s
+                  ngsum$errcol <- %s
+                  allsum <- rbind(allsum, ngsum)
+                  ''' % ( output, seq_len, polymorphism, detection_thresh )
+            r( cmd )
+
+    if os.path.getsize( output ) == 0:
+        for p in outputs.keys():
+            for d in outputs[ p ].keys():
+                sys.stderr.write(outputs[ p ][ d ] + ' '+str( os.path.getsize( outputs[ p ][ d ] ) )+'\n')
+
+    if options.summary_out == "true":
+        r( 'write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary )
+
+    # Summary objects (these could be printed)
+    r( '''
+    tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean)
+    tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean)
+    cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t')
+    write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t')
+    cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t')
+    write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t')
+    ''' % tuple( [ options.output_summary ] * 4 ) )
+
+    # Setup graphs
+    #pdf(paste(prefix,'_jointgraph.pdf',sep=''), 15, 10)
+    r( '''
+    png('%s', width=800, height=500, units='px', res=250)
+    layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10))
+    ''' % options.output_png )
+
+    # Main title
+    genome = ''
+    if options.genome:
+        genome = '%s: ' % options.genome
+    r( '''
+    par(mar=c(0,0,0,0))
+    plot(1, type='n', axes=F, xlab='', ylab='')
+    text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7)
+    ''' % ( genome, options.num_sims, options.avg_coverage ) )
+
+    # False positive boxplot
+    r( '''
+    par(mar=c(5,4,2,2), las=1, cex=0.35)
+    boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85)
+    title(main='False Positives', xlab='false positive rate', ylab='')
+    ''' )
+
+    # False negative heatmap (note zlim command!)
+    num_polys = len( polymorphisms )
+    num_dets = len( detection_threshes )
+    r( '''
+    par(mar=c(5,4,2,1), las=1, cex=0.35)
+    image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1)
+    axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i')
+    axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85)
+    title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold')
+    ''' % ( num_polys, num_dets, num_polys, num_dets ) )
+
+    # Scale alongside
+    r( '''
+    par(mar=c(2,2,2,3), las=1)
+    image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85)
+    title(main='Key', cex=0.35)
+    mtext('false negative rate', side=1, cex=0.35)
+    ''' )
+
+    # Close graphics
+    r( '''
+    layout(1)
+    dev.off()
+    ''' )
+
+    # Tidy up
+#    r( 'rm(folder,prefix,sim,cov,het,err,grade,hues,i,j,ngsum)' )
+
+if __name__ == "__main__" : __main__()
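A quick sketch of the error-rate handling above: values of 1 or more are treated as Phred quality scores and converted to error probabilities, while values below 1 are used directly::

    def to_error_rate( value ):
        if value >= 1.0:
            # Phred Q -> p = 10^(-Q/10); e.g. Q20 -> 0.01, Q30 -> 0.001
            return 10 ** ( -value / 10.0 )
        return value   # already an error probability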
diff -r 000000000000 -r 9071e359b9a3 tools/ngs_simulation/ngs_simulation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_simulation/ngs_simulation.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,217 @@
+<tool id="ngs_simulation" name="Simulate" version="1.0.0">
+<!--<tool id="ngs_simulation" name="Simulate" force_history_refresh="True" version="1.0.0">-->
+  <description>Illumina runs</description>
+  <command interpreter="python">
+    ngs_simulation.py
+      #if $in_type.input_type == "built-in"
+        --input="${ filter( lambda x: str( x[0] ) == str( $in_type.genome ), $__app__.tool_data_tables[ 'ngs_sim_fasta' ].get_fields() )[0][-1] }"
+        --genome=$in_type.genome
+      #else
+        --input=$in_type.input1
+      #end if
+      --read_len=$read_len
+      --avg_coverage=$avg_coverage
+      --error_rate=$error_rate
+      --num_sims=$num_sims
+      --polymorphism=$polymorphism
+      --detection_thresh=$detection_thresh
+      --output_png=$output_png
+      --summary_out=$summary_out
+      --output_summary=$output_summary
+      --new_file_path=$__new_file_path__
+  </command>
+<!-- If we want to include the all-simulation-results file:
+        sim_results=$sim_results
+        output=$output.id
+-->
+  <inputs>
+    <conditional name="in_type">
+      <param name="input_type" type="select" label="Use a built-in FASTA file or one from the history?">
+        <option value="built-in">Built-in</option>
+        <option value="history">History file</option>
+      </param>
+      <when value="built-in">
+        <param name="genome" type="select" label="Select a built-in genome" help="if your genome of interest is not listed - contact Galaxy team">
+          <options from_data_table="ngs_sim_fasta" />
+        </param>
+      </when>
+      <when value="history">
+        <param name="input1" type="data" format="fasta" label="Input genome (FASTA format)" />
+      </when>
+    </conditional>
+    <param name="read_len" type="integer" value="76" label="Read length" />
+    <param name="avg_coverage" type="integer" value="200" label="Average coverage" />
+    <param name="error_rate" type="float" value="0.001" label="Error rate or quality score" help="Quality score if integer 1 or greater; error rate if between 0 and 1" />
+    <param name="num_sims" type="integer" value="100" label="The number of simulations to run" />
+    <param name="polymorphism" type="select" multiple="true" label="Frequency/ies for minor allele">
+      <option value="0.001">0.001</option>
+      <option value="0.002">0.002</option>
+      <option value="0.003">0.003</option>
+      <option value="0.004">0.004</option>
+      <option value="0.005">0.005</option>
+      <option value="0.006">0.006</option>
+      <option value="0.007">0.007</option>
+      <option value="0.008">0.008</option>
+      <option value="0.009">0.009</option>
+      <option value="0.01">0.01</option>
+      <option value="0.02">0.02</option>
+      <option value="0.03">0.03</option>
+      <option value="0.04">0.04</option>
+      <option value="0.05">0.05</option>
+      <option value="0.06">0.06</option>
+      <option value="0.07">0.07</option>
+      <option value="0.08">0.08</option>
+      <option value="0.09">0.09</option>
+      <option value="0.1">0.1</option>
+      <option value="0.2">0.2</option>
+      <option value="0.3">0.3</option>
+      <option value="0.4">0.4</option>
+      <option value="0.5">0.5</option>
+      <option value="0.6">0.6</option>
+      <option value="0.7">0.7</option>
+      <option value="0.8">0.8</option>
+      <option value="0.9">0.9</option>
+      <option value="1.0">1.0</option>
+    </param>
+    <param name="detection_thresh" type="select" multiple="true" label="Detection thresholds">
+      <option value="0.001">0.001</option>
+      <option value="0.002">0.002</option>
+      <option value="0.003">0.003</option>
+      <option value="0.004">0.004</option>
+      <option value="0.005">0.005</option>
+      <option value="0.006">0.006</option>
+      <option value="0.007">0.007</option>
+      <option value="0.008">0.008</option>
+      <option value="0.009">0.009</option>
+      <option value="0.01">0.0
[...]
e listed output files.
+    -->
+    <!--
+    <test>
+      <param name="input_type" value="history" />
+      <param name="input1" value="ngs_simulation_in1.fasta" ftype="fasta" />
+      <param name="read_len" value="76" />
+      <param name="avg_coverage" value="200" />
+      <param name="error_rate" value="0.001" />
+      <param name="num_sims" value="25" />
+      <param name="polymorphism" value="0.02,0.04,0.1" />
+      <param name="detection_thresh" value="0.01,0.02" />
+      <param name="summary_out" value="true" />
+      <output name="output_png" file="ngs_simulation_out1.png" />
+      <output name="output_summary" file="ngs_simulation_out2.tabular" />
+    </test>
+    <test>
+      <param name="input_type" value="built-in" />
+      <param name="genome" value="pUC18" />
+      <param name="read_len" value="50" />
+      <param name="avg_coverage" value="150" />
+      <param name="error_rate" value="0.005" />
+      <param name="num_sims" value="25" />
+      <param name="polymorphism" value="0.001,0.005" />
+      <param name="detection_thresh" value="0.001,0.002" />
+      <param name="summary_out" value="false" />
+      <output name="output_png" file="ngs_simulation_out3.png" />
+    </test>
+    -->
+  </tests>
+  <help>
+
+**What it does**
+
+This tool simulates an Illumina run and provides plots of false positives and false negatives. It allows for a range of simulation parameters to be set. Note that this simulation sets only one (randomly chosen) position in the genome as polymorphic, according to the value specified. Superimposed on this are "sequencing errors", which are uniformly (and randomly) distributed. Polymorphisms are assigned using the detection threshold, so if the detection threshold is set to the same as the minor allele frequency, the expected false negative rate is 50%.
+
+**Parameter list**
+
+These are the parameters that should be set for the simulation::
+
+  Read length (which is the same for all reads)
+  Average Coverage
+  Frequency for Minor Allele
+  Sequencing Error Rate
+  Detection Threshold
+  Number of Simulations
+
+You also should choose to use either a built-in genome or supply your own FASTA file.
+
+**Output**
+
+There are one or two output files. The first is a png that contains two different plots and is always generated. The second is optional and is a text file with some summary information about the simulations that were run. Below are some example outputs for a 10-simulation run on phiX with the default settings::

+  Read length                    76
+  Average coverage               200
+  Error rate/quality score       0.001
+  Number of simulations          100
+  Frequencies for minor allele   0.002
+                                 0.004
+  Detection thresholds           0.003
+                                 0.005
+                                 0.007
+  Include summary file           Yes
+
+Plot output (png):
+
+.. image:: ./static/images/ngs_simulation.png
+
+Summary output (txt)::
+
+        FP              FN       GENOMESIZE.5386      fprate          hetcol          errcol
+  Min.   : 71.0   Min.   :0.0    Mode:logical     Min.   :0.01318         Min.   :0.004   Min.   :0.007
+  1st Qu.:86.0    1st Qu.:1.0    NA's:10          1st Qu.:0.01597         1st Qu.:0.004   1st Qu.:0.007
+  Median :92.5    Median :1.0    NA       Median :0.01717         Median :0.004   Median :0.007
+  Mean   :93.6    Mean   :0.9    NA       Mean   :0.01738         Mean   :0.004   Mean   :0.007
+  3rd Qu.:100.8   3rd Qu.:1.0    NA       3rd Qu.:0.01871         3rd Qu.:0.004   3rd Qu.:0.007
+  Max.   :123.0   Max.   :1.0    NA       Max.   :0.02284         Max.   :0.004   Max.   :0.007
+
+  False Positive Rate Summary
+          0.003   0.005   0.007
+  0.001   0.17711 0.10854 0.01673
+  0.009   0.18049 0.10791 0.01738
+
+  False Negative Rate Summary
+          0.003   0.005     0.007
+  0.001   1.0     0.8       1.0
+  0.009   0.4     0.7       0.9
+
+
+  </help>
+</tool>
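The 50% false-negative claim in the help above follows from sampling noise around the threshold; a simplified sketch (binomial model, not the tool's exact sampling code)::

    import random

    def site_detected( coverage=200, maf=0.004, thresh=0.004 ):
        # Reads carrying the minor allele ~ Binomial(coverage, maf); the variant
        # is called only if the observed allele frequency reaches the threshold.
        minor = sum( 1 for _ in range( coverage ) if random.random() < maf )
        return minor / float( coverage ) >= thresh

    # With thresh == maf, the observed frequency lands below the threshold in
    # roughly half of the simulations, giving the ~50% false negative rate.
    fn_rate = sum( 1 for _ in range( 10000 ) if not site_detected() ) / 10000.0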
diff -r 000000000000 -r 9071e359b9a3 tools/peak_calling/ccat_2_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/peak_calling/ccat_2_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,123 @@
+<tool id="peakcalling_ccat2" name="CCAT" version="0.0.1">
+  <description>Control-based ChIP-seq Analysis Tool</description>
+  <command interpreter="python">ccat_wrapper.py '$input_tag_file' '$input_control_file' '$chromInfo' 
+  #if str( $options_type[ 'options_type_selector' ] ) == 'advanced':
+  '$input_advanced_config_file' 
+  #else:
+  '${ options_type.input_config_file.fields.path }'
+  #end if
+  'CCAT in Galaxy' 
+  '$output_peak_file' '$output_region_file' '$output_top_file' '$output_log_file'</command>
+  <requirements>
+    <requirement type="binary">CCAT</requirement>
+  </requirements>
+  <inputs>
+    <param name="input_tag_file" type="data" format="bed" label="ChIP-Seq Tag File" >
+      <validator type="unspecified_build" />
+    </param>
+    <param name="input_control_file" type="data" format="bed" label="ChIP-Seq Control File" >
+      <validator type="unspecified_build" />
+    </param>
+    <conditional name="options_type">
+      <param name="options_type_selector" type="select" label="Advanced Options">
+        <option value="basic" selected="True">Hide Advanced Options</option>
+        <option value="advanced">Show Advanced Options</option>
+      </param>
+      <when value="basic">
+        <param name="input_config_file" type="select" label="Select a pre-defined configuration file">
+          <options from_data_table="ccat_configurations">
+            <validator type="no_options" message="No configurations are available"/>
+          </options>
+        </param>
+      </when>
+      <when value="advanced">
+        <param name="fragment_size" type="integer" label="Length of DNA fragment" value="200"/>
+        <param name="sliding_window_size" type="integer" label="Sliding window size" value="500" help="transcription factor binding default: 300; histone modifications default: 500"/>
+        <param name="moving_step" type="integer" label="Step of sliding window" value="50" help="transcription factor binding default: 10; histone modifications default: 50"/>
+        <param name="is_strand_sensitive_mode" type="select" label="isStrandSensitiveMode" >
+          <option value="1">Transition from sense strand to anti-sense strand</option>
+          <option value="0" selected="True">Local maximum of read-enrichment profile</option>
+        </param>
+        <param name="min_count" type="integer" label="Minimum number of read counts at the peak" value="4"/>
+        <param name="output_num" type="integer" label="Number of peaks reported in top peak file" value="100000"/>
+        <param name="random_seed" type="integer" label="Random Seed" value="123456"/>
+        <param name="min_score" type="float" label="Minimum score of normalized difference" value="3.0"/>
+        <param name="bootstrap_pass" type="integer" label="Number of passes in the bootstrapping process" value="50"/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data name="output_peak_file" format="interval" label="${tool.name} on ${on_string} (peaks)">
+      <actions>
+        <action type="metadata" name="chromCol" default="1"/>
+        <action type="metadata" name="startCol" default="3"/>
+        <action type="metadata" name="endCol" default="4"/>
+      </actions>
+    </data>
+    <data name="output_region_file" format="interval" label="${tool.name} on ${on_string} (regions)">
+      <actions>
+        <action type="metadata" name="chromCol" default="1"/>
+        <action type="metadata" name="startCol" default="3"/>
+        <action type="metadata" name="endCol" default="4"/>
+      </actions>
+    </data>
+    <data name="output_top_file" format="interval" label="${tool.name} on ${on_string} (top peaks)">
+      <actions>
+        <action type="metadata" name="chromCol" default="1"/>
+        <action type="metadata" name="startCol" default="3"/>
+        <action type="metadata" name="endCol" default="4"/>
+      </actions>
+    </data>
+    <data name="output_log_file" format="txt" label="${tool.name} on ${on_string} (log)"/>
+  </outputs>
+  <configfiles>
+    <configfile name="input_advanced_config_file">#if str( $options_type['options_type_selector' ] ) == 'advanced':
+fragmentSize ${options_type[ 'fragment_size' ]}
+slidingWinSize ${options_type[ 'sliding_window_size' ]}
+movingStep ${options_type[ 'moving_step' ]}
+isStrandSensitiveMode ${options_type[ 'is_strand_sensitive_mode' ]}
+minCount ${options_type[ 'min_count' ]}
+outputNum ${options_type[ 'output_num' ]}
+randomSeed ${options_type[ 'random_seed' ]}
+minScore ${options_type[ 'min_score' ]}
+bootstrapPass ${options_type[ 'bootstrap_pass' ]}
+#end if</configfile>
+  </configfiles>
+  <tests>
+    <test>
+      <param name="input_tag_file" value="chipseq_enriched.bed.gz" ftype="bed" dbkey="hg18" />
+      <param name="input_control_file" value="chipseq_input.bed.gz" ftype="bed" dbkey="hg18" />
+      <param name="options_type_selector" value="basic" />
+      <param name="input_config_file" value="ccat_2.0_histone_config" />
+      <output name="output_peak_file" file="peakcalling_ccat2/ccat2_test_peak_out_1.interval" />
+      <output name="output_region_file" file="peakcalling_ccat2/ccat2_test_region_out_1.interval" />
+      <output name="output_top_file" file="peakcalling_ccat2/ccat2_test_top_out_1.interval" />
+      <output name="output_log_file" file="peakcalling_ccat2/ccat2_test_log_out_1.interval" />
+    </test>
+    <test>
+      <param name="input_tag_file" value="chipseq_enriched.bed.gz" ftype="bed" dbkey="hg18" />
+      <param name="input_control_file" value="chipseq_input.bed.gz" ftype="bed" dbkey="hg18" />
+      <param name="options_type_selector" value="advanced" />
+      <param name="fragment_size" value="200" />
+      <param name="sliding_window_size" value="500" />
+      <param name="moving_step" value="50" />
+      <param name="is_strand_sensitive_mode" value="0" />
+      <param name="min_count" value="4" />
+      <param name="output_num" value="100000" />
+      <param name="random_seed" value="123456" />
+      <param name="min_score" value="3.0" />
+      <param name="bootstrap_pass" value="50" />
+      <output name="output_peak_file" file="peakcalling_ccat2/ccat2_test_peak_out_1.interval" />
+      <output name="output_region_file" file="peakcalling_ccat2/ccat2_test_region_out_1.interval" />
+      <output name="output_top_file" file="peakcalling_ccat2/ccat2_test_top_out_1.interval" />
+      <output name="output_log_file" file="peakcalling_ccat2/ccat2_test_log_out_1.interval" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool allows ChIP-seq peak/region calling using CCAT.
+
+View the original CCAT documentation: http://cmb.gis.a-star.edu.sg/ChIPSeq/paperCCAT.htm.
+  </help>
+</tool>
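When "Show Advanced Options" is selected, the configfile template above is rendered into a plain-text configuration that is handed to the CCAT binary. As a sketch, with the default advanced values shown above, the rendered file would contain whitespace-separated key/value pairs like::

  fragmentSize 200
  slidingWinSize 500
  movingStep 50
  isStrandSensitiveMode 0
  minCount 4
  outputNum 100000
  randomSeed 123456
  minScore 3.0
  bootstrapPass 50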
diff -r 000000000000 -r 9071e359b9a3 tools/peak_calling/ccat_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/peak_calling/ccat_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,40 @@
+import sys, subprocess, tempfile, shutil, os.path
+
+CCAT_BINARY = "CCAT"
+
+def get_top_count( filename ):
+    for line in open( filename ):
+        if line.startswith( 'outputNum' ):
+            return int( line.split()[-1].strip() ) 
+
+def stop_err( tmp_dir, exception ):
+    print >> sys.stderr, "Error running CCAT."
+    shutil.rmtree( tmp_dir ) #some error has occurred, provide info and remove possibly non-empty temp directory
+    raise exception
+
+def main():
+    input_tag_file = sys.argv[1]
+    input_control_file = sys.argv[2]
+    chrom_info_file = sys.argv[3]
+    input_config_file = sys.argv[4]
+    project_name = sys.argv[5]
+    output_peak_file = sys.argv[6]
+    output_region_file = sys.argv[7]
+    output_top_file = sys.argv[8]
+    output_log_file = sys.argv[9]
+    
+    tmp_dir = tempfile.mkdtemp()
+    try:
+        proc = subprocess.Popen( args="%s %s > %s" % ( CCAT_BINARY, " ".join( map( lambda x: "'%s'" % x, [ input_tag_file, input_control_file, chrom_info_file, input_config_file, project_name ] ) ), output_log_file ), shell=True, cwd=tmp_dir )
+        proc.wait()
+        if proc.returncode:
+            raise Exception( "Error code: %i" % proc.returncode )
+        output_num = get_top_count( input_config_file )
+        shutil.move( os.path.join( tmp_dir, "%s.significant.peak" % project_name ), output_peak_file )
+        shutil.move( os.path.join( tmp_dir, "%s.significant.region" % project_name ), output_region_file )
+        shutil.move( os.path.join( tmp_dir, "%s.top%i.peak" % ( project_name, output_num ) ), output_top_file )
+    except Exception, e:
+        return stop_err( tmp_dir, e )
+    os.rmdir( tmp_dir ) #clean up empty temp working directory
+
+if __name__ == "__main__": main()
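The wrapper above builds a single shell string and relies on manually single-quoting each argument. A minimal sketch of the equivalent call with an argument list, which sidesteps shell quoting entirely (the names reuse the wrapper's own variables; redirecting stdout to the log file replaces the "> log" in the shell string)::

  import subprocess

  # equivalent to: CCAT 'tag' 'control' 'chromInfo' 'config' 'project' > log
  with open( output_log_file, 'w' ) as log:
      proc = subprocess.Popen( [ CCAT_BINARY, input_tag_file, input_control_file,
                                 chrom_info_file, input_config_file, project_name ],
                               stdout=log, cwd=tmp_dir )
      proc.wait()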
diff -r 000000000000 -r 9071e359b9a3 tools/peak_calling/ccat_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/peak_calling/ccat_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,139 @@
+<tool id="peakcalling_ccat" name="CCAT" version="0.0.1">
+  <description>Control-based ChIP-seq Analysis Tool</description>
+  <command interpreter="python">ccat_wrapper.py '$input_tag_file' '$input_control_file' '$chromInfo'
+  #if str( $options_type[ 'options_type_selector' ] ) == 'advanced':
+  '$input_advanced_config_file'
+  #else:
+  '${ options_type.input_config_file.fields.path }'
+  #end if
+  'CCAT in Galaxy'
+  '$output_peak_file' '$output_region_file' '$output_top_file' '$output_log_file'</command>
+  <requirements>
+    <requirement type="binary" version="3.0">CCAT</requirement>
+  </requirements>
+  <inputs>
+    <param name="input_tag_file" type="data" format="bed" label="ChIP-Seq Tag File" >
+      <validator type="unspecified_build" />
+    </param>
+    <param name="input_control_file" type="data" format="bed" label="ChIP-Seq Control File" >
+      <validator type="unspecified_build" />
+    </param>
+    <conditional name="options_type">
+      <param name="options_type_selector" type="select" label="Advanced Options">
+        <option value="basic" selected="True">Hide Advanced Options</option>
+        <option value="advanced">Show Advanced Options</option>
+      </param>
+      <when value="basic">
+        <param name="input_config_file" type="select" label="Select a pre-defined configuration file">
+          <options from_data_table="ccat_configurations">
+            <validator type="no_options" message="No configurations are available"/>
+          </options>
+        </param>
+      </when>
+      <when value="advanced">
+        <param name="fragment_size" type="integer" label="Length of DNA fragment" value="200"/>
+        <param name="sliding_window_size" type="integer" label="Sliding window size" value="500" help="transcription factor binding default: 300; histone modifications default: 500"/>
+        <param name="moving_step" type="integer" label="Step of sliding window" value="50" help="transcription factor binding default: 10; histone modifications default: 50"/>
+        <param name="is_strand_sensitive_mode" type="select" label="isStrandSensitiveMode" >
+          <option value="1">Transition from sense strand to anti-sense strand</option>
+          <option value="0" selected="True">Local maximum of read-enrichment profile</option>
+        </param>
+        <param name="min_count" type="integer" label="Minimum number of read counts at the peak" value="4"/>
+        <param name="output_num" type="integer" label="Number of peaks reported in top peak file" value="100000"/>
+        <param name="random_seed" type="integer" label="Random Seed" value="123456"/>
+        <param name="min_score" type="float" label="Minimum score of normalized difference" value="3.0"/>
+        <param name="bootstrap_pass" type="integer" label="Number of passes in the bootstrapping process" value="50"/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data name="output_peak_file" format="interval" label="${tool.name} on ${on_string} (peaks)">
+      <actions>
+        <action type="metadata" name="chromCol" default="1"/>
+        <action type="metadata" name="startCol" default="3"/>
+        <action type="metadata" name="endCol" default="4"/>
+      </actions>
+    </data>
+    <data name="output_region_file" format="interval" label="${tool.name} on ${on_string} (regions)">
+      <actions>
+        <action type="metadata" name="chromCol" default="1"/>
+        <action type="metadata" name="startCol" default="3"/>
+        <action type="metadata" name="endCol" default="4"/>
+      </actions>
+    </data>
+    <data name="output_top_file" format="interval" label="${tool.name} on ${on_string} (top peaks)">
+      <actions>
+        <action type="metadata" name="chromCol" default="1"/>
+        <action type="metadata" name="startCol" default="3"/>
+        <action type="metadata" name="endCol" default="4"/>
+      </actions>
+    </data>
+    <data name="output_log_file" for[...]ions_type['options_type_selector' ] ) == 'advanced':
+fragmentSize ${options_type[ 'fragment_size' ]}
+slidingWinSize ${options_type[ 'sliding_window_size' ]}
+movingStep ${options_type[ 'moving_step' ]}
+isStrandSensitiveMode ${options_type[ 'is_strand_sensitive_mode' ]}
+minCount ${options_type[ 'min_count' ]}
+outputNum ${options_type[ 'output_num' ]}
+randomSeed ${options_type[ 'random_seed' ]}
+minScore ${options_type[ 'min_score' ]}
+bootstrapPass ${options_type[ 'bootstrap_pass' ]}
+#end if</configfile>
+  </configfiles>
+  <tests>
+    <test>
+      <param name="input_tag_file" value="chipseq_enriched.bed.gz" ftype="bed" dbkey="hg18" />
+      <param name="input_control_file" value="chipseq_input.bed.gz" ftype="bed" dbkey="hg18" />
+      <param name="options_type_selector" value="advanced" />
+      <param name="fragment_size" value="200" />
+      <param name="sliding_window_size" value="500" />
+      <param name="moving_step" value="50" />
+      <param name="is_strand_sensitive_mode" value="0" />
+      <param name="min_count" value="4" />
+      <param name="output_num" value="100000" />
+      <param name="random_seed" value="123456" />
+      <param name="min_score" value="5.0" />
+      <param name="bootstrap_pass" value="50" />
+      <output name="output_peak_file" file="peakcalling_ccat/3.0/ccat_test_peak_out_1.interval.re_match" compare="re_match" />
+      <output name="output_region_file" file="peakcalling_ccat/3.0/ccat_test_region_out_1.interval.re_match" compare="re_match" />
+      <output name="output_top_file" file="peakcalling_ccat/3.0/ccat_test_top_out_1.interval.sorted.re_match" compare="re_match" sort="True" />
+      <output name="output_log_file" file="peakcalling_ccat/3.0/ccat_test_log_out_1.txt" />
+    </test>
+    <test>
+      <param name="input_tag_file" value="chipseq_enriched.bed.gz" ftype="bed" dbkey="hg18" />
+      <param name="input_control_file" value="chipseq_input.bed.gz" ftype="bed" dbkey="hg18" />
+      <param name="options_type_selector" value="basic" />
+      <param name="input_config_file" value="ccat_3.0_histone_config" />
+      <output name="output_peak_file" file="peakcalling_ccat/3.0/ccat_test_peak_out_1.interval.re_match" compare="re_match" />
+      <output name="output_region_file" file="peakcalling_ccat/3.0/ccat_test_region_out_1.interval.re_match" compare="re_match" />
+      <output name="output_top_file" file="peakcalling_ccat/3.0/ccat_test_top_out_1.interval.sorted.re_match" compare="re_match" sort="true" />
+      <output name="output_log_file" file="peakcalling_ccat/3.0/ccat_test_log_out_1.txt" />
+    </test>
+    <!-- Test below gives different results on different architectures,
+    e.g.: x86_64 GNU/Linux gave an extra line (additional peak called) when compared to the version running on 10.6.0 Darwin i386
+    slidingWinSize was fixed to be 1000, default as per readme.txt
+    -->
+    <!--
+    <test>
+      <param name="input_tag_file" value="chipseq_enriched.bed.gz" ftype="bed" dbkey="hg18" />
+      <param name="input_control_file" value="chipseq_input.bed.gz" ftype="bed" dbkey="hg18" />
+      <param name="options_type_selector" value="basic" />
+      <param name="input_config_file" value="ccat_3.0_histone_config_readme" />
+      <output name="output_peak_file" file="peakcalling_ccat/3.0/ccat_test_peak_out_2.interval.re_match" compare="re_match" />
+      <output name="output_region_file" file="peakcalling_ccat/3.0/ccat_test_region_out_2.interval.re_match" compare="re_match" />
+      <output name="output_top_file" file="peakcalling_ccat/3.0/ccat_test_top_out_2.interval.sorted.re_match" compare="re_match" sort="true" />
+      <output name="output_log_file" file="peakcalling_ccat/3.0/ccat_test_log_out_2.txt" />
+    </test>
+  -->
+  </tests>
+  <help>
+**What it does**
+
+This tool allows ChIP-seq peak/region calling using CCAT.
+
+View the original CCAT documentation: http://cmb.gis.a-star.edu.sg/ChIPSeq/paperCCAT.htm.
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/peak_calling/macs_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/peak_calling/macs_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,135 @@
+import sys, subprocess, tempfile, shutil, glob, os, os.path, gzip
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "simplejson" )
+import simplejson
+
+CHUNK_SIZE = 1024
+
+def gunzip_cat_glob_path( glob_path, target_filename, delete = False ):
+    out = open( target_filename, 'wb' )
+    for filename in glob.glob( glob_path ):
+        fh = gzip.open( filename, 'rb' )
+        while True:
+            data = fh.read( CHUNK_SIZE )
+            if data:
+                out.write( data )
+            else:
+                break
+        fh.close()
+        if delete:
+            os.unlink( filename )
+    out.close()
+
+def xls_to_interval( xls_file, interval_file, header = None ):
+    out = open( interval_file, 'wb' )
+    if header:
+        out.write( '#%s\n' % header )
+    wrote_header = False
+    #From the MACS readme: coordinates in the XLS output are 1-based, unlike the 0-based BED format.
+    for line in open( xls_file ):
+        #keep all existing comment lines
+        if line.startswith( '#' ):
+            out.write( line )
+        elif not wrote_header:
+            out.write( '#%s' % line )
+            wrote_header = True
+        else:
+            fields = line.split( '\t' )
+            if len( fields ) > 1:
+                fields[1] = str( int( fields[1] ) - 1 )
+            out.write( '\t'.join( fields ) )
+    out.close()
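+# Example (illustrative): a data row "chr1\t100\t200" is rewritten as
+# "chr1\t99\t200" -- the 1-based XLS start becomes a 0-based interval start,
+# while the end coordinate is left unchanged.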
+
+def main():
+    options = simplejson.load( open( sys.argv[1] ) )
+    output_bed = sys.argv[2]
+    output_extra_html = sys.argv[3]
+    output_extra_path = sys.argv[4]
+    
+    experiment_name = '_'.join( options['experiment_name'].split() ) #MACS uses the experiment name to build filenames; gzipping the wig files fails when names contain spaces (MACS does not escape them), so replace all whitespace, which split/join makes easy
+    cmdline = "macs -t %s" % ",".join( options['input_chipseq'] )
+    if options['input_control']:
+        cmdline = "%s -c %s" % ( cmdline, ",".join( options['input_control'] ) )
+    cmdline = "%s --format='%s' --name='%s' --gsize='%s' --tsize='%s' --bw='%s' --pvalue='%s' --mfold='%s' %s --lambdaset='%s' %s" % ( cmdline, options['format'], experiment_name, options['gsize'], options['tsize'], options['bw'], options['pvalue'], options['mfold'], options['nolambda'], options['lambdaset'], options['futurefdr'] )
+    if 'wig' in options:
+        wigextend = int( options['wig']['wigextend']  )
+        if wigextend >= 0:
+            wigextend = "--wigextend='%s'" % wigextend
+        else:
+            wigextend = ''
+        cmdline = "%s --wig %s --space='%s'" % ( cmdline, wigextend, options['wig']['space'] )
+    if 'nomodel' in options:
+        cmdline = "%s --nomodel --shiftsize='%s'" % ( cmdline, options['nomodel'] )
+    if 'diag' in options:
+        cmdline = "%s --diag --fe-min='%s' --fe-max='%s' --fe-step='%s'" % ( cmdline, options['diag']['fe-min'], options['diag']['fe-max'], options['diag']['fe-step'] )
+    
+    tmp_dir = tempfile.mkdtemp() #MACS produces messy output; contain it in a temp dir, then provide it to the user
+    stderr_name = tempfile.NamedTemporaryFile().name #redirect stderr here; MACS reports much of its info via stderr, which we turn into a report
+    proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir, stderr=open( stderr_name, 'wb' ) )
+    proc.wait()
+    #We don't want to put the tool run into an error state if MACS emitted only warnings or info (e.g. mfold could be decreased to improve the model); instead, let the user view the MACS log
+    #Do not terminate on a non-zero return code; allow dataset (e.g. log) creation and cleanup to proceed
+    if proc.returncode:
+        stderr_f = open( stderr_name )
+        while True:
+            chunk = stderr_f.read( CHUNK_SIZE )
+            if not chunk:
+                stderr_f.close()
+                break
+            sys.stderr.write( chunk )
+    
+    #run R to create pdf from model script
+    if os.path.exists( os.path.join( tmp_dir, "%s_model.r" % experiment_name ) ):
+        cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % ( experiment_name, experiment_name )
+        proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir )
+        proc.wait()
+    
+    
+    #move bed out to proper output file
+    created_bed_name =  os.path.join( tmp_dir, "%s_peaks.bed" % experiment_name )
+    if os.path.exists( created_bed_name ):
+        shutil.move( created_bed_name, output_bed )
+    
+    #parse xls files to interval files as needed
+    if options['xls_to_interval']:
+        create_peak_xls_file = os.path.join( tmp_dir, '%s_peaks.xls' % experiment_name )
+        if os.path.exists( create_peak_xls_file ):
+            xls_to_interval( create_peak_xls_file, options['xls_to_interval']['peaks_file'], header = 'peaks file' )
+        create_peak_xls_file = os.path.join( tmp_dir, '%s_negative_peaks.xls' % experiment_name )
+        if os.path.exists( create_peak_xls_file ):
+            xls_to_interval( create_peak_xls_file, options['xls_to_interval']['negative_peaks_file'], header = 'negative peaks file' )
+    
+    #merge and move wig files as needed, delete gz'd files and remove emptied dirs
+    if 'wig' in options:
+        wig_base_dir = os.path.join( tmp_dir, "%s_MACS_wiggle" % experiment_name )
+        if os.path.exists( wig_base_dir ):
+            #treatment
+            treatment_dir = os.path.join( wig_base_dir, "treat" )
+            if os.path.exists( treatment_dir ):
+                gunzip_cat_glob_path( os.path.join( treatment_dir, "*.wig.gz" ), options['wig']['output_treatment_file'], delete = True )
+                os.rmdir( treatment_dir )
+                #control
+                if options['input_control']:
+                    control_dir = os.path.join( wig_base_dir, "control" )
+                    if os.path.exists( control_dir ):
+                        gunzip_cat_glob_path( os.path.join( control_dir, "*.wig.gz" ), options['wig']['output_control_file'], delete = True )
+                        os.rmdir( control_dir )
+            os.rmdir( wig_base_dir )
+    
+    #move all remaining files to the extra-files path of the HTML output so the user can download them
+    out_html = open( output_extra_html, 'wb' )
+    out_html.write( '<html><head><title>Additional output created by MACS (%s)</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name )
+    os.mkdir( output_extra_path )
+    for filename in sorted( os.listdir( tmp_dir ) ):
+        shutil.move( os.path.join( tmp_dir, filename ), os.path.join( output_extra_path, filename ) )
+        out_html.write( '<li><a href="%s">%s</a></li>\n' % ( filename, filename ) )
+    out_html.write( '</ul></p>\n' )
+    out_html.write( '<h3>Messages from MACS:</h3>\n<p><pre>%s</pre></p>\n' % open( stderr_name, 'rb' ).read() )
+    out_html.write( '</body></html>\n' )
+    out_html.close()
+    
+    os.unlink( stderr_name )
+    os.rmdir( tmp_dir )
+
+if __name__ == "__main__": main()
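macs_wrapper.py reads all of its parameters from a JSON file passed as the first argument (the $options_file generated by the accompanying tool XML). A hypothetical options file covering the keys the script reads above; 'wig', 'nomodel' and 'diag' appear only when those features are enabled, and 'xls_to_interval' is either false or a mapping of output paths::

  {
    "experiment_name": "MACS in Galaxy",
    "input_chipseq": [ "chipseq_enriched.bed" ],
    "input_control": [ "chipseq_input.bed" ],
    "format": "BED",
    "gsize": "2.7e+9",
    "tsize": "25",
    "bw": "300",
    "pvalue": "1e-5",
    "mfold": "32",
    "nolambda": "",
    "lambdaset": "1000,5000,10000",
    "futurefdr": "",
    "xls_to_interval": false
  }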
diff -r 000000000000 -r 9071e359b9a3 tools/peak_calling/macs_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/peak_calling/macs_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,230 @@
+<tool id="peakcalling_macs" name="MACS" version="1.0.1">
+  <description>Model-based Analysis of ChIP-Seq</description>
+  <command interpreter="python">macs_wrapper.py $options_file $output_bed_file $output_extra_files $output_extra_files.files_path</command>
+  <requirements>
+    <requirement type="python-module">macs</requirement>
+    <requirement type="package">macs</requirement>
+  </requirements>
+  <inputs>
+    <param name="experiment_name" type="text" value="MACS in Galaxy" size="50" label="Experiment Name"/>
+    <conditional name="input_type">
+      <param name="input_type_selector" type="select" label="Paired End Sequencing">
+        <option value="paired_end">Paired End (requires elandmulti format)</option>
+        <option value="single_end" selected="true">Single End</option>
+      </param>
+      <when value="paired_end">
+        <param name="input_chipseq_file1" type="data" format="elandmulti" label="ChIP-Seq Tag File 1" />
+        <param name="input_chipseq_file2" type="data" format="elandmulti" label="ChIP-Seq Tag File 2" />
+        <param name="input_control_file1" type="data" format="elandmulti" optional="True" label="ChIP-Seq Control File 1" />
+        <param name="input_control_file2" type="data" format="elandmulti" optional="True" label="ChIP-Seq Control File 2" />
+        <param name="petdist" type="integer" label="Best distance between Pair-End Tags" value="200"/>
+      </when>
+      <when value="single_end">
+        <param name="input_chipseq_file1" type="data" format="bed,sam,bam,eland,elandmulti" label="ChIP-Seq Tag File" />
+        <param name="input_control_file1" type="data" format="bed,sam,bam,eland,elandmulti" optional="True" label="ChIP-Seq Control File" />
+      </when>
+    </conditional>
+    <param name="gsize" type="float" label="Effective genome size" value="2.7e+9" help="default: 2.7e+9"/>
+    <param name="tsize" type="integer" label="Tag size" value="25"/>
+    <param name="bw" type="integer" label="Band width" value="300"/>
+    <param name="pvalue" type="float" label="Pvalue cutoff for peak detection" value="1e-5" help="default: 1e-5"/>
+    <param name="mfold" type="integer" label="Select the regions with MFOLD high-confidence enrichment ratio against background to build model" value="32"/>
+    <param name="xls_to_interval" label="Parse xls files into distinct interval files" type="boolean" truevalue="create" falsevalue="do_not_create" checked="False"/>
+    <conditional name="wig_type">
+      <param name="wig_type_selector" type="select" label="Save shifted raw tag count at every bp into a wiggle file">
+        <option value="wig">Save</option>
+        <option value="no_wig" selected="true">Do not create wig file (faster)</option>
+      </param>
+      <when value="wig">
+        <param name="wigextend" type="integer" label="Extend tag from its middle point to a wigextend size fragment." value="-1" help="Use value less than 0 for default (modeled d)"/>
+        <param name="space" type="integer" label="Resolution for saving wiggle files" value="10"/>
+      </when>
+      <when value="no_wig">
+        <!-- do nothing here -->
+      </when>
+    </conditional>
+    <param name="nolambda" label="Use fixed background lambda as local lambda for every peak region" type="boolean" truevalue="--nolambda" falsevalue="" checked="False" help="up to 9X more time consuming"/>
+    <param name="lambdaset" type="text" label="3 levels of regions around the peak region to calculate the maximum lambda as local lambda" value="1000,5000,10000" size="50"/>
+    <conditional name="nomodel_type">
+      <param name="nomodel_type_selector" type="select" label="Build Model">
+        <option value="nomodel">Do not build the shifting model</option>
+        <option value="create_model" selected="true">Build the shifting model</option>
+      </param>
+      <when value="nomodel">
+        <param name="shiftsize" type="integer" label="Arbitrary shift size in b[...]ive_peaks.xls" value="peakcalling_macs/test2/Galaxy_Test_Run_negative_peaks.xls" compare="re_match"/>
+        <extra_files type="file" name="Galaxy_Test_Run_peaks.xls" value="peakcalling_macs/test2/Galaxy_Test_Run_peaks.xls" compare="re_match"/>
+      </output>
+    </test>
+    <test>
+      <param name="input_type_selector" value="single_end" />
+      <param name="input_chipseq_file1" value="chipseq_enriched.bed.gz" ftype="bed" />
+      <param name="input_control_file1" value="chipseq_input.bed.gz" ftype="bed" />
+      <param name="experiment_name" value="Galaxy Test Run" />
+      <param name="tsize" value="36" />
+      <param name="mfold" value="13" />
+      <param name="gsize" value="2.7e+9" />
+      <param name="bw" value="300" />
+      <param name="pvalue" value="1e-5" />
+      <param name="xls_to_interval" value="true" />
+      <param name="wig_type_selector" value="no_wig" />
+      <param name="nolambda"/>
+      <param name="lambdaset" value="1000,5000,10000"/>
+      <param name="nomodel_type_selector" value="create_model" />
+      <param name="diag_type_selector" value="no_diag" />
+      <param name="futurefdr"/>
+      <output name="output_bed_file" file="peakcalling_macs/macs_test_1_out.bed" />
+      <output name="output_xls_to_interval_peaks_file" file="peakcalling_macs/macs_test_2_peaks_out.interval" lines_diff="4" />
+      <output name="output_xls_to_interval_negative_peaks_file" file="peakcalling_macs/macs_test_2_neg_peaks_out.interval" />
+      <output name="output_html_file" file="peakcalling_macs/macs_test_1_out.html" compare="re_match" >
+        <extra_files type="directory" value="peakcalling_macs/test2/" compare="re_match"/>
+      </output>
+    </test>
+    <!-- <test>
+      <param name="input_type_selector" value="single_end" />
+      <param name="input_chipseq_file1" value="chipseq_enriched.bed.gz" ftype="bed" />
+      <param name="input_control_file1" value="chipseq_input.bed.gz" ftype="bed" />
+      <param name="experiment_name" value="Galaxy Test Run" />
+      <param name="tsize" value="36" />
+      <param name="mfold" value="13" />
+      <param name="gsize" value="2.7e+9" />
+      <param name="bw" value="300" />
+      <param name="pvalue" value="1e-5" />
+      <param name="xls_to_interval" value="true" />
+      <param name="wig_type_selector" value="wig" />
+      <param name="wigextend" value="-1" />
+      <param name="space" value="10" />
+      <param name="nolambda"/>
+      <param name="lambdaset" value="1000,5000,10000"/>
+      <param name="nomodel_type_selector" value="create_model" />
+      <param name="diag_type_selector" value="no_diag" />
+      <param name="futurefdr"/>
+      <output name="output_bed_file" file="peakcalling_macs/macs_test_1_out.bed" />
+      <output name="output_xls_to_interval_peaks_file" file="peakcalling_macs/macs_test_2_peaks_out.interval" lines_diff="4" />
+      <output name="output_xls_to_interval_negative_peaks_file" file="macs_test_2_neg_peaks_out.interval" />
+      <output name="output_treatment_wig_file" file="peakcalling_macs/macs_test_3_treatment_out.wig" />
+      <output name="output_control_wig_file" file="peakcalling_macs/macs_test_3_control_out.wig" />
+      <output name="output_html_file" file="peakcalling_macs/macs_test_3_out.html" compare="re_match" >
+        <extra_files type="directory" value="peakcalling_macs/test2/" compare="re_match"/>
+      </output>
+    </test> -->
+  </tests>
+  <help>
+**What it does**
+
+This tool allows ChIP-seq peak calling using MACS.
+
+Depending upon selected options, 2 to 6 history items will be created; the first output will be a standard BED file and the last will be an HTML report containing links to download additional files generated by MACS. Up to two each of wig and interval files can be optionally created; the interval files are parsed from the xls output.
+
+View the original MACS documentation: http://liulab.dfci.harvard.edu/MACS/00README.html.
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/peak_calling/sicer_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/peak_calling/sicer_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+
+"""
+A wrapper script for running SICER (spatial clustering approach for the identification of ChIP-enriched regions) region caller.
+"""
+
+import sys, optparse, os, tempfile, subprocess, shutil
+
+CHUNK_SIZE = 2**20 #1mb
+
+VALID_BUILDS = [ 'mm8', 'mm9', 'hg18', 'hg19', 'dm2', 'dm3', 'sacCer1', 'pombe', 'rn4', 'tair8' ] #HACK! FIXME: allow using all specified builds, would currently require hacking SICER's "GenomeData.py" on the fly.
+
+def cleanup_before_exit( tmp_dir ):
+    if tmp_dir and os.path.exists( tmp_dir ):
+        shutil.rmtree( tmp_dir )
+
+def open_file_from_option( filename, mode = 'rb' ):
+    if filename:
+        return open( filename, mode = mode )
+    return None
+
+def add_one_to_file_column( filename, column, split_char = "\t", startswith_skip = None ):
+    tmp_out = tempfile.TemporaryFile( mode='w+b' )
+    tmp_in = open( filename )
+    for line in tmp_in:
+        if startswith_skip and line.startswith( startswith_skip ):
+            tmp_out.write( line )
+        else:
+            fields = line.rstrip( '\n\r' ).split( split_char )
+            if len( fields ) <= column:
+                tmp_out.write( line )
+            else:
+                fields[ column ] = str( int( fields[ column ] ) + 1 )
+                tmp_out.write( "%s\n" % ( split_char.join( fields ) ) )
+    tmp_in.close()
+    tmp_out.seek( 0 )
+    tmp_in = open( filename, 'wb' )
+    while True:
+        chunk = tmp_out.read( CHUNK_SIZE )
+        if chunk:
+            tmp_in.write( chunk )
+        else:
+            break
+    tmp_in.close()
+    tmp_out.close()
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    #stdout/err
+    parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' )
+    parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' )
+    parser.add_option( '', '--fix_off_by_one_errors', dest='fix_off_by_one_errors', action='store_true', default=False, help='If specified, fix off-by-one errors in output files' )
+    #inputs
+    parser.add_option( '-b', '--bed_file', dest='bed_file', action='store', type="string", default=None, help='Input ChIP BED file.' )
+    parser.add_option( '-c', '--control_file', dest='control_file', action='store', type="string", default=None, help='Input control BED file.' )
+    parser.add_option( '-d', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='Input dbkey.' )
+    parser.add_option( '-r', '--redundancy_threshold', dest='redundancy_threshold', action='store', type="int", default=1, help='Redundancy Threshold: The number of copies of identical reads allowed in a library.' )
+    parser.add_option( '-w', '--window_size', dest='window_size', action='store', type="int", default=200, help='Window size: resolution of SICER algorithm. For histone modifications, one can use 200 bp' )
+    parser.add_option( '-f', '--fragment_size', dest='fragment_size', action='store', type="int", default=150, help='Fragment size: is for determination of the amount of shift from the beginning of a read to the center of the DNA fragment represented by the read. FRAGMENT_SIZE=150 means the shift is 75.' )
+    parser.add_option( '-e', '--effective_genome_fraction', dest='effective_genome_fraction', action='store', type="float", default=0.74, help='Effective genome fraction: Effective Genome as fraction of the genome size. It depends on read length.' )
+    parser.add_option( '-g', '--gap_size', dest='gap_size', action='store', type="int", default=600, help='Gap size: needs to be multiples of window size. Namely if the window size is 200, the gap size should be 0, 200, 400, 600, ... .' )
+    parser.add_option( '-o', '--error_cut_off', dest='error_cut_off',[...]( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir )
+    return_code = proc.wait()
+
+    if return_code:
+        stderr_target = sys.stderr
+    else:
+        stderr_target = stdout #sys.stdout
+        stderr_target.write( "\nAdditionally, these warnings were reported:\n" )
+    stderr.flush()
+    stderr.seek(0)
+    while True:
+        chunk = stderr.read( CHUNK_SIZE )
+        if chunk:
+            stderr_target.write( chunk )
+        else:
+            break
+    stderr.close()
+
+    try:
+        #move files to where they belong
+        shutil.move( os.path.join( tmp_dir,'%s-%i-removed.bed' % ( bed_base_filename, options.redundancy_threshold ) ), options.redundancy_removed_test_bed_output_file )
+        shutil.move( os.path.join( tmp_dir,'%s-W%i.graph' % ( bed_base_filename, options.window_size ) ), options.summary_graph_output_file )
+        if options.fix_off_by_one_errors: add_one_to_file_column( options.summary_graph_output_file, 2 )
+        shutil.move( os.path.join( tmp_dir,'%s-W%i-normalized.wig' % ( bed_base_filename, options.window_size ) ), options.test_normalized_wig_output_file )
+        if options.control_file is not None:
+            shutil.move( os.path.join( tmp_dir,'%s-%i-removed.bed' % ( control_base_filename, options.redundancy_threshold ) ), options.redundancy_removed_control_bed_output_file )
+            shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i.scoreisland' % ( bed_base_filename, options.window_size, options.gap_size ) ), options.score_island_output_file )
+            if options.fix_off_by_one_errors: add_one_to_file_column( options.score_island_output_file, 2 )
+            shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-islands-summary' % ( bed_base_filename, options.window_size, options.gap_size ) ), options.islands_summary_output_file )
+            if options.fix_off_by_one_errors: add_one_to_file_column( options.islands_summary_output_file, 2 )
+            shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-islands-summary-FDR%s' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.significant_islands_summary_output_file )
+            if options.fix_off_by_one_errors: add_one_to_file_column( options.significant_islands_summary_output_file, 2 )
+            shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-FDR%s-island.bed' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.significant_islands_output_file )
+            if options.fix_off_by_one_errors: add_one_to_file_column( options.significant_islands_output_file, 2 )
+            shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-FDR%s-islandfiltered.bed' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_output_file )
+            shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-FDR%s-islandfiltered-normalized.wig' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_normalized_wig_output_file )
+        else:
+            shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-E%s.scoreisland' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.score_island_output_file )
+            if options.fix_off_by_one_errors: add_one_to_file_column( options.score_island_output_file, 2 )
+            shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-E%s-islandfiltered.bed' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_output_file )
+            shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-E%s-islandfiltered-normalized.wig' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_normalized_wig_output_file )
+    except Exception, e:
+        raise e
+    finally:
+        cleanup_before_exit( tmp_dir )
+
+if __name__=="__main__": __main__()
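A minimal usage sketch for the off-by-one repair above (hypothetical file contents; the column index is 0-based, so column 2 is the end coordinate of a BED-like row)::

  # test-W200.graph before: chr1  999  2000  12
  add_one_to_file_column( 'test-W200.graph', 2 )
  # test-W200.graph after:  chr1  999  2001  12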
diff -r 000000000000 -r 9071e359b9a3 tools/peak_calling/sicer_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/peak_calling/sicer_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,176 @@
+<tool id="peakcalling_sicer" name="SICER" version="0.0.1">
+  <description>Statistical approach for the Identification of ChIP-Enriched Regions</description>
+  <command interpreter="python">sicer_wrapper.py
+  --bed_file '${input_bed_file}'
+  #if str( $input_control_file ) != 'None':
+      --control_file '${input_control_file}'
+      --significant_islands_output_file "${significant_islands_output_file}"
+      --islands_summary_output_file "${islands_summary_output_file}"
+      --significant_islands_summary_output_file "${significant_islands_summary_output_file}"
+  #end if
+  ${fix_off_by_one_errors}
+  --dbkey '${input_bed_file.dbkey}'
+  --redundancy_threshold '${redundancy_threshold}'
+  --window_size '${window_size}'
+  --fragment_size '${fragment_size}'
+  --effective_genome_fraction '${effective_genome_fraction}'
+  --gap_size '${gap_size}'
+  --error_cut_off '${error_cut_off}'
+  ##output files
+  --stdout "${output_log_file}"
+  --redundancy_removed_test_bed_output_file "${redundancy_removed_test_bed_output_file}"
+  --redundancy_removed_control_bed_output_file "${redundancy_removed_control_bed_output_file}"
+  --score_island_output_file "${score_island_output_file}"
+  --summary_graph_output_file "${summary_graph_output_file}"
+  --test_normalized_wig_output_file "${test_normalized_wig_output_file}"
+  --island_filtered_output_file "${island_filtered_output_file}"
+  --island_filtered_normalized_wig_output_file "${island_filtered_normalized_wig_output_file}"
+  </command>
+  <requirements>
+    <requirement type="package" version="1.1">SICER</requirement>
+  </requirements>
+  <inputs>
+    <param name="input_bed_file" type="data" format="bed" label="ChIP-Seq Tag File" >
+      <validator type="expression" message="SICER is not available for the genome.">value.dbkey in [ 'mm8', 'mm9', 'hg18', 'hg19', 'dm2', 'dm3', 'sacCer1', 'pombe', 'rn4', 'tair8' ]</validator>
+    </param>
+    <param name="input_control_file" type="data" format="bed" label="ChIP-Seq Control File" optional="True"> <!-- fix me, add filter to match dbkeys -->
+      <options>
+        <filter type="data_meta" ref="input_bed_file" key="dbkey" />
+      </options>
+    </param>
+    <param name="fix_off_by_one_errors" type="boolean" truevalue="--fix_off_by_one_errors" falsevalue="" checked="True" label="Fix off-by-one errors in output files" help="SICER creates non-standard output files, this option will fix these coordinates"/>
+    <param name="redundancy_threshold" type="integer" label="Redundancy Threshold" value="1" help="The number of copies of identical reads allowed in a library" />
+    <param name="window_size" type="integer" label="Window size" value="200" help="Resolution of SICER algorithm. For histone modifications, one can use 200 bp" />
+    <param name="fragment_size" type="integer" label="Fragment size" value="150" help="for determination of the amount of shift from the beginning of a read to the center of the DNA fragment represented by the read. FRAGMENT_SIZE=150 means the shift is 75." />
+    <param name="effective_genome_fraction" type="float" label="Effective genome fraction" value="0.74" help="Effective Genome as fraction of the genome size. It depends on read length." />
+    <param name="gap_size" type="integer" label="Gap size" value="600" help="Needs to be multiples of window size. Namely if the window size is 200, the gap size should be 0, 200, 400, 600, ..." />
+    <param name="error_cut_off" type="float" label="Statistic threshold value" value="0.01" help="FDR (with control) or E-value (without control)" />
+  </inputs>
+  <outputs>
+    <data name="redundancy_removed_test_bed_output_file" format="bed" label="${tool.name} on ${on_string} (test-${redundancy_threshold}-removed.bed)"/>
+    <data name="redundancy_removed_control_bed_output_file" format="bed" label="${tool.name} on ${on_string} (control-${redundancy_threshold}-removed.bed)">
+      <filter>input_control_file is not None</filter>[...]am name="effective_genome_fraction" value="0.74" />
+      <param name="gap_size" value="600" />
+      <param name="error_cut_off" value="0.01" />
+      <output name="redundancy_removed_test_bed_output_file" file="peakcalling_sicer/test_2/test-1-removed.bed" />
+      <output name="redundancy_removed_control_bed_output_file" file="peakcalling_sicer/test_2/control-1-removed.bed" />
+      <output name="summary_graph_output_file" file="peakcalling_sicer/test_3/test-W200.graph" />
+      <output name="test_normalized_wig_output_file" file="peakcalling_sicer/test_2/test-W200-normalized.wig" />
+      <output name="significant_islands_output_file" file="peakcalling_sicer/test_3/test-W200-G600-FDR0.01-island.bed" />
+      <output name="island_filtered_output_file" file="peakcalling_sicer/test_2/test-W200-G600-FDR0.01-islandfiltered.bed" />
+      <output name="island_filtered_normalized_wig_output_file" file="peakcalling_sicer/test_2/test-W200-G600-FDR0.01-islandfiltered-normalized.wig" />
+      <output name="score_island_output_file" file="peakcalling_sicer/test_3/test-W200-G600.scoreisland" />
+      <output name="islands_summary_output_file" file="peakcalling_sicer/test_3/test-W200-G600-islands-summary" />
+      <output name="significant_islands_summary_output_file" file="peakcalling_sicer/test_3/test-W200-G600-islands-summary-FDR0.01" />
+      <output name="output_log_file" file="peakcalling_sicer/test_2/output_log_file.contains" compare="contains"/>
+    </test>
+    <test>
+      <param name="input_bed_file" value="chipseq_enriched.bed.gz" ftype="bed" dbkey="mm8" />
+      <param name="input_control_file" />
+      <param name="fix_off_by_one_errors" value="True" />
+      <param name="redundancy_threshold" value="1" />
+      <param name="window_size" value="200" />
+      <param name="fragment_size" value="150" />
+      <param name="effective_genome_fraction" value="0.74" />
+      <param name="gap_size" value="600" />
+      <param name="error_cut_off" value="0.01" />
+      <output name="redundancy_removed_test_bed_output_file" file="peakcalling_sicer/test_1/test-1-removed.bed" />
+      <output name="summary_graph_output_file" file="peakcalling_sicer/test_4/test-W200.graph" />
+      <output name="test_normalized_wig_output_file" file="peakcalling_sicer/test_1/test-W200-normalized.wig" />
+      <output name="island_filtered_output_file" file="peakcalling_sicer/test_1/test-W200-G600-E0.01-islandfiltered.bed" />
+      <output name="island_filtered_normalized_wig_output_file" file="peakcalling_sicer/test_1/test-W200-G600-E0.01-islandfiltered-normalized.wig" />
+      <output name="score_island_output_file" file="peakcalling_sicer/test_4/test-W200-G600-E0.01.scoreisland" />
+      <output name="output_log_file" file="peakcalling_sicer/test_1/output_log_file.contains" compare="contains"/>
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+SICER first and foremost is a filtering tool. Its main functions are::
+
+  1. Delineation of the significantly ChIP-enriched regions, which can be used to associate with other genomic landmarks.
+  2. Identification of reads on the ChIP-enriched regions, which can be used for profiling and other quantitative analysis.
+
+View the original SICER documentation: http://home.gwu.edu/~wpeng/Software.htm.
+
+------
+
+.. class:: warningmark
+
+  By default, SICER creates files that do not conform to standards (e.g. BED files are closed, not half-open). This could have implications for downstream analysis.
+  To force the output of SICER to be formatted properly to standard file formats, check the **"Fix off-by-one errors in output files"** option.
+
+------
+
+**Citation**
+
+For the underlying tool, please cite `Zang C, Schones DE, Zeng C, Cui K, Zhao K, Peng W. A clustering approach for identification of enriched domains from histone modification ChIP-Seq data. Bioinformatics. 2009 Aug 1;25(15):1952-8. &lt;http://www.ncbi.nlm.nih.gov/pubmed/19505939&gt;`_

+  </help>
+</tool>
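One constraint called out in the parameter help above is that the gap size must be a multiple of the window size. A trivial guard, not part of the wrapper, that makes the rule explicit::

  if gap_size % window_size != 0:
      raise ValueError( 'gap_size must be a multiple of window_size, '
                        'e.g. 0, 200, 400, 600, ... for window_size=200' )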
diff -r 000000000000 -r 9071e359b9a3 tools/picard/picard_AddOrReplaceReadGroups.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/picard_AddOrReplaceReadGroups.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,204 @@
+<tool name="Add or Replace Groups" id="picard_ARRG" version="0.2.0">
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <command interpreter="python">
+    picard_wrapper.py
+      --input="$inputFile"
+      --rg-lb="$rglb"
+      --rg-pl="$rgpl"
+      --rg-pu="$rgpu"
+      --rg-sm="$rgsm"
+      --rg-id="$rgid"
+      --rg-opts=${readGroupOpts.rgOpts}
+      #if $readGroupOpts.rgOpts == "full"
+        --rg-cn="$readGroupOpts.rgcn"
+        --rg-ds="$readGroupOpts.rgds"
+      #end if
+      --output-format=$outputFormat
+      --output=$outFile
+      -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/AddOrReplaceReadGroups.jar"
+  </command>
+  <inputs>
+    <param format="bam,sam" name="inputFile" type="data" label="SAM/BAM dataset to add or replace read groups in"
+      help="If empty, upload or import a SAM/BAM dataset." />
+    <param name="rgid" value="1" type="text" label="Read group ID (ID tag)" help="The most important read group tag. Galaxy will use a value of '1' if nothing provided." />
+    <param name="rgsm" value="" type="text" label="Read group sample name (SM tag)" />
+    <param name="rglb" value="" type="text" label="Read group library (LB tag)" />
+    <param name="rgpl" value="" type="text" label="Read group platform (PL tag)" help="illumina, solid, 454, pacbio, helicos" />
+    <param name="rgpu" value="" type="text" label="Read group platform unit" help="like run barcode, etc." />
+    <conditional name="readGroupOpts">
+      <param name="rgOpts" type="select" label="Specify additional (optional) arguments" help="Allows you to set RGCN and RGDS.">
+        <option value="preSet">Use pre-set defaults</option>
+        <option value="full">Set optional arguments</option>
+      </param>
+      <when value="preSet" />
+      <when value="full">
+        <param name="rgcn" value="" type="text" label="Read group sequencing center name" help="Leave set to &lt;null&gt; for default (none)" />
+        <param name="rgds" value="" type="text" label="Read group description" help="Leave set to &lt;null&gt; for default (none)" />
+      </when>
+    </conditional>
+    <param name="outputFormat" type="boolean" checked="True" truevalue="bam" falsevalue="sam" label="Output bam instead of sam" help="Uncheck for sam output" />
+  </inputs>
+  <outputs>
+    <data name="outFile" format="bam" label="${tool.name} on ${on_string}: ${outputFormat} with read groups replaced">
+      <change_format>
+        <when input="outputFormat" value="sam" format="sam" />
+      </change_format>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <!-- Command for replacing read groups in bam:
+      java -jar AddOrReplaceReadGroups.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_ARRG_input1.bam O=picard_ARRG_output1.sam RGID=one RGLB=lib RGPL=illumina RGPU=peaewe RGSM=sam1
+      -->
+      <param name="inputFile" value="picard_ARRG_input1.bam" />
+      <param name="rglb" value="lib" />
+      <param name="rgpl" value="illumina" />
+      <param name="rgpu" value="peaewe" />
+      <param name="rgsm" value="sam1" />
+      <param name="rgid" value="one" />
+      <param name="rgOpts" value="preSet" />
+      <param name="outputFormat" value="False" />
+      <output name="outFile" file="picard_ARRG_output1.sam" ftype="sam" />
+    </test>
+    <test>
+      <!-- Command for replacing read groups in sam:
+      java -jar AddOrReplaceReadGroups.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_ARRG_input1.sam O=picard_ARRG_output2.sam RGLB=LIB RGPL=IL RGPU=PLAT RGSM=smp RGID=M5 RGCN=FamousCenter RGDS="description with spaces"
+      picard_ARRG_input1.bam can be created from picard_ARRG_input1.sam
+      -->
+      <param name="inputFile" value="picard_ARRG_input1.sam" />
+      <param name="rglb" value="LIB" />
+      <param name="rgpl" value="IL" />
+      <param name="rgpu" value="PLAT" />
+      <param name="rgsm" value="smp" />
+      <param name="rgid" value="M5" />
+   [...], SOLID, LS454, HELICOS and PACBIO.","Important.  Not currently used in the GATK, but was in the past, and may return.  The only way to know the sequencing technology used to generate the sequencing data","It's a good idea to use this field."
+    "LB","DNA preparation library identifier","Essential for MarkDuplicates","MarkDuplicates uses the LB field to determine which read groups might contain molecular duplicates, in case the same DNA library was sequenced on multiple lanes."
+
+**Example of Read Group usage**
+
+Suppose we have a trio of samples: MOM, DAD, and KID.  Each has two DNA libraries prepared, one with 400 bp inserts and another with 200 bp inserts.  Each of these libraries is run on two lanes of an illumina hiseq, requiring 3 x 2 x 2 = 12 lanes of data.  When the data come off the sequencer, we would create 12 BAM files, with the following @RG fields in the header::

 Dad's data:
 @RG     ID:FLOWCELL1.LANE1      PL:illumina     LB:LIB-DAD-1 SM:DAD      PI:200
 @RG     ID:FLOWCELL1.LANE2      PL:illumina     LB:LIB-DAD-1 SM:DAD      PI:200
 @RG     ID:FLOWCELL1.LANE3      PL:illumina     LB:LIB-DAD-2 SM:DAD      PI:400
 @RG     ID:FLOWCELL1.LANE4      PL:illumina     LB:LIB-DAD-2 SM:DAD      PI:400

 Mom's data:
 @RG     ID:FLOWCELL1.LANE5      PL:illumina     LB:LIB-MOM-1 SM:MOM      PI:200
 @RG     ID:FLOWCELL1.LANE6      PL:illumina     LB:LIB-MOM-1 SM:MOM      PI:200
 @RG     ID:FLOWCELL1.LANE7      PL:illumina     LB:LIB-MOM-2 SM:MOM      PI:400
 @RG     ID:FLOWCELL1.LANE8      PL:illumina     LB:LIB-MOM-2 SM:MOM      PI:400

 Kid's data:
 @RG     ID:FLOWCELL2.LANE1      PL:illumina     LB:LIB-KID-1 SM:KID      PI:200
 @RG     ID:FLOWCELL2.LANE2      PL:illumina     LB:LIB-KID-1 SM:KID      PI:200
 @RG     ID:FLOWCELL2.LANE3      PL:illumina     LB:LIB-KID-2 SM:KID      PI:400
 @RG     ID:FLOWCELL2.LANE4      PL:illumina     LB:LIB-KID-2 SM:KID      PI:400

Note the hierarchical relationship between read groups (unique for each lane), libraries (sequenced on two lanes), and samples (across four lanes, two lanes for each library).

**Picard documentation**

This is a Galaxy wrapper for AddOrReplaceReadGroups, a part of the external package Picard-tools_.

 .. _Picard-tools: http://www.google.com/search?q=picard+samtools

------

.. class:: infomark

**Inputs, outputs, and parameters**

Either a sam file or a bam file must be supplied. If a bam file is used, it must
be coordinate-sorted. Galaxy currently coordinate-sorts all bam files.

The output file is either bam (the default) or sam, according to user selection,
and contains the same information as the input file except for the appropriate
additional (or modified) read group tags. Bam is recommended since it is smaller.

From the Picard documentation:

AddOrReplaceReadGroups REQUIRED parameters::

  Option (Type)    Description

  RGLB=String      Read Group Library
  RGPL=String      Read Group platform (e.g. illumina, solid)
  RGPU=String      Read Group platform unit (eg. run barcode)
  RGSM=String      Read Group sample name
  RGID=String      Read Group ID; Default value: null (empty)

AddOrReplaceReadGroups OPTIONAL parameters::

  Option (Type)    Description

  RGCN=String      Read Group sequencing center name; Default value: null (empty)
  RGDS=String      Read Group description; Default value: null (empty)

One parameter that Picard's AddOrReplaceReadGroups offers that is automatically
set by Galaxy is the SORT_ORDER, which is set to coordinate.

.. class:: warningmark

**Warning on SAM/BAM quality**

Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
to be the only way to deal with SAM/BAM that cannot be parsed.

  </help>
</tool>
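For the first test above, the read-group tags supplied to the wrapper would end up in the output header as a single @RG line of roughly this shape (a sketch based on the test parameters; exact tag ordering is up to Picard)::

  @RG     ID:one  LB:lib  PL:illumina     PU:peaewe       SM:sam1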
diff -r 000000000000 -r 9071e359b9a3 tools/picard/picard_BamIndexStats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/picard_BamIndexStats.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,117 @@
+<tool name="BAM Index Statistics" id="picard_BamIndexStats" version="0.2.0">
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <command interpreter="python">
+    picard_wrapper.py
+      --input "$input_file"
+      --bai-file "$input_file.metadata.bam_index"
+      -t "$htmlfile"
+      -d "$htmlfile.files_path"
+      -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/BamIndexStats.jar"
+  </command>
+  <inputs>
+    <param format="bam" name="input_file" type="data"  label="BAM dataset to generate statistics for"
+      help="If empty, upload or import a BAM dataset" />
+  </inputs>
+  <outputs>
+    <data format="html" name="htmlfile" label="${tool.name}_on_${on_string}.html" />
+  </outputs>
+  <tests>
+    <test>
+      <!-- Command
+      java -jar BamIndexStats.jar I=test-data/picard_input_tiny_coord.bam > picard_BIS_output1.txt
+      picard_input_tiny_coord.bam can be created from picard_input_tiny_coord.sam
+      -->
+      <param name="input_file" value="picard_input_tiny_coord.bam" ftype="bam" />
+      <output name="htmlfile" file="picard_BIS_output1.txt" ftype="html" compare="contains" lines_diff="12"/>
+    </test>
+    <test>
+      <!-- Command
+      java -jar BamIndexStats.jar I=test-data/picard_BIS_input1.bam > picard_BIS_output2.txt
+      picard_BIS_input1.bam can be created from picard_BIS_input1.sam
+      -->
+      <param name="input_file" value="picard_BIS_input1.bam" ftype="bam" />
+      <output name="htmlfile" file="picard_BIS_output2.txt" ftype="html" compare="contains" lines_diff="12" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**Purpose**
+
+Generate BAM index statistics for a provided BAM file.
+
+**Picard documentation**
+
+This is a Galaxy wrapper for BamIndexStats, a part of the external package Picard-tools_.
+
+ .. _Picard-tools: http://www.google.com/search?q=picard+samtools
+
+------
+
+.. class:: infomark
+
+**Inputs and outputs**
+
+The only (required) input is the BAM file for which you wish to obtain statistics.
+Note that it must be coordinate-sorted. Galaxy currently coordinate-sorts all BAM files.
+
+This tool outputs an HTML file that contains links to the actual metrics results, as well
+as a log file with info on the exact command run.
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+------
+
+**Example**
+
+Given a BAM file created from the following::
+
+  @HD    VN:1.0     SO:coordinate
+  @SQ    SN:chr1    LN:101
+  @SQ    SN:chr7    LN:404
+  @SQ    SN:chr8    LN:202
+  @SQ    SN:chr10   LN:303
+  @SQ    SN:chr14   LN:505
+  @RG    ID:0       SM:Hi,Mom!
+  @RG    ID:1       SM:samplesample    DS:ClearDescription
+  @PG    ID:1       PN:Hey!   VN:2.0
+  @CO    Just a generic comment to make the header longer
+  read1     83    chr7      1    255    101M             =       302     201    CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN    )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I    RG:Z:0
+  read2     89    chr7      1    255    101M             *         0       0    CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN    )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I    RG:Z:0
+  read3     83    chr7      1    255    101M             =       302     201    CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN    )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I    RG:Z:0
+  read4    147    chr7     16    255    101M             =        21     -96    CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN    )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I    RG:Z:0
+  read5     99    chr7     21    255    101M             =        16      96    CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN    )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I    RG:Z:0
+  read6    163    chr7    302    255    101M             =         1    -201    NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA    I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1    RG:Z:0
+  read7    163    chr7    302    255    10M1D10M5I76M    =         1    -201    NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA    I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1    RG:Z:0
+  read8    165       *      0      0    *                chr7      1       0    NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA    I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1    RG:Z:0
+
+The following metrics file will be produced::
+
+  chr1 length=    101    Aligned= 0    Unaligned= 0
+  chr7 length=    404    Aligned= 7    Unaligned= 0
+  chr8 length=    202    Aligned= 0    Unaligned= 0
+  chr10 length=   303    Aligned= 0    Unaligned= 0
+  chr14 length=   505    Aligned= 0    Unaligned= 0
+  NoCoordinateCount= 1
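+
+For orientation only, a minimal pysam sketch that derives similar per-contig numbers
+from a coordinate-sorted, indexed BAM (``example.bam`` is a hypothetical file; the
+tool itself shells out to the Picard jar via picard_wrapper.py, and pysam's count()
+is only an approximate equivalent of reading the index statistics directly)::
+
+  import pysam
+
+  bam = pysam.Samfile( 'example.bam', 'rb' )  # assumes example.bam.bai exists alongside
+  for name, length in zip( bam.references, bam.lengths ):
+      # count reads overlapping each reference contig (uses the BAM index)
+      print '%s length= %s Aligned= %s' % ( name, length, bam.count( name ) )
+  print 'NoCoordinateCount= %s' % bam.nocoordinate  # reads without coordinates, per the index
+  bam.close()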
+
+  </help>
+</tool>
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/picard/picard_MarkDuplicates.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/picard_MarkDuplicates.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,150 @@
+<tool name="Mark Duplicates" id="picard_MarkDuplicates" version="0.01">
+  <command interpreter="python">
+    picard_wrapper.py
+      --input="$input_file"
+      --remove-dups="$remDups"
+      --read-regex="$readRegex"
+      --opt-dup-dist="$optDupeDist"
+      --output-format=$outputFormat
+      --output-txt=$outMetrics
+      #if str( $outputFormat ) == "sam"
+        #if str( $remDups ) == "true"
+          --output-sam=$outFileSamRemoved
+        #else
+          --output-sam=$outFileSamMarked
+        #end if
+      #else if str( $outputFormat ) == "bam"
+        #if str( $remDups ) == "true"
+          --output-sam=$outFileBamRemoved
+        #else
+          --output-sam=$outFileBamMarked
+        #end if
+      #end if
+      -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/"
+      --picard-cmd="MarkDuplicates"
+  </command>
+  <inputs>
+    <param format="bam,sam" name="input_file" type="data" label="SAM/BAM dataset to mark duplicates in"
+      help="If the select list is empty, you need to upload or import some aligned short read data from a shared library"/>
+    <param name="remDups" type="boolean" label="Remove duplicates from output file" truevalue="true" falsevalue="false" checked="False" 
+      help="If true do not write duplicates to the output file instead of writing them with appropriate flags set" />
+    <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" type="text" size="80"
+      label="Regular expression that can be used to parse read names in the incoming SAM file" 
+      help="Names are parsed to extract: tile/region, x coordinate and y coordinate, to estimate optical duplication rate" >
+      <sanitizer>
+        <valid initial="string.printable">
+         <remove value="&apos;"/>
+        </valid>
+        <mapping initial="none">
+          <add source="&apos;" target="__sq__"/>
+        </mapping>
+      </sanitizer>
+    </param>
+    <param name="optDupeDist" value="100" type="text"
+      label="The maximum offset between two duplicate clusters in order to consider them optical duplicates" size="5" 
+      help="Common range 5-10 pixels. Later Illumina software versions multiply pixel values by 10, in which case 50-100" />
+    <param name="outputFormat" type="boolean" checked="True" truevalue="bam" falsevalue="sam" label="Output bam instead of sam" help="Uncheck for sam output" />
+  </inputs>
+  <outputs>
+    <data format="txt" name="outMetrics" label="${tool.name} on ${on_string}: metrics" />
+    <data format="sam" name="outFileSamMarked" label="${tool.name} on ${on_string}: duplicates marked sam">
+      <filter>outputFormat is False</filter>
+      <filter>remDups is False</filter>
+    </data>
+    <data format="sam" name="outFileSamRemoved" label="${tool.name} on ${on_string}: duplicates removed sam">
+      <filter>outputFormat is False</filter>
+      <filter>remDups is True</filter>
+    </data>
+    <data format="bam" name="outFileBamMarked" label="${tool.name} on ${on_string}: duplicates marked bam">
+      <filter>outputFormat is True</filter>
+      <filter>remDups is False</filter>
+    </data>
+    <data format="bam" name="outFileBamRemoved" label="${tool.name} on ${on_string}: duplicates removed bam">
+      <filter>outputFormat is True</filter>
+      <filter>remDups is True</filter>
+    </data>
+  </outputs>
+  <tests>
+    <!-- Functional tests with Picard bam outputs currently aren't working
+    <test>
+    -->
+      <!-- Command to run:
+      java -jar MarkDuplicates.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_input_tiny_coord.bam METRICS_FILE=picard_MD_output1.txt OUTPUT=picard_MD_output2.bam REMOVE_DUPLICATES=false ASSUME_SORTED=true READ_NAME_REGEX="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" OPTICAL_DUPLICATE_PIXEL_DISTANCE=100
+      -->
+    <!--
+      <param name="input_file" value="picard_input_tiny_coord.bam" />
+      <param name="remDups" value="false" />
+      <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" />
+      <param name="optDupeDist" value="100" />
+      <param name="outputFormat" value="bam" />
+      <output name="outMetrics" file="picard_MD_output1.txt" ftype="txt" lines_diff="4" />
+      <output name="outFileBamMarked" file="picard_MD_output2.bam" ftype="bam" />
+    </test>
+    -->
+    <test>
+      <!-- Command to run:
+      java -jar MarkDuplicates.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_input_tiny_coord.sam METRICS_FILE=picard_MD_output3.txt O=picard_MD_output4.sam REMOVE_DUPLICATES=true ASSUME_SORTED=true READ_NAME_REGEX="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" OPTICAL_DUPLICATE_PIXEL_DISTANCE=100
+      -->
+      <param name="input_file" value="picard_input_tiny_coord.sam" />
+      <param name="remDups" value="true" />
+      <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" />
+      <param name="optDupeDist" value="100" />
+      <param name="outputFormat" value="sam" />
+      <output name="outMetrics" file="picard_MD_output3.txt" ftype="txt" lines_diff="4" />
+      <output name="outFileSamRemoved" file="picard_MD_output4.sam" ftype="sam" />
+    </test>
+  </tests>
+  
+  <help>
+
+.. class:: infomark
+
+**Purpose**
+
+MarkDuplicates examines aligned records in the supplied sam or bam file to identify duplicate molecules.
+
+**Picard documentation**
+
+This is a Galaxy interface for MarkDuplicates, a part of Picard-tools_, which is closely related to SAMTools_.
+
+ .. _Picard-tools: http://picard.sourceforge.net/index.shtml
+ .. _SAMTools: http://samtools.sourceforge.net/
+
+------
+
+**Input**
+
+Either a sam file or a bam file is required. If a bam file is used, it must be coordinate-sorted.
+
+**Outputs**
+
+This tool provides two outputs. The first contains the marked (or kept) records and is either bam (the default) or sam, according to user selection. Bam is recommended since it is smaller. The second output is the metrics file, which is a text file containing information about the duplicates. 
+
+**MarkDuplicates parameters**
+
+The two main parameters to be concerned with are the flag for removing duplicates and the regular expression used to parse read names. If the remove-duplicates flag is set, duplicates are not written to the output file at all; otherwise they appear in the output flagged appropriately. The read name regular expression is used to parse read names from the input sam file. Read names are parsed to extract three variables: tile/region, x coordinate, and y coordinate. These values are used to estimate the rate of optical duplication, which in turn gives a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order (capture groups are enclosed in parentheses). Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*.
+
+One other parameter that can be set is the maximum offset between two duplicate clusters in order for them to be considered optical duplicates. Later versions of the Illumina pipeline that multiply pixel values by 10 should generally use 50-100 pixels; otherwise 5-10 is normal. The default is set to 100. 
+
+One parameter that Picard's MarkDuplicates offers that is automatically set by Galaxy is the ASSUME_SORTED, which is set to true because Galaxy bam should always be coordinate-sorted.
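+
+For reference, the Picard command line issued by the wrapper resembles the one used
+in this tool's functional tests (file names here are illustrative)::
+
+  java -jar MarkDuplicates.jar VALIDATION_STRINGENCY=LENIENT I=input.bam METRICS_FILE=metrics.txt
+    O=output.bam REMOVE_DUPLICATES=false ASSUME_SORTED=true
+    READ_NAME_REGEX="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" OPTICAL_DUPLICATE_PIXEL_DISTANCE=100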
+
+**Note on the use of regular expressions for read name parsing**
+
+The regular expression (regex) is used to parse the read names, so it's important to get it exactly right; don't edit it unless you know exactly what you're doing. The three parts of the read name it identifies are tile/region, x coordinate, and y coordinate, which are used in conjunction with the optical duplication rate to more accurately estimate library size.
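+
+As a minimal sketch of what this parsing does (the read name below is made up, and
+this mirrors, rather than reproduces, Picard's internal logic)::
+
+  import re
+
+  READ_NAME_REGEX = r'[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*'
+  m = re.match( READ_NAME_REGEX, 'RUN001:7:2204:10011:44567' )
+  if m:
+      # three capture groups, in order: tile/region, x coordinate, y coordinate
+      tile, x, y = map( int, m.groups() )
+      print 'tile=%d x=%d y=%d' % ( tile, x, y )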
+
+
+
+  </help>
+</tool>
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/picard/picard_ReorderSam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/picard_ReorderSam.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,165 @@
+<tool name="Reorder SAM/BAM" id="picard_ReorderSam" version="0.3.0">
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <command interpreter="python">
+    picard_wrapper.py
+      --input=$inputFile
+      #if $source.indexSource == "built-in"
+        --ref="${ filter( lambda x: str( x[0] ) == str( $source.ref ), $__app__.tool_data_tables[ 'picard_indexes' ].get_fields() )[0][-1] }"
+      #else
+        --ref-file=$refFile
+        --species-name=$source.speciesName
+        --build-name=$source.buildName
+        --trunc-names=$source.truncateSeqNames
+      #end if
+      --allow-inc-dict-concord=$allowIncDictConcord
+      --allow-contig-len-discord=$allowContigLenDiscord
+      --output-format=$outputFormat
+      --output=$outFile
+      -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/ReorderSam.jar"
+  </command>
+  <inputs>
+    <param format="bam,sam" name="inputFile" type="data" label="SAM/BAM dataset to be reordered"
+           help="If empty, upload or import a SAM/BAM dataset." />
+    <conditional name="source">
+      <param name="indexSource" type="select" label="Select Reference Genome" help="This tool will re-order SAM/BAM in the same order as the reference selected below.">
+        <option value="built-in">Locally cached</option>
+        <option value="history">History</option>
+      </param>
+      <when value="built-in">
+        <param name="ref" type="select" label="Select a reference genome">
+          <options from_data_table="picard_indexes" />
+        </param>
+      </when>
+      <when value="history">
+        <param name="refFile" type="data" format="fasta" metadata_name="dbkey" label="Using reference file" />
+        <param name="speciesName" type="text" value="" label="Species name" />
+        <param name="buildName" type="text" value="" label="Build name" />
+        <param name="truncateSeqNames" type="boolean" checked="False" truevalue="true" falsevalue="false" label="Truncate sequence names after first whitespace" />
+      </when>
+    </conditional>
+    <param name="allowIncDictConcord" type="boolean" checked="False" truevalue="true" falsevalue="false" label="Allow incomplete dict concordance?" help="Allows a partial overlap of the BAM contigs with the new reference sequence contigs." />
+    <param name="allowContigLenDiscord" type="boolean" checked="False" truevalue="true" falsevalue="false" label="Allow contig length discordance?" help="This is dangerous--don't check it unless you know exactly what you're doing!" />
+    <param name="outputFormat" type="boolean" checked="True" truevalue="bam" falsevalue="sam" label="Output BAM instead of SAM" help="Uncheck for SAM output" />
+  </inputs>
+  <outputs>
+    <data name="outFile" format="bam" label="${tool.name} on ${on_string}: reordered ${outputFormat}">
+      <change_format>
+        <when input="outputFormat" value="sam" format="sam" />
+      </change_format>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <!-- Commands:
+      cp test-data/phiX.fasta .
+      samtools faidx phiX.fasta
+      java -jar CreateSequenceDictionary.jar R=phiX.fasta O=phiX.dict URI=phiX.fasta TRUNCATE_NAMES_AT_WHITESPACE=false SPECIES=phiX174
+      java -jar ReorderSam.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_RS_input1.bam O=picard_RS_output1.bam REFERENCE=phiX.fasta ALLOW_INCOMPLETE_DICT_CONCORDANCE=false ALLOW_CONTIG_LENGTH_DISCORDANCE=false
+    -->
+      <param name="inputFile" value="picard_RS_input1.bam" />
+      <param name="indexSource" value="history" />
+      <param name="refFile" value="phiX.fasta" />
+      <param name="speciesName" value="phiX174" />
+      <param name="buildName" value="" />
+      <param name="truncateSeqNames" value="false" />
+      <param name="allowIncDictConcord" value="false" />
+      <param name="allowContigLenDiscord" value="false" />
+      <param name="outputFormat" value="True" />
+      <output name="outFile" file="picard_RS_output1.ba
[...]
_RS_input2.sam" />
+      <param name="indexSource" value="built-in" />
+      <param name="ref" value="phiX" />
+      <param name="allowIncDictConcord" value="false" />
+      <param name="allowContigLenDiscord" value="false" />
+      <param name="outputFormat" value="False" />
+      <output name="outFile" file="picard_RS_output2.sam" ftype="sam" lines_diff="4" sort="True" />
+    </test>
+    <test>
+      <!-- Commands:
+      cp test-data/picard_RS_input4.fasta .
+      samtools faidx picard_RS_input4.fasta
+      java -jar CreateSequenceDictionary.jar R=picard_RS_input4.fasta O=picard_RS_input4.dict URI=picard_RS_input4.fasta TRUNCATE_NAMES_AT_WHITESPACE=true SPECIES=phiX174 GENOME_ASSEMBLY=phiX_buildBlah1.1
+      java -jar ReorderSam.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_RS_input3.bam O=picard_RS_output3.sam REFERENCE=picard_RS_input4.fasta ALLOW_INCOMPLETE_DICT_CONCORDANCE=true ALLOW_CONTIG_LENGTH_DISCORDANCE=false
+      picard_RS_input3.bam can be made from picard_RS_input3.sam
+      -->
+      <param name="inputFile" value="picard_RS_input3.bam" />
+      <param name="indexSource" value="history" />
+      <param name="refFile" value="picard_RS_input4.fasta" />
+      <param name="speciesName" value="phiX174" />
+      <param name="buildName" value="phiX_buildBlah1.1" />
+      <param name="truncateSeqNames" value="true" />
+      <param name="allowIncDictConcord" value="true" />
+      <param name="allowContigLenDiscord" value="false" />
+      <param name="outputFormat" value="False" />
+      <output name="outFile" file="picard_RS_output3.sam" ftype="sam" lines_diff="12" sort="True" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**Purpose**
+
+Reorder SAM/BAM to match contig ordering in a particular reference file. Note that this is
+not the same as sorting as done by the SortSam tool, which sorts by either coordinate
+values or query name. The ordering in ReorderSam is based on exact name matching of
+contigs/chromosomes. Reads that are mapped to a contig that is not in the new reference file are
+not included in the output.
+
+**Picard documentation**
+
+This is a Galaxy wrapper for ReorderSam, a part of the external package Picard-tools_.
+
+ .. _Picard-tools: http://www.google.com/search?q=picard+samtools
+
+------
+
+.. class:: infomark
+
+**Inputs, outputs, and parameters**
+
+For the file that needs to be reordered, either a sam file or a bam file must be supplied.
+If a bam file is used, it must be coordinate-sorted. A reference file is also required,
+so either a fasta file should be supplied or a built-in reference can be selected.
+
+The output contains the same reads as the input file but the reads have been rearranged so
+they appear in the same order as the provided reference file. The tool will output either
+bam (the default) or sam, according to user selection. Bam is recommended since it is smaller.
+
+The only extra parameters that can be set are flags for allowing incomplete dict concordance
+and allowing contig length discordance. If incomplete dict concordance is allowed, only a
+partial overlap of the bam contigs with the new reference sequence contigs is required. By
+default it is off, requiring a corresponding contig in the new reference for each read contig.
+If contig length discordance is allowed, contig names that are the same between a read and the
+new reference contig are allowed even if they have different lengths. This is usually not a
+good idea, unless you know exactly what you're doing. It's off by default.
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+
+  </help>
+</tool>
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/picard/picard_ReplaceSamHeader.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/picard_ReplaceSamHeader.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,115 @@
+<tool name="Replace SAM/BAM Header" id="picard_ReplaceSamHeader" version="0.2.0">
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <command interpreter="python">
+    picard_wrapper.py
+      --input "$inputFile"
+      -o $outFile
+      --header-file $headerFile
+      --output-format $outputFormat
+      -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/ReplaceSamHeader.jar"
+      --tmpdir "${__new_file_path__}" 
+  </command>
+  <inputs>
+    <param format="bam,sam" name="inputFile" type="data" label="SAM/BAM dataset to replace header in (TARGET)"
+      help="If empty, upload or import a SAM/BAM dataset." />
+    <param format="bam,sam" name="headerFile" type="data" label="SAM/BAM to reader header from (SOURCE)"
+      help="If empty, upload or import a SAM/BAM dataset." />
+    <param name="outputFormat" type="boolean" checked="True" truevalue="bam" falsevalue="sam" label="Output BAM instead of SAM" help="Uncheck for SAM output" />
+  </inputs>
+  <outputs>
+    <data name="outFile" format="bam" label="${tool.name} on ${on_string}: ${outputFormat} with replaced header">
+      <change_format>
+        <when input="outputFormat" value="sam" format="sam" />
+      </change_format>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <!-- Command:
+      java -jar ReplaceSamHeader.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_input_tiny_coord.bam HEADER=test-data/picard_RSH_input1.bam O=picard_RSH_output1.sam
+      picard_RSH_input1.bam can be made from picard_RSH_input1.sam
+      -->
+      <param name="inputFile" value="picard_input_tiny_coord.bam" ftype="bam" />
+      <param name="headerFile" value="picard_RSH_input1.bam" ftype="bam" />
+      <param name="outputFormat" value="False" />
+      <output name="outFile" file="picard_RSH_output1.sam" ftype="sam" />
+    </test>
+    <test>
+      <!-- Command:
+      java -jar ReplaceSamHeader.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_input_tiny_coord.sam HEADER=test-data/picard_RSH_input1.bam O=picard_RSH_output2.sam
+      picard_RSH_input1.bam can be made from picard_RSH_input1.sam
+      -->
+      <param name="inputFile" value="picard_input_tiny_coord.sam" ftype="sam" />
+      <param name="headerFile" value="picard_RSH_input1.bam" ftype="bam" />
+      <param name="outputFormat" value="False" />
+      <output name="outFile" file="picard_RSH_output2.sam" ftype="sam" />
+    </test>
+    <test>
+      <!-- Command:
+      java -jar ReplaceSamHeader.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_input_tiny_coord.sam HEADER=test-data/picard_RSH_input1.sam O=picard_RSH_output2.bam
+      -->
+      <param name="inputFile" value="picard_input_tiny_coord.sam" ftype="sam" />
+      <param name="headerFile" value="picard_RSH_input1.sam" ftype="sam" />
+      <param name="outputFormat" value="True" />
+      <output name="outFile" file="picard_RSH_output2.bam" ftype="bam" />
+    </test>
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**Purpose**
+
+Replace the SAM header with the header from another sam file. The tool does not do any
+significant validation, so it's up to the user to make sure that the elements in
+the header are relevant and that the new header contains everything required.
+
+Replace the SAMFileHeader in a SAM file with the given header. Validation is
+minimal. It is up to the user to ensure that all the elements referred to in the
+SAMRecords are present in the new header. Sort order of the two input files must
+be the same.
+
+**Picard documentation**
+
+This is a Galaxy wrapper for ReplaceSamHeader, a part of the external package Picard-tools_.
+
+ .. _Picard-tools: http://www.google.com/search?q=picard+samtools
+
+------
+
+.. class:: infomark
+
+**Inputs and outputs**
+
+Either a sam file or a bam file is required as the file whose header will be replaced.
+The header file is also required and can also be either sam or bam (it does not have
+to be the same type as the other file). In both cases, if a bam file is used, it must
+be coordinate-sorted. Galaxy currently coordinate-sorts all bam files.
+
+The tool will output either bam (the default) or sam. Bam is recommended since it is smaller.
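+
+For reference, the Picard command line issued by the wrapper resembles the one used
+in this tool's functional tests (file names here are illustrative)::
+
+  java -jar ReplaceSamHeader.jar VALIDATION_STRINGENCY=LENIENT I=target.bam HEADER=header_source.bam O=output.sam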
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+
+
+  </help>
+</tool>
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/picard/picard_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/picard_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,734 @@
+#!/usr/bin/env python
+"""
+Originally written by Kelly Vincent
+pretty output and additional picard wrappers by Ross Lazarus for rgenetics
+Runs all available wrapped Picard tools.
+usage: picard_wrapper.py [options]
+code Ross wrote licensed under the LGPL
+see http://www.gnu.org/copyleft/lesser.html
+"""
+
+import optparse, os, sys, subprocess, tempfile, shutil, time, logging
+
+galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="Galaxy %s tool output - see http://getgalaxy.org/" />
+<title></title>
+<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
+</head>
+<body>
+<div class="document">
+"""
+galhtmlattr = """Galaxy tool %s run at %s</b><br/>"""
+galhtmlpostfix = """</div></body></html>\n"""
+
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+
+def timenow():
+    """return current time as a string
+    """
+    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
+
+class PicardBase():
+    """
+    simple base class with some utilities for Picard
+    adapted and merged with Kelly Vincent's code april 2011 Ross
+    lots of changes...
+    """
+
+    def __init__(self, opts=None,arg0=None):
+        """ common stuff needed at init for a picard tool
+        """
+        assert opts <> None, 'PicardBase needs opts at init'
+        self.opts = opts
+        if self.opts.outdir == None:
+             self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
+        assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
+        self.picname = self.baseName(opts.jar)
+        if self.picname.startswith('picard'):
+            self.picname = opts.picard_cmd # special case for some tools like replaceheader?
+        self.progname = self.baseName(arg0)
+        self.version = '0.002'
+        self.delme = [] # list of files to destroy
+        self.title = opts.title
+        self.inputfile = opts.input
+        try:
+            os.makedirs(opts.outdir)
+        except:
+            pass
+        try:
+            os.makedirs(opts.tmpdir)
+        except:
+            pass
+        self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
+        self.metricsOut =  os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
+        self.setLogging(logfname=self.log_filename)
+
+    def baseName(self,name=None):
+        return os.path.splitext(os.path.basename(name))[0]
+
+    def setLogging(self,logfname="picard_wrapper.log"):
+        """setup a logger
+        """
+        logging.basicConfig(level=logging.INFO,
+                    filename=logfname,
+                    filemode='a')
+
+
+    def readLarge(self,fname=None):
+        """ read a potentially huge file.
+        """
+        try:
+            # get stderr, allowing for case where it's very large
+            tmp = open( fname, 'rb' )
+            s = ''
+            buffsize = 1048576
+            try:
+                while True:
+                    more = tmp.read( buffsize )
+                    if len(more) > 0:
+                        s += more
+                    else:
+                        break
+            except OverflowError:
+                pass
+            tmp.close()
+        except Exception, e:
+            stop_err( 'Error : %s' % str( e ) )
+        return s
+
+    def runCL(self,cl=None,output_dir=None):
+        """ construct and run a command line
+        we have galaxy's temp path as opt.temp_dir so don't really need isolation
+        sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
+
[...]
% tempout)
+        # reference
+        cl.append('REFERENCE=%s' % ref_file_name)
+        # incomplete dict concordance
+        if opts.allow_inc_dict_concord == 'true':
+            cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
+        # contig length discordance
+        if opts.allow_contig_len_discord == 'true':
+            cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
+        pic.runPic(opts.jar, cl)
+        haveTempout = True
+
+    elif pic.picname == 'ReplaceSamHeader':
+        cl.append('INPUT=%s' % opts.input)
+        cl.append('OUTPUT=%s' % tempout)
+        cl.append('HEADER=%s' % opts.header_file)
+        pic.runPic(opts.jar, cl)
+        haveTempout = True
+
+    elif pic.picname == 'CalculateHsMetrics':
+        maxloglines = 100
+        baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait')
+        targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
+        baitf = pic.makePicInterval(opts.baitbed,baitfname)
+        if opts.targetbed == opts.baitbed: # same file sometimes
+            targetf = baitf
+        else:
+            targetf = pic.makePicInterval(opts.targetbed,targetfname)
+        cl.append('BAIT_INTERVALS=%s' % baitf)
+        cl.append('TARGET_INTERVALS=%s' % targetf)
+        cl.append('INPUT=%s' % os.path.abspath(opts.input))
+        cl.append('OUTPUT=%s' % pic.metricsOut)
+        cl.append('TMP_DIR=%s' % opts.tmpdir)
+        pic.runPic(opts.jar,cl)
+
+    elif pic.picname == 'ValidateSamFile':
+        import pysam
+        doTranspose = False
+        sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
+        stf = open(pic.log_filename,'w')
+        tlog = None
+        if opts.datatype == 'sam': # need to work with a bam
+            tlog,tempbam = pic.samToBam(opts.input,opts.outdir)
+            try:
+                tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
+            except:
+                print '## exception on sorting sam file %s' % opts.input
+        else: # is already bam
+            try:
+                tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
+            except: # bug - [bam_sort_core] not being ignored - TODO fixme
+                print '## exception on sorting bam file %s' % opts.input
+        if tlog:
+            print '##tlog=',tlog
+            stf.write(tlog)
+            stf.write('\n')
+        sortedfile = '%s.bam' % sortedfile # samtools does that
+        cl.append('O=%s' % pic.metricsOut)
+        cl.append('TMP_DIR=%s' % opts.tmpdir)
+        cl.append('I=%s' % sortedfile)
+        opts.maxerrors = '99999999'
+        cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
+        if opts.ignoreflags[0] <> 'None': # picard error values to ignore
+            igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
+            cl.append(' '.join(igs))
+        if opts.bisulphite.lower() <> 'false':
+            cl.append('IS_BISULFITE_SEQUENCED=true')
+        if opts.ref <> None or opts.ref_file <> None:
+            cl.append('R=%s' %  ref_file_name)
+        pic.runPic(opts.jar,cl)
+        if opts.datatype == 'sam':
+            pic.delme.append(tempbam)
+        newsam = opts.output
+        outformat = 'bam'
+        pe = open(pic.metricsOut,'r').readlines()
+        pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
+        pic.delme.append(sortedfile) # not wanted
+        stf.close()
+        pic.cleanup()
+    else:
+        print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
+        sys.exit(1)
+    if haveTempout:
+        # Some Picard tools produced a potentially intermediate bam file.
+        # Either just move to final location or create sam
+        shutil.move(tempout, os.path.abspath(opts.output))
+
+    if opts.htmlout <> None or doFix: # return a pretty html page
+        pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
+
+if __name__=="__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/picard/rgPicardASMetrics.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/rgPicardASMetrics.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,162 @@
+<tool name="SAM/BAM Alignment Summary Metrics" id="PicardASMetrics" version="0.03">
+  <command interpreter="python">
+    picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file"
+    --assumesorted "$sorted" -b "$bisulphite" --adaptors "$adaptors" --maxinsert "$maxinsert" -n "$out_prefix"
+    -j ${GALAXY_DATA_INDEX_DIR}/shared/jars/CollectAlignmentSummaryMetrics.jar
+#if $genomeSource.refGenomeSource == "history":
+    --ref-file "$genomeSource.ownFile"
+#else
+    --ref "${ filter( lambda x: str( x[0] ) == str( $genomeSource.index ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }"
+#end if
+  </command>
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <inputs>
+    <param format="sam,bam" name="input_file" type="data" label="SAM/BAM dataset to generate statistics for"
+      help="If empty, upload or import a SAM/BAM dataset."/>
+    <param name="out_prefix" value="Picard Alignment Summary Metrics" type="text"
+      label="Title for the output file" help="Use this to remind you what the job was for." size="80" />
+    <conditional name="genomeSource">
+      <param name="refGenomeSource" type="select" label="Select Reference Genome">
+        <option value="default" selected="true">Use the assigned data genome/build</option>
+        <option value="indexed">Select a different built-in genome</option>
+        <option value="history">Use a genome (fasta format) from my history</option>
+      </param>
+      <when value="default">
+        <param name="index" type="select" label="Check the assigned reference genome" help="Galaxy thinks that the reads in your dataset were aligned against this reference. If this is not correct, use the 'Select a different built-in genome' option of the 'Select Reference Genome' dropdown to select the appropriate reference.">
+          <options from_data_table="all_fasta">
+          <filter type="data_meta" ref="input_file" key="dbkey" column="dbkey" multiple="True" separator="," />
+          <validator type="no_options" message="No reference build available for selected input" />
+          </options>
+        </param>
+      </when>
+      <when value="indexed">
+        <param name="index" type="select" label="Select a built-in reference genome" help="This list contains genomes cached at this Galaxy instance. If your genome of interest is not present here, request it by using the 'Help' link at the top of the Galaxy interface, or use the 'Use a genome (fasta format) from my history' option of the 'Select Reference Genome' dropdown.">
          <options from_data_table="all_fasta">
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference genome from history" help="This option works best for relatively small genomes. If you are working with large human-sized genomes, send a request to the Galaxy team to add your reference to this Galaxy instance by using the 'Help' link at the top of the Galaxy interface."/>
+      </when>
+    </conditional>
+    <param name="sorted" type="boolean" label="Assume the input file is already sorted" checked="true" truevalue="true" falsevalue="false"/>
+    <param name="bisulphite" type="boolean" label="Input file contains Bisulphite sequenced reads" checked="false" falsevalue="false" truevalue="true" />
+    <param name="adaptors" value="" type="text" area="true" label="Adapter sequences" help="One per line if multiple" size="5x120" />
+    <param name="maxinsert" value="100000" type="integer" label="Paired end reads above this insert size and inter-chromosomal pairs are considered chimeric" size="20" />
+  </inputs>
+  <outputs>
+    <data format="html" name="html_file"  label="${out_prefix}.html" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="out_prefix" value="AsMetrics" />
+      <param name="bisulphite" value="false" />
+      <param name="sorted" va
[...]
with inter-chromosomal pairs. Default value: 100000."
+    "ADAPTER_SEQUENCE=String","This option may be specified 0 or more times."
+    "IS_BISULFITE_SEQUENCED=Boolean","Whether the SAM or BAM file consists of bisulfite sequenced reads. Default value: false."
+    "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created."
+
+The output produced by the tool has the following columns::
+
+  1. CATEGORY: One of either UNPAIRED (for a fragment run), FIRST_OF_PAIR when metrics are for only the first read in a paired run, SECOND_OF_PAIR when the metrics are for only the second read in a paired run or PAIR when the metrics are aggregated for both first and second reads in a pair.
+  2. TOTAL_READS: The total number of reads including all PF and non-PF reads. When CATEGORY equals PAIR this value will be 2x the number of clusters.
+  3. PF_READS: The number of PF reads where PF is defined as passing Illumina's filter.
+  4. PCT_PF_READS: The percentage of reads that are PF (PF_READS / TOTAL_READS)
+  5. PF_NOISE_READS: The number of PF reads that are marked as noise reads. A noise read is one which is composed entirely of A bases and/or N bases. These reads are marked as they are usually artifactual and are of no use in downstream analysis.
+  6. PF_READS_ALIGNED: The number of PF reads that were aligned to the reference sequence. This includes reads that aligned with low quality (i.e. their alignments are ambiguous).
+  7. PCT_PF_READS_ALIGNED: The percentage of PF reads that aligned to the reference sequence. PF_READS_ALIGNED / PF_READS
+  8. PF_HQ_ALIGNED_READS: The number of PF reads that were aligned to the reference sequence with a mapping quality of Q20 or higher signifying that the aligner estimates a 1/100 (or smaller) chance that the alignment is wrong.
+  9. PF_HQ_ALIGNED_BASES: The number of bases aligned to the reference sequence in reads that were mapped at high quality. Will usually approximate PF_HQ_ALIGNED_READS * READ_LENGTH but may differ when either mixed read lengths are present or many reads are aligned with gaps.
+ 10. PF_HQ_ALIGNED_Q20_BASES: The subset of PF_HQ_ALIGNED_BASES where the base call quality was Q20 or higher.
+ 11. PF_HQ_MEDIAN_MISMATCHES: The median number of mismatches versus the reference sequence in reads that were aligned to the reference at high quality (i.e. PF_HQ_ALIGNED_READS).
+ 12. PF_HQ_ERROR_RATE: The percentage of bases that mismatch the reference in PF HQ aligned reads.
+ 13. MEAN_READ_LENGTH: The mean read length of the set of reads examined. When looking at the data for a single lane with equal length reads this number is just the read length. When looking at data for merged lanes with differing read lengths this is the mean read length of all reads.
+ 14. READS_ALIGNED_IN_PAIRS: The number of aligned reads whose mate pair was also aligned to the reference.
+ 15. PCT_READS_ALIGNED_IN_PAIRS: The percentage of reads whose mate pair was also aligned to the reference. READS_ALIGNED_IN_PAIRS / PF_READS_ALIGNED
+ 16. BAD_CYCLES: The number of instrument cycles in which 80% or more of base calls were no-calls.
+ 17. STRAND_BALANCE: The number of PF reads aligned to the positive strand of the genome divided by the number of PF reads aligned to the genome.
+ 18. PCT_CHIMERAS: The percentage of reads that map outside of a maximum insert size (usually 100kb) or that have the two ends mapping to different chromosomes.
+ 19. PCT_ADAPTER: The percentage of PF reads that are unaligned and match to a known adapter sequence right from the start of the read.
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+
+  </help>
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/picard/rgPicardFixMate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/rgPicardFixMate.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,107 @@
+<tool name="Paired Read Mate Fixer" id="rgPicFixMate" version="0.2.0">
+  <description>for paired data</description>
+  <command interpreter="python">
+   picard_wrapper.py -i "$input_file" -o "$out_file" --tmpdir "${__new_file_path__}" -n "$out_prefix" 
+   --output-format "$outputFormat" -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/FixMateInformation.jar" --sortorder "$sortOrder"
+  </command>
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <inputs>
+    <param format="bam,sam" name="input_file" type="data" label="SAM/BAM dataset to fix"
+      help="If empty, upload or import a SAM/BAM dataset."/>
+      <param name="sortOrder" type="select" help="If in doubt, leave as default and read Picard/Samtools documentation"
+         label="Sort order">
+        <option value="coordinate" selected ="true">Coordinate sort</option>
+        <option value="queryname">Query name sort</option>
+        <option value="unsorted">Unsorted - docs not clear if this means unchanged or not</option>
+      </param>
+    <param name="out_prefix" value="Fix Mate" type="text"
+      label="Title for the output file" help="Use this remind you what the job was for." size="80" />
+    <param name="outputFormat" type="boolean" checked="True" truevalue="bam" falsevalue="sam" label="Output BAM instead of SAM" help="Uncheck for SAM output" />
+  </inputs>
+  <outputs>
+    <data format="bam" name="out_file" label="${tool.name} on ${on_string}: ${outputFormat} with fixed mates">
+    <change_format>
+     <when input="outputFormat" value="sam" format="sam" />
+    </change_format> 
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_file" value="picard_input_sorted_pair.sam" />
+      <param name="sortOrder" value="coordinate" />
+      <param name="outputFormat" value="True" />
+      <param name="out_prefix" value="Test FixMate" />
+      <output name="out_file" file="picard_output_fixmate_sorted_pair.bam" ftype="bam" />
+    </test>
+    <test>
+      <param name="input_file" value="picard_input_sorted_pair.sam" />
+      <param name="sortOrder" value="coordinate" />
+      <param name="outputFormat" value="False" />
+      <param name="out_prefix" value="Test FixMate" />
+      <output name="out_file" file="picard_output_fixmate_sorted_pair.sam" ftype="sam" />
+    </test>
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**Purpose**
+
+Ensure that all mate-pair information is in sync between each read and its mate pair.
+
+**Picard documentation**
+
+This is a Galaxy wrapper for FixMateInformation, a part of the external package Picard-tools_.
+
+ .. _Picard-tools: http://www.google.com/search?q=picard+samtools
+
+.. class:: warningmark
+
+**Useful for paired data only**
+
+This tool likely won't do anything helpful for single-end sequence data.
+Currently, Galaxy doesn't distinguish paired from single-end SAM/BAM, so make sure
+the data you choose are valid (paired-end) SAM or BAM data - unless you trust this
+tool not to harm your data.
+
+-----
+
+.. class:: infomark
+
+**Syntax**
+
+- **Input** - paired-read SAM/BAM format aligned short read data in your current history
+- **Sort order** - can be used to adjust the ordering of reads 
+- **Title** - the title to use for all output files from this job - use it for high level metadata
+- **Output Format** - either SAM or compressed as BAM
+
+-----
+
+.. class:: infomark
+
+**Inputs, outputs, and parameters**
+
+.. csv-table::
+   :header-rows: 1
+
+   Option,Description
+   "INPUT=File","The input file to fix. This option may be specified 0 or more times."
+   "OUTPUT=File","The output file to write to"
+   "SORT_ORDER=SortOrder","Optional sort order if the OUTPUT file should be sorted differently than the INPUT file. Default value: null. Possible values: {unsorted, queryname, coordinate}"
+   "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false"
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+
+  </help>
+</tool>
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/picard/rgPicardGCBiasMetrics.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/rgPicardGCBiasMetrics.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,148 @@
+<tool name="SAM/BAM GC Bias Metrics" id="PicardGCBiasMetrics" version="0.01">
+  <command interpreter="python">
+    picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file"
+    --windowsize "$windowsize" --mingenomefrac "$mingenomefrac" -n "$out_prefix" --tmpdir "${__new_file_path__}"
+    -j ${GALAXY_DATA_INDEX_DIR}/shared/jars/CollectGcBiasMetrics.jar
+#if $genomeSource.refGenomeSource == "history":
+ --ref-file "$genomeSource.ownFile"
+#else:
+ --ref "${ filter( lambda x: str( x[0] ) == str( $genomeSource.index ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }"
+#end if
+  </command>
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <inputs>
+    <param format="sam,bam" name="input_file" type="data" label="SAM/BAM dataset to generate GC bias metrics for"
+      help="If empty, upload or import a SAM/BAM dataset."/>
+    <param name="out_prefix" value="Short Read GC Bias Metrics" type="text"
+      label="Title for the output file" help="Use this to remind you what the job was for." size="80" />
+    <conditional name="genomeSource">
+      <param name="refGenomeSource" type="select" label="Select Reference Genome">
+        <option value="default" selected="true">Use the assigned data genome/build</option>
+        <option value="indexed">Select a different built-in genome</option>
+        <option value="history">Use a genome (fasta format) from my history</option>
+      </param>
+      <when value="default">
+        <param name="index" type="select" label="Check the assigned reference genome" help="Galaxy thinks that the reads in your dataset were aligned against this reference. If this is not correct, use the 'Select a different built-in genome' option of the 'Select Reference Genome' dropdown to select the appropriate reference.">
+          <options from_data_table="all_fasta">
+          <filter type="data_meta" ref="input_file" key="dbkey" column="dbkey" multiple="True" separator=","/>
+          <validator type="no_options" message="No reference build available for the selected input data" />
+          </options>
+        </param>
+      </when>
+      <when value="indexed">
+        <param name="index" type="select" label="Select a built-in reference genome" help="This list contains genomes cached at this Galaxy instance. If your genome of interest is not present here, request it by using the 'Help' link at the top of the Galaxy interface, or use the 'Use a genome (fasta format) from my history' option of the 'Select Reference Genome' dropdown.">
+          <options from_data_table="all_fasta"/>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference genome from history" help="This option works best for relatively small genomes. If you are working with large human-sized genomes, send a request to the Galaxy team to add your reference to this Galaxy instance by using the 'Help' link at the top of the Galaxy interface."/>
+      </when>
+    </conditional>
+    <param name="windowsize" type="integer" label="GC minimum window size" value="100"
+    help="The size of windows on the genome that are used to bin reads. Default value: 100."/>
+    <param name="mingenomefrac" value="0.00001" type="float" label="Minimum Genome Fraction"
+    help="For summary metrics, exclude GC windows that include less than this fraction of the genome. Default value: 1.0E-5." />
+    <!--
+
+    Users can be enabled to set Java heap size by uncommenting this option and adding '-x "$maxheap"' to the <command> tag.
+    If commented out the heapsize defaults to the value specified within picard_wrapper.py
+
+    <param name="maxheap" type="select" help="If in doubt, choose 8G and read Picard documentation please"
+     label="Java heap size">
+    <option value="1G">1GB: very small data</option>
+    <option value="2G" selected="true">2GB</option>
+    <opt
[...]
+**Syntax**
+
+- **Input** - SAM/BAM format aligned short read data in your current history
+- **Title** - the title to use for all output files from this job - use it for high level metadata
+- **Reference Genome** - Galaxy (and Picard) needs to know which genomic reference was used to generate alignments within the input SAM/BAM dataset. Here you have three choices:
+
+  - *Assigned data genome/build* - a genome specified for this dataset. If your SAM/BAM dataset has an assigned reference genome it will be displayed below this dropdown. If it does not -> use one of the following two options.
+  - *Select a different built-in genome* - this option will list all reference genomes presently cached at this instance of Galaxy.
+  - *Select a reference genome from history* - alternatively you can upload your own version of the reference genome into your history and use it with this option. This is however not advisable with large human-sized genomes. If your genome is large, contact the Galaxy team using the "Help" link at the top of the interface and provide exact details on where we can download the sequences you would like to use as the reference. We will then install them as a part of locally cached genomic references.
+
+- **Window Size** see Picard documentation http://picard.sourceforge.net/command-line-overview.shtml#CollectGCBiasMetrics
+- **Minimum Genome Fraction** See Picard documentation at http://picard.sourceforge.net/command-line-overview.shtml#CollectGCBiasMetrics
+
+-----
+
+.. class:: infomark
+
+**Inputs, outputs, and parameters**
+
+The Picard documentation (reformatted for Galaxy) says:
+
+.. csv-table::
+   :header-rows: 1
+
+    Option,Description
+    "REFERENCE_SEQUENCE=File","The reference sequence fasta file. Required."
+    "INPUT=File","The BAM or SAM file containing aligned reads. Required."
+    "OUTPUT=File","The text file to write the metrics table to. Required."
+    "CHART_OUTPUT=File","The PDF file to render the chart to. Required."
+    "SUMMARY_OUTPUT=File","The text file to write summary metrics to. Default value: null."
+    "WINDOW_SIZE=Integer","The size of windows on the genome that are used to bin reads. Default value: 100."
+    "MINIMUM_GENOME_FRACTION=Double","For summary metrics, exclude GC windows that include less than this fraction of the genome. Default value: 1.0E-5."
+    "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false."
+
+The output produced by the tool has the following columns::
+
+ 1. GC: The G+C content of the reference sequence represented by this bin. Values are from 0% to 100%
+ 2. WINDOWS: The number of windows on the reference genome that have this G+C content.
+ 3. READ_STARTS: The number of reads whose start position is at the start of a window of this GC.
+ 4. MEAN_BASE_QUALITY: The mean quality (determined via the error rate) of all bases of all reads that are assigned to windows of this GC.
+ 5. NORMALIZED_COVERAGE: The ratio of "coverage" in this GC bin vs. the mean coverage of all GC bins. A number of 1 represents mean coverage, a number less than one represents lower than mean coverage (e.g. 0.5 means half as much coverage as average) while a number greater than one represents higher than mean coverage (e.g. 3.1 means this GC bin has 3.1 times more reads per window than average).
+ 6. ERROR_BAR_WIDTH: The radius of error bars in this bin based on the number of observations made. For example if the normalized coverage is 0.75 and the error bar width is 0.1 then the error bars would be drawn from 0.65 to 0.85.
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/picard/rgPicardHsMetrics.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/rgPicardHsMetrics.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,131 @@
+<tool name="SAM/BAM Hybrid Selection Metrics" id="PicardHsMetrics" version="0.01">
+  <description>for targeted resequencing data</description>
+  <command interpreter="python">
+
+    picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file" --datatype "$input_file.ext"
+    --baitbed "$bait_bed" --targetbed "$target_bed" -n "$out_prefix" --tmpdir "${__new_file_path__}"
+    -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/CalculateHsMetrics.jar"
+
+  </command>
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <inputs>
+    <param format="sam,bam" name="input_file" type="data" label="SAM/BAM dataset to generate statistics for" />
+    <param name="out_prefix" value="Picard HS Metrics" type="text" label="Title for the output file" help="Use this to remind you what the job was for." size="80" />
+    <param name="bait_bed" type="data" format="interval" label="Bait intervals: Sequences for bait in the design" help="In UCSC BED format" size="80" />
+    <param name="target_bed" type="data" format="interval" label="Target intervals: Sequences for targets in the design" help="In UCSC BED format" size="80" />
+    <!--
+
+    Users can be enabled to set Java heap size by uncommenting this option and adding '-x "$maxheap"' to the <command> tag.
+    If commented out the heapsize defaults to the value specified within picard_wrapper.py
+
+    <param name="maxheap" type="select"
+       help="If in doubt, try the default. If it fails with a complaint about java heap size, try increasing it please - larger jobs will require your own hardware."
+     label="Java heap size">
+    <option value="4G" selected = "true">4GB default </option>
+    <option value="8G" >8GB use if 4GB fails</option>
+    <option value="16G">16GB - try this if 8GB fails</option>
+    </param>
+
+    -->
+  </inputs>
+  <outputs>
+    <data format="html" name="html_file" label="${out_prefix}.html" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="out_prefix" value="HSMetrics" />
+      <param name="input_file" value="picard_input_summary_alignment_stats.sam" ftype="sam" />
+      <param name="bait_bed" value="picard_input_bait.bed" />
+      <param name="target_bed" value="picard_input_bait.bed"  />
+      <param name="maxheap" value="8G"  />
+      <output name="html_file" file="picard_output_hs_transposed_summary_alignment_stats.html" ftype="html" lines_diff="212"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**Summary**
+
+Calculates a set of Hybrid Selection specific metrics from an aligned SAM or BAM file.
+
+**Picard documentation**
+
+This is a Galaxy wrapper for CalculateHsMetrics, a part of the external package Picard-tools_.
+
+ .. _Picard-tools: http://www.google.com/search?q=picard+samtools
+
+-----
+
+.. class:: infomark
+
+**Inputs, outputs, and parameters**
+
+Picard documentation says (reformatted for Galaxy):
+
+Calculates a set of Hybrid Selection specific metrics from an aligned SAM or BAM file.
+
+.. csv-table::
+   :header-rows: 1
+
+   "Option", "Description"
+   "BAIT_INTERVALS=File","An interval list file that contains the locations of the baits used. Required."
+   "TARGET_INTERVALS=File","An interval list file that contains the locations of the targets. Required."
+   "INPUT=File","An aligned SAM or BAM file. Required."
+   "OUTPUT=File","The output file to write the metrics to. Required. Cannot be used in conjunction with option(s) METRICS_FILE (M)"
+   "METRICS_FILE=File","Legacy synonym for OUTPUT, should not be used. Required. Cannot be used in conjunction with option(s) OUTPUT (O)"
+   "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false"
+
+HsMetrics
+
+ The set of metrics captured that are specific to a hybrid selection analysis.
+
+Output Column Definitions::
+
+  1. BAIT_SET: The name of the bait set used in the hybrid selection.
+  2. GENOME_SIZE: The number of bases
[...]
target territory / bait territory. 1 == perfectly efficient, 0.5 = half of baited bases are not target.
+  6. TOTAL_READS: The total number of reads in the SAM or BAM file examined.
+  7. PF_READS: The number of reads that pass the vendor's filter.
+  8. PF_UNIQUE_READS: The number of PF reads that are not marked as duplicates.
+  9. PCT_PF_READS: PF reads / total reads. The percent of reads passing filter.
+ 10. PCT_PF_UQ_READS: PF Unique Reads / Total Reads.
+ 11. PF_UQ_READS_ALIGNED: The number of PF unique reads that are aligned with mapping score > 0 to the reference genome.
+ 12. PCT_PF_UQ_READS_ALIGNED: PF Reads Aligned / PF Reads.
+ 13. PF_UQ_BASES_ALIGNED: The number of bases in the PF aligned reads that are mapped to a reference base. Accounts for clipping and gaps.
+ 14. ON_BAIT_BASES: The number of PF aligned bases that mapped to a baited region of the genome.
+ 15. NEAR_BAIT_BASES: The number of PF aligned bases that mapped to within a fixed interval of a baited region, but not on a baited region.
+ 16. OFF_BAIT_BASES: The number of PF aligned bases that mapped neither on nor near a bait.
+ 17. ON_TARGET_BASES: The number of PF aligned bases that mapped to a targeted region of the genome.
+ 18. PCT_SELECTED_BASES: On+Near Bait Bases / PF Bases Aligned.
+ 19. PCT_OFF_BAIT: The percentage of aligned PF bases that mapped neither on nor near a bait.
+ 20. ON_BAIT_VS_SELECTED: The percentage of on+near bait bases that are on as opposed to near.
+ 21. MEAN_BAIT_COVERAGE: The mean coverage of all baits in the experiment.
+ 22. MEAN_TARGET_COVERAGE: The mean coverage of targets that received at least coverage depth = 2 at one base.
+ 23. PCT_USABLE_BASES_ON_BAIT: The number of aligned, de-duped, on-bait bases out of the PF bases available.
+ 24. PCT_USABLE_BASES_ON_TARGET: The number of aligned, de-duped, on-target bases out of the PF bases available.
+ 25. FOLD_ENRICHMENT: The fold by which the baited region has been amplified above genomic background.
+ 26. ZERO_CVG_TARGETS_PCT: The number of targets that did not reach coverage=2 over any base.
+ 27. FOLD_80_BASE_PENALTY: The fold over-coverage necessary to raise 80% of bases in "non-zero-cvg" targets to the mean coverage level in those targets.
+ 28. PCT_TARGET_BASES_2X: The percentage of ALL target bases achieving 2X or greater coverage.
+ 29. PCT_TARGET_BASES_10X: The percentage of ALL target bases achieving 10X or greater coverage.
+ 30. PCT_TARGET_BASES_20X: The percentage of ALL target bases achieving 20X or greater coverage.
+ 31. PCT_TARGET_BASES_30X: The percentage of ALL target bases achieving 30X or greater coverage.
+ 32. HS_LIBRARY_SIZE: The estimated number of unique molecules in the selected part of the library.
+ 33. HS_PENALTY_10X: The "hybrid selection penalty" incurred to get 80% of target bases to 10X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 10X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 10 * HS_PENALTY_10X.
+ 34. HS_PENALTY_20X: The "hybrid selection penalty" incurred to get 80% of target bases to 20X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 20X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 20 * HS_PENALTY_20X.
+ 35. HS_PENALTY_30X: The "hybrid selection penalty" incurred to get 80% of target bases to 30X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 30X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 30 * HS_PENALTY_30X.
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+
+  </help>
+</tool>
b
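
The HS_PENALTY columns above double as a back-of-the-envelope sequencing budget. A minimal sketch of that arithmetic in Python, with invented numbers (nothing below is produced by picard_wrapper.py)::

    # Illustrative only; values are made up.
    target_size = 10 * 10**6        # 10 megabases of target, in bases (10^7)
    coverage = 10                   # desired coverage depth (10X)
    hs_penalty_10x = 3.5            # HS_PENALTY_10X as reported by the tool

    # Sequence until PF_ALIGNED_BASES reaches target * coverage * penalty.
    print("PF_ALIGNED_BASES needed: %.3g" % (target_size * coverage * hs_penalty_10x))
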
diff -r 000000000000 -r 9071e359b9a3 tools/picard/rgPicardInsertSize.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/rgPicardInsertSize.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,89 @@
+<tool name="Insertion size metrics" id="PicardInsertSize" version="0.3.0">
+  <description>for PAIRED data</description>
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <command interpreter="python">
+   picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --taillimit "$tailLimit"
+   --histwidth "$histWidth" --minpct "$minPct"
+   -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
+  </command>
+  <inputs>
+    <param format="bam,sam" name="input_file" type="data" label="SAM/BAM dataset to generate statistics for"
+      help="If empty, upload or import a SAM/BAM dataset."/>
+    <param name="out_prefix" value="Insertion size metrics" type="text"
+      label="Title for the output file" help="Use this remind you what the job was for" size="120" />
+    <param name="tailLimit" value="10000" type="integer"
+      label="Tail limit" size="5" 
+      help="When calculating mean and stdev stop when the bins in the tail of the distribution contain fewer than mode/TAIL_LIMIT items" />
+     <param name="histWidth" value="0" type="integer"
+      label="Histogram width" size="5" 
+      help="Explicitly sets the histogram width, overriding the TAIL_LIMIT option - leave 0 to ignore" />
+     <param name="minPct" value="0.01" type="float"
+      label="Minimum percentage" size="5" 
+      help="Discard any data categories (out of FR, TANDEM, RF) that have fewer than this percentage of overall reads" />
+  </inputs>
+  <outputs>
+    <data format="html" name="html_file" label="InsertSize_${out_prefix}.html"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_file" value="picard_input_tiny.sam" />
+      <param name="out_prefix" value="Insertion size metrics" />
+      <param name="tailLimit" value="10000" />
+      <param name="histWidth" value="0" />
+      <param name="minPct" value="0.01" />
+      <output name="html_file" file="picard_output_insertsize_tinysam.html" ftype="html" compare="contains" lines_diff="40" />
+    </test>
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**Purpose**
+
+Reads a SAM or BAM file and describes the distribution 
+of insert size (excluding duplicates) with metrics and a histogram plot.
+
+**Picard documentation**
+
+This is a Galaxy wrapper for CollectInsertSizeMetrics, a part of the external package Picard-tools_.
+
+ .. _Picard-tools: http://www.google.com/search?q=picard+samtools
+
+.. class:: warningmark
+
+**Useful for paired data only**
+
+This tool works for paired data only and can be expected to fail for single-end data.
+
+-----
+
+.. class:: infomark
+
+**Inputs, outputs, and parameters**
+
+Picard documentation says (reformatted for Galaxy):
+
+.. csv-table::
+   :header-rows: 1
+
+    Option,Description
+    "INPUT=File","SAM or BAM file Required."
+    "OUTPUT=File","File to write insert size metrics to Required."
+    "HISTOGRAM_FILE=File","File to write insert size histogram chart to Required."
+    "TAIL_LIMIT=Integer","When calculating mean and stdev stop when the bins in the tail of the distribution contain fewer than mode/TAIL_LIMIT items. This also limits how much data goes into each data category of the histogram."
+    "HISTOGRAM_WIDTH=Integer","Explicitly sets the histogram width, overriding the TAIL_LIMIT option. Also, when calculating mean and stdev, only bins LE HISTOGRAM_WIDTH will be included. "
+    "MINIMUM_PCT=Float","When generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have fewer than this percentage of overall reads. (Range: 0 to 1) Default value: 0.01."
+    "STOP_AFTER=Integer","Stop after processing N reads, mainly for debugging. Default value: 0."
+    "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false."
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+  </help>
+</tool>
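
The TAIL_LIMIT option above compresses a trimming rule that is easy to misread: walk the insert-size histogram outward and stop once tail bins drop below mode/TAIL_LIMIT. A rough Python sketch of the rule as stated, assuming a dict mapping insert size to read count; an illustration only, not Picard's actual implementation::

    import math

    def trimmed_mean_stdev(hist, tail_limit=10000):
        """hist maps insert size -> read count; returns (mean, stdev)."""
        sizes = sorted(hist)
        mode_size = max(sizes, key=lambda s: hist[s])
        cutoff = hist[mode_size] / float(tail_limit)
        kept = []
        for s in sizes:
            # Once past the mode, stop at the first bin below mode/TAIL_LIMIT.
            if s > mode_size and hist[s] < cutoff:
                break
            kept.append(s)
        n = float(sum(hist[s] for s in kept))
        mean = sum(s * hist[s] for s in kept) / n
        var = sum(hist[s] * (s - mean) ** 2 for s in kept) / n
        return mean, math.sqrt(var)
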
diff -r 000000000000 -r 9071e359b9a3 tools/picard/rgPicardLibComplexity.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/rgPicardLibComplexity.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,122 @@
+<tool name="Estimate Library Complexity" id="rgEstLibComp" version="0.01">
+  <command interpreter="python">
+   picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --minid "$minIDbases"
+   --maxdiff "$maxDiff" --minmeanq "$minMeanQ" --readregex "$readRegex" --optdupdist "$optDupeDist"
+   -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/EstimateLibraryComplexity.jar" -d "$html_file.files_path" -t "$html_file"
+  </command>
+  <inputs>
+    <param format="bam,sam" name="input_file" type="data" label="SAM/BAM dataset"
+      help="If empty, upload or import a SAM/BAM dataset."/>
+    <param name="out_prefix" value="Library Complexity" type="text"
+      label="Title for the output file" help="Use this remind you what the job was for." size="80" />
+    <param name="minIDbases" value="5" type="integer"  label="Minimum identical bases at starts of reads for grouping" size="5" 
+      help="Total_reads / 4^max_id_bases reads will be compared at a time. Lower numbers = more accurate results and exponentially more time/memory." />
+     <param name="maxDiff" value="0.03" type="float"
+      label="Maximum difference rate for identical reads" size="5" 
+      help="The maximum rate of differences between two reads to call them identical" />
+     <param name="minMeanQ" value="20" type="integer"
+      label="Minimum percentage" size="5" 
+      help="The minimum mean quality of bases in a read pair. Lower average quality reads filtered out from all calculations" />
+     <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" type="text" size="120"
+      label="Regular expression that can be used to parse read names in the incoming SAM file" 
+      help="Names are parsed to extract: tile/region, x coordinate and y coordinate, to estimate optical duplication rate" >
+      <sanitizer>
+        <valid initial="string.printable">
+         <remove value="&apos;"/>
+        </valid>
+        <mapping initial="none">
+          <add source="&apos;" target="__sq__"/>
+        </mapping>
+      </sanitizer>
+     </param>
+     <param name="optDupeDist" value="100" type="text"
+      label="The maximum offset between two duplicte clusters in order to consider them optical duplicates." size="5" 
+      help="e.g. 5-10 pixels. Later Illumina software versions multiply pixel values by 10, in which case 50-100" />
+
+  </inputs>
+  <outputs>
+    <data format="html" name="html_file" label="${out_prefix}_lib_complexity.html"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_file" value="picard_input_tiny.sam" />
+      <param name="out_prefix" value="Library Complexity" />
+      <param name="minIDbases" value="5" />
+      <param name="maxDiff" value="0.03" />
+      <param name="minMeanQ" value="20" />
+      <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" />
+      <param name="optDupeDist" value="100" />      
+      <output name="html_file" file="picard_output_estlibcomplexity_tinysam.html" ftype="html" lines_diff="30" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**Purpose**
+
+Attempts to estimate library complexity from sequence alone. 
+Does so by sorting all reads by the first N bases (5 by default) of each read and then 
+comparing reads with the first N bases identical to each other for duplicates. Reads are considered to be 
+duplicates if they match each other with no gaps and an overall mismatch rate less than or equal to MAX_DIFF_RATE (0.03 by default).
+
+Reads of poor quality are filtered out so as to provide a more accurate estimate. 
+The filtering removes reads with any no-calls in the first N bases or with a mean base quality lower than 
+MIN_MEAN_QUALITY across either the first or second read.
+
+The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes these in the 
+calculation of library size. Also, since there is no alignment to screen out technical reads one 
+further filter is applied on the data. After examining all reads a histogram is built of 
+[#reads in duplicate set -> #of duplicate sets]; all bins that contain exactly one duplicate set are
+then removed from the histogram as outliers before library size is estimated.
+
+**Picard documentation**
+
+This is a Galaxy wrapper for EstimateLibraryComplexity, a part of the external package Picard-tools_.
+
+ .. _Picard-tools: http://www.google.com/search?q=picard+samtools
+
+-----
+
+.. class:: infomark
+
+**Inputs, outputs, and parameters**
+
+Picard documentation says (reformatted for Galaxy):
+
+.. csv-table::
+   :header-rows: 1
+
+    Option,Description
+    "INPUT=File","One or more files to combine and estimate library complexity from. Reads can be mapped or unmapped. This option may be specified 0 or more times."
+    "OUTPUT=File","Output file to write per-library metrics to. Required."
+    "MIN_IDENTICAL_BASES=Integer","The minimum number of bases at the starts of reads that must be identical for reads to be grouped together for duplicate detection. In effect total_reads / 4^max_id_bases reads will be compared at a time, so lower numbers will produce more accurate results but consume exponentially more memory and CPU. Default value: 5."
+    "MAX_DIFF_RATE=Double","The maximum rate of differences between two reads to call them identical. Default value: 0.03. "
+    "MIN_MEAN_QUALITY=Integer","The minimum mean quality of the bases in a read pair for the read to be analyzed. Reads with lower average quality are filtered out and not considered in any calculations. Default value: 20."
+    "READ_NAME_REGEX=String","Regular expression that can be used to parse read names in the incoming SAM file. Read names are parsed to extract three variables: tile/region, x coordinate and y coordinate. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order. Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*. This option can be set to 'null' to clear the default value."
+    "OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer","The maximum offset between two duplicte clusters in order to consider them optical duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels) unless using later versions of the Illumina pipeline that multiply pixel values by 10, in which case 50-100 is more normal. Default value: 100"
+    "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false. This option can be set to 'null' to clear the default value. "
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+.. class:: infomark
+
+**Note on the Regular Expression**
+
+(from the Picard docs)
+This tool requires a valid regular expression to parse out the read names in the incoming SAM or BAM file. 
+These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. 
+The regular expression should contain three capture groups for the three variables, in order. 
+Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*.
+
+
+  </help>
+</tool>
+
+
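
The grouping-and-comparison procedure described in the help text lends itself to a compact sketch. A toy version, assuming reads are plain strings; the real EstimateLibraryComplexity additionally handles read pairs, quality filtering, and optical duplicates::

    from collections import defaultdict

    def mismatch_rate(a, b):
        # Ungapped, position-by-position comparison over the shared length.
        n = min(len(a), len(b))
        return sum(1 for x, y in zip(a[:n], b[:n]) if x != y) / float(n)

    def count_duplicate_pairs(reads, min_identical_bases=5, max_diff_rate=0.03):
        # Group reads by their first N bases, then compare within groups only.
        groups = defaultdict(list)
        for read in reads:
            groups[read[:min_identical_bases]].append(read)
        dupes = 0
        for members in groups.values():
            for i in range(len(members)):
                for j in range(i + 1, len(members)):
                    if mismatch_rate(members[i], members[j]) <= max_diff_rate:
                        dupes += 1
        return dupes
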
diff -r 000000000000 -r 9071e359b9a3 tools/picard/rgPicardMarkDups.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/picard/rgPicardMarkDups.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,131 @@
+<tool name="Mark Duplicate reads" id="rgPicardMarkDups" version="0.01">
+  <command interpreter="python">
+   picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" -o "$out_file"
+   --remdups "$remDups" --assumesorted "$assumeSorted" --readregex "$readRegex" --optdupdist "$optDupeDist"
+   -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/MarkDuplicates.jar" -d "$html_file.files_path" -t "$html_file" -e "$input_file.ext"
+  </command>
+  <requirements><requirement type="package">picard</requirement></requirements>
+  <inputs>
+    <param format="bam,sam" name="input_file" type="data" label="SAM/BAM dataset to mark duplicates in"
+      help="If empty, upload or import a SAM/BAM dataset."/>
+    <param name="out_prefix" value="Dupes Marked" type="text"
+      label="Title for the output file" help="Use this remind you what the job was for" size="80" />
+    <param name="remDups" value="false" type="boolean"  label="Remove duplicates from output file"
+      truevalue="true" falsevalue="false" checked="yes" 
+      help="If true do not write duplicates to the output file instead of writing them with appropriate flags set." />
+    <param name="assumeSorted" value="true" type="boolean"  label="Assume reads are already ordered"
+      truevalue="true" falsevalue="false" checked="yes" 
+      help="If true assume input data are already sorted (most Galaxy SAM/BAM should be)." />
+     <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" type="text" size="80"
+      label="Regular expression that can be used to parse read names in the incoming SAM file" 
+      help="Names are parsed to extract: tile/region, x coordinate and y coordinate, to estimate optical duplication rate" >
+      <sanitizer>
+        <valid initial="string.printable">
+         <remove value="&apos;"/>
+        </valid>
+        <mapping initial="none">
+          <add source="&apos;" target="__sq__"/>
+        </mapping>
+      </sanitizer>
+     </param>
+     <param name="optDupeDist" value="100" type="integer"
+      label="The maximum offset between two duplicate clusters in order to consider them optical duplicates." size="5" 
+      help="e.g. 5-10 pixels. Later Illumina software versions multiply pixel values by 10, in which case 50-100." >
+      <validator type="in_range" message="Minimum optical dupe distance must be positive" min="0" />    
+     </param>
+
+  </inputs>
+  <outputs>
+    <data format="bam" name="out_file" label="MarkDups_${out_prefix}.bam"/>
+    <data format="html" name="html_file" label="MarkDups_${out_prefix}.html"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_file" value="picard_input_tiny_coord.bam" ftype="bam" />
+      <param name="out_prefix" value="Dupes Marked" />
+      <param name="remDups" value="false" />
+      <param name="assumeSorted" value="true" />
+      <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" />
+      <param name="optDupeDist" value="100" />      
+      <output name="out_file" file="picard_output_markdups_sortedpairsam.bam" ftype="bam" compare="diff" />
+      <output name="html_file" file="picard_output_markdups_sortedpairsam.html" ftype="html" lines_diff="75" />
+    </test>
+    <test>
+      <param name="input_file" value="picard_input_tiny_coord.sam" ftype="sam" />
+      <param name="out_prefix" value="Dupes Marked" />
+      <param name="remDups" value="true" />
+      <param name="assumeSorted" value="true" />
+      <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" />
+      <param name="optDupeDist" value="100" />
+      <output name="out_file" file="picard_output_markdups_remdupes.bam" ftype="bam" compare="diff" />
+      <output name="html_file" file="picard_output_markdups_sortedpairsam.html" ftype="html" lines_diff="75" />
+    </test>
+  </tests>
+  
+  <help>
+
+.. class:: infomark
+
+**Purpose**
+
+Marks all duplicate reads in a provided SAM or BAM file and either removes them or flags them.
+
+**Picard documentation**
+
+This is a Galaxy wrapper for MarkDuplicates, a part of the external package Picard-tools_.
+
+ .. _Picard-tools: http://www.google.com/search?q=picard+samtools
+
+-----
+
+.. class:: infomark
+
+**Inputs, outputs, and parameters**
+
+Picard documentation says (reformatted for Galaxy):
+
+.. csv-table:: Mark Duplicates docs
+   :header-rows: 1
+
+    Option,Description
+    "INPUT=File","The input SAM or BAM file to analyze. Must be coordinate sorted. Required."
+    "OUTPUT=File","The output file to right marked records to Required."
+    "METRICS_FILE=File","File to write duplication metrics to Required."
+    "REMOVE_DUPLICATES=Boolean","If true do not write duplicates to the output file instead of writing them with appropriate flags set. Default value: false."
+    "ASSUME_SORTED=Boolean","If true, assume that the input file is coordinate sorted, even if the header says otherwise. Default value: false."
+    "MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=Integer","This option is obsolete. ReadEnds will always be spilled to disk. Default value: 50000."
+    "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=Integer","Maximum number of file handles to keep open when spilling read ends to disk."
+    "READ_NAME_REGEX=String","Regular expression that can be used to parse read names in the incoming SAM file. Read names are parsed to extract three variables: tile/region, x coordinate and y coordinate. "
+    "OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer","The maximum offset between two duplicte clusters in order to consider them optical duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels) unless using later versions of the Illumina pipeline that multiply pixel values by 10, in which case 50-100 is more normal. Default value: 100"
+
+.. class:: warningmark
+
+**Warning on SAM/BAM quality**
+
+Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
+flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
+to be the only way to deal with SAM/BAM that cannot be parsed.
+
+.. class:: infomark
+
+**Note on the Regular Expression**
+
+(from the Picard docs)
+This tool requires a valid regular expression to parse out the read names in the incoming SAM or BAM file. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order. Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*.
+
+Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. All records are then written to the output file, with duplicate records flagged, unless the remove duplicates option is selected. In some cases you may want to remove duplicates outright, but only do so if you really understand the consequences.
+
+  </help>
+</tool>
+
+
+
+
+
+
+
+
+
+
+
+
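
To make the interplay of READ_NAME_REGEX and OPTICAL_DUPLICATE_PIXEL_DISTANCE concrete, a small hypothetical helper (read names are invented; Picard also requires the reads to be sequence duplicates of each other before the optical test applies)::

    import re

    READ_NAME_REGEX = r"[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*"

    def parse_location(read_name):
        # Extract (tile/region, x, y) from a read name, or None on no match.
        m = re.match(READ_NAME_REGEX, read_name)
        if m is None:
            return None
        tile, x, y = (int(g) for g in m.groups())
        return tile, x, y

    def is_optical_pair(name_a, name_b, max_offset=100):
        a, b = parse_location(name_a), parse_location(name_b)
        if a is None or b is None or a[0] != b[0]:  # must share a tile/region
            return False
        return abs(a[1] - b[1]) <= max_offset and abs(a[2] - b[2]) <= max_offset

    # is_optical_pair("HWI1:1:120:1001:2000", "HWI1:1:120:1050:1990")  -> True
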
diff -r 000000000000 -r 9071e359b9a3 tools/plotting/bar_chart.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/plotting/bar_chart.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+
+
+"""
+histogram_gnuplot.py <datafile> <xtic_column> <column_list> <title> <ylabel> <yrange_min> <yrange_max> <graph_file> <img_size>
+a generic histogram builder based on a gnuplot backend
+
+   data_file    - tab delimited file with data
+   xtic_column  - column containing labels for x ticks [integer, 0 means no ticks]
+   column_list  - comma separated list of columns to plot
+   title        - title for the entire histogram
+   ylabel       - y axis label
+   yrange_min   - minimal value at the y axis (integer)
+   yrange_max   - maximal value at the y axis (integer)
+                  to set yrange to autoscaling assign 0 to yrange_min and yrange_max
+   graph_file   - file to write histogram image to
+   img_size     - as X,Y pair in pixels (e.g., 800,600 or 600,800 etc.)
+
+
+   This tool requires gnuplot and gnuplot.py
+
+anton nekrutenko | anton@bx.psu.edu
+
+"""
+
+import Gnuplot, Gnuplot.funcutils
+import sys, string, tempfile, os
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def main(tmpFileName):
+    skipped_lines_count = 0
+    skipped_lines_index = []
+    gf = open(tmpFileName, 'w')
+    
+    
+    try:
+        in_file   = open( sys.argv[1], 'r' )
+        xtic      = int( sys.argv[2] )
+        col_list  = string.split( sys.argv[3],"," )
+        title     = 'set title "' + sys.argv[4] + '"'
+        ylabel    = 'set ylabel "' + sys.argv[5] + '"'
+        ymin      = sys.argv[6]
+        ymax      = sys.argv[7]
+        img_file  = sys.argv[8]
+        img_size  = sys.argv[9]
+    except:
+        stop_err("Check arguments\n")
+        
+    try:
+        int( col_list[0] )
+    except:
+        stop_err('You forgot to set columns for plotting\n')    
+    
+       
+    for i, line in enumerate( in_file ):
+        valid = True
+        line = line.rstrip('\r\n')
+        if line and not line.startswith( '#' ):
+            row = []
+            try:
+                fields = line.split( '\t' )
+                for col in col_list:
+                    row.append( str( float( fields[int( col )-1] ) ) )
+                    
+            except:
+                valid = False
+                skipped_lines_count += 1
+                skipped_lines_index.append(i)
+                    
+        else:
+            valid = False
+            skipped_lines_count += 1
+            skipped_lines_index.append(i)
+            
+        if valid and xtic > 0:
+            row.append( fields[xtic-1] )
+        elif valid and xtic == 0:
+            row.append( str( i ) )    
+            
+        if valid:
+            gf.write( '\t'.join( row ) )
+            gf.write( '\n' )  
+             
+    if skipped_lines_count < i:
+        
+        #prepare 'using' clause of plot statement
+        
+        g_plot_command = ' '
+        
+        #set the first column
+        if xtic > 0:
+            g_plot_command = "'%s' using 1:xticlabels(%s) ti 'Column %s', " % ( tmpFileName, str( len( row ) ), col_list[0] )
+        else:
+            g_plot_command = "'%s' using 1 ti 'Column %s', " % ( tmpFileName, col_list[0] )
+        
+        #set subsequent columns
+        
+        for i in range(1,len(col_list)):
+            g_plot_command += "'%s' using %s t 'Column %s', " % ( tmpFileName, str( i+1 ), col_list[i] )
+        
+        g_plot_command = g_plot_command.rstrip( ', ' )
+        
+        yrange = 'set yrange [' + ymin + ":" + ymax + ']'
+                    
+        try:
+            g = Gnuplot.Gnuplot()
+            g('reset')
+            g('set boxwidth 0.9 absolute')
+            g('set style fill  solid 1.00 border -1')
+            g('set style histogram clustered gap 5 title  offset character 0, 0, 0')
+            g('set xtics border in scale 1,0.5 nomirror rotate by 90 offset character 0, 0, 0')
+            g('set key invert reverse Left outside')
+            if xtic == 0:  g('unset xtics')
+            g(title) 
+            g(ylabel)
+            g_term = 'set terminal png tiny size ' + img_size
+            g(g_term)
+            g_out = 'set output "' + img_file + '"'
+            if ymin != ymax:
+                g(yrange)
+            g(g_out)
+            g('set style data histograms')
+            g.plot(g_plot_command)
+        except:
+            stop_err("Gnuplot error: Data cannot be plotted")
+    else:
+        sys.stderr.write('Column(s) %s of your dataset do not contain valid numeric data\n' % sys.argv[3])
+        
+    if skipped_lines_count > 0:
+        sys.stdout.write('\nWARNING. Your dataset contains %d invalid lines starting with line #%d. These lines were skipped while building the graph.\n' % ( skipped_lines_count, skipped_lines_index[0]+1 ) )
+    
+
+if __name__ == "__main__":
+    # The tempfile initialization is here because while inside the main() it seems to create a condition
+    # when the file is removed before gnuplot has a chance of accessing it
+    gp_data_file = tempfile.NamedTemporaryFile('w')
+    Gnuplot.gp.GnuplotOpts.default_term = 'png'
+    main(gp_data_file.name)
+    
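
For reference, a hypothetical invocation matching the positional-argument contract in the docstring above (file names are placeholders)::

    import subprocess

    subprocess.call([
        "python", "bar_chart.py",
        "data.tabular",   # tab-delimited input
        "1",              # xtic column (0 = no tick labels)
        "2,3",            # comma-separated columns to plot
        "Gene counts",    # plot title
        "count",          # y-axis label
        "0", "0",         # yrange min/max (0,0 = autoscale)
        "chart.png",      # output image
        "800,600",        # image size in pixels
    ])
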
diff -r 000000000000 -r 9071e359b9a3 tools/plotting/bar_chart.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/plotting/bar_chart.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,60 @@
+<tool id="barchart_gnuplot" name="Bar chart">
+  <description>for multiple columns</description>
+  <command interpreter="python">
+    #if $xtic.userSpecified == "Yes" #bar_chart.py $input $xtic.xticColumn $colList "$title" "$ylabel" $ymin $ymax $out_file1 "$pdf_size"
+    #else                            #bar_chart.py $input 0 $colList "$title" "$ylabel" $ymin $ymax $out_file1 "$pdf_size"
+    #end if
+  </command>
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/>
+    <conditional name="xtic">
+        <param name="userSpecified" type="select" label="Use X Tick labels?" help="see example below">
+            <option value="Yes">Yes</option>
+            <option value="No">No</option>
+        </param>
+        <when value="Yes">
+            <param name="xticColumn" type="data_column" data_ref="input" numerical="False" label="Use this column for X Tick labels" />
+        </when>
+        <when value="No">
+        </when>
+    </conditional>           
+    <param name="colList" label="Numerical columns" type="data_column" numerical="True" multiple="True" data_ref="input" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" />
+    <param name="title" type="text" size="30" value="Bar Chart" label="Plot title"/>
+    <param name="ylabel" type="text" size="30" value="V1" label="Label for Y axis"/>
+    <param name="ymin" type="integer" size="4" value="0" label="Minimal value on Y axis" help="set to 0 for autoscaling"/>
+    <param name="ymax" type="integer" size="4" value="0" label="Maximal value on Y axis" help="set to 0 for autoscaling"/>
+    <param name="pdf_size" type="select" label="Choose chart size (pixels)">
+        <option value="800,600">Normal: 800 by 600</option>
+        <option value="640,480">Small: 640 by 480</option>
+        <option value="1480,800">Large: 1480 by 800</option>
+        <option value="600,800">Normal Flipped: 600 by 800</option>
+        <option value="480,640">Small Flipped: 480 by 640</option>
+        <option value="800,1480">Large Flipped: 800 by 1480</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="png" name="out_file1" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">Gnuplot</requirement>
+    <requirement type="python-module">Numeric</requirement>
+  </requirements>
+  <help>
+
+**What it does**
+
+This tool builds a bar chart on one or more columns. Suppose you have a dataset like this one::
+
+  Gene1 10 15
+  Gene2 20 14
+  Gene3 67 45
+  Gene4 55 12
+
+Graphing columns 2 and 3 while using column 1 for X Tick Labels will produce the following plot:
+
+.. image:: ./static/images/bar_chart.png 
+   :height: 324 
+   :width: 540 
+    
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/plotting/boxplot.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/plotting/boxplot.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,102 @@
+<tool id="qual_stats_boxplot" name="Boxplot" version="1.0.0">
+  <description>of quality statistics</description>
+  <command>gnuplot &lt; '$gnuplot_commands' 2&gt;&amp;1 || echo "Error running gnuplot." >&amp;2</command>
+  <requirements>
+    <requirement type="binary" version="gnuplot 4.2 patchlevel 2">gnuplot</requirement>
+  </requirements>
+  <inputs>
+    <param name="input_file" type="data" format="tabular" label="Quality Statistics File"/>
+    <param name="title" type="text" value="Box plot in Galaxy" label="Title for plot" size="50"/>
+    <param name="graph_size" type="text" value="2048,768" label="Dimensions of Graph"/>
+    <param name="xlabel" type="text" value="X Axis Label" label="X axis label" size="50"/>
+    <param name="ylabel" type="text" value="Score Value" label="Y axis label" size="50"/>
+    <param name="xcol" type="data_column" data_ref="input_file" label="Column for X axis position" default_value="1" help="A unique number; c1 if plotting output of FASTQ summary"/>
+    <param name="q1col" type="data_column" data_ref="input_file" label="Column for Q1" default_value="7" help="c7 if plotting output of FASTQ summary"/>
+    <param name="medcol" type="data_column" data_ref="input_file" label="Column for Median" default_value="8" help="c8 if plotting output of FASTQ summary"/>
+    <param name="q3col" type="data_column" data_ref="input_file" label="Column for Q3" default_value="9" help="c9 if plotting output of FASTQ summary"/>
+    <param name="lwcol" type="data_column" data_ref="input_file" label="Column for left whisker" default_value="11" help="c11 if plotting output of FASTQ summary"/>
+    <param name="rwcol" type="data_column" data_ref="input_file" label="Column for right whisker" default_value="12" help="c12 if plotting output of FASTQ summary"/>
+    <conditional name="use_outliers">
+      <param name="use_outliers_type" type="select" label="Plot Outliers">
+        <option value="use_outliers" selected="true">Plot Outliers</option>
+        <option value="dont_use_outliers">Don't Plot Outliers</option>
+      </param>
+      <when value="use_outliers">
+        <param name="outliercol" type="data_column" data_ref="input_file" label="Column for Outliers" default_value="13" help="c13 if plotting output of FASTQ summary"/>
+      </when>
+      <when value="dont_use_outliers">
+      </when>
+    </conditional>
+  </inputs>
+  <configfiles>
+    <configfile name="gnuplot_commands">
+set output '$output_file'
+set term png size ${graph_size}
+set boxwidth 0.8 
+set key right tmargin
+set xlabel "${xlabel}"
+set ylabel "${ylabel}"
+set title  "${title}"
+set xtics 1 
+set ytics 1
+set grid ytics
+set offsets 1, 1, 1, 1
+plot '${input_file}' using ${xcol}:${q1col}:${lwcol}:${rwcol}:${q3col} with candlesticks lt 1  lw 1 title 'Quartiles' whiskerbars, \
+      ''         using ${xcol}:${medcol}:${medcol}:${medcol}:${medcol} with candlesticks lt -1 lw 2 title 'Medians'\
+#if str( $use_outliers['use_outliers_type'] ) == 'use_outliers':
+,      "&lt; python -c \"for xval, yvals in [ ( fields[${xcol} - 1], fields[${use_outliers['outliercol']} - 1].split( ',' ) ) for fields in [ line.rstrip( '\\n\\r' ).split( '\\t' ) for line in open( '${input_file}' ) if not line.startswith( '#' ) ] if len( fields ) &gt; max( ${xcol} - 1, ${use_outliers['outliercol']} - 1 ) ]: print '\\n'.join( [ '%s\\t%s' % ( xval, yval ) for yval in yvals if yval ] )\"" using 1:2 with points pt 29 title 'Outliers'
+#end if
+    </configfile>
+  </configfiles>
+  <outputs>
+    <data name="output_file" format="png" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_file" value="fastq_stats_1_out.tabular" ftype="tabular" />
+      <param name="title" value="Boxplot of Summary Statistics for Sanger Reads" />
+      <param name="graph_size" value="2048,768" />
+      <param name="xlabel" value="Read Column" />
+      <param name="ylabel" value="Quality Score Value" />
+      <param name="xcol" value="1" />
+      <param name="q1col" value="7" />
+      <param name="medcol" value="8" />
+      <param name="q3col" value="9" />
+      <param name="lwcol" value="11" />
+      <param name="rwcol" value="12" />
+      <param name="use_outliers_type" value="use_outliers" />
+      <param name="outliercol" value="13" />
+      <output name="output_file" file="boxplot_summary_statistics_out.png" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+Creates a boxplot graph. Its main purpose is to display a distribution of quality scores produced by the *NGS: QC and manipulation -> FASTQ Summary Statistics* tool.
+
+.. class:: warningmark
+
+**TIP:** If you want to display a distribution of quality scores produced by *NGS: QC and manipulation -> FASTQ Summary Statistics* and the column assignments within the tool's interface are not automatically set (they will all read "c1" in that case), set columns manually to the following values::
+
+  Column for X axis           c1
+  Column for Q1               c7
+  Column for Median           c8
+  Column for Q3               c9
+  Column for left whisker     c11
+  Column for right whisker    c12
+  Column for Outliers         c13
+
+-----
+
+**Output Example**
+
+* Black horizontal lines are medians
+* Rectangular red boxes show the Inter-quartile Range (IQR) (top value is Q3, bottom value is Q1)
+* Whiskers show outliers at max. 1.5*IQR
+
+.. image:: ./static/images/solid_qual.png
+
+
+  </help>
+</tool>
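
The inline ``python -c`` command in the configfile above is hard to read. An equivalent, expanded sketch of what it does (column numbers are 1-based, as in the tool form)::

    def outlier_points(path, xcol, outliercol):
        # Expand the comma-separated outlier column into one "x<TAB>y"
        # line per outlier, the format gnuplot reads from the pipe.
        for line in open(path):
            if line.startswith('#'):
                continue
            fields = line.rstrip('\n\r').split('\t')
            if len(fields) <= max(xcol - 1, outliercol - 1):
                continue
            xval = fields[xcol - 1]
            for yval in fields[outliercol - 1].split(','):
                if yval:
                    print('%s\t%s' % (xval, yval))
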
diff -r 000000000000 -r 9071e359b9a3 tools/plotting/histogram.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/plotting/histogram.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+#Greg Von Kuster
+
+import sys
+from rpy import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def main():
+
+    # Handle input params
+    in_fname = sys.argv[1]
+    out_fname = sys.argv[2] 
+    try:
+        column = int( sys.argv[3] ) - 1
+    except:
+        stop_err( "Column not specified, your query does not contain a column of numerical data." )
+    title = sys.argv[4]
+    xlab = sys.argv[5]
+    breaks = int( sys.argv[6] )
+    if breaks == 0:
+        breaks = "Sturges"
+    if sys.argv[7] == "true":
+        density = True
+    else: density = False
+    if len( sys.argv ) >= 9 and sys.argv[8] == "true":
+        frequency = True
+    else: frequency = False
+
+    matrix = []
+    skipped_lines = 0
+    first_invalid_line = 0
+    invalid_value = ''
+    i = 0
+    for i, line in enumerate( file( in_fname ) ):
+        valid = True
+        line = line.rstrip('\r\n')
+        # Skip comments
+        if line and not line.startswith( '#' ): 
+            # Extract values and convert to floats
+            row = []
+            try:
+                fields = line.split( "\t" )
+                val = fields[column]
+            except:
+                valid = False
+                skipped_lines += 1
+                if not first_invalid_line:
+                    first_invalid_line = i+1
+            else:
+                try:
+                    # "NA" values become NaN; anything else must parse as a float.
+                    if val.lower() == "na":
+                        row.append( float( "nan" ) )
+                    else:
+                        row.append( float( val ) )
+                except ValueError:
+                    valid = False
+                    skipped_lines += 1
+                    if not first_invalid_line:
+                        first_invalid_line = i+1
+                        invalid_value = fields[column]
+        else:
+            valid = False
+            skipped_lines += 1
+            if not first_invalid_line:
+                first_invalid_line = i+1
+
+        if valid:
+            matrix += row
+
+    if skipped_lines < i:
+        try:
+            a = r.array( matrix )
+            r.pdf( out_fname, 8, 8 )
+            histogram = r.hist( a, probability=not frequency, main=title, xlab=xlab, breaks=breaks )
+            if density:
+                density = r.density( a )
+                if frequency:
+                    scale_factor = len( matrix ) * ( histogram['mids'][1] - histogram['mids'][0] ) #uniform bandwidth taken from first 2 midpoints
+                    density[ 'y' ] = map( lambda x: x * scale_factor, density[ 'y' ] )
+                r.lines( density )
+            r.dev_off()
+        except Exception, exc:
+            stop_err( "%s" %str( exc ) )
+    else:
+        if i == 0:
+            stop_err("Input dataset is empty.")
+        else:
+            stop_err( "All values in column %s are non-numeric." %sys.argv[3] )
+
+    print "Histogram of column %s. " %sys.argv[3]
+    if skipped_lines > 0:
+        print "Skipped %d invalid lines starting with line #%d, '%s'." % ( skipped_lines, first_invalid_line, invalid_value )
+
+    r.quit( save="no" )
+    
+if __name__ == "__main__":
+    main()
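
The frequency-mode scale factor above works because a density curve integrates to 1 while a frequency histogram's bars sum to the number of observations: multiplying the density by n times the bin width puts both on the same vertical scale. A toy check with invented numbers::

    n = 1000                 # observations in the histogram
    binwidth = 0.5           # distance between consecutive bin midpoints
    density_at_x = 0.4       # hypothetical density value at some x

    # Height the density curve should have on the counts axis:
    print(density_at_x * n * binwidth)   # 200.0
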
diff -r 000000000000 -r 9071e359b9a3 tools/plotting/histogram2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/plotting/histogram2.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,76 @@
+<tool id="histogram_rpy" name="Histogram" version="1.0.3">
+  <description>of a numeric column</description>
+  <command interpreter="python">histogram.py $input $out_file1 $numerical_column "$title" "$xlab" $breaks $density $frequency</command>
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/>
+    <param name="numerical_column" type="data_column" data_ref="input" numerical="True" label="Numerical column for x axis" />
+    <param name="breaks" type="integer" size="4" value="0" label="Number of breaks (bars)"/>
+    <param name="title" type="text" size="30" value="Histogram" label="Plot title"/>
+    <param name="xlab" type="text" size="30" value="V1" label="Label for x axis"/>
+    <param name="density" type="boolean" checked="yes" label="Include smoothed density"/>
+    <param name="frequency" type="boolean" checked="no" label="Plot as frequency (counts)"/>
+  </inputs>
+  <outputs>
+    <data format="pdf" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="histogram_in1.tabular" ftype="tabular"/>
+      <param name="numerical_column" value="2"/>
+      <param name="breaks" value="0"/>
+      <param name="title" value="Histogram"/>
+      <param name="xlab" value="V1"/>
+      <param name="density" value="true"/>
+      <param name="frequency" value="false"/>
+      <output name="out_file1" file="histogram_out1.pdf"/>
+    </test>
+  </tests>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <help>
+
+.. class:: infomark
+
+**TIP:** To remove comment lines that do not begin with a *#* character, use *Text Manipulation-&gt;Remove beginning*
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+This tool computes a histogram of the numerical values in a column of a dataset.
+
+- All invalid, blank and comment lines in the dataset are skipped.  The number of skipped lines is displayed in the resulting history item.
+- **Column for x axis** - only numerical columns are possible.
+- **Number of breaks(bars)** - breakpoints between histogram cells. Value of '0' will determine breaks automatically.
+- **Plot title** - the histogram title.
+- **Label for x axis** - the label of the x axis for the histogram.
+- **Include smoothed density** - if checked, a smoothed density curve is overlaid on the histogram.
+
+-----
+
+**Example**
+
+- Input file::
+
+    1 68 4.1
+    2 71 4.6
+    3 62 3.8
+    4 75 4.4
+    5 58 3.2
+    6 60 3.1
+    7 67 3.8
+    8 68 4.1
+    9 71 4.3
+    10 69 3.7 
+
+- Create a histogram on column 2 of the above dataset. 
+
+.. image:: ./static/images/histogram2.png
+
+</help>
+</tool>
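
When the number of breaks is 0, the underlying script falls back to R's "Sturges" rule, roughly ceil(log2(n)) + 1 cells for n observations. A quick sketch of that rule::

    import math

    def sturges_breaks(n):
        # R's default histogram break count for n data points.
        return int(math.ceil(math.log(n, 2))) + 1

    print(sturges_breaks(1000))   # -> 11
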
diff -r 000000000000 -r 9071e359b9a3 tools/plotting/plot_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/plotting/plot_filter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,19 @@
+
+def validate(incoming):
+    """Validator for the plotting program"""
+    
+    bins = incoming.get("bins","")
+    col  = incoming.get("col","")
+
+    if not bins or not col:
+        raise Exception, "You need to specify a number for bins and columns"
+
+    try:
+        bins = int(bins)
+        col  = int(col)
+    except:
+        raise Exception, "Parameters are not valid numbers, columns:%s, bins:%s" % (col, bins)
+
+    if not 1<bins<100:
+        raise Exception, "The number of bins %s must be a number between 1 and 100" % bins
+
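
A hypothetical call showing the dict-style interface validate() expects; in practice Galaxy supplies ``incoming`` from the tool form, so the values arrive as strings::

    validate({"bins": "20", "col": "2"})    # passes silently
    validate({"bins": "500", "col": "2"})   # raises: bins out of range
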
diff -r 000000000000 -r 9071e359b9a3 tools/plotting/plotter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/plotting/plotter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+# python histogram input_file output_file column bins 
+import sys, os
+import matplotlib; matplotlib.use('Agg')
+
+from pylab import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+if __name__ == '__main__':
+    # parse the arguments
+    
+    if len(sys.argv) != 6:
+        stop_err('Usage: python plotter.py input_file column bins output_file style')
+
+    mode = sys.argv[5]
+    HIST = mode == 'hist'
+    try:
+        col =  int(float(sys.argv[2]))
+        if HIST:
+            bin = int(float(sys.argv[3]))
+        else:
+            # hack, this parameter is the plotting style for scatter plots
+            if sys.argv[3] == 'P':
+                style = 'o'
+            elif sys.argv[3] == 'LP':
+                style = 'o-'
+            else:
+                style = '-'
+
+    except:
+        msg = 'Parameters were not numbers: %s, %s' % (sys.argv[2], sys.argv[3])
+        stop_err(msg)
+
+    # validate arguments
+    inp_file = sys.argv[1]
+    out_file = sys.argv[4]
+
+    if HIST:
+        print "Histogram on column %s (%s bins)" % (col, bin)
+    else:
+        print "Scatterplot on column %s" % (col)
+
+    xcol= col -1
+    # read the file
+    values = []
+    try:
+        count = 0
+        for line in file(inp_file):
+            count += 1
+            line = line.strip()
+            if line and line[0] != '#':
+                values.append(float(line.split()[xcol]))
+    except Exception, e:
+        stop_err("Non numerical data at line %d, column %d: %s" % (count, col, e))
+
+    # plot the data
+
+    if HIST:
+        n, bins, patches = hist(values, bins=bin, normed=0)
+    else:
+        plot(values, style)
+    
+    xlabel('values')
+    ylabel('counts')
+
+    if HIST:
+        title('Histogram of values over column %s (%s bins)' % (col, len(bins)) )
+    else:
+        title('Scatterplot over column %s' % col )        
+    grid(True)
+    
+    # the plotter detects types by file extension
+    png_out = out_file + '.png' # force it to png
+    savefig(png_out)
+
+    # shuffle it back and clean up
+    data = file(png_out, 'rb').read() 
+    fp = open(out_file, 'wb')
+    fp.write(data)
+    fp.close()
+    os.remove(png_out)
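
Hypothetical command lines matching the argv handling above (file names invented)::

    python plotter.py data.txt 2 25 out.png hist    # histogram of column 2, 25 bins
    python plotter.py data.txt 2 LP out.png plot    # scatterplot, line+points style
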
diff -r 000000000000 -r 9071e359b9a3 tools/plotting/r_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/plotting/r_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+### Run R providing the R script in $1 as standard input and passing 
+### the remaining arguments on the command line
+
+# Function that writes a message to stderr and exits
+fail()
+{
+    echo "$@" >&2
+    exit 1
+}
+
+# Ensure R executable is found
+which R > /dev/null || fail "'R' is required by this tool but was not found on path" 
+
+# Extract first argument
+infile=$1; shift
+
+# Ensure the file exists
+test -f "$infile" || fail "R input file '$infile' does not exist"
+
+# Invoke R passing file named by first argument to stdin
+R --vanilla --slave "$@" < "$infile"
diff -r 000000000000 -r 9071e359b9a3 tools/plotting/scatterplot.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/plotting/scatterplot.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+#Greg Von Kuster
+
+import sys
+from rpy import *
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def main():
+
+    in_fname = sys.argv[1]
+    out_fname = sys.argv[2]
+    try:
+        columns = int( sys.argv[3] ) - 1, int( sys.argv[4] ) - 1
+    except:
+        stop_err( "Columns not specified, your query does not contain a column of numerical data." )
+    title = sys.argv[5]
+    xlab = sys.argv[6]
+    ylab = sys.argv[7]
+
+    matrix = []
+    skipped_lines = 0
+    first_invalid_line = 0
+    invalid_value = ''
+    invalid_column = 0
+    i = 0
+    for i, line in enumerate( file( in_fname ) ):
+        valid = True
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ): 
+            row = []
+            fields = line.split( "\t" )
+            for column in columns:
+                try:
+                    val = fields[column]
+                    if val.lower() == "na": 
+                        row.append( float( "nan" ) )
+                    else:
+                        row.append( float( fields[column] ) )
+                except:
+                    valid = False
+                    skipped_lines += 1
+                    if not first_invalid_line:
+                        first_invalid_line = i + 1
+                        try:
+                            invalid_value = fields[column]
+                        except:
+                            invalid_value = ''
+                        invalid_column = column + 1
+                    break
+        else:
+            valid = False
+            skipped_lines += 1
+            if not first_invalid_line:
+                first_invalid_line = i+1
+
+        if valid:
+            matrix.append( row )
+
+    if skipped_lines < i:
+        try:
+            r.pdf( out_fname, 8, 8 )
+            r.plot( array( matrix ), type="p", main=title, xlab=xlab, ylab=ylab, col="blue", pch=19 )
+            r.dev_off()
+        except Exception, exc:
+            stop_err( "%s" %str( exc ) )
+    else:
+        stop_err( "All values in both columns %s and %s are non-numeric or empty." % ( sys.argv[3], sys.argv[4] ) )
+
+    print "Scatter plot on columns %s, %s. " % ( sys.argv[3], sys.argv[4] )
+    if skipped_lines > 0:
+        print "Skipped %d lines starting with line #%d, value '%s' in column %d is not numeric." % ( skipped_lines, first_invalid_line, invalid_value, invalid_column )
+
+    r.quit( save="no" )
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/plotting/scatterplot.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/plotting/scatterplot.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,71 @@
+<tool id="scatterplot_rpy" name="Scatterplot">
+  <description>of two numeric columns</description>
+  <command interpreter="python">scatterplot.py $input $out_file1 $col1 $col2 "$title" "$xlab" "$ylab"</command>
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/>
+    <param name="col1" type="data_column" data_ref="input" numerical="True" label="Numerical column for x axis" />
+    <param name="col2" type="data_column" data_ref="input" numerical="True" label="Numerical column for y axis" />
+    <param name="title" size="30" type="text" value="Scatterplot" label="Plot title"/>
+    <param name="xlab" size="30" type="text" value="V1" label="Label for x axis"/>
+    <param name="ylab" size="30" type="text" value="V2" label="Label for y axis"/>
+  </inputs>
+  <outputs>
+    <data format="pdf" name="out_file1" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <!-- TODO: uncomment the following test when we have tools.update_state() working for 
+       multiple dependents with the same dependency.
+  <tests>
+    <test>
+      <param name="input" value="scatterplot_in1.tabular" ftype="tabular"/>
+      <param name="col1" value="2"/>
+      <param name="col2" value="3"/>
+      <param name="title" value="Scatterplot"/>
+      <param name="xlab" value="V1"/>
+      <param name="ylab" value="V2"/>
+      <output name="out_file1" file="scatterplot_out1.pdf" />
+    </test>
+  </tests>
+  -->
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+This tool creates a simple scatter plot between two variables containing numeric values of a selected dataset. 
+
+- All invalid, blank and comment lines in the dataset are skipped.  The number of skipped lines is displayed in the resulting history item.
+
+- **Plot title** The scatterplot title
+- **Label for x axis** and **Label for y axis** The labels for x and y axis of the scatterplot.
+
+-----
+
+**Example**
+
+- Input file::
+
+    1   68  4.1
+    2   71  4.6
+    3   62  3.8
+    4   75  4.4
+    5   58  3.2
+    6   60  3.1
+    7   67  3.8
+    8   68  4.1
+    9   71  4.3
+    10  69  3.7 
+
+- Create a simple scatterplot between the variables in column 2 and column 3 of the above dataset.
+
+.. image:: ./static/images/scatterplot.png
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/plotting/xy_plot.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/plotting/xy_plot.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,148 @@
+<tool id="XY_Plot_1" name="Plotting tool" version="1.0.1">
+  <description>for multiple series and graph types</description>
+  <command interpreter="bash">r_wrapper.sh $script_file</command>
+
+  <inputs>
+    <param name="main" type="text" value="" size="30" label="Plot Title"/>
+    <param name="xlab" type="text" value="" size="30" label="Label for x axis"/>
+    <param name="ylab" type="text" value="" size="30" label="Label for y axis"/>
+    <repeat name="series" title="Series">
+      <param name="input" type="data" format="tabular" label="Dataset"/>
+      <param name="xcol" type="data_column" data_ref="input" label="Column for x axis"/>
+      <param name="ycol" type="data_column" data_ref="input" label="Column for y axis"/>
+      <conditional name="series_type">
+        <param name="type" type="select" label="Series Type">
+          <option value="line" selected="true">Line</option>
+          <option value="points">Points</option>
+        </param>
+        <when value="line">
+          <param name="lty" type="select" label="Line Type">
+            <option value="1">Solid</option>
+            <option value="2">Dashed</option>
+            <option value="3">Dotted</option>
+          </param>
+          <param name="col" type="select" label="Line Color">
+            <option value="1">Black</option>
+            <option value="2">Red</option>
+            <option value="3">Green</option>
+            <option value="4">Blue</option>
+            <option value="5">Cyan</option>
+            <option value="6">Magenta</option>
+            <option value="7">Yellow</option>
+            <option value="8">Gray</option>
+          </param>
+          <param name="lwd" type="float" label="Line Width" value="1.0"/>
+        </when>
+        <when value="points">
+          <param name="pch" type="select" label="Point Type">
+            <option value="1">Circle (hollow)</option>
+            <option value="2">Triangle (hollow)</option>
+            <option value="3">Cross</option>
+            <option value="4">Diamond (hollow)</option>
+            <option value="15">Square (filled)</option>
+            <option value="16">Circle (filled)</option>
+            <option value="17">Triangle (filled)</option>  
+          </param>
+          <param name="col" type="select" label="Point Color">
+            <option value="1">Black</option>
+            <option value="2">Red</option>
+            <option value="3">Green</option>
+            <option value="4">Blue</option>
+            <option value="5">Cyan</option>
+            <option value="6">Magenta</option>
+            <option value="7">Yellow</option>
+            <option value="8">Gray</option>
+          </param>
+          <param name="cex" type="float" label="Point Scale" value="1.0"/>
+        </when>
+      </conditional>
+    </repeat>       
+  </inputs>
+
+  <configfiles>
+    <configfile name="script_file">
+      ## Setup R error handling to go to stderr
+      options( show.error.messages=F, 
+               error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+      ## Determine range of all series in the plot
+      xrange = c( NULL, NULL )
+      yrange = c( NULL, NULL )
+      #for $i, $s in enumerate( $series )
+        s${i} = read.table( "${s.input.file_name}" )
+        x${i} = s${i}[,${s.xcol}]
+        y${i} = s${i}[,${s.ycol}]
+        xrange = range( x${i}, xrange )
+        yrange = range( y${i}, yrange )
+      #end for
+      ## Open output PDF file
+      pdf( "${out_file1}" )
+      ## Dummy plot for axis / labels
+      plot( NULL, type="n", xlim=xrange, ylim=yrange, main="${main}", xlab="${xlab}", ylab="${ylab}" )
+      ## Plot each series
+      #for $i, $s in enumerate( $series )
+        #if $s.series_type['type'] == "line"
+          lines( x${i}, y${i}, lty=${s.series_type.lty}, lwd=${s.series_type.lwd}, col=${s.series_type.col} )
+        #elif $s.series_type.type == "points"
+          points( x${i}, y${i}, pch=${s.series_type.pch}, cex=${s.series_type.cex}, col=${s.series_type.col} )
+        #end if
+      #end for    
+      ## Close the PDF file
+      devname = dev.off() 
+    </configfile>
+  </configfiles>
+
+  <outputs>
+    <data format="pdf" name="out_file1" />
+  </outputs>
+
+    <tests>
+        <test>
+            <param name="main" value="Example XY Plot"/>
+            <param name="xlab" value="Column 1"/>
+            <param name="ylab" value="Column 2"/>
+            <param name="input" value="2.tabular" ftype="tabular"/>
+            <param name="xcol" value="1"/>
+            <param name="ycol" value="2"/>
+            <param name="type" value="line"/>
+            <param name="lty" value="2"/>
+            <param name="col" value="2"/>
+            <param name="lwd" value="1.0"/>
+            <output name="out_file1" file="XY_Plot_1_out.pdf"/>
+        </test>
+    </tests>
+<help>
+.. class:: infomark
+
+This tool plots values from columns of a dataset against each other, and supports multiple series in one plot, drawn from the same dataset or from different ones.
+
+-----
+
+.. class:: warningmark
+
+This tool throws an error if the columns selected for plotting are absent or non-numeric, or if the lengths of these columns differ.
+
+-----
+
+**Example**
+
+Input file::
+
+    1   68  4.1
+    2   71  4.6
+    3   62  3.8
+    4   75  4.4
+    5   58  3.2
+    6   60  3.1
+    7   67  3.8
+    8   68  4.1
+    9   71  4.3
+    10  69  3.7 
+
+Create a two-series XY plot from the above data:
+
+- Series 1: Red Dashed-Line plot between columns 1 and 2
+- Series 2: Blue Circular-Point plot between columns 3 and 2 
+
+.. image:: ./static/images/xy_example.jpg
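+
+A rough Python/matplotlib equivalent of the R script this tool generates for the two series above (illustrative only; the tool itself renders and runs the R template shown in the tool definition)::
+
+    import matplotlib.pyplot as plt
+
+    rows = [list(map(float, line.split())) for line in open("input.tabular")]
+    c1, c2, c3 = ([r[i] for r in rows] for i in range(3))
+
+    # Series 1: red dashed line, x = column 1, y = column 2
+    plt.plot(c1, c2, linestyle="--", color="red", linewidth=1.0)
+    # Series 2: blue hollow circles, x = column 3, y = column 2
+    plt.scatter(c3, c2, facecolors="none", edgecolors="blue")
+    plt.title("My XY plot")
+    plt.savefig("xy_plot.pdf")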
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/best_regression_subsets.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/best_regression_subsets.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+outfile2 = sys.argv[5]
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+elems = []
+for i, line in enumerate( file ( infile )):
+    line = line.rstrip('\r\n')
+    if len( line )>0 and not line.startswith( '#' ):
+        elems = line.split( '\t' )
+        break 
+    if i == 30:
+        break # Hopefully we'll never get here...
+
+if len( elems )<1:
+    stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+    x_cols[k] = int(col)-1
+    x_vals.append([])
+    
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+    if line and not line.startswith( '#' ):
+        try:
+            fields = line.split("\t")
+            try:
+                yval = float(fields[y_col])
+            except Exception, ey:
+                yval = r('NA')
+            y_vals.append(yval)
+            for k,col in enumerate(x_cols):
+                try:
+                    xval = float(fields[col])
+                except Exception, ex:
+                    xval = r('NA')
+                x_vals[k].append(xval)
+        except:
+            pass
+
+response_term = ""
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+r.library("leaps")
+
+set_default_mode(NO_CONVERSION)
+try:
+    leaps = r.regsubsets(r("y ~ x"), data= r.na_exclude(dat))
+except RException, rex:
+    stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
+set_default_mode(BASIC_CONVERSION)
+
+summary = r.summary(leaps)
+tot = len(x_vals)
+pattern = "["
+for i in range(tot):
+    pattern = pattern + 'c' + str(int(x_cols[int(i)]) + 1) + ' '
+pattern = pattern.strip() + ']'  
+print >>fout, "#Vars\t%s\tR-sq\tAdj. R-sq\tC-p\tbic" %(pattern)
+for ind,item in enumerate(summary['outmat']):
+    print >>fout, "%s\t%s\t%s\t%s\t%s\t%s" %(str(item).count('*'), item, summary['rsq'][ind], summary['adjr2'][ind], summary['cp'][ind], summary['bic'][ind])
+
+
+r.pdf( outfile2, 8, 8 )
+r.plot(leaps, scale="Cp", main="Best subsets using Cp Criterion")
+r.plot(leaps, scale="r2", main="Best subsets using R-sq Criterion")
+r.plot(leaps, scale="adjr2", main="Best subsets using Adjusted R-sq Criterion")
+r.plot(leaps, scale="bic", main="Best subsets using bic Criterion")
+
+r.dev_off()
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/best_regression_subsets.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/best_regression_subsets.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,66 @@
+<tool id="BestSubsetsRegression1" name="Perform Best-subsets Regression">
+  <description> </description>
+  <command interpreter="python">
+    best_regression_subsets.py 
+      $input1
+      $response_col
+      $predictor_cols
+      $out_file1
+      $out_file2
+      1>/dev/null
+      2>/dev/null
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
+    <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+    <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" >
+        <validator type="no_options" message="Please select at least one column."/>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+    <data format="pdf" name="out_file2" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+    <!-- Testing this tool will not be possible because this tool produces a pdf output file.
+    -->
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Datasets-&gt;Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses the 'regsubsets' function from the R 'leaps' package to perform regression subset selection. It outputs two files: one containing a table with the best subsets and the corresponding summary statistics, and the other containing a graphical representation of the results.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all predictor and response variables as continuous variables. 
+
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be excluded from the analysis.
+
+- The 6 columns in the output are described below:
+
+  - Column 1 (Vars): denotes the number of variables in the model
+  - Column 2 ([c2 c3 c4...]): represents the list of user-selected predictor variables (full model). An asterisk denotes the presence of the corresponding predictor variable in the selected model.
+  - Column 3 (R-sq): the fraction of variance explained by the model
+  - Column 4 (Adj. R-sq): the R-squared statistic adjusted for the number of predictors (p), penalizing larger models
+  - Column 5 (Cp): Mallows' Cp statistic
+  - Column 6 (bic): Bayesian Information Criterion. 
+
+
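+For any one candidate subset, the statistics in columns 3, 4 and 6 can be reproduced with a short Python sketch (hypothetical inputs; BIC is given up to an additive constant, and Mallows' Cp is omitted since it also needs the full-model error variance)::
+
+    import numpy as np
+
+    def subset_stats(X, y):
+        # X: n x p matrix of one candidate predictor subset, y: response vector
+        n, p = X.shape
+        A = np.column_stack([np.ones(n), X])         # add an intercept term
+        beta = np.linalg.lstsq(A, y, rcond=None)[0]
+        rss = float(np.sum((y - A.dot(beta)) ** 2))  # residual sum of squares
+        tss = float(np.sum((y - y.mean()) ** 2))     # total sum of squares
+        rsq = 1 - rss / tss
+        adj_rsq = 1 - (1 - rsq) * (n - 1) / (n - p - 1)
+        bic = n * np.log(rss / n) + (p + 1) * np.log(n)
+        return rsq, adj_rsq, bic
+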
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/categorize_elements_satisfying_criteria.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/categorize_elements_satisfying_criteria.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,172 @@
+#!/usr/bin/perl -w
+
+# The program takes as input a set of categories, such that each category contains many elements.
+# It also takes a table relating elements with criteria, such that each element is assigned a number
+# representing the number of times the element satisfies a certain criterion. 
+# The first input is a TABULAR format file, such that the left column represents the name of categories and, 
+# all other columns represent the names of elements.
+# The second input is a TABULAR format file relating elements with criteria, such that the first line
+# represents the names of criteria and the left column represents the names of elements.
+# The output is a TABULAR format file relating categories with criteria, such that each category is 
+# assigned a number representing the total number of times its elements satisfy a certain criterion.
+# Each category is assigned as many numbers as criteria.
+
+use strict;
+use warnings;
+
+#variables to handle information of the categories input file
+my @categoryElementsArray = ();
+my @categoriesArray = ();
+my $categoryMemberNames;
+my $categoryName;
+my %categoryMembersHash = ();
+my $memberNumber = 0;
+my $totalMembersNumber = 0;
+my $totalCategoriesNumber = 0;
+my @categoryCountersTwoDimArray = ();
+my $lineCounter1 = 0;
+
+#variables to handle information of the criteria and elements data input file
+my $elementLine;
+my @elementDataArray = ();
+my $elementName;
+my @criteriaArray = ();
+my $criteriaNumber = 0;
+my $totalCriteriaNumber = 0;
+my $lineCounter2 = 0;
+
+#variable representing the row and column indices used to store results into a two-dimensional array
+my $row = 0;
+my $column = 0;
+
+# check to make sure we have the correct files
+my $usage = "usage: categorize_elements_satisfying_criteria.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] \n";
+die $usage unless @ARGV == 3;
+
+#get the categories input file
+my $categories_inputFile = $ARGV[0];
+
+#get the criteria and data input file
+my $elements_data_inputFile = $ARGV[1];
+
+#get the output file
+my $categorized_data_outputFile = $ARGV[2];
+
+#open the input and output files
+open (INPUT1, "<", $categories_inputFile) || die("Could not open file $categories_inputFile \n");
+open (INPUT2, "<", $elements_data_inputFile ) || die("Could not open file $elements_data_inputFile  \n");
+open (OUTPUT, ">", $categorized_data_outputFile) || die("Could not open file $categorized_data_outputFile \n"); 
+
+#store the first input file into an array
+my @categoriesData = <INPUT1>;
+
+#reset the value of $lineCounter1 to 0 
+$lineCounter1 = 0;
+
+#iterate through the first input file to get the names of categories and their corresponding elements
+foreach $categoryMemberNames (@categoriesData){
+ chomp ($categoryMemberNames);
+
+ @categoryElementsArray = split(/\t/, $categoryMemberNames);
+
+ #store the name of the current category into an array
+ $categoriesArray [$lineCounter1] = $categoryElementsArray[0];
+
+ #store the name of the current category into a two-dimensional array
+ $categoryCountersTwoDimArray [$lineCounter1] [0] = $categoryElementsArray[0];
+
+ #get the total number of elements in the current category
+ $totalMembersNumber = @categoryElementsArray;
+
+ #store the names of categories and their corresponding elements into a hash
+ for ($memberNumber = 1; $memberNumber < $totalMembersNumber; $memberNumber++) {
+
+ $categoryMembersHash{$categoryElementsArray[$memberNumber]} = $categoriesArray[$lineCounter1];
+ }
+
+ $lineCounter1++;
+}
+
+#store the second input file into an array
+my @elementsData = <INPUT2>;
+
+#reset the value of $lineCounter2 to 0 
+$lineCounter2 = 0;
+
+#iterate through the second input file in order to count the number of elements
+#in each category that satisfy each criterion
+foreach $elementLine (@elementsData){
+ chomp ($elementLine);
+
+ $lineCounter2++;
+
+ @elementDataArray = split(/\t/, $elementLine);
+
+ #if at the first line, get the total number of criteria and the total  
+ #number of categories and initialize the two-dimensional array
+ if ($lineCounter2 == 1){
+ @criteriaArray = @elementDataArray;
+ $totalCriteriaNumber = @elementDataArray;
+
+ $totalCategoriesNumber = @categoriesArray;
+
+ #initialize the two-dimensional array
+ for ($row = 0; $row < $totalCategoriesNumber; $row++) {
+
+ for ($column = 1; $column <= $totalCriteriaNumber; $column++) {
+
+ $categoryCountersTwoDimArray [$row][$column] = 0;
+ }
+ }
+ }
+ else{
+ #get the element data
+ $elementName = $elementDataArray[0];
+
+ #do the counting and store the result in the two-dimensional array
+ for ($criteriaNumber = 0; $criteriaNumber < $totalCriteriaNumber; $criteriaNumber++) {
+
+ if ($elementDataArray[$criteriaNumber + 1] > 0){
+
+ $categoryName = $categoryMembersHash{$elementName};
+
+ my ($categoryIndex) = grep $categoriesArray[$_] eq $categoryName, 0 .. $#categoriesArray;
+
+ $categoryCountersTwoDimArray [$categoryIndex] [$criteriaNumber + 1] = $categoryCountersTwoDimArray [$categoryIndex] [$criteriaNumber + 1] + $elementDataArray[$criteriaNumber + 1];
+ }
+ }
+ }
+}
+
+print OUTPUT "\t";
+
+#store the criteria names into the output file
+for ($column = 1; $column <= $totalCriteriaNumber; $column++) {
+
+ if ($column < $totalCriteriaNumber){
+ print OUTPUT $criteriaArray[$column - 1] . "\t";
+ }
+ else{
+ print OUTPUT $criteriaArray[$column - 1] . "\n";
+ }
+}
+
+#store the category names and their corresponding number of elements satisfying criteria into the output file
+for ($row = 0; $row < $totalCategoriesNumber; $row++) {
+
+ for ($column = 0; $column <= $totalCriteriaNumber; $column++) {
+
+ if ($column < $totalCriteriaNumber){
+ print OUTPUT $categoryCountersTwoDimArray [$row][$column] . "\t";
+ }
+ else{
+ print OUTPUT $categoryCountersTwoDimArray [$row][$column] . "\n";
+ }
+ }
+}
+
+#close the input and output file
+close(OUTPUT);
+close(INPUT2);
+close(INPUT1);
+
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/categorize_elements_satisfying_criteria.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/categorize_elements_satisfying_criteria.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,78 @@
+<tool id="categorize_elements_satisfying_criteria" name="Categorize Elements" version="1.0.0">
+  <description>satisfying criteria</description>
+  
+  <command interpreter="perl">
+   categorize_elements_satisfying_criteria.pl $inputFile1 $inputFile2 $outputFile1
+  </command>
+
+  <inputs>
+   <param format="tabular" name="inputFile1" type="data" label="Select file containing categories and their elements"/>
+   <param format="tabular" name="inputFile2" type="data" label="Select file containing criteria and elements data"/>
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/>
+  </outputs>
+
+  <tests>
+   <test>
+   <param name="inputFile1" value="categories.tabular" ftype="tabular" />
+   <param name="inputFile2" value="criteria_elements_data.tabular" ftype="tabular" />
+     <output name="outputFile1" file="categorized_elements.tabular" />
+   </test>
+  </tests>
+  
+  
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+The program takes as input a set of categories, such that each category contains many elements. It also takes a table relating elements with criteria, such that each element is assigned a number representing the number of times the element satisfies a certain criterion. 
+
+- The first input is a TABULAR format file, such that the left column represents the names of categories and all other columns represent the names of elements in each category.
+- The second input is a TABULAR format file relating elements with criteria, such that the first line represents the names of criteria and the left column represents the names of elements.
+- The output is a TABULAR format file relating categories with criteria, such that each category is assigned a number representing the total number of times its elements satisfy a certain criterion. Each category is assigned as many numbers as criteria.
+
+
+**Example**
+
+Let the first input file be a group of motif categories as follows::
+
+ Deletion_Hotspots deletionHoptspot1 deletionHoptspot2 deletionHoptspot3
+ Dna_Pol_Pause_Frameshift dnaPolPauseFrameshift1 dnaPolPauseFrameshift2 dnaPolPauseFrameshift3 dnaPolPauseFrameshift4
+ Indel_Hotspots indelHotspot1
+ Insertion_Hotspots insertionHotspot1 insertionHotspot2
+ Topoisomerase_Cleavage_Sites topoisomeraseCleavageSite1 topoisomeraseCleavageSite2 topoisomeraseCleavageSite3
+
+
+And let the second input file represent the number of times each motif occurs in a certain window size of indel flanking regions, as follows::
+
+ 10bp 20bp 40bp
+ deletionHoptspot1 1 1 2
+ deletionHoptspot2 1 1 1
+ deletionHoptspot3 0 0 0
+ dnaPolPauseFrameshift1 1 1 1
+ dnaPolPauseFrameshift2 0 2 1
+ dnaPolPauseFrameshift3 0 0 0
+ dnaPolPauseFrameshift4 0 1 2
+ indelHotspot1 0 0 0
+ insertionHotspot1 0 0 1
+ insertionHotspot2 1 1 1
+ topoisomeraseCleavageSite1 1 1 1
+ topoisomeraseCleavageSite2 1 2 1
+ topoisomeraseCleavageSite3 0 0 2
+
+Running the program will give the total number of times the motifs of each category occur in every window size of indel flanking regions::
+
+ 10bp 20bp 40bp
+ Deletion_Hotspots 2 2 3
+ Dna_Pol_Pause_Frameshift 1 4 4
+ Indel_Hotspots 0 0 0
+ Insertion_Hotspots 1 1 2
+ Topoisomerase_Cleavage_Sites 2 3 4
+
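+The counting logic itself is small; a minimal Python sketch of it, assuming the same tab-separated layouts as above and hypothetical file names::
+
+    element_category = {}                        # element name -> category
+    for line in open("categories.tabular"):      # hypothetical file names
+        fields = line.rstrip("\r\n").split("\t")
+        for element in fields[1:]:
+            element_category[element] = fields[0]
+
+    rows = open("criteria_elements_data.tabular").read().splitlines()
+    criteria = rows[0].split("\t")               # header row: criterion names
+    totals = {}                                  # (category, criterion) -> total
+    for row in rows[1:]:
+        fields = row.split("\t")
+        category = element_category[fields[0]]
+        for criterion, value in zip(criteria, fields[1:]):
+            if int(value) > 0:
+                totals[(category, criterion)] = \
+                    totals.get((category, criterion), 0) + int(value)
+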
+    </help> 
+    
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/compute_motif_frequencies_for_all_motifs.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/compute_motif_frequencies_for_all_motifs.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,153 @@
+#!/usr/bin/perl -w
+
+# a program to compute the frequencies of each motif at a window size, determined by the user, in both 
+# upstream and downstream sequences flanking indels in all chromosomes.
+# the first input is a TABULAR format file containing the motif names and sequences, such that the file 
+# consists of two columns: the left column represents the motif names and the right column represents 
+# the motif sequence, one line per motif.
+# the second input is a TABULAR format file containing the windows into which both upstream and downstream 
+# sequences flanking indels have been divided.
+# the third input is an integer number representing the number of windows to be considered in both 
+# upstream and downstream flanking sequences.
+# the output is a TABULAR format file consisting of three columns: the left column represents the motif 
+# name, the middle column represents the motif frequency in the window of the upstream sequence flanking 
+# an indel, and the right column represents the motif frequency in the window of the downstream 
+# sequence flanking an indel, one line per indel.
+# The total number of lines in the output file = number of motifs x number of indels.
+
+use strict;
+use warnings;
+
+#variable to handle the window information
+my $window = "";
+my $windowNumber = 0;
+my $totalWindowsNumber = 0;
+my $upstreamAndDownstreamFlankingSequencesWindows = "";
+
+#variable to handle the motif information
+my $motif = "";
+my $motifName = "";
+my $motifSequence = "";
+my $motifNumber = 0;
+my $totalMotifsNumber = 0;
+my $upstreamMotifFrequencyCounter = 0;
+my $downstreamMotifFrequencyCounter = 0;
+
+#arrays to store window and motif data
+my @windowsArray = ();
+my @motifNamesArray = ();
+my @motifSequencesArray = ();
+
+#variable to handle the indel information
+my $indelIndex = 0;
+
+#variable to store line counter value
+my $lineCounter = 0;
+
+# check to make sure we have the correct files
+my $usage = "usage: compute_motif_frequencies_for_all_motifs.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] \n";
+die $usage unless @ARGV == 4;
+
+#get the input arguments
+my $motifsInputFile = $ARGV[0];
+my $indelFlankingSequencesWindowsInputFile = $ARGV[1];
+my $numberOfConsideredWindows = $ARGV[2];
+my $motifFrequenciesOutputFile = $ARGV[3];
+
+#open the input files
+open (INPUT1, "<", $motifsInputFile) || die("Could not open file $motifsInputFile \n"); 
+open (INPUT2, "<", $indelFlankingSequencesWindowsInputFile) || die("Could not open file $indelFlankingSequencesWindowsInputFile \n");   
+open (OUTPUT, ">", $motifFrequenciesOutputFile) || die("Could not open file $motifFrequenciesOutputFile \n");   
+
+#store the motifs input file in the array @motifsData
+my @motifsData = <INPUT1>;
+
+#iterate through the motifs (lines) of the motifs input file
+foreach $motif (@motifsData){
+ chomp ($motif);
+ #print ($motif . "\n");
+
+ #split the motif data into its name and its sequence
+ my @motifNameAndSequenceArray = split(/\t/, $motif);
+
+ #store the name of the motif into the array @motifNamesArray
+ push @motifNamesArray, $motifNameAndSequenceArray[0];
+
+ #store the sequence of the motif into the array @motifSequencesArray
+ push @motifSequencesArray, $motifNameAndSequenceArray[1];
+}
+
+#compute the size of the motif names array 
+$totalMotifsNumber = @motifNamesArray;
+
+
+#store the first output file containing the windows of both upstream and downstream flanking sequences in the array @windowsData
+my @windowsData = <INPUT2>;
+
+#check if the number of considered windows entered by the user is 0 or negative; if so, set it to 1
+if ($numberOfConsideredWindows <= 0){
+ $numberOfConsideredWindows = 1;
+}
+
+#iterate through the motif sequences to check their occurrences in the considered windows
+#and store the count of their occurrences in the corresponding output file
+for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){
+
+ #get the motif name
+ $motifName = $motifNamesArray[$motifNumber];
+
+ #get the motif sequence
+    $motifSequence = $motifSequencesArray[$motifNumber];
+         
+ #iterate through the lines of the second input file. Each line represents   
+ #the windows of the upstream and downstream flanking sequences of an indel
+ foreach $upstreamAndDownstreamFlankingSequencesWindows (@windowsData){
+
+ chomp ($upstreamAndDownstreamFlankingSequencesWindows);
+ $lineCounter++;
+
+ #split both upstream and downstream flanking sequences into their windows
+ my @windowsArray = split(/\t/, $upstreamAndDownstreamFlankingSequencesWindows);
+
+ if ($lineCounter == 1){
+ $totalWindowsNumber = @windowsArray;
+ $indelIndex = ($totalWindowsNumber - 1)/2;
+ }
+
+ #reset the motif frequency counters
+ $upstreamMotifFrequencyCounter = 0;
+ $downstreamMotifFrequencyCounter = 0;
+
+ #iterate through the considered windows of the upstream flanking sequence and increment the motif frequency counter
+ for ($windowNumber = $indelIndex - 1; $windowNumber > $indelIndex - $numberOfConsideredWindows - 1; $windowNumber--){
+
+ #get the window
+ $window = $windowsArray[$windowNumber];
+
+ #if the motif is found in the window, then increment its corresponding counter
+ if ($window =~ m/$motifSequence/i){
+          $upstreamMotifFrequencyCounter++;
+         }  
+ }
+
+ #iterate through the considered windows of the downstream flanking sequence and increment the motif frequency counter
+ for ($windowNumber = $indelIndex + 1; $windowNumber < $indelIndex + $numberOfConsideredWindows + 1; $windowNumber++){
+
+ #get the window
+     $window = $windowsArray[$windowNumber];
+  
+     #if the motif is found in the window, then increment its corresponding counter
+ if ($window =~ m/$motifSequence/i){
+          $downstreamMotifFrequencyCounter++;
+         }  
+ }
+
+ #store the result into the output file of the motif
+ print OUTPUT $motifName . "\t" . $upstreamMotifFrequencyCounter . "\t" . $downstreamMotifFrequencyCounter . "\n";
+ }
+}
+
+#close the input and output files
+close(OUTPUT);
+close(INPUT2);
+close(INPUT1);
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/compute_motif_frequencies_for_all_motifs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/compute_motif_frequencies_for_all_motifs.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,72 @@
+<tool id="compute_motif_frequencies_for_all_motifs" name="Compute Motif Frequencies For All Motifs" version="1.0.0">
+  <description>motif by motif</description>
+  
+  <command interpreter="perl">
+   compute_motif_frequencies_for_all_motifs.pl $inputFile1 $inputFile2 $inputWindowSize3 $outputFile1
+  </command>
+
+  <inputs>
+   <param format="tabular" name="inputFile1" type="data" label="Select the motifs file"/>
+   <param format="tabular" name="inputFile2" type="data" label="Select the indel flanking sequences windows file"/>
+    <param type="integer" name="inputWindowSize3" size="6" value="0" label="What is the number of 10bp windows in which the motif frequencies will be computed?" help="'0' = one window only"/>
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/>
+  </outputs>
+
+  <tests>
+   <test>
+   <param name="inputFile1" value="motifs2.tabular" />
+   <param name="inputFile2" value="flankingSequencesWindows10_2.tabular" />
+     <param name="inputWindowSize3" value="0" />
+     <output name="outputFile1" file="motifFrequencies_every_indels0.tabular" />
+   </test>
+  
+   <test>
+   <param name="inputFile1" value="motifs2.tabular" />
+   <param name="inputFile2" value="flankingSequencesWindows10_2.tabular" />
+     <param name="inputWindowSize3" value="4" />
+     <output name="outputFile1" file="motifFrequencies_every_indels4.tabular" /> 
+   </test>
+  </tests>
+
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This program computes the frequencies of each motif at a window size, determined by the user, in both upstream and downstream sequences flanking indels in all chromosomes.
+
+- The first input is a TABULAR format file containing the motif names and sequences, one line per motif, such that the file consists of two columns: 
+
+ - The left column represents the motif names
+ - The right column represents the motif sequence, as follows::
+
+  dnaPolPauseFrameshift1 GAG
+  dnaPolPauseFrameshift2 ACG
+  xSites1 CCG
+
+- The second input is a TABULAR format file representing the windows of both upstream  and downstream flanking sequences. It consists of multiple left columns representing the windows of the upstream flanking sequences, followed by one column representing the indels, then followed by multiple right columns representing the windows of the downstream flanking sequences, as follows::
+
+ cgaggtcagg agatcgagac catcctggct aacatggtga aatcccgtct ctactaaaaa indel aaatttatat ttataaacaa ttttaataca cctatgttta ttatacattt
+ GCCAGTTTAT GGTCTAACAA GGAGAGAAAC AGGGGGCTGA AGGGGTTTCT TAACCTCCAG indel TTCCGGGCTC TGTCCCTAAC CCCCAGCTAG GTAAGTGGCA AAGCACTTCT
+ CAGTGGGACC AAGCACTGAA CCACTTTGGG GAGAATCTCA CACTGGGGCC CTCTGACACC indel tatatatttt tttttttttt tttttttttt tttttttttg agatggtgtc
+ AGAGCAGCAG CACCCACTTT TGCAGTGTGT GACGTTGGTG GAGCCATCGA AGTCTGTGCT indel GAGCCCTCCC CAGTGCTCCG AGGAGCTGCT GTTCCCCCTG GAGCTCAGAA
+
+- The third input is an integer number representing the number of windows to be considered starting from the indel and leftward for the upstream flanking sequence and, starting from the indel and rightward for the downstream flanking sequence.
+
+- The output is a TABULAR format file consisting of three columns: 
+
+ - The left column represents the motif name
+ - The middle column represents the motif frequency in the specified windows of the upstream sequence flanking an indel
+ - The right column represents the motif frequency in the specified windows of the downstream sequence flanking an indel
+
+ There is one line per indel in the output file, such that the total number of lines in the output file = number of motifs x number of indels.
+
+Note: The number of windows entered by the user must be a positive integer >= 1. If a negative integer or 0 is entered, the program will treat it as 1.
+
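+The per-indel counting step amounts to the following Python sketch (hypothetical helper, mirroring the Perl script's case-insensitive, one-count-per-window matching)::
+
+    import re
+
+    def count_flanks(windows, motif, n):
+        # windows: one row of the windows file, split on tabs;
+        # the cell containing the literal string "indel" marks the indel
+        mid = windows.index("indel")
+        pat = re.compile(motif, re.IGNORECASE)
+        # a window contributes 1 when it contains the motif at least once
+        up = sum(1 for w in windows[max(mid - n, 0):mid] if pat.search(w))
+        down = sum(1 for w in windows[mid + 1:mid + 1 + n] if pat.search(w))
+        return up, down
+
+    # e.g. count_flanks(row.split("\t"), "GAG", 4)
+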
+  </help>  
+  
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/compute_motifs_frequency.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/compute_motifs_frequency.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,252 @@
+#!/usr/bin/perl -w
+
+# a program to compute the frequency of each motif at each window in both upstream and downstream sequences flanking indels
+# in a chromosome/genome.
+# the first input is a TABULAR format file containing the motif names and sequences, such that the file consists of two
+# columns: the left column represents the motif names and the right column represents the motif sequence, one line per motif.
+# the second input is a TABULAR format file containing the upstream and downstream sequences flanking indels, one line per indel.
+# the third input is an integer number representing the window size according to which the upstream and downstream sequences
+# flanking each indel will be divided.
+# the first output is a TABULAR format file containing the windows into which both upstream and downstream sequences flanking
+# indels are divided.
+# the second output is a TABULAR format file containing the motifs and their corresponding frequencies at each window in both
+# upstream and downstream sequences flanking indels, one line per motif.
+
+use strict;
+use warnings;
+
+#variables to handle the flanking sequences information
+my $sequence = "";
+my $upstreamFlankingSequence = "";
+my $downstreamFlankingSequence = "";
+my $discardedSequenceLength = 0;
+my $lengthOfDownstreamFlankingSequenceAfterTrimming = 0;
+
+#variables to handle the window information
+my $window = "";
+my $windowStartIndex = 0;
+my $windowNumber = 0;
+my $totalWindowsNumber = 0;
+my $totalNumberOfWindowsInUpstreamSequence = 0;
+my $totalNumberOfWindowsInDownstreamSequence = 0;
+my $totalWindowsNumberInBothFlankingSequences = 0;
+my $totalWindowsNumberInMotifCountersTwoDimArray = 0;
+my $upstreamAndDownstreamFlankingSequencesWindows = "";
+
+#variables to handle the motif information
+my $motif = "";
+my $motifSequence = "";
+my $motifNumber = 0;
+my $totalMotifsNumber = 0;
+
+#arrays to store window and motif data
+my @windowsArray = ();
+my @motifNamesArray = ();
+my @motifSequencesArray = ();
+my @motifCountersTwoDimArray = ();
+
+#variables to store line counter values
+my $lineCounter1 = 0;
+my $lineCounter2 = 0;
+
+# check to make sure we have the correct files
+my $usage = "usage: compute_motifs_frequency.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] [TABULAR.out]\n";
+die $usage unless @ARGV == 5;
+
+#get the input and output arguments
+my $motifsInputFile = $ARGV[0];
+my $indelFlankingSequencesInputFile = $ARGV[1];
+my $windowSize = $ARGV[2];
+my $indelFlankingSequencesWindowsOutputFile = $ARGV[3];
+my $motifFrequenciesOutputFile = $ARGV[4];
+
+#open the input and output files
+open (INPUT1, "<", $motifsInputFile) || die("Could not open file $motifsInputFile \n");
+open (INPUT2, "<", $indelFlankingSequencesInputFile) || die("Could not open file $indelFlankingSequencesInputFile \n");
+open (OUTPUT1, ">", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n");
+open (OUTPUT2, ">", $motifFrequenciesOutputFile) || die("Could not open file $motifFrequenciesOutputFile \n");
+
+#store the motifs input file in the array @motifsData
+my @motifsData = <INPUT1>;
+
+#iterate through the motifs (lines) of the motifs input file
+foreach $motif (@motifsData){
+    chomp ($motif);
+    #print ($motif . "\n");
+
+    #split the motif data into its name and its sequence
+    my @motifNameAndSequenceArray = split(/\t/, $motif);
+
+    #store the name of the motif into the array @motifNamesArray
+    push @motifNamesArray, $motifNameAndSequenceArray[0];
+
+    #store the sequence of the motif into the array @motifSequencesArray
+    push @motifSequencesArray, $motifNameAndSequenceArray[1];
+}
+
+#compute the size of the motif names array
+$totalMotifsNumber = @motifNamesArray;
+
+#store the input file in the array @sequencesData
+my @sequencesData = <INPUT2>;
+
+#iterate through the sequences of the second input file in order to create the windows file
+foreach $sequence (@sequencesData){
[...]
+    for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInDownstreamSequence; $windowNumber++){
+        $windowStartIndex = $windowNumber * $windowSize;
+        print OUTPUT1 (substr($downstreamFlankingSequence, $windowStartIndex, $windowSize) . "\t");
+    }
+
+    print OUTPUT1 ("\n");
+}
+
+#compute the total number of windows on both upstream and downstream sequences flanking the indel
+$totalWindowsNumberInBothFlankingSequences = $totalNumberOfWindowsInUpstreamSequence + $totalNumberOfWindowsInDownstreamSequence;
+
+#add an additional cell to store the name of the motif and another one for the indel itself
+$totalWindowsNumberInMotifCountersTwoDimArray = $totalWindowsNumberInBothFlankingSequences + 1 + 1;
+
+#initialize the two-dimensional array @motifCountersTwoDimArray. the first column will be initialized with motif names
+for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){
+
+    for ($windowNumber = 0; $windowNumber < $totalWindowsNumberInMotifCountersTwoDimArray; $windowNumber++){
+
+        if ($windowNumber == 0){
+            $motifCountersTwoDimArray [$motifNumber] [0] = $motifNamesArray[$motifNumber];
+        }
+        elsif ($windowNumber == $totalNumberOfWindowsInUpstreamSequence + 1){
+            $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = "indel";
+        }
+        else{
+            $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = 0;
+        }
+    }
+}
+
+close(OUTPUT1);
+
+#open the file that contains the windows of the upstream and downstream flanking sequences, which is actually the first output file
+open (INPUT3, "<", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n");
+
+#store the first output file containing the windows of both upstream and downstream flanking sequences in the array @windowsData
+my @windowsData = <INPUT3>;
+
+#iterate through the lines of the first output file. Each line represents
+#the windows of the upstream and downstream flanking sequences of an indel
+foreach $upstreamAndDownstreamFlankingSequencesWindows (@windowsData){
+
+    chomp ($upstreamAndDownstreamFlankingSequencesWindows);
+    $lineCounter2++;
+
+    #split both upstream and downstream flanking sequences into their windows
+    my @windowsArray = split(/\t/, $upstreamAndDownstreamFlankingSequencesWindows);
+
+    $totalWindowsNumber = @windowsArray;
+
+    #iterate through the windows to search for matched motifs and increment their corresponding counters accordingly
+    WINDOWS:
+    for ($windowNumber = 0; $windowNumber < $totalWindowsNumber; $windowNumber++){
+
+        #get the window
+        $window = $windowsArray[$windowNumber];
+
+        #if the window is the one that contains the indel, then skip the indel window
+        if ($window eq "indel") {
+            next WINDOWS;
+        }
+        else{  #iterate through the motif sequences to check their occurrences in the window
+               #and increment their corresponding counters accordingly
+
+            for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){
+                #get the motif sequence
+                $motifSequence = $motifSequencesArray[$motifNumber];
+
+                #if the motif is found in the window, then increment its corresponding counter
+                if ($window =~ m/$motifSequence/i){
+                    $motifCountersTwoDimArray [$motifNumber] [$windowNumber + 1]++;
+                }
+            }
+        }
+    }
+}
+
+#store the motif counter values in the second output file
+for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){
+
+    for ($windowNumber = 0; $windowNumber <= $totalWindowsNumber; $windowNumber++){
+
+        print OUTPUT2 $motifCountersTwoDimArray [$motifNumber] [$windowNumber] . "\t";
+        #print ($motifCountersTwoDimArray [$motifNumber] [$windowNumber] . " ");
+    }
+    print OUTPUT2 "\n";
+    #print ("\n");
+}
+
+#close the input and output files
+close(OUTPUT2);
+close(OUTPUT1);
+close(INPUT3);
+close(INPUT2);
+close(INPUT1);
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/compute_motifs_frequency.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/compute_motifs_frequency.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,109 @@
+<tool id="compute_motifs_frequency" name="Compute Motif Frequencies" version="1.0.0">
+  <description>in indel flanking regions</description>
+  
+  
+  <command interpreter="perl">
+    compute_motifs_frequency.pl $inputFile1 $inputFile2 $inputNumber3 $outputFile1 $outputFile2
+  </command>
+  
+  
+  <inputs>
+
+    <param format="tabular" name="inputFile1" type="data" label="Select motifs file"/>
+
+    <param format="tabular" name="inputFile2" type="data" label="Select indel flanking regions file from your history"/>
+      
+    <param type="integer" name="inputNumber3" size="5" value="0" label="What is the size of each window?" help="'0' = all the upstream flanking sequence will be one window only, and the same for the downstream flanking sequence."/>
+        
+  </inputs>
+  
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/>
+    <data format="tabular" name="outputFile2"/>
+  </outputs>
+  
+  <tests>
+   <test>
+   <param name="inputFile1" value="motifs1.tabular" />
+   <param name="inputFile2" value="indelsFlankingSequences1.tabular" />
+     <param name="inputNumber3" value="0" />
+     <output name="outputFile1" file="flankingSequencesWindows0.tabular" />
+     <output name="outputFile2" file="motifFrequencies0.tabular" />    
+   </test>
+  
+   <test>
+   <param name="inputFile1" value="motifs1.tabular" />
+   <param name="inputFile2" value="indelsFlankingSequences1.tabular" />
+     <param name="inputNumber3" value="10" />
+     <output name="outputFile1" file="flankingSequencesWindows10.tabular" /> 
+     <output name="outputFile2" file="motifFrequencies10.tabular" />    
+   </test>
+  </tests>
+
+   
+   <help>
+
+.. class:: infomark
+
+**What it does**
+
+This program computes the frequency of motifs in the flanking regions of indels found in a chromosome or a genome.
+Each indel has an upstream flanking sequence and a downstream flanking one. Each of the upstream and downstream flanking 
+sequences will be divided into a certain number of windows according to the window size input by the user. 
+The frequency of a motif in a certain window in one of the two flanking sequences is the total sum of occurrences of 
+that motif in that window of that flanking sequence over all indels. The indel flanking regions file will be taken
+from your history or it will be uploaded, whereas the motifs file should be uploaded.
+
+- The first input file is the motifs file and it is a tabular file consisting of two columns:
+
+ - the first column represents the motif name
+ - the second column represents the motif sequence, as follows::
+  
+ dnaPolPauseFrameshift1 GAG
+ dnaPolPauseFrameshift2 ACG
+ xSites1 CCG
+
+- The second input file is the indels flanking regions file and it is a tabular file consisting of five columns:
+
+ - the first column represents the indel start coordinate
+ - the second column represents the indel end coordinate
+ - the third column represents the indel length
+ - the fourth column represents the upstream flanking sequence
+ - the fifth column represents the downstream flanking sequence, as follows::
+  
+ 16694766   16694768   3   GTGGGTCCTGCCCAGCCTCTGCCTCAGAGGGAAGAGTAGAGAACTGGG   AGAGCAGGTCCTTAGGGAGCCCGAGGAAGTCCCTGACGCCAGCTGTTCTCGCGGACGAA
+ 25169542   25169545   4   caagcccacaagccttcagaccatagcaCGGGCTCCAGAGGTGTGAGG   CAGGTCAGGTGCTTTAGAAGTCAAAAACTCTCAGTAAGGCAAATCACCCCCTATCTCCT
+ 41929580   41929585   6   ggctgtcgtatggaatctggggctcaggactctgtcccatttctctaa   accattctgcTTCAACCCAGACACTGACTGTTTTCCAAATTTACTTGTTTGTTTGTTTT
+
+
+-----
+
+.. class:: warningmark
+
+**Notes**
+
+- The lengths of the upstream flanking sequences must be equal for all indels.
+- The lengths of the downstream flanking sequences must be equal for all indels.
+- If the length of the upstream flanking sequence L is not an integer multiple of the window size S, in other words if L/S = m + r where m is the result of division and r is the remainder, then the upstream flanking sequence will be divided into m windows only starting from the indel, and the rest of the sequence will not be considered. The same rule applies to the downstream flanking sequence. 
+
+-----
+
+The **output** of this program is two files:
+
+- The first output file is a tabular file and represents the windows of both upstream  and downstream flanking sequences. It consists of multiple left columns representing the windows of the upstream flanking sequence, followed by one column representing the indels, then followed by multiple right columns representing the windows of the downstream flanking sequence, as follows::
+
+ cgaggtcagg agatcgagac catcctggct aacatggtga aatcccgtct ctactaaaaa indel aaatttatat ttataaacaa ttttaataca cctatgttta ttatacattt
+ GCCAGTTTAT GGTCTAACAA GGAGAGAAAC AGGGGGCTGA AGGGGTTTCT TAACCTCCAG indel TTCCGGGCTC TGTCCCTAAC CCCCAGCTAG GTAAGTGGCA AAGCACTTCT
+ CAGTGGGACC AAGCACTGAA CCACTTTGGG GAGAATCTCA CACTGGGGCC CTCTGACACC indel tatatatttt tttttttttt tttttttttt tttttttttg agatggtgtc
+ AGAGCAGCAG CACCCACTTT TGCAGTGTGT GACGTTGGTG GAGCCATCGA AGTCTGTGCT indel GAGCCCTCCC CAGTGCTCCG AGGAGCTGCT GTTCCCCCTG GAGCTCAGAA
+
+- The second output file is a tabular file and represents the motif frequencies in every window of every flanking sequence. The first column on the left represents the names of motifs. The other columns represent the frequencies of motifs in the windows that correspond to the ones in the first output file, as follows::
+
+ dnaPolPauseFrameshift1 2 3 1 0 1 2 indel 0 2 2 1 3
+ dnaPolPauseFrameshift2 2 3 1 0 1 2 indel 0 2 2 1 3
+ xSites1 3 2 0 1 1 2 indel 1 1 3 2 3
+
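+A minimal Python sketch of the windowing and counting logic (hypothetical names, toy motifs and a 10bp window; the tool itself runs the Perl script above)::
+
+    import re
+
+    def windows_from_indel(seq, size, upstream):
+        # keep only complete windows, counted from the indel outward;
+        # leftover bases at the far end are discarded (see the notes above)
+        m = len(seq) // size
+        s = seq[len(seq) - m * size:] if upstream else seq[:m * size]
+        return [s[i:i + size] for i in range(0, m * size, size)]
+
+    motifs = {"dnaPolPauseFrameshift1": "GAG", "xSites1": "CCG"}  # toy motifs
+    counters = {}                            # motif name -> per-window counts
+    for line in open("indel_flanks.tabular"):                     # hypothetical
+        f = line.rstrip("\r\n").split("\t")
+        cells = windows_from_indel(f[3], 10, True) + ["indel"] + \
+                windows_from_indel(f[4], 10, False)
+        for name, seq in motifs.items():
+            counts = counters.setdefault(name, [0] * len(cells))
+            for i, w in enumerate(cells):
+                if w != "indel" and re.search(seq, w, re.IGNORECASE):
+                    counts[i] += 1
+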
+  </help>
+   
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/compute_q_values.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/compute_q_values.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,95 @@
+# A program to compute the q-values based on the p-values of multiple simultaneous tests. 
+# The q-values are computed using a specific R package created by John Storey called "qvalue".
+# The input is a TABULAR format file consisting of one column only that represents the p-values 
+# of multiple simultaneous tests, one line for every p-value. 
+# The first output is a TABULAR format file consisting of one column only that represents the q-values 
+# corresponding to p-values, one line for every q-value. 
+# The second output is a PDF format file consisting of three pages: the first page represents 
+# the p-values histogram, the second page represents the q-values histogram, and the third page represents 
+# the four Q-plots as introduced in the "qvalue" package manual.
+
+use strict;
+use warnings;
+use IO::Handle;
+use File::Temp qw/ tempfile tempdir /;
+my $tdir = tempdir( CLEANUP => 0 );
+
+# check to make sure we have the correct input and output files
+my $usage = "usage: compute_q_values.pl [TABULAR.in] [lambda] [pi0_method] [fdr_level] [robust] [TABULAR.out] [PDF.out] \n";
+die $usage unless @ARGV == 7;
+
+#get the input arguments
+my $p_valuesInputFile = $ARGV[0];
+my $lambdaValue =  $ARGV[1];
+my $pi0_method =  $ARGV[2];
+my $fdr_level =  $ARGV[3];
+my $robustValue =  $ARGV[4];
+my $q_valuesOutputFile = $ARGV[5];
+my $p_q_values_histograms_QPlotsFile = $ARGV[6];
+
+if($lambdaValue =~ /sequence/){
+ $lambdaValue = "seq(0, 0.95, 0.05)";
+}
+
+#open the input files
+open (INPUT, "<", $p_valuesInputFile) || die("Could not open file $p_valuesInputFile \n");
+open (OUTPUT1, ">", $q_valuesOutputFile) || die("Could not open file $q_valuesOutputFile \n");
+open (OUTPUT2, ">", $p_q_values_histograms_QPlotsFile) || die("Could not open file $p_q_values_histograms_QPlotsFile \n");
+#open (ERROR,  ">", "error.txt")  or die ("Could not open file error.txt \n");
+
+#save all error messages into the error file $errorFile using the error file handle ERROR
+#STDERR -> fdopen( \*ERROR,  "w" ) or die ("Could not direct errors to the error file error.txt \n");
+
+#warn "Hello Error File \n";
+
+#variable to store the name of the R script file
+my $r_script;
+
+# R script to implement the calculation of q-values based on the p-values of multiple simultaneous tests 
+# construct an R script file and save it in a temp directory
+chdir $tdir;
+$r_script = "q_values_computation.r";
+
+open(Rcmd,">", $r_script) or die "Cannot open $r_script \n\n"; 
+print Rcmd "
+ #options(show.error.messages = FALSE);
+
+ #load necessary packages
+ suppressPackageStartupMessages(library(tcltk));
+ library(qvalue);
+
+ #read the p-values of the multiple simultaneous tests from the input file $p_valuesInputFile
+ p <- scan(\"$p_valuesInputFile\", quiet = TRUE);
+
+ #compute the q-values that correspond to the p-values of the multiple simultaneous tests
+ qobj <- qvalue(p, pi0.meth = \"$pi0_method\", lambda = $lambdaValue, fdr.level = $fdr_level, robust = $robustValue);
+ #qobj <- qvalue(p, pi0.meth = \"smoother\", lambda = seq(0, 0.95, 0.05), fdr.level = 0.05);
+ #qobj <- qvalue(p, pi0.meth = \"bootstrap\", fdr.level = 0.05);
+
+ #draw the p-values histogram, the q-values histogram, and the four Q-plots 
+ # and save them on multiple pages of the output file $p_q_values_histograms_QPlotsFile
+ pdf(file = \"$p_q_values_histograms_QPlotsFile\", width = 6.25, height = 6, family = \"Times\", pointsize = 12, onefile = TRUE)
+ hist(qobj\$pvalues);
+ #dev.off();
+
+ hist(qobj\$qvalues);
+ #dev.off(); 
+
+ qplot(qobj);  
+ dev.off();
+
+ #save the q-values in the output file $q_valuesOutputFile
+ qobj\$pi0 <- signif(qobj\$pi0,digits=6)
+ qwrite(qobj, filename=\"$q_valuesOutputFile\"); 
+
+ #options(show.error.messages = TRUE);
+ #eof\n";
+close Rcmd;
+
+system("R --no-restore --no-save --no-readline < $r_script > $r_script.out");
+
+#close the input and output and error files
+#close(ERROR);
+close(OUTPUT2);
+close(OUTPUT1);
+close(INPUT);
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/compute_q_values.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/compute_q_values.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,155 @@
+<tool id="compute_q_values" name="Compute q-values" version="1.0.1">
+  <description>based on multiple simultaneous tests p-values</description>
+  
+  <command interpreter="perl">
+   compute_q_values.pl $inputFile1 $inputLambda2 $inputPI0_method3 $inputFDR_level4 $inputRobust5 $outputFile1 $outputFile2
+  </command>
+
+  <inputs>
+   <param format="tabular" name="inputFile1" type="data" label="Select the p-values file"/>
+  
+   <param type="text" name="inputLambda2" size="100" value="sequence_from_0_to_0.95_increment_0.05" label="What is the lambda value?" help="Either choose the default sequence or one deciaml value between 0 and 1"/>
+  
+   <param name="inputPI0_method3" type="select" label="Choose the PI method:">
+     <option value="smoother">smoother</option>
+       <option value="bootstrap">bootstrap</option>
+    </param>
+    
+    <param type="float" name="inputFDR_level4" size="5" value="" label="What is the FDR level?" help="The FDR level must be between 0 and 1"/>
+    
+    <param name="inputRobust5" type="select" label="Do you want to make the estimate more robust:" help="Choose TRUE for small p-values">
+   <option value="FALSE">FALSE</option>
+     <option value="TRUE">TRUE</option>
+    </param>
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/>
+    <data format="pdf" name="outputFile2"/>
+  </outputs>
+
+  <tests>
+   <test>
+   <param name="inputFile1" value="p_values.tabular" ftype="tabular" />
+     <param name="inputLambda2" value="sequence_from_0_to_0.95_increment_0.05" />
+     <param name="inputPI0_method3" value="smoother" />
+     <param name="inputFDR_level4" value="0.05" />
+     <param name="inputRobust5" value="FALSE" />
+     <output name="outputFile1" file="q_values.tabular" />
+     <output name="outputFile1" file="p_q_hists_Q_plots.pdf" />
+   </test>
+  </tests>
+  
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This program computes the q-values based on the p-values of multiple simultaneous tests. The q-values are computed using a specific R package, created by John Storey and Alan Dabney, called "qvalue". The program takes five inputs:
+
+- The first input is a TABULAR format file consisting of one column only that represents the p-values of multiple simultaneous tests, one line for every p-value. 
+- The second input is the lambda parameter. The user can choose either the default: seq(0, 0.95, 0.05) or a decimal number between 0.0 and 1.0.
+- The third input is the pi0 estimation method, either "smoother" or "bootstrap".
+- The fourth input is the FDR (false discovery rate) level which is a decimal number between 0.0 and 1.0.
+- The fifth input is either TRUE or FALSE for the estimate robustness. 
+
+The program gives two outputs:
+
+- The first output is a TABULAR format file consisting of three columns: 
+
+ - the left column represents the p-values of multiple simultaneous tests, one line for every p-value
+ - the middle column represents the q-values corresponding to the p-values
+ - the third column represents the significance values, either 1 for significant or 0 for non-significant 
+
+- The second output is a PDF format file consisting of three pages: 
+
+ - the first page represents the p-values histogram
+ - the second page represents the q-values histogram
+ - the third page represents the four Q-plots as introduced in the "qvalue" package manual.
+  
+
+**Example**
+
+Let us have the first input file of p-values as follows::
+
+ 0.140627492
+ 0.432249886
+ 0.122120877
+ 0.142010182
+ 0.012909858
+ 0.000142807
+ 0.039841941
+ 0.035173303
+ 0.011340057
+ 1.01E-05
+ 0.212738282
+ 0.091256284
+ 0.547375415
+ 0.189589833
+ 6.18E-12
+ 0.001235875
+ 1.10E-05
+ 9.75E-07
+ 2.13E-18
+ 2.54E-16
+ 1.20E-19
+ 9.76E-14
+ 0.359181534
+ 0.03661672
+ 0.400459987
+ 0.387436466
+ 0.342075061
+ 0.904129283
+ 0.031152635
+
+Running the program will give the following output::
+
+ pi0: 0.140311054
+
+ FDR level: 0.05
+
+ p-value q-value significant
+ 0.1406275 0.02889212 1
+ 0.4322499 0.06514199 0
+ 0.1221209 0.02760624 1
+ 0.1420102 0.02889212 1
+ 0.01290986 0.00437754 1
+ 0.000142807 6.46E-05 1
+ 0.03984194 0.01013235 1
+ 0.0351733 0.009932946 1
+ 0.01134006 0.004194811 1
+ 1.01E-05 5.59E-06 1
+ 0.2127383 0.03934711 1
+ 0.09125628 0.02184257 1
+ 0.5473754 0.07954578 0
+ 0.1895898 0.03673547 1
+ 6.18E-12 5.03E-12 1
+ 0.001235875 0.00050288 1
+ 1.10E-05 5.59E-06 1
+ 9.75E-07 6.61E-07 1
+ 2.13E-18 4.33E-18 1
+ 2.54E-16 3.45E-16 1
+ 1.20E-19 4.88E-19 1
+ 9.76E-14 9.93E-14 1
+ 0.3591815 0.06089654 0
+ 0.03661672 0.009932946 1
+ 0.40046 0.0626723 0
+ 0.3874365 0.0626723 0
+ 0.3420751 0.06051785 0
+ 0.9041293 0.1268593 0
+ 0.03115264 0.009750824 1
+
+
+.. image:: ./static/operation_icons/p_hist.png
+
+
+.. image:: ./static/operation_icons/q_hist.png
+
+
+.. image:: ./static/operation_icons/Q_plots.png
+
+
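+For orientation, a minimal Python sketch of Storey's q-value computation with a single fixed lambda (an illustration only; this tool uses the R "qvalue" package, whose smoother and bootstrap pi0 estimators are more elaborate)::
+
+    import numpy as np
+
+    def qvalues(p, lam=0.5):
+        p = np.asarray(p, dtype=float)
+        m = len(p)
+        # Storey's pi0 estimate at a single lambda value
+        pi0 = min(1.0, (p > lam).sum() / (m * (1.0 - lam)))
+        order = np.argsort(p)
+        q = np.empty(m)
+        running = 1.0
+        # walk from the largest p-value down, enforcing monotone q-values
+        for rank in range(m, 0, -1):
+            i = order[rank - 1]
+            running = min(running, pi0 * m * p[i] / rank)
+            q[i] = running
+        return q
+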
+  </help>  
+  
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/delete_overlapping_indels.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/delete_overlapping_indels.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,94 @@
+#!/usr/bin/perl -w
+
+# This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, 
+# the first encountered one is kept and all others are removed. It requires three inputs: 
+# The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment.
+# The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file.
+# The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file.
+# The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones.
+# Note: The number of the first column is 1.
+
+use strict;
+use warnings;
+
+#variables to handle information related to indels
+my $indel1 = "";
+my $indel2 = "";
+my @indelArray1 = ();
+my @indelArray2 = ();
+my $lineCounter1 = 0;
+my $lineCounter2 = 0;
+my $totalNumberofNonOverlappingIndels = 0;
+
+# check to make sure we have the correct files
+my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n";
+die $usage unless @ARGV == 4;
+
+my $inputFile = $ARGV[0];
+my $indelStartColumn = $ARGV[1] - 1;
+my $indelEndColumn = $ARGV[2] - 1;
+my $outputFile = $ARGV[3];
+
+#verify column numbers
+if ($indelStartColumn < 0 ){
+ die ("The indel start column number is invalid \n"); 
+}
+if ($indelEndColumn < 0 ){
+ die ("The indel end column number is invalid \n"); 
+}
+
+#open the input and output files
+open (INPUT, "<", $inputFile) || die ("Could not open file $inputFile \n"); 
+open (OUTPUT, ">", $outputFile) || die ("Could not open file $outputFile \n"); 
+
+#store the input file in the array @rawData
+my @indelsRawData = <INPUT>;
+
+#iterate through the indels of the input file
+INDEL1:
+foreach $indel1 (@indelsRawData){
+ chomp ($indel1);
+ $lineCounter1++;
+
+ #get the first indel
+ @indelArray1 = split(/\t/, $indel1);
+  
+ #our purpose is to detect overlapping indels and to store only the first copy of each in the output file
+ #all non-overlapping indels will be stored in the output file as well
+  
+ $lineCounter2 = 0;
+  
+ #iterate through the indels of the input file
+ INDEL2:
+ foreach $indel2 (@indelsRawData){
+ chomp ($indel2);
+ $lineCounter2++;
+
+ if ($lineCounter2 < $lineCounter1){
+ #get the second indel
+ @indelArray2 = split(/\t/, $indel2);
+  
+  #check if the two indels overlap (the current indel is dropped if it overlaps an earlier one,
+  #so the first encountered indel of an overlapping group is kept)
+  if ($indelArray2[$indelStartColumn] <= $indelArray1[$indelEndColumn] && $indelArray2[$indelEndColumn] >= $indelArray1[$indelStartColumn]){
+  #print ("There is an overlap between" . "\n" . $indel1 . "\n" . $indel2 . "\n");
+  #print("The two overlapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n\n");
+
+  #break out of the inner loop and go back to the outer loop
+  next INDEL1;
+  }
+  else{
+  #print("The two non-overlapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n");
+  }
+ }
+ }
+  
+ print OUTPUT $indel1 . "\n";
+ $totalNumberofNonOverlappingIndels++;
+}
+
+#print("The total number of indels is: " . $lineCounter1 . "\n");
+#print("The total number of non-overlapping indels is: " . $totalNumberofNonOverlappingIndels . "\n");
+
+#close the input and output files
+close(OUTPUT);
+close(INPUT);
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/delete_overlapping_indels.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/delete_overlapping_indels.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,66 @@
+<tool id="delete_overlapping_indels" name="Delete Overlapping Indels" version="1.0.0">
+  <description>from a chromosome indels file</description>
+  
+  <command interpreter="perl">
+   delete_overlapping_indels.pl $inputFile1 $inputIndelStartColumnNumber2 $inputIndelEndColumnNumber3 $outputFile1
+  </command>
+
+  <inputs>
+   <param format="tabular" name="inputFile1" type="data" label="Select indels file"/>
+   <param type="data_column" name="inputIndelStartColumnNumber2" data_ref="inputFile1" accept_default="true" label="Choose the indel start coordinate column number" />
+    <param type="data_column" name="inputIndelEndColumnNumber3" data_ref="inputFile1" accept_default="true" label="Choose the the indel end coordinate column number" />
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/>
+  </outputs>
+  
+  <tests>
+   <test>
+   <param name="inputFile1" value="indels1.tabular" />
+     <param name="inputIndelStartColumnNumber2" value="5" />
+     <param name="inputIndelEndColumnNumber3" value="6" />
+     <output name="outputFile1" file="non_overlapping_indels1.tabular" />     
+   </test>
+  </tests>
+  
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, the first encountered one is kept and all others are removed. 
+It requires three inputs: 
+
+- The first input is a TABULAR format file containing the coordinates of indels in blocks extracted from a multiple alignment.
+- The second input is an integer giving the number of the column where indel start coordinates are stored in the input file.
+- The third input is an integer giving the number of the column where indel end coordinates are stored in the input file.
+- The output is a TABULAR format file containing all non-overlapping indels from the input file, plus the first encountered indel of each overlapping set.
+
+Note: Column numbering starts at 1.
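+
+As a minimal sketch of the overlap test (shown here in Python for illustration; the tool itself is written in Perl), two indels [s1, e1] and [s2, e2] on the same chromosome overlap exactly when neither lies entirely before the other::
+
+    def overlaps(s1, e1, s2, e2):
+        # inclusive coordinates; True when the intervals share at least one base
+        return s2 <= e1 and e2 >= s1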
+
+
+**Example**
+
+Suppose we have the following insertions in the human genome. The start and end coordinates of the insertions are in columns 5 and 6, respectively::
+
+ 3 hg18.chr22_insert 3 hg18.chr22 14508610 14508612 3924 - panTro2.chr2b 132518950 132518951 3910 + rheMac2.chr17 14311798 14311799 3896 +
+ 7 hg18.chr22_insert 13 hg18.chr22 14513678 14513690 348 - panTro2.chr2b 132517876 132517877 321 + rheMac2.chr17 14274462 14274463 337 +
+ 7 hg18.chr22_insert 6 hg18.chr22 14513688 14513699 348 - panTro2.chr2b 132517879 132517880 321 + rheMac2.chr17 14274465 14274466 337 +
+ 25 hg18.chr22_insert 9 hg18.chr22 14529501 14529509 385 - panTro2.chr22 14528775 14528776 376 - rheMac2.chr9 42869449 42869450 375 -
+ 36 hg18.chr22_insert 4 hg18.chr22 14566316 14566319 540 - panTro2.chr2b 132492077 132492078 533 + rheMac2.chr10 59230438 59230439 533 -
+ 40 hg18.chr22_insert 7 hg18.chr22 14508610 14508616 2337 - panTro2.chr2b 132487750 132487751 2313 + rheMac2.chr10 59128305 59128306 2332 +
+ 41 hg18.chr22_insert 4 hg18.chr22 14571556 14571559 2483 - panTro2.chr2b 132485878 132485879 2481 + rheMac2.chr10 59126094 59126095 2508 +
+
+After removing the overlapping indels, we get::
+
+ 3 hg18.chr22_insert 3 hg18.chr22 14508610 14508612 3924 - panTro2.chr2b 132518950 132518951 3910 + rheMac2.chr17 14311798 14311799 3896 +
+ 7 hg18.chr22_insert 13 hg18.chr22 14513678 14513690 348 - panTro2.chr2b 132517876 132517877 321 + rheMac2.chr17 14274462 14274463 337 +
+ 25 hg18.chr22_insert 9 hg18.chr22 14529501 14529509 385 - panTro2.chr22 14528775 14528776 376 - rheMac2.chr9 42869449 42869450 375 -
+ 36 hg18.chr22_insert 4 hg18.chr22 14566316 14566319 540 - panTro2.chr2b 132492077 132492078 533 + rheMac2.chr10 59230438 59230439 533 -
+ 41 hg18.chr22_insert 4 hg18.chr22 14571556 14571559 2483 - panTro2.chr2b 132485878 132485879 2481 + rheMac2.chr10 59126094 59126095 2508 +
+
+  </help>  
+  
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/draw_stacked_barplots.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/draw_stacked_barplots.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,78 @@
+#!/usr/bin/perl -w
+
+# This program draws, in a PDF file, a stacked bar plot for different categories of data and for 
+# different criteria. For each criterion a stacked bar is drawn, such that the height of each stacked 
+# sub-bar represents the number of elements in each category satisfying that criterion.
+# The input is a TABULAR format file, where the first column holds the names of the categories 
+# and the remaining columns are headed by the names of the criteria, such that each data value in 
+# the file is the number of elements in a certain category satisfying a certain criterion.
+# The output is a PDF file containing a stacked bar plot representing the number of elements in each 
+# category satisfying each criterion. The drawing is done using R code.
+
+  
+use strict;
+use warnings;
+
+my $criterion;
+my @criteriaArray = ();
+my $criteriaNumber = 0;
+my $lineCounter = 0;
+
+#variable to store the name of the R script file
+my $r_script;
+
+# check that the correct number of arguments was supplied
+my $usage = "usage: draw_stacked_bar_plot.pl [TABULAR.in] [PDF.out] \n";
+die $usage unless @ARGV == 2;
+
+my $categoriesInputFile = $ARGV[0];
+
+my $categories_criteria_bars_plot_outputFile = $ARGV[1];
+
+#open the input file
+open (INPUT, "<", $categoriesInputFile) || die("Could not open file $categoriesInputFile \n"); 
+open (OUTPUT, ">", $categories_criteria_bars_plot_outputFile) || die("Could not open file $categories_criteria_bars_plot_outputFile \n");
+
+# R script to draw a stacked bar plot representing the significant motifs in each category of motifs 
+#construct an R script file 
+$r_script = "motif_significance_bar_plot.r";
+open(Rcmd,">", $r_script) or die "Cannot open $r_script \n\n";
+print Rcmd "
+ #store the table content of the first file into a matrix
+ categoriesTable <- read.table(\"$categoriesInputFile\", header = TRUE);
+ categoriesMatrix <- as.matrix(categoriesTable); 
+
+
+ #compute the column sums and find the largest column total
+ columnSumsVector <- colSums(categoriesMatrix);
+ maxColumn <- max (columnSumsVector);
+
+ if (maxColumn %% 10 != 0){
+ maxColumn <- maxColumn + 10;
+ }
+
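+ #scale the PDF page height (in inches) with the tallest stacked bar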
+ plotHeight = maxColumn/8;
+ criteriaVector <- names(categoriesTable);
+
+ pdf(file = \"$categories_criteria_bars_plot_outputFile\", width = length(criteriaVector), height = plotHeight, family = \"Times\", pointsize = 12, onefile = TRUE);
+
+
+
+ #draw the first barplot
+ barplot(categoriesMatrix, ylab = \"No. of elements in each category\", xlab = \"Criteria\", ylim = range(0, maxColumn), col = \"black\", density = c(10, 20, 30, 40, 50, 60, 70, 80), angle = c(45, 90, 135), names.arg = criteriaVector);
+
+ #draw the legend
+ legendX = 0.2;
+ legendY = maxColumn;
+
+ legend (legendX, legendY, legend = rownames(categoriesMatrix), density = c(10, 20, 30, 40, 50, 60, 70, 80), angle = c(45, 90, 135));
+   
+    dev.off();
+
+ #eof\n";
+close Rcmd;
+system("R --no-restore --no-save --no-readline < $r_script > $r_script.out");
+
+#close the input and output files
+close(OUTPUT);
+close(INPUT);
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/draw_stacked_barplots.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/draw_stacked_barplots.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+<tool id="draw_stacked_barplots" name="Draw Stacked Bar Plots" version="1.0.0">
+  <description>for different categories and different criteria</description>
+  
+  <command interpreter="perl">
+   draw_stacked_barplots.pl $inputFile1 $outputFile1
+  </command>
+
+  <inputs>
+   <param format="tabular" name="inputFile1" type="data" label="Select the input file"/>
+  </inputs>
+  
+  <outputs>
+    <data format="pdf" name="outputFile1"/>
+  </outputs>
+
+  <tests>
+   <test>
+   <param name="inputFile1" value="categories_criteria.tabular" />
+     <output name="outputFile1" file="stacked_barplot.pdf" />     
+   </test>
+  </tests>
+
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This program draws, in a PDF file, a stacked bar plot for different categories of data and for different criteria. For each criterion a stacked bar is
+drawn, such that the height of each stacked sub-bar represents the number of elements in each category satisfying that criterion.
+
+- The input is a TABULAR format file, where the first column holds the names of the categories and the remaining columns are headed by the names of the criteria, such that each data value in the file is the number of elements in a certain category satisfying a certain criterion.
+
+- The output is a PDF file containing a stacked bar plot representing the number of elements in each category satisfying each criterion. The drawing is done using R code.
+
+**Example**
+
+Suppose the input file represents the number of significant motifs in each motif category for each window size::
+
+                                      10bp  20bp  40bp  80bp  160bp  320bp  640bp  1280bp
+  Deletion_Hotspots                      2     3     4     4      5      6      7       7
+  Dna_Pol_Pause/Frameshift_Hotspots      8    10    14    17     18     15     19      20
+  Indel_Hotspots                         1     1     1     2      1      0      0       0
+  Insertion_Hotspots                     0     0     1     2      2      2      2       5
+  Topoisomerase_Cleavage_Sites           2     3     5     4      3      3      4       4
+  Translin_Targets                       0     0     2     2      3      3      3       2
+  VDJ_Recombination_Signals              0     0     1     1      1      2      2       2
+  X-like_Sites                           4     4     4     5      6      7      7      10
+
+
+Running the program gives the following output::
+
+ The stacked bar plot representing the data in the input file.
+
+.. image:: ./static/operation_icons/stacked_bars_plot.png
+
+  </help>  
+  
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/featureCounter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/featureCounter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+Calculate count and coverage of one query on another, and append the coverage and counts
+as the last four columns: bases covered, fraction of the interval covered, number of completely present features, and number of partially present/overlapping features.
+
+usage: %prog bed_file_1 bed_file_2 out_file
+    -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file
+    -2, --cols2=N,N,N,N: Columns for chr, start, end, strand in second file
+"""
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import sys, traceback, fileinput
+from warnings import warn
+from bx.intervals.io import *
+from bx.cookbook import doc_optparse
+from bx.intervals.operations import quicksect
+from galaxy.tools.util.galaxyops import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def counter(node, start, end):
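+    # recursively walk the quicksect interval tree, incrementing the globals 'full'
+    # and 'partial' with the number of features counted as fully present in, or
+    # partially overlapping, the query window (start, end)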
+    global full, partial
+    if node.start <= start and node.maxend > start:
+        if node.end >= end or (node.start == start and end > node.end > start):
+            full += 1
+        elif end > node.end > start:
+            partial += 1
+        if node.left and node.left.maxend > start:
+            counter(node.left, start, end)
+        if node.right: 
+            counter(node.right, start, end)
+    elif start < node.start < end:
+        if node.end <= end:
+            full += 1
+        else:
+            partial += 1
+        if node.left and node.left.maxend > start:
+            counter(node.left, start, end)
+        if node.right: 
+            counter(node.right, start, end)
+    else:
+        if node.left: 
+            counter(node.left, start, end)
+
+def count_coverage( readers, comments=True ):
+    primary = readers[0]
+    secondary = readers[1]
+    secondary_copy = readers[2]
+    
+    rightTree = quicksect.IntervalTree()
+    for item in secondary:
+        if type( item ) is GenomicInterval:
+            rightTree.insert( item, secondary.linenum, item.fields )
+    
+    bitsets = secondary_copy.binned_bitsets() 
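+    # per-chromosome bitmasks of the bases covered by the second dataset; overlapping
+    # features are implicitly merged, so count_range() below never counts a base twice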
+        
+    global full, partial
+    
+    for interval in primary:
+        if type( interval ) is Header:
+            yield interval
+        if type( interval ) is Comment and comments:
+            yield interval
+        elif type( interval ) == GenomicInterval:
+            chrom = interval.chrom
+            start = int(interval.start)
+            end = int(interval.end)
+            full = 0
+            partial = 0
+            if chrom not in bitsets:
+                bases_covered = 0
+                percent = 0.0
+                full = 0
+                partial = 0
+            else:
+                bases_covered = bitsets[ chrom ].count_range( start, end-start )
+                if (end - start) == 0:
+                    percent = 0
+                else: 
+                    percent = float(bases_covered) / float(end - start)
+                if bases_covered:
+                    root = rightTree.chroms[chrom]    #root node for the chrom tree
+                    counter(root, start, end)
+            interval.fields.append(str(bases_covered))
+            interval.fields.append(str(percent))
+            interval.fields.append(str(full))
+            interval.fields.append(str(partial))
+            yield interval
+    
+def main():
+    options, args = doc_optparse.parse( __doc__ )
+    
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )      
+        in1_fname, in2_fname, out_fname = args
+    except:
+        stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." )
+    
+    g1 = NiceReaderWrapper( fileinput.FileInput( in1_fname ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            strand_col=strand_col_1,
+                            fix_strand=True )
+    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
+                            chrom_col=chr_col_2,
+                            start_col=start_col_2,
+                            end_col=end_col_2,
+                            strand_col=strand_col_2,
+                            fix_strand=True )
+    g2_copy = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
+                                 chrom_col=chr_col_2,
+                                 start_col=start_col_2,
+                                 end_col=end_col_2,
+                                 strand_col=strand_col_2,
+                                 fix_strand=True )
+    
+
+    out_file = open( out_fname, "w" )
+
+    try:
+        for line in count_coverage([g1,g2,g2_copy]):
+            if type( line ) is GenomicInterval:
+                out_file.write( "%s\n" % "\t".join( line.fields ) )
+            else:
+                out_file.write( "%s\n" % line )
+    except ParseError, exc:
+        out_file.close()
+        fail( str( exc ) )
+
+    out_file.close()
+
+    if g1.skipped > 0:
+        print skipped( g1, filedesc=" of 1st dataset" )
+    if g2.skipped > 0:
+        print skipped( g2, filedesc=" of 2nd dataset" )
+    elif g2_copy.skipped > 0:
+        print skipped( g2_copy, filedesc=" of 2nd dataset" )
+        
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/featureCounter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/featureCounter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,75 @@
+<tool id="featureCoverage1" name="Feature coverage" version="2.0.0">
+  <description></description>
+  <command interpreter="python">featureCounter.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol}</command>
+  <inputs>
+    <param format="interval" name="input1" type="data" help="First dataset">
+      <label>What portion of</label>
+    </param>
+    <param format="interval" name="input2" type="data" help="Second dataset">
+      <label>is covered by</label>
+    </param>
+   </inputs>
+  <outputs>
+    <data format="interval" name="output" metadata_source="input1" />
+  </outputs>
+  
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <output name="output" file="6_feature_coverage.bed" />
+    </test>
+    <test>
+      <param name="input1" value="chrY1.bed" />
+      <param name="input2" value="chrY2.bed" />
+      <output name="output" file="chrY_Coverage.bed" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**What it does**
+
+This tool computes, for each interval in the first dataset, the coverage by intervals from the second dataset. The coverage and counts are appended as 4 new columns in the resulting dataset.
+
+-----
+
+**Example**
+
+- If **First dataset** consists of the following windows::
+
+    chrX 1     10001 seg 0 -
+    chrX 10001 20001 seg 0 -
+    chrX 20001 30001 seg 0 -
+    chrX 30001 40001 seg 0 -
+      
+- and **Second dataset** consists of the following exons::
+
+    chrX 5000  6000  seg2 0 -
+    chrX 5500  7000  seg2 0 -
+    chrX 9000  22000 seg2 0 -
+    chrX 24000 34000 seg2 0 -
+    chrX 36000 38000 seg2 0 -
+      
+- the **Result** is the coverage of exons from the second dataset in each of the windows of the first dataset::
+
+    chrX 1     10001 seg 0 - 3001  0.3001 2 1
+    chrX 10001 20001 seg 0 - 10000 1.0    1 0
+    chrX 20001 30001 seg 0 - 8000  0.8    0 2
+    chrX 30001 40001 seg 0 - 5999  0.5999 1 1
+   
+- To clarify, the following line of output ( added columns are indexed by a, b, c and d )::
+
+                         a    b      c d
+    chrX 1 10001 seg 0 - 3001 0.3001 2 1
+                                  
+  implies that 2 exons (c) fall fully within this window (chrX:1-10001), 1 exon (d) partially overlaps this window, and these 3 exons cover 30.01% (b) of the window size, spanning 3001 nucleotides (a).
+
+  * a: number of nucleotides in this window covered by the features in (c) and (d) - features overlapping with each other will be merged to calculate (a)
+  * b: fraction of window size covered by features in (c) and (d) - features overlapping with each other will be merged to calculate (b)
+  * c: number of features in the 2nd dataset that fall **completely** within this window
+  * d: number of features in the 2nd dataset that **partially** overlap this window
+    
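+A rough sketch in Python of how columns (a) and (b) can be computed for a single window, after merging overlapping features (an illustration only; the tool itself uses bx-python bitsets and an interval tree)::
+
+    def bases_covered(window, features):
+        # window and features are (start, end) pairs on one chromosome
+        ws, we = window
+        covered, last_end = 0, ws
+        for fs, fe in sorted(features):
+            fs, fe = max(fs, last_end), min(fe, we)  # clip to window, merge overlaps
+            if fe > fs:
+                covered += fe - fs
+                last_end = fe
+        return covered  # column (a); column (b) = covered / float(we - ws)
+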
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/getIndelRates_3way.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/getIndelRates_3way.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,249 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+
+import sys, os, tempfile
+import traceback
+import fileinput
+from warnings import warn
+
+from galaxy.tools.util.galaxyops import *
+from bx.intervals.io import *
+
+from bx.intervals.operations import quicksect
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def counter(node, start, end, sort_col):
+    global full, blk_len, blk_list
+    if node.start < start:
+        if node.right:
+            counter(node.right, start, end, sort_col)
+    elif start <= node.start <= end and start <= node.end <= end:
+        full += 1
+        if node.other[0] not in blk_list:
+            blk_list.append(node.other[0])
+            blk_len += int(node.other[sort_col+2])
+        if node.left and node.left.maxend > start:
+            counter(node.left, start, end, sort_col)
+        if node.right:
+            counter(node.right, start, end, sort_col)
+    elif node.start > end:
+        if node.left:
+            counter(node.left, start, end, sort_col)
+
+infile = sys.argv[1]
+fout = open(sys.argv[2],'w')
+int_file = sys.argv[3]
+if int_file != "None": #User has specified an interval file
+    try:
+        fint = open(int_file, 'r')
+        dbkey_i = sys.argv[4]
+        chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
+    except:
+        stop_err("Unable to open input Interval file")
+
+def main():
+
+    for i, line in enumerate( file ( infile )):
+        line = line.rstrip('\r\n')
+        if len( line )>0 and not line.startswith( '#' ):
+            elems = line.split( '\t' )
+            break
+        if i == 30:
+            break # Hopefully we'll never get here...
+
+    if len( elems ) != 18:
+        stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." )
+
+    for i, line in enumerate( file ( infile )):
+        line = line.rstrip('\r\n')
+        elems = line.split('\t')
+        try:
+            assert int(elems[0])
+            assert len(elems) == 18
+            if int_file != "None":
+                if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]:
+                    stop_err("The species build corresponding to your interval file is not present in the Indel file.")
+                if dbkey_i in elems[3]:
+                    sort_col = 4
+                elif dbkey_i in elems[8]:
+                    sort_col = 9
+                elif dbkey_i in elems[13]:
+                    sort_col = 14
+            else:
+                species = []
+                species.append( elems[3].split('.')[0] )
+                species.append( elems[8].split('.')[0] )
+                species.append( elems[13].split('.')[0] )
+                sort_col = 0    #Based on block numbers
+            break
+        except:
+            continue
+
+    fin = open(infile, 'r')
+    skipped = 0
+
+    if int_file == "None":
+        sorted_infile = tempfile.NamedTemporaryFile()
+        cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile
+        try:
+            os.system(cmdline)
+        except:
+            stop_err("Encountered error while sorting the input file.")
+        print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2])
+        prev_bnum = -1
+        sorted_infile.seek(0)
+        for line in sorted_infile.readlines():
+            line = line.rstrip('\r\n')
+            elems = line.split('\t')
+            try:
+                assert int(elems[0])
+                assert len(elems) == 18
+                new_bnum = int(elems[0])
+                if new_bnum != prev_bnum
[...]
ei:
+                #print >>sys.stderr, ei
+                continue
+        irate = []
+        drate = []
+        for i,elem in enumerate(inserts):
+            try:
+                irate.append(str("%.2e" %(inserts[i]/blen[i])))
+            except:
+                irate.append('0')
+            try:
+                drate.append(str("%.2e" %(deletes[i]/blen[i])))
+            except:
+                drate.append('0')
+        print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate))
+        sys.exit()
+
+    inf = open(infile, 'r')
+    start_met = False
+    end_met = False
+    sp_file = tempfile.NamedTemporaryFile()
+    for n, line in enumerate(inf):
+        line = line.rstrip('\r\n')
+        elems = line.split('\t')
+        try:
+            assert int(elems[0])
+            assert len(elems) == 18
+            if dbkey_i not in elems[1]:
+                if not(start_met):
+                    continue
+                else:
+                    sp_end = n
+                    break
+            else:
+                print >>sp_file, line
+                if not(start_met):
+                    start_met = True
+                    sp_start = n
+        except:
+            continue
+
+    try:
+        assert sp_end
+    except:
+        sp_end = n+1
+
+    sp_file.seek(0)
+    win = NiceReaderWrapper( fileinput.FileInput( int_file ),
+                                chrom_col=chr_col_i,
+                                start_col=start_col_i,
+                                end_col=end_col_i,
+                                strand_col=strand_col_i,
+                                fix_strand=True)
+
+    indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ),
+                                chrom_col=1,
+                                start_col=sort_col,
+                                end_col=sort_col+1,
+                                strand_col=-1,
+                                fix_strand=True)
+
+    indelTree = quicksect.IntervalTree()
+    for item in indel:
+        if type( item ) is GenomicInterval:
+            indelTree.insert( item, indel.linenum, item.fields )
+    result=[]
+
+    global full, blk_len, blk_list
+    for interval in win:
+        if type( interval ) is Header:
+            pass
+        if type( interval ) is Comment:
+            pass
+        elif type( interval ) == GenomicInterval:
+            chrom = interval.chrom
+            start = int(interval.start)
+            end = int(interval.end)
+            if start > end:
+                warn( "Interval start after end!" )
+            ins_chr = "%s.%s_insert" %(dbkey_i,chrom)
+            del_chr = "%s.%s_delete" %(dbkey_i,chrom)
+            irate = 0
+            drate = 0
+            if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms:
+                pass
+            else:
+                if ins_chr in indelTree.chroms:
+                    full = 0.0
+                    blk_len = 0
+                    blk_list = []
+                    root = indelTree.chroms[ins_chr]    #root node for the chrom insertion tree
+                    counter(root, start, end, sort_col)
+                    if blk_len:
+                        irate = full/blk_len
+
+                if del_chr in indelTree.chroms:
+                    full = 0.0
+                    blk_len = 0
+                    blk_list = []
+                    root = indelTree.chroms[del_chr]    #root node for the chrom deletion tree
+                    counter(root, start, end, sort_col)
+                    if blk_len:
+                        drate = full/blk_len
+
+            interval.fields.append(str("%.2e" %irate))
+            interval.fields.append(str("%.2e" %drate))
+            print >>fout, "\t".join(interval.fields)
+            fout.flush()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/getIndelRates_3way.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/getIndelRates_3way.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,61 @@
+<tool id="indelRates_3way" name="Estimate Indel Rates" version="1.0.0">
+  <description> for 3-way alignments</description>
+  <command interpreter="python">
+    getIndelRates_3way.py $input1 $out_file1 
+    #if $region.type == "align"
+        "None"
+    #else
+        $region.input2 $region.input2.dbkey $region.input2.metadata.chromCol,$region.input2.metadata.startCol,$region.input2.metadata.endCol,$region.input2.metadata.strandCol
+    #end if
+  </command>
+  <inputs>
+    <page>
+        <param format="tabular" name="input1" type="data" label="Select dataset containing Indels"/>
+        
+        <conditional name="region">
+          <param name="type" type="select" label="Estimate rates corresponding to" multiple="false">
+            <option value="win" selected="True">Intervals in your history</option>
+            <option value="align">Alignment block</option>
+         </param>
+         <when value="win">
+            <param format="interval" name="input2" type="data" label="Choose intervals">
+                <validator type="unspecified_build" />
+            </param>
+          </when>
+          <when value="align" />
+      </conditional>
+     
+    </page>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" metadata_source="input1"/>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input1" value="indels_3way.tabular"/>
+      <param name="type" value="align"/>
+      <output name="out_file1" file="indelrates_3way.tabular"/>
+    </test>
+  </tests>
+
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool estimates insertion and deletion rates for alignments in each specified window. Rates are computed over the total adjusted lengths (adjusted by disregarding masked bases) of all the alignment blocks from the indel file that fall within that window.
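+
+Schematically, for each window (a sketch of the idea, not the tool's exact code)::
+
+    insertion_rate = insertions_in_window / total_adjusted_block_length_in_window
+    deletion_rate  = deletions_in_window  / total_adjusted_block_length_in_window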
+  
+-----
+
+.. class:: warningmark
+
+**Note**
+
+This tool only works on the output of the 'Fetch Indels from 3-way alignments' tool.
+
+</help>  
+
+
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/getIndels.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/getIndels.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+"""
+Estimate INDELs for pair-wise alignments.
+
+usage: %prog maf_input out_file1
+"""
+
+from __future__ import division
+from galaxy import eggs
+import pkg_resources 
+pkg_resources.require( "bx-python" )
+try:
+    pkg_resources.require("numpy")
+except:
+    pass
+import psyco_full
+import sys
+from bx.cookbook import doc_optparse
+from galaxy.tools.exception_handling import *
+import bx.align.maf
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():   
+    # Parsing Command Line here
+    options, args = doc_optparse.parse( __doc__ )
+    
+    try:
+        inp_file, out_file1 = args    
+    except:
+        print >> sys.stderr, "Tool initialization error."
+        sys.exit()
+    
+    try:
+        fin = open(inp_file,'r')
+    except:
+        print >> sys.stderr, "Unable to open input file"
+        sys.exit()
+    try:
+        fout1 = open(out_file1,'w')
+        #fout2 = open(out_file2,'w')
+    except:
+        print >> sys.stderr, "Unable to open output file"
+        sys.exit()
+
+    try:
+        maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
+    except:
+        print >>sys.stderr, "Your MAF file appears to be malformed."
+        sys.exit()
+    maf_count = 0
+    
+    print >>fout1, "#Block\tSource\tSeq1_Start\tSeq1_End\tSeq2_Start\tSeq2_End\tIndel_length"
+    for block_ind, block in enumerate(maf_reader):
+        if len(block.components) < 2:
+            continue
+        seq1 = block.components[0].text
+        src1 = block.components[0].src
+        start1 = block.components[0].start
+        if len(block.components) == 2:
+            seq2 = block.components[1].text
+            src2 = block.components[1].src
+            start2 = block.components[1].start
+            #for pos in range(len(seq1)):
+            nt_pos1 = start1-1    #position of the nucleotide (without counting gaps)
+            nt_pos2 = start2-1
+            pos = 0        #character column position
+            gaplen1 = 0
+            gaplen2 = 0
+            prev_pos_gap1 = 0
+            prev_pos_gap2 = 0
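+            # scan the alignment column by column: gaplen*/prev_pos_gap* track a gap
+            # run still open in seq1/seq2, and an indel record is written out when
+            # the run ends (or at the last column of the block)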
+            while pos < len(seq1):
+                if prev_pos_gap1 == 0:
+                    gaplen1 = 0
+                if prev_pos_gap2 == 0:
+                    gaplen2 = 0
+                    
+                if seq1[pos] == '-':
+                    if seq2[pos] != '-':
+                        nt_pos2 += 1
+                        gaplen1 += 1
+                        prev_pos_gap1 = 1
+                        #write 2
+                        if prev_pos_gap2 == 1:
+                            prev_pos_gap2 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1,nt_pos1+1,nt_pos2-1,nt_pos2-1+gaplen2,gaplen2)
+                        if pos == len(seq1)-1:
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1,nt_pos1+1,nt_pos2+1-gaplen1,nt_pos2+1,gaplen1)
+                    else:
+                        prev_pos_gap1 = 0
+                        prev_pos_gap2 = 0
+                        """
+                        if prev_pos_gap1 == 1:
+                            prev_pos_gap1 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,gaplen1)
+                        elif prev_pos_gap2 == 1:
+                            prev_pos_gap2 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos2-1,nt_pos2,gaplen2)
+                        """
+                else:
+                    nt_pos1 += 1
+                    if seq2[pos] != '-':
+                        nt_pos2 += 1
+                        #write both
+                        if prev_pos_gap1 == 1:
+                            prev_pos_gap1 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2-gaplen1,nt_pos2,gaplen1)
+                        elif prev_pos_gap2 == 1:
+                            prev_pos_gap2 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1-gaplen2,nt_pos1,nt_pos2-1,nt_pos2,gaplen2)
+                    else:
+                        gaplen2 += 1
+                        prev_pos_gap2 = 1
+                        #write 1
+                        if prev_pos_gap1 == 1:
+                            prev_pos_gap1 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2,nt_pos2+gaplen1,gaplen1)
+                        if pos == len(seq1)-1:
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1+1-gaplen2,nt_pos1+1,nt_pos2,nt_pos2+1,gaplen2)
+                pos += 1
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/getIndels_2way.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/getIndels_2way.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,59 @@
+<tool id="getIndels_2way" name="Fetch Indels">
+  <description> from pairwise alignments</description>
+  <command interpreter="python">
+   getIndels.py $input1 $out_file1
+  </command>
+  <inputs>
+    <page>
+     <param format="maf" name="input1" type="data" label="Select data"/>
+    </page>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" metadata_source="input1"/>
+  </outputs>
+  <requirements>
+    <requirement type="python-module">numpy</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input1" value="6.maf"/>
+      <output name="out_file1" file="6_indels.tabular"/>
+    </test>
+  </tests>
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool fetches indels, with their coordinates and lengths, from every alignment block of the MAF file. 
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Any blocks not containing exactly 2 species will be omitted. 
+
+-----
+
+**Example**
+
+- For the following alignment block::
+
+   a score=7233.0
+   s hg18.chr1     100 35 + 247249719 AT--GACTGAGGACTTAGTTTAAGATGTTCCTACT
+   s rheMac2.chr11 200 31 + 134511895 ATAAG-CGGACGACTTAGTTTAAGATGTTCC---- 
+
+- running this tool will return::
+
+   #Block  Source         Seq1_Start  Seq1_End  Seq2_Start  Seq2_End  Indel_length
+   1       hg18.chr1             101       102         202       204             2
+   1       rheMac2.chr11         103       104         204       205             1
+   1       rheMac2.chr11         129       133         229       230             4
+   
+</help>  
+
+
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/getIndels_3way.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/getIndels_3way.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,53 @@
+<tool id="indels_3way" name="Fetch Indels"  version="1.0.3">
+  <description> from 3-way alignments</description>
+  <command interpreter="perl">
+   parseMAF_smallIndels.pl $input1 $out_file1 $outgroup
+  </command>
+  <inputs>
+    <page>
+     <param format="maf" name="input1" type="data" label="Select data"/>
+     <param name="outgroup" type="select" label="Select outgroup species">
+          <options>
+            <filter type="data_meta" ref="input1" key="species" />
+          </options>  
+       </param>
+    </page>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" metadata_source="input1"/>
+    <!--<data format="tabular" name="out_file2" metadata_source="input1"/>-->
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="3way.maf"/>
+      <param name="outgroup" value="canFam2"/>
+      <output name="out_file1" file="indels_3way.tabular"/>
+    </test>
+  </tests>
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool is the first module of the computational pipeline for identifying indels described in Kvikstad et al. (2007). Note that the generated output does not include the subsequent filtering steps.
+
+Deletions in a particular species are identified as one or more consecutive gap columns within an alignment block, given that the orthologous positions in the other two species contain nucleotides of
+equal length. 
+Similarly, insertions in a particular species are identified as one or more consecutive nucleotide columns within an alignment block, given that the orthologous positions in the other two
+species contain gaps.
+
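+As a minimal illustration, in a block like::
+
+   hg18     ACGT--ACGT
+   panTro2  ACGTTTACGT
+   rheMac2  ACGT--ACGT
+
+the two nucleotide columns present only in panTro2 (gap columns in the other two species) would be reported as an insertion in panTro2.
+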
+*Kvikstad E. M. et al. (2007). A Macaque's-Eye View of Human Insertions and Deletions: Differences in Mechanisms. PLoS Computational Biology 3(9):e176*
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Any blocks not containing exactly 3 sequences will be omitted. 
+
+  </help>  
+
+
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/linear_regression.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/linear_regression.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+outfile2 = sys.argv[5]
+
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+elems = []
+for i, line in enumerate( file ( infile )):
+    line = line.rstrip('\r\n')
+    if len( line )>0 and not line.startswith( '#' ):
+        elems = line.split( '\t' )
+        break 
+    if i == 30:
+        break # Hopefully we'll never get here...
+
+if len( elems )<1:
+    stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+    x_cols[k] = int(col)-1
+    x_vals.append([])
+
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+    if line and not line.startswith( '#' ):
+        try:
+            fields = line.split("\t")
+            try:
+                yval = float(fields[y_col])
+            except:
+                yval = r('NA')
+            y_vals.append(yval)
+            for k,col in enumerate(x_cols):
+                try:
+                    xval = float(fields[col])
+                except:
+                    xval = r('NA')
+                x_vals[k].append(xval)
+        except:
+            pass
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+set_default_mode(NO_CONVERSION)
+try:
+    linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat))
+except RException, rex:
+    stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
+set_default_mode(BASIC_CONVERSION)
+
+coeffs=linear_model.as_py()['coefficients']
+yintercept= coeffs['(Intercept)']
+summary = r.summary(linear_model)
+
+co = summary.get('coefficients', 'NA')
+"""
+if len(co) != len(x_vals)+1:
+    stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.")
+"""
+
+try:
+    yintercept = r.round(float(yintercept), digits=10)
+    pvaly = r.round(float(co[0][3]), digits=10)
+except:
+    pass
+
+print >>fout, "Y-intercept\t%s" %(yintercept)
+print >>fout, "p-value (Y-intercept)\t%s" %(pvaly)
+
+if len(x_vals) == 1:    #Simple linear  regression case with 1 predictor variable
+    try:
+        slope = r.round(float(coeffs['x']), digits=10)
+    except:
+        slope = 'NA'
+    try:
+        pval = r.round(float(co[1][3]), digits=10)
+    except:
+        pval = 'NA'
+    print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope)
+    print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval)
+else:    #Multiple regression case with >1 predictors
+    ind=1
+    while ind < len(coeffs.keys()):
+        try:
+            slope = r.round(float(coeffs['x'+str(ind)]), digits=10)
+        except:
+            slope = 'NA'
+        print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,slope)
+        try:
+            pval = r.round(float(co[ind][3]), digits=10)
+        except:
+            pval = 'NA'
+        print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval)
+        ind+=1
+
+rsq = summary.get('r.squared','NA')
+adjrsq = summary.get('adj.r.squared','NA')
+fstat = summary.get('fstatistic','NA')
+sigma = summary.get('sigma','NA')
+
+try:
+    rsq = r.round(float(rsq), digits=5)
+    adjrsq = r.round(float(adjrsq), digits=5)
+    fval = r.round(fstat['value'], digits=5)
+    fstat['value'] = str(fval)
+    sigma = r.round(float(sigma), digits=10)
+except:
+    pass
+
+print >>fout, "R-squared\t%s" %(rsq)
+print >>fout, "Adjusted R-squared\t%s" %(adjrsq)
+print >>fout, "F-statistic\t%s" %(fstat)
+print >>fout, "Sigma\t%s" %(sigma)
+
+r.pdf( outfile2, 8, 8 )
+if len(x_vals) == 1:    #Simple linear  regression case with 1 predictor variable
+    sub_title =  "Slope = %s; Y-int = %s" %(slope,yintercept)
+    try:
+        r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
+        r.abline(a=yintercept, b=slope, col="red")
+    except:
+        pass
+else:
+    r.pairs(dat, main="Scatterplot Matrix", col="blue")
+try:
+    r.plot(linear_model)
+except:
+    pass
+r.dev_off()
b
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/linear_regression.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/linear_regression.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,71 @@
+<tool id="LinearRegression1" name="Perform Linear Regression" version="1.0.1">
+  <description> </description>
+  <command interpreter="python">
+    linear_regression.py 
+      $input1
+      $response_col
+      $predictor_cols
+      $out_file1
+      $out_file2
+      1>/dev/null
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
+    <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" numerical="True"/>
+    <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" numerical="True" multiple="true" >
+        <validator type="no_options" message="Please select at least one column."/>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+    <data format="pdf" name="out_file2" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+    <test>
+        <param name="input1" value="regr_inp.tabular"/>
+        <param name="response_col" value="3"/>
+        <param name="predictor_cols" value="1,2"/>
+        <output name="out_file1" file="regr_out.tabular"/>
+        <output name="out_file2" file="regr_out.pdf"/>
+    </test>
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Datasets-&gt;Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses the 'lm' function from the R statistical package to perform linear regression on the input data. It outputs two files: one containing the summary statistics of the fitted regression, and one containing diagnostic plots to check whether the model assumptions are satisfied.
+
+*R Development Core Team (2009). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.*
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all predictor and response variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results.
+
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
+
+- The summary statistics in the output are described below:
+
+  - sigma: the square root of the estimated variance of the random error (standard error of the residuals)
+  - R-squared: the fraction of variance explained by the model
+  - Adjusted R-squared: the R-squared statistic above, adjusted by penalizing for the number of predictors (p)
+  - p-value: p-value for the t-test of the null hypothesis that the corresponding slope is equal to zero, against the two-sided alternative.
+
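+To make the reported quantities concrete, here is a minimal least-squares fit in Python/numpy (an illustration only; the tool itself calls R's lm through rpy)::
+
+    import numpy as np
+    x = np.array([1.0, 2.0, 3.0, 4.0])
+    y = np.array([2.1, 3.9, 6.2, 7.8])
+    X = np.column_stack([np.ones_like(x), x])   # intercept plus one predictor
+    (b0, b1), res, _, _ = np.linalg.lstsq(X, y, rcond=None)
+    # b0 is the Y-intercept, b1 the slope (c1); R-squared = 1 - SS_res/SS_tot
+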
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/maf_cpg_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/maf_cpg_filter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+#Adapted from bx/scripts/maf_mask_cpg.py
+"""
+Mask out potential CpG sites from a maf. Restricted or inclusive definition
+of CpG sites can be used. The total fraction masked is printed to stdout.
+
+usage: %prog input output sitetype definition
+    -m, --mask=N: Index (0-5) of the character to use as mask ('#' is default)
+"""
+
+from galaxy import eggs
+import pkg_resources 
+pkg_resources.require( "bx-python" )
+try:
+    pkg_resources.require( "numpy" )
+except:
+    pass
+import bx.align
+import bx.align.maf
+from bx.cookbook import doc_optparse
+import sys
+import bx.align.sitemask.cpg
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():
+    options, args = doc_optparse.parse( __doc__ )
+    try:
+        inp_file, out_file, sitetype, definition = args
+        if options.mask:
+            mask = int(options.mask)
+        else:
+            mask = 0
+    except:
+        print >> sys.stderr, "Tool initialization error."
+        sys.exit()
+
+    reader = bx.align.maf.Reader( open(inp_file, 'r') )
+    writer = bx.align.maf.Writer( open(out_file,'w') )
+    
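+    # the XML wrapper passes -m as an index 0-5; translate it to the actual mask character here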
+    mask_chr_dict = {0:'#', 1:'$', 2:'^', 3:'*', 4:'?', 5:'N'}
+    mask = mask_chr_dict[mask]
+    
+    if sitetype == "CpG":
+        if int(definition) == 1:
+            cpgfilter = bx.align.sitemask.cpg.Restricted( mask=mask )
+            defn = "CpG-Restricted"
+        else:
+            cpgfilter = bx.align.sitemask.cpg.Inclusive( mask=mask )
+            defn = "CpG-Inclusive"
+    else:
+        cpgfilter = bx.align.sitemask.cpg.nonCpG( mask=mask )
+        defn = "non-CpG"
+    cpgfilter.run( reader, writer.write )
+    
+    print "%2.2f percent bases masked; Mask character = %s, Definition = %s" %(float(cpgfilter.masked)/float(cpgfilter.total) * 100, mask, defn)
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/maf_cpg_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/maf_cpg_filter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,87 @@
+<tool id="cpgFilter" name="Mask CpG/non-CpG sites" version="1.0.0">
+  <description> from MAF file</description>
+  <command interpreter="python">
+   maf_cpg_filter.py 
+    $input 
+    $out_file1 
+    $masksite.type
+    #if $masksite.type == "CpG":
+     $masksite.definition
+    #else:
+     "NA"
+   #end if
+    -m $mask_char
+  </command>
+  <inputs>
+    <page>
+     <param format="maf" name="input" type="data" label="Select data"/>
+    <param name="mask_char" size="5" type="select" label="Mask character">
+     <option value="0" selected="true">#</option>
+           <option value="1">$</option>
+           <option value="2">^</option>
+           <option value="3">*</option>
+           <option value="4">?</option>
+           <option value="5">N</option>
+        </param>
+        <conditional name="masksite">
+            <param name="type" size="5" type="select" label="Sites to be masked">
+                <option value="CpG" selected="true">CpG sites</option>
+                <option value="nonCpG">non-CpG sites</option>
+             </param>
+            <when value="CpG">
+                <param name="definition" size="5" type="select" label="Definition">
+                    <option value="0" selected="true">Inclusive</option>
+                    <option value="1">Restricted</option>
+                 </param>
+            </when>
+            <when value="nonCpG" />
+        </conditional>
+    </page>
+  </inputs>
+  <outputs>
+    <data format="maf" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <requirements>
+    <requirement type="python-module">numpy</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input" value="6.maf"/>
+      <param name="mask_char" value="0"/>
+      <param name="type" value="CpG" />
+      <param name="definition" value="0" />
+      <output name="out_file1" file="6_mask_cpg.maf"/>
+    </test>
+    <test>
+      <param name="input" value="6.maf"/>
+      <param name="mask_char" value="0"/>
+      <param name="type" value="nonCpG" />
+      <output name="out_file1" file="6_mask_noncpg.maf"/>
+    </test>
+  </tests>
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool takes a MAF file as input and masks CpG (or non-CpG) sites in every alignment block of the MAF file. 
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+*Inclusive definition* defines CpG sites as those sites that are CG in at least one of the species.
+
+*Restricted definition* considers sites to be CpG if they are CG in at least one of the species; however, sites that are part of overlapping CpGs are excluded.
+
+For more information on CpG site definitions, please refer to this article_.
+
+.. _article: http://mbe.oxfordjournals.org/cgi/content/full/23/3/565
+
+  </help>  
+
+
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/microsatellite_birthdeath.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/microsatellite_birthdeath.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,3984 @@
+#!/usr/bin/perl -w
+use strict;
+use warnings;
+use Term::ANSIColor;
+use Pod::Checker; 
+use File::Basename;
+use IO::Handle;
+use Cwd;
+use File::Path qw(make_path remove_tree);
+use File::Temp qw/ tempfile tempdir /;
+my $tdir = tempdir( CLEANUP => 0 );
+chdir $tdir;
+my $dir = getcwd;
+#print "current dir=$dir\n";
+
+use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags
+$infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species
+$gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies @exacttags);
+use FileHandle;
+use IO::Handle;                     # 5.004 or higher
+
+#my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat",
+#"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12",
+#"10","0.8");
+my @ar = @ARGV;
+my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar;
+#convert the percent similarity threshold to a fraction
+$SIMILARITY_THRESH = $SIMILARITY_THRESH/100;
+#########################
+my $EDGE_DISTANCE = 10;
+my $COMPLEXITY_SUPPORT = 20;
+load_thresholds("9_10_12_12");
+#########################
+
+my $complexity=int($COMPLEXITY_SUPPORT * (1/40));
+
+#print "complexity=$complexity\n";
+#<STDIN>;
+
+#$printer = 1;
+
+my $rando = int(rand(1000));
+my $localdate = `date`;
+$localdate =~ /([0-9]+):([0-9]+):([0-9]+)/;
+my $info = $rando.$1.$2.$3;
+
+#---------------------------------------------------------------------------
+# GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES
+
+my @thresharr = (0, split(/,/,$thresholds));
+my $randno=int(rand(100000));
+my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt";
+my $megamatchlck = $megamatch.".lck";
+unlink $megamatchlck;
+
+#my $selected= $orth;
+#my $eventfile = $orth;
+#$selected = $selected."_SELECTED";
+#$selected = $selected."_".$SIMILARITY_THRESH;
+#my $runtime = $selected.".runtime";
+
+my $inputtags = "H:C:O:R:M";
+$inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/;
+
+my @all_tags = split(/:/, $inputtags);
+my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1";
+$inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/;
+@sp_ident = split(/:/,$inputsp);
+my $junkfile = $orth."_junk";
+
+my $sh = load_sameHash(1);
+my $rh = load_revHash(1);
+
+#print "inputs are : \n"; foreach(@ARGV){print $_,"\n";}
+#open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!";
+open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!";
+#open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!";
+#my $ctlfile = "baseml\.ctl"; #$ARGV[4];
+#my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; 	#1 THIS IS THE TREE UNDER CONSIDERATION, IN NEWICK
+my %registeredTrees = ();
+my @removalReasons =
+("microsatellite is compound",
+"complex structure",
+"if no. of micros is more than no. of species",
+"if more than one micro per species ",
+"if microsat contains N",
+"different motif than required ",
+"more than zero interruptions",
+"microsat could not form key ",
+"orthologous microsats of different motif size ",
+"orthologous microsats of different motifs ",
+"microsats belong to different alignment blocks altogether",
+"microsat near edge",
+"microsat in low complexity region",
+"microsat flanks dont align well",
+"phylogeny not informative");
+my %allowedhash=();
+#---------------------------------------------------------------------------
+# WORKING ON MAKING THE MEGAMATCH FILE
+my $chromt=int(rand(10000
[...]
 $line;
+		if ($line =~ /^a /){
+			$start = 1;
+		}
+
+		if ($line =~ /^s /){
+		#	print "fields1 = $fields[1] , start = $start\n";
+
+			foreach my $sp (@species){
+				if ($fields[1] =~ /$sp/){
+					$species_counter = $species_counter."_".$sp;
+					push(@sequences, $fields[6]);
+					my @sp_info = split(/\./,$fields[1]);
+					my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]);
+					push(@titles, $title);
+				}
+			}
+		}
+
+		if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start == 1)){
+
+			my $arranged = reorderSpecies($species_counter, @species);
+			my $stopper = 1;
+			my $arrno = 0;
+			foreach my $set (@allowedset){
+				if ($arranged eq $set){
+	#				print "$arranged == $set\n";
+					$stopper = 0; last;
+				}
+				$arrno++;
+			}
+
+			if ($stopper == 0) {
+			#	print "    accepted\n";
+				@titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged;
+
+				@sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged;
+				my $filteredseq = filter_gaps(@sequences);
+
+				if ($filteredseq ne "SHORT"){
+					$counter++;
+					print OUT join (" ",$counter, @titles), "\n";
+					print OUT $filteredseq, "\n";
+					print OUT "\n";
+					$countermatch++;
+				}
+			#	my @filtered_seq = split(/\t/,filter_gaps(@sequences) );
+			}
+			else{#print "\n";
+			}
+
+			@sequences = (); @titles = (); $start = 0;$species_counter = "0";
+			next;
+
+		}
+	}
+#	print "countermatch = $countermatch\n";
+}
+
+sub reorderSpecies{
+	my @inarr=@_;
+	my $currSpecies = shift (@inarr);
+	my $ordered_species = 0;
+	my @species=@inarr;
+	foreach my $order (@species){
+		$ordered_species = $ordered_species."_".$order	if	$currSpecies=~ /$order/;
+	}
+	return $ordered_species;
+}
+
+sub filter_gaps{
+	my @sequences = @_;
+#	print "sequences sent are @sequences\n";
+	my $seq_length = length($sequences[0]);
+	my $seq_no = scalar(@sequences);
+	my $allgaps = ();
+	for (1 ... $seq_no){
+		$allgaps = $allgaps."-";
+	}
+
+	my @seq_array = ();
+	my $seq_counter = 0;
+	foreach my $seq (@sequences){
+#		my @sequence = split(/\s*/,$seq);
+		$seq_array[$seq_counter] = [split(/\s*/,$seq)];
+#		push @seq_array, [@sequence];
+		$seq_counter++;
+	}
+	my $g = 0;
+	while ( $g < $seq_length){
+		last if (!exists $seq_array[0][$g]);
+		my $bases = ();
+		for my $u (0 ... $#seq_array){
+			$bases = $bases.$seq_array[$u][$g];
+		}
+#		print $bases, "\n";
+		if ($bases eq $allgaps){
+#			print "bases are $bases, position is $g \n";
+			for my $seq (@seq_array){
+				splice(@$seq , $g, 1);
+			}
+		}
+		else {
+			$g++;
+		}
+	}
+
+	my @outs = ();
+
+	foreach my $seq (@seq_array){
+		push(@outs, join("",@$seq));
+	}
+	return "SHORT" if length($outs[0]) <=100;
+	return (join("\n", @outs));
+}
+
+sub allowedSetOfSpecies{
+	my @allowed_species = split(/_/,$_[0]);
+	unshift @allowed_species, 0;
+#	print "allowed set = @allowed_species \n";
+	my @output = ();
+	for (0 ... scalar(@allowed_species) - 4){
+		push(@output, join("_",@allowed_species));
+		pop @allowed_species;
+	}
+	return join(";",reverse(@output));
+}
+
+sub orderInfo{
+	my @info = split(/;/,$_[0]);
+#	print "info = @info";
+	my @old = split(/_/,$_[1]);
+	my @new = split(/_/,$_[2]);
+	shift @old; shift @new;
+	my @outinfo = ();
+	foreach my $spe (@new){
+		for my $no (0 ... $#old){
+			if ($spe eq $old[$no]){
+				push(@outinfo, $info[$no]);
+			}
+		}
+	}
+#	print "outinfo = @outinfo \n";
+	return join(";", @outinfo);
+}
+
+#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
+
+sub printarr {
+	print ">::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n";
+	foreach my $line (@_) {print "$line\n";}
+	print "::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::<\n";
+}
+
b
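Note that the filter_gaps subroutine above drops alignment columns that are gaps in every species before a block is written out. A minimal Python sketch of the same column-filtering idea (the function name and the 100 bp length floor mirror the Perl; everything else is illustrative)::

    def filter_gaps(sequences, min_length=100):
        """Drop alignment columns that are '-' in every sequence."""
        columns = [col for col in zip(*sequences) if set(col) != {"-"}]
        if not columns:
            return "SHORT"
        # transpose the surviving columns back into one string per species
        outs = ["".join(seq) for seq in zip(*columns)]
        return "SHORT" if len(outs[0]) <= min_length else outs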
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/microsatellite_birthdeath.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/microsatellite_birthdeath.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,63 @@
+<tool id="microsatellite_birthdeath" name="Identify microsatellite births and deaths" version="1.0.0">
+  <description> and causal mutational mechanisms from previously identified orthologous microsatellite sets</description>
+  <command interpreter="perl">
+      microsatellite_birthdeath.pl 
+      $alignment 
+      $orthfile 
+      $outfile 
+      ${alignment.metadata.species} 
+      "$tree_definition" 
+      $thresholds
+      $separation 
+      $simthresh
+
+  </command>
+  <inputs>
+    <page>
+        <param format="maf" name="alignment" type="data" label="Select MAF alignments"/>
+        
+        <param format="txt" name="orthfile" type="data" label="Select raw microsatellite data"/>
+
+     <param name="tree_definition" size="200" type="text" value= "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)" label="Tree definition of all species above whether or not selected for microsatellite extraction" 
+     help="For example: ((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"/>
+      
+       <param name="separation" size="10" type="integer" value="40" label="Total length of flanking DNA used for sequence-similarity comparisons among species"
+     help="A value of 40 means: 20 bp upstream and 20 bp downstream DNA will be used for similarity comparisons."/>

+      <param name="thresholds" size="15" type="text" value="9,10,12,12" label="Minimum Threshold for the number of repeats for microsatellites"
+     help="A value of 9,10,12,12 means: All monos having fewer than 9 repeats, dis having fewer than 5 repeats, tris having fewer than 4 repeats, tetras having fewer than 3 repeats will be excluded from the output."/>
+
+      <param name="simthresh" size="10" type="integer" value="80" label="Percent sequence similarity of flanking regions (of length same as  the above separation distance"
+     help="Enter a value from 0 to 100"/>
+
+
+     </page>
+  </inputs>
+  <outputs>
+    <data format="txt" name="outfile" metadata_source="orthfile"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="alignment" value="chr22_5sp.maf"/>
+      <param name="orthfile" value="chr22_5sp.microraw.tabular"/>
+      <param name="thresholds" value="9,10,12,12"/>
+      <param name="tree_definition" value="((((hg18, panTro2), ponAbe2), rheMac2), calJac1)"/>
+      <param name="separation" value="40"/>
+      <param name="simthresh" value="80"/>
+      <output name="outfile" file="chr22_5sp.microtab.tabular"/>
+    </test>
+  </tests>
+
+
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses raw orthologous microsatellite clusters (identified by the tool "Extract orthologous microsatellites") to identify microsatellite births and deaths along individual lineages of a phylogenetic tree.
+
+</help>  
+
+
+</tool>
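The thresholds parameter is a comma-separated list of minimum microsatellite lengths in base pairs, one value per motif size (mono through tetra); dividing each by its motif size yields the repeat-count minimums quoted in the parameter help. A hedged Python sketch of that conversion (the variable names are illustrative, not part of the tool)::

    thresholds = [int(t) for t in "9,10,12,12".split(",")]
    for motif_size, min_bp in enumerate(thresholds, start=1):
        min_repeats = min_bp // motif_size  # 9, 5, 4, 3 for "9,10,12,12"
        print("motif size %d: keep >= %d repeats (%d bp)" % (motif_size, min_repeats, min_bp))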
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/microsats_alignment_level.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/microsats_alignment_level.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,323 @@
+ #!/usr/bin/env python
+#Guruprasad Ananda
+"""
+Uses SPUTNIK to fetch microsatellites and extracts orthologous repeats from the sputnik output.
+"""
+from galaxy import eggs
+import sys, os, tempfile, string, math, re
+
+def reverse_complement(text):
+    DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
+    comp = [ch for ch in text.translate(DNA_COMP)]
+    comp.reverse()
+    return "".join(comp)
+
+def main():
+    if len(sys.argv) != 8:
+        print >>sys.stderr, "Insufficient number of arguments."
+        sys.exit()
+    
+    infile = open(sys.argv[1],'r')
+    separation = int(sys.argv[2])
+    outfile = sys.argv[3]
+    align_type = sys.argv[4]
+    if align_type == "2way":
+        align_type_len = 2
+    elif align_type == "3way":
+        align_type_len = 3
+    mono_threshold = int(sys.argv[5])
+    non_mono_threshold = int(sys.argv[6])
+    allow_different_units = int(sys.argv[7])
+    
+    print "Min distance = %d bp; Min threshold for mono repeats = %d; Min threshold for non-mono repeats = %d; Allow different motifs = %s" %(separation, mono_threshold, non_mono_threshold, allow_different_units==1)
+    try:
+        fout = open(outfile, "w")
+        print >>fout, "#Block\tSeq1_Name\tSeq1_Start\tSeq1_End\tSeq1_Type\tSeq1_Length\tSeq1_RepeatNumber\tSeq1_Unit\tSeq2_Name\tSeq2_Start\tSeq2_End\tSeq2_Type\tSeq2_Length\tSeq2_RepeatNumber\tSeq2_Unit"
+        #sputnik_cmd = os.path.join(os.path.split(sys.argv[0])[0], "sputnik")
+        sputnik_cmd = "sputnik"
+        input = infile.read()
+        skipped = 0
+        block_num = 0
+        input = input.replace('\r','\n')
+        for block in input.split('\n\n'):
+            block_num += 1
+            tmpin = tempfile.NamedTemporaryFile()
+            tmpout = tempfile.NamedTemporaryFile()
+            tmpin.write(block.strip())
+            blk = tmpin.read()
+            cmdline = sputnik_cmd + " " + tmpin.name + "  > /dev/null 2>&1 >> " + tmpout.name
+            try:
+                os.system(cmdline)
+            except Exception, es:
+                continue
+            sputnik_out = tmpout.read()
+            tmpin.close()
+            tmpout.close()
+            if sputnik_out != "":
+                if len(block.split('>')[1:]) != 2:        #len(sputnik_out.split('>')):
+                    skipped += 1
+                    continue
+                align_block = block.strip().split('>')
+                
+                lendict = {'mononucleotide':1, 'dinucleotide':2, 'trinucleotide':3, 'tetranucleotide':4, 'pentanucleotide':5, 'hexanucleotide':6}
+                blockdict={}
+                r=0
+                namelist=[]
+                for k,sput_block in enumerate(sputnik_out.split('>')[1:]):
+                    whole_seq = ''.join(align_block[k+1].split('\n')[1:]).replace('\n','').strip()
+                    p = re.compile('\n(\S*nucleotide)')
+                    repeats = p.split(sput_block.strip())
+                    repeats_count = len(repeats)
+                    j = 1
+                    name = repeats[0].strip()
+                    try:
+                        coords = re.search('\d+[-_:]\d+',name).group()
+                        coords = coords.replace('_','-').replace(':','-')
+                    except Exception, e:
+                        coords = '0-0'
+                        pass
+                    r += 1
+                    blockdict[r]={}
+                    try:
+                        sp_name = name[:name.index('.')]
+                        chr_name = name[name.index('.'):name.index('(')]
+                        namelist.append(sp_name + chr_name)
+                    except:
+                        namelist.append(name[:20])
+                    while j < repeats_count:
+                        try:
+                            if repeats[j].strip() not in lendict:
+                                j += 2
+                                continue
+                           
[...]
           if coord_s2 == 'marked':
+                            continue
+                        if visited_2[ind] != 0:
+                            continue
+                        coord_e2 = blockdict[2]['ends'][ind2]
+                        out = []
+                        for ind1,coord_s1 in enumerate(blockdict[1]['starts']):
+                            if coord_s1 == 'marked':
+                                continue
+                            coord_e1 = blockdict[1]['ends'][ind1]
+                            #skip if the 2 repeats are not of the same type or don't have the same repeating unit.
+                            if allow_different_units == 0:
+                                if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]):
+                                    continue
+                                else:
+                                    if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2):# and reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2:
+                                        continue
+                            #skip if the repeat number thresholds are not met
+                            if blockdict[1]['types'][ind1] == 'mononucleotide':
+                                if (int(blockdict[1]['counts'][ind1]) < mono_threshold):
+                                    continue
+                            else:
+                                if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold):
+                                    continue
+                            
+                            if blockdict[2]['types'][ind2] == 'mononucleotide':
+                                if (int(blockdict[2]['counts'][ind2]) < mono_threshold):
+                                    continue
+                            else:
+                                if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold):
+                                    continue
+                            
+                            if (coord_s2 in range(coord_s1,coord_e1)) or (coord_e2 in range(coord_s1,coord_e1)):
+                                out.append(str(block_num)) 
+                                out.append(namelist[0])
+                                rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1]
+                                rel_end = rel_start + int(blockdict[1]['lengths'][ind1]) 
+                                out.append(str(rel_start))
+                                out.append(str(rel_end))
+                                out.append(blockdict[1]['types'][ind1])
+                                out.append(blockdict[1]['lengths'][ind1])
+                                out.append(blockdict[1]['counts'][ind1])
+                                out.append(blockdict[1]['units'][ind1])
+                                out.append(namelist[1])
+                                rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2]
+                                rel_end = rel_start + int(blockdict[2]['lengths'][ind2]) 
+                                out.append(str(rel_start))
+                                out.append(str(rel_end))
+                                out.append(blockdict[2]['types'][ind2])
+                                out.append(blockdict[2]['lengths'][ind2])
+                                out.append(blockdict[2]['counts'][ind2])
+                                out.append(blockdict[2]['units'][ind2])
+                                print >>fout, '\t'.join(out)
+                                visited_2[ind2] = 1
+                                out=[]
+                            
+                    #print >>fout, blockdict
+    except Exception, exc:
+        print >>sys.stderr, "type(exc),args,exc: %s, %s, %s" %(type(exc), exc.args, exc)
+
+if __name__ == "__main__":
+    main()
+    
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/microsats_alignment_level.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/microsats_alignment_level.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,61 @@
+<tool id="microsats_align1" name="Extract Orthologous Microsatellites">
+  <description> from pair-wise alignments</description>
+  <command interpreter="python">
+   microsats_alignment_level.py $input1 $separation $out_file1 "2way" $mono_threshold $non_mono_threshold $allow_different_units
+  </command>
+  <inputs>
+    <page>
+     <param format="fasta" name="input1" type="data" label="Select data"/>
+     <param name="separation" size="10" type="integer" value="10" label="Minimum base pair distance between adjacent microsatellites"
+     help="A value of 10 means: Adjacent microsatellites separated by less than 10 base pairs will be excluded from the output."/>
+     <param name="mono_threshold" size="10" type="integer" value="9" label="Minimum Threshold for the number of repeats for mononucleotide microsatellites"
+     help="A value of 9 means: All mononucleotide microsatellites having fewer than 9 repeats will be excluded from the output."/>
+     <param name="non_mono_threshold" size="10" type="integer" value="4" label="Minimum Threshold for the number of repeats for non-mononucleotide microsatellites"
+     help="A value of 4 means: All non-mononucleotide microsatellites having fewer than 4 repeats will be excluded from the output."/>
+     <param name="allow_different_units" size="5" type="select" label="Allow orthologous positions to have different microsatellite repeat units/motifs?">
+     <option value="0" selected="true">No</option>
+           <option value="1">Yes</option>
+         </param>
+    </page>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" metadata_source="input1"/>
+  </outputs>
+  <requirements>
+     <requirement type="package">sputnik</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input1" value="2way.maf"/>
+      <param name="separation" value="10"/>
+      <param name="mono_threshold" value="9"/>
+      <param name="non_mono_threshold" value="4"/>
+      <param name="allow_different_units" value="0"/>
+      <output name="out_file1" file="ortho_ms.tab"/>
+    </test>
+  </tests>
+
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses a modified version of SPUTNIK to fetch microsatellite repeats from the input fasta sequences and extracts orthologous repeats from the sputnik output. The modified version allows detection of mononucleotide microsatellites. More information on SPUTNIK can be found on this website_. The modified version is available here_.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- Any block not containing exactly 2 species will be omitted. 
+
+- This tool will filter out microsatellites based on the user input values for minimum distance and repeat number thresholds. Further, this tool will also filter out microsatellites that have no orthologous microsatellites in one of the species.
+
+.. _website: http://espressosoftware.com/pages/sputnik.jsp   
+.. _here: http://www.bx.psu.edu/svn/universe/dependencies/sputnik/
+</help>  
+
+
+</tool>
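In the script above, two repeats at orthologous positions are paired only if their units match up to rotation, tested with the "u1 in u2*2" idiom (a string is a rotation of another exactly when it occurs in that other string doubled); a strand-flipped comparison via reverse_complement is present in the upload but commented out. A small Python sketch of both helpers, assuming equal-length units::

    def reverse_complement(text):
        comp = {"A": "T", "C": "G", "G": "C", "T": "A"}
        return "".join(comp.get(ch, ch) for ch in reversed(text.upper()))

    def units_match(u1, u2):
        # "AG" vs "GA" -> True, because "AG" occurs in "GAGA".
        return len(u1) == len(u2) and u1 in u2 * 2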
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/microsats_mutability.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/microsats_mutability.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,489 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+This tool computes microsatellite mutability for the orthologous microsatellites fetched from  'Extract Orthologous Microsatellites from pair-wise alignments' tool.
+"""
+from galaxy import eggs
+import sys, string, re, commands, tempfile, os, fileinput
+from galaxy.tools.util.galaxyops import *
+from bx.intervals.io import *
+from bx.intervals.operations import quicksect
+
+fout = open(sys.argv[2],'w')
+p_group = int(sys.argv[3])        #primary "group-by" feature
+p_bin_size = int(sys.argv[4])
+s_group = int(sys.argv[5])        #sub-group by feature
+s_bin_size = int(sys.argv[6])
+mono_threshold = 9
+non_mono_threshold = 4
+p_group_cols = [p_group, p_group+7]
+s_group_cols = [s_group, s_group+7]
+num_generations = int(sys.argv[7])
+region = sys.argv[8] 
+int_file = sys.argv[9]
+if int_file != "None": #User has specified an interval file
+    try:
+        fint = open(int_file, 'r')
+        dbkey_i = sys.argv[10]
+        chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] )
+    except:
+        stop_err("Unable to open input Interval file")
+    
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def reverse_complement(text):
+    DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
+    comp = [ch for ch in text.translate(DNA_COMP)]
+    comp.reverse()
+    return "".join(comp)
+
+def get_unique_elems(elems):
+    seen=set()
+    return[x for x in elems if x not in seen and not seen.add(x)]
+
+def get_binned_lists(uniqlist, binsize):
+    binnedlist=[]
+    uniqlist.sort()
+    start = int(uniqlist[0])
+    bin_ind=0
+    l_ind=0
+    binnedlist.append([])
+    while l_ind < len(uniqlist):
+        elem = int(uniqlist[l_ind])
+        if elem in range(start,start+binsize):
+            binnedlist[bin_ind].append(elem)
+        else:
+            start += binsize
+            bin_ind += 1
+            binnedlist.append([])
+            binnedlist[bin_ind].append(elem)
+        l_ind += 1
+    return binnedlist
+
+def fetch_weight(H,C,t):
+    if (H-(C-H)) < t:
+        return 2.0
+    else:
+        return 1.0
+
+def mutabilityEstimator(repeats1,repeats2,thresholds):
+    mut_num = 0.0    #Mutability Numerator
+    mut_den = 0.0    #Mutability denominator
+    for ind,H in enumerate(repeats1):
+        C = repeats2[ind]
+        t = thresholds[ind]
+        w = fetch_weight(H,C,t)
+        mut_num += ((H-C)*(H-C)*w)
+        mut_den += w
+    return [mut_num, mut_den]
+
+def output_writer(blk, blk_lines):
+    global winspecies, speciesind
+    all_elems_1=[]
+    all_elems_2=[]
+    all_s_elems_1=[]
+    all_s_elems_2=[]
+    for bline in blk_lines:
+        if not(bline):
+            continue
+        items = bline.split('\t')
+        seq1 = items[1]
+        start1 = items[2]
+        end1 = items[3]
+        seq2 = items[8]
+        start2 = items[9]
+        end2 = items[10] 
+        if p_group_cols[0] == 6:
+            items[p_group_cols[0]] = int(items[p_group_cols[0]])
+            items[p_group_cols[1]] = int(items[p_group_cols[1]])
+        if s_group_cols[0] == 6:
+            items[s_group_cols[0]] = int(items[s_group_cols[0]])
+            items[s_group_cols[1]] = int(items[s_group_cols[1]])
+        all_elems_1.append(items[p_group_cols[0]])    #primary col elements for species 1
+        all_elems_2.append(items[p_group_cols[1]])    #primary col elements for species 2
+        if s_group_cols[0] != -1:    #sub-group is not None
+            all_s_elems_1.append(items[s_group_cols[0]])    #secondary col elements for species 1
+            all_s_elems_2.append(items[s_group_cols[1]])    #secondary col elements for species 2
+    uniq_elems_1 = get_unique_elems(all_elems_1)
+    uniq_elems_2 = get_unique_elems(all_elems_2)
+    if s_group_cols[0] != -1:
+        uniq_s_elems_1 = get_unique_elems(all_s_elems_1)
+        uniq_s_elems_2 = get_unique_elems(all_s_elems_2)
+    mut1={}
+    mut2={}
+    
[...]
+            break
+        if i == 30:
+            break # Hopefully we'll never get here...
+    
+    if len( elems ) != 15:
+        stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." )
+    global winspecies, speciesind
+    if region == 'win':
+        if dbkey_i in elems[1]:
+            winspecies = 1
+            speciesind = 1 
+        elif dbkey_i in elems[8]:
+            winspecies = 2
+            speciesind = 8
+        else:
+            stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.") 
+        
+    fin = open(infile, 'r')
+    skipped = 0
+    blk=0
+    win=0
+    linestr=""
+    
+    if region == 'win':
+        
+        msats = NiceReaderWrapper( fileinput.FileInput( infile ),
+                                chrom_col = speciesind,
+                                start_col = speciesind+1,
+                                end_col = speciesind+2,
+                                strand_col = -1,
+                                fix_strand = True)
+        msatTree = quicksect.IntervalTree()
+        for item in msats:
+            if type( item ) is GenomicInterval:
+                msatTree.insert( item, msats.linenum, item.fields )
+        
+        for iline in fint:
+            try:
+                iline = iline.rstrip('\r\n')
+                if not(iline) or iline == "":
+                    continue
+                ielems = iline.strip("\r\n").split('\t')
+                ichr = ielems[chr_col_i]
+                istart = int(ielems[start_col_i])
+                iend = int(ielems[end_col_i])
+                isrc = "%s.%s" %(dbkey_i,ichr)
+                if isrc not in msatTree.chroms:
+                    continue
+                result = []
+                root = msatTree.chroms[isrc]    #root node for the chrom
+                counter(root, istart, iend, lambda node: result.append( node ))
+                if not(result):
+                    continue
+                tmpfile1 = tempfile.NamedTemporaryFile('wb+')
+                for node in result:
+                    tmpfile1.write("%s\n" % "\t".join( node.other ))
+                
+                tmpfile1.seek(0)
+                output_writer(iline, tmpfile1.readlines())
+            except:
+                skipped+=1
+        if skipped:
+            print "Skipped %d intervals as invalid." %(skipped)
+    elif region == 'align':
+        if s_group_cols[0] != -1:
+            print >>fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount"
+        else:
+            print >>fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount"
+        prev_bnum = -1
+        try:
+            for line in fin:
+                line = line.strip("\r\n")
+                if not(line) or line == "":
+                    continue
+                elems = line.split('\t')
+                try:
+                    assert int(elems[0])
+                    assert len(elems) == 15
+                except:
+                    continue
+                new_bnum = int(elems[0])
+                if new_bnum != prev_bnum:
+                    if prev_bnum != -1:
+                        output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
+                    linestr = line + "\n"
+                else:
+                    linestr += line
+                    linestr += "\n"
+                prev_bnum = new_bnum
+            output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
+        except Exception, ea:
+            print >>sys.stderr, ea
+            skipped += 1
+        if skipped:
+            print "Skipped %d lines as invalid." %(skipped)
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/microsats_mutability.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/microsats_mutability.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,121 @@
+<tool id="microsats_mutability1" name="Estimate microsatellite mutability" version="1.1.0">
+  <description>by specified attributes</description>
+  <command interpreter="python">
+   microsats_mutability.py 
+   $input1 
+   $out_file1 
+   ${pri_condition.primary_group} 
+   #if $pri_condition.primary_group == "6":
+      ${pri_condition.binsize} ${pri_condition.subgroup} -1 
+    #else:
+      0 ${pri_condition.sub_condition.subgroup} 
+      #if $pri_condition.sub_condition.subgroup == "6":
+       ${pri_condition.sub_condition.s_binsize}
+      #else:
+       -1
+      #end if
+    #end if
+   $gens
+    ${region.type}
+    #if $region.type == "win":
+      ${region.input2} $input2.dbkey $input2.metadata.chromCol,$input2.metadata.startCol,$input2.metadata.endCol,$input2.metadata.strandCol
+    #else:
+      "None"
+    #end if
+  </command>
+  <inputs>
+    <page>
+      <param name="input1" type="data" format="tabular" label="Select dataset containing Orthologous microsatellites"/>
+      <conditional name="region">
+       <param name="type" type="select" label="Estimate rates corresponding to" multiple="false">
+          <option value="align">Alignment block</option>
+          <option value="win">Intervals in your history</option>
+      </param>
+      <when value="win">
+        <param format="interval" name="input2" type="data" label="Choose intervals">
+        <validator type="unspecified_build" />
+      </param>
+       </when>
+       <when value="align" />
+      </conditional>
+      <param name="gens" size="10" type="integer" value="1" label="Number of generations between the two species in input file"/>
+      <conditional name="pri_condition">
+       <param name="primary_group" type="select" label="Group by" multiple="false">
+          <option value="4">Motif type (mono/di/tri etc.)</option>
+          <option value="7">Repeat Unit (AG, GCT etc.)</option>
+          <option value="6">Repeat Number </option>
+       </param>
+       <when value="6">
+        <param name="binsize" size="10" type="integer" value="1" label="Bin-size" help="Bin-size denotes the number of repeat numbers to be considered as a group. Bin-size of 5 will group every 5 consecutive repeat numbers into a group."/>
+        <param name="subgroup" type="select" label="Sub-group by" multiple="false">
+       <option value="-1">None</option>
+   <option value="4">Motif type (mono/di/tri etc.)</option>
+   <option value="7">Repeat Unit (AG, GCT etc.)</option>
+ </param>
+       </when>
+       <when value="7">
+         <conditional name="sub_condition">
+         <param name="subgroup" type="select" label="Sub-group by" multiple="false">
+       <option value="-1">None</option>
+  <option value="4">Motif type (mono/di/tri etc.)</option>
+  <option value="6">Repeat Number </option>
+    </param>
+    <when value="-1"></when>
+        <when value="4"></when>
+        <when value="6">
+          <param name="s_binsize" size="10" type="integer" value="1" label="Bin size" help="Bin-size denotes the number of repeat numbers to be considered as a group. Bin-size of 5 will group every 5 consecutive repeat numbers into a group."/>
+        </when>
+ </conditional>
+       </when>
+       <when value="4">
+ <conditional name="sub_condition">
+         <param name="subgroup" type="select" label="Sub-group by" multiple="false">
+       <option value="-1">None</option>
+  <option value="7">Repeat Unit (AG, GCT etc.)</option>
+  <option value="6">Repeat Number </option>
+    </param>
+    <when value="-1"></when>
+        <when value="7"></when>
+    <when value="6">
+          <param name="s_binsize" size="10" type="integer" value="1" label="Bin size" help="Bin-size denotes the number of repeat numbers to be considered as a group. Bin-size of 5 will group every 5 consecutive repeat numbers into a group."/>
+        </when>
+ </conditional>
+       </when>
+      </conditional>
+    </page>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+  <!-- 
+  <tests>
+    <test>
+      <param name="input1" value="ortho_ms.tab"/>
+      <param name="type" value="align"/>
+      <param name="gens" value="1"/>
+      <param name="primary_group" value="4"/>
+      <param name="sub_condition|subgroup" value="7"/>
+      <output name="out_file1" file="ortho_ms_mut.tab"/>
+    </test>
+  </tests>
+   -->
+<help>
+.. class:: infomark
+
+**What it does**
+
+This tool computes microsatellite mutability for the orthologous microsatellites fetched by the 'Extract Orthologous Microsatellites from pair-wise alignments' tool.
+
+Mutability is computed according to the method described in the following paper:
+
+*Webster et al., Microsatellite evolution inferred from human-chimpanzee genomic sequence alignments, Proc Natl Acad Sci 2002 June 25; 99(13): 8748-8753*
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+The user-selected group-by and sub-group-by features, the computed mutability, and the count of repeats used to compute that mutability are added as columns to the output.
+</help>
+</tool>
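The underlying script estimates mutability as a weighted mean squared difference of repeat counts: each orthologous pair (H, C) contributes w*(H-C)^2, where w = 2.0 when H-(C-H) falls below the motif's repeat threshold and w = 1.0 otherwise. A close Python paraphrase of the script's fetch_weight/mutabilityEstimator pair; this sketch returns the ratio directly rather than the script's [numerator, denominator] pair::

    def fetch_weight(H, C, t):
        # Double-weight pairs whose extrapolated count H - (C - H) is below t.
        return 2.0 if (H - (C - H)) < t else 1.0

    def mutability(repeats1, repeats2, thresholds):
        num = den = 0.0
        for H, C, t in zip(repeats1, repeats2, thresholds):
            w = fetch_weight(H, C, t)
            num += w * (H - C) ** 2
            den += w
        return num / den if den else 0.0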
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,5392 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Term::ANSIColor;
+use File::Basename;
+use IO::Handle;
+use Cwd;
+use File::Path;
+use File::Temp qw/ tempfile tempdir /;
+use vars qw($distance @thresholds @tags $printer $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter);
+
+$ENV{'PATH'} .= ':' . dirname($0);
+my $date = `date`;
+
+my ($mafile, $orthfile, $threshold_array,  $species_set, $tree_definition, $separation) = @ARGV;
+if (!$mafile or !$orthfile or !$threshold_array or !$separation or !$tree_definition or !$species_set) { die "missing arguments\n"; }
+
+#-------------------------------------------------------------------------------
+# WHICH SPUTNIK USED?
+my $sputnikpath = ();
+$sputnikpath = "sputnik";
+#print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath;
+#exit if !-e $sputnikpath;
+#$sputnikpath = "bx-sputnik" ;
+#print "ARGV input = @ARGV\n";
+#print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n  species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n";
+#-------------------------------------------------------------------------------
+# RUNFILE
+#-------------------------------------------------------------------------------
+$distance = 1; #bp
+$distance++;
+#-------------------------------------------------------------------------------
+# MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP)
+$threshold_array=~ s/,/_/g;
+my @thresharr = split("_",$threshold_array);
+my @thresholds=@thresharr;
+my $mono_threshold = $thresharr[0];
+my $di_threshold = $thresharr[1];
+my $tri_threshold = $thresharr[2];
+my $tetra_threshold = $thresharr[3];
+#my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold));
+my $tdir = tempdir( CLEANUP => 0 );
+chdir $tdir;
+my $dir = getcwd;  
+#print "current dit=$dir\n";
+#-------------------------------------------------------------------------------
+# CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY
+my @chrfiles=();
+
+#my $mafile =  "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0];
+my $chromt=int(rand(10000));
+my $p_chr=$chromt;
+
+
+my @exactspeciesset_unarranged = split(/,/,$species_set);
+$tree_definition=~s/[\)\(, ]/\t/g;
+my @treespecies=split(/\t+/,$tree_definition);
+my @exactspecies=();
+
+foreach my $spec (@treespecies){
+	foreach my $espec (@exactspeciesset_unarranged){
+		push @exactspecies, $spec if $spec eq $espec;
+	}
+}
+#print "exactspecies=@exactspecies\n";
+my $focalspec = $exactspecies[0];
+my $arranged_species_set=join(".",@exactspecies);
+my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
+#print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n"; 
+maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set);
+#print "done maf to axt conversion\n";
+my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt");
+artificial_axdata_inverter ($chr_name, $reverse_chr_name);
+#print "reverse_chr_name=$reverse_chr_name\n"; 
+#-------------------------------------------------------------------------------
+# FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT
+foreach my $direct ("reverse_direction","forward_direction"){
+	$p_chr=$chromt;
+	#print "direction = $direct\n";
+	$p_chr = $p_chr."r" if $direct eq "reverse_direction";
+	$p_chr = $p_chr if $direct eq "forward_direction";
+	my $config = $species_set;
+	$config=~s/,/./g;
+	my @orgs = split(/\./,$arranged_species_set);
+	#print "ORGS= @orgs\n";
+	my @tag=@orgs;
+		
+	
+	my $tags = join(",", @tag);
+	my @tags=@tag;
+	chomp $p_chr;
+	$tags = join("_", split(/,/, $tags));
+	my $pchr = "chr".$p_chr;
+	
+	my $ptag = 
[...]
{$key}}\n AND printing the contents:\n" if $prinkter == 1;
+			my @firstcontig= @{$contigclusters{$key}};
+			delete $foundkeys{$key2} if exists $foundkeys{$key2} ;
+			delete $foundkeys{$key} if exists $foundkeys{$key};
+	
+			unshift @pool, pop @firstcontig;
+#			print join("\t",@pool),"\n" if $prinkter == 1;
+			print ORTH join ("\n",@firstcontig),"\n" if scalar(@firstcontig) > 0;
+			print ORTH join ("\t",@pool),"\n";		
+		#	join();
+		}
+	
+	}
+	#close (NORTH);
+#	print "founkeys_entered =$founkeys_enteredcount, plain_transfered=$plain_transfered,existing_removed=$existing_removed,founkeys_count =$founkeys_count, nopath_count =$nopath_count, transfered = $transfered, complete_transfered = $complete_transfered, totalcount = $totalcount, pathed=$pathed_count\n" if $prinkter == 1;
+	close (BO);
+	close (ORTH);
+	close (OUTP);
+	return 1;
+	
+}
+sub stringPainter{
+	my @string  = split(/_/,$_[0]);
+#	print $_[0], " <- in stringPainter\n";
+#	print $_[1], " <- in clusters\n";
+	
+	my @clusters = split(/,/, $_[1]);
+	for my $i (0 ... $#clusters){
+		my $cluster = $clusters[$i];
+#		print "cluster = $cluster\n";
+		my @parts = split(/\./,$cluster);
+		my @cord = split(/:|-/,shift(@parts));
+		my $minstart = $cord[1];
+		my $maxend = $cord[2];
+#		print "minstart = $minstart , maxend = $maxend\n";
+		
+		for my $j (0 ... $#parts){
+#			print "oing thri $parts[$j]\n";
+			my @cord = split(/:|-/,$parts[$j]);
+			$minstart = $cord[1] if $cord[1] < $minstart;
+			$maxend = $cord[2] if $cord[2] > $maxend;
+		}
+#		print "minstart = $minstart , maxend = $maxend\n";
+		for my $pos ($minstart ... $maxend){ $string[$pos] = $string[$pos].",".$cluster;}
+				
+		
+	}
+#	print "@string <-done from function stringPainter\n";
+	return join("_",@string);
+}
+
+sub findClusters{
+	my $continue = 0;
+	my @mapped_clusters = ();	
+	my $clusterdist = $_[1];
+	my $previous = 'x';
+	my @localcluster = ();
+	my $cluster_starts = ();
+	my $cluster_ends = ();
+	my $localcluster_start = ();
+	my $localcluster_end = ();
+	my @record_cluster = ();
+	my @string = split(/\!/, $_[0]);
+	my $zerolength=0;
+	
+	for my $pos_pos (1 ... $#string){
+			my $pos = $string[$pos_pos];
+#			print $pos, "\n";
+			if ($continue == 0 && $pos eq "x") {next;}
+			
+			if ($continue == 1 && $pos eq "x" && $zerolength <= $clusterdist){ 
+				if ($zerolength == 0) {$localcluster_end = $pos_pos-1};
+				$zerolength++; 
+				$continue = 1; 
+			}
+
+			if ($continue == 1 && $pos eq "x" && $zerolength > $clusterdist) { 
+				$zerolength = 0; 
+				$continue = 0; 
+				my %seen;
+				my @uniqed = grep !$seen{$_}++, @localcluster;
+#				print "caught cluster : @uniqed \n";
+				push(@mapped_clusters, [@uniqed]);
+#				print "clustered:\n@uniqed\n";
+				@localcluster = ();
+				@record_cluster = ();
+				
+			}
+			
+			if ($pos ne "x"){
+				$zerolength = 0;
+				$continue = 1;
+				$pos =~ s/x,//g;
+				my @entries = split(/,/,$pos);
+				$localcluster_end = 0;
+				$localcluster_start = 0;
+				push(@record_cluster,$pos);
+			
+				if ($continue == 0){
+					@localcluster = ();
+					@localcluster = (@localcluster, @entries);
+					$localcluster_start = $pos_pos;
+				}
+			
+				if ($continue == 1 ) {
+					@localcluster = (@localcluster, @entries);
+				}
+			}
+	}
+	
+	if (scalar(@localcluster) > 0){
+		my %seen;
+		my @uniqed = grep !$seen{$_}++, @localcluster;
+	#	print "caught cluster : @uniqed \n";
+		push(@mapped_clusters, [@uniqed]);
+	#	print "clustered:\n@uniqed\n";
+		@localcluster = ();
+		@record_cluster = ();
+	}
+
+	my @returner = ();
+	
+	foreach my $clust (@mapped_clusters){
+		my @localclust = @$clust;
+		my @result = ();
+		foreach my $clustparts (@localclust){
+			push(@result,$clustparts);
+		}
+		push(@returner , join(".",@result));
+	}	
+#	print "returnig: ", join(",",@returner), "\n";
+	return join(",",@returner);
+}
+#xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx  multiSpecies_orthFinder4 xxxxxxxxxxxxxx  multiSpecies_orthFinder4 xxxxxxxxxxxxxx 
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,57 @@
+<tool id="multispecies_orthologous_microsats" name="Extract orthologous microsatellites" version="1.0.1">
+  <description> for multiple (>2) species alignments</description>
+  <command interpreter="perl">
+    multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl   
+    $input1 
+   $out_file1 
+   $thresholds 
+   $species 
+   "$treedefinition"
+   $separation 
+
+  </command>
+  <inputs>
+    <page>
+        <param format="maf" name="input1" type="data" label="Select MAF alignments"/>
+        <param name="separation" size="10" type="integer" value="10" label="Minimum base pair distance between adjacent microsatellite blocks"
+     help="A value of 10 means: Adjacent microsatellites separated by less than 10 base pairs will be excluded from the output."/>
+     <param name="thresholds" size="15" type="text" value="9,10,12,12" label="Minimum Threshold for the number of repeats for microsatellites"
+     help="A value of 9,10,12,12 means: All monos having fewer than 9 repeats, dis having fewer than 5 repeats, tris having fewer than 4 repeats, tetras having fewer than 3 repeats will be excluded from the output."/>
+        <param name="species" type="select" label="Select species" display="checkboxes" multiple="true" help="NOTE: Currently users are requested to select one of these three combinations: hg18-panTro2-ponAbe2, hg18-panTro2-ponAbe2-rheMac2 or hg18-panTro2-ponAbe2-rheMac2-calJac1">
+       <options>
+         <filter type="data_meta" ref="input1" key="species" />
+       </options>
+     </param>
+     <param name="treedefinition" size="200" type="text" value = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)" label="Tree definition of all species above whether or not selected for microsatellite extraction" 
+     help="For example: ((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"/>
+    </page>
+  </inputs>
+  <outputs>
+    <data format="txt" name="out_file1" metadata_source="input1"/>
+  </outputs>
+  <requirements>
+     <requirement type="binary">sputnik</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input1" value="chr22_5sp.maf"/>
+      <param name="thresholds" value="9,10,12,12"/>
+      <param name="species" value="hg18,panTro2,ponAbe2,rheMac2,calJac1"/>
+      <param name="treedefinition" value="((((hg18, panTro2), ponAbe2), rheMac2), calJac1)"/>
+      <param name="separation" value="10"/>
+      <output name="out_file1" file="chr22_5sp.microraw.tabular"/>
+    </test>
+  </tests>
+
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool finds orthologous microsatellite blocks between aligned species.
+  
+</help>  
+
+
+</tool>
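Internally, the Perl script merges microsatellites into orthology clusters whenever their positions lie within a fixed distance of one another (sub findClusters above). A simplified Python sketch of that distance-based clustering; the flat list-of-positions interface is an illustrative assumption, not the script's actual data structure::

    def find_clusters(positions, cluster_dist):
        """Group sorted positions; a gap larger than cluster_dist starts a new cluster."""
        clusters, current = [], []
        for pos in sorted(positions):
            if current and pos - current[-1] > cluster_dist:
                clusters.append(current)
                current = []
            current.append(pos)
        if current:
            clusters.append(current)
        return clusters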
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/parseMAF_smallIndels.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/parseMAF_smallIndels.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,698 @@
+#!/usr/bin/perl -w
+# a program to get indels 
+# input is a MAF format 3-way alignment file
+# from 3-way blocks only at this time
+# translate seq2, seq3, etc coordinates to + if align orient is reverse complement
+ 
+use strict;
+use warnings;
+
+# declare and initialize variables
+my $fh; # variable to store filehandle
+my $record;
+my $offset;
+my $library = $ARGV[0]; 
+my $count = 0;
+my $count2 = 0;
+my $count3 = 0;
+my $count4 = 0;
+my $start1 = my $start2 = my $start3 = my $start4 = my $start5 = my $start6 = 0;
+my $orient = "";
+my $outgroup = $ARGV[2];
+my $ingroup1 = my $ingroup2 = "";
+my $count_seq1insert = my $count_seq1delete = 0;
+my $count_seq2insert = my $count_seq2delete = 0;
+my $count_seq3insert = my $count_seq3delete = 0;
+my @seq1_insert_lengths = my @seq1_delete_lengths = ();
+my @seq2_insert_lengths = my @seq2_delete_lengths = ();
+my @seq3_insert_lengths = my @seq3_delete_lengths = ();
+my @seq1_insert =  my @seq1_delete =  my @seq2_insert =  my @seq2_delete =  my @seq3_insert =  my @seq3_delete = ();
+my @seq1_insert_startOnly = my @seq1_delete_startOnly = my @seq2_insert_startOnly = my @seq2_delete_startOnly = ();
+my @seq3_insert_startOnly = my @seq3_delete_startOnly = ();
+my @indels = (); 
+
+# check to make sure correct files
+my $usage = "usage: parseMAF_smallIndels.pl [MAF.in] [small_Indels_summary.out] [outgroup]\n";
+die $usage unless @ARGV == 3;
+
+# perform some standard subroutines 
+$fh = open_file($library);
+
+$offset = tell($fh);
+
+#my $ofile = $ARGV[2];
+#unless (open(OFILE, ">$ofile")){
+#	 print "Cannot open output file \"$ofile\"\n\n";
+#	 exit;
+#}
+
+my $ofile2 = $ARGV[1];
+unless (open(OFILE2, ">$ofile2")){
+         print "Cannot open output file \"$ofile2\"\n\n";
+         exit;
+}
+
+
+# header line for output files
+#print OFILE "# small indel events, parsed from MAF 3-way alignment file, coords are translated from (-) to (+) if necessary\n";
+#print OFILE "#align\tingroup1\tingroup1_coord\tingroup1_orient\tingroup2\tingroup2_coord\tingroup2_orient\toutgroup\toutgroup_coord\toutgroup_orient\tindel_type\n";
+
+#print OFILE2 "# small indels summary, parsed from MAF 3-way alignment file, coords are translated from (-) to (+) if necessary\n";
+print OFILE2 "#block\tindel_type\tindel_length\tingroup1\tingroup1_start\tingroup1_end\tingroup1_alignSize\tingroup1_orient\tingroup2\tingroup2_start\tingroup2_end\tingroup2_alignSize\tingroup2_orient\toutgroup\toutgroup_start\toutgroup_end\toutgroup_alignSize\toutgroup_orient\n";
+
+# main body of program
+while ($record = get_next_record($fh) ){
+	if ($record=~ m/\s*##maf(.*)\s*# maf/s){
+		next;
+	}
+
+	my @sequences = get_sequences_within_block($record);
+	my @seq_info = get_indels_within_block(@sequences);
+	get_indels_lengths(@seq_info);
+	
+	$offset = tell($fh);
+        $count++;
+        
+}
+
+get_starts_only(@seq1_insert);
+get_starts_only(@seq1_delete);
+get_starts_only(@seq2_insert);
+get_starts_only(@seq2_delete);
+get_starts_only(@seq3_insert);
+get_starts_only(@seq3_delete);
+
+# print some things to keep track of progress
+#print "# $library\n";
+#print "# number of records = $count\n";
+#print "# number of sequence \"s\" lines = $count2\n";
+if ($count3 != 0){
+	print "Skipped $count3 blocks with only 2 seqs;\n";
+}
+#print "# number of records with only h-m = $count4\n\n";
+
+print "Ingroup1 = $ingroup1; Ingroup2 = $ingroup2; Outgroup = $outgroup;\n";
+print "# of ingroup1 inserts = $count_seq1insert;\n";
+print "# of ingroup1 deletes = $count_seq1delete;\n";
+print "# of ingroup2 inserts = $count_seq2insert;\n";
+print "# of ingroup2 deletes = $count_seq2delete;\n";
+print "# of outgroup3 inserts = $count_seq3insert;\n";
+print "# of outgroup3 deletes = $count_seq3delete\n";
+
+
+#close OFILE;
+
+if ($count == $count3){
+	print STDERR "Skipped all blocks since none of them contain 3-way alignments.\n";
+  	exit -1;
+}
+
+###################SUBROUTINES#####################################
+
+# subr
[...]
s for seq2 and seq3
+			# remember for seq1, the gap spans (coord - 1) --> coord
+			$seq1_event_start = ($events[2]-1);
+			$seq1_event_end = ($events[2]);
+                        $seq2_event_start = ($events[5]);
+                        $seq2_event_end = ($events[5]+$events[11]-1);
+                        $seq3_event_start = ($events[8]);
+                        $seq3_event_end = ($events[8]+$events[11]-1);
+			$final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9]));
+		}
+		# seq2_insert
+		elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/insert/){	
+			# only increase coords for seq2 
+			# remember that other two sequnences, the gap spans (coord - 1) --> coord
+                        $seq1_event_start = ($events[2]-1);
+                        $seq1_event_end = ($events[2]);
+			$seq2_event_start = ($events[5]);
+                        $seq2_event_end = ($events[5]+$events[11]-1);
+                        $seq3_event_start = ($events[8]-1);
+			$seq3_event_end = ($events[8]);			
+			$final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9]));
+		}
+		# seq2_delete
+		elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/delete/){
+			# only increase coords for seq1 and seq3
+			# remember for seq2, the gap spans (coord - 1) --> coord
+                        $seq1_event_start = ($events[2]);
+			$seq1_event_end = ($events[2]+$events[11]-1);	
+                        $seq2_event_start = ($events[5]-1);
+	                $seq2_event_end = ($events[5]);
+                        $seq3_event_start = ($events[8]);
+                        $seq3_event_end = ($events[8]+$events[11]-1);
+			$final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9]));
+		}	
+		# start testing w/seq3_insert
+		elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/insert/){
+			# only increase coord for rheMac
+			# remember that other two sequnences, the gap spans (coord - 1) --> coord
+			$seq1_event_start = ($events[2]-1);
+			$seq1_event_end = ($events[2]);
+			$seq2_event_start = ($events[5]-1);
+			$seq2_event_end = ($events[5]);
+			$seq3_event_start = ($events[8]);
+			$seq3_event_end = ($events[8]+$events[11]-1);
+			$final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9]));
+		}
+		# seq3_delete
+		elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/delete/){
+			# only increase coords for seq1 and seq2
+			# remember for seq3, the gap spans (coord - 1) --> coord
+			$seq1_event_start = ($events[2]);
+			$seq1_event_end = ($events[2]+$events[11]-1);
+			$seq2_event_start = ($events[5]);
+			$seq2_event_end = ($events[5]+$events[11]-1);
+			$seq3_event_start = ($events[8]-1);
+			$seq3_event_end = ($events[8]);
+			$final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9]));
+
+		}
+		
+		print OFILE2 "$final_event_line\n";
+	}
+}
+close OFILE2;
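The event-coordinate bookkeeping above follows one convention throughout: in the sequence that carries the gap, an indel spans (coord - 1) --> coord, while in the sequences that carry bases it spans coord --> (coord + length - 1). A hedged Python sketch of that rule (the helper name is invented for illustration)::

    def event_span(start, length, has_gap):
        # Sequence containing the gap: the event falls between two bases.
        if has_gap:
            return (start - 1, start)
        # Sequence carrying bases: the event covers `length` positions.
        return (start, start + length - 1)

For a 3 bp insertion in one ingroup at coordinate 1000, that species spans 1000-1002 while each gapped species spans 999-1000, matching the insert/delete branches above.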
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/quality_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/quality_filter.py Fri Mar 09 19:37:19 2012 -0500
[
b'@@ -0,0 +1,245 @@\n+#!/usr/bin/env python\n+#Guruprasad Ananda\n+"""\n+Filter based on nucleotide quality (PHRED score).\n+\n+usage: %prog input out_file primary_species mask_species score mask_char mask_region mask_region_length\n+"""\n+\n+\n+from __future__ import division\n+from galaxy import eggs\n+import pkg_resources \n+pkg_resources.require( "bx-python" )\n+pkg_resources.require( "lrucache" )\n+try:\n+    pkg_resources.require("numpy")\n+except:\n+    pass\n+\n+import psyco_full\n+import sys\n+import os, os.path\n+from UserDict import DictMixin\n+from bx.binned_array import BinnedArray, FileBinnedArray\n+from bx.bitset import *\n+from bx.bitset_builders import *\n+from fpconst import isNaN\n+from bx.cookbook import doc_optparse\n+from galaxy.tools.exception_handling import *\n+import bx.align.maf\n+\n+class FileBinnedArrayDir( DictMixin ):\n+    """\n+    Adapter that makes a directory of FileBinnedArray files look like\n+    a regular dict of BinnedArray objects. \n+    """\n+    def __init__( self, dir ):\n+        self.dir = dir\n+        self.cache = dict()\n+    def __getitem__( self, key ):\n+        value = None\n+        if key in self.cache:\n+            value = self.cache[key]\n+        else:\n+            fname = os.path.join( self.dir, "%s.qa.bqv" % key )\n+            if os.path.exists( fname ):\n+                value = FileBinnedArray( open( fname ) )\n+                self.cache[key] = value\n+        if value is None:\n+            raise KeyError( "File does not exist: " + fname )\n+        return value\n+\n+def stop_err(msg):\n+    sys.stderr.write(msg)\n+    sys.exit()\n+\n+def load_scores_ba_dir( dir ):\n+    """\n+    Return a dict-like object (keyed by chromosome) that returns \n+    FileBinnedArray objects created from "key.ba" files in `dir`\n+    """\n+    return FileBinnedArrayDir( dir )\n+\n+def bitwise_and ( string1, string2, maskch ):\n+    result=[]\n+    for i,ch in enumerate(string1):\n+        try:\n+            ch = int(ch)\n+        except:\n+            pass\n+        if string2[i] == \'-\':\n+            ch = 1\n+        if ch and string2[i]:\n+            result.append(string2[i])\n+        else:\n+            result.append(maskch)\n+    return \'\'.join(result)\n+\n+def main():   \n+    # Parsing Command Line here\n+    options, args = doc_optparse.parse( __doc__ )\n+    \n+    try:\n+        #chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols )\n+        inp_file, out_file, pri_species, mask_species, qual_cutoff, mask_chr, mask_region, mask_length, loc_file = args\n+        qual_cutoff = int(qual_cutoff)\n+        mask_chr = int(mask_chr)\n+        mask_region = int(mask_region)\n+        if mask_region != 3:\n+            mask_length = int(mask_length)\n+        else:\n+            mask_length_r = int(mask_length.split(\',\')[0])\n+            mask_length_l = int(mask_length.split(\',\')[1])\n+    except:\n+        stop_err( "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset." )\n+    \n+    if pri_species == \'None\':\n+        stop_err( "No primary species selected, try again by selecting at least one primary species." )\n+    if mask_species == \'None\':\n+        stop_err( "No mask species selected, try again by selecting at least one species to mask." 
)
+
+    mask_chr_count = 0
+    mask_chr_dict = {0:'#', 1:'$', 2:'^', 3:'*', 4:'?', 5:'N'}
+    mask_reg_dict = {0:'Current pos', 1:'Current+Downstream', 2:'Current+Upstream', 3:'Current+Both sides'}
+
+    #ensure dbkey is present in the twobit loc file
+    filepath = None
+    try:
+        pspecies_all = pri_species.split(',')
+        pspecies_all2 = pri_species.split(',')
+        pspecies = []
+        filepaths = []
+        for line in open(loc_file):
+            if pspecies_all2 == []:
+                break
+            if line[0:1] == "#":
+                continue
+            fields = line.split('\t')
+            try:
+                build = fields[0]
+                for i,dbkey in enumerate(pspecies_all2):
+                  '..b'               sequence = block.components[seq].text
+                s_start = block.components[seq].start
+                size = len(sequence)    #this includes the gaps too
+                status_str = '1'*size
+                status_list = list(status_str)
+                if status_strings == []:
+                    status_strings.append(status_str)
+                ind = 0
+                s_end = block.components[seq].end
+                #Get scores for the entire sequence
+                try:
+                    scores = scores_by_chrom[index][chr][s_start:s_end]
+                except:
+                    continue
+                pos = 0
+                while pos < (s_end-s_start):
+                    if sequence[ind] == '-':    #No score for GAPS
+                        ind += 1
+                        continue
+                    score = scores[pos]
+                    if score < qual_cutoff:
+                        score = 0
+
+                    if not(score):
+                        if mask_region == 0:    #Mask Corresponding position only
+                            status_list[ind] = '0'
+                            ind += 1
+                            pos += 1
+                        elif mask_region == 1:    #Mask Corresponding position + downstream neighbors
+                            for n in range(mask_length+1):
+                                try:
+                                    status_list[ind+n] = '0'
+                                except:
+                                    pass
+                            ind = ind + mask_length + 1
+                            pos = pos + mask_length + 1
+                        elif mask_region == 2:    #Mask Corresponding position + upstream neighbors
+                            for n in range(mask_length+1):
+                                try:
+                                    status_list[ind-n] = '0'
+                                except:
+                                    pass
+                            ind += 1
+                            pos += 1
+                        elif mask_region == 3:    #Mask Corresponding position + neighbors on both sides
+                            for n in range(-mask_length_l,mask_length_r+1):
+                                try:
+                                    status_list[ind+n] = '0'
+                                except:
+                                    pass
+                            ind = ind + mask_length_r + 1
+                            pos = pos + mask_length_r + 1
+                    else:
+                        pos += 1
+                        ind += 1
+
+                status_strings.append(''.join(status_list))
+
+        if status_strings == []:    #this block has no primary species
+            continue
+        output_status_str = status_strings[0]
+        for stat in status_strings[1:]:
+            try:
+                output_status_str = bitwise_and (output_status_str, stat, '0')
+            except Exception, e:
+                break
+
+        for seq in range (len(block.components)):
+            src = block.components[seq].src
+            dbkey = src.split('.')[0]
+            if dbkey not in mask_species.split(','):
+                continue
+            sequence = block.components[seq].text
+            sequence = bitwise_and (output_status_str, sequence, mask_chr_dict[mask_chr])
+            block.components[seq].text = sequence
+            mask_chr_count += output_status_str.count('0')
+        maf_writer.write(block)
+        maf_count += 1
+
+    maf_reader.close()
+    maf_writer.close()
+    print "No. of blocks = %d; No. of masked nucleotides = %s; Mask character = %s; Mask region = %s; Cutoff used = %d" %(maf_count, mask_chr_count, mask_chr_dict[mask_chr], mask_reg_dict[mask_region], qual_cutoff)
+
+
+if __name__ == "__main__":
+    main()
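The masking loop above builds a per-column status string ('1' = keep, '0' = mask) and then combines it with each sequence via the bitwise_and helper, whose definition falls in a part of this file elided by the dump. A minimal sketch of what that helper is assumed to do, with an invented example::

    def bitwise_and(status, sequence, mask_ch):
        #wherever the status string holds '0' (low quality), substitute the
        #mask character; positions marked '1' keep the original nucleotide
        out = []
        for s, c in zip(status, sequence):
            if s == '0':
                out.append(mask_ch)
            else:
                out.append(c)
        return ''.join(out)

    #bitwise_and('1101', 'ACGT', '#') returns 'AC#T'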
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/quality_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/quality_filter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,115 @@
+<tool id="qualityFilter" name="Filter nucleotides" version="1.0.1">
+  <description> based on quality scores</description>
+  <command interpreter="python">
+   quality_filter.py 
+   $input 
+   $out_file1 
+   $primary_species 
+   $mask_species 
+   $score 
+   $mask_char 
+   ${mask_region.region} 
+   #if $mask_region.region == "3"
+   ${mask_region.lengthr},${mask_region.lengthl}
+   #elif $mask_region.region == "0"
+   1
+   #else
+   ${mask_region.length}
+   #end if
+   ${GALAXY_DATA_INDEX_DIR}/quality_scores.loc
+  </command>
+  <inputs>
+    <param format="maf" name="input" type="data" label="Select data"/>
+    <param name="primary_species" type="select" label="Use quality scores of" display="checkboxes" multiple="true">
+      <options>
+        <filter type="data_meta" ref="input" key="species" />
+      </options>  
+    </param>
+ <param name="mask_species" type="select" label="Mask Species" display="checkboxes" multiple="true">
+      <options>
+        <filter type="data_meta" ref="input" key="species" />
+      </options>  
+ </param>
+ <param name="score" size="10" type="integer" value="20" label="Quality score cut-off" help="Cut-off value of 20 means mask all nucleotides having quality score less than or equal to 20"/>
+ <param name="mask_char" size="5" type="select" label="Mask character">
+      <option value="0" selected="true">#</option>
+      <option value="1">$</option>
+      <option value="2">^</option>
+      <option value="3">*</option>
+      <option value="4">?</option>
+      <option value="5">N</option>
+    </param>
+ <conditional name="mask_region">
+      <param name="region" type="select" label="Mask region">
+        <option value="0" selected="true">Only the corresponding nucleotide </option>
+        <option value="1">Corresponding column + right-side neighbors</option>
+        <option value="2">Corresponding column + left-side neighbors</option>
+        <option value="3">Corresponding column + neighbors on both sides</option>
+      </param>
+      <when value="0">
+      </when>
+      <when value="1">
+        <param name="length" size="10" type="integer" value="2" label="Number of right-side neighbors"/>
+      </when>
+      <when value="2">
+        <param name="length" size="10" type="integer" value="2" label="Number of left-side neighbors"/>
+      </when>
+      <when value="3">
+        <param name="lengthr" size="10" type="integer" value="2" label="Number of neighbors on right-side" />
+        <param name="lengthl" size="10" type="integer" value="2" label="Number of neighbors on left-side" />
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="maf" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <requirements>
+    <requirement type="python-module">numpy</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input" value="6.maf"/>
+      <param name="primary_species" value="panTro2"/>
+      <param name="mask_species" value="hg18"/>
+      <param name="score" value="50"/>
+      <param name="mask_char" value="0"/>
+      <param name="region" value="0" />
+      <output name="out_file1" file="6_quality_filter.maf"/>
+    </test>
+  </tests>
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool takes a MAF file as input and filters nucleotides in every alignment block of the MAF file based on their quality/PHRED scores. 
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Any blocks not containing the primary species (the species whose quality scores are to be used) will be omitted. 
+Also, any primary species whose quality scores are not available in Galaxy will be treated as a non-primary species; this information will appear as a message in the job history panel. 
+
+-----
+
+**Example**
+
+- For the following alignment block::
+
+   a score=4050.0
+   s hg18.chrX    3719221 48 - 154913754 tattttacatttaaaataaatatgtaaatatatattttatatttaaaa 
+   s panTro2.chrX 3560945 48 - 155361357 tattttatatttaaaataaagatgtaaatatatattttatatttaaaa 
+
+- running this tool with **Primary species as panTro2**, **Mask species as hg18, panTro2**, **Quality cutoff as 20**, **Mask character as #** and **Mask region as only the corresponding position** will return::
+
+   a score=4050.0
+   s hg18.chrX    3719221 48 - 154913754 ###tttac#####a###a#atatgtaaat###tattt#####ttaaaa 
+   s panTro2.chrX 3560945 48 - 155361357 ###tttat#####a###a#agatgtaaat###tattt#####ttaaaa 
+   
+   where the positions containing # represent panTro2 nucleotides with quality scores less than 20.
+  </help>  
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/qv_to_bqv.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/qv_to_bqv.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+"""
+Adapted from bx/scripts/qv_to_bqv.py
+
+Convert a qual (qv) file to several BinnedArray files for fast seek.
+This script takes approximately 4 seconds per 1 million base pairs.
+
+The input format is fasta style quality -- fasta headers followed by 
+whitespace separated integers.
+
+usage: %prog qual_file output_file
+"""
+
+import pkg_resources 
+pkg_resources.require( "bx-python" )
+pkg_resources.require( "numpy" )
+import string
+import psyco_full
+import sys, re, os, tempfile
+from bx.binned_array import BinnedArrayWriter
+from bx.cookbook import *
+import fileinput
+
+def load_scores_ba_dir( dir ):
+    """
+    Return a dict-like object (keyed by chromosome) that returns 
+    FileBinnedArray objects created from "key.ba" files in `dir`
+    """
+    return FileBinnedArrayDir( dir )
+
+def main():
+    args = sys.argv[1:]
+    try:
+        qual_file_dir = args[0]
+        #mydir="/home/gua110/Desktop/chimp_quality_scores/chr22.qa"
+        mydir="/home/gua110/Desktop/rhesus_quality_scores/rheMac2.qual.qv"
+        qual_file_dir = mydir.replace(mydir.split("/")[-1], "")
+        output_file = args[ 1 ]
+        fo = open(output_file,"w")
+    except:
+        print "usage: qual_file output_file"
+        sys.exit()
+    
+    tmpfile = tempfile.NamedTemporaryFile()
+    cmdline = "ls " + qual_file_dir + "*.qa | cat >> " + tmpfile.name
+    os.system (cmdline)
+    for qual_file in tmpfile.readlines():
+        qual = fileinput.FileInput( qual_file.strip() )
+        outfile = None
+        outbin = None
+        base_count = 0
+        mega_count = 0
+    
+        for line in qual:
+            line = line.rstrip("\r\n")
+            if line.startswith(">"):
+                # close old
+                if outbin and outfile:
+                    print "\nFinished region " + region + " at " + str(base_count) + " base pairs."
+                    outbin.finish()
+                    outfile.close()
+                # start new file
+                region = line.lstrip(">")
+                #outfname = output_file + "." + region + ".bqv" #CHANGED
+                outfname = qual_file.strip() + ".bqv"
+                print >>fo, "Writing region " + region + " to file " + outfname
+                outfile = open( outfname , "wb")
+                outbin = BinnedArrayWriter(outfile, typecode='b', default=0)
+                base_count = 0
+                mega_count = 0
+            else:
+                if outfile and outbin:
+                    nums = line.split()
+                    for val in nums:
+                        outval = int(val)
+                        assert outval <= 255 and outval >= 0
+                        outbin.write(outval)
+                        base_count += 1
+                    if (mega_count * 1000000) <= base_count:
+                        sys.stdout.write(str(mega_count)+" ")
+                        sys.stdout.flush()
+                        mega_count = base_count // 1000000 + 1
+        if outbin and outfile:
+            print "\nFinished region " + region + " at " + str(base_count) + " base pairs."
+            outbin.finish()
+            outfile.close()
+
+if __name__ == "__main__":
+    main()
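For reference, the BinnedArray files written above can be read back with bx-python's FileBinnedArray, which is what downstream tools such as quality_filter.py rely on for fast random access. A minimal round-trip sketch (file name invented)::

    from bx.binned_array import BinnedArrayWriter, FileBinnedArray

    out = open('chr1.qa.bqv', 'wb')
    writer = BinnedArrayWriter(out, typecode='b', default=0)
    for score in [30, 45, 12, 60]:  #one quality value per base
        writer.write(score)
    writer.finish()
    out.close()

    scores = FileBinnedArray(open('chr1.qa.bqv', 'rb'))
    print scores[0:4]  #random access without parsing the whole file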
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/qv_to_bqv.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/qv_to_bqv.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,17 @@
+<tool id="qv2bqv" name="qv2bqv">
+  <description></description>
+  <command interpreter="python">qv_to_bqv.py "$input1" $output</command>
+  <inputs>
+    <param name="input1" type="data" format="interval" help="Directory" />
+   </inputs>
+  <outputs>
+    <data format="text" name="output" metadata_source="input1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="1.bed" />
+      <param name="input2" value="2.bed" />
+      <output name="output" file="gops-coverage.dat" />
+    </test>
+  </tests>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/rcve.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/rcve.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def sscombs(s):
+    if len(s) == 1:
+        return [s]
+    else:
+        ssc = sscombs(s[1:])
+        return [s[0]] + [s[0]+comb for comb in ssc] + ssc
+
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+    line = line.rstrip('\r\n')
+    if len( line )>0 and not line.startswith( '#' ):
+        elems = line.split( '\t' )
+        break 
+    if i == 30:
+        break # Hopefully we'll never get here...
+
+if len( elems )<1:
+    stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+    x_cols[k] = int(col)-1
+    x_vals.append([])
+    """
+    try:
+        float( elems[x_cols[k]] )
+    except:
+        try:
+            msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] )
+        except:
+            msg = "This operation cannot be performed on non-numeric data."
+        stop_err( msg )
+    """
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+    if line and not line.startswith( '#' ):
+        try:
+            fields = line.split("\t")
+            try:
+                yval = float(fields[y_col])
+            except Exception, ey:
+                yval = r('NA')
+                #print >>sys.stderr, "ey = %s" %ey
+            y_vals.append(yval)
+            for k,col in enumerate(x_cols):
+                try:
+                    xval = float(fields[col])
+                except Exception, ex:
+                    xval = r('NA')
+                    #print >>sys.stderr, "ex = %s" %ex
+                x_vals[k].append(xval)
+        except:
+            pass
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+set_default_mode(NO_CONVERSION)
+try:
+    full = r.lm(r("y ~ x"), data= r.na_exclude(dat))    #full model includes all the predictor variables specified by the user
+except RException, rex:
+    stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
+set_default_mode(BASIC_CONVERSION)
+
+summary = r.summary(full)
+fullr2 = summary.get('r.squared','NA')
+
+if fullr2 == 'NA':
+    stop_error("Error in linear regression")
+
+if len(x_vals) < 10:
+    s = ""
+    for ch in range(len(x_vals)):
+        s += str(ch)
+else:
+    stop_err("This tool only works with less than 10 predictors.")
+
+print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value"
+all_combos = sorted(sscombs(s), key=len)
+all_combos.reverse()
+for j,cols in enumerate(all_combos):
+    #if len(cols) == len(s):    #Same as the full model above
+    #    continue
+    if len(cols) == 1:
+        x_vals1 = x_vals[int(cols)]
+    else:
+        x_v = []
+        for col in cols:
+            x_v.append(x_vals[int(col)])
+        x_vals1 = numpy.asarray(x_v).transpose()
+    dat= r.list(x=array(x_vals1), y=y_vals)
+    set_default_mode(NO_CONVERSION)
+    red = r.lm(r("y ~ x"), data= dat)    #Reduced model
+    set_default_mode(BASIC_CONVERSION)
+    summary = r.summary(red)
+    redr2 = summary.get('r.squared','NA')
+    try:
+        rcve = (float(fullr2)-float(redr2))/float(fullr2)
+    except:
+        rcve = 'NA'
+    col_str = ""
+    for col in cols:
+        col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
+    col_str = col_str.strip()
+    rcve_col_str = ""
+    for col in s:
+        if col not in cols:
+            rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " "
+    rcve_col_str = rcve_col_str.strip()
+    if len(cols) == len(s):    #full model
+        rcve_col_str = "-"
+        rcve = "-"
+    try:
+        redr2 = "%.4f" %(float(redr2))
+    except:
+        pass
+    try:
+        rcve = "%.4f" %(float(rcve))
+    except:
+        pass
+    print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve)
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/rcve.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/rcve.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,70 @@
+<tool id="rcve1" name="Compute RCVE" version="1.0.0">
+  <description> </description>
+  <command interpreter="python">
+    rcve.py 
+      $input1
+      $response_col
+      $predictor_cols
+      $out_file1
+      1>/dev/null
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
+    <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+    <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true">
+        <validator type="no_options" message="Please select at least one column."/>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+    <!-- Test data with vlid values -->
+   <test>
+      <param name="input1" value="reg_inp.tab"/>
+      <param name="response_col" value="1"/>
+      <param name="predictor_cols" value="2,3,4"/>
+      <output name="out_file1" file="rcve_out.dat"/>
+    </test>
+    
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Datasets-&gt;Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool computes the RCVE (Relative Contribution to Variance) for all possible variable subsets using the following formula:
+
+**RCVE(i) = [R-sq (full: 1,2,..,i..,p-1) - R-sq(without i: 1,2,...,p-1)] / R-sq (full: 1,2,..,i..,p-1)**,
+which denotes the case where the 'i'th predictor is dropped. 
+
+
+In general,
+**RCVE(X+) = [R-sq (full: {X,X+}) - R-sq(reduced: {X})] / R-sq (full: {X,X+})**,
+where,
+
+- {X,X+} denotes the set of all predictors, 
+- X+ is the set of predictors for which we compute RCVE (and therefore drop from the full model to obtain a reduced one), 
+- {X} is the set of the predictors that are left in the reduced model after excluding {X+} 
+
+
+The 4 columns in the output are described below:
+
+- Column 1 (Model): denotes the variables present in the model ({X})
+- Column 2 (R-sq): denotes the R-squared value corresponding to the model in Column 1
+- Column 3 (RCVE_Terms): denotes the variable/s for which RCVE is computed ({X+}). These are the variables that are absent in the reduced model in Column 1. A '-' in this column indicates that the model in Column 1 is the Full model.
+- Column 4 (RCVE): denotes the RCVE value corresponding to the variable/s in Column 3. A '-' in this column indicates that the model in Column 1 is the Full model.
+  
+  
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/substitution_rates.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/substitution_rates.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+Estimates substitution rates from pairwise alignments using JC69 model.
+"""
+
+from galaxy import eggs
+from galaxy.tools.util.galaxyops import *
+from galaxy.tools.util import maf_utilities
+import bx.align.maf
+import sys, fileinput
+from bx.intervals.io import NiceReaderWrapper
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+if len(sys.argv) < 3:
+        stop_err("Incorrect number of arguments.")    
+    
+inp_file = sys.argv[1]
+out_file = sys.argv[2]
+fout = open(out_file, 'w')
+int_file = sys.argv[3]
+if int_file != "None":     #The user has specified an interval file
+    dbkey_i = sys.argv[4]
+    chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
+
+
+def rateEstimator(block):
+    global alignlen, mismatches
+
+    src1 = block.components[0].src
+    sequence1 = block.components[0].text
+    start1 = block.components[0].start
+    end1 = block.components[0].end
+    len1 = int(end1)-int(start1)
+    len1_withgap = len(sequence1)
+    mismatch = 0.0
+    
+    for seq in range (1,len(block.components)):
+        src2 = block.components[seq].src
+        sequence2 = block.components[seq].text
+        start2 = block.components[seq].start
+        end2 = block.components[seq].end
+        len2 = int(end2)-int(start2)
+        for nt in range(len1_withgap):
+            if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character
+                if sequence1[nt].upper() != sequence2[nt].upper():
+                    mismatch += 1
+    
+    if int_file == "None":  
+        p = mismatch/min(len1,len2)
+        print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" %(src1,start1,end1,src2,start2,end2,min(len1,len2),mismatch,p)
+    else:
+        mismatches += mismatch
+        alignlen += min(len1,len2)
+              
+def main():
+    skipped = 0
+    not_pairwise = 0
+    
+    if int_file == "None":
+        try:
+            maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
+        except:
+            stop_err("Your MAF file appears to be malformed.")
+        print >>fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp"
+        for block in maf_reader:
+            if len(block.components) != 2:
+                not_pairwise += 1
+                continue
+            try:
+                rateEstimator(block)
+            except:
+                skipped += 1
+    else:
+        index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] )
+        if index is None:
+            print >> sys.stderr, "Your MAF file appears to be malformed."
+            sys.exit()
+        win = NiceReaderWrapper( fileinput.FileInput( int_file ),
+                                chrom_col=chr_col_i,
+                                start_col=start_col_i,
+                                end_col=end_col_i,
+                                strand_col=strand_col_i,
+                                fix_strand=True)
+        species=None
+        mincols = 0
+        global alignlen, mismatches
+        
+        for interval in win:
+            alignlen = 0
+            mismatches = 0.0
+            src = "%s.%s" % ( dbkey_i, interval.chrom )
+            for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ):
+                if len(block.components) != 2:
+                    not_pairwise += 1
+                    continue
+                try:
+                    rateEstimator(block)
+                except:
+                    skipped += 1
+            if alignlen:
+                p = mismatches/alignlen
+            else:
+                p = 'NA'
+            interval.fields.append(str(alignlen))
+            interval.fields.append(str(mismatches))
+            interval.fields.append(str(p))
+            print >>fout, "\t".join(interval.fields)    
+            #num_blocks += 1
+    
+    if not_pairwise:
+        print "Skipped %d non-pairwise blocks" %(not_pairwise)
+    if skipped:
+        print "Skipped %d blocks as invalid" %(skipped)
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/substitution_rates.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/substitution_rates.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,61 @@
+<tool id="subRate1" name="Estimate substitution rates " version="1.0.0">
+  <description> for non-coding regions</description>
+  <command interpreter="python">
+   substitution_rates.py 
+   $input 
+   $out_file1
+   #if $region.type == "win":
+      ${region.input2} ${region.input2.dbkey} ${region.input2.metadata.chromCol},$region.input2.metadata.startCol,$region.input2.metadata.endCol,$region.input2.metadata.strandCol
+    #else:
+      "None"
+    #end if 
+  </command>
+  <inputs>
+    <param format="maf" name="input" type="data" label="Select pair-wise alignment data"/>
+    <conditional name="region">
+       <param name="type" type="select" label="Estimate rates corresponding to" multiple="false">
+          <option value="align">Alignment block</option>
+          <option value="win">Intervals in your history</option>
+      </param>
+      <when value="win">
+        <param format="interval" name="input2" type="data" label="Choose intervals">
+        <validator type="unspecified_build" />
+      </param>
+       </when>
+       <when value="align" />
+      </conditional>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" metadata_source="input"/>
+  </outputs>
+  
+  <tests>
+    <test>
+      <param name="input" value="Interval2Maf_pairwise_out.maf"/>
+      <param name="type" value="align"/>
+      <output name="out_file1" file="subRates1.out"/>
+    </test>
+  </tests>
+  
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool takes a pairwise MAF file as input and estimates the substitution rate according to the Jukes-Cantor (JC69) model. The three new columns appended to the output are explained below:
+
+- L: number of nucleotides compared
+- N: number of different nucleotides
+- p = N/L
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Any blocks not containing exactly two sequences will be omitted. 
+
+  </help>  
+</tool>
\ No newline at end of file
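Note that the reported p = N/L is the raw proportion of mismatching sites. Under the JC69 model the corresponding evolutionary distance in substitutions per site follows from p by the standard correction; a small reference sketch (not part of the tool's output)::

    import math

    def jc69_distance(p):
        #Jukes-Cantor correction: d = -3/4 * ln(1 - 4p/3); valid for p < 0.75
        return -0.75 * math.log(1.0 - (4.0 * p / 3.0))

    print "%.4f" % jc69_distance(5/48.0)  #e.g. 5 mismatches over 48 sites -> 0.1121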
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/substitutions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/substitutions.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+Fetches substitutions from pairwise alignments.
+"""
+
+from galaxy import eggs
+
+from galaxy.tools.util import maf_utilities
+
+import bx.align.maf
+import sys
+import os, fileinput
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+if len(sys.argv) < 3:
+        stop_err("Incorrect number of arguments.")    
+    
+inp_file = sys.argv[1]
+out_file = sys.argv[2]
+fout = open(out_file, 'w')
+
+def fetchSubs(block):
+    
+    src1 = block.components[0].src
+    sequence1 = block.components[0].text
+    start1 = block.components[0].start
+    end1 = block.components[0].end
+    len1 = int(end1)-int(start1)
+    len1_withgap = len(sequence1)
+    
+    for seq in range (1,len(block.components)):
+        src2 = block.components[seq].src
+        sequence2 = block.components[seq].text
+        start2 = block.components[seq].start
+        end2 = block.components[seq].end
+        len2 = int(end2)-int(start2)
+        sub_begin = None
+        sub_end = None
+        begin = False
+        
+        for nt in range(len1_withgap):
+            if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character
+                if sequence1[nt].upper() != sequence2[nt].upper():
+                    if not(begin):
+                        sub_begin = nt
+                        begin = True
+                    sub_end = nt
+                else:
+                    if begin:
+                        print >>fout, "%s\t%s\t%s" %(src1,start1+sub_begin-sequence1[0:sub_begin].count('-'),start1+sub_end-sequence1[0:sub_end].count('-'))
+                        print >>fout, "%s\t%s\t%s" %(src2,start2+sub_begin-sequence2[0:sub_begin].count('-'),start2+sub_end-sequence2[0:sub_end].count('-'))    
+                        begin = False
+
+            else:
+                if begin:
+                    print >>fout, "%s\t%s\t%s" %(src1,start1+sub_begin-sequence1[0:sub_begin].count('-'),end1+sub_end-sequence1[0:sub_end].count('-'))
+                    print >>fout, "%s\t%s\t%s" %(src2,start2+sub_begin-sequence2[0:sub_begin].count('-'),end2+sub_end-sequence2[0:sub_end].count('-'))    
+                    begin = False
+                    ended = False
+    
+              
+def main():
+    skipped = 0
+    not_pairwise = 0
+    try:
+        maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
+    except:
+        stop_err("Your MAF file appears to be malformed.")
+    print >>fout, "#Chr\tStart\tEnd"
+    for block in maf_reader:
+        if len(block.components) != 2:
+            not_pairwise += 1
+            continue
+        try:
+            fetchSubs(block)
+        except:
+            skipped += 1
+    
+    if not_pairwise:
+        print "Skipped %d non-pairwise blocks" %(not_pairwise)
+    if skipped:
+        print "Skipped %d blocks" %(skipped)
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/substitutions.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/substitutions.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,38 @@
+<tool id="substitutions1" name="Fetch substitutions " version="1.0.0">
+  <description> from pairwise alignments</description>
+  <command interpreter="python">
+   substitutions.py 
+   $input 
+   $out_file1
+  </command>
+  <inputs>
+    <param format="maf" name="input" type="data" label="Select pair-wise alignment data"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" metadata_source="input"/>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="Interval2Maf_pairwise_out.maf"/>
+      <output name="out_file1" file="subs.out"/>
+    </test>
+  </tests>
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool takes a pairwise MAF file as input and fetches substitutions per alignment block.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Any blocks not containing exactly two sequences will be omitted. 
+
+  </help>  
+</tool>
\ No newline at end of file
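The coordinates written by substitutions.py are obtained by mapping an alignment column back to its source position: the block start plus the column index minus the number of gap characters seen so far. A standalone sketch of that mapping (example values invented)::

    def column_to_source_pos(start, aligned_seq, col):
        #source position of alignment column col: block start plus the
        #count of non-gap characters preceding that column
        return start + col - aligned_seq[0:col].count('-')

    seq = 'AC--GT'
    print column_to_source_pos(100, seq, 4)  #102: the G is the third real base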
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/t_test_two_samples.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/t_test_two_samples.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,109 @@
+# A program to implement the two-sample t-test (pooled or non-pooled) where the alternative hypothesis is two-sided or one-sided. 
+# The first input file is a TABULAR format file representing the first sample and consisting of one column only.
+# The second input file is a TABULAR format file representing the second sample and consisting of one column only.
+# The third input is the sidedness of the t-test: either two-sided, one-sided with m1 less than m2, 
+# or one-sided with m1 greater than m2. 
+# The fourth input is the equality status of the standard deviations of both populations
+# The output file is a TXT file representing the result of the two sample t-test.
+
+use strict;
+use warnings;
+
+# check that the correct number of arguments was supplied
+my $usage = "usage: non_pooled_t_test_two_samples_galaxy.pl [TABULAR.in] [TABULAR.in] [testSidedness] [standardDeviationEquality] [TXT.out] \n";
+die $usage unless @ARGV == 5;
+
+#get the input arguments
+my $firstSampleInputFile = $ARGV[0];
+my $secondSampleInputFile = $ARGV[1];
+my $testSidedness = $ARGV[2];
+my $standardDeviationEquality = $ARGV[3]; 
+my $outputFile = $ARGV[4];
+
+#open the input files
+open (INPUT1, "<", $firstSampleInputFile) || die("Could not open file $firstSampleInputFile \n"); 
+open (INPUT2, "<", $secondSampleInputFile) || die("Could not open file $secondSampleInputFile \n"); 
+open (OUTPUT, ">", $outputFile) || die("Could not open file $outputFile \n");
+
+
+#variables to store the name of the R script file
+my $r_script;
+
+# R script to implement the two-sample test on the motif frequencies in upstream flanking region 
+#construct an R script file and save it in the same directory where the perl file is located
+$r_script = "non_pooled_t_test_two_samples.r";
+
+open(Rcmd,">", $r_script) or die "Cannot open $r_script \n\n";
+print Rcmd "
+        sampleTable1 <- read.table(\"$firstSampleInputFile\", header=FALSE);
+ sample1 <- sampleTable1[, 1];
+
+ sampleTable2 <- read.table(\"$secondSampleInputFile\", header=FALSE);
+ sample2 <- sampleTable2[, 1];
+
+ testSideStatus <- \"$testSidedness\";
+ STEqualityStatus <- \"$standardDeviationEquality\";
+
+ #open the output a text file
+ sink(file = \"$outputFile\");
+
+ #check if the t-test is two-sided
+ if (testSideStatus == \"two-sided\"){
+
+ #check if the standard deviations are equal in both populations
+ if (STEqualityStatus == \"equal\"){
+ #two-sample t-test where standard deviations are assumed to be equal, the test is two-sided
+ testResult <- t.test(sample1, sample2, var.equal = TRUE);
+ } else{
+ #two-sample t-test where standard deviations are assumed to be unequal, the test is two-sided
+ testResult <- t.test(sample1, sample2, var.equal = FALSE);
+ }
+ } else{  #the t-test is one sided
+
+ #check if the t-test is one-sided with m1 < m2
+ if (testSideStatus == \"one-sided:_m1_less_than_m2\"){
+
+ #check if the standard deviations are equal in both populations
+ if (STEqualityStatus == \"equal\"){
+ #two-sample t-test where standard deviations are assumed to be equal, the test is one-sided: Ha: m1 < m2
+ testResult <- t.test(sample1, sample2, var.equal = TRUE, alternative = \"less\");
+ } else{
+ #two-sample t-test where standard deviations are assumed to be unequal, the test is one-sided: Ha: m1 < m2
+ testResult <- t.test(sample1, sample2, var.equal = FALSE, alternative = \"less\");
+ }
+ } else{   #the t-test is one-sided with m1 > m2
+ #check if the standard deviations are equal in both populations
+ if (STEqualityStatus == \"equal\"){
+ #two-sample t-test where standard deviations are assumed to be equal, the test is one-sided: Ha: m1 > m2
+ testResult <- t.test(sample1, sample2, var.equal = TRUE, alternative = \"greater\");
+ } else{
+ #two-sample t-test where standard deviations are assumed to be unequal, the test is one-sided: Ha: m1 > m2
+ testResult <- t.test(sample1, sample2, var.equal = FALSE, alternative = \"greater\");
+ }
+ }
+ }
+
+ #save the output of the t-test into the output text file
+ testResult;
+
+ #close the output text file
+ sink();
+
+ #eof" . "\n";
+
+close Rcmd;
+
+system("R --no-restore --no-save --no-readline < $r_script > $r_script.out");
+
+#close the input and output files
+close(OUTPUT);
+close(INPUT2);
+close(INPUT1);
+
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/t_test_two_samples.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/t_test_two_samples.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,160 @@
+<tool id="t_test_two_samples" name="T Test for Two Samples" version="1.0.0">
+  <description></description>
+  
+  <command interpreter="perl">
+   t_test_two_samples.pl $inputFile1 $inputFile2 $inputTestSidedness3 $inputStandardDeviationEquality4 $outputFile1
+  </command>
+
+  <inputs>
+   <param format="tabular" name="inputFile1" type="data" label="Select the first sample tabular file"/>
+   <param format="tabular" name="inputFile2" type="data" label="Select the second sample tabular file"/>
+  
+    <param name="inputTestSidedness3" type="select" label="Choose the test sidedness:">
+     <option value="two-sided">Two-sided</option>
+       <option value="one-sided:_m1_less_than_m2">One-sided: m1 less than m2</option>
+       <option value="one-sided:_m1_greater_than_m2">One-sided: m1 greater than m2</option>
+    </param>
+    
+    <param name="inputStandardDeviationEquality4" type="select" label="Choose the standard deviation equality status of the two populations:">
+     <option value="equal">Equal</option>
+       <option value="unequal">Unequal</option>
+    </param>
+  </inputs>
+  
+  <outputs>
+    <data format="text" name="outputFile1"/>
+  </outputs>
+  
+  <tests>
+   <test>
+   <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
+   <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
+     <param name="inputTestSidedness3" value="Two-sided" />
+     <param name="inputStandardDeviationEquality4" value="Equal" />
+     <output name="outputFile1" file="t_test_result1.text" />
+   </test>
+  
+   <test>
+   <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
+   <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
+     <param name="inputTestSidedness3" value="Two-sided" />
+     <param name="inputStandardDeviationEquality4" value="Unequal" />
+     <output name="outputFile1" file="t_test_result2.text" />
+   </test>
+  
+   <test>
+   <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
+   <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
+     <param name="inputTestSidedness3" value="One-sided: m1 less than m2" />
+     <param name="inputStandardDeviationEquality4" value="Equal" />
+     <output name="outputFile1" file="t_test_result3.text" />
+   </test>
+  
+   <test>
+   <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
+   <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
+     <param name="inputTestSidedness3" value="One-sided: m1 less than m2" />
+     <param name="inputStandardDeviationEquality4" value="Unequal" />
+     <output name="outputFile1" file="t_test_result4.text" />
+   </test>
+  
+   <test>
+   <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
+   <param name="inputFile2" value="sample2.tabular" ftype="tabular"/>
+     <param name="inputTestSidedness3" value="One-sided: m1 greater than m2" />
+     <param name="inputStandardDeviationEquality4" value="Equal" />
+     <output name="outputFile1" file="t_test_result5.text" />
+   </test>
+  
+   <test>
+   <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
+   <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
+     <param name="inputTestSidedness3" value="One-sided: m1 greater than m2" />
+     <param name="inputStandardDeviationEquality4" value="Unequal" />
+     <output name="outputFile1" file="t_test_result6.text" />
+   </test>
+  </tests>
+
+
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This program implements the two-sample t-test (pooled or non-pooled) where the alternative hypothesis is two-sided or one-sided. The program takes four inputs:
+
+- The first input file is a TABULAR format file representing the first sample and consisting of one column only.
+- The second input file is a TABULAR format file representing the second sample and consisting of one column only.
+- The third input is the sidedness of the t-test: either two-sided or, one-sided with m1 less than m2 or, one-sided with m1 greater than m2. 
+- The fourth input is the equality status of the standard deviations of both populations.
+- The output file is a TXT file representing the result of the two-sample t-test.
+
+
+**Example**
+
+Let us have the first input file representing the first sample as follows::
+
+ 5
+ 4
+ 8
+ 6
+ 7
+ 2
+ 1
+ 1
+ 0
+ 6
+ 4
+ 5
+ 7
+ 5
+ 3
+ 2
+ 5
+ 8
+ 7
+ 6
+ 4
+
+And the second input file representing the second sample as follows::
+
+ 2
+ 3
+ 5
+ 1
+ 2
+ 7
+ 5
+ 4
+ 3
+ 2
+ 7
+ 6
+ 0
+ 8
+ 4
+ 6
+ 9
+ 2
+ 4
+ 5
+ 6
+
+Running the program and choosing "Two-sided" and "Equal" as parameters will give the following output::
+
+ Two Sample t-test
+
+ data:  sample1 and sample2 
+ t = -0.3247, df = 40, p-value = 0.7471
+ alternative hypothesis: true difference in means is not equal to 0 
+ 95 percent confidence interval:
+  -1.720030  1.243839 
+ sample estimates:
+ mean of x mean of y 
+  4.333333  4.571429 
+
+
+  </help>  
+  
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/windowSplitter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/windowSplitter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+"""
+Split into windows.
+
+usage: %prog input size out_file
+   -l, --cols=N,N,N,N: Columns for chrom, start, end, strand in file
+"""
+
+import sys, re, os
+
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+from galaxy.tools.util.galaxyops import *
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():   
+    # Parsing Command Line here
+    options, args = doc_optparse.parse( __doc__ )
+    
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols )
+        inp_file, winsize, out_file, makesliding, offset = args
+        winsize = int(winsize)
+        offset = int(offset)
+        makesliding = int(makesliding)
+        if strand_col_1 <= 0:
+            strand = "+"        #if strand is not defined, default it to +
+    except:
+        stop_err( "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset." )
+    
+    fo = open(out_file,'w')
+
+    skipped_lines = 0
+    first_invalid_line = 0
+    invalid_line = None
+    if offset == 0:
+        makesliding = 0
+
+    for i, line in enumerate( file( inp_file ) ):
+        line = line.strip()
+        if line and line[0:1] != "#":
+            try:
+                elems = line.split('\t')
+                if strand_col_1 != -1:
+                    strand = elems[strand_col_1]
+                start = int(elems[start_col_1])
+                end = int(elems[end_col_1])
+                if makesliding == 0:
+                    numwin = (end - start)/winsize
+                else:
+                    numwin = (end - start)/offset
+                if numwin > 0:
+                    for win in range(numwin):
+                        elems_1 = elems
+                        elems_1[start_col_1] = str(start)
+                        elems_1[end_col_1] = str(start + winsize)
+                        fo.write( "%s\n" % '\t'.join( elems_1 ) )
+                        if makesliding == 0:
+                            start = start + winsize
+                        else:
+                            start = start + offset
+                            if start+winsize > end:
+                                break
+            except:
+                skipped_lines += 1
+                if not invalid_line:
+                    first_invalid_line = i + 1
+                    invalid_line = line
+    
+    fo.close()
+
+    if makesliding == 1:                
+        print 'Window size=%d, Sliding=Yes, Offset=%d' %(winsize, offset)
+    else:
+        print 'Window size=%d, Sliding=No' %(winsize)
+    if skipped_lines > 0:
+        print 'Skipped %d invalid lines starting with #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )             
+    
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/regVariation/windowSplitter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/windowSplitter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,104 @@
+<tool id="winSplitter" name="Make windows">
+  <description></description>
+  <command interpreter="python">windowSplitter.py $input $size $out_file1 ${wintype.choice} ${wintype.offset} -l ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol}</command>
+  <inputs>
+    <!--<param label="Genome" name="dbkey" type="genomebuild"/>-->
+    <param format="interval" name="input" type="data" label="Select data"/>
+    <param name="size" size="10" type="integer" value="500" label="Window size"/>
+    <conditional name="wintype">
+     <param name="choice" type="select" label="Make sliding windows?">
+      <option value="0" selected="true">No</option>
+      <option value="1">Yes</option>
+ </param>
+ <when value="0">
+     <param name="offset" type="hidden" value="0" />
+     </when>
+     <when value="1">
+     <param name="offset" size="10" type="integer" value="10" label="Offset size"/>
+     </when>
+ </conditional>
+  </inputs>
+  <outputs>
+    <data format="interval" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="4.bed"/>
+      <param name="size" value="5000"/>
+      <param name="choice" value="1"/>
+      <param name="offset" value="4000"/>
+      <output name="out_file1" file="4_windows.bed"/>
+    </test>
+  </tests>
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool splits the intervals in the input file into smaller intervals based on the specified window-size and window type.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Positions at the end of an input interval that do not fill a complete window of the required size will be omitted from the output.
+
+-----
+
+.. class:: infomark
+
+**About formats**
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
+
+The first three BED fields (required) are::
+
+    1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+    2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+    3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+The additional BED fields (optional) are::
+
+    4. name - The name of the BED line.
+    5. score - A score between 0 and 1000.
+    6. strand - Defines the strand - either '+' or '-'.
+    7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+    8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+    9. reserved - This should always be set to zero.
+   10. blockCount - The number of blocks (exons) in the BED line.
+   11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+   12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+   13. expCount - The number of experiments.
+   14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount.
+   15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount.
+
+-----
+
+**Example**
+
+- For the following dataset::
+
+   chr22  1000  4700  NM_174568 0 +
+
+- running this tool with **Window size as 1000** will return::
+
+   chr22  1000  2000  NM_174568 0 +
+   chr22  2000  3000  NM_174568 0 +
+   chr22  3000  4000  NM_174568 0 +
+   
+- running this tool to make **Sliding windows** of **size 1000** and **offset 500** will return::
+
+   chr22  1000  2000  NM_174568 0 +
+   chr22  1500  2500  NM_174568 0 +
+   chr22  2000  3000  NM_174568 0 +
+   chr22  2500  3500  NM_174568 0 +
+   chr22  3000  4000  NM_174568 0 +
+   chr22  3500  4500  NM_174568 0 +
+  
+  </help>  
+
+
+</tool>
\ No newline at end of file
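A compact sketch of the windowing logic (fixed windows when no offset is given, sliding windows otherwise; standalone, mirroring windowSplitter.py)::

    def make_windows(start, end, size, offset=None):
        #yield (start, end) windows; without an offset they do not overlap
        step = offset or size
        while start + size <= end:
            yield (start, start + size)
            start += step

    print list(make_windows(1000, 4700, 1000))
    #[(1000, 2000), (2000, 3000), (3000, 4000)]
    print list(make_windows(1000, 4700, 1000, 500))
    #adds the half-overlapping windows (1500, 2500) ... (3500, 4500)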
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/listFiles.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/listFiles.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,227 @@
+#Provides Upload tool with access to list of available files
+import glob,sys
+import galaxy.app as thisapp
+import galaxy.util
+
+from elementtree.ElementTree import XML
+
+librepos = '/usr/local/galaxy/data/rg'
+myrepos = '/home/rerla/galaxy'
+marchinirepos = '/usr/local/galaxy/data/rg/snptest'
+
+from galaxy.tools.parameters import DataToolParameter
+
+#Provides Upload tool with access to list of available builds
+
+builds = []
+#Read build names and keys from galaxy.util
+for dbkey, build_name in galaxy.util.dbnames:
+    builds.append((build_name,dbkey,False))
+
+#Return available builds
+def get_available_builds(defval='hg18'):
+    for i,x in enumerate(builds):
+        if x[1] == defval:
+           x = list(x)
+           x[2] = True
+           builds[i] = tuple(x)
+    return builds
+
+
+
+def get_tabular_cols( input, outformat='gg' ):
+    """numeric only other than rs for strict genome graphs
+    otherwise tabular. Derived from galaxy tool source around August 2007 by Ross"""
+    columns = []
+    seenCnames = {}
+    elems = []
+    colnames = ['Col%d' % x for x in range(input.metadata.columns+1)]
+    strict = (outformat=='gg')
+    for i, line in enumerate( file ( input.file_name ) ):
+        if line and not line.startswith( '#' ): 
+            line = line.rstrip('\r\n')
+            elems = line.split( '\t' )
+    
+            """
+            Strict gg note:
+            Since this tool requires users to select only those columns
+            that contain numerical values, we'll restrict the column select
+            list appropriately other than the first column which must be a marker
+            """
+            if len(elems) > 0:
+                for col in range(1, input.metadata.columns+1):
+                    isFloat = False # short circuit common result
+                    try:
+                        val = float(elems[col-1])
+                        isFloat = True
+                    except:
+                        val = elems[col-1]
+                        if val:
+                            if i == 0: # header row
+                               colnames[col] = val
+                    if isFloat or (not strict) or (col == 1): # all in if not GG
+                        option = colnames[col]
+                        if not seenCnames.get(option,None): # new
+                              columns.append((option,str(col),False))
+                              seenCnames[option] = option
+            #print 'get_tab: %d=%s. Columns=%s' % (i,line,str(columns))
+            if len(columns) > 0 and i > 10:
+                """
+                We have our select list built, so we can break out of the outer most for loop
+                """
+                break 
+        if i == 30:
+            break # Hopefully we never get here...
+    for option in range(min(5,len(columns))):
+      (x,y,z) = columns[option]
+      columns[option] = (x,y,True)
+    return columns # sorted select options
+
+def get_marchini_dir():
+    """return the filesystem directory for snptest style files"""
+    return marchinirepos
+
+
+def get_lib_SNPTESTCaCofiles():
+    """return a list of file names - without extensions - available for caco studies
+    These have a common file name with both _1 and _2 suffixes"""
+    d = get_marchini_dir()
+    testsuffix = '.gen_1' # glob these
+    flist = glob.glob('%s/*%s' % (d,testsuffix))
+    flist = [x.split(testsuffix)[0] for x in flist] # leaves with a list of file set names
+    if len(flist) > 0:
+        dat = [(flist[0],flist[0],True),]
+        dat += [(x,x,False) for x in flist[1:]]
+    else:
+        dat = [('No Marchini CaCo files found in %s - convert some using the Marchini converter tool' % d,'None',True),]
+    return dat
+
+def getChropt():
+    """return dynamic chromosome select options
+    """
+    c = ['X','Y']
+    c += ['%d' % x for x in range(1,23)]
+    dat = [(x,x,False) for x in c]
+    x,y,z = dat[3]
+    dat[3] = (x,y,True)
+    return dat
+
+
+def get_phecols(fname=''):
+   """ return a list of phenotype columns for a multi-select list
+   prototype:
+   foo = ('fake - not yet implemented','not implemented','False')
+   dat = [foo for x in range(5)]
+   return dat
+   """
+   try:
+    header = file(fname,'r').next().split()
+   except:
+        return [('get_phecols unable to open file %s' % fname,'None',False),]
+   dat = [(x,x,False) for x in header]
+   return dat
+
+#Return various kinds of files
+
+def get_lib_pedfiles():
+    dat = glob.glob('%s/ped/*.ped' % librepos)
+    dat += glob.glob('%s/ped/*.ped' % myrepos)
+    dat.sort()
+    if len(dat) > 0:
+        dat = [x.split('.ped')[0] for x in dat]
+        dat = [(x,x,'True') for x in dat]
+    else:
+        dat = [('No ped files - add some to %s/ped or %s/ped' % (librepos,myrepos),'None',True),]
+    return dat
+
+def get_lib_phefiles():
+    ext = 'phe'
+    dat = glob.glob('%s/pheno/*.%s' % (librepos,ext))
+    dat += glob.glob('%s/pheno/*.%s' % (myrepos,ext))
+    dat.sort()
+    if len(dat) > 0:
+        dat = [(x,x,'False') for x in dat]
+    else:
+        dat = [('No %s files - add some to %s/pheno or %s/pheno' % (ext,librepos,myrepos),'None',True),]
+    return dat
+
+def get_lib_bedfiles():
+    dat = glob.glob('%s/plinkbed/*.bed' % librepos)
+    dat += glob.glob('%s/plinkbed/*.bed' % myrepos)
+    dat.sort()
+    if len(dat) > 0:
+        dat = [x.split('.bed')[0] for x in dat]
+        dat = [(x,x,False) for x in dat]
+    else:
+        dat = [('No bed files - Please import some to %s/plinkbed or %s/plinkbed' % (librepos,myrepos),'None',True),]
+    return dat
+
+def get_lib_fbatfiles():
+    dat = glob.glob('%s/plinkfbat/*.ped' % librepos)
+    dat += glob.glob('%s/plinkfbat/*.ped' % myrepos)
+    dat.sort()
+    if len(dat) > 0:
+        dat = [(x,x,False) for x in dat]
+    else:
+        dat = [('No fbat bed files - Please import some to %s/plinkfbat or %s/plinkfbat' % (librepos,myrepos),'None',True),]
+    return dat
+
+def get_lib_mapfiles():
+    dat = glob.glob('%s/ped/*.map' % librepos)
+    dat += glob.glob('%s/ped/*.map' % myrepos)
+    dat.sort()
+    if len(dat) > 0:
+        dat = [(x,x,False) for x in dat]
+    else:
+        dat = [('No map files - add some to %s/ped' % librepos,'None',True),]
+    return dat
+
+def get_my_pedfiles():
+    dat = glob.glob('%s/*.ped' % myrepos)
+    if len(dat) > 0:
+        dat = [(x,x,False) for x in dat]
+    else:
+        dat = [('No ped files - add some to %s' % myrepos,'None',True),]
+    return dat
+
+def get_my_mapfiles():
+    dat = glob.glob('%s/*.map' % myrepos)
+    if len(dat) > 0:
+        dat = [(x,x,'True') for x in dat]
+    else:
+        dat = [('No map files - add some to %s' % myrepos,'None',True),]
+    return dat
+
+def get_lib_xlsfiles():
+    dat = glob.glob('%s/*.xls' % librepos)
+    if len(dat) > 0:
+        dat = [(x,x,False) for x in dat]
+    else:
+        dat = [('No xls files - add some to %s' % librepos,'None',True),]
+    return dat
+
+def get_lib_htmlfiles():
+    dat = glob.glob('%s/*.html' % librepos)
+    if len(dat) > 0:
+        dat = [(x,x,False) for x in dat]
+    else:
+        dat = [('No html files - add some to %s' % librepos,'None',True),]
+    return dat
+
+def get_my_xlsfiles():
+    dat = glob.glob('%s/*.xls' %  myrepos)
+    if len(dat) > 0:
+        dat = [(x,x,False) for x in dat]
+    else:
+        dat = [('No xls files - add some to %s' % myrepos,'None',True),]
+    return dat
+
+def get_my_htmlfiles():
+    dat = glob.glob('%s/*.html' % myrepos)
+    if len(dat) > 0:
+        dat = [(x,x,False) for x in dat]
+    else:
+        dat = [('No html files - add some to %s' % myrepos,'None',True),]
+    return dat
+
+
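Every helper above returns its choices in the (name, value, selected) triple form expected by Galaxy dynamic select lists. A minimal sketch of consuming such a list (entries invented)::

    options = [('hg18 ped set', '/usr/local/galaxy/data/rg/ped/hg18', True),
               ('test ped set', '/home/rerla/galaxy/ped/test', False)]

    for name, value, selected in options:
        if selected:
            print '* %s -> %s' % (name, value)
        else:
            print '  %s -> %s' % (name, value)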
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/plinkbinJZ.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/plinkbinJZ.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,868 @@
+#!/usr/bin/env python2.4
+"""
+"""
+
+import optparse,os,subprocess,gzip,struct,time,commands
+from array import array
+
+#from AIMS import util
+#from pga import util as pgautil
+
+__FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $'
+
+VERBOSE = True
+
+MISSING_ALLELES = set(['N', '0', '.', '-',''])
+
+AUTOSOMES = set(range(1, 23) + [str(c) for c in range(1, 23)])
+
+MAGIC_BYTE1 = '00110110'
+MAGIC_BYTE2 = '11011000'
+FORMAT_SNP_MAJOR_BYTE = '10000000'
+FORMAT_IND_MAJOR_BYTE = '00000000'
+MAGIC1 = (0, 3, 1, 2)
+MAGIC2 = (3, 1, 2, 0)
+FORMAT_SNP_MAJOR = (2, 0, 0, 0)
+FORMAT_IND_MAJOR = (0, 0, 0, 0)
+HEADER_LENGTH = 3
+
+HOM0 = 3
+HOM1 = 0
+MISS = 2
+HET  = 1
+HOM0_GENO = (0, 0)
+HOM1_GENO = (1, 1)
+HET_GENO = (0, 1)
+MISS_GENO = (-9, -9)
+
+GENO_TO_GCODE = {
+    HOM0_GENO: HOM0,
+    HET_GENO: HET,
+    HOM1_GENO: HOM1,
+    MISS_GENO: MISS,
+    }
+
+CHROM_REPLACE = {
+    'X': '23',
+    'Y': '24',
+    'XY': '25',
+    'MT': '26',
+    'M': '26',
+}
+
+MAP_LINE_EXCEPTION_TEXT = """
+One or more lines in the *.map file has only three fields.
+The line was:
+
+%s
+
+If you are running rgGRR through EPMP, this is usually a
+sign that you are using an old version of the map file.
+You can correct the problem by re-running Subject QC.  If
+you have already tried this, please contact the developers,
+or file a bug.
+"""
+
+INT_TO_GCODE = {
+     0: array('i', (0, 0, 0, 0)),   1: array('i', (2, 0, 0, 0)),   2: array('i', (1, 0, 0, 0)),   3: array('i', (3, 0, 0, 0)),
+     4: array('i', (0, 2, 0, 0)),   5: array('i', (2, 2, 0, 0)),   6: array('i', (1, 2, 0, 0)),   7: array('i', (3, 2, 0, 0)),
+     8: array('i', (0, 1, 0, 0)),   9: array('i', (2, 1, 0, 0)),  10: array('i', (1, 1, 0, 0)),  11: array('i', (3, 1, 0, 0)),
+    12: array('i', (0, 3, 0, 0)),  13: array('i', (2, 3, 0, 0)),  14: array('i', (1, 3, 0, 0)),  15: array('i', (3, 3, 0, 0)),
+    16: array('i', (0, 0, 2, 0)),  17: array('i', (2, 0, 2, 0)),  18: array('i', (1, 0, 2, 0)),  19: array('i', (3, 0, 2, 0)),
+    20: array('i', (0, 2, 2, 0)),  21: array('i', (2, 2, 2, 0)),  22: array('i', (1, 2, 2, 0)),  23: array('i', (3, 2, 2, 0)),
+    24: array('i', (0, 1, 2, 0)),  25: array('i', (2, 1, 2, 0)),  26: array('i', (1, 1, 2, 0)),  27: array('i', (3, 1, 2, 0)),
+    28: array('i', (0, 3, 2, 0)),  29: array('i', (2, 3, 2, 0)),  30: array('i', (1, 3, 2, 0)),  31: array('i', (3, 3, 2, 0)),
+    32: array('i', (0, 0, 1, 0)),  33: array('i', (2, 0, 1, 0)),  34: array('i', (1, 0, 1, 0)),  35: array('i', (3, 0, 1, 0)),
+    36: array('i', (0, 2, 1, 0)),  37: array('i', (2, 2, 1, 0)),  38: array('i', (1, 2, 1, 0)),  39: array('i', (3, 2, 1, 0)),
+    40: array('i', (0, 1, 1, 0)),  41: array('i', (2, 1, 1, 0)),  42: array('i', (1, 1, 1, 0)),  43: array('i', (3, 1, 1, 0)),
+    44: array('i', (0, 3, 1, 0)),  45: array('i', (2, 3, 1, 0)),  46: array('i', (1, 3, 1, 0)),  47: array('i', (3, 3, 1, 0)),
+    48: array('i', (0, 0, 3, 0)),  49: array('i', (2, 0, 3, 0)),  50: array('i', (1, 0, 3, 0)),  51: array('i', (3, 0, 3, 0)),
+    52: array('i', (0, 2, 3, 0)),  53: array('i', (2, 2, 3, 0)),  54: array('i', (1, 2, 3, 0)),  55: array('i', (3, 2, 3, 0)),
+    56: array('i', (0, 1, 3, 0)),  57: array('i', (2, 1, 3, 0)),  58: array('i', (1, 1, 3, 0)),  59: array('i', (3, 1, 3, 0)),
+    60: array('i', (0, 3, 3, 0)),  61: array('i', (2, 3, 3, 0)),  62: array('i', (1, 3, 3, 0)),  63: array('i', (3, 3, 3, 0)),
+    64: array('i', (0, 0, 0, 2)),  65: array('i', (2, 0, 0, 2)),  66: array('i', (1, 0, 0, 2)),  67: array('i', (3, 0, 0, 2)),
+    68: array('i', (0, 2, 0, 2)),  69: array('i', (2, 2, 0, 2)),  70: array('i', (1, 2, 0, 2)),  71: array('i', (3, 2, 0, 2)),
+    72: array('i', (0, 1, 0, 2)),  73: array('i', (2, 1, 0, 2)),  74: array('i', (1, 1, 0, 2)),  75: array('i', (3, 1, 0, 2)),
+    76: array('i', (0, 3, 0, 2)),  77: array('i', (2, 3,
[...]
+        """
+        self.path = path
+        self._subjects = {}
+        self._ordered_subjects = []
+
+    def parse(self):
+        """
+        """
+        print 'Reading pedigree information from [ %s ]' % (self.path)
+        fam = open(self.path, 'r')
+        for s, line in enumerate(fam):
+            fid, iid, did, mid, sex, phe = line.strip().split()
+            sid = iid.split('.')[0]
+            d_sid = did.split('.')[0]
+            m_sid = mid.split('.')[0]
+            skey = (fid, iid)
+            self._ordered_subjects.append(skey)
+            self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
+        fam.close()
+        print '%s individuals read from [ %s ]' % (s+1, self.path)
+
+### Command-line functionality and testing
+def test(arg):
+    '''
+    '''
+
+    import time
+
+    if arg == 'CAMP_AFFY.ped':
+        print 'Testing bed.parse(quick=True)'
+        s = time.time()
+        bed = Bed(arg.replace('.ped', '.bed'))
+        bed.parse(quick=True)
+        print bed.getGenotype(('400118', '10300283'), 'rs2000467')
+        print bed.getGenotype(('400118', '10101384'), 'rs2294019')
+        print bed.getGenotype(('400121', '10101149'), 'rs2294019')
+        print bed.getGenotype(('400123', '10200290'), 'rs2294019')
+        assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4')
+        e = time.time()
+        print 'e-s = %s\n' % (e-s)
+
+    print 'Testing bed.parse'
+    s = time.time()
+    bed = BPed(arg)
+    bed.parse(quick=False)
+    e = time.time()
+    print 'e-s = %s\n' % (e-s)
+
+    print 'Testing bed.writeped'
+    s = time.time()
+    outname = '%s_BEDTEST' % (arg)
+    bed.writeped(outname)
+    e = time.time()
+    print 'e-s = %s\n' % (e-s)
+    del(bed)
+
+    print 'Testing ped.parse'
+    s = time.time()
+    ped = LPed(arg)
+    ped.parse()
+    e = time.time()
+    print 'e-s = %s\n' % (e-s)
+
+    print 'Testing ped.writebed'
+    s = time.time()
+    outname = '%s_PEDTEST' % (arg)
+    ped.writebed(outname)
+    e = time.time()
+    print 'e-s = %s\n' % (e-s)
+    del(ped)
+
+def profile_bed(arg):
+    """
+    """
+    bed = BPed(arg)
+    bed.parse(quick=False)
+    outname = '%s_BEDPROFILE' % (arg)
+    bed.writeped(outname)
+
+def profile_ped(arg):
+    """
+    """
+    ped = LPed(arg)
+    ped.parse()
+    outname = '%s_PEDPROFILE' % (arg)
+    ped.writebed(outname)
+
+if __name__ == '__main__':
+    """ Run as a command-line, this script should get one or more arguments,
+        each one a ped file to be parsed with the PedParser (unit tests?)
+    """
+    op = optparse.OptionParser()
+    op.add_option('--profile-bed', action='store_true', default=False)
+    op.add_option('--profile-ped', action='store_true', default=False)
+    opts, args = op.parse_args()
+
+    if opts.profile_bed:
+        import profile
+        import pstats
+        profile.run('profile_bed(args[0])', 'fooprof')
+        p = pstats.Stats('fooprof')
+        p.sort_stats('cumulative').print_stats(10)
+    elif opts.profile_ped:
+        import profile
+        import pstats
+        profile.run('profile_ped(args[0])', 'fooprof')
+        p = pstats.Stats('fooprof')
+        p.sort_stats('cumulative').print_stats(10)
+    else:
+        for arg in args:
+            test(arg)
+
+    ### Code used to generate the INT_TO_GCODE dictionary
+    #print '{\n  ',
+    #for i in range(256):
+    #    b = INT2BIN[i]
+    #    ints = []
+    #    s = str(i).rjust(3)
+    #    #print b
+    #    for j in range(4):
+    #        idx = j*2
+    #        #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2)
+    #        ints.append(int(b[idx:idx+2], 2))
+    #    print '%s: array(\'i\', %s),' % (s,tuple(ints)),
+    #    if i > 0 and (i+1) % 4 == 0:
+    #        print '\n  ',
+    #print '}'
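The INT_TO_GCODE table above precomputes, for every possible byte value, the four 2-bit genotype codes packed into a snp-major .bed byte: genotype k occupies bits 2k and 2k+1 (first genotype in the low-order bits), and the table reads each 2-bit field with its bits swapped. A minimal sketch of the same unpacking, assuming only the gcode constants defined in the file (Python 3 syntax rather than the python2.4 above):

from array import array

HOM1, HET, MISS, HOM0 = 0, 1, 2, 3   # gcodes as defined in plinkbinJZ.py

def byte_to_gcodes(byte):
    # four genotypes per byte; genotype k sits in bits 2k and 2k+1,
    # and INT_TO_GCODE reads each 2-bit field with its bits swapped
    codes = []
    for k in range(4):
        pair = (byte >> (2 * k)) & 0b11
        codes.append(((pair & 1) << 1) | (pair >> 1))
    return array('i', codes)

assert tuple(byte_to_gcodes(1)) == (2, 0, 0, 0)    # agrees with INT_TO_GCODE[1]
assert tuple(byte_to_gcodes(13)) == (2, 3, 0, 0)   # agrees with INT_TO_GCODE[13]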
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/plinkbinJZ.pyc
Binary file tools/rgenetics/plinkbinJZ.pyc has changed
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgCaCo.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgCaCo.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,271 @@
+#!/usr/local/bin/python
+# hack to run and process a plink case control association
+# expects args as
+# bfilepath outname jobname outformat (wig,xls)
+# ross lazarus
+# for wig files, we need annotation so look for map file or complain
+"""
+Parameters for wiggle track definition lines
+All options are placed in a single line separated by spaces:
+
+  track type=wiggle_0 name=track_label description=center_label \
+        visibility=display_mode color=r,g,b altColor=r,g,b \
+        priority=priority autoScale=on|off \
+        gridDefault=on|off maxHeightPixels=max:default:min \
+        graphType=bar|points viewLimits=lower:upper \
+        yLineMark=real-value yLineOnOff=on|off \
+        windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16
+"""
+
+import sys,math,shutil,subprocess,os,time,tempfile,string
+from os.path import abspath
+from rgutils import timenow, plinke
+imagedir = '/static/rg' # if needed for images
+myversion = 'V000.1 April 2007'
+verbose = False
+
+def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000):
+    """
+    score must be scaled to 0-1000
+
+    Want to make some wig tracks from each analysis
+    Best n -log10(p). Make top hit the window.
+    we use our tab output which has
+    rs  chrom  offset  ADD_stat  ADD_p  ADD_log10p
+    rs3094315  1  792429  1.151  0.2528  0.597223
+
+    """
+
+    def is_number(s):
+        try:
+            float(s)
+            return True
+        except ValueError:
+            return False
+    header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description)
+    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
+    halfwidth=100
+    resfpath = os.path.join(twd,resf)
+    resf = open(resfpath,'r')
+    resfl = resf.readlines() # dumb but convenient for millions of rows
+    resfl = [x.split() for x in resfl]
+    headl = resfl[0]
+    resfl = resfl[1:]
+    headl = [x.strip().upper() for x in headl]
+    headIndex = dict(zip(headl,range(0,len(headl))))
+    whatwewant = ['CHR','RS','OFFSET','LOG10ARMITAGEP']
+    wewant = [headIndex.get(x,None) for x in whatwewant]
+    if None in wewant: # missing something
+       logf.write('### Error missing a required header from %s in makeGFF - headIndex=%s\n' % (whatwewant,headIndex))
+       return
+    ppos = wewant[3] # last in list
+    resfl = [x for x in resfl if x[ppos] > '' and x[ppos] <> 'NA']
+    resfl = [(float(x[ppos]),x) for x in resfl] # decorate
+    resfl.sort()
+    resfl.reverse() # using -log10 so larger is better
+    pvals = [x[0] for x in resfl] # need to scale
+    resfl = [x[1] for x in resfl] # drop decoration
+    resfl = resfl[:topn] # truncate
+    maxp = max(pvals) # need to scale
+    minp = min(pvals)
+    prange = abs(maxp-minp) + 0.5 # fudge
+    scalefact = 1000.0/prange
+    logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact))
+    for i,row in enumerate(resfl):
+        row[ppos] = '%d' % (int(scalefact*pvals[i]))
+        resfl[i] = row # replace
+    outf = file(outfname,'w')
+    outf.write(header)
+    outres = [] # need to resort into chrom offset order
+    for i,lrow in enumerate(resfl):
+        chrom,snp,offset,p, = [lrow[x] for x in wewant]
+        gff = ('chr%s' % chrom,'rgCaCo','variation','%d' % (int(offset)-halfwidth),
+               '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i]))
+        outres.append(gff)
+    outres = [(x[0],int(x[3]),x) for x in outres] # decorate
+    outres.sort() # into chrom offset
+    outres=[x[2] for x in outres] # undecorate
+    outres = ['\t'.join(x) for x in outres]
+    outf.write('\n'.join(outres))
+    outf.write('\n')
+    outf.close()
+
+
+def plink_assocToGG(plinkout="hm",tag='test'):
+   """ plink --assoc output looks like this
+   #  CHR         SNP   A1      F_A      F_
[...]
+    whatwewant = ['CHR','SNP','TEST','AFF','UNAFF','CHISQ','P']
+    wewant = [headl.index(x) for x in whatwewant]
+    llen = len(headl)
+    lnum = anum = 0
+    lastsnp = None # so we know when to write out a gg line
+    outl = {}
+    f.seek(0)
+    for lnum,l in enumerate(f):
+        if lnum == 0:
+            continue
+        ll = l.split()
+        if delim:
+           ll = l.split(delim)
+        if len(ll) >= llen: # valid line
+            chr,snp,test,naff,nuaff,chi,p = [ll[x] for x in wewant]
+            snp = snp.strip()
+            chrom,offset = rsdict.get(snp,(None,None))
+            anum += 1
+            fp = 1.0 # if NA
+            lp = 0.0
+            try:
+                fp = float(p)
+                if fp > 0:
+                  lp = -math.log10(fp)
+                else:
+                    fp = 9e-100
+                    flog.write('### WARNING - Plink calculated %s for %s p value!!! 9e-100 substituted!\n' % (p,test))
+                    flog.write('### offending line #%d = %s' % (lnum,l))
+            except:
+                pass
+            if snp <> lastsnp:
+                if len(outl.keys()) > 3:
+                    sl = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')]
+                    res.append('\t'.join(sl)) # last snp line
+                outl = {'snp':snp,'chrom':chrom,'offset':offset} # first 3 cols for gg line
+                lastsnp = snp # reset for next marker
+            #if p == 'NA':
+            #      p = 1.0
+            # let's pass downstream for handling R is fine?
+            outl[test] = '%s\t%f' % (p,lp)
+    if len(outl.keys()) > 3:
+        l = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')]
+        res.append('\t'.join(l)) # last snp line
+    f = file(outfname,'w')
+    res.append('')
+    f.write('\n'.join(res))
+    f.close()
+
+
+
+if __name__ == "__main__":
+    """
+    # called as
+    <command interpreter="python">
+        rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$name"
+        '$out_file1' '$logf' '$logf.files_path' '$gffout'
+    </command>
+    """
+    if len(sys.argv) < 7:
+       s = 'rgCaCo.py needs 6 params - got %s \n' % (sys.argv)
+       print >> sys.stdout, s
+       sys.exit(0)
+    bfname = sys.argv[1]
+    name = sys.argv[2]
+    killme = string.punctuation + string.whitespace
+    trantab = string.maketrans(killme,'_'*len(killme))
+    name = name.translate(trantab)
+    outfname = sys.argv[3]
+    logf = sys.argv[4]
+    logoutdir = sys.argv[5]
+    gffout = sys.argv[6]
+    topn = 1000
+    try:
+        os.makedirs(logoutdir)
+    except:
+        pass
+    map_file = None
+    me = sys.argv[0]
+    amapf = '%s.bim' % bfname # to decode map in xformModel
+    flog = file(logf,'w')
+    logme = []
+    cdir = os.getcwd()
+    s = 'Rgenetics %s http://rgenetics.org Galaxy Tools, rgCaCo.py started %s\n' % (myversion,timenow())
+    print >> sys.stdout, s # so will appear as blurb for file
+    logme.append(s)
+    if verbose:
+        s = 'rgCaCo.py:  bfname=%s, logf=%s, argv = %s\n' % (bfname, logf, sys.argv)
+        print >> sys.stdout, s # so will appear as blurb for file
+        logme.append(s)
+    twd = tempfile.mkdtemp(suffix='rgCaCo') # make sure plink doesn't spew log file into the root!
+    tname = os.path.join(twd,name)
+    vcl = [plinke,'--noweb','--bfile',bfname,'--out',name,'--model']
+    p=subprocess.Popen(' '.join(vcl),shell=True,stdout=flog,cwd=twd)
+    retval = p.wait()
+    resf = '%s.model' % tname # plink output is here we hope
+    xformModel(bfname,resf,outfname,name,amapf,flog) # leaves the desired summary file
+    makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgCaCo_TopTable',description=name,topn=topn)
+    flog.write('\n'.join(logme))
+    flog.close() # close the log used
+    #shutil.copytree(twd,logoutdir)
+    shutil.rmtree(twd) # clean up
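makeGFF above rescales the top -log10(p) values onto the 0-1000 range that UCSC's useScore shading expects. A standalone sketch of that arithmetic (Python 3; the input values are made up):

def scale_scores(log10_pvals, fudge=0.5):
    # mirror makeGFF: spread scores over the observed range plus a fudge factor
    prange = abs(max(log10_pvals) - min(log10_pvals)) + fudge
    scalefact = 1000.0 / prange
    return [int(scalefact * p) for p in log10_pvals]

print(scale_scores([0.0, 1.5, 6.2]))   # -> [0, 223, 925]

Note that, as in makeGFF, the minimum is not subtracted before scaling, so scores only span the full 0-1000 range when the smallest retained -log10(p) is near zero.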
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgCaCo.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgCaCo.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,103 @@
+<tool id="rgCaCo1" name="Case Control:">
+    <description>for unrelated subjects</description>
+    <command interpreter="python">
+        rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$title"  '$out_file1' '$logf' '$logf.files_path' '$gffout'
+    </command>
+    <inputs>
+      <param name="i"  type="data" label="RGenetics genotype data from your current history"
+      format="pbed" />
+       <param name='title' type='text' size="132" value='CaseControl' label="Title for this job"/>
+
+    </inputs>
+
+   <outputs>
+       <data format="tabular" name="out_file1" label="${title}_rgCaCo.xls" />
+       <data format="txt" name="logf" label="${title}_rgCaCo.log"/>
+       <data format="gff" name="gffout" label="${title}_rgCaCoTop.gff" />
+   </outputs>
+<tests>
+ <test>
+ <param name='i' value='tinywga' ftype='pbed' >
+   <metadata name='base_name' value='tinywga' />
+   <composite_data value='tinywga.bim' />
+   <composite_data value='tinywga.bed' />
+   <composite_data value='tinywga.fam' />
+   <edit_attributes type='name' value='tinywga' /> 
+ </param>
+ <param name='title' value='rgCaCotest1' />
+ <output name='out_file1' file='rgCaCotest1_CaCo.xls' ftype='tabular' compare='diff' />
+ <output name='logf' file='rgCaCotest1_CaCo_log.txt' ftype='txt' compare='diff' lines_diff='20' />
+ <output name='gffout' file='rgCaCotest1_CaCo_topTable.gff' ftype='gff' compare='diff' />
+ </test>
+</tests>
+<help>
+
+.. class:: infomark
+
+**Syntax**
+
+- **Genotype file** is the input case control data chosen from available library Plink binary files
+- **Map file** is the linkage format .map file corresponding to the genotypes in the Genotype file
+- **Type of test** is the kind of test statistic to report such as Armitage trend test or genotype test
+- **Format** determines how your data will be returned to your Galaxy workspace
+
+-----
+
+**Summary**
+
+This tool will perform some standard statistical tests comparing subjects designated as
+affected (cases) and unaffected subjects (controls). To avoid bias, it is important that
+controls are subjects who, had they been affected, would have been eligible for sampling as cases. This may seem
+odd, but it ensures that the cases and controls are drawn from the same sampling frame.
+
+The Armitage trend test is robust to departures from HWE and so is very attractive - after all, a real disease
+mutation may well distort HWE, at least in cases. All the other tests are susceptible to
+bias in the presence of HWE departures.
+
+All of these tests are exquisitely sensitive to population stratification that differs between cases
+and controls, and this must be tested before believing any results here. Use the PCA method for
+100k markers or more.
+
+If you don't see the genotype data set you want here, it can be imported using one of the methods available from
+the Galaxy Get Data tool page.
+
+Output format can be UCSC .bed if you want to see your
+results as a fully fledged UCSC track. A map file containing the chromosome and offset for each marker is required for
+writing this kind of output.
+Alternatively you can use .gg for the UCSC Genome Graphs tool, which has all of the advantages
+of the .bed track, plus a neat, visual front end that displays a lot of useful clues.
+Either of these are a very useful way of quickly getting a look
+at your data in full genomic context.
+
+Finally, if you can't live without
+spreadsheet data, choose the .xls tab delimited format. It's not a binary Excel file - just a plain old tab delimited
+one with a header. Fortunately Excel is dumb enough to open these without much protest.
+
+
+-----
+
+.. class:: infomark
+
+**Attribution**
+
+This Galaxy tool relies on Plink (see Plinksrc_) to test Case Control association models. 
+
+So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see
+DOC_
+
+Tool and Galaxy datatypes originally designed and written for the Rgenetics
+series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com)
+
+Copyright Ross Lazarus March 2007
+This Galaxy wrapper is released under the LGPL_ but is about as useful as a chocolate teapot without Plink, which is GPL.
+
+I'm no lawyer, but it looks like you get GPL terms if you use this software. Good luck.
+
+.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/ 
+
+.. _LGPL: http://www.gnu.org/copyleft/lesser.html
+
+.. _DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#cc
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgClean.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgClean.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,160 @@
+"""
+# galaxy tool xml files can define a galaxy supplied output filename
+# that must be passed to the tool and used to return output
+# here, the plink log file is copied to that file and removed
+# took a while to figure this out!
+# use exec_before_job to give files sensible names
+#
+# ross april 14 2007
+# plink cleanup script
+# ross lazarus March 2007 for camp illumina whole genome data
+# note problems with multiple commands being ignored - eg --freq --missing --mendel 
+# only the first seems to get done...
+#
+##Summary statistics versus inclusion criteria
+##
+##Feature                         As summary statistic    As inclusion criteria
+##Missingness per individual      --missing               --mind N
+##Missingness per marker          --missing               --geno N        
+##Allele frequency                --freq                  --maf N
+##Hardy-Weinberg equilibrium      --hardy                 --hwe N
+##Mendel error rates              --mendel                --me N M
+#
+# call as plinkClean.py $i $o $mind $geno $hwe $maf $mef $mei $outfile 
+# note plinkClean_code.py does some renaming before the job starts
+
+    
+    <command interpreter="python2.4">
+        rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' '$geno' '$hwe' '$maf'
+        '$mef' '$mei' '$out_file1' '$out_file1.files_path' '$userId'
+    </command>
+
+"""
+import sys,shutil,os,subprocess, glob, string, tempfile, time
+from rgutils import galhtmlprefix, timenow, plinke
+prog = os.path.split(sys.argv[0])[-1]
+myversion = 'January 4 2010'
+verbose=False
+
+
+def fixoutaff(outpath='',newaff='1'):
+    """ quick way to create test data sets - set all aff to 1 or 2 for
+    some hapmap data and then merge
+    [rerla@beast galaxy]$ head tool-data/rg/library/pbed/affyHM_CEU.fam
+    1341 14 0 0 2 1
+    1341 2 13 14 2 1
+    1341 13 0 0 1 1
+    1340 9 0 0 1 1
+    1340 10 0 0 2 1
+    """
+    nchanged = 0
+    fam = '%s.fam' % outpath
+    famf = open(fam,'r')
+    fl = famf.readlines()
+    famf.close()
+    for i,row in enumerate(fl):
+        lrow = row.split()
+        if lrow[-1] <> newaff:
+            lrow[-1] = newaff
+            fl[i] = ' '.join(lrow)
+            fl[i] += '\n'
+            nchanged += 1
+    fo = open(fam,'w')
+    fo.write(''.join(fl))
+    fo.close()
+    return nchanged
+            
+
+
+def clean():
+    """
+    """
+    if len(sys.argv) < 16:
+        print >> sys.stdout, '## %s expected 15 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv)
+        print >> sys.stdout, """this script will filter a linkage format ped
+        and map file containing genotypes. It takes 15 parameters - the plink --bfile parameter and
+        a new filename root for the output clean data followed by the mind,geno,hwe,maf,mef and mei thresholds
+        documented in the plink docs plus the file to be returned to Galaxy
+        called as:
+        <command interpreter="python">
+        rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
+        '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path'
+        '$relfilter' '$afffilter' '$sexfilter' '$fixaff'
+        </command>
+
+        """
+        sys.exit(1)
+    plog = []
+    inpath = sys.argv[1]
+    inbase = sys.argv[2]
+    killme = string.punctuation + string.whitespace
+    trantab = string.maketrans(killme,'_'*len(killme))
+    title = sys.argv[3].translate(trantab)
+    mind = sys.argv[4]
+    geno = sys.argv[5]
+    hwe = sys.argv[6]
+    maf = sys.argv[7]
+    me1 = sys.argv[8]
+    me2 = sys.argv[9]
+    outfname = sys.argv[10]
+    outfpath = sys.argv[11]
+    relf = sys.argv[12]
+    afff = sys.argv[13]
+    sexf = sys.argv[14]
+    fixaff = sys.argv[15]
+    output = os.path.join(outfpath,outfname)
+    outpath = os.path.join(outfpath,title)
+    outprunepath = os.path.join(outfpath,'ldprune_%s' % title)
+    try:
+      os.makedirs(outfpath)
+    except:
+      pass
+    bfile = os.path.join(inpath,inbase)
+    outf = file(outfname,'w')
+    vcl = [plinke,'--noweb','--bfile',bfile,'--make-bed','--out',
+          outpath,'--set-hh-missing','--mind',mind,
+          '--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2]
+    # yes - the --me parameter takes 2 values - mendels per snp and per family
+    if relf == 'oo': # plink filters are what they leave...
+        vcl.append('--filter-nonfounders') # leave only offspring
+    elif relf == 'fo':
+        vcl.append('--filter-founders')
+    if afff == 'affonly':
+        vcl.append('--filter-controls')
+    elif afff == 'unaffonly':
+        vcl.append('--filter-cases')
+    if sexf == 'fsex':
+        vcl.append('--filter-females')
+    elif sexf == 'msex':
+        vcl.append('--filter-males')
+    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath)
+    retval = p.wait()
+    plog.append('%s started, called as %s' % (prog,' '.join(sys.argv)))
+    outf.write(galhtmlprefix % prog)
+    outf.write('<ul>\n')
+    plogf = '%s.log' % os.path.join(outfpath,title)
+    try:
+        plogl = file(plogf,'r').readlines()
+        plog += [x.strip() for x in plogl]
+    except:
+        plog += ['###Cannot open plink log file %s' % plogf,]
+    # if fixaff, want to 'fix' the fam file
+    if fixaff <> '0':
+        nchanged = fixoutaff(outpath=outpath,newaff=fixaff)
+        plog += ['## fixaff was requested  %d subjects affection status changed to %s' % (nchanged,fixaff)] 
+    pf = file(plogf,'w')
+    pf.write('\n'.join(plog))
+    pf.close()
+    globme = os.path.join(outfpath,'*')
+    flist = glob.glob(globme)
+    flist.sort()
+    for i, data in enumerate( flist ):
+        outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
+    outf.write('</ul>\n')
+    outf.write("</ul></br></div></body></html>")
+    outf.close()
+
+
+if __name__ == "__main__":
+    clean()
+
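For reference, a minimal sketch of the Plink call that clean() assembles above, with hypothetical threshold values and assuming a 'plink' binary on the PATH (the real script takes the binary path from rgutils.plinke):

import subprocess

def plink_clean_cmd(bfile, out, mind, geno, maf, hwe, me1, me2, plinke='plink'):
    # --me takes two values: per-SNP and per-family Mendel error limits
    return [plinke, '--noweb', '--bfile', bfile, '--make-bed', '--out', out,
            '--set-hh-missing', '--mind', mind, '--geno', geno,
            '--maf', maf, '--hwe', hwe, '--me', me1, me2]

cmd = plink_clean_cmd('tinywga', 'cleaned', '0.1', '0.05', '0.01', '0', '0.05', '0.05')
subprocess.call(' '.join(cmd), shell=True)   # clean() also joins the list and runs it via the shell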
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgClean.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgClean.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,154 @@
+<tool id="rgClean1" name="Clean genotypes:">
+    <description>filter markers, subjects</description>
+
+    <command interpreter="python">
+        rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
+        '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path'
+        '$relfilter' '$afffilter' '$sexfilter' '$fixaff'
+    </command>
+
+    <inputs>
+       <param name="input_file"  type="data" label="RGenetics genotype library file in compressed Plink format"
+         size="120" format="pbed" />
+       <param name="title" type="text" size="80" label="Descriptive title for cleaned genotype file" value="Cleaned_data"/>
+       <param name="geno"  type="text" label="Maximum Missing Fraction: Markers" value="0.05" />
+       <param name="mind" type="text" value="0.1" label="Maximum Missing Fraction: Subjects"/>
+       <param name="mef"  type="text" label="Maximum Mendel Error Rate: Family" value="0.05"/>
+       <param name="mei"  type="text" label="Maximum Mendel Error Rate: Marker" value="0.05"/>
+       <param name="hwe" type="text" value="0" label="Smallest HWE p value (set to 0 for all)" />
+       <param name="maf" type="text" value="0.01"
+       label="Smallest Minor Allele Frequency (set to 0 for all)"/>
+       <param name='relfilter' label = "Filter on pedigree relatedness" type="select"
+         optional="false" size="132"
+         help="Optionally remove related subjects if pedigree identifies founders and their offspring">
+         <option value="all" selected='true'>No filter on relatedness</option>
+         <option value="fo" >Keep Founders only (pedigree m/f ID = "0")</option>
+         <option value="oo" >Keep Offspring only (one randomly chosen if >1 sibs in family)</option>
+       </param>
+       <param name='afffilter' label = "Filter on affection status" type="select"
+         optional="false" size="132"
+         help="Optionally remove affected or non affected subjects">
+         <option value="allaff" selected='true'>No filter on affection status</option>
+         <option value="affonly" >Keep Controls only (affection='1')</option>
+         <option value="unaffonly" >Keep Cases only (affection='2')</option>
+       </param>
+       <param name='sexfilter' label = "Filter on gender" type="select"
+         optional="false" size="132"
+         help="Optionally remove all male or all female subjects">
+         <option value="allsex" selected='true'>No filter on gender status</option>
+         <option value="msex" >Keep Males only (pedigree gender='1')</option>
+         <option value="fsex" >Keep Females only (pedigree gender='2')</option>
+       </param>
+       <param name="fixaff" type="text" value="0"
+          label = "Change ALL subjects affection status to (0=no change,1=unaff,2=aff)"
+          help="Use this option to switch the affection status to a new value for all output subjects" />
+   </inputs>
+
+   <outputs>
+       <data format="pbed" name="out_file1" metadata_source="input_file" label="${title}_rgClean.pbed"  />
+   </outputs>
+
+<tests>
+ <test>
+    <param name='input_file' value='tinywga' ftype='pbed' >
+    <metadata name='base_name' value='tinywga' />
+    <composite_data value='tinywga.bim' />
+    <composite_data value='tinywga.bed' />
+    <composite_data value='tinywga.fam' />
+    <edit_attributes type='name' value='tinywga' />
+    </param>
+    <param name='title' value='rgCleantest1' />
+    <param name="geno" value="1" />
+    <param name="mind" value="1" />
+    <param name="mef" value="0" />
+    <param name="mei" value="0" />
+    <param name="hwe" value="0" />
+    <param name="maf" value="0" />
+    <param name="relfilter" value="all" />
+    <param name="afffilter" value="allaff" />
+    <param name="sexfilter" value="allsex" />
+    <param name="fixaff" value="0" />
+    <output name='out_file1' file='rgtestouts/rgClean/rgCleantest1.pbed' compare="diff" lines_diff="25" >
[...]
+**Syntax**
+
+- **Genotype data** is the input genotype file chosen from your current history
+- **Descriptive title** is the name to use for the filtered output file
+- **Missfrac threshold: subjects** is the threshold for missingness by subject. Subjects with more than this fraction missing will be excluded from the import
+- **Missfrac threshold: markers** is the threshold for missingness by marker. Markers with more than this fraction missing will be excluded from the import
+- **MaxMendel Individuals** Mendel error fraction above which to exclude subjects with more than the specified fraction of mendelian errors in transmission (for family data only)
+- **MaxMendel Families** Mendel error fraction above which to exclude families with more than the specified fraction of mendelian errors in transmission (for family data only)
+- **HWE** is the threshold for HWE test p values below which the marker will not be imported. Set this to -1 and all markers will be imported regardless of HWE p value
+- **MAF** is the threshold for minor allele frequency - SNPs with lower MAF will be excluded
+- **Filters** for founders/offspring or affected/unaffected or males/females are optionally available if needed
+- **Change Affection** is only needed if you want to change the affection status for creating new analysis datasets
+
+-----
+
+**Attribution**
+
+This tool relies on the work of many people. It uses Plink http://pngu.mgh.harvard.edu/~purcell/plink/,
+and the R http://cran.r-project.org/ and
+Bioconductor http://www.bioconductor.org/ projects.
+
+In particular, http://pngu.mgh.harvard.edu/~purcell/plink/
+has excellent documentation describing the parameters you can set here.
+
+This implementation is a Galaxy tool wrapper around these third party applications.
+It was originally designed and written for family based data from the CAMP Illumina run of 2007 by
+ross lazarus (ross.lazarus@gmail.com) and incorporated into the rgenetics toolkit.
+
+Rgenetics merely exposes them, wrapping Plink so you can use it in Galaxy.
+
+-----
+
+**Summary**
+
+Reliable statistical inference depends on reliable data. Poor quality samples and markers
+may add more noise than signal, decreasing statistical power. Removing the worst of them
+can be done by setting thresholds for some of the commonly used technical quality measures
+for genotype data. Of course discordant replicate calls are also very informative but are not
+in scope here.
+
+Marker cleaning: Filters are available to remove markers below a specific minor allele
+frequency, beyond a Hardy Weinberg threshold,
+or above a threshold for missingness. If family data are available, thresholds for Mendelian
+error can be set.
+
+Subject cleaning: Filters are available to remove subjects with many missing calls. Subjects and markers for family data can be filtered by proportions
+of Mendelian errors in observed transmission. Use the QC reporting tool to
+generate a comprehensive series of reports for quality control.
+
+Note that ancestry and cryptic relatedness should also be checked using the relevant tools.
+
+-----
+
+.. class:: infomark
+
+**Tip**
+
+You can check that you got what you asked for by running the QC tool to ensure that the distributions
+are truncated the way you expect. Note that you do not expect that the thresholds will be exactly
+what you set - some bad assays and subjects are out in multiple QC measures, so you sometimes have
+more samples or markers than you exactly set for each threshold. Finally, the ordering of
+operations matters and Plink is somewhat restrictive about what it will do on each pass
+of the data. At least it's fixed.
+
+-----
+
+This Galaxy tool was written by Ross Lazarus for the Rgenetics project.
+It uses Plink for most calculations - for full Plink attribution, source code and documentation,
+please see http://pngu.mgh.harvard.edu/~purcell/plink/ plus some custom python code
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgClustalw.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgClustalw.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,60 @@
+"""
+rgclustalw.py
+wrapper for clustalw necessitated by bad choice of output path for .dnd file based on input file. Naughty.
+Copyright ross lazarus march 2011
+All rights reserved
+Licensed under the LGPL
+"""
+
+import sys,optparse,os,subprocess,tempfile,shutil
+
+class Clustrunner:
+    """
+    """
+    def __init__(self,opts=None):
+        self.opts = opts
+        self.iname = 'infile_copy'
+        shutil.copy(self.opts.input,self.iname) 
+
+    def run(self):
+        tlf = open(self.opts.outlog,'w')
+        cl = ['clustalw2 -INFILE=%s -OUTFILE=%s -OUTORDER=%s -TYPE=%s -OUTPUT=%s' % (self.iname,self.opts.output,self.opts.out_order,self.opts.dnarna,self.opts.outform)]
+        if self.opts.seq_range_end <> None and self.opts.seq_range_start <> None:
+            cl.append('-RANGE=%s,%s' % (self.opts.seq_range_start,self.opts.seq_range_end))
+        if self.opts.outform=='CLUSTAL' and self.opts.outseqnos <> None:
+            cl.append('-SEQNOS=ON')
+        process = subprocess.Popen(' '.join(cl), shell=True, stderr=tlf, stdout=tlf)
+        rval = process.wait()
+        dndf = '%s.dnd' % self.iname
+        if os.path.exists(dndf):
+            tlf.write('\nClustal created the following dnd file for your information:\n')
+            dnds = open('%s.dnd' % self.iname,'r').readlines()
+            for row in dnds:
+                tlf.write(row)
+            tlf.write('\n')
+        tlf.close()
+        os.unlink(self.iname)
+    
+
+
+if __name__ == "__main__":
+    op = optparse.OptionParser()
+    op.add_option('-i', '--input', default=None)
+    op.add_option('-o', '--output', default=None)
+    op.add_option('-t', '--outname', default="rgClustal")
+    op.add_option('-s', '--out_order', default='ALIGNMENT')
+    op.add_option('-f', '--outform', default='CLUSTAL')
+    op.add_option('-e', '--seq_range_end',default=None)
+    op.add_option('-b', '--seq_range_start',default=None)
+    op.add_option('-l','--outlog',default='rgClustalw.log')
+    op.add_option('-q', '--outseqnos',default=None)    
+    op.add_option('-d', '--dnarna',default='DNA')    
+    
+    opts, args = op.parse_args()
+    assert opts.input <> None
+    assert os.path.isfile(opts.input)
+    c = Clustrunner(opts)
+    c.run()
+    
+            
+
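The clustalw2 command that Clustrunner.run() assembles can be previewed without running anything; a sketch with hypothetical option values (Python 3):

opts = dict(input='seqs.fasta', output='aligned.aln', out_order='ALIGNED',
            dnarna='DNA', outform='CLUSTAL')   # hypothetical values
cl = ('clustalw2 -INFILE=infile_copy -OUTFILE=%(output)s -OUTORDER=%(out_order)s'
      ' -TYPE=%(dnarna)s -OUTPUT=%(outform)s' % opts)
print(cl)

Because the wrapper copies the input to infile_copy first, the .dnd guide-tree file lands next to the copy instead of next to the original dataset, which is the whole point of the wrapper.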
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgClustalw.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgClustalw.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,128 @@
+<tool id="clustalw" name="ClustalW" version="0.1">
+   <description>multiple sequence alignment program for DNA or proteins</description>
+   <command interpreter="python"> 
+    rgClustalw.py -i "$input" -o "$output" -s "$out_order" -l "$outlog" -t "$outname" -d "$dnarna"
+    #if   ($range.mode=="part")
+-b "$range.seq_range_start" -e "$range.seq_range_end"
+    #end if
+    #if ($outcontrol.outform=="clustal")
+-f "CLUSTAL"
+    #if ($outcontrol.out_seqnos=="ON")
+-q "ON"
+    #end if
+    #end if
+    #if ($outcontrol.outform=="phylip")
+-f "PHYLIP"
+    #end if
+    #if ($outcontrol.outform=="fasta")
+-f "FASTA"
+    #end if
+   </command>
+  <inputs>
+   <page>
+    <param format="fasta" name="input" type="data" label="Fasta File" />
+    <param name="outname" label="Name for output files to make it easy to remember what you did" type="text" size="50" value="Clustal_run" />
+    <param name="dnarna" type="select" label="Data Type">
+      <option value="DNA" selected="True">DNA nucleotide sequences</option>
+      <option value="PROTEIN">Protein sequences</option>
+    </param>
+    <conditional name="outcontrol">
+      <param name="outform" type="select" label="Output alignment format">
+        <option value="clustal" selected="True">Native Clustal output format</option>
+        <option value="phylip">Phylip format</option>
+        <option value="fasta">Fasta format</option>
+      </param>
+      <when value="fasta" />
+      <when value="phylip" />
+      <when value="clustal">
+       <param name="out_seqnos" type="select" label="Show residue numbers in clustal format output">
+         <option value="ON">yes</option>
+         <option value="OFF" selected="true">no</option>
+       </param>
+      </when>
+    </conditional>
+    <param name="out_order" type="select" label="Output Order">
+      <option value="ALIGNED">aligned</option>
+      <option value="INPUT">same order as input file</option>
+    </param>
+
+    <conditional name="range">
+        <param name="mode" type="select" label="Output complete alignment (or specify part to output)">
+          <option value="complete">complete alignment</option>
+          <option value="part">only part of the alignment</option>
+        </param>
+        <when value="complete">
+        </when>
+        <when value="part">    
+           <param name="seq_range_start" size="5" type="integer" value="1" label="start point" help="sequence range to write">
+           </param>
+           <param name="seq_range_end" size="5" type="integer" value="99999" label="end point" >
+           </param> 
+        </when>
+    </conditional>
+   </page>
+  </inputs>
+  <outputs>
+    <data format="clustal" name="output"  label="${outname}_output.${outcontrol.outform}">
+       <change_format>
+           <when input="outcontrol.outform" value="phylip" format="phylip" />
+           <when input="outcontrol.outform" value="fasta" format="fasta" />
+       </change_format>
+    </data>
+    <data format="txt" name="outlog"  label="${outname}_clustal_log.txt"/>
+  </outputs>
+  <tests>
+     <test>
+        <param name="input" value="rgClustal_testin.fasta" />
+      <param name = "outname" value="" />
+      <param name = "outform" value="fasta" />
+      <param name = "dnarna" value="DNA" />
+      <param name = "mode" value="complete" />
+      <param name = "out_order" value="ALIGNED" />
+      <output name="output" file="rgClustal_testout.fasta" ftype="fasta" />
+      <output name="outlog" file="rgClustal_testout.log" ftype="txt" lines_diff="5" />
+     </test>
+  </tests>
+  <help>
+
+**Note**
+
+This tool allows you to run a multiple sequence alignment with ClustalW2 (see Clustsrc_) using the default options.

+For a tutorial introduction, see ClustalW2_
+
+You can align DNA or protein sequences; the input file should contain all the sequences to be aligned, in fasta format.
+
+A log will be output to your history showing the output Clustal would normally write to standard output.
+
+The alignments will appear as a clustal format file or optionally, as phylip or fasta format files in your history. If you choose fasta as 
+the output format, you can create a 'Logo' image using the Sequence Logo tool.
+
+If Clustal format is chosen, you have the option of adding residue numbers to the output.
+
+A subsequence of the alignment can be output by setting the "Output complete alignment" parameter to "only part of the alignment" and defining the start and end points of the subsequence to be output.
+
+----
+
+**Attribution**
+
+Clustal attribution and associated documentation are available at Clustsrc_
+
+The first iteration of this Galaxy wrapper was written by Hans-Rudolf Hotz - see Clustfirst_
+
+It was modified by Ross Lazarus for the rgenetics project - tests and some additional parameters were added
+
+This wrapper is released licensed under the LGPL_
+
+.. _ClustalW2: http://www.ebi.ac.uk/2can/tutorials/protein/clustalw.html  
+
+.. _Clustsrc: http://www.clustal.org
+
+.. _Clustfirst: http://lists.bx.psu.edu/pipermail/galaxy-dev/2010-November/003732.html
+
+.. _LGPL: http://www.gnu.org/copyleft/lesser.html
+
+  </help>
+
+</tool>
+
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgEigPCA.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgEigPCA.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,342 @@
+"""
+run smartpca
+
+This uses galaxy code developed by Dan to deal with
+arbitrary output files using an html dataset with its own
+subdirectory containing the arbitrary files
+We create that html file and add all the links we need
+
+Note that we execute the smartpca.perl program in the output subdirectory
+to avoid having to clear out the job directory after running
+
+Code to convert linkage format ped files into eigenstratgeno format is left here
+in case we decide to autoconvert
+
+Added a plot in R with better labels than the default eigensoft plot december 26 2007
+
+DOCUMENTATION OF smartpca program:
+
+smartpca runs Principal Components Analysis on input genotype data and
+  outputs principal components (eigenvectors) and eigenvalues.
+  The method assumes that samples are unrelated.  (However, a small number
+  of cryptically related individuals is usually not a problem in practice
+  as they will typically be discarded as outliers.)
+
+5 different input formats are supported.  See ../CONVERTF/README
+for documentation on using the convertf program to convert between formats.
+
+The syntax of smartpca is "../bin/smartpca -p parfile".  We illustrate
+how parfile works via a toy example (see example.perl in this directory).
+This example takes input in EIGENSTRAT format.  The syntax of how to take input
+in other formats is analogous to the convertf program, see ../CONVERTF/README.
+
+The smartpca program prints various statistics to standard output.
+To redirect this information to a file, change the above syntax to
+"../bin/smartpca -p parfile >logfile".  For a description of these
+statistics, see the documentation file smartpca.info in this directory.
+
+Estimated running time of the smartpca program is
+  2.5e-12 * nSNP * NSAMPLES^2 hours            if not removing outliers.
+  2.5e-12 * nSNP * NSAMPLES^2 hours * (1+m)    if m outlier removal iterations.
+Thus, under the default of up to 5 outlier removal iterations, running time is
+  up to 1.5e-11 * nSNP * NSAMPLES^2 hours.
+
+------------------------------------------------------------------------
+
+DESCRIPTION OF EACH PARAMETER in parfile for smartpca:
+
+genotypename: input genotype file (in any format: see ../CONVERTF/README)
+snpname:      input snp file      (in any format: see ../CONVERTF/README)
+indivname:    input indiv file    (in any format: see ../CONVERTF/README)
+evecoutname:  output file of eigenvectors.  See numoutevec parameter below.
+evaloutname:  output file of all eigenvalues
+
+OPTIONAL PARAMETERS:
+
+numoutevec:     number of eigenvectors to output.  Default is 10.
+numoutlieriter: maximum number of outlier removal iterations.
+  Default is 5.  To turn off outlier removal, set this parameter to 0.
+numoutlierevec: number of principal components along which to
+  remove outliers during each outlier removal iteration.  Default is 10.
+outliersigmathresh: number of standard deviations which an individual must
+  exceed, along one of the top (numoutlierevec) principal components, in
+  order for that individual to be removed as an outlier.  Default is 6.0.
+outlieroutname: output logfile of outlier individuals removed. If not specified,
+  smartpca will print this information to stdout, which is the default.
+usenorm: Whether to normalize each SNP by a quantity related to allele freq.
+  Default is YES.  (When analyzing microsatellite data, should be set to NO.
+  See Patterson et al. 2006.)
+altnormstyle: Affects very subtle details in normalization formula.
+  Default is YES (normalization formulas of Patterson et al. 2006)
+  To match EIGENSTRAT (normalization formulas of Price et al. 2006), set to NO.
+missingmode: If set to YES, then instead of doing PCA on # reference alleles,
+  do PCA on whether each data point is missing or nonmissing.  Default is NO.
+  May be useful for detecting informative missingness (Clayton et al. 2005).
+nsnpldregress: If set to a positive integer, then LD correction is turned on,
[...]
+        print ' and the 4 integer tuning parameters k,m,t and s in order. Given that, will run smartpca for eigensoft'
+        sys.exit(1)
+    else:
+        print >> sys.stdout, 'rgEigPCA.py got %s' % (' '.join(sys.argv))
+    skillme = ' %s' % string.punctuation
+    trantab = string.maketrans(skillme,'_'*len(skillme))
+    ofname = sys.argv[5]
+    progname = os.path.basename(sys.argv[0])
+    infile = sys.argv[1]
+    infpath,base_name = os.path.split(infile) # now takes precomputed or autoconverted ldreduced dataset
+    title = sys.argv[2].translate(trantab) # must replace all of these for urls containing title
+    outfile1 = sys.argv[3]
+    newfilepath = sys.argv[4]
+    try:
+       os.makedirs(newfilepath)
+    except:
+       pass
+    op = os.path.split(outfile1)[0]
+    try: # for test - needs this done
+        os.makedirs(op)
+    except:
+        pass
+    eigen_k = sys.argv[5]
+    eigen_m = sys.argv[6]
+    eigen_t = sys.argv[7]
+    eigen_s = sys.argv[8]
+    eigpca = sys.argv[9] # path to new dataset for pca results - for later adjustment
+    eigentitle = os.path.join(newfilepath,title)
+    explanations=['Samples plotted in first 2 eigenvector space','Principal components','Eigenvalues',
+    'Smartpca log (contents shown below)']
+    rplotname = 'PCAPlot.pdf'
+    eigenexts = [rplotname, "pca.xls", "eval.xls"]
+    newfiles = ['%s_%s' % (title,x) for x in eigenexts] # produced by eigenstrat
+    rplotout = os.path.join(newfilepath,newfiles[0]) # for R plots
+    eigenouts = [x for x in newfiles]
+    eigenlogf = '%s_log.txt' % title
+    newfiles.append(eigenlogf) # so it will also appear in the links
+    lfname = outfile1
+    lf = file(lfname,'w')
+    lf.write(galhtmlprefix % progname)
+    try:
+        os.makedirs(newfilepath)
+    except:
+        pass
+    smartCL = '%s -i %s.bed -a %s.bim -b %s.fam -o %s -p %s -e %s -l %s -k %s -m %s -t %s -s %s' % \
+          (smartpca,infile, infile, infile, eigenouts[1],'%s_eigensoftplot.pdf' % title,eigenouts[2],eigenlogf, \
+           eigen_k, eigen_m, eigen_t, eigen_s)
+    env = os.environ
+    p=subprocess.Popen(smartCL,shell=True,cwd=newfilepath)
+    retval = p.wait()
+    # copy the eigenvector output file needed for adjustment to the user's eigenstrat library directory
+    elog = file(os.path.join(newfilepath,eigenlogf),'r').read()
+    eeigen = os.path.join(newfilepath,'%s.evec' % eigenouts[1]) # need these for adjusting
+    try:
+        eigpcaRes = file(eeigen,'r').read()
+    except:
+        eigpcaRes = ''
+    file(eigpca,'w').write(eigpcaRes)
+    makePlot(eigpca=eigpca,pdfname=newfiles[0],title=title,nfp=newfilepath,rexe=rexe)
+    s = 'Output from %s run at %s<br/>\n' % (progname,timenow())
+    lf.write('<h4>%s</h4>\n' % s)
+    lf.write('newfilepath=%s, rexe=%s' % (newfilepath,rexe))
+    lf.write('(click on the image below to see a much higher quality PDF version)')
+    thumbnail = '%s.png' % newfiles[0] # foo.pdf.png - who cares?
+    if os.path.exists(os.path.join(newfilepath,thumbnail)):
+        lf.write('<table border="0" cellpadding="10" cellspacing="10"><tr><td>\n')
+        lf.write('<a href="%s"><img src="%s" alt="%s" hspace="10" align="left" /></a></td></tr></table><br/>\n' \
+            % (newfiles[0],thumbnail,explanations[0]))
+    allfiles = os.listdir(newfilepath)
+    allfiles.sort()
+    sizes = [getfSize(x,newfilepath) for x in allfiles]
+    lallfiles = ['<li><a href="%s">%s %s</a></li>\n' % (x,x,sizes[i]) for i,x in enumerate(allfiles)] # html list
+    lf.write('<div class="document">All Files:<ol>%s</ol></div>' % ''.join(lallfiles))
+    lf.write('<div class="document">Log %s contents follow below<p/>' % eigenlogf)
+    lf.write('<pre>%s</pre></div>' % elog) # the eigenlog
+    s = 'If you need to rerun this analysis, the command line used was\n%s\n<p/>' % (smartCL)
+    lf.write(s)
+    lf.write(galhtmlpostfix) # end galhtmlprefix div
+    lf.close()
+
+
+if __name__ == "__main__":
+   runEigen()
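The running-time estimate quoted in the smartpca notes above is easy to turn into a helper; a sketch (Python 3, example numbers are arbitrary):

def smartpca_hours(n_snp, n_samples, m_outlier_iters=5):
    # 2.5e-12 * nSNP * NSAMPLES^2 * (1 + m) hours, per the smartpca docs above
    return 2.5e-12 * n_snp * n_samples ** 2 * (1 + m_outlier_iters)

print(smartpca_hours(100000, 1000))   # -> 1.5 hours at the default 5 outlier iterations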
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgEigPCA.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgEigPCA.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,167 @@
+<tool id="rgEigPCA1" name="Eigensoft:">
+    <description>PCA Ancestry using SNP</description>
+
+    <command interpreter="python">
+    rgEigPCA.py "$i.extra_files_path/$i.metadata.base_name" "$title" "$out_file1"
+    "$out_file1.files_path" "$k" "$m" "$t" "$s" "$pca"
+    </command>
+
+    <inputs>
+
+       <param name="i"  type="data" label="Input genotype data file"
+          size="120" format="ldindep" />
+       <param name="title"  type="text" value="Ancestry PCA" label="Title for outputs from this run"
+          size="80"  />
+       <param name="k"  type="integer" value="4" label="Number of principal components to output"
+          size="3"  />
+       <param name="m"  type="integer" value="0" label="Max. outlier removal iterations"
+          help="To turn on outlier removal, set m=5 or so. Do this if you plan on adjusting any analyses"
+          size="3"  />
+       <param name="t"  type="integer" value="5" label="# principal components used for outlier removal"
+          size="3"  />
+       <param name="s"  type="integer" value="6" label="#SDs for outlier removal"
+          help = "Any individual with SD along one of k top principal components > s will be removed as an outlier."
+          size="3"  />
+
+   </inputs>
+
+   <outputs>
+       <data name="out_file1" format="html" label="${title}_rgEig.html"/>
+       <data name="pca" format="txt" label="${title}_rgEig.txt"/>
+   </outputs>
+
+<tests>
+ <test>
+   <param name='i' value='tinywga' ftype='ldindep' >
+   <metadata name='base_name' value='tinywga' />
+   <composite_data value='tinywga.bim' />
+   <composite_data value='tinywga.bed' />
+   <composite_data value='tinywga.fam' />
+   <edit_attributes type='name' value='tinywga' /> 
+   </param>
+    <param name='title' value='rgEigPCAtest1' />
+    <param name="k" value="4" />
+    <param name="m" value="2" />
+    <param name="t" value="2" />
+    <param name="s" value="2" />
+    <output name='out_file1' file='rgtestouts/rgEigPCA/rgEigPCAtest1.html' ftype='html' compare='diff' lines_diff='195'>
+    <extra_files type="file" name='rgEigPCAtest1_PCAPlot.pdf' value="rgtestouts/rgEigPCA/rgEigPCAtest1_PCAPlot.pdf" compare="sim_size" delta="3000"/>
+    </output>
+    <output name='pca' file='rgtestouts/rgEigPCA/rgEigPCAtest1.txt' compare='diff'/>
+ </test>
+</tests>
+
+<help>
+
+
+**Syntax**
+
+- **Genotype data** is an input genotype dataset in Plink lped (http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml) format. See below for notes
+- **Title** is used to name the output files so you can remember what the outputs are for
+- **Tuning parameters** are documented in the Eigensoft (http://genepath.med.harvard.edu/~reich/Software.htm) documentation - see below 
+
+
+-----
+
+**Summary**
+
+Eigensoft requires ld-reduced genotype data. 
+Galaxy has an automatic converter for genotype data in Plink linkage pedigree (lped) format.
+For details of this generic genotype format, please see the Plink documentation at 
+http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml
+
+Reading that documentation, you'll see that the linkage pedigree format is really two related files with the same 
+file base name - a map and ped file - eg 'mygeno.ped' and 'mygeno.map'.
+The map file has the chromosome, snp name, genetic offset and physical offset corresponding to each
+genotype stored as separate alleles in the ped file. The ped file has family id, individual id, father id (or 0), mother id
+(or 0), gender (1=male, 2=female, 0=unknown) and affection (1=unaffected, 2=affected, 0=unknown), 
+then two separate allele columns for each genotype. 
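+
+For illustration only, here is a hypothetical two-marker, two-subject lped pair (made-up IDs and
+positions), laid out as just described::
+
+  mygeno.map:   1  rs0001  0  10000
+                1  rs0002  0  20000
+
+  mygeno.ped:   fam1  subj1  0  0  1  1  A A  G T
+                fam1  subj2  0  0  2  2  A C  T T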
+
+Once you have your data in the right format, you can upload those into your Galaxy history using the "upload" tool.
+
+To upload your lped data in the upload tool, choose 'lped' as the 'file format'. The tool form will change to 
+allow you to navigate to and select each member of the pair of  ped and map files stored on your local computer
+(or available at a public URL for Galaxy to grab). 
+Give the dataset a meaningful name (replace rgeneticsData with something more useful!) and click execute. 
+
+When the upload is done, your new lped format dataset will appear in your history and then, 
+when you choose the ancestry tool, that history dataset will be available as input.
+
+**Warning for the Impatient**
+
+When you execute the tool, it will look like it has not started running for a while as the automatic converter 
+reduces the amount of LD - otherwise eigenstrat gives biased results.
+
+
+**Attribution**
+
+This tool runs and relies on the work of many others, including the
+maintainers of the Eigensoft program, and the R and
+Bioconductor projects. For full attribution, source code and documentation, please see
+http://genepath.med.harvard.edu/~reich/Software.htm, http://cran.r-project.org/
+and http://www.bioconductor.org/ respectively
+
+This implementation is a Galaxy tool wrapper around these third party applications.
+It was originally designed and written for family based data from the CAMP Illumina run of 2007 by
+ross lazarus (ross.lazarus@gmail.com) and incorporated into the rgenetics toolkit.
+
+copyright Ross Lazarus 2007
+Licensed under the terms of the LGPL as documented http://www.gnu.org/licenses/lgpl.html
+but is about as useful as a sponge boat without EIGENSOFT pca code.
+
+**README from eigensoft2 distribution at http://genepath.med.harvard.edu/~reich/Software.htm**
+
+[rerla@beast eigensoft2]$ cat README
+EIGENSOFT version 2.0, January 2008 (for Linux only)
+
+This is the same as our EIGENSOFT 2.0 BETA release with a few recent changes
+as described at http://genepath.med.harvard.edu/~reich/New_In_EIGENSOFT.htm.
+
+Features of EIGENSOFT version 2.0 include:
+-- Keeping track of ref/var alleles in all file formats: see CONVERTF/README
+-- Handling data sets up to 8 billion genotypes: see CONVERTF/README
+-- Output SNP weightings of each principal component: see POPGEN/README
+
+The EIGENSOFT package implements methods from the following 2 papers:
+Patterson N. et al. 2006 PLoS Genetics in press (population structure)
+Price A.L. et al. 2006 NG 38:904-9 (EIGENSTRAT stratification correction)
+
+See POPGEN/README for documentation of population structure programs.
+
+See EIGENSTRAT/README for documentation of EIGENSTRAT programs.
+
+See CONVERTF/README for documentation of programs for converting file formats.
+
+
+Executables and source code:
+----------------------------
+All C executables are in the bin/ directory.
+
+We have placed source code for all C executables in the src/ directory,
+for users who wish to modify and recompile our programs.  For example, to
+recompile the eigenstrat program, type
+"cd src"
+"make eigenstrat"
+"mv eigenstrat ../bin"
+
+Note that some of our software will only compile if your system has the
+lapack package installed.  (This package is used to compute eigenvectors.)
+Some users may need to change "blas-3" to "blas" in the Makefile,
+depending on how blas and lapack are installed.
+
+If cc is not available on your system, try "cp Makefile.alt Makefile"
+and then recompile.
+
+If you have trouble compiling and running our code, try compiling and
+running the pcatoy program in the src directory:
+"cd src"
+"make pcatoy"
+"./pcatoy"
+If you are unable to run the pcatoy program successfully, please contact
+your system administrator for help, as this is a systems issue which is
+beyond our scope.  Your system administrator will be able to troubleshoot
+your systems issue using this trivial program.  [You can also try running
+the pcatoy program in the bin directory, which we have already compiled.]
+</help>
+</tool>
+
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgFastQC.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgFastQC.py Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,149 @@
+"""
+wrapper for fastqc
+
+called as
+  <command interpreter="python">
+    rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix"
+  </command>
+
+
+
+Current release seems overly intolerant of sam/bam header strangeness
+Author notified...
+
+
+"""
+
+import os,sys,subprocess,optparse,shutil,tempfile
+from rgutils import getFileString
+
+class FastQC():
+    """wrapper
+    """
+    
+    
+    def __init__(self,opts=None):
+        assert opts <> None
+        self.opts = opts
+        
+        
+    def run_fastqc(self):
+        """
+        In batch mode fastqc does not behave very nicely - it will write to a new folder,
+        named [infilebasename]_fastqc, in the same place as the infile
+    rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc
+    duplication_levels.png  fastqc_icon.png          per_base_n_content.png         per_sequence_gc_content.png       summary.txt
+    error.png               fastqc_report.html       per_base_quality.png           per_sequence_quality.png          tick.png
+    fastqc_data.txt         per_base_gc_content.png  per_base_sequence_content.png  sequence_length_distribution.png  warning.png
+
+        """
+        dummy,tlog = tempfile.mkstemp(prefix='rgFastQClog')
+        sout = open(tlog, 'w')
+        fastq = os.path.basename(self.opts.input)
+        cl = [self.opts.executable,'-o %s' % self.opts.outputdir]
+        if self.opts.informat in ['sam','bam']:
+            cl.append('-f %s' % self.opts.informat)
+        if self.opts.contaminants <> None :
+            cl.append('-c %s' % self.opts.contaminants)
+        cl.append(self.opts.input)
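+        # the assembled command line is e.g.
+        #   fastqc -o <outputdir> -f bam -c <contaminants file> <input>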
+        p = subprocess.Popen(' '.join(cl), shell=True, stderr=sout, stdout=sout, cwd=self.opts.outputdir)
+        return_value = p.wait()
+        sout.close()
+        runlog = open(tlog,'r').readlines()
+        os.unlink(tlog)
+        flist = os.listdir(self.opts.outputdir) # fastqc plays games with its output directory name. eesh
+        odpath = None
+        for f in flist:
+            d = os.path.join(self.opts.outputdir,f)
+            if os.path.isdir(d):
+                if d.endswith('_fastqc'):
+                    odpath = d 
+        hpath = None
+        if odpath <> None:
+            try:
+                hpath = os.path.join(odpath,'fastqc_report.html')
+                rep = open(hpath,'r').readlines() # for our new html file but we need to insert our stuff after the <body> tag
+            except:
+                hpath = None # report missing or unreadable - fall through to the error report below rather than hitting a NameError on rep
+        if hpath == None:        
+            res =  ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath,hpath),]
+            res += runlog
+            res += ['</pre>\n',
+                   'Please read the above for clues<br/>\n',
+                   'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
+                   'It is also possible that the log shows that fastqc is not installed?<br/>\n',
+                   'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
+                   'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n',]
+            return res
+        self.fix_fastqcimages(odpath)
+        flist = os.listdir(self.opts.outputdir) # these have now been fixed
+        excludefiles = ['tick.png','warning.png','fastqc_icon.png','error.png']
+        flist = [x for x in flist if not x in excludefiles]
+        for i in range(len(rep)): # need to fix links to Icons and Images subdirectories in latest fastqc code - ugh
+            rep[i] = rep[i].replace('Icons/','')
+            rep[i] = rep[i].replace('Images/','')
+
+        html = self.fix_fastqc(rep,flist,runlog)
+        return html
+        
+
+        
+    def fix_fastqc(self,rep=[],flist=[],runlog=[]):
+        """ add some of our stuff to the html
+        """
+        bs = '</body></html>\n' # hope they don't change this
+        try:
+            bodyindex = rep.index(bs)
+        except:
+            bodyindex = len(rep) - 1
+        res = []
+        res.append('<table>\n')
+        flist.sort()
+        for f in flist: # entries are relative to the output directory
+             if not(os.path.isdir(os.path.join(self.opts.outputdir,f))):
+                 fn = os.path.split(f)[-1]
+                 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,getFileString(fn, self.opts.outputdir)))
+        res.append('</table><p/>\n') 
+        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
+        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://rgenetics.org for details and licensing\n')
+        fixed = rep[:bodyindex] + res + rep[bodyindex:]
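+        # our table and links are spliced in just before the closing </body></html> line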
+        return fixed # with our additions
+
+
+    def fix_fastqcimages(self,odpath):
+        """ Galaxy wants everything in the same files_dir
+        """
+        icpath = os.path.join(odpath,'Icons')
+        impath = os.path.join(odpath,'Images')
+        for adir in [icpath,impath,odpath]:
+            if os.path.exists(adir):
+                flist = os.listdir(adir) # get all files created
+                for f in flist:
+                   if not os.path.isdir(os.path.join(adir,f)):
+                       sauce = os.path.join(adir,f)
+                       dest = os.path.join(self.opts.outputdir,f)
+                       shutil.move(sauce,dest)
+                os.rmdir(adir)
+
+    
+
+if __name__ == '__main__':
+    op = optparse.OptionParser()
+    op.add_option('-i', '--input', default=None)
+    op.add_option('-o', '--htmloutput', default=None)
+    op.add_option('-d', '--outputdir', default="/tmp/shortread")
+    op.add_option('-f', '--informat', default='fastq')
+    op.add_option('-n', '--namejob', default='rgFastQC')
+    op.add_option('-c', '--contaminants', default=None)
+    op.add_option('-e', '--executable', default='fastqc')
+    opts, args = op.parse_args()
+    assert opts.input <> None
+    assert os.path.isfile(opts.executable),'##rgFastQC.py error - cannot find executable %s' % opts.executable
+    if not os.path.exists(opts.outputdir): 
+        os.makedirs(opts.outputdir)
+    f = FastQC(opts)
+    html = f.run_fastqc()
+    f = open(opts.htmloutput, 'w')
+    f.write(''.join(html))
+    f.close()
+    
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgFastQC.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgFastQC.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,88 @@
+<tool name="Fastqc: Fastqc QC" id="fastqc" version="0.1">
+  <description>using FastQC from Babraham</description>
+  <command interpreter="python">
+    rgFastQC.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix" -f $input_file.ext -e ${GALAXY_DATA_INDEX_DIR}/shared/jars/FastQC/fastqc
+#if $contaminants.dataset and str($contaminants) > ''
+-c "$contaminants"
+#end if
+  </command>
+  <requirements>
+    <requirement type="package">FastQC</requirement>
+  </requirements>
+  <inputs>
+    <param format="fastqsanger,fastq,bam,sam" name="input_file" type="data" label="Short read data from your current history" />
+    <param name="out_prefix" value="FastQC" type="text" label="Title for the output file - to remind you what the job was for" size="80" />
+    <param name="contaminants" type="data" format="tabular" optional="true" label="Contaminant list" 
+           help="tab delimited file with 2 columns: name and sequence.  For example: Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA"/>
+  </inputs>
+  <outputs>
+    <data format="html" name="html_file"  label="${out_prefix}.html" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_file" value="1000gsample.fastq" />
+      <param name="out_prefix" value="fastqc_out" />
+      <param name="contaminants" value="fastqc_contaminants.txt" ftype="tabular" />
+      <output name="html_file" file="fastqc_report.html" ftype="html" lines_diff="100"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**Purpose**
+
+FastQC aims to provide a simple way to do some quality control checks on raw
+sequence data coming from high throughput sequencing pipelines. 
+It provides a modular set of analyses which you can use to give a quick
+impression of whether your data has any problems of 
+which you should be aware before doing any further analysis.
+
+The main functions of FastQC are:
+
+- Import of data from BAM, SAM or FastQ files (any variant)
+- Providing a quick overview to tell you in which areas there may be problems
+- Summary graphs and tables to quickly assess your data
+- Export of results to an HTML based permanent report
+- Offline operation to allow automated generation of reports without running the interactive application
+
+**FastQC documentation**
+
+This is a Galaxy interface to the external package FastQC_.
+Specific documentation on FastQC can be found on that site.
+FastQC incorporates the Picard-tools_ libraries for sam/bam processing.
+
+ .. _FastQC: http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/
+ .. _Picard-tools: http://picard.sourceforge.net/index.shtml
+
+The contaminants file parameter was borrowed from the independently developed
+fastqcwrapper contributed to the Galaxy Community Tool Shed by J. Johnson.
+
+-----
+
+.. class:: infomark
+
+**Inputs and outputs**
+
+This wrapper will accept any fastq file as well as sam or bam as the primary file to check.
+It will also take an optional file containing a list of contaminants information, in the form of
+a tab-delimited file with 2 columns, name and sequence.
+
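+For example, a one-line contaminants file (the two columns are tab separated) might contain::
+
+  Illumina Small RNA RT Primer	CAAGCAGAAGACGGCATACGA
+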
+The tool produces a single HTML output file that contains all of the results, including the following:
+
+- Basic Statistics
+- Per base sequence quality
+- Per sequence quality scores
+- Per base sequence content
+- Per base GC content
+- Per sequence GC content
+- Per base N content
+- Sequence Length Distribution
+- Sequence Duplication Levels
+- Overrepresented sequences
+- Kmer Content
+
+All except Basic Statistics and Overrepresented sequences are plots.
+
+</help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgGLM.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgGLM.py Fri Mar 09 19:37:19 2012 -0500
[
b'@@ -0,0 +1,287 @@\n+#!/usr/local/bin/python\n+"""\n+# added most of the available options for linear models\n+# june 2009 rml\n+# hack to run and process a plink quantitative trait\n+#\n+\n+This is a wrapper for Shaun Purcell\'s Plink linear/logistic models for\n+traits, covariates and genotypes.\n+\n+It requires some judgement to interpret the findings\n+We need some better visualizations - manhattan plots are good.\n+svg with rs numbers for top 1%?\n+\n+toptable tools - truncate a gg file down to some low percentile\n+\n+intersect with other tables - eg gene expression regressions on snps\n+\n+\n+\n+"""\n+\n+import sys,math,shutil,subprocess,os,string,tempfile,shutil,commands\n+from rgutils import plinke\n+\n+def makeGFF(resf=\'\',outfname=\'\',logf=None,twd=\'.\',name=\'track name\',description=\'track description\',topn=1000):\n+    """\n+    score must be scaled to 0-1000\n+    \n+    Want to make some wig tracks from each analysis\n+    Best n -log10(p). Make top hit the window.\n+    we use our tab output which has\n+    rs\tchrom\toffset\tADD_stat\tADD_p\tADD_log10p\n+    rs3094315\t1\t792429\t1.151\t0.2528\t0.597223\n+\n+    """\n+\n+    def is_number(s):\n+        try:\n+            float(s)\n+            return True\n+        except ValueError:\n+            return False\n+    header = \'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\\n\' % (name,description)          \n+    column_names = [ \'Seqname\', \'Source\', \'Feature\', \'Start\', \'End\', \'Score\', \'Strand\', \'Frame\', \'Group\' ]\n+    halfwidth=100\n+    resfpath = os.path.join(twd,resf)\n+    resf = open(resfpath,\'r\')\n+    resfl = resf.readlines() # dumb but convenient for millions of rows\n+    resfl = [x.split() for x in resfl]\n+    headl = resfl[0]\n+    resfl = resfl[1:]\n+    headl = [x.strip().upper() for x in headl]\n+    headIndex = dict(zip(headl,range(0,len(headl))))\n+    chrpos = headIndex.get(\'CHROM\',None)\n+    rspos = headIndex.get(\'RS\',None)\n+    offspos = headIndex.get(\'OFFSET\',None)\n+    ppos = headIndex.get(\'ADD_LOG10P\',None)\n+    wewant = [chrpos,rspos,offspos,ppos]\n+    if None in wewant: # missing something\n+       logf.write(\'### Error missing a required header in makeGFF - headIndex=%s\\n\' % headIndex)\n+       return\n+    resfl = [x for x in resfl if x[ppos] > \'\']\n+    resfl = [(float(x[ppos]),x) for x in resfl] # decorate\n+    resfl.sort()\n+    resfl.reverse() # using -log10 so larger is better\n+    resfl = resfl[:topn] # truncate\n+    pvals = [x[0] for x in resfl] # need to scale\n+    resfl = [x[1] for x in resfl] # drop decoration\n+    if len(pvals) == 0:\n+        logf.write(\'### no pvalues found in resfl - %s\' % (resfl[:3]))\n+        sys.exit(1)\n+    maxp = max(pvals) # need to scale\n+    minp = min(pvals)\n+    prange = abs(maxp-minp) + 0.5 # fudge\n+    scalefact = 1000.0/prange\n+    logf.write(\'###maxp=%f,minp=%f,prange=%f,scalefact=%f\\n\' % (maxp,minp,prange,scalefact))\n+    for i,row in enumerate(resfl):\n+        row[ppos] = \'%d\' % (int(scalefact*pvals[i]))\n+        resfl[i] = row # replace\n+    outf = file(outfname,\'w\')\n+    outf.write(header)\n+    outres = [] # need to resort into chrom offset order\n+    for i,lrow in enumerate(resfl):\n+        chrom,snp,offset,p, = [lrow[x] for x in wewant]\n+        gff = (\'chr%s\' % chrom,\'rgGLM\',\'variation\',\'%d\' % (int(offset)-halfwidth),\n+               \'%d\' % (int(offset)+halfwidth),p,\'.\',\'.\',\'%s logp=%1.2f\' % (snp,pvals[i]))\n+        
outres.append(gff)\n+    outres = [(x[0],int(x[3]),x) for x in outres] # decorate\n+    outres.sort() # into chrom offset\n+    outres=[x[2] for x in outres] # undecorate\n+    outres = [\'\\t\'.join(x) for x in outres]    \n+    outf.write(\'\\n\'.join(outres))\n+    outf.write(\'\\n\')\n+    outf.close()\n+\n+\n+\n+def xformQassoc(resf=\'\',outfname=\'\',logf=None,twd=\'.\'):\n+    """\tplink.assoc.linear to gg file\n+from the docs\n+The output per each SNP might look something like:\n+\n+    CHR        SNP      BP  A1       TEST   NMISS       OR      STAT         P\n+      5   rs000001   10001   A        ADD     664   0.7806    -1.942   0.05216\n+      '..b'np)\n+    # now have various tests indexed by rs\n+    tk = resdict.keys()\n+    tk.sort() # tests\n+    ohead = [\'rs\',\'chrom\',\'offset\']\n+    for t in tk: # add headers\n+        ohead.append(\'%s_stat\' % t)\n+        ohead.append(\'%s_p\' % t)\n+        ohead.append(\'%s_log10p\' % t)\n+    oheads = \'\\t\'.join(ohead)\n+    res = [oheads,]\n+    for snp in markerlist: # retain original order\n+        chrom,offset = rsdict[snp]\n+        outl = [snp,chrom,offset]\n+        for t in tk:\n+            outl += resdict[t][snp] # add stat,p for this test\n+        outs = \'\\t\'.join(outl)\n+        res.append(outs)\n+    f = file(outfname,\'w\')\n+    res.append(\'\')\n+    f.write(\'\\n\'.join(res))\n+    f.close()\n+\n+                \n+if __name__ == "__main__":\n+    """\n+\n+    <command interpreter="python">   \n+        rgGLM.py \'$i.extra_files_path/$i.metadata.base_name\' \'$phef.extra_files_path/$phef.metadata.base_name\'\n+        "$title1" \'$predvar\' \'$covar\' \'$out_file1\' \'$logf\' \'$i.metadata.base_name\'\n+        \'$inter\' \'$cond\' \'$gender\' \'$mind\' \'$geno\' \'$maf\' \'$logistic\' \'$wigout\'\n+    </command>\n+    """\n+    topn = 1000\n+    killme = string.punctuation+string.whitespace\n+    trantab = string.maketrans(killme,\'_\'*len(killme))\n+    if len(sys.argv) < 17:\n+       s = \'rgGLM.py needs 17 params - got %s \\n\' % (sys.argv)\n+       sys.stderr.write(s) # print >>,s would probably also work?\n+       sys.exit(0)\n+    blurb = \'rgGLM.py called with %s\' % sys.argv\n+    print >> sys.stdout,blurb\n+    bfname = sys.argv[1]\n+    phename = sys.argv[2]\n+    title = sys.argv[3]\n+    title.translate(trantab)\n+    predvar = sys.argv[4]\n+    covar = sys.argv[5].strip()\n+    outfname = sys.argv[6] \n+    logfname = sys.argv[7]\n+    op = os.path.split(logfname)[0]\n+    try: # for test - needs this done\n+        os.makedirs(op)\n+    except:\n+        pass    \n+    basename = sys.argv[8].translate(trantab)\n+    inter = sys.argv[9] == \'1\'\n+    cond = sys.argv[10].strip()\n+    if cond == \'None\':\n+        cond = \'\'\n+    gender = sys.argv[11] == \'1\'\n+    mind = sys.argv[12]\n+    geno = sys.argv[13]\n+    maf = sys.argv[14]\n+    logistic = sys.argv[15].strip()==\'1\'\n+    gffout = sys.argv[16]\n+    me = sys.argv[0]\n+    phepath = \'%s.pphe\' % phename\n+    twd = tempfile.mkdtemp(suffix=\'rgGLM\') # make sure plink doesn\'t spew log file into the root!\n+    tplog = os.path.join(twd,\'%s.log\' % basename) # should be path to plink log\n+    vcl = [plinke,\'--noweb\',\'--bfile\',bfname,\'--pheno-name\',\'"%s"\' % predvar,\'--pheno\',\n+           phepath,\'--out\',basename,\'--mind %s\' % mind, \'--geno %s\' % geno,\n+           \'--maf %s\' % maf]\n+    if logistic:\n+        vcl.append(\'--logistic\')\n+        resf = \'%s.assoc.logistic\' % basename # plink output is 
here we hope\n+    else:\n+        vcl.append(\'--linear\')\n+        resf = \'%s.assoc.linear\' % basename # plink output is here we hope\n+    resf = os.path.join(twd,resf)\n+    if gender:\n+        vcl.append(\'--sex\')\n+    if inter:\n+        vcl.append(\'--interaction\')\n+    if covar > \'None\':\n+        vcl += [\'--covar\',phepath,\'--covar-name\',covar] # comma sep list of covariates\n+    tcfile = None\n+    if len(cond) > 0: # plink wants these in a file..\n+        dummy,tcfile = tempfile.mkstemp(suffix=\'condlist\') #\n+        f = open(tcfile,\'w\')\n+        cl = cond.split()\n+        f.write(\'\\n\'.join(cl))\n+        f.write(\'\\n\')\n+        f.close()\n+        vcl.append(\'--condition-list %s\' % tcfile)\n+    p=subprocess.Popen(\' \'.join(vcl),shell=True,cwd=twd)\n+    retval = p.wait()\n+    if tcfile:\n+        os.unlink(tcfile)\n+    plinklog = file(tplog,\'r\').read()\n+    logf = file(logfname,\'w\')\n+    logf.write(blurb)\n+    logf.write(\'\\n\')\n+    logf.write(\'vcl=%s\\n\' % vcl)\n+    xformQassoc(resf=resf,outfname=outfname,logf=logf,twd=twd) # leaves the desired summary file\n+    makeGFF(resf=outfname,outfname=gffout,logf=logf,twd=twd,name=\'rgGLM_TopTable\',description=title,topn=topn)\n+    logf.write(\'\\n\')\n+    logf.write(plinklog)\n+    logf.close()\n+    #shutil.rmtree(twd) # clean up\n+\n+\n+\n+\n+\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgGLM.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgGLM.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,146 @@
+<tool id="rgGLM1" name="Linear Models:" version="0.2">
+    <description>for genotype data</description>
+    <code file="rgGLM_code.py"/>
+    <command interpreter="python">
+        rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name'
+        "$title" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name'
+        '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$gffout'
+    </command>
+
+    <inputs>
+      <page>
+       <param name='title' label='Title for outputs' type='text' value='GLM' size="80" />
+       <param name="i" type="data" format="pbed" label="Genotype file" size="80"  />
+       <param name="phef"  type="data" format="pphe" label="Phenotype file" size="80"
+       help="Dependent variable and covariates will be chosen from this file on the next page"/>
+       <param name="logistic" type="text" value = "0" label="1=Use a logistic model (trait must be 1/2 coded like affection)"
+       help="Please read the Plink documentation about this option"  />
+       <param name="gender" type="text" value = "0" label="1=Add a gender term to model"  />
+       <param name='inter' label='1=Build an interaction model - please read the docs carefully before using this'
+         type='text' value='0' size="1" />
+       <param name="cond"  type="text"  area='true' size='15x20' value = ""
+       label="condition on this whitespace delimited rs (snp id) list"  />
+       <param name="mind" type="float" value = "0.1" label="Remove subjects with missing genotypes gt (eg 0.1)"
+       help = "Set to 1 to include all subjects in the input file" />
+       <param name="geno"  type="float" value = "0.1" label="Remove markers with missing genotypes gt (eg 0.1)"
+       help = "Set to 1 to include all markers in the input file"  />
+       <param name="maf"  type="float" value = "0.01" label="Remove markers with MAF lt (eg 0.01) "
+       help = "Set to 0 to include all markers in the input file"/>
+      </page>
+      <page>
+       <param name="predvar" size="80"  type="select" label="Dependent Trait"
+       dynamic_options="get_phecols(phef=phef,selectOne=1)"  display="radio" multiple="false"
+       help="Model this characteristic in terms of subject snp genotypes - eg rare allele dosage for additive model" />
+       <param name="covar" size="80"  type="select" label="Covariates"
+       dynamic_options="get_phecols(phef=phef,selectOne=0)" multiple="true" display="checkboxes"
+       help="Use these phenotypes as covariates in models of snp dosage effects on the dependent trait"/>
+      </page>
+   </inputs>
+
+   <outputs>
+       <data format="tabular" name="out_file1" label="${title}_rgGLM.xls"/>
+       <data format="txt" name="logf" label="${title}_rgGLMlog.txt" />
+       <data format="gff" name="gffout"  label="${title}_rgGLM.gff"/>
+   </outputs>
+<tests>
+ <test>
+  <param name='i' value='tinywga' ftype='pbed' >
+   <metadata name='base_name' value='tinywga' />
+   <composite_data value='tinywga.bim' />
+   <composite_data value='tinywga.bed' />
+   <composite_data value='tinywga.fam' />
+   <edit_attributes type='name' value='tinywga' /> 
+ </param>
+ <param name='phef' value='tinywga' ftype='pphe' >
+   <metadata name='base_name' value='tinywga' />
+   <composite_data value='tinywga.pphe' />
+   <edit_attributes type='name' value='tinywga' /> 
+ </param>
+ <param name='title' value='rgGLMtest1' />
+ <param name='predvar' value='c1' />
+ <param name='covar' value='None' />
+ <param name='inter' value='0' />
+ <param name='cond' value='' />
+ <param name='gender' value='0' />
+ <param name='mind' value='1.0' />
+ <param name='geno' value='1.0' />
+ <param name='maf' value='0.0' />
+ <param name='logistic' value='0' />
+ <output name='out_file1' file='rgGLMtest1_GLM.xls' ftype='tabular' compare="diff" />
+ <output name='logf' file='rgGLMtest1_GLM_log.txt' ftype='txt' compare="diff" lines_diff='36'/>
+ <output name='gffout' file='rgGLMtest1_GLM_topTable.gff' compare="diff" ftype='gff' />
+ </test>
+</tests>
+<help>
+
+.. class:: infomark
+
+**Syntax**
+
+Note this is a two page tool - you will choose the dependent trait and covariates
+on the second page, based on the phenotype file chosen on the first page.
+
+- **Genotype file** is the input Plink format compressed genotype (pbed) file
+- **Phenotype file** is the input Plink phenotype (pphe) file with FAMID IID followed by phenotypes
+- **Dependent variable** is the term on the left of the model and is chosen from the pphe columns on the second page
+- **Logistic** if you are (eg) using disease status as the outcome variable (case/control) - otherwise the model is linear.
+- **Covariates** are covariate terms on the right of the model, also chosen on the second page
+- **Interactions** will add interactions - please be careful how you interpret these - see the Plink documentation.
+- **Gender** will add gender as a model term - described in the Plink documentation
+- **Condition** will condition the model on one or more specific SNP rs ids as a whitespace delimited sequence
+- **Format** determines how your data will be returned to your Galaxy workspace
+
+-----
+
+.. class:: infomark
+
+**Summary**
+
+This tool will test GLM models for SNP predicting a dependent phenotype
+variable with adjustment for specified covariates.
+
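+Under the hood this wrapper drives Plink. Depending on the options chosen, the generated call
+resembles the following sketch (file, trait and covariate names here are illustrative only)::
+
+  plink --noweb --bfile mygeno --pheno-name "trait" --pheno mypheno.pphe --out mygeno \
+        --mind 0.1 --geno 0.1 --maf 0.01 --linear --covar mypheno.pphe --covar-name age,sex
+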
+If you don't see the genotype or phenotype data set you want here, it can be imported using
+one of the methods available from the rg get data tool group.
+
+Output format can be UCSC .bed if you want to see one column of your
+results as a fully fledged UCSC genome browser track. A map file containing the chromosome and offset for each marker is
+required for writing this kind of output.
+Alternatively you can use .gg for the UCSC Genome Graphs tool which has all of the advantages
+of the .bed track, plus a neat, visual front end that displays a lot of useful clues.
+Either of these are a very useful way of quickly getting a look
+at your data in full genomic context.
+
+Finally, if you can't live without spreadsheet data, choose the .xls tab delimited format.
+It's not a stupid binary excel file - just a plain old tab delimited one with a header.
+Fortunately excel is dumb enough to open these without much protest.
+
+-----
+
+.. class:: infomark
+
+**Attribution**
+
+This Galaxy tool relies on Plink (see Plinksrc_) to test GLM models. 
+
+So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see
+DOC_
+
+Tool and Galaxy datatypes originally designed and written for the Rgenetics
+series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com)
+
+Copyright Ross Lazarus March 2007
+This Galaxy wrapper is licensed under the LGPL_ but is about as useful as a chocolate teapot without Plink which is GPL.
+
+I'm no lawyer, but it looks like you got GPL if you use this software. Good luck.
+
+.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/ 
+
+.. _LGPL: http://www.gnu.org/copyleft/lesser.html
+
+.. _DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#glm
+
+</help>
+</tool>
+
+
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgGLM_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgGLM_code.py Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,23 @@
+# before running the qc, need to rename various output files
+import os,string,time
+from galaxy import datatypes 
+
+
+def get_phecols(phef='',selectOne=0):
+   """return column names """
+   phepath = phef.extra_files_path
+   phename = phef.metadata.base_name
+   phe = os.path.join(phepath,'%s.pphe' % phename)
+   head = open(phe,'r').next()
+   c = head.strip().split()[2:] # first are fid,iid
+   res = [(cname,cname,False) for cname in c]
+   if len(res) >= 1:
+       if selectOne:
+          x,y,z = res[0] # 0,1 = fid,iid
+          res[0] = (x,y,True) # select the first phenotype column by default
+       else:
+          res.insert(0,('None','None',True))
+   else:
+      res = [('None','no phenotype columns found',False),]
+   return res
+
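+# Example of the (name, value, selected) tuples Galaxy's dynamic_options expects.
+# A sketch, assuming a hypothetical pphe file whose header line is "FID IID age bmi":
+#   get_phecols(phef, selectOne=1) -> [('age', 'age', True), ('bmi', 'bmi', False)]
+#   get_phecols(phef, selectOne=0) -> [('None', 'None', True), ('age', 'age', False), ('bmi', 'bmi', False)]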
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgGRR.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgGRR.py Fri Mar 09 19:37:19 2012 -0500
[
b'@@ -0,0 +1,1089 @@\n+"""\n+# july 2009: Need to see outliers so need to draw them last?\n+# could use clustering on the zscores to guess real relationships for unrelateds\n+# but definitely need to draw last\n+# added MAX_SHOW_ROWS to limit the length of the main report page\n+# Changes for Galaxy integration\n+# added more robust knuth method for one pass mean and sd\n+# no difference really - let\'s use scipy.mean() and scipy.std() instead...\n+# fixed labels and changed to .xls for outlier reports so can open in excel\n+# interesting - with a few hundred subjects, 5k gives good resolution\n+# and 100k gives better but not by much\n+# TODO remove non autosomal markers\n+# TODO it would be best if label had the zmean and zsd as these are what matter for\n+# outliers rather than the group mean/sd\n+# mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots\n+# to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log\n+# so the result should be an HTML file\n+\n+# rgIBS.py\n+# use a random subset of markers for a quick ibs\n+# to identify sample dups and closely related subjects\n+# try snpMatrix and plink and see which one works best for us?\n+# abecasis grr plots mean*sd for every subject to show clusters\n+# mods june 23 rml to avoid non-autosomal markers\n+# we seem to be distinguishing parent-child by gender - 2 clouds!\n+\n+\n+snpMatrix from David Clayton has:\n+ibs.stats function to calculate the identity-by-state stats of a group of samples\n+Description\n+Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics\n+about the relatedness of every pair of samples within.\n+\n+Usage\n+ibs.stats(x)\n+8 ibs.stats\n+Arguments\n+x a snp.matrix-class or a X.snp.matrix-class object containing N samples\n+Details\n+No-calls are excluded from consideration here.\n+Value\n+A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated\n+by a comma, and the columns are:\n+Count count of identical calls, exclusing no-calls\n+Fraction fraction of identical calls comparied to actual calls being made in both samples\n+Warning\n+In some applications, it may be preferable to subset a (random) selection of SNPs first - the\n+calculation\n+time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the\n+calculation time is about 1 minute. 
A full GWA scan could take hours, and quite unnecessary for\n+simple applications such as checking for duplicate or related samples.\n+Note\n+This is mostly written to find mislabelled and/or duplicate samples.\n+Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most\n+purpose it is undesirable to use these SNPs for IBS purposes.\n+TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to\n+subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility\n+"""\n+import sys,os,time,random,string,copy,optparse\n+\n+try:\n+  set\n+except NameError:\n+  from Sets import Set as set\n+\n+from rgutils import timenow,plinke\n+\n+import plinkbinJZ\n+\n+\n+opts = None\n+verbose = False\n+\n+showPolygons = False\n+\n+class NullDevice:\n+  def write(self, s):\n+    pass\n+\n+tempstderr = sys.stderr # save\n+#sys.stderr = NullDevice()\n+# need to avoid blather about deprecation and other strange stuff from scipy\n+# the current galaxy job runner assumes that\n+# the job is in error if anything appears on sys.stderr\n+# grrrrr. James wants to keep it that way instead of using the\n+# status flag for some strange reason. Presumably he doesn\'t use R or (in this case, scipy)\n+import numpy\n+import scipy\n+from scipy import weave\n+\n+\n+sys.stderr=tempstderr\n+\n+\n+PROGNAME = os.path.split(sys.argv[0])[-1]\n+X_AXIS_LABEL = \'Mean Alleles Shared\'\n+Y_AXIS_LABEL = \'SD Alleles Shared\'\n+LEGEND_ALIGN = \'topleft\'\n+LEGEND_TITLE = \'Relationship\'\n+DEFAULT_SYMBOL_SIZE = 1.0 # default symbol size\n+DEFAULT_SYMBOL_SIZE = 0.5 # default symbol size'..b'"""parse parameters from galaxy\n+    expect \'input pbed path\' \'basename\' \'outpath\' \'title\' \'logpath\' \'n\'\n+    <command interpreter="python">\n+         rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"\n+        \'$out_file1\' \'$out_file1.files_path\' "$title1"  \'$n\' \'$Z\' \n+    </command>\n+\n+    """\n+    u="""<command interpreter="python">\n+         rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"\n+        \'$out_file1\' \'$out_file1.files_path\' "$title1"  \'$n\' \'$Z\'\n+         </command>\n+      """\n+\n+\n+    if len(sys.argv) < 7:\n+        print >> sys.stdout, \'Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please\'\n+        print >> sys.stdout, u\n+        sys.exit(1)\n+    ts = \'%s%s\' % (string.punctuation,string.whitespace)\n+    ptran =  string.maketrans(ts,\'_\'*len(ts))\n+    inpath = sys.argv[1]\n+    ldinpath = os.path.split(inpath)[0]\n+    basename = sys.argv[2]\n+    outhtml = sys.argv[3]\n+    newfilepath = sys.argv[4]\n+    title = sys.argv[5].translate(ptran)\n+    logfname = \'Log_%s.txt\' % title\n+    logpath = os.path.join(newfilepath,logfname) # log was a child - make part of html extra_files_path zoo\n+    n = int(sys.argv[6])\n+    try:\n+        Zcutoff = float(sys.argv[7])\n+    except:\n+        Zcutoff = 2.0\n+    try:\n+        os.makedirs(newfilepath)\n+    except:\n+        pass\n+    logf = file(logpath,\'w\')\n+    efp,ibase_name = os.path.split(inpath) # need to use these for outputs in files_path\n+    ped = plinkbinJZ.BPed(inpath)\n+    ped.parse(quick=True)\t\n+    if ped == None:\n+        print >> sys.stderr, \'## doIBSpy problem - cannot open %s or %s - cannot run\' % (ldreduced,basename)\n+        sys.exit(1)\n+    newfiles,explanations,repOut = 
doIBSpy(ped=ped,basename=basename,outdir=newfilepath,\n+                                    logf=logf,nrsSamples=n,title=title,pdftoo=0,Zcutoff=Zcutoff)\n+    logf.close()\n+    logfs = file(logpath,\'r\').readlines()\n+    lf = file(outhtml,\'w\')\n+    lf.write(galhtmlprefix % PROGNAME)\n+    # this is a mess. todo clean up - should each datatype have it\'s own directory? Yes\n+    # probably. Then titles are universal - but userId libraries are separate.\n+    s = \'<div>Output from %s run at %s<br>\\n\' % (PROGNAME,timenow())\n+    lf.write(\'<h4>%s</h4>\\n\' % s)\n+    fixed = ["\'%s\'" % x for x in sys.argv] # add quotes just in case\n+    s = \'If you need to rerun this analysis, the command line was\\n<pre>%s</pre>\\n</div>\' % (\' \'.join(fixed))\n+    lf.write(s)\n+    # various ways of displaying svg - experiments related to missing svg mimetype on test (!)\n+    #s = """<object data="%s" type="image/svg+xml"  width="%d" height="%d">\n+    #       <embed src="%s" type="image/svg+xml" width="%d" height="%d" />\n+    #       </object>""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT,newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)\n+    s = """ <embed src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)\n+    #s = """ <iframe src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)\n+    lf.write(s)\n+    lf.write(\'<div><h4>Click the links below to save output files and plots</h4><br><ol>\\n\')\n+    for i in range(len(newfiles)):\n+       if i == 0:\n+            lf.write(\'<li><a href="%s" type="image/svg+xml" >%s</a></li>\\n\' % (newfiles[i],explanations[i]))\n+       else:\n+             lf.write(\'<li><a href="%s">%s</a></li>\\n\' % (newfiles[i],explanations[i]))\n+    flist = os.listdir(newfilepath)\n+    for fname in flist:\n+        if not fname in newfiles:\n+             lf.write(\'<li><a href="%s">%s</a></li>\\n\' % (fname,fname))\n+    lf.write(\'</ol></div>\')\n+    lf.write(\'<div>%s</div>\' % (\'\\n\'.join(repOut))) # repOut is a list of tables\n+    lf.write(\'<div><hr><h3>Log from this job (also stored in %s)</h3><pre>%s</pre><hr></div>\' % (logfname,\'\'.join(logfs)))\n+    lf.write(\'</body></html>\\n\')\n+    lf.close()\n+    logf.close()\n+\n+if __name__ == \'__main__\':\n+    doIBS()\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgGRR.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgGRR.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,95 @@
+<tool id="rgGRR1" name="GRR:">
+    <description>Pairwise Allele Sharing</description>
+    <command interpreter="python">
+         rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
+        '$out_file1' '$out_file1.files_path' "$title"  '$n' '$Z'
+    </command>
+    <inputs>
+      <param name="i"  type="data" label="Genotype data file from your current history"
+      format="ldindep" />
+       <param name='title' type='text' size="80" value='rgGRR' label="Title for this job"/>
+       <param name="n" type="integer" label="N snps to use (0=all)" value="5000" />
+       <param name="Z" type="float" label="Z score cutoff for outliers (eg 2)" value="6"
+       help="2 works but for very large numbers of pairs, you might want to see less than 5%" />
+    </inputs>
+    <outputs>
+       <data format="html" name="out_file1" label="${title}_rgGRR.html"/>
+    </outputs>
+
+<tests>
+ <test>
+    <param name='i' value='tinywga' ftype='ldindep' >
+    <metadata name='base_name' value='tinywga' />
+    <composite_data value='tinywga.bim' />
+    <composite_data value='tinywga.bed' />       
+    <composite_data value='tinywga.fam' />
+    <edit_attributes type='name' value='tinywga' /> 
+    </param>
+  <param name='title' value='rgGRRtest1' />
+  <param name='n' value='100' />
+  <param name='Z' value='6' />
+  <param name='force' value='true' />
+  <output name='out_file1' file='rgtestouts/rgGRR/rgGRRtest1.html' ftype='html' compare="diff" lines_diff='350'>
+    <extra_files type="file" name='Log_rgGRRtest1.txt' value="rgtestouts/rgGRR/Log_rgGRRtest1.txt" compare="diff" lines_diff="170"/>
+    <extra_files type="file" name='rgGRRtest1.svg' value="rgtestouts/rgGRR/rgGRRtest1.svg" compare="diff" lines_diff="1000" />
+    <extra_files type="file" name='rgGRRtest1_table.xls' value="rgtestouts/rgGRR/rgGRRtest1_table.xls" compare="diff" lines_diff="100" />
+  </output>
+ </test>
+</tests>
+
+
+<help>
+
+.. class:: infomark
+
+**Explanation**
+
+This tool will calculate allele sharing among all subjects, one pair at a time. It outputs measures of average alleles
+shared and measures of variability for each pair of subjects and creates an interactive image where each pair is
+plotted in this mean/variance space. It is based on the GRR windows application available at
+http://www.sph.umich.edu/csg/abecasis/GRR/
+
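+For each pair, the two statistics plotted are simply the mean and the standard deviation, across the
+sampled markers, of the number of alleles shared (0, 1 or 2). A minimal numpy sketch (g1 and g2 are
+hypothetical vectors of minor allele counts for two subjects, with no-calls already excluded)::
+
+  import numpy
+  shared = 2 - numpy.abs(g1 - g2)  # 2 = identical genotypes, 1 = one allele shared, 0 = none shared
+  pair_mean, pair_sd = shared.mean(), shared.std()
+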
+The plot is interactive - for example, you can unselect one of the relationships in the legend to remove
+all those points from the plot. Details of outlier pairs pop up when the pointer is moved over them.
+This relies on a working browser SVG plugin - try getting one installed for your browser if the interactivity is
+broken.
+
+-----
+
+**Syntax**
+
+- **Genotype file** is the input pedigree data chosen from available library Plink binary files
+- **Title** will be used to name the outputs so make it mnemonic and useful
+- **N** left at 0 uses all snps - otherwise a random sample of N is used - much quicker, with little loss of precision above about 5000 SNPs
+
+**Summary**
+
+Warning - this tool works pairwise, so run time grows quadratically with sample size (N subjects means
+N(N-1)/2 pairs). An LD-reduced dataset is strongly recommended as it will give good resolution with
+relatively few SNPs. Do not use all million snps from a whole
+genome chip - it's overkill - 5k is good, 10k is almost indistinguishable from 100k.
+
+SNPs are sampled randomly from the autosomes - otherwise parent/child pairs will be separated by gender.
+This tool will estimate mean pairwise allele sharing among all subjects. Based on the work of Abecasis, it has
+been rewritten so it can run with much larger data sets, produces cross platform svg and runs
+on a Galaxy server, instead of being MS windows only. Written in Python, it uses numpy, and the innermost loop
+is inline C so it can calculate about 50M SNP pairs/sec on a typical opteron server.
+
+Setting N to some fraction of the available markers will speed up calculation - the difference matters most
+for large numbers of subjects. The real cost is that every subject must be compared to every other one over
+all genotypes - a problem quadratic in the number of subjects.
+
+If you don't see the genotype data set you want here, it can be imported using one of the methods available from
+the Rgenetics Get Data tool.
+
+-----
+
+**Attribution**
+
+Based on an idea from G. Abecasis implemented as GRR (windows only) at http://www.sph.umich.edu/csg/abecasis/GRR/
+
+Ross Lazarus wrote the original pdf writer Galaxy tool version.
+John Ziniti added the C and created the slick svg representation.
+Copyright Ross Lazarus 2007
+Licensed under the terms of the LGPL as documented at http://www.gnu.org/licenses/lgpl.html
+</help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgGTOOL.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgGTOOL.py Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,58 @@
+#!/usr/local/bin/python
+# hack to run and process a linkage format file into
+# the format used by Marchini's SNPTEST imputed case control association
+# expects args as  
+#         rgGTOOL.py $i $o $discrete $logf $outdir
+# ross lazarus 
+
+import sys,math,shutil,subprocess,os,time
+from os.path import abspath
+imagedir = '/static/rg' # if needed for images
+myversion = 'V000.1 August 2007'
+
+def timenow():
+    """return current time as a string
+    """
+    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
+
+                
+if __name__ == "__main__":
+    if len(sys.argv) < 6:
+       s = 'rgGTOOL.py needs 5 params - got %s \n' % (sys.argv)
+       sys.stderr.write(s) # print >>,s would probably also work?
+       sys.exit(0)
+    print 'Rgenetics %s http://rgenetics.org SNPTEST Tools, rgGTOOL.py starting at %s' % (myversion,timenow())
+    pname = sys.argv[1]
+    lpedname = pname.split('.ped')[0] # get file name part
+    outname = sys.argv[2]
+    discrete = sys.argv[3]
+    logf = sys.argv[4]
+    outdir = sys.argv[5]
+    cdir = os.getcwd()
+    me = sys.argv[0]
+    mypath = abspath(os.path.join(cdir,me)) # get abs path to this python script
+    shpath = abspath(os.path.sep.join(mypath.split(os.path.sep)[:-1]))
+    alogf = abspath(os.path.join(cdir,logf)) # absolute paths
+    apedf = abspath(os.path.join(cdir,'%s.ped' % lpedname)) # absolute paths
+    amapf = abspath(os.path.join(cdir,'%s.map' % lpedname)) # absolute paths
+    outg = abspath(os.path.join(outdir,'%s.gen' % outname)) # absolute paths
+    outs = abspath(os.path.join(outdir,'%s.sample' % outname)) # absolute paths
+    workdir = abspath(os.path.sep.join(mypath.split(os.path.sep)[:-1])) # trim end off './database/files/foo.dat' 
+    os.chdir(workdir)
+    tlogname = '%s.logtemp' % outname
+    sto = file(tlogname,'w')
+    sto.write('rgGTOOL.py: called with %s\n' % (sys.argv)) 
+    exme = 'gtool'
+    vcl = [exme,'-P','--ped',apedf,'--map',amapf,'--discrete_phenotype',discrete,'--og',outg,'--os',outs]
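+    # -P reads the ped/map pair given by --ped/--map; --og and --os name the output
+    # .gen and .sample files expected by SNPTEST (see the GTOOL documentation)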
+    p=subprocess.Popen(' '.join(vcl),shell=True,stdout=sto)
+    retval = p.wait()
+    sto.write('rgGTOOL.py after calling %s: vcl=%s\n' % (exme,vcl)) 
+    sto.close()
+    shutil.move(tlogname,alogf)
+    os.chdir(cdir)
+
+
+
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgGTOOL.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgGTOOL.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,53 @@
+<tool id="rgGTOOL1" name="Converter">
+
+  
+    <description>from linkage format to SNPTEST Marchini files</description>
+  
+    <command interpreter="python">
+        rgGTOOL.py $i $o $discrete $logf $outdir
+    </command>
+    
+    <inputs>    
+       <param name="i"  type="select" label="Genotype file" dynamic_options="get_lib_pedfiles()" /> 
+       <param name="discrete" type="select" label="Make Case/Control based on affection 2/1">
+                        <option selected="yes" value="1">Discrete</option>
+                        <option value="0">Continuous</option>
+       </param>
+       <param name="o" type="text" label="Output Marchini format name" value="Marchini"/>
+       <param name="outdir" type="hidden" value="/usr/local/galaxy/data/rg/snptest" />
+   </inputs>
+
+   <outputs>  
+       <data format="txt" name="logf"  />
+   </outputs>
+<help>
+
+
+**Syntax**
+
+- **Genotype file** is the input linkage format pedigree and corresponding map file
+- **Discrete** is the type of phenotype in the affection column 
+- **Output name** is the file name (.gen and .sample will be added) for the new SNPTEST compatible file
+
+**Note on Discrete**
+
+See the GTOOL_ documentation link below for more details. Briefly, if
+your linkage format pedigree file has 1/2 in column 6 for control/case respectively, setting this to Discrete will create two
+complete sets of output files distinguished by 1 and 2 respectively. Otherwise, affection status is assumed to contain a
+continuous phenotype and a single output set is produced.
+
+
+**Summary**
+
+Code used here is from Jonathan Marchini's group - see documentation at GTOOL_.
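+
+The wrapper invokes gtool along these lines (paths illustrative - compare the vcl list built in
+rgGTOOL.py above)::
+
+  gtool -P --ped mydata.ped --map mydata.map --discrete_phenotype 1 --og Marchini.gen --os Marchini.sample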
+
+.. _GTOOL: http://www.stats.ox.ac.uk/~marchini/software/gwas/gtool.html
+
+-----
+
+**Attribution**
+
+Originally designed and written for the Rgenetics
+series of Galaxy tools by ross lazarus (ross.lazarus@gmail.com), who didn't write GTOOL_
+but wishes he had.
+
+</help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgHaploView.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgHaploView.py Fri Mar 09 19:37:19 2012 -0500
[
b'@@ -0,0 +1,513 @@\n+"""\n+released under the terms of the LGPL\n+copyright ross lazarus August 2007\n+for the rgenetics project\n+\n+Special galaxy tool for the camp2007 data\n+Allows grabbing genotypes from an arbitrary region and estimating\n+ld using haploview\n+\n+stoopid haploview won\'t allow control of dest directory for plots - always end\n+up where the data came from - need to futz to get it where it belongs\n+\n+Needs a mongo results file in the location hardwired below or could be passed in as\n+a library parameter - but this file must have a very specific structure\n+rs chrom offset float1...floatn\n+\n+\n+"""\n+\n+\n+import sys, array, os, string, tempfile, shutil, subprocess, glob\n+from rgutils import galhtmlprefix\n+\n+progname = os.path.split(sys.argv[0])[1]\n+\n+javabin = \'java\'\n+#hvbin = \'/usr/local/bin/Haploview.jar\'\n+#hvbin = \'/home/universe/linux-i686/haploview/Haploview.jar\'\n+# get this from tool as a parameter - can use \n+\n+\n+\n+atrandic = {\'A\':\'1\',\'C\':\'2\',\'G\':\'3\',\'T\':\'4\',\'N\':\'0\',\'-\':\'0\',\'1\':\'1\',\'2\':\'2\',\'3\':\'3\',\'4\':\'4\',\'0\':\'0\'}\n+\n+class NullDevice:\n+    """ a dev/null for ignoring output\n+    """\n+    def write(self, s):\n+        pass\n+\n+class ldPlot:\n+    \n+    def __init__(self, argv=[]):\n+        """\n+        setup\n+        """\n+        self.args=argv\n+        self.parseArgs(argv=self.args)\n+        self.setupRegions()\n+                \n+    def parseArgs(self,argv=[]):\n+        """\n+        """\n+        ts = \'%s%s\' % (string.punctuation,string.whitespace)\n+        ptran =  string.maketrans(ts,\'_\'*len(ts))\n+        ### Figure out what genomic region we are interested in\n+        self.region = argv[1]\n+        self.orslist = argv[2].replace(\'X\',\' \').lower() # galaxy replaces newlines with XX - go figure\n+        self.title = argv[3].translate(ptran)\n+        # for outputs\n+        self.outfile = argv[4]\n+        self.logfn = \'Log_%s.txt\' % (self.title)\n+        self.histextra = argv[5]\n+        self.base_name = argv[6]\n+        self.pedFileBase = os.path.join(self.histextra,self.base_name)\n+        print \'pedfilebase=%s\' % self.pedFileBase\n+        self.minMaf=argv[7]\n+        if self.minMaf:\n+            try:\n+                self.minMaf = float(self.minMaf)\n+            except:\n+                self.minMaf = 0.0\n+        self.maxDist=argv[8] or None\n+        self.ldType=argv[9] or \'RSQ\'\n+        self.hiRes = (argv[10].lower() == \'hi\')\n+        self.memSize= argv[11] or \'1000\'\n+        self.memSize = int(self.memSize)\n+        self.outfpath = argv[12]\n+        self.infotrack = False # note that otherwise this breaks haploview in headless mode \n+        #infotrack = argv[13] == \'info\'\n+        # this fails in headless mode as at april 2010 with haploview 4.2\n+        self.tagr2 = argv[14] or \'0.8\'\n+        hmpanels = argv[15] # eg "[\'CEU\',\'YRI\']"\n+        if hmpanels:\n+           hmpanels = hmpanels.replace(\'[\',\'\')\n+           hmpanels = hmpanels.replace(\']\',\'\')\n+           hmpanels = hmpanels.replace("\'",\'\')\n+           hmpanels = hmpanels.split(\',\')\n+        self.hmpanels = hmpanels\n+        self.hvbin = argv[16] # added rml june 2008\n+        self.bindir = os.path.split(self.hvbin)[0]\n+        # jan 2010 - always assume utes are on path to avoid platform problems\n+        self.pdfjoin = \'pdfjoin\' # os.path.join(bindir,\'pdfjoin\')\n+        self.pdfnup = \'pdfnup\' # os.path.join(bindir,\'pdfnup\')\n+     
   self.mogrify = \'mogrify\' # os.path.join(bindir,\'mogrify\')\n+        self.convert = \'convert\' # os.path.join(bindir,\'convert\')\n+        self.log_file = os.path.join(self.outfpath,self.logfn)\n+        self.MAP_FILE = \'%s.map\' % self.pedFileBase\n+        self.DATA_FILE = \'%s.ped\' % self.pedFileBase\n+        try:\n+            os.makedirs(self.outfpath)\n+            s = \'## made new path %s\\n\' % self.outfpath\n+        except:\n+            pass\n+        self.lf = file(self.log_file,\'w\')\n+        s = \'PATH=%s\\n\' % os.environ.get(\'PATH\',\'?\')\n+        self.lf.write(s)\n+\n+    def getRs(self):\n+        if self.region > \'\':\n+            useR'..b's.path.exists(os.path.join(self.outfpath,mainpdf)):\n+            if not os.path.exists(os.path.join(self.outfpath,mainthumb)):\n+                outf.write(\'<table><tr><td colspan="3"><a href="%s">Main combined LD plot</a></td></tr></table>\\n\' % (mainpdf))\n+            else:\n+                outf.write(\'<table><tr><td><a href="%s"><img src="%s" title="Main combined LD image" hspace="10" align="middle">\' % (mainpdf,mainthumb))\n+                outf.write(\'</td><td>Click the thumbnail at left to download the main combined LD image <a href=%s>%s</a></td></tr></table>\\n\' % (mainpdf,mainpdf))\n+        else:\n+            outf.write(\'(No main image was generated - this usually means a Haploview error connecting to Hapmap site - please try later)<br/>\\n\')\n+        outf.write(\'<br><div><hr><ul>\\n\')\n+        for i, data in enumerate( flist ):\n+            dn = os.path.split(data)[-1]\n+            if dn[:3] <> \'all\':\n+                continue\n+            newdn = dn.translate(ftran)\n+            if dn <> newdn:\n+                os.rename(os.path.join(self.outfpath,dn),os.path.join(self.outfpath,newdn))\n+                dn = newdn\n+            dnlabel = dn\n+            ext = dn.split(\'.\')[-1]\n+            if dn == \'allnup.pdf\':\n+                dnlabel = \'All pdf plots on a single page\'\n+            elif dn == \'alljoin.pdf\':\n+                dnlabel = \'All pdf plots, each on a separate page\'\n+            outf.write(\'<li><a href="%s">%s - %s</a></li>\\n\' % (dn,dn,dnlabel))\n+        for i, data in enumerate( flist ):\n+            dn = os.path.split(data)[-1]\n+            if dn[:3] == \'all\':\n+                continue\n+            newdn = dn.translate(ftran)\n+            if dn <> newdn:\n+                os.rename(os.path.join(self.outfpath,dn),os.path.join(self.outfpath,newdn))\n+                dn = newdn\n+            dnlabel = dn\n+            ext = dn.split(\'.\')[-1]\n+            if dn == \'allnup.pdf\':\n+                dnlabel = \'All pdf plots on a single page\'\n+            elif dn == \'alljoin.pdf\':\n+                dnlabel = \'All pdf plots, each on a separate page\'\n+            elif ext == \'info\':\n+                dnlabel = \'%s map data for Haploview input\' % self.title\n+            elif ext == \'ped\':\n+                dnlabel = \'%s genotype data for Haploview input\' % self.title\n+            elif dn.find(\'CEU\') <> -1 or dn.find(\'YRI\') <> -1 or dn.find(\'CHB_JPT\') <> -1: # is hapmap\n+                dnlabel = \'Hapmap data\'\n+            if ext == \'TAGS\' or ext == \'TESTS\' or ext == \'CHAPS\':\n+                dnlabel = dnlabel + \' Tagger output\'\n+            outf.write(\'<li><a href="%s">%s - %s</a></li>\\n\' % (dn,dn,dnlabel))\n+        outf.write(\'</ol><br>\')\n+        outf.write("</div><div><hr>Job Log follows below (see 
%s)<pre>" % self.logfn)\n+        s = file(self.log_file,\'r\').readlines()\n+        s = \'\\n\'.join(s)\n+        outf.write(\'%s</pre><hr></div>\' % s)\n+        outf.write(\'</body></html>\')\n+        outf.close()\n+        if self.useTemp:\n+            try:\n+                os.unlink(self.tempMapName)\n+                os.unlink(self.tempPedName)\n+            except:\n+                pass\n+        \n+if __name__ == "__main__":\n+    """  ### Sanity check the arguments\n+\n+    <command interpreter="python">\n+    rgHaploView.py "$ucsc_region" "$rslist" "$title" "$out_file1"\n+    "$lhistIn.extra_files_path" "$lhistIn.metadata.base_name"\n+    "$minmaf" "$maxdist" "$ldtype" "$hires" "$memsize" "$out_file1.files_path"\n+    "$infoTrack" "$tagr2" "$hmpanel" ${GALAXY_DATA_INDEX_DIR}/rg/bin/haploview.jar\n+    </command>\n+\n+    remember to figure out chromosome and complain if > 1?\n+    and use the -chromosome <1-22,X,Y> parameter to haploview\n+    skipcheck?\n+    """\n+    progname = os.path.split(sys.argv[0])[-1]\n+    if len(sys.argv) < 16:\n+        s = \'##!%s: Expected 16 params in sys.argv, got %d (%s)\' % (progname,len(sys.argv), sys.argv)\n+        print s\n+        sys.exit(1)\n+    ld = ldPlot(argv = sys.argv)\n+    ld.doPlots()\n+    ld.writeHtml()\n+\n+\n+\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgHaploView.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgHaploView.xml Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,152 @@
+<tool id="rgHaploView1" name="LD plots:" version="0.3">
+
+    <description>and comparisons with HapMap data</description>
+
+    <command interpreter="python">
+    rgHaploView.py "$ucsc_region" "$rslist" "$title" "$out_file1"
+    "$lhistIn.extra_files_path" "$lhistIn.metadata.base_name"
+    "$minmaf" "$maxdist" "$ldtype" "$hires" "$memsize" "$out_file1.files_path"
+    "$infoTrack" "$tagr2" "$hmpanel" ${GALAXY_DATA_INDEX_DIR}/shared/jars/haploview.jar
+    </command>
+
+    <inputs>
+
+       <param name="lhistIn" type="data" format="lped"
+        label="Current history lPed format data"
+        size="80" help="Linkage Ped format data from your current history" />
+
+       <param name="ucsc_region" type="text" label="Optional subset region (blank=ALL. WARNING: doing this will fail if >1 chromosome in input data!)"
+        size="80" optional="true"
+        help="Region eg: chr9:119,506,000-119,518,000 Leave blank for all or to extract the rs list supplied below."/>
+
+       <param name="rslist" type="text" area='true' size='5x20' label="rs list" optional="true"
+       help="List of rs numbers to select - cut and paste or type, use space delimiters. Leave blank to extract region supplied above."  />
+
+       <param name="title" type="text" size="80" label="Title for output files" optional="true"
+        help="Descriptive title for new genotype/map files" value="LD Plots" />
+
+    <param name="ldtype" type="select" label="Type of LD measure to estimate and plot"
+        size="80" help="" >
+        <option value="RSQ" selected="True">rsquared (default)</option>
+        <option value="DEFAULT">D prime</option>
+        <option value="DPALT">D prime alternative</option>
+        <option value="GAB">Gabriel</option>
+        <option value="GAM">4 Gamete test</option>
+    </param>
+
+   <param name="minmaf" type="float" label = "Minimum minor allele frequency to use" value="0.05"
+    help="If &gt; 0.0, markers below this MAF will be ignored for calculations"/>
+
+    <param name="maxdist" type="integer" label = "Maximum distance (kbp) between markers for LD estimate"
+    value="200" help="If &lt; &gt; 0, only marker pairs at or below this distance will have LD calculated"/>
+
+    <param name="hmpanel" type="select" multiple="true" label="Hapmap panels to compare"
+        size="40" help="HapMap data LD plots will also be produced for each selected population panel" >
+        <option value='CEU' selected="True">CEPH (European) (default)</option>
+        <option value='YRI'>Yoruba (African)</option>
+        <option value='CHB+JPT'>Chinese + Japanese</option>
+        <option value="">(None - no comparison)</option>
+    </param>
+    <param name="tagr2" type="float" label = "rsquared threshold for tagging outputs" value="0.8"
+    help="Tagging output will use this value as the minimum rsquared threshold"/>
+
+    <param name="infoTrack" type="select" label="Add Hapmap information track to image"
+    help="Refseq genes and snp density can be added to the plot if desired for orientation" >
+    <option value="info">Add Information track (DISABLED! Awaiting bug fix from Haploview authors since reported in October 2009)</option>
+    <option value="noinfo" selected="True">No Information track</option>
+    </param>
+
+    <param name="hires" type="select" label="High resolution plots"
+    help="A high resolution plot file may be possible but only for small regions - not reliable &gt;100's of snps">
+    <option value="hi">High resolution - only a few (hundreds of) markers</option>
+    <option value="lo" selected="True">Low resolution - large number of markers</option>
+    </param>
+
+    <param name="memsize" type="select" label="System RAM to allocate"
+        size="80" help="Very large files will need extra memory (java is a bit of a pig)" >
+        <option value="1024">1GB</option>
+        <option value="2048" selected="True">2GB (default)</option>
+        <option value="4096">4GB</option>
+        <option value="6144">6GB</option>
+        <option value="8192">8GB</option>
+    </param>
+
+   </inputs>
+
+   <outputs>
+       <data format="html" name="out_file1" label="${title}.html" />
+   </outputs>
+
+<!-- python $TOOLPATH/$TOOL.py "" "rs2283802Xrs2267000Xrs16997606Xrs4820537Xrs3788347Xrs756632Xrs4820539Xrs2283804Xrs2267006Xrs4822363X" \
+"$NPRE" $OUTPATH/${NPRE}.html "test" "" "$INPATH" "tinywga" 0.0 200000 "RSQ" "lo" "2048" "$OUTPATH" "hg18" "noinfo" "0.8" \
+"['CEU','YRI','CHB+JPT']" $BINPATH/haploview.jar -->
+<tests>
+ <test>
+  <param name='lhistIn' value='tinywga' ftype='lped' >
+   <metadata name='base_name' value='tinywga' />
+   <composite_data value='tinywga.ped' />
+   <composite_data value='tinywga.map' />
+   <edit_attributes type='name' value='tinywga' /> 
+  </param>
+ <param name='ucsc_region' value='' />
+ <param name='title' value='rgHaploViewtest1' />
+ <param name='rslist' value="rs2283802 rs2267000 rs16997606 rs4820537 rs3788347 rs756632 rs4820539 rs2283804 rs2267006 rs4822363" />
+ <param name='ldtype' value='RSQ' />
+ <param name='minmaf' value='0.0' />
+ <param name='maxdist' value='200000' />
+ <param name='tagr2' value='0.8' />
+ <param name='hmpanel' value="YRI" />
+ <param name='infoTrack' value='noinfo' />
+ <param name='hires' value='lo' />
+ <param name='memsize' value='2048' />
+ <output name='out_file1' file='rgtestouts/rgHaploView/rgHaploViewtest1.html' ftype='html' lines_diff="60">
+    <extra_files type="file" name='alljoin.pdf' value="rgtestouts/rgHaploView/alljoin.pdf" compare="sim_size" delta="50000"/>
+    <extra_files type="file" name='allnup.pdf' value="rgtestouts/rgHaploView/allnup.pdf" compare="sim_size" delta="50000" />
+    <extra_files type="file" name='Log_rgHaploViewtest1.txt' value="rgtestouts/rgHaploView/Log_rgHaploViewtest1.txt" compare="diff" lines_diff="50"/>
+    <extra_files type="file" name='rgHaploViewtest1.ped.TESTS' value="rgtestouts/rgHaploView/rgHaploViewtest1.ped.TESTS" compare="diff" 
+            lines_diff="20"/>
+    <extra_files type="file" name='rgHaploViewtest1.ped.TAGS' value="rgtestouts/rgHaploView/rgHaploViewtest1.ped.TAGS" compare="diff"
+            lines_diff="20" />
+ </output>
+ </test>
+</tests>
+
+<help>
+
+.. class:: infomark
+
+**Note**
+
+The input file must be in linkage ped format. A suitable file can be chosen from the system library,
+or from the files already imported into your current history. Use either one of the selection boxes to
+make your choice.
+
+-----
+
+**Syntax**
+
+- **Library Linkage Ped** is a linkage format pedigree file chosen from the system file Library
+- **History Linkage Ped** is a linkage format pedigree file chosen from your current Galaxy History
+- **Region** is the genomic region cut and paste from a UCSC browser location window
+- **Genome Build** is the version of the genome your markers are from - use hg18 for CAMP illumina data
+
+-----
+
+**Summary**
+
+This tool is a special purpose tool to estimate and plot linkage disequilibrium
+from genotype data in linkage pedigree format (separate map file). All markers in the input file
+are used as the default. To limit the calculations to a subset of the input data, supply
+a specified genomic region in UCSC browser location format or a list of specific marker IDs.
+
+Note that you can choose either a file of the correct type (linkage pedigree - lped) from
+your current history **or** from the system library.
+
+This tool currently calls Haploview for estimation and plots. For full attribution, source code and documentation, see
+http://www.broad.mit.edu/mpg/haploview/index.php
+
+Copyright, Ross Lazarus, April 2008 for the Rgenetics project
+Released under the LGPL. See http://www.gnu.org/licenses/lgpl.html for license terms.
+
+</help>
+</tool>
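
The ucsc_region parameter above expects a UCSC browser location such as chr9:119,506,000-119,518,000. A minimal sketch of parsing such a string - the same split/replace approach the rgPedSub.py script later in this changeset uses:

    def parse_ucsc_region(region):
        """Return (chrom, start, end) from a UCSC location like chr9:119,506,000-119,518,000."""
        chrom, rest = region.split(':')
        chrom = chrom.replace('chr', '')
        rest = rest.replace(',', '')      # drop the thousands separators
        start, end = rest.split('-')
        return chrom, int(start), int(end)

    print(parse_ucsc_region('chr9:119,506,000-119,518,000'))
    # -> ('9', 119506000, 119518000)
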
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgLDIndep.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgLDIndep.py Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,182 @@\n+"""\n+# oct 2009 - must make a map file in case later usage requires it...\n+# galaxy tool xml files can define a galaxy supplied output filename\n+# that must be passed to the tool and used to return output\n+# here, the plink log file is copied to that file and removed\n+# took a while to figure this out!\n+# use exec_before_job to give files sensible names\n+#\n+# ross april 14 2007\n+# plink cleanup script\n+# ross lazarus March 2007 for camp illumina whole genome data\n+# note problems with multiple commands being ignored - eg --freq --missing --mendel \n+# only the first seems to get done...\n+#\n+##Summary statistics versus inclusion criteria\n+##\n+##Feature                         As summary statistic    As inclusion criteria\n+##Missingness per individual      --missing               --mind N\n+##Missingness per marker          --missing               --geno N        \n+##Allele frequency                --freq                  --maf N\n+##Hardy-Weinberg equilibrium      --hardy                 --hwe N\n+##Mendel error rates              --mendel                --me N M\n+#\n+# this is rgLDIndep.py - main task is to decrease LD by filtering high LD pairs\n+# remove that function from rgClean.py as it may not be needed.\n+  \n+"""\n+import sys,shutil,os,subprocess, glob, string, tempfile, time\n+from rgutils import plinke, timenow, galhtmlprefix\n+\n+prog = os.path.split(sys.argv[0])[-1]\n+myversion = \'January 4 2010\'\n+\n+\n+def pruneld(plinktasks=[] ,cd=\'./\',vclbase = []):\n+    """\n+    plink blathers when doing pruning - ignore\n+    Linkage disequilibrium based SNP pruning\n+    if a million snps in 3 billion base pairs, have mean 3k spacing\n+    assume 40-60k of ld in ceu, a window of 120k width is about 40 snps\n+    so lots more is perhaps less efficient - each window computational cost is\n+    O(N^2) unless the code is smart enough to avoid unnecessary computation where\n+    allele frequencies make it impossible to see ld > the r^2 cutoff threshold\n+    So, do a window and move forward 20? \n+    from the plink docs at http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#prune\n+    \n+Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands: --indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is similar, except it is based only on pairwise genotypic correlation.\n+\n+Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or --exclude option is necessary to actually perform the pruning.\n+\n+The VIF pruning routine is performed:\n+plink --file data --indep 50 5 2\n+\n+will create files\n+\n+     plink.prune.in\n+     plink.prune.out\n+\n+Each is a simple list of SNP IDs; both these files can subsequently be specified as the argument for \n+a --extract or --exclude command.\n+\n+The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs to shift the \n+window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. 
A VIF of 10 is often taken to represent near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window size is too large, too many SNPs may be removed.\n+\n+The second procedure is performed:\n+plink --file data --indep-pairwise 50 5 0.5\n+\n+This generates the same output files as the first version; the only difference is that a \n+simple pairwise threshold is'..b'bprocess.Popen(\' \'.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)\n+        retval = x.wait()\n+        sto.close()\n+        sto = open(plog,\'r\') # read\n+        try:\n+            lplog = sto.readlines()\n+            lplog = [x for x in lplog if x.find(\'Pruning SNP\') == -1]\n+            logres += lplog\n+            logres.append(\'\\n\')\n+        except:\n+            logres.append(\'### %s Strange - no std out from plink when running command line\\n%s\' % (timenow(),\' \'.join(vcl)))\n+        sto.close()\n+        os.unlink(plog) # no longer needed\n+    return logres\n+\n+\n+\n+def clean():\n+    """\n+    """\n+    if len(sys.argv) < 14:\n+        print >> sys.stdout, \'## %s expected 14 params in sys.argv, got %d - %s\' % (prog,len(sys.argv),sys.argv)\n+        print >> sys.stdout, """this script will filter a linkage format ped\n+        and map file containing genotypes. It takes 14 parameters - the plink --f parameter and"\n+        a new filename root for the output clean data followed by the mind,geno,hwe,maf, mef and mei"\n+        documented in the plink docs plus the file to be returned to Galaxy\n+        Called as:\n+        <command interpreter="python">\n+        rgLDIndep.py \'$input_file.extra_files_path\' \'$input_file.metadata.base_name\' \'$title\' \'$mind\'\n+        \'$geno\' \'$hwe\' \'$maf\' \'$mef\' \'$mei\' \'$out_file1\'\n+        \'$out_file1.extra_files_path\'  \'$window\' \'$step\' \'$r2\'\n+        </command>\n+        """\n+        sys.exit(1)\n+    plog = [\'## Rgenetics: http://rgenetics.org Galaxy Tools rgLDIndep.py started %s\\n\' % timenow()]\n+    inpath = sys.argv[1]\n+    inbase = sys.argv[2]\n+    killme = string.punctuation + string.whitespace\n+    trantab = string.maketrans(killme,\'_\'*len(killme))\n+    title = sys.argv[3].translate(trantab)\n+    mind = sys.argv[4]\n+    geno = sys.argv[5]\n+    hwe = sys.argv[6]\n+    maf = sys.argv[7]\n+    me1 = sys.argv[8]\n+    me2 = sys.argv[9]\n+    outfname = sys.argv[10]\n+    outfpath = sys.argv[11]\n+    winsize = sys.argv[12]\n+    step = sys.argv[13]\n+    r2 = sys.argv[14]\n+    output = os.path.join(outfpath,outfname)\n+    outpath = os.path.join(outfpath,title)\n+    outprunepath = os.path.join(outfpath,\'ldprune_%s\' % title)\n+    try:\n+      os.makedirs(outfpath)\n+    except:\n+      pass\n+    bfile = os.path.join(inpath,inbase)\n+    filterout = os.path.join(outpath,\'filtered_%s\' % inbase)\n+    outf = file(outfname,\'w\')\n+    outf.write(galhtmlprefix % prog)\n+    ldin = bfile\n+    plinktasks = [[\'--bfile\',ldin,\'--indep-pairwise %s %s %s\' % (winsize,step,r2),\'--out\',outpath,\n+    \'--mind\',mind,\'--geno\',geno,\'--maf\',maf,\'--hwe\',hwe,\'--me\',me1,me2,],\n+    [\'--bfile\',ldin,\'--extract %s.prune.in --make-bed --out %s\' % (outpath,outpath)],\n+    [\'--bfile\',outpath,\'--recode --out\',outpath]] # make map file - don\'t really need ped 
but...\n+    # subset of ld independent markers for eigenstrat and other requirements\n+    vclbase = [plinke,\'--noweb\']\n+    prunelog = pruneld(plinktasks=plinktasks,cd=outfpath,vclbase = vclbase)\n+    """This generates the same output files as the first version;\n+    the only difference is that a simple pairwise threshold is used.\n+    The first two parameters (50 and 5) are the same as above (window size and step);\n+    the third parameter represents the r^2 threshold.\n+    Note: this represents the pairwise SNP-SNP metric now, not the\n+    multiple correlation coefficient; also note, this is based on the\n+    genotypic correlation, i.e. it does not involve phasing. \n+    """\n+    plog += prunelog\n+    flog = \'%s.log\' % outpath\n+    flogf = open(flog,\'w\')\n+    flogf.write(\'\'.join(plog))\n+    flogf.write(\'\\n\')\n+    flogf.close()\n+    globme = os.path.join(outfpath,\'*\')\n+    flist = glob.glob(globme)\n+    flist.sort()\n+    for i, data in enumerate( flist ):\n+        outf.write(\'<li><a href="%s">%s</a></li>\\n\' % (os.path.split(data)[-1],os.path.split(data)[-1]))\n+    outf.write(\'</ol></div>\\n\')\n+    outf.write("</div></body></html>")\n+    outf.close()\n+\n+\n+if __name__ == "__main__":\n+    clean()\n+\n'
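
The docstring above spells out plink's two-step pruning: --indep-pairwise writes <out>.prune.in and <out>.prune.out, and a second run with --extract actually drops the pruned markers. A minimal sketch of those two calls, using the same flags pruneld() assembles (paths and parameter values are examples only):

    import subprocess

    def ld_prune(bfile, out, window=50, step=5, r2=0.5, plink='plink'):
        # step 1: identify an LD-independent marker subset
        subprocess.check_call([plink, '--noweb', '--bfile', bfile,
                               '--indep-pairwise', str(window), str(step), str(r2),
                               '--out', out])
        # step 2: extract those markers into a new binary fileset
        subprocess.check_call([plink, '--noweb', '--bfile', bfile,
                               '--extract', '%s.prune.in' % out,
                               '--make-bed', '--out', out])

    # ld_prune('tinywga', 'ldprune_tinywga', window=40, step=30, r2=0.1)
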
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgLDIndep.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgLDIndep.xml Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,158 @@\n+<tool id="rgLDIndep1" name="LD Independent:">\n+    <code file="rgLDIndep_code.py"/>\n+\n+    <description>filter high LD pairs - decrease redundancy</description>\n+\n+    <command interpreter="python">\n+        rgLDIndep.py \'$input_file.extra_files_path\' \'$input_file.metadata.base_name\' \'$title1\' \'$mind\'\n+        \'$geno\' \'$hwe\' \'$maf\' \'$mef\' \'$mei\' \'$out_file1\'\n+        \'$out_file1.files_path\'  \'$window\' \'$step\' \'$r2\'\n+    </command>\n+\n+    <inputs>\n+       <param name="input_file"  type="data" label="RGenetics genotype data from your current history"\n+         size="80" format="pbed" />\n+       <param name="title1" type="text" size="80" label="Descriptive title for cleaned genotype file" value="LD_Independent"/>\n+       <param name="r2" type="float" value = "0.1"\n+       label="r2 threshold: Select only pairs at or below this r^2 threshold (eg 0.1)"\n+       help="LD threshold defining LD independent markers" />\n+       <param name="window" type="integer" value = "40" label="Window: Window size to limit LD pairwise"\n+       help = "Bigger is better but time taken blows up exponentially as the window grows!" />\n+       <param name="step" type="integer" value = "30" label="Step: Move window this far and recompute"\n+       help = "Smaller is better but of course, time increases..." />\n+       <param name="geno"  type="float" label="Maximum Missing Fraction: Markers" value="1.0" />\n+       <param name="mind" type="float" value="1.0" label="Maximum Missing Fraction: Subjects"/>\n+       <param name="mef"  type="float" label="Maximum Mendel Error Rate: Family" value="1.0"/>\n+       <param name="mei"  type="float" label="Maximum Mendel Error Rate: Marker" value="1.0"/>\n+       <param name="hwe" type="float" value="0.0" label="Smallest HWE p value (set to 0 for all)" />\n+       <param name="maf" type="float" value="0.0"\n+       label="Smallest Allowable Minor Allele Frequency (set to 0.0 for all)"/>\n+\n+   </inputs>\n+\n+   <outputs>\n+       <data format="pbed" name="out_file1" metadata_source="input_file"  />\n+   </outputs>\n+<tests>\n+ <test>\n+\n+    <param name=\'input_file\' value=\'tinywga\' ftype=\'pbed\' >\n+    <metadata name=\'base_name\' value=\'tinywga\' />\n+    <composite_data value=\'tinywga.bim\' />\n+    <composite_data value=\'tinywga.bed\' />\n+    <composite_data value=\'tinywga.fam\' />\n+    <edit_attributes type=\'name\' value=\'tinywga\' /> \n+    </param>\n+    <param name=\'title1\' value=\'rgLDIndeptest1\' />\n+    <param name="mind" value="1" />\n+    <param name="geno" value="1" />\n+    <param name="hwe" value="0" />\n+    <param name="maf" value="0" />\n+    <param name="mef" value="1" />\n+    <param name="mei" value="1" />\n+    <param name="window" value="10000" />\n+    <param name="step" value="5000" />\n+    <param name="r2" value="0.1" />\n+    <output name=\'out_file1\' file=\'rgtestouts/rgLDIndep/rgLDIndeptest1.pbed\' ftype=\'pbed\' compare="diff" lines_diff=\'7\'>\n+    <extra_files type="file" name=\'rgLDIndeptest1.bim\' value="rgtestouts/rgLDIndep/rgLDIndeptest1.bim" compare="sim_size" delta="1000"/>\n+    <extra_files type="file" name=\'rgLDIndeptest1.fam\' value="rgtestouts/rgLDIndep/rgLDIndeptest1.fam" compare="diff" />\n+    <extra_files type="file" name=\'rgLDIndeptest1.bed\' value="rgtestouts/rgLDIndep/rgLDIndeptest1.bed" compare="sim_size" delta = "1000" />\n+    </output>\n+ </test>\n+</tests>\n+<help>\n+\n+.. 
class:: infomark\n+\n+**Attribution**\n+\n+This tool relies on Plink from Shaun Purcell. For full documentation, please see his web site\n+at http://pngu.mgh.harvard.edu/~purcell/plink/ where there is excellent documentation describing\n+the parameters you can set here.\n+\n+Rgenetics merely exposes them, wrapping Plink so you can use it in Galaxy.\n+\n+**Summary**\n+\n+In addition to filtering some marker and sample quality measures,\n+this tool reduces the amount of overlapping information, by removing\n+most of the duplicate information contained in linkage disequilibrium. This is\n+a lossy process and for some methods, signal may be lost. Howe'..b' individual\'s genotype at one locus allows confident prediction of the genotype at the other.\n+In other words, high LD means information redundancy between markers. For some\n+purposes, removing some of this redundancy can improve the performance of some analyses.\n+Executing this tool will create a new genotype dataset in your current history containing\n+LD independent markers - most of the genetic information is retained but without as much redundancy.\n+\n+Set a pairwise LD threshold (eg r^2 = 0.2) and the (smaller) resulting dataset will have no\n+pairs of markers with r^2 greater than 0.2. Additional filters are available to remove markers\n+below a specific minor allele frequency, or above a specific level of missingness,\n+and to remove subjects using similar criteria. Subjects and markers for family data can be\n+filtered by proportions of Mendelian errors in observed transmission.\n+\n+-----\n+\n+**Syntax**\n+\n+- **Genotype data** is the input pedfile chosen from available library files\n+- **New name** is the name to use for the filtered output file\n+- **Missfrac threshold: subjects** is the threshold for missingness by subject. Subjects with more than this fraction missing will be excluded from the import\n+- **Missfrac threshold: markers** is the threshold for missingness by marker. Markers with more than this fraction missing will be excluded from the import\n+- **MaxMendel Individuals** Mendel error fraction above which to exclude subjects with more than the specified fraction of mendelian errors in transmission (for family data only)\n+- **MaxMendel Families** Mendel error fraction above which to exclude families with more than the specified fraction of mendelian errors in transmission (for family data only)\n+- **HWE** is the threshold for HWE test p values below which the marker will not be imported. Set this to -1 and all markers will be imported regardless of HWE p value\n+- **MAF** is the threshold for minor allele frequency - SNPs with lower MAF will be excluded\n+- **r^2** is the pairwise LD threshold as r^2. Lower -> less marker redundancy -> fewer markers\n+- **Window** is the window width for LD threshold. Bigger -> slower -> more complete\n+- **Skip** is the distance to move the window along the genome. Should be window or less.\n+\n+-----\n+\n+**Disclaimer**\n+\n+This tool relies on Plink from Shaun Purcell. For full documentation, please see his web site\n+at http://pngu.mgh.harvard.edu/~purcell/plink/ where there is excellent documentation describing\n+the parameters you can set here. Rgenetics merely exposes them, and wraps Plink so you can use it in Galaxy.\n+\n+This tool is designed to create genotype data files with more or less LD independent sets of markers. 
These\n+reduced genotype data files are particularly useful for purposes such as evaluating\n+ancestry (eg eigenstrat) or relatedness (eg rgGRR).\n+\n+LD pruning decreases redundancy among the genotype data by removing one of each pair of markers\n+in strong LD (above the r^2 threshold) over successive genomic windows (the Window parameter),\n+skipping (the Skip parameter) bases between windows. The defaults should produce usable outputs.\n+\n+This might be more efficient for rgGRR and\n+eigenstrat...The core quote is\n+\n+    "This generates the same output files as the first version;\n+    the only difference is that a simple pairwise threshold is used.\n+    The first two parameters (50 and 5) are the same as above (window size and step);\n+    the third parameter represents the r^2 threshold.\n+    Note: this represents the pairwise SNP-SNP metric now, not the\n+    multiple correlation coefficient; also note, this is based on the\n+    genotypic correlation, i.e. it does not involve phasing.\n+    "\n+\n+-----\n+\n+\n+\n+This Galaxy tool was written by Ross Lazarus for the Rgenetics project.\n+It uses Plink for most calculations - for full Plink attribution, source code and documentation,\n+please see http://pngu.mgh.harvard.edu/~purcell/plink/ plus some custom python code.\n+\n+</help>\n+</tool>\n'
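
For reference, this is how the <command> template above maps onto the 14 positional arguments rgLDIndep.py reads from sys.argv; the values are taken from the functional test, and the two Galaxy paths are illustrative only:

    args = ['rgLDIndep.py',
            '/galaxy/files/dataset_1_files',   # $input_file.extra_files_path (illustrative)
            'tinywga',                         # $input_file.metadata.base_name
            'rgLDIndeptest1',                  # $title1
            '1',                               # $mind
            '1',                               # $geno
            '0',                               # $hwe
            '0',                               # $maf
            '1',                               # $mef
            '1',                               # $mei
            'dataset_2.dat',                   # $out_file1 (illustrative)
            '/galaxy/files/dataset_2_files',   # $out_file1.files_path (illustrative)
            '10000',                           # $window
            '5000',                            # $step
            '0.1']                             # $r2
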
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgLDIndep_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgLDIndep_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,25 @@
+from galaxy import app
+import os, string, time
+
+def timenow():
+    """return current time as a string
+    """
+    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
+
+
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+    name,data = out_data.items()[0]
+    basename = param_dict['title1']
+    killme = string.punctuation + string.whitespace
+    trantab = string.maketrans(killme,'_'*len(killme))
+    title = basename.encode().translate(trantab)
+    info = '%s filtered by rgLDIndep.py at %s' % (title,timenow())
+    data.file_name = data.file_name
+    data.metadata.base_name = title
+    data.name = '%s.pbed' % title
+    data.info = info
+    app.model.context.flush()
+
+
+
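
The punctuation/whitespace translation table above is an idiom shared by several of these tools: every character that could upset a file or dataset name is mapped to an underscore. A standalone example (Python 2, matching the tool code - string.maketrans changed in Python 3):

    import string

    killme = string.punctuation + string.whitespace
    trantab = string.maketrans(killme, '_' * len(killme))

    print('My title: QC (test)!'.translate(trantab))
    # -> 'My_title__QC__test__'
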
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgManQQ.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgManQQ.py Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,334 @@\n+#!/usr/local/bin/python\n+# This is a truly ghastly hack\n+# all of the heavy data cleaning lifting is done in R which is a really dumb place IMHO\n+# Making a new file seems a waste but it would be far easier to set everything up in python\n+# seems to work so I\'m leaving it alone\n+# sigh. Should really move this gig to rpy - writing a robust R script is hard.\n+# updated to compress pdf using gs since millions of points = horsechoker pdfs and pdfs are good\n+# updated july 20 to fix sort order - R unique() sorts into strict collating order\n+# so need to sort after unique to revert to lexicographic order for x axis on Manhattan\n+# rgmanqq updated july 19 to deal with x,y and mt\n+# lots of fixes\n+# ross lazarus\n+import sys,math,shutil,subprocess,os,time,tempfile,string\n+from os.path import abspath\n+from rgutils import timenow, RRun, galhtmlprefix, galhtmlpostfix, galhtmlattr\n+progname = os.path.split(sys.argv[0])[1]\n+myversion = \'V000.1 March 2010\'\n+verbose = False\n+debug = False\n+\n+rcode="""\n+# generalised so 3 core fields passed as parameters ross lazarus March 24 2010 for rgenetics\n+# Originally created as qqman with the following \n+# attribution:\n+#--------------\n+# Stephen Turner\n+# http://StephenTurner.us/\n+# http://GettingGeneticsDone.blogspot.com/\n+\n+# Last updated: 19 July 2011 by Ross Lazarus\n+# R code for making manhattan plots and QQ plots from plink output files. \n+# With GWAS data this can take a lot of memory. Recommended for use on \n+# 64bit machines only, for now. \n+\n+#\n+\n+library(ggplot2)\n+\n+coloursTouse = c(\'firebrick\',\'darkblue\',\'goldenrod\',\'darkgreen\')\n+# not too ugly but need a colour expert please...\n+\n+\n+DrawManhattan = function(pvals=Null,chrom=Null,offset=Null,title=NULL, max.y="max",suggestiveline=0, genomewide=T, size.x.labels=9, \n+              size.y.labels=10, annotate=F, SNPlist=NULL,grey=0) {\n+        if (annotate & is.null(SNPlist)) stop("You requested annotation but provided no SNPlist!")\n+        genomewideline=NULL # was genomewideline=-log10(5e-8)\n+        n = length(pvals)\n+        if (genomewide) { # use bonferroni since might be only a small region?\n+            genomewideline = -log10(0.05/n) }\n+        offset = as.integer(offset)\n+        if (n > 1000000) { offset = offset/10000 }\n+        else if (n > 10000) { offset = offset/1000}\n+        chro = as.integer(chrom) # already dealt with X and friends?\n+        pvals = as.double(pvals)\n+        d=data.frame(CHR=chro,BP=offset,P=pvals)\n+        if ("CHR" %in% names(d) & "BP" %in% names(d) & "P" %in% names(d) ) {\n+                d=d[!is.na(d$P), ]\n+                d=d[!is.na(d$BP), ]\n+                d=d[!is.na(d$CHR), ]\n+                #limit to only chrs 1-22, x=23,y=24,Mt=25?\n+                d=d[d$CHR %in% 1:25, ]\n+                d=d[d$P>0 & d$P<=1, ]\n+                d$logp = as.double(-log10(d$P))\n+                dlen = length(d$P)\n+                d$pos=NA\n+                ticks=NULL\n+                lastbase=0\n+                chrlist = unique(d$CHR)\n+                chrlist = as.integer(chrlist)\n+                chrlist = sort(chrlist) # returns lexical ordering \n+                if (max.y=="max") { maxy = ceiling(max(d$logp)) } \n+                   else { maxy = max.y }\n+                nchr = length(chrlist) # may be any number?\n+                maxy = max(maxy,1.1*genomewideline)\n+                if (nchr >= 2) {\n+                    for (x in c(1:nchr)) {\n+                        i 
= chrlist[x] # need the chrom number - may not == index\n+                        if (x == 1) { # first time\n+                            d[d$CHR==i, ]$pos = d[d$CHR==i, ]$BP # initialize to first BP of chr1\n+                            dsub = subset(d,CHR==i)\n+                            dlen = length(dsub$P)\n+                            lastbase = max(dsub$pos) # last one\n+                            tks = d[d$CHR==i, ]$pos[floor(length(d[d$CHR==i, ]$pos)/2)+1]\n+                            lastchr = i\n+                        } '..b'set <> 0\n+    contains some R scripts as text strings - we substitute defaults into the calls\n+    to make them do our bidding - and save the resulting code for posterity\n+    this can be called externally, I guess...for QC eg?\n+    """\n+    if debug:\n+        print \'doManQQ\',input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir\n+    rcmd = \'%s%s\' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))\n+    if debug:\n+        print \'running\\n%s\\n\' % rcmd\n+    rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)\n+    rlog.append(\'## R script=\')\n+    rlog.append(rcmd)\n+    return rlog,flist\n+  \n+def compressPDF(inpdf=None):\n+    """need absolute path to pdf\n+    """\n+    assert os.path.isfile(inpdf), "## Input %s supplied to compressPDF not found" % inpdf\n+    outpdf = \'%s_compressed\' % inpdf\n+    cl = ["gs", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH", "-sOutputFile=%s" % outpdf,inpdf]\n+    retval = subprocess.call(cl)\n+    if retval == 0:    \n+        os.unlink(inpdf) \n+        shutil.move(outpdf,inpdf)\n+    return retval\n+\n+def main():\n+    u = """<command interpreter="python">\n+        rgManQQ.py \'$input_file\' "$name" \'$out_html\' \'$out_html.files_path\' \'$chrom_col\' \'$offset_col\' \'$pval_col\'\n+    </command>\n+    """\n+    npar = 8\n+    if len(sys.argv) < npar:\n+            print >> sys.stdout, \'## error - too few command line parameters - wanting %d\' % npar\n+            print >> sys.stdout, u\n+            sys.exit(1)\n+    input_fname = sys.argv[1]\n+    title = sys.argv[2]\n+    killme = string.punctuation + string.whitespace\n+    trantab = string.maketrans(killme,\'_\'*len(killme))\n+    ctitle = title.translate(trantab)\n+    outhtml = sys.argv[3]\n+    outdir = sys.argv[4]\n+    try:\n+         chrom_col = int(sys.argv[5])\n+    except:\n+         chrom_col = -1\n+    try:\n+        offset_col = int(sys.argv[6])\n+    except:\n+        offset_col = -1\n+    p = sys.argv[7].strip().split(\',\')\n+    try:\n+        q = [int(x) for x in p]\n+    except:\n+        p = -1\n+    if chrom_col == -1 or offset_col == -1: # was passed as zero - do not do manhattan plots\n+        chrom_col = -1\n+        offset_col = -1\n+    grey = 0\n+    if (sys.argv[8].lower() in [\'1\',\'true\']):\n+       grey = 1\n+    if p == -1:\n+        print >> sys.stderr,\'## Cannot run rgManQQ - missing pval column\'\n+        sys.exit(1)\n+    p = [\'%d\' % (int(x) + 1) for x in p]\n+    rlog,flist = doManQQ(input_fname,chrom_col+1,offset_col+1,\',\'.join(p),title,grey,ctitle,outdir)\n+    flist.sort()\n+    html = [galhtmlprefix % progname,]\n+    html.append(\'<h1>%s</h1>\' % title)\n+    if len(flist) > 0:\n+        html.append(\'<table>\\n\')\n+        for row in flist:\n+            fname,expl = row # RRun returns pairs of filenames fiddled for the log and R script\n+            n,e = os.path.splitext(fname)\n+            if e in [\'.png\',\'.jpg\']:\n+                pdf = 
\'%s.pdf\' % n\n+                pdff = os.path.join(outdir,pdf)\n+                if os.path.exists(pdff):\n+                    rval = compressPDF(inpdf=pdff)\n+                    if rval <> 0:\n+                        pdf = \'%s(not_compressed)\' % pdf\n+                else:\n+                    pdf = \'%s(not_found)\' % pdf\n+                s= \'<tr><td><a href="%s"><img src="%s" title="%s" hspace="10" width="800"></a></td></tr>\' \\\n+                 % (pdf,fname,expl)\n+                html.append(s)\n+            else:\n+               html.append(\'<tr><td><a href="%s">%s</a></td></tr>\' % (fname,expl))\n+        html.append(\'</table>\\n\')\n+    else:\n+        html.append(\'<h2>### Error - R returned no files - please confirm that parameters are sane</h1>\')    \n+    html.append(\'<h3>R log follows below</h3><hr><pre>\\n\')\n+    html += rlog\n+    html.append(\'</pre>\\n\')   \n+    html.append(galhtmlattr % (progname,timenow()))\n+    html.append(galhtmlpostfix)\n+    htmlf = file(outhtml,\'w\')\n+    htmlf.write(\'\\n\'.join(html))\n+    htmlf.write(\'\\n\')\n+    htmlf.close()\n+    \n+  \n+\n+if __name__ == "__main__":\n+    main()\n+\n+\n'
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgManQQ.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgManQQ.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,119 @@
+<tool id="rgManQQ1" name="Manhattan/QQ:" version="1.0.3">
+    <code file="rgManQQ_code.py"/>
+
+    <description>Plots for WGA P values</description>
+
+    <command interpreter="python">
+        rgManQQ.py '$i' "$name" '$out_html' '$out_html.files_path' '$chrom_col' '$offset_col' '$pval_col' '$grey'
+    </command>
+
+    <inputs>
+      <page>
+      <param name="i"  type="data" label="Tabular data from your current history"
+      format="tabular" refresh_on_change="true"/>
+      </page>
+      <page>
+       <param name='name' type='text' size="132" value='Manhattan and QQ plots' label="Title for this job"/>
+       <param name="pval_col" type='select' size="5" label = 'P value (0-1) column in input file'  
+        dynamic_options="get_phecols(i,False,'pval')" refresh_on_change="true" multiple="true" 
+        help="(Select multiple P value columns for multiple plots holding down the [Ctrl] key as you click)" />
+       <param name="chrom_col" type='select' label = 'Chromosome column in input file'
+        help='Select "None" if chromosome not available or no Manhattan plot required'
+        dynamic_options="get_phecols(i,True,'chr')" />
+       <param name="offset_col" type='select' label = 'Base pair offset column in input file'
+        help='Select "None" if offset not available or no Manhattan plot required'
+        dynamic_options="get_phecols(i,True,'offs')" />
+       <param name="grey" type="boolean" checked="false" truevalue="true" falsevalue="false" 
+        label="Grey scale for Manhattan plot (default is colour"/> 
+       </page>
+    </inputs>
+    
+   <outputs>
+       <data format="html" name="out_html" />
+   </outputs>
+   <options refresh="True"/>
+
+<tests>
+ <test>
+ <param name='i' value='smallwgaP.xls' ftype='tabular' >
+ </param>
+ <param name='name' value='rgManQQtest1' />
+ <param name='pval_col' value='7' />
+ <param name='chrom_col' value='1' />
+ <param name='offset_col' value='2' />
+ <param name='grey' value='0' />
+ <output name='out_html' file='rgtestouts/rgManQQ/rgManQQtest1.html' ftype='html' lines_diff='60'>
+   <extra_files type="file" name='Allelep_manhattan.png' value='rgtestouts/rgManQQ/Allelep_manhattan.png' compare="sim_size" 
+     delta = "20000"/>
+   <extra_files type="file" name='Allelep_qqplot.png' value='rgtestouts/rgManQQ/Allelep_qqplot.png' compare="sim_size"
+     delta = "20000" />
+   <extra_files type="file" name='rgManQQtest1.R' value='rgtestouts/rgManQQ/rgManQQtest1.R' compare="diff" lines_diff="160"/>
+ </output>
+ </test>
+</tests>
+<help>
+
+.. class:: infomark
+
+**Syntax**
+
+- **Tabular Data** is a tab delimited header file with chromosome, offset and p values to be plotted
+- **Chromosome Column** is the column in that data containing the chromosome as an integer
+- **Offset Column** contains the offset within the chromosome
+- **P Value Column** contains the (untransformed) p values at that locus - choose multiple columns if needed
+
+NOTE - plotting millions of p values may take tens of minutes depending on
+how busy the server is - please be patient.
+
+-----
+
+.. class:: infomark
+
+**Summary**
+
+This tool will create a qq plot and a Manhattan plot for one or more GWA P value columns from a tabular
+dataset. For Manhattan plots, the data must include the chromosome (eg use 23,24,25 for x,y,mt...) and
+offset. Many analysis files contain the required fields but even without chromosome and offset, a qq plot 
+can be created.
+
+-----
+
+.. class:: infomark
+
+**Explanation**
+
+A "Manhattan" plot shows -log10 p values ordered by offset and by chromosome. Regions with interestingly
+improbable p values are above the red line which is drawn at the Bonferroni FWER control level (0.05/n 
+where n is the number of tests - this is highly conservative for correlated SNPs typical of GWA)
+
+.. image:: ./static/images/Armitagep_manhattan.png
+
+A quantile-quantile (QQ) plot is a good way to see systematic departures from the null expectation of
+uniform p-values from a genomic analysis. If the QQ plot shows departure from the null (ie a uniform 0-1
+distribution), you hope that this will be in the very smallest p-values, suggesting that there might be some
+interesting results to look at. A log scale helps make departures from the null at low p values
+more apparent.
+
+.. image:: ./static/images/Armitagep_qqplot.png
+
+-----
+
+.. class:: infomark
+
+**Attribution**
+
+This is a Galaxy tool written by Ross Lazarus. It relies on
+ggplot2, an R package from Hadley Wickham, and some
+R code for manhattan and qq plots using ggplot2,
+borrowed from Stephen Turner at http://GettingGeneticsDone.blogspot.com/
+
+copyright Ross Lazarus 2010
+Licensed under the terms of the LGPL as documented http://www.gnu.org/licenses/lgpl.html
+but is about as useful as a chocolate teapot without R and Galaxy which all have a
+twisty maze of little licenses, all different.
+
+I'm no lawyer, but it looks like at least LGPL if you create derived works from this code. 
+Good luck.
+
+</help>
+</tool>
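
A worked example of the red line described in the help above: it is drawn at -log10(0.05/n), the Bonferroni-corrected 0.05 level for n tests, and each marker is plotted at -log10(p). The numbers below are illustrative:

    import math

    n = 1000000                         # e.g. one million tested SNPs
    line = -math.log10(0.05 / n)        # Bonferroni FWER control level
    print('red line at -log10(p) = %.2f' % line)   # 7.30
    for p in (0.5, 1e-4, 3e-9):
        flag = ' (above the line)' if -math.log10(p) > line else ''
        print('p=%g -> -log10(p)=%.2f%s' % (p, -math.log10(p), flag))
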
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgManQQ_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgManQQ_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,50 @@
+from galaxy import datatypes,model
+import sys,string,time
+
+
+def timenow():
+    """return current time as a string
+    """
+    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
+
+def get_phecols(i,addNone,hint):
+   """ 
+   return a list of phenotype columns for a multi-select list
+   """
+   hint = hint.lower()
+   fname = i.dataset.file_name
+   try:
+        f = open(fname,'r')
+   except:
+        return [('get_phecols unable to open file "%s"' % fname,'None',False),]
+   header = f.next()
+   h = header.strip().split()
+   dat = [(x,'%d' % i,False) for i,x in enumerate(h)]
+   matches = [i for i,x in enumerate(h) if x.lower().find(hint) <> -1]
+   if len(matches) > 0:
+       sel = matches[0]
+       dat[sel] = (dat[sel][0],dat[sel][1],True)
+   if addNone:
+        dat.insert(0,('None - no Manhattan plot','0', False ))
+   return dat
+
+
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+    """Sets the name of the data
+       <outputs>
+       <data format="pdf" name="allqq" />
+       <data format="pdf" name="lowqq" parent="allqq"/>
+    </outputs>
+    """
+    outfile = 'out_html'
+    job_name = param_dict.get( 'name', 'Manhattan QQ plots' )
+    killme = string.punctuation + string.whitespace
+    trantab = string.maketrans(killme,'_'*len(killme))
+    newname = '%s.html' % job_name.translate(trantab)
+    data = out_data[outfile]
+    data.name = newname
+    data.info='%s run at %s' % (job_name,timenow())
+    out_data[outfile] = data
+    app.model.context.flush()
+
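
get_phecols() above drives the dynamic select lists in rgManQQ.xml: it scans the header row of the chosen tabular dataset and pre-selects the first column whose name contains a hint ('pval', 'chr' or 'offs'). A standalone sketch of that matching logic, with a made-up header line:

    def guess_column(header_line, hint):
        """Index of the first column whose name contains hint, else None."""
        h = header_line.strip().split()
        matches = [i for i, name in enumerate(h) if hint.lower() in name.lower()]
        return matches[0] if matches else None

    header = 'CHROM OFFSET SNP Armitagep Allelep'
    print(guess_column(header, 'chr'))    # -> 0
    print(guess_column(header, 'offs'))   # -> 1
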
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgPedSub.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgPedSub.py Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,305 @@\n+"""\n+July 1 2009 added relatedness filter - fo/oo or all\n+released under the terms of the LGPL\n+copyright ross lazarus August 2007 \n+for the rgenetics project\n+\n+Special galaxy tool for the camp2007 data\n+Allows grabbing genotypes from an arbitrary region\n+\n+Needs a mongo results file in the location hardwired below or could be passed in as\n+a library parameter - but this file must have a very specific structure\n+rs chrom offset float1...floatn\n+\n+called as\n+\n+    <command interpreter="python2.4">\n+        campRGeno2.py $region "$rslist" "$title" $output1 $log_file $userId "$lpedIn" "$lhistIn"\n+    </command>\n+\n+\n+"""\n+\n+\n+import sys, array, os, string\n+from rgutils import galhtmlprefix,plinke,readMap\n+\n+progname = os.path.split(sys.argv[0])[1]\n+\n+\n+atrandic = {\'A\':\'1\',\'C\':\'2\',\'G\':\'3\',\'T\':\'4\',\'N\':\'0\',\'-\':\'0\',\'1\':\'1\',\'2\':\'2\',\'3\':\'3\',\'4\':\'4\',\'0\':\'0\'}\n+\n+def doImport(outfile=\'test\',flist=[]):\n+    """ import into one of the new html composite data types for Rgenetics\n+        Dan Blankenberg with mods by Ross Lazarus \n+        October 2007\n+    """\n+    out = open(outfile,\'w\')\n+    out.write(galhtmlprefix % progname)\n+\n+    if len(flist) > 0:\n+        out.write(\'<ol>\\n\')\n+        for i, data in enumerate( flist ):\n+           out.write(\'<li><a href="%s">%s</a></li>\\n\' % (os.path.split(data)[-1],os.path.split(data)[-1]))\n+        out.write(\'</ol>\\n\')\n+    else:\n+           out.write(\'No files found\')\n+    out.write("</div></body></html>")\n+    out.close()\n+\n+def setupPedFilter(relfilter=\'oo\',dfile=None):\n+    """ figure out who to drop to satisfy relative filtering\n+    note single offspring only from each family id\n+    ordering of pdict keys makes this \'random\' as the first one only is\n+    kept if there are multiple sibs from same familyid.\n+    """\n+    dropId = {}\n+    keepoff = (relfilter == \'oo\')\n+    keepfounder = (relfilter == \'fo\')\n+    pdict = {}\n+    for row in dfile:\n+        rowl = row.strip().split()\n+        if len(rowl) > 6:\n+            idk = (rowl[0],rowl[1])\n+            pa =  (rowl[0],rowl[2]) # key for father\n+            ma = (rowl[0],rowl[3]) # and mother\n+            pdict[idk] = (pa,ma)\n+    dfile.seek(0) # rewind\n+    pk = pdict.keys()\n+    for p in pk:\n+        parents = pdict[p]\n+        if pdict.get(parents[0],None) or pdict.get(parents[1],None): # parents are in this file\n+            if keepfounder:\n+                dropId[p] = 1 # flag for removal\n+        elif keepoff:\n+            dropId[p] = 1 # flag for removal \n+    if keepoff: # TODO keep only a random offspring if many - rely on pdict keys being randomly ordered...!   
\n+        famseen = {}\n+        for p in pk: # look for multiples from same family - drop all but first\n+             famid = p[0]\n+             if famseen.get(famid,None):\n+                 dropId[p] = 1 # already got one from this family\n+             famseen.setdefault(famid,1)\n+    return dropId\n+   \n+def writeFped(rslist=[],outdir=None,title=\'Title\',basename=\'\',dfile=None,wewant=[],dropId={},outfile=None,logfile=None):\n+    """ fbat format version\n+    """\n+    outname = os.path.join(outdir,basename)\n+    pedfname = \'%s.ped\' % outname\n+    ofile = file(pedfname, \'w\')\n+    rsl = \' \'.join(rslist) # rslist for fbat\n+    ofile.write(rsl)\n+    s = \'wrote %d marker header to %s - %s\\n\' % (len(rslist),pedfname,rsl[:50])\n+    lf.write(s)\n+    ofile.write(\'\\n\')\n+    nrows = 0\n+    for line in dfile:\n+        line = line.strip()\n+        if not line:\n+            continue\n+        line = line.replace(\'D\',\'N\')\n+        fields = line.split()\n+        preamble = fields[:6]\n+        idk = (preamble[0],preamble[1])\n+        dropme = dropId.get(idk,None)\n+        if not dropme:\n+            g = [\'%s %s\' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]\n+            g = \' \'.join(g)\n+            g = g.split() # we\'ll get there\n+            g = [atrandic.get(x,\'0\') for x in g] # numeric alleles...\n+            # hack for framingham ND\n+            ofile.write(\'%s'..b'XX - go figure\n+    title = conf.get(\'title\',\'\').translate(ptran) # for outputs\n+    outfile = conf.get(\'output1\',\'\')\n+    outdir = conf.get(\'outdir\',\'\')\n+    try:\n+        os.makedirs(outdir)\n+    except:\n+        pass\n+    outformat = conf.get(\'outformat\',\'lped\')\n+    basename = conf.get(\'basename\',title)\n+    logfile = os.path.join(outdir,\'%s.log\' % title) \n+    userId = conf.get(\'userId\',\'\') # for library\n+    pedFileBase = conf.get(\'inped\',\'\')\n+    relfilter = conf.get(\'relfilter\',\'\')\n+    MAP_FILE = \'%s.map\' % pedFileBase\n+    DATA_FILE = \'%s.ped\' % pedFileBase    \n+    title = conf.get(\'title\',\'lped subset\')\n+    lf = file(logfile,\'w\')\n+    lf.write(\'config file %s = \\n\' % configf)\n+    lf.write(\'\'.join(config))\n+    c = \'\'\n+    spos = epos = 0\n+    rslist = []\n+    rsdict = {}\n+    if region > \'\':\n+        try: # TODO make a regexp?\n+            c,rest = region.split(\':\')\n+            c = c.replace(\'chr\',\'\')\n+            rest = rest.replace(\',\',\'\') # remove commas\n+            spos,epos = rest.split(\'-\')\n+            spos = int(spos)\n+            epos = int(epos)\n+            s = \'## %s parsing chrom %s from %d to %d\\n\' % (progname,c,spos,epos)\n+            lf.write(s)\n+        except:\n+            s = \'##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\\n\' % (progname,region)\n+            lf.write(s)\n+            lf.close()\n+            sys.exit(1)\n+    else:\n+        rslist = orslist.split() # galaxy replaces newlines with XX - go figure\n+        rsdict = dict(zip(rslist,rslist))\n+    allmarkers = False\n+    if len(rslist) == 0 and epos == 0: # must be a full extract - presumably remove relateds or something\n+        allmarkers = True\n+    ### Figure out which markers are in this region\n+    markers,snpcols,rslist,rsdict = readMap(mapfile=MAP_FILE,allmarkers=allmarkers,rsdict=rsdict,c=c,spos=spos,epos=epos)\n+    if len(rslist) == 0:\n+            s = \'##! 
%s found no rs numbers in %s\\n\' % (progname,sys.argv[1:3])\n+            lf.write(s)\n+            lf.write(\'\\n\')\n+            lf.close()\n+            sys.exit(1)\n+    s = \'## %s looking for %d rs (%s....etc)\\n\' % (progname,len(rslist),rslist[:5])\n+    lf.write(s)\n+    try:\n+        dfile = open(DATA_FILE, \'r\')\n+    except: # bad input file name?\n+        s = \'##! rgPedSub unable to open file %s\\n\' % (DATA_FILE)\n+        lf.write(s)\n+        lf.write(\'\\n\')\n+        lf.close()\n+        print >> sys.stdout, s\n+        raise\n+        sys.exit(1)\n+    if relfilter <> \'all\': # must read pedigree and figure out who to drop\n+        dropId = setupPedFilter(relfilter=relfilter,dfile=dfile)\n+    else:\n+        dropId = {}\n+    wewant = [(6+(2*snpcols[x])) for x in rslist] # \n+    # column indices of first geno of each marker pair to get the markers into genomic\n+    ### ... and then parse the rest of the ped file to pull out\n+    ### the genotypes for all subjects for those markers\n+    # /usr/local/galaxy/data/rg/1/lped/\n+    if len(dropId.keys()) > 0:\n+        s = \'## dropped the following subjects to satisfy requirement that relfilter = %s\\n\' % relfilter\n+        lf.write(s)\n+        if relfilter == \'oo\':\n+            s = \'## note that one random offspring from each family was kept if there were multiple offspring\\n\'\n+            lf.write(s)\n+        s = \'FamilyId\\tSubjectId\\n\'\n+        lf.write(s)\n+        dk = dropId.keys()\n+        dk.sort()\n+        for k in dk:\n+            s = \'%s\\t%s\\n\' % (k[0],k[1])\n+            lf.write(s)\n+    lf.write(\'\\n\')\n+    lf.close()\n+    if outformat == \'lped\':\n+        nrows,pedfname=writePed(markers=markers,outdir=outdir,title=title,basename=basename,dfile=dfile,\n+                 wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile)\n+    elif outformat == \'fped\':\n+        nrows,pedfname=writeFped(rslist=rslist,outdir=outdir,title=title,basename=basename,dfile=dfile,\n+                  wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile)\n+    dfile.close()    \n+\n+if __name__ == "__main__":\n+    subset()\n'
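
rgPedSub.py reads its parameters from a config file of key~~~~value lines rather than from sys.argv (the file is generated by the <configfile> section of rgPedSub.xml, next). A minimal sketch of parsing that format; the file name is illustrative:

    def read_config(fname):
        conf = {}
        for line in open(fname):
            line = line.strip()
            if '~~~~' in line:
                key, value = line.split('~~~~', 1)
                conf[key] = value
        return conf

    # conf = read_config('script_file')
    # region = conf.get('region', '')
    # rslist = conf.get('rslist', '')
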
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgPedSub.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgPedSub.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,143 @@
+<tool id="rgPedSub1" name="Subset markers:">
+
+    <description>region or rs list</description>
+
+    <command interpreter="python">
+        rgPedSub.py $script_file
+    </command>
+
+    <inputs>
+        <page>
+       <param name="title" type="text" size="80" label="Title for output files"
+        help="Descriptive title for new genotype/map files" value="Genotype_Subset" />
+       <param name="input1" type="data" format="lped"
+    label="Current history lPed format data" optional="false"
+     size="120" help="Choose a Linkage Ped format data from your current history" />
+       <param name='relfilter' label = "Filter out family relatedness" type="select"
+         optional="false" size="132"
+         help="Optionally remove related subjects if pedigree identifies founders and their offspring">
+         <option value="all" selected='true'>No filter on relatedness - all subjects passed through</option>
+         <option value="fo" >Founders only (pedigree mother and father ID = "0")</option>
+         <option value="oo" >Offspring only (one randomly chosen if >1 sibs in family)</option>
+    </param>
+
+        </page><page>
+       <conditional name="m">
+         <param name="mtype" type="select"  label="Markers in a genomic interval,or as an rs list?" refresh_on_change='true'
+         help="Indicate the markers to be saved - as a list or as genomic region coordinates">
+           <option value="grslist" >Cut and paste a list of marker ids as rs numbers</option>
+           <option value="gregion" selected='true'>Supply genomic coordinates for a region (as UCSC location)</option>
+         </param>
+         <when value="gregion">
+          <param name="region" type="text" label="Genomic refseq coordinates - chromosome:start-end"
+         size="120" help="Region to be saved as chr9:119,506,000-119,518,000"/>
+           <param name="rslist" type="hidden" value='' />
+         </when>
+         <when value="grslist">
+           <param name="region" value="" type="hidden"/>
+             <param name="rslist" type="text" area='true' size='15x20' label="marker id (rs) list"
+        help="Cut and paste, or type a list of marker ids separated by spaces"  />
+         </when>
+        </conditional>
+        </page>
+   </inputs>
+
+   <outputs>
+       <data format="lped" name="output1" metadata_source="input1" label="${title}.lped"/>
+   </outputs>
+
+<configfiles>
+<configfile name="script_file">
+title~~~~$title
+output1~~~~$output1
+userId~~~~$userId
+outformat~~~~lped
+basename~~~~$input1.metadata.base_name
+inped~~~~$input1.extra_files_path/$input1.metadata.base_name
+outdir~~~~$output1.files_path
+relfilter~~~~$relfilter
+#if $m.mtype=='grslist'
+rslist~~~~$m.rslist
+region~~~~
+#else
+rslist~~~~
+region~~~~$m.region
+#end if
+</configfile>
+</configfiles>
+
+<tests>
+ <test>
+    <param name='input1' value='tinywga' ftype='lped' >
+    <metadata name='base_name' value='tinywga' />
+    <composite_data value='tinywga.ped' />
+    <composite_data value='tinywga.map' />
+    <edit_attributes type='name' value='tinywga' /> 
+    </param>
+    <param name='title' value='rgPedSubtest1' />
+    <param name="mtype" value="grslist" />
+    <param name="region" value="" />
+    <param name="rslist" value="rs2283802Xrs2267000Xrs16997606Xrs4820537Xrs3788347Xrs756632Xrs4820539Xrs2283804Xrs2267006Xrs4822363X" />
+    <param name="relfilter" value="all" />
+    <output name='output1' file='rgtestouts/rgPedSub/rgPedSubtest1.lped' ftype='lped' lines_diff='7'/>
+ </test>
+</tests>
+
+<help>
+
+.. class:: infomark
+
+**Note**
+
+There are two forms to complete before the job is ready to run.
+
+  **Page 1**
+
+     Give the job a mnemonic, descriptive title and select the output format.
+
+     Choose a file containing genotypes and a pedigree from your current history.
+
+     The input file must be in linkage ped format.
+
+     If the data are not yet in your history, import from one of the system libraries or upload from your computer using the get data tool
+
+  **Page 2**
+
+     Define the markers to be used. You can supply a UCSC style location as chr:start_offset-end_offset
+
+     or a list of marker ids - rs numbers. You can flip between marker input styles by changing the select box.
+
+     If you supply a list, the markers must all be from the same chromosome or region for sensible results.
+
+Run the job and the subset file will eventually appear in your history ready to be used with other tools.
+
+-----
+
+**Syntax**
+
+- **Library Linkage Ped** is a linkage format pedigree file chosen from the system file Library
+- **History Linkage Ped** is a linkage format pedigree file chosen from your current Galaxy History
+- **Region** is the genomic region cut and paste from a UCSC browser location window
+- **Genome Build** is the version of the genome your markers are from - use hg18 for CAMP illumina data
+
+-----
+
+.. class:: infomark
+
+**Summary**
+
+This tool is a special purpose tool to extract genotypes from genotype data in linkage
+pedigree format (separate map file) over a specified genomic region.
+The region to be extracted can be described as a UCSC browser location, or as a list of
+markers.
+
+It is possible to retain ALL markers by leaving the rslist and region empty - if you just want to
+remove all offspring from a pedigree, for example.
+
+The extracted data will appear in your current history as a new lped data set.
+
+Copyright, Ross Lazarus, March 2008 for the Rgenetics project
+Released under the LGPL. See http://www.gnu.org/licenses/lgpl.html for license terms.
+
+</help>
+</tool>
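
For reference, Galaxy renders the <configfile> block above into the plain text file rgPedSub.py parses, one key~~~~value pair per line. With inputs along the lines of the functional test it would look roughly like this (paths and rs values are illustrative):

    title~~~~rgPedSubtest1
    output1~~~~/galaxy/files/dataset_5.dat
    userId~~~~
    outformat~~~~lped
    basename~~~~tinywga
    inped~~~~/galaxy/files/dataset_4_files/tinywga
    outdir~~~~/galaxy/files/dataset_5_files
    relfilter~~~~all
    rslist~~~~rs2283802 rs2267000 rs16997606
    region~~~~
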
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgQC.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgQC.py Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,1354 @@\n+# oct 15 rpy replaced - temp fix until we get gnuplot working \n+# rpy deprecated - replace with RRun\n+# fixes to run functional test! oct1 2009\n+# needed to expand our path with os.path.realpath to get newpath working with\n+# all the fancy pdfnup stuff\n+# and a fix to pruneld to write output to where it should be\n+# smallish data in test-data/smallwga in various forms\n+# python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga\n+# child files are deprecated and broken as at july 19 2009\n+# need to move them to the html file extrafiles path\n+# found lots of corner cases with some illumina data where cnv markers were\n+# included\n+# and where affection status was all missing !\n+# added links to tab files showing worst 1/keepfrac markers and subjects\n+# ross lazarus january 2008\n+#\n+# added named parameters\n+# to ensure no silly slippages if non required parameter in the most general case\n+# some potentially useful things here reusable in complex scripts\n+# with lots\'o\'html (TM)\n+# aug 17 2007 rml\n+#\n+# added marker and subject and parenting april 14 rml\n+# took a while to get the absolute paths right for all the file munging\n+# as of april 16 seems to work..\n+# getting galaxy to serve images in html reports is a little tricky\n+# we don\'t want QC reports to be dozens of individual files, so need\n+# to use the url /static/rg/... since galaxy\'s web server will happily serve images\n+# from there\n+# galaxy passes output files as relative paths\n+# these have to be munged by rgQC.py before calling this\n+# galaxy will pass in 2 file names - one for the log\n+# and one for the final html report\n+# of the form \'./database/files/dataset_66.dat\'\n+# we need to be working in that directory so our plink output files are there\n+# so these have to be munged by rgQC.py before calling this\n+# note no ped file passed so had to remove the -l option\n+# for plinkParse.py that makes a heterozygosity report from the ped\n+# file - needs fixing...\n+# new: importing manhattan/qqplot plotter\n+# def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir):\n+#    """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0\n+#    contains some R scripts as text strings - we substitute defaults into the calls\n+#    to make them do our bidding - and save the resulting code for posterity\n+#    this can be called externally, I guess...for QC eg?\n+#    """\n+#\n+#    rcmd = \'%s%s\' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))\n+#    rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)\n+#    return rlog,flist\n+  \n+\n+from optparse import OptionParser\n+\n+import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string\n+from os.path import abspath\n+from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD\n+import rgManQQ\n+\n+prog = os.path.split(sys.argv[0])[1]\n+vers = \'0.4 april 2009 rml\'\n+idjoiner = \'_~_~_\' # need something improbable..\n+# many of these may need fixing for a new install\n+\n+myversion = vers\n+keepfrac = 20 # fraction to keep after sorting by each interesting value\n+\n+missvals = {\'0\':\'0\',\'N\':\'N\',\'-9\':\'-9\',\'-\':\'-\'} # fix me if these change!\n+\n+mogresize = "x300" # this controls the width for jpeg thumbnails\n+\n+\n+\n+            \n+def 
makePlots(markers=[],subjects=[],newfpath=\'.\',basename=\'test\',nbreaks=\'20\',nup=3,height=10,width=8,rgbin=\'\'):\n+    """\n+    marker rhead = [\'snp\',\'chrom\',\'maf\',\'a1\',\'a2\',\'missfrac\',\n+    \'p_hwe_all\',\'logp_hwe_all\',\'p_hwe_unaff\',\'logp_hwe_unaff\',\'N_Mendel\']\n+    subject rhead = [\'famId\',\'iId\',\'FracMiss\',\'Mendel_errors\',\'Ped_sex\',\'SNP_sex\',\'Status\',\'Fest\']\n+    """\n+\n+        \n+    def rHist(plotme=[],outfname=\'\',xlabname=\'\',title=\'\',basename=\'\',nbreaks=50):\n+        """   rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)\n+        # generic histogram and vertical boxplot in a 3:1 layout\n+        # returns the graphic file name for inclusio'..b'logf) # writes the subject_froot.xls file\n+    markers,markerTops = markerRep(froot=repout,outfname=amarkf,newfpath=newfpath,\n+                logf=alogf,maplist=maplist) # marker_froot.xls\n+    nbreaks = 100\n+    s = \'## starting plotpage, newfpath=%s,m=%s,s=%s/n\' % (newfpath,markers[:2],subjects[:2])\n+    alogf.write(s)\n+    print s\n+    plotpage,cruft = makePlots(markers=markers,subjects=subjects,newfpath=newfpath,\n+                         basename=basename,nbreaks=nbreaks,height=10,width=8,rgbin=rgbin)\n+    #plotpage = RmakePlots(markers=markers,subjects=subjects,newfpath=newfpath,basename=basename,nbreaks=nbreaks,rexe=rexe)\n+\n+    # [titles[n],plotnames[n],outfnames[n] ]\n+    html = []\n+    html.append(\'<table cellpadding="5" border="0">\')\n+    size = getfSize(amarkf,newfpath)\n+    html.append(\'<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>\' % \\\n+                (amarkf,\'Click here to download the Marker QC Detail report file\',size))\n+    size = getfSize(asubjf,newfpath)\n+    html.append(\'<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>\' % \\\n+                (asubjf,\'Click here to download the Subject QC Detail report file\',size))\n+    for (title,url,ofname) in plotpage:\n+        ttitle = \'Ranked %s\' % title\n+        dat = subjectTops.get(ttitle,None)\n+        if not dat:\n+            dat = markerTops.get(ttitle,None)\n+        imghref = \'%s.jpg\' % os.path.splitext(url)[0] # removes .pdf\n+        thumbnail = os.path.join(newfpath,imghref)\n+        if not os.path.exists(thumbnail): # for multipage pdfs, mogrify makes multiple jpgs - fugly hack\n+            imghref = \'%s-0.jpg\' % os.path.splitext(url)[0] # try the first jpg\n+            thumbnail = os.path.join(newfpath,imghref)\n+        if not os.path.exists(thumbnail):\n+            html.append(\'<tr><td colspan="3"><a href="%s">%s</a></td></tr>\' % (url,title))\n+        else:\n+            html.append(\'<tr><td><a href="%s"><img src="%s" alt="%s" hspace="10" align="middle">\' \\\n+                    % (url,imghref,title))\n+            if dat: # one or the other - write as an extra file and make a link here\n+                t = \'%s.xls\' % (ttitle.replace(\' \',\'_\'))\n+                fname = os.path.join(newfpath,t)\n+                f = file(fname,\'w\')\n+                f.write(\'\\n\'.join([\'\\t\'.join(x) for x in dat])) # the report\n+                size = getfSize(t,newfpath)\n+                html.append(\'</a></td><td>%s</td><td><a href="%s">Worst data</a>%s</td></tr>\' % (title,t,size))\n+            else:\n+                html.append(\'</a></td><td>%s</td><td>&nbsp;</td></tr>\' % (title))\n+    html.append(\'</table><hr><h3>All output files from the 
QC run are available below</h3>\')\n+    html.append(\'<table cellpadding="5" border="0">\\n\')\n+    flist = os.listdir(newfpath) # we want to catch \'em all\n+    flist.sort()\n+    for f in flist:\n+        fname = os.path.split(f)[-1]\n+        size = getfSize(fname,newfpath)\n+        html.append(\'<tr><td><a href="%s">%s</a>%s</td></tr>\' % (fname,fname,size))\n+    html.append(\'</table>\')\n+    alogf.close()\n+    plogf.close()\n+    llog = open(alog,\'r\').readlines()\n+    plogfile = open(plog,\'r\').readlines()\n+    os.unlink(alog)\n+    os.unlink(plog)\n+    llog += plogfile # add lines from pruneld log\n+    lf = file(ahtmlf,\'w\') # galaxy will show this as the default view\n+    lf.write(galhtmlprefix % progname)\n+    s = \'\\n<div>Output from Rgenetics QC report tool run at %s<br>\\n\' % (timenow())\n+    lf.write(\'<h4>%s</h4>\\n\' % s)\n+    lf.write(\'</div><div><h4>(Click any preview image to download a full sized PDF version)</h4><br><ol>\\n\')\n+    lf.write(\'\\n\'.join(html))\n+    lf.write(\'<h4>QC run log contents</h4>\')\n+    lf.write(\'<pre>%s</pre>\' % (\'\'.join(llog))) # plink logs\n+    if len(cruft) > 0:\n+        lf.write(\'<h2>Blather from pdfnup follows:</h2><pre>%s</pre>\' % (\'\'.join(cruft))) # pdfnup\n+    lf.write(\'%s\\n<hr>\\n\' % galhtmlpostfix)\n+    lf.close()\n+\n'
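One fiddly detail in rgQC.py above: mogrify writes one jpg per pdf page, so the report builder
looks for basename.jpg and falls back to basename-0.jpg before giving up on a thumbnail. A
minimal sketch of that lookup (the paths are hypothetical):

    import os

    def find_thumbnail(newfpath, url):
        base = os.path.splitext(url)[0]  # strip the .pdf extension
        for candidate in ('%s.jpg' % base, '%s-0.jpg' % base):
            if os.path.exists(os.path.join(newfpath, candidate)):
                return candidate
        return None  # no thumbnail available - emit a plain text link instead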
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgQC.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgQC.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,110 @@
+<tool id="rgQC1" name="QC reports:">
+
+    <description>Marker and Subject measures</description>
+
+    <command interpreter="python">
+        rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$title"
+        -s '$html_file' -p '$html_file.files_path'
+    </command>
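+    <!-- Galaxy expands these template variables at run time: for example,
+         $input_file.extra_files_path/$input_file.metadata.base_name might expand to
+         something like ./database/files/dataset_66_files/tinywga (hypothetical paths),
+         and $html_file.files_path is the directory where rgQC.py writes the plots and
+         report files that accompany the html output. -->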
+
+    <inputs>
+          <param name="input_file" type="data" label="RGenetics genotype file in compressed Plink format"
+          size="80" format="pbed" />
+       <param name="title" size="80" type="text" value="RgQC report" label="Descriptive report title"/>
+   </inputs>
+
+   <outputs>
+       <data format="html" name="html_file" metadata_source="input_file" label="${title}.html"/>
+   </outputs>
+
+<tests>
+ <test>
+    <param name='input_file' value='tinywga' ftype='pbed' >
+    <metadata name='base_name' value='tinywga' />
+    <composite_data value='tinywga.bim' />
+    <composite_data value='tinywga.bed' />
+    <composite_data value='tinywga.fam' />
+    <edit_attributes type='name' value='tinywga' /> 
+    </param>
+    <param name='title' value='rgQCtest1' />
+    <output name='html_file' file='rgtestouts/rgQC/rgQCtest1.html' ftype='html' lines_diff='300'>
+    <param name="dbkey" value="hg18" />
+    <extra_files type="file" name='tinywga_All_Paged.pdf' value="rgtestouts/rgQC/tinywga_All_Paged.pdf" compare="sim_size" delta = "100000"/>
+    <extra_files type="file" name='tinywga.log' value="rgtestouts/rgQC/tinywga.log" compare="diff" lines_diff="15"/>
+    <extra_files type="file" name='tinywga.frq' value="rgtestouts/rgQC/tinywga.frq" compare="diff" />
+    <extra_files type="file" name='tinywga.het' value="rgtestouts/rgQC/tinywga.het" compare="diff" lines_diff="90"/>
+    <extra_files type="file" name='tinywga.hwe' value="rgtestouts/rgQC/tinywga.hwe" compare="diff" lines_diff="90"/>
+    <extra_files type="file" name='tinywga.imendel' value="rgtestouts/rgQC/tinywga.imendel" compare="diff"/>
+    <extra_files type="file" name='tinywga.imiss' value="rgtestouts/rgQC/tinywga.imiss" compare="diff" />
+    <extra_files type="file" name='tinywga.lmendel' value="rgtestouts/rgQC/tinywga.lmendel" compare="diff" />
+    <extra_files type="file" name='tinywga.lmiss' value="rgtestouts/rgQC/tinywga.lmiss" compare="diff" />
+    <extra_files type="file" name='tinywga_All_3x3.pdf' value="rgtestouts/rgQC/tinywga_All_3x3.pdf" compare="sim_size" delta="100000"/>
+    <extra_files type="file" name='ldp_tinywga.bed' value="rgtestouts/rgQC/ldp_tinywga.bed" compare="diff" lines_diff="10" />
+    <extra_files type="file" name='ldp_tinywga.bim' value="rgtestouts/rgQC/ldp_tinywga.bim" compare="sim_size" delta="1000" />
+    <extra_files type="file" name='ldp_tinywga.fam' value="rgtestouts/rgQC/ldp_tinywga.fam" compare="diff" />
+    <extra_files type="file" name='ldp_tinywga.log' value="rgtestouts/rgQC/ldp_tinywga.log" compare="diff" lines_diff="20"/>
+    <extra_files type="file" name='Ranked_Marker_HWE.xls' value="rgtestouts/rgQC/Ranked_Marker_HWE.xls" compare="diff" />
+    <extra_files type="file" name='Ranked_Marker_MAF.xls' value="rgtestouts/rgQC/Ranked_Marker_MAF.xls" compare="diff" />
+    <extra_files type="file" name='Ranked_Marker_Missing_Genotype.xls' value="rgtestouts/rgQC/Ranked_Marker_Missing_Genotype.xls" compare="diff" lines_diff="5"/>
+    <extra_files type="file" name='Ranked_Subject_Missing_Genotype.xls' value="rgtestouts/rgQC/Ranked_Subject_Missing_Genotype.xls" compare="diff" lines_diff="40"/>
+    <extra_files type="file" name='tinywga_fracmiss_cum.jpg' value="rgtestouts/rgQC/tinywga_fracmiss_cum.jpg" compare="sim_size" delta = "20000"/>     
+    <extra_files type="file" name='tinywga_fracmiss_cum.pdf' value="rgtestouts/rgQC/tinywga_fracmiss_cum.pdf" compare="sim_size" delta = "100000"/>     
+ </output>
+ </test>
+</tests>
+ <help>
+
+.. class:: infomark
+
+**Summary**
+
+This tool prepares a comprehensive series of reports for quality control checking of SNP genotypes from any
+genotyping experiment. It is designed for family based data, so it includes optional reports on Mendelian errors by
+subject and by marker.
+
+The outputs include histograms and boxplots for missingness, MAF, Mendel error counts and HWE by marker, plus the
+subset of these measures that makes sense by subject. The report is built as a single web page containing links to the summary marker and subject files.
+
+The F (inbreeding) statistic is calculated from a subset of genotypes pruned to be approximately independent of LD;
+the Plink setting used is --indep-pairwise 40 20 0.5 until we make it configurable.
+High heterozygosity might indicate a contaminated sample containing more than one DNA; low heterozygosity might
+indicate inbreeding, as in strains of mice.
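+
+As a minimal sketch of that LD pruning step (assuming a plink binary on the PATH and a
+hypothetical pbed basename "mydata"), the tool effectively runs something like::
+
+    import subprocess
+    # prune to an approximately LD-independent marker subset
+    subprocess.check_call(['plink', '--noweb', '--bfile', 'mydata',
+                           '--indep-pairwise', '40', '20', '0.5',
+                           '--out', 'ldp_mydata'])
+    # estimate heterozygosity and F from the pruned subset
+    subprocess.check_call(['plink', '--noweb', '--bfile', 'mydata',
+                           '--extract', 'ldp_mydata.prune.in',
+                           '--het', '--out', 'ldp_mydata'])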
+
+If the data file you want is missing from the option list above,
+you will first need to "import" it so it will be available here. Files available in the system library
+can be imported by selecting and completing the "Import ped/map" choice from the Get Data tool group at the top of the Galaxy
+menu. Your system administrator will be responsible for adding files to the system library.
+
+-----
+
+.. class:: infomark
+
+**Syntax**
+
+- **Genotype file** is the input genotype file in compressed Plink (pbed) format
+- **Prefix** is a string used to name all of the outputs
+
+-----
+
+**Attribution**
+
+This Galaxy tool was written by Ross Lazarus for the Rgenetics project.
+The current version uses Plink for most calculations and R for plotting - for full Plink attribution, source code and documentation,
+please see http://pngu.mgh.harvard.edu/~purcell/plink/ while R attribution and source code can be found at http://r-project.org
+
+Shaun Purcell provides the documentation you need specific to those settings, at
+http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#glm
+
+Tool and Galaxy datatypes originally designed and written for the Rgenetics
+series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com)
+Shaun Purcell created and maintains Plink, while a cast of many maintain R.
+
+Please acknowledge your use of this tool, Galaxy, R and Plink in your publications and let
+us know so we can keep track. These tools all rely on highly competitive grant funding,
+so letting us know about publications is important to our ongoing support.
+
+</help>
+
+
+
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgQQ.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgQQ.py Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,365 @@\n+"""\n+oct 2009 - multiple output files \n+Dear Matthias,\n+\n+Yes, you can define number of outputs dynamically in Galaxy. For doing\n+this, you\'ll have to declare one output dataset in your xml and pass\n+its ID ($out_file.id) to your python script. Also, set\n+force_history_refresh="True" in your tool tag in xml, like this:\n+<tool id="split1" name="Split" force_history_refresh="True">\n+In your script, if your outputs are named in the following format,\n+primary_associatedWithDatasetID_designation_visibility_extension\n+(_DBKEY), all your datasets will show up in the history pane.\n+associatedWithDatasetID is the $out_file.ID passed from xml,\n+designation will be a unique identifier for each output (set in your\n+script),\n+visibility can be set to visible if you want the dataset visible in\n+your history, or notvisible otherwise\n+extension is the required format for your dataset (bed, tabular, fasta\n+etc)\n+DBKEY is optional, and can be set if required (e.g. hg18, mm9 etc)\n+\n+One of our tools "MAF to Interval converter" (tools/maf/\n+maf_to_interval.xml) already uses this feature. You can use it as a\n+reference.\n+\n+qq.chisq Quantile-quantile plot for chi-squared tests\n+Description\n+This function plots ranked observed chi-squared test statistics against the corresponding expected\n+order statistics. It also estimates an inflation (or deflation) factor, lambda, by the ratio of the trimmed\n+means of observed and expected values. This is useful for inspecting the results of whole-genome\n+association studies for overdispersion due to population substructure and other sources of bias or\n+confounding.\n+Usage\n+qq.chisq(x, df=1, x.max, main="QQ plot",\n+sub=paste("Expected distribution: chi-squared (",df," df)", sep=""),\n+xlab="Expected", ylab="Observed",\n+conc=c(0.025, 0.975), overdisp=FALSE, trim=0.5,\n+slope.one=FALSE, slope.lambda=FALSE,\n+thin=c(0.25,50), oor.pch=24, col.shade="gray", ...)\n+Arguments\n+x A vector of observed chi-squared test values\n+df The degreees of freedom for the tests\n+x.max If present, truncate the observed value (Y) axis here\n+main The main heading\n+sub The subheading\n+xlab x-axis label (default "Expected")\n+ylab y-axis label (default "Observed")\n+conc Lower and upper probability bounds for concentration band for the plot. Set this\n+to NA to suppress this\n+overdisp If TRUE, an overdispersion factor, lambda, will be estimated and used in calculating\n+concentration band\n+trim Quantile point for trimmed mean calculations for estimation of lambda. Default\n+is to trim at the median\n+slope.one Is a line of slope one to be superimpsed?\n+slope.lambda Is a line of slope lambda to be superimposed?\n+thin A pair of numbers indicating how points will be thinned before plotting (see\n+Details). If NA, no thinning will be carried out\n+oor.pch Observed values greater than x.max are plotted at x.max. This argument sets\n+the plotting symbol to be used for out-of-range observations\n+col.shade The colour with which the concentration band will be filled\n+... Further graphical parameter settings to be passed to points()\n+\n+Details\n+To reduce plotting time and the size of plot files, the smallest observed and expected points are\n+thinned so that only a reduced number of (approximately equally spaced) points are plotted. 
The\n+precise behaviour is controlled by the parameter thin, whose value should be a pair of numbers.\n+The first number must lie between 0 and 1 and sets the proportion of the X axis over which thinning\n+is to be applied. The second number should be an integer and sets the maximum number of points\n+to be plotted in this section.\n+The "concentration band" for the plot is shown in grey. This region is defined by upper and lower\n+probability bounds for each order statistic. The default is to use the 2.5 Note that this is not a\n+simultaneous confidence region; the probability that the plot will stray outside the band at some\n+point exceeds 95\n+When required, he dispersion factor is estimated by the ratio of the observ'..b'(yvec),nrows)\n+    else:\n+        yvec = [x for x in dat] \n+        maint=\'QQ %s (n=%d)\' % (title,nrows)\n+        xvec = unifx\n+    if logscale:\n+        maint = \'Log%s\' % maint\n+        mx = [0,math.log10(nrows)] # if 1000, becomes 3 for the null line\n+        ylab = \'-log10(%s) Quantiles\' % title\n+        xlab = \'-log10(Uniform 0-1) Quantiles\'\n+        yvec = [-math.log10(x) for x in yvec if x > 0.0]\n+    else:\n+        mx = [0,1]\n+        ylab = \'%s Quantiles\' % title\n+        xlab = \'Uniform 0-1 Quantiles\'\n+\n+    xv = [\'%f\' % x for x in xvec]\n+    R.append(\'xvec = c(%s)\' % \',\'.join(xv))\n+    yv = [\'%f\' % x for x in yvec]\n+    R.append(\'yvec = c(%s)\' % \',\'.join(yv))\n+    R.append(\'mx = c(0,%f)\' % (math.log10(fn)))\n+    R.append(\'pdf("%s",h=%d,w=%d)\' % (fname,h,w))\n+    R.append("par(lab=c(10,10,10))")\n+    R.append(\'qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)\' % (xlab,ylab,maint,title,colour))\n+    R.append(\'points(mx,mx,type="l")\')\n+    R.append(\'grid(col="lightgray",lty="dotted")\')\n+    R.append(\'dev.off()\')\n+    RRun(rcmd=R,title=\'makeQQplot\',outdir=outdir)\n+\n+\n+\n+def main():\n+    u = """\n+    """\n+    u = """<command interpreter="python">\n+        rgQQ.py "$input1" "$name" $sample "$cols" $allqq $height $width $logtrans $allqq.id $__new_file_path__ \n+    </command>                                                                                                 \n+\n+    </command>\n+    """\n+    print >> sys.stdout,\'## rgQQ.py. 
cl=\',sys.argv\n+    npar = 11\n+    if len(sys.argv) < npar:\n+            print >> sys.stdout, \'## error - too few command line parameters - wanting %d\' % npar\n+            print >> sys.stdout, u\n+            sys.exit(1)\n+    in_fname = sys.argv[1]\n+    name = sys.argv[2]\n+    sample = float(sys.argv[3])\n+    head = None\n+    columns = [int(x) for x in sys.argv[4].strip().split(\',\')] # work with python columns!\n+    allout = sys.argv[5]\n+    height = int(sys.argv[6])\n+    width = int(sys.argv[7])\n+    logscale = (sys.argv[8].lower() == \'true\')\n+    outid = sys.argv[9] # this is used to allow multiple output files \n+    outdir = sys.argv[10]\n+    nan_row = False\n+    rows = []\n+    for i, line in enumerate( file( sys.argv[1] ) ):\n+        # Skip comments\n+        if  line.startswith( \'#\' ) or ( i == 0 ):\n+            if i == 0:\n+                 head = line.strip().split("\\t")\n+            continue\n+        if len(line.strip()) == 0:\n+            continue\n+        # Extract values and convert to floats\n+        fields = line.strip().split( "\\t" )\n+        row = []\n+        nan_row = False\n+        for column in columns:\n+            if len( fields ) <= column:\n+                return fail( "No column %d on line %d: %s" % ( column, i, fields ) )\n+            val = fields[column]\n+            if val.lower() == "na":\n+                nan_row = True\n+            else:\n+                try:\n+                    row.append( float( fields[column] ) )\n+                except ValueError:\n+                    return fail( "Value \'%s\' in column %d on line %d is not numeric" % ( fields[column], column+1, i ) )\n+        if not nan_row:\n+           rows.append( row )\n+    if i > 1:\n+       i = i-1 # remove header row from count\n+    if head == None:\n+       head = [\'Col%d\' % (x+1) for x in columns]\n+    R = []\n+    for c,column in enumerate(columns): # we appended each column in turn\n+        outname = allout\n+        if c > 0: # after first time\n+            outname = \'primary_%s_col%s_visible_pdf\' % (outid,column)\n+            outname = os.path.join(outdir,outname)\n+        dat = []\n+        nrows = len(rows) # sometimes lots of NA\'s!!\n+        for arow in rows:\n+           dat.append(arow[c]) # remember, we appended each col in turn!\n+        cname = head[column]        \n+        makeQQ(dat=dat,sample=sample,fname=outname,title=\'%s_%s\' % (name,cname),\n+                   xvar=cname,h=height,w=width,logscale=logscale,outdir=outdir)\n+\n+\n+\n+if __name__ == "__main__":\n+    main()\n'
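The docstring in rgQQ.py above relies on Galaxy's naming convention for returning a dynamic
number of output datasets. A minimal sketch of that scheme (the id and designation values
here are hypothetical):

    import os

    def extra_output_path(outdir, out_file_id, designation, ext, visible=True):
        # primary_associatedWithDatasetID_designation_visibility_extension
        vis = 'visible' if visible else 'notvisible'
        return os.path.join(outdir, 'primary_%s_%s_%s_%s' % (out_file_id, designation, vis, ext))

    # rgQQ.py builds names like primary_42_col3_visible_pdf for each extra column
    print(extra_output_path('/tmp/new_files', '42', 'col3', 'pdf'))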
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgQQ.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgQQ.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,99 @@
+<tool id="rgQQ1" name="QQ Plots:">
+    <code file="rgQQ_code.py"/>
+
+    <description>for p values from an analysis </description>
+
+    <command interpreter="python">
+        rgQQ.py "$input1" "$title" "$sample" "$cols" "$allqq" "$height" "$width" "$logtrans" "$allqq.id" "$__new_file_path__"
+    </command>
+
+    <inputs>
+       <page>
+       <param name="input1"  type="data" label="Choose the History dataset containing p values to QQ plot"
+          size="80" format="tabular" help="Dataset missing? See Tip below" />
+       <param name="title" type="text" size="80" label = "Descriptive title for QQ plot" value="QQ" />
+
+       <param name="logtrans" type="boolean" label = "Use a log scale - recommended for p values in range 0-1.0"
+          truevalue="true" falsevalue="false"/>
+       <param name="sample" type="float" label="Random sample fraction - set to 1.0 for all data points" value="0.01"
+        help="If you have a million values, the QQ plots will be huge - a random sample of 1% will be fine" />
+       <param name="height" type="integer" label="PDF image height (inches)" value="6" />
+       <param name="width" type="integer" label="PDF image width (inches)" value="6" />
+       </page>
+       <page>
+       <param name="cols" type="select" display="checkboxes" multiple="True"
+       help="Choose from these numeric columns in the data file to make a quantile-quantile plot against a uniform distribution"
+       label="Columns (p values 0-1 eg) to make QQ plots" dynamic_options="get_columns( input1 )" />
+       </page>
+   </inputs>
+
+   <outputs>
+       <data format="pdf" name="allqq" label="${title}.html"/>
+   </outputs>
+
+<tests>
+ <test>
+ <param name='input1' value='tinywga.pphe' />
+ <param name='title' value="rgQQtest1" />
+ <param name='logtrans' value="false" />
+ <param name='sample' value='1.0' />
+ <param name='height' value='8' />
+ <param name='width' value='10' />
+ <param name='cols' value='3' />
+ <output name='allqq' file='rgQQtest1.pdf' ftype='binary' compare="diff" lines_diff="29"/>
+ </test>
+</tests>
+
+<help>
+
+.. class:: infomark
+
+**Explanation**
+
+A quantile-quantile (QQ) plot is a good way to see systematic departures from the null expectation of uniform p-values
+from a genomic analysis. If the QQ plot shows departure from the null (ie a uniform 0-1 distribution), you hope that this will be
+in the very smallest p-values, suggesting that there might be some interesting results to look at. A log scale makes departures
+from the null at low p values much easier to see.
+
+-----
+
+.. class:: infomark
+
+**Syntax**
+
+This tool has 2 pages. On the first one you choose the data set and output options, then on the second page, the
+column names are shown so you can choose the one containing the p values you wish to plot.
+
+- **History data** is one of your history tabular data sets
+- **Descriptive Title** is the text to appear in the output file names to remind you what the plots are!
+- **Use a Log scale** is recommended for p values in the range 0-1 as it highlights departures from the null at small p values
+- **Random Sample Fraction** is the fraction of points to randomly sample - highly recommended for >5k or so values
+- **Height and Width** will determine the scale of the pdf images
+
+
+-----
+
+.. class:: infomark
+
+**Summary**
+
+Generate a uniform QQ plot for any large number of p values from an analysis.
+Essentially, this is a plot of the n ranked p values against their rank as a centile - ie rank/n.
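+
+A minimal sketch of that construction in plain Python (the p values are hypothetical)::
+
+    import math
+    pvals = sorted([0.9, 0.02, 0.5, 0.001, 0.3])
+    n = len(pvals)
+    expected = [(i + 0.5) / n for i in range(n)]  # uniform quantiles, ie rank/n
+    # for a log QQ plot, transform both axes
+    points = [(-math.log10(e), -math.log10(p)) for e, p in zip(expected, pvals)]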
+
+It works well where you have a column containing p values from
+a statistical test of some sort. These will be plotted against the values expected under the null. Departure
+from the diagonal suggests one distribution is more extreme than the other. You hope your p values are
+smaller than expected under the null.
+
+The sampling fraction will help cut down the size of the pdfs. If there are fewer than 5k points on any plot, all will be shown.
+Otherwise, the number of points plotted will be the sampled fraction of the total or 5k, whichever is larger.
+
+Note that the use of a log scale is ill-advised if you are plotting already log transformed p values, because the
+uniform distribution chosen for the qq plot is always 0-1 and the log transformation is applied internally if requested.
+The most useful plots for p values are log QQ plots of untransformed p values in the range 0-1.
+
+Originally designed and written for family based data from the CAMP Illumina run of 2007 by
+ross lazarus (ross.lazarus@gmail.com)
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgQQ_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgQQ_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,50 @@
+# before running the qc, need to rename various output files
+#       <data format="html" name="html_file" />
+#       <data format="txt" name="log_file" parent="html_file" />
+#       <data format="tabular" name="marker_file" parent="html_file" />
+#       <data format="tabular" name="subject_file" parent="html_file" />
+
+from galaxy import datatypes,model
+import sys,string
+
+def get_columns( input ):
+    columns = []
+    if input and input.metadata.columns:
+        ncols = input.metadata.columns
+        colnames = ['Col%d' % x for x in range(1,ncols+1)]
+        for i, line in enumerate( file( input.file_name ) ):
+            if not line or line.startswith( '#' ):
+                continue
+            elems = line.rstrip('\r\n').split( '\t' )
+            if i == 0: # assume the first line is a header row - take the column names from it
+                for col in range(len(elems)):
+                    colnames[col] = elems[col]
+                continue
+            # Since this tool requires users to select only those columns
+            # that contain numerical values, restrict the select list to
+            # the columns that parse as floats in the first data row.
+            for col in range(len(elems)): # zero offset
+                try:
+                    float(elems[col])
+                except ValueError:
+                    continue
+                columns.append((colnames[col],str(col),False))
+            if len(columns) > 0:
+                # We have our select list built, so we can break out of the loop
+                break
+            if i == 30:
+                break # Hopefully we never get here...
+    else:
+        columns = [('?','?',False),]
+    return columns
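For reference, get_columns above feeds the dynamic_options attribute of the cols select
parameter in rgQQ.xml. A standalone sketch of the same numeric-column check, runnable
outside Galaxy (the sample row is hypothetical):

    def numeric_columns(first_data_row):
        """Return zero based indexes of the fields that parse as floats."""
        cols = []
        for i, field in enumerate(first_data_row.rstrip('\r\n').split('\t')):
            try:
                float(field)
                cols.append(i)
            except ValueError:
                pass
        return cols

    print(numeric_columns('rs123\t0.05\t1.2'))  # -> [1, 2]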
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgRegion.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgRegion.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,80 @@
+"""
+released under the terms of the LGPL
+copyright ross lazarus August 2007 
+for the rgenetics project
+
+Special galaxy tool for the camp2007 data
+Allows grabbing arbitrary columns from an arbitrary region
+
+Needs a mongo results file in the location hardwired below or could be passed in as
+a library parameter - but this file must have a very specific structure
+rs chrom offset float1...floatn
+
+called as
+    <command interpreter="python">
+        rgRegion.py $infile '$cols' $r $tag $out_file1
+    </command>
+
+cols is a comma delimited list of chosen column numbers (1 based) for the subset
+r is a ucsc location region pasted into the tool
+
+"""
+
+
+import sys,string       
+
+trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation))
+print >> sys.stdout, '##rgRegion.py started'
+if len(sys.argv) <> 6: 
+  print >> sys.stdout, '##!expected 6 params in sys.argv, got %d - %s' % (len(sys.argv),sys.argv)
+  sys.exit(1)
+print '##got %d - %s' % (len(sys.argv),sys.argv)
+# quick and dirty for galaxy - we always get something for each parameter
+fname = sys.argv[1]
+wewant = sys.argv[2].split(',')
+region = sys.argv[3].lower()
+tag = sys.argv[4].translate(trantab)
+ofname = sys.argv[5] 
+myname = 'rgRegion'
+if len(wewant) == 0: # no columns selected?
+  print >> sys.stdout, '##!%s:  no columns selected - cannot run' % myname
+  sys.exit(1)
+try:
+  f = open(fname,'r')
+except: # bad input file name?
+  print >> sys.stdout, '##!%s unable to open file %s' % (myname, fname)
+  sys.exit(1)
+try: # TODO make a regexp?
+  c,rest = region.split(':')
+  c = c.replace('chr','') # leave although will break strict genome graphs  
+  rest = rest.replace(',','') # remove commas
+  spos,epos = rest.split('-')
+  spos = int(spos)
+  epos = int(epos)
+except:
+  print >> sys.stdout, '##!%s unable to parse region %s - MUST look like "chr8:10,000-100,000"' % (myname,region)
+  sys.exit(1)
+print >> sys.stdout, '##%s parsing chrom %s from %d to %d' % (myname, c,spos,epos)
+res = []
+cnames = f.next().strip().split() # column titles for output
+linelen = len(cnames)
+wewant = [int(x) - 1 for x in wewant] # need col numbers base 0
+for n,l in enumerate(f):
+  ll = l.strip().split()
+  thisc = ll[1]
+  thispos = int(ll[2])
+  if (thisc == c) and (thispos >= spos) and (thispos <= epos):
+     if len(ll) == linelen:
+        res.append([ll[x] for x in wewant]) # subset of columns!
+     else:
+        print >> sys.stdout, '##! looking for %d fields - found %d in ll=%s' % (linelen,len(ll),str(ll))
+o = file(ofname,'w')
+res = ['%s\n' % '\t'.join(x) for x in res] # turn into tab delim string
+print >> sys.stdout, '##%s selected and returning %d data rows' % (myname,len(res))
+head = [cnames[x] for x in wewant] # ah, list comprehensions - list of needed column names
+o.write('%s\n' % '\t'.join(head)) # header row for output
+o.write(''.join(res))
+o.close()
+f.close()    
+
+
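The parsing code above carries a "TODO make a regexp?" comment; a minimal sketch of a
regular expression that accepts the documented "chr8:10,000-100,000" form:

    import re

    REGION_RE = re.compile(r'^(?:chr)?(\w+):([\d,]+)-([\d,]+)$')
    m = REGION_RE.match('chr8:10,000-100,000'.lower())
    if m:
        c = m.group(1)                           # '8' - chr prefix stripped
        spos = int(m.group(2).replace(',', ''))  # 10000
        epos = int(m.group(3).replace(',', ''))  # 100000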
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgRegion.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgRegion.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,56 @@
+<tool id="rgRegion" name="Subset:">
+    <description>genotypes from genomic region</description>
+  
+    <command interpreter="python">
+        rgRegion.py $infile $r $title $out_file1
+    </command>
+    
+    <inputs>    
+       <page>
+       <param name="infile" type="data" format="lped" label="Linkage ped genotype file name from current history" size="80"/>
+       <param name="title" type="text" size="80" label="Title for output files" optional="true"
+        help="Descriptive title for new genotype/map files" value="RGRegion" />
+       <param name="r" type="text" label="Region" help="Cut and paste a UCSC browser region" 
+        size="80" value="chr9:119,506,000-122,518,000"/>
+       <param name="rslist" type="text" area="true" label="List of rs numbers" help="Type (or cut and paste) a space or newline separated list of rs numbers" 
+        size="5x20"/>
+       <param name="outformat" type="select" label="Output file format" dynamic_options="get_rgRegionOutFormats()" size="80"/> 
+
+       </page>
+
+
+   </inputs>
+
+   <outputs>  
+       <data format="lped" name="out_file1" label="${title}.lped" metadata_source="infile" />
+   </outputs>
+<help>
+
+.. class:: infomark
+
+**Syntax**
+
+- **Source** is the file you want to extract some columns from over a genomic region such as a gene or chromosome
+- **Tag** is the name to give the results file for this run 
+- **Region** is the genomic region cut and paste from a UCSC browser location window
+- **Genome Build** is the version of the genome your markers are from - use hg18 for CAMP illumina data
+
+-----
+
+**Summary**
+
+This tool is a very general purpose report builder. It can cut specific columns - eg powers,
+pvalues or regression results - from amalgamated analyses
+over a specified genomic region, given as a UCSC browser location (eg chr9:119,506,000-122,518,000).
+
+It takes a tab delimited file containing rs chrom offset float1..floatn and cuts out a region and
+a subset of the columns into a tabular file. If you make sure that RS is included, the
+result that appears in your history will have a direct link to UCSC genome graphs for viewing
+in full genomic context.
+
+ross lazarus (ross.lazarus@gmail.com)
+August 2007
+released under the LGPL. see documentation for license terms.
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgTDT.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgTDT.py Fri Mar 09 19:37:19 2012 -0500
b'@@ -0,0 +1,264 @@\n+#!/usr/local/bin/python\n+# hack to run and process a plink tdt\n+# expects args as\n+# bfilepath outname jobname outformat (wig,xls)\n+# ross lazarus\n+# for wig files, we need annotation so look for map file or complain\n+\n+"""\n+Parameters for wiggle track definition lines\n+All options are placed in a single line separated by spaces:\n+\n+  track type=wiggle_0 name=track_label description=center_label \\\n+        visibility=display_mode color=r,g,b altColor=r,g,b \\\n+        priority=priority autoScale=on|off \\\n+        gridDefault=on|off maxHeightPixels=max:default:min \\\n+        graphType=bar|points viewLimits=lower:upper \\\n+        yLineMark=real-value yLineOnOff=on|off \\\n+        windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16\n+"""\n+\n+import sys,math,shutil,subprocess,os,time,tempfile,shutil,string\n+from os.path import abspath\n+from optparse import OptionParser\n+from rgutils import timenow, plinke\n+myversion = \'v0.003 January 2010\'\n+verbose = False\n+\n+\n+\n+def makeGFF(resf=\'\',outfname=\'\',logf=None,twd=\'.\',name=\'track name\',description=\'track description\',topn=1000):\n+    """\n+    score must be scaled to 0-1000\n+\n+    Want to make some wig tracks from each analysis\n+    Best n -log10(p). Make top hit the window.\n+    we use our tab output which has\n+    rs\tchrom\toffset\tADD_stat\tADD_p\tADD_log10p\n+    rs3094315\t1\t792429\t1.151\t0.2528\t0.597223\n+\n+    """\n+\n+    def is_number(s):\n+        try:\n+            float(s)\n+            return True\n+        except ValueError:\n+            return False\n+    header = \'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\\n\' % (name,description)\n+    column_names = [ \'Seqname\', \'Source\', \'Feature\', \'Start\', \'End\', \'Score\', \'Strand\', \'Frame\', \'Group\' ]\n+    halfwidth=100\n+    resfpath = os.path.join(twd,resf)\n+    resf = open(resfpath,\'r\')\n+    resfl = resf.readlines() # dumb but convenient for millions of rows\n+    resfl = [x.split() for x in resfl]\n+    headl = resfl[0]\n+    resfl = resfl[1:]\n+    headl = [x.strip().upper() for x in headl]\n+    headIndex = dict(zip(headl,range(0,len(headl))))\n+    # s = \'rs\\tchrom\\toffset\\ta1\\ta2\\ttransmitted\\tuntransmitted\\tTDTChiSq\\tTDTp\\t-log10TDTp\\tAbsTDTOR\\tTDTOR\'\n+    chrpos = headIndex.get(\'CHROM\',None)\n+    rspos = headIndex.get(\'RS\',None)\n+    offspos = headIndex.get(\'OFFSET\',None)\n+    ppos = headIndex.get(\'-LOG10TDTP\',None)\n+    wewant = [chrpos,rspos,offspos,ppos]\n+    if None in wewant: # missing something\n+       logf.write(\'### Error missing a required header in makeGFF - headIndex=%s\\n\' % headIndex)\n+       return\n+    resfl = [x for x in resfl if x[ppos] > \'\']\n+    resfl = [(float(x[ppos]),x) for x in resfl] # decorate\n+    resfl.sort()\n+    resfl.reverse() # using -log10 so larger is better\n+    resfl = resfl[:topn] # truncate\n+    pvals = [x[0] for x in resfl] # need to scale\n+    resfl = [x[1] for x in resfl] # drop decoration\n+    maxp = max(pvals) # need to scale\n+    minp = min(pvals)\n+    prange = abs(maxp-minp) + 0.5 # fudge\n+    scalefact = 1000.0/prange\n+    logf.write(\'###maxp=%f,minp=%f,prange=%f,scalefact=%f\\n\' % (maxp,minp,prange,scalefact))\n+    for i,row in enumerate(resfl):\n+        row[ppos] = \'%d\' % (int(scalefact*pvals[i]))\n+        resfl[i] = row # replace\n+    outf = file(outfname,\'w\')\n+    outf.write(header)\n+    outres = [] # need to resort into chrom offset order\n+    
for i,lrow in enumerate(resfl):\n+        chrom,snp,offset,p, = [lrow[x] for x in wewant]\n+        gff = (\'chr%s\' % chrom,\'rgTDT\',\'variation\',\'%d\' % (int(offset)-halfwidth),\n+               \'%d\' % (int(offset)+halfwidth),p,\'.\',\'.\',\'%s logp=%1.2f\' % (snp,pvals[i]))\n+        outres.append(gff)\n+    outres = [(x[0],int(x[3]),x) for x in outres] # decorate\n+    outres.sort() # into chrom offset\n+    outres=[x[2] for x in outres] # undecorate\n+    outres = [\'\\t\'.join(x) for x in outres]\n+    outf.write(\'\\n\'.join(outres))\n+    outf.write(\'\\n\')\n+    outf.close()\n+\n+\n+\n+def xformTDT(infname=\'\',resf=\'\',outfname=\'\',name'..b'chrom,offset,a1,a2,t,u,orat,chisq,p = [ll[x] for x in wewant]\n+            if chisq == \'NA\' or p == \'NA\' or orat == \'NA\':\n+                continue # can\'t use these lines - gg gets unhappy\n+            snp = snp.strip()\n+            lp = \'0.0\'\n+            fp = \'1.0\'\n+            fakeorat = \'1.0\'\n+            if p.upper().strip() <> \'NA\':\n+                try:\n+                   fp = float(p)\n+                   if fp <> 0:\n+                       lp = \'%6f\' % -math.log10(fp)\n+                       fp = \'%6f\' % fp\n+                except:\n+                  pass\n+            else:\n+                p = \'1.0\'\n+            if orat.upper().strip() <> \'NA\':\n+                try:\n+                   fakeorat = orat\n+                   if float(orat) < 1.0:\n+                      fakeorat = \'%6f\' % (1.0/float(orat)) # invert so large values big\n+                except:\n+                   pass\n+            else:\n+                orat = \'1.0\'\n+            outl = \'\\t\'.join([snp,chrom,offset,a1,a2,t,u,chisq,p,lp,fakeorat,orat])\n+            res.append(outl)\n+    f = file(outfname,\'w\')\n+    res.append(\'\')\n+    f.write(\'\\n\'.join(res))\n+    f.close()\n+\n+\n+if __name__ == "__main__":\n+    """ called as\n+    <command interpreter="python">\n+        rgTDT.py -i \'$infile.extra_files_path/$infile.metadata.base_name\' -o \'$title\' -f \'$outformat\' -r \'$out_file1\' -l \'$logf\' -x \'${GALAXY_DATA_INDEX_DIR}/rg/bin/pl$\n+\n+    </command>\n+\n+    """\n+    u = """ called in xml as\n+        <command interpreter="python2.4">\n+        rgTDT.py -i $i -o $out_prefix -f $outformat -r $out_file1 -l $logf\n+    </command>\n+    """\n+    if len(sys.argv) < 6:\n+       s = \'## Error rgTDT.py needs 5 command line params - got %s \\n\' % (sys.argv)\n+       if verbose:\n+            print >> sys.stdout, s\n+       sys.exit(0)\n+    parser = OptionParser(usage=u, version="%prog 0.01")\n+    a = parser.add_option\n+    a("-i","--infile",dest="bfname")\n+    a("-o","--oprefix",dest="oprefix")\n+    a("-f","--formatOut",dest="outformat")\n+    a("-r","--results",dest="outfname")\n+    a("-l","--logfile",dest="logf")\n+    a("-d","--du",dest="uId")\n+    a("-e","--email",dest="uEmail")\n+    a("-g","--gff",dest="gffout",default="")\n+    (options,args) = parser.parse_args()\n+    killme = string.punctuation + string.whitespace\n+    trantab = string.maketrans(killme,\'_\'*len(killme))\n+    title = options.oprefix\n+    title = title.translate(trantab)\n+    map_file = \'%s.bim\' % (options.bfname) #\n+    me = sys.argv[0]\n+    alogf = options.logf # absolute paths\n+    od = os.path.split(alogf)[0]\n+    try:\n+      os.path.makedirs(od)\n+    except:\n+      pass\n+    aoutf = options.outfname # absolute paths\n+    od = os.path.split(aoutf)[0]\n+    try:\n+      os.path.makedirs(od)\n+  
  except:\n+      pass\n+    vcl = [plinke,\'--noweb\', \'--bfile\',options.bfname,\'--out\',title,\'--mind\',\'0.5\',\'--tdt\']\n+    logme = []\n+    if verbose:\n+        s = \'Rgenetics %s http://rgenetics.org Galaxy Tools rgTDT.py started %s\\n\' % (myversion,timenow())\n+        print >> sys.stdout,s\n+        logme.append(s)\n+        s =\'rgTDT.py: bfname=%s, logf=%s, argv = %s\\n\' % (options.bfname,alogf, sys.argv)\n+        print >> sys.stdout,s\n+        logme.append(s)\n+        s = \'rgTDT.py: vcl=%s\\n\' % (\' \'.join(vcl))\n+        print >> sys.stdout,s\n+        logme.append(s)\n+    twd = tempfile.mkdtemp(suffix=\'rgTDT\') # make sure plink doesn\'t spew log file into the root!\n+    tname = os.path.join(twd,title)\n+    p=subprocess.Popen(\' \'.join(vcl),shell=True,cwd=twd)\n+    retval = p.wait()\n+    shutil.copy(\'%s.log\' % tname,alogf)\n+    sto = file(alogf,\'a\')\n+    sto.write(\'\\n\'.join(logme))\n+    resf = \'%s.tdt\' % tname # plink output is here we hope\n+    xformTDT(options.bfname,resf,aoutf,title,map_file) # leaves the desired summary file\n+    gffout = options.gffout\n+    if gffout > \'\':\n+        makeGFF(resf=aoutf,outfname=gffout,logf=sto,twd=\'.\',name=\'rgTDT_Top_Table\',description=title,topn=1000)\n+    shutil.rmtree(twd)\n+    sto.close()\n+\n+\n+\n'
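The makeGFF routine in rgTDT.py above has to squeeze -log10(p) values into the 0-1000
integer score range that UCSC track lines use with useScore=1. A minimal sketch of that
scaling (the p values are hypothetical):

    import math

    logps = [-math.log10(p) for p in (0.9, 0.001, 0.04)]
    prange = abs(max(logps) - min(logps)) + 0.5     # the same 0.5 fudge the tool uses
    scalefact = 1000.0 / prange
    scores = [int(scalefact * lp) for lp in logps]  # integer scores for the GFF column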
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgTDT.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgTDT.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,108 @@
+<tool id="rgTDT1" name="Transmission Distortion:">
+    <description>for family data</description>
+
+    <command interpreter="python">
+        rgTDT.py -i '$i.extra_files_path/$i.metadata.base_name' -o '$title'
+        -r '$out_file1' -l '$logf'  -g '$gffout'
+    </command>
+
+    <inputs>
+       <param name="i"  type="data" label="Genotypes for analysis from your current history datasets"
+          size="132" format="pbed" />
+       <param name='title' type='text' value='rgTDT'  label="Title for the output to remind you what you did" size="80"/>
+   </inputs>
+
+   <outputs>
+       <data format="tabular" name="out_file1" label="${title}_rgTDT.xls"/>
+       <data format="gff" name="gffout" label="${title}_rgTDT.gff"/>
+       <data format="txt" name="logf" label="${title}_rgTDTlog.txt"/>
+   </outputs>
+
+<tests>
+ <test>
+ <param name='i' value='tinywga' ftype='pbed' >
+   <metadata name='base_name' value='tinywga' />
+   <composite_data value='tinywga.bim' />
+   <composite_data value='tinywga.bed' />
+   <composite_data value='tinywga.fam' />
+   <edit_attributes type='name' value='tinywga' /> 
+ </param>
+ <param name='title' value='rgTDTtest1' />
+ <output name='out_file1' file='rgTDTtest1_TDT.xls' ftype='tabular' compare="diff"/>
+ <output name='gffout' file='rgTDTtest1_TDT_topTable.gff' ftype='gff' compare="diff" />
+ <output name='logf' file='rgTDTtest1_TDT_log.txt' ftype='txt' lines_diff='79'/>
+ </test>
+</tests>
+
+
+<help>
+
+.. class:: infomark
+
+**Attribution**
+
+This tool relies on the work of many people. It uses Plink http://pngu.mgh.harvard.edu/~purcell/plink/ for
+analysis and R http://cran.r-project.org/ for graphics.
+
+This implementation is a Galaxy tool wrapper around these third party applications.
+It was originally designed and written for family based data from the CAMP Illumina run of 2007 by
+ross lazarus (ross.lazarus@gmail.com) and incorporated into the rgenetics toolkit.
+
+Rgenetics merely exposes them, wrapping Plink so you can use it in Galaxy.
+
+-----
+
+.. class:: infomark
+
+**Syntax**
+
+- **Genotype file** is the input family data chosen from available library compressed files
+- **Format** determines how your data will be returned to your Galaxy workspace - the gg format is strongly recommended
+
+-----
+
+.. class:: infomark
+
+**Summary**
+
+This tool will perform the standard transmission distortion analyses suitable for
+nuclear families and a simple binary "affected" phenotype.
+
+If you don't see the genotype data set you want here, it can be imported using one of the methods available from
+the Galaxy Get Data tool page.
+
+Outputs will include a GFF toptable with a link to view at UCSC if you want to see your
+results as a fully fledged UCSC track.
+
+Finally, if you can't live without
+spreadsheet data, choose the .xls tab delimited format. It's not a stupid binary Excel file - just a plain old tab delimited
+one with a header. Fortunately Excel is dumb enough to open these without much protest.
+
+
+----
+
+.. class:: infomark
+
+**Attribution**
+
+This Galaxy tool relies on Plink (see Plinksrc_) to test TDT models. 
+
+So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see
+DOC_
+
+Tool and Galaxy datatypes originally designed and written for the Rgenetics
+series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com)
+
+Copyright Ross Lazarus March 2007
+This Galaxy wrapper is released under the LGPL_ but is about as useful as a chocolate teapot without Plink, which is GPL.
+
+I'm no lawyer, but it looks like you got GPL if you use this software. Good luck.
+
+.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/ 
+
+.. _LGPL: http://www.gnu.org/copyleft/lesser.html
+
+.. _DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#tdt
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgWebLogo3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgWebLogo3.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,157 @@
+"""
+# modified june 2 ross lazarus to add units option at Assaf Gordon's suggestion
+# rgWebLogo3.py
+# wrapper to check that all fasta files are same length
+
+"""
+import optparse, os, sys, subprocess, tempfile
+
+WEBLOGO = 'weblogo' # executable name for weblogo3 - confusing isn't it?
+
+class WL3:
+    """
+    simple wrapper class to check fasta sequence lengths are all identical
+    """
+    FASTASTARTSYM = '>'
+    badseq = '## error - sequences in file %s are not all the same length - cannot proceed. Please read the tool documentation carefully'
+
+    def __init__(self,opts=None):
+        assert opts<>None,'WL3 class needs opts passed in - got None'
+        self.opts = opts
+        self.fastaf = file(self.opts.input,'r')
+        self.clparams = {}
+
+    def whereis(self,program):
+        for path in os.environ.get('PATH', '').split(':'):
+            if os.path.exists(os.path.join(path, program)) and not os.path.isdir(os.path.join(path, program)):
+                return os.path.join(path, program)
+        return None
+
+    def runCL(self):
+        """ construct and run a command line
+        """
+        wl = self.whereis(WEBLOGO)
+        if not wl:
+             print >> sys.stderr, '## rgWebLogo3.py error - cannot locate the weblogo binary %s on the current path' % WEBLOGO
+             print >> sys.stderr, '## Please ensure it is installed and working from http://code.google.com/p/weblogo'
+             sys.exit(1)
+        cll = [WEBLOGO,]
+        cll += [' '.join(it) for it in list(self.clparams.items())]
+        cl = ' '.join(cll)
+        assert cl > '', 'runCL needs a command line as clparams'
+        fd,templog = tempfile.mkstemp(suffix='rgtempRun.txt')
+        tlf = open(templog,'w')
+        process = subprocess.Popen(cl, shell=True, stderr=tlf, stdout=tlf)
+        rval = process.wait()
+        tlf.close()
+        tlogs = ''.join(open(templog,'r').readlines())
+        if len(tlogs) > 1:
+            s = '## executing %s returned status %d and log (stdout/stderr) records: \n%s\n' % (cl,rval,tlogs)
+        else:
+            s = '## executing %s returned status %d. Nothing appeared on stderr/stdout\n' % (cl,rval)
+        os.unlink(templog) # always
+        if rval <> 0:
+             print >> sys.stderr, '## rgWebLogo3.py error - executing %s returned error code %d' % (cl,rval)
+             print >> sys.stderr, '## This may be a data problem or a tool dependency (%s) installation problem' % WEBLOGO
+             print >> sys.stderr, '## Please ensure %s is correctly installed and working on the command line -see http://code.google.com/p/weblogo' % WEBLOGO
+             sys.exit(1)
+        return s
+
+        
+    def iter_fasta(self):
+        """
+        generator for fasta sequences from a file
+        """
+        aseq = []
+        seqname = None
+        for i,row in enumerate(self.fastaf):
+            if row.startswith(self.FASTASTARTSYM):
+                if seqname <> None: # already in a sequence
+                    s = ''.join(aseq)
+                    l = len(s)
+                    yield (seqname,l)
+                    seqname = row[1:].strip()
+                    aseq = []
+                else:
+                    if i > 0:
+                        print >> sys.stderr,'Invalid fasta file %s - does not start with %s - please read the tool documentation carefully' % (self.opts.input,self.FASTASTARTSYM)
+                        sys.exit(1)
+                    else:
+                        seqname = row[1:].strip() 
+            else: # sequence row
+                if seqname == None:
+                    print >> sys.stderr,'Invalid fasta file %s - does not start with %s - please read the tool documentation carefully' % (self.opts.input,self.FASTASTARTSYM)
+                    sys.exit(1) 
+                else:
+                    aseq.append(row.strip())
+                
+        if seqname <> None: # last one
+            l = len(''.join(aseq))
+            yield (seqname,l)
+                
+        
+    def fcheck(self):
+        """ are all fasta sequence same length?
+        might be mongo big
+        """
+        flen = None
+        lasti = None
+        f = self.iter_fasta()
+        for i,(seqname,seqlen) in enumerate(f):
+            lasti = i
+            if i == 0:
+                flen = seqlen
+            else:
+                if seqlen <> flen:
+                    print >> sys.stderr,self.badseq % self.opts.input
+                    sys.exit(1)
+        return '# weblogo input %s has %d sequences all of length %d' % (self.opts.input,lasti+1,flen)
+
+
+    def run(self):
+        check = self.fcheck()
+        self.clparams['-f'] = self.opts.input
+        self.clparams['-o'] = self.opts.output
+        self.clparams['-t'] = '"%s"' % self.opts.logoname # must be wrapped as a string       
+        self.clparams['-F'] = self.opts.outformat       
+        if self.opts.size <> None:
+            self.clparams['-s'] = self.opts.size
+        if self.opts.lower <> None:
+            self.clparams['-l'] = self.opts.lower
+        if self.opts.upper <> None:
+            self.clparams['-u'] = self.opts.upper        
+        if self.opts.colours <> None:
+            self.clparams['-c'] = self.opts.colours
+        if self.opts.units <> None:
+            self.clparams['-U'] = self.opts.units
+        s = self.runCL()
+        return check,s
+
+
+if __name__ == '__main__':
+    '''
+    called as
+<command interpreter="python"> 
+    rgWebLogo3.py --outformat $outformat -s $size -i $input -o $output -t "$logoname" -c "$colours"
+#if $range.mode == 'part'
+-l "$range.seqstart" -u "$range.seqend"
+#end if
+    </command>
+
+    '''
+    op = optparse.OptionParser()
+    op.add_option('-i', '--input', default=None)
+    op.add_option('-F', '--outformat', default='png')
+    op.add_option('-s', '--size', default=None) 
+    op.add_option('-o', '--output', default='rgWebLogo3')
+    op.add_option('-t', '--logoname', default='rgWebLogo3')
+    op.add_option('-c', '--colours', default=None)
+    op.add_option('-l', '--lower', default=None)
+    op.add_option('-u', '--upper', default=None)  
+    op.add_option('-U', '--units', default=None)  
+    opts, args = op.parse_args()
+    assert opts.input <> None,'weblogo3 needs a -i parameter with a fasta input file - cannot open'
+    assert os.path.isfile(opts.input),'weblogo3 needs a valid fasta input file - cannot open %s' % opts.input
+    w = WL3(opts)
+    checks,s = w.run()
+    print >> sys.stdout, checks # for info
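A minimal sketch of driving the rgWebLogo3.py wrapper from Python outside Galaxy, using
only the optparse flags defined above (the file names are hypothetical):

    import subprocess

    subprocess.check_call(['python', 'rgWebLogo3.py',
                           '-i', 'aligned.fasta',    # fasta input, all sequences equal length
                           '-o', 'logo.pdf',         # output image
                           '-F', 'pdf',              # output format
                           '-t', 'My sequence logo', # logo title
                           '-U', 'bits'])            # display units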
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgWebLogo3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgWebLogo3.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,137 @@
+<tool id="rgweblogo3" name="Sequence Logo" version="0.4">
+   <description>generator for fasta (eg Clustal alignments)</description>
+   <command interpreter="python"> 
+    rgWebLogo3.py -F $outformat -s $size -i $input -o $output -t "$logoname" -c "$colours" -U "$units"
+#if $range.mode == 'part'
+-l "$range.seqstart" -u "$range.seqend"
+#end if
+    </command>
+  <inputs>
+   <page>
+    <param format="fasta" name="input" type="data" label="Fasta File" />
+    <param name="logoname" label="Title for output Sequence Logo" type="text" size="50" value="Galaxy-Rgenetics Sequence Logo" />
+    <param name="outformat" type="select" label="Output format for image (or text report)" >
+      <option value="png">PNG screen quality</option>
+      <option value="png_print">High quality printable PNG</option>
+      <option value="pdf" selected="True">PDF</option>
+      <option value="jpeg">JPG</option>
+      <option value="eps">EPS</option>
+      <option value="txt">Text (shows the detailed calculations for each position - no image)</option>
+    </param>
+    <param name="units" type="select" label="Display Units"
+      help="What the height of each logo element depicts - eg bits of entropy (default)">
+      <option value="bits" selected="True">Entropy (bits)</option>
+      <option value="probability">Probability</option>
+      <option value="nats">Nats</option>
+      <option value="kT">kT</option>
+      <option value="kJ/mol">kJ/mol</option>
+      <option value="kcal/mol">kcal/mol</option>
+    </param>
+    <param name="colours" type="select" label="Colour scheme for output Sequence Logo" 
+      help="Note that some of these only make sense for protein sequences!">
+      <option value="auto" selected="True">Default automatic colour selection</option>
+      <option value="base pairing">Base pairing</option>
+      <option value="charge">Charge colours</option>
+      <option value="chemistry">Chemistry colours</option>
+      <option value="classic">Classical colours</option>
+      <option value="hydrophobicity">Hydrophobicity</option>
+      <option value="monochrome">monochrome</option>
+    </param>
+
+    
+    <conditional name="range">
+        <param name="mode" type="select" label="Include entire sequence (default) or specify a subsequence range to use">
+          <option value="complete" selected="true">complete sequence</option>
+          <option value="part">Only use a part of the sequence</option>
+        </param>
+        <when value="complete">
+        </when>
+        <when value="part">    
+           <param name="seqstart" size="5" type="integer" value="1" help="WARNING: Specifying indexes outside the sequence lengths will cause unpredictable but bad consequences!" 
+             label="Index (eg 1=first letter) of the start of the sequence range to include in the logo">
+           </param>
+           <param name="seqend" size="5" type="integer" value="99999" label="Index (eg 75=75th letter) of the end of the sequence range to include in the logo" >
+           </param> 
+        </when>
+    </conditional>
+    <param name="size" type="select" label="Output weblogo size" >
+      <option value="large" selected="True">Large</option>
+      <option value="medium">Medium</option>
+      <option value="small">Small</option>
+    </param>
+   </page>
+  </inputs>
+  <outputs>
+    <data format="pdf" name="output"  label="${logoname}_output.${outformat}">
+       <change_format>
+           <when input="outformat" value="png_print" format="png" />
+           <when input="outformat" value="png" format="png" />
+           <when input="outformat" value="jpeg" format="jpg" />
+           <when input="outformat" value="eps" format="eps" />
+           <when input="outformat" value="txt" format="txt" />
+       </change_format>
+    </data>
+  </outputs>
+  <tests>
+    <test>  
+      <param name="input" value="rgClustal_testout.fasta" />
+      <param name = "logoname" value="Galaxy/Rgenetics weblogo" />
+      <param name = "outformat" value="jpeg" />
+      <param name = "mode" value="complete" />
+      <param name = "size" value="medium" />      
+      <param name = "colours" value="auto" />
+      <param name = "units" value="bits" /> 
+      <output name="output" file="rgWebLogo3_test.jpg" ftype="jpg" compare="sim_size" delta="10000" />
+    </test>
+    <test>  
+      <param name="input" value="rgClustal_testout.fasta" />
+      <param name = "logoname" value="Galaxy/Rgenetics weblogo" />
+      <param name = "outformat" value="png" />
+      <param name = "mode" value="complete" />
+      <param name = "size" value="medium" />      
+      <param name = "colours" value="auto" />
+      <param name = "units" value="probability" /> 
+      <output name="output" file="rgWebLogo3_test2.png" ftype="png" compare="sim_size" delta="10000" />
+    </test>
+  </tests>
+  <help>
+
+**Note**
+
+This tool uses Weblogo3_ in Galaxy to generate a sequence logo. The input file must be a fasta file in your current history.
+
+It is recommended for (eg) viewing multiple sequence alignments output from the clustalw tool - set that tool's output format to
+fasta and feed it into this tool.
+
+A typical output looks like this:
+
+.. image:: ./static/images/rgWebLogo3_test.jpg
+
+----
+
+**Warning about input Fasta format files**
+
+The Weblogo3 program used by this tool will fail if your fasta sequences are not all EXACTLY the same length. The tool will provide a warning
+and refuse to call the weblogo3 executable if irregular-length sequences are detected.
+
+Fasta alignments from the companion ClustalW Galaxy tool will work, but many other fasta files may cause this tool to fail. Please do not file
+a Galaxy bug report: this is deliberate behaviour of the tool and a problem with your source data, not a tool error. Please make certain all your
+fasta sequences are exactly the same length!
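+
+If in doubt, a quick check along these lines (a minimal sketch, not part of this tool, assuming a plain fasta layout where each
+record starts with a '&gt;' header line) will tell you whether every sequence has the same length::
+
+  def fasta_lengths(fname):
+      """Yield (name, length) for each record in a fasta file."""
+      name, size = None, 0
+      for line in open(fname):
+          line = line.strip()
+          if line.startswith('>'):
+              if name is not None:
+                  yield name, size
+              name, size = line[1:], 0
+          else:
+              size += len(line)
+      if name is not None:
+          yield name, size
+
+  lengths = set(size for name, size in fasta_lengths('alignment.fasta'))
+  if len(lengths) > 1:
+      print('Unequal sequence lengths found: %s' % sorted(lengths))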
+
+----
+
+**Attribution**
+
+Weblogo attribution and associated documentation are available at Weblogo3_.
+
+This Galaxy wrapper was written by Ross Lazarus for the rgenetics project; the source code is licensed under the LGPL_, like other rgenetics artefacts.
+
+.. _Weblogo3: http://weblogo.berkeley.edu/
+
+.. _LGPL: http://www.gnu.org/copyleft/lesser.html
+
+  </help>
+
+</tool>
+
+
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgfakePed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgfakePed.py Fri Mar 09 19:37:19 2012 -0500
[
b'@@ -0,0 +1,537 @@\n+# modified may 2011 to name components (map/ped) as RgeneticsData to align with default base_name\r\n+# otherwise downstream tools fail\r\n+# modified march  2011 to remove post execution hook  \r\n+# pedigree data faker\r\n+# specifically designed for scalability testing of\r\n+# Shaun Purcel\'s PLINK package\r\n+# derived from John Ziniti\'s original suggestion\r\n+# allele frequency spectrum and random mating added\r\n+# ross lazarus me fecit january 13 2007\r\n+# copyright ross lazarus 2007\r\n+# without psyco\r\n+# generates about 10k snp genotypes in 2k subjects (666 trios) per minute or so.\r\n+# so 500k (a billion genotypes), at about 4 trios/min will a couple of hours to generate\r\n+# psyco makes it literally twice as quick!!\r\n+# all rights reserved except as granted under the terms of the LGPL\r\n+# see http://www.gnu.org/licenses/lgpl.html \r\n+# for a copy of the license you receive with this software\r\n+# and for your rights and obligations\r\n+# especially if you wish to modify or redistribute this code\r\n+# january 19 added random missingness inducer\r\n+# currently about 15M genos/minute without psyco, 30M/minute with\r\n+# so a billion genos should take about 40 minutes with psyco or 80 without...\r\n+# added mendel error generator jan 23 rml\r\n+\r\n+\r\n+import random,sys,time,os,string\r\n+\r\n+from optparse import OptionParser\r\n+\r\n+defbasename="RgeneticsData"    \r\n+width = 500000\r\n+ALLELES = [\'1\',\'2\',\'3\',\'4\']\r\n+prog = os.path.split(sys.argv[0])[-1]\r\n+debug = 0\r\n+\r\n+"""Natural-order sorting, supporting embedded numbers.\r\n+# found at http://lists.canonical.org/pipermail/kragen-hacks/2005-October/000419.html\r\n+note test code there removed to conserve brain space\r\n+foo9bar2 < foo10bar2 < foo10bar10\r\n+\r\n+"""\r\n+import random, re, sys\r\n+\r\n+def natsort_key(item): \r\n+    chunks = re.split(\'(\\d+(?:\\.\\d+)?)\', item)\r\n+    for ii in range(len(chunks)):\r\n+        if chunks[ii] and chunks[ii][0] in \'0123456789\':\r\n+            if \'.\' in chunks[ii]: numtype = float\r\n+            else: numtype = int\r\n+            # wrap in tuple with \'0\' to explicitly specify numbers come first\r\n+            chunks[ii] = (0, numtype(chunks[ii]))\r\n+        else:\r\n+            chunks[ii] = (1, chunks[ii])\r\n+    return (chunks, item)\r\n+\r\n+def natsort(seq):\r\n+    "Sort a sequence of text strings in a reasonable order."\r\n+    alist = [item for item in seq]\r\n+    alist.sort(key=natsort_key)\r\n+    return alist\r\n+\r\n+\r\n+def makeUniformMAFdist(low=0.02, high=0.5):\r\n+    """Fake a non-uniform maf distribution to make the data\r\n+    more interesting. 
Provide uniform 0.02-0.5 distribution"""\r\n+    MAFdistribution = []\r\n+    for i in xrange(int(100*low),int(100*high)+1):\r\n+       freq = i/100.0 # uniform\r\n+       MAFdistribution.append(freq)\r\n+    return MAFdistribution\r\n+\r\n+def makeTriangularMAFdist(low=0.02, high=0.5, beta=5):\r\n+    """Fake a non-uniform maf distribution to make the data\r\n+    more interesting - more rare alleles """\r\n+    MAFdistribution = []\r\n+    for i in xrange(int(100*low),int(100*high)+1):\r\n+       freq = (51 - i)/100.0 # large numbers of small allele freqs\r\n+       for j in range(beta*i): # or i*i for crude exponential distribution \r\n+            MAFdistribution.append(freq)\r\n+    return MAFdistribution\r\n+\r\n+def makeFbathead(rslist=[], chromlist=[], poslist=[], width=100000):\r\n+    """header row\r\n+    """\r\n+    res = [\'%s_%s_%s\' % (chromlist[x], poslist[x], rslist[x]) for x in range(len(rslist))]\r\n+    return \' \'.join(res)\r\n+\r\n+def makeMap( width=500000, MAFdistribution=[], useGP=False):\r\n+    """make snp allele and frequency tables for consistent generation"""\r\n+    usegp = 1\r\n+    snpdb = \'snp126\'\r\n+    hgdb = \'hg18\'\r\n+    alleles = []\r\n+    freqs = []\r\n+    rslist = []\r\n+    chromlist = []\r\n+    poslist = []\r\n+    for snp in range(width):\r\n+        random.shuffle(ALLELES)\r\n+        alleles.append(ALLELES[0:2]) # need two DIFFERENT alleles!\r\n+        freqs.append(random.choice(MAFdistribution)) # more rare alleles\r\n+    if useGP:\r\n+        try:\r\n+          '..b'+    outf.write(\'<br><h3>This is simulated null genotype data generated by Rgenetics!</h3>\')\r\n+    outf.write(\'%s called with command line:<br><pre>\' % prog)\r\n+    outf.write(\' \'.join(sys.argv))\r\n+    outf.write(\'\\n</pre>\\n\')\r\n+    outf.write("</div></body></html>")\r\n+    outf.close()\r\n+\r\n+\r\n+\r\n+if __name__ == "__main__":\r\n+    """\r\n+    """\r\n+    parser = OptionParser(usage=u, version="%prog 0.01")\r\n+    a = parser.add_option\r\n+    a("-n","--nsubjects",type="int",dest="Ntot",\r\n+      help="nsubj: total number of subjects",default=2000)\r\n+    a("-t","--title",dest="title",\r\n+      help="title: file basename for outputs",default=\'fakeped\')\r\n+    a("-c","--cases",type="int",dest="Naff",\r\n+      help="number of cases: independent subjects with status set to 2 (ie cases). 
If not set, NTOT/3 trios will be generated", default = 0)\r\n+    a("-s","--snps",dest="width",type="int",\r\n+      help="snps: total number of snps per subject", default=1000)\r\n+    a("-d","--distribution",dest="MAFdist",default="Uniform",\r\n+      help="MAF distribution - default is Uniform, can be Triangular")\r\n+    a("-o","--outf",dest="outf",\r\n+      help="Output file", default = \'fakeped\')\r\n+    a("-p","--outpath",dest="outpath",\r\n+      help="Path for output files", default = \'./\')\r\n+    a("-l","--pLink",dest="outstyle", default=\'L\',\r\n+      help="Ped files as for Plink - no header, separate Map file - default is Plink style")\r\n+    a("-w","--loWmaf", type="float", dest="lowmaf", default=0.01, help="Lower limit for SNP MAF (minor allele freq)")\r\n+    a("-m","--missing",dest="missrate",type="float",\r\n+      help="missing: probability of missing MCAR - default 0.0", default=0.0)\r\n+    a("-v","--valmiss",dest="missval",\r\n+      help="missing character: Missing allele code - usually 0 or N - default 0", default="0")\r\n+    a("-M","--Mendelrate",dest="mendrate",type="float",\r\n+      help="Mendelian error rate: probability of a mendel error per trio, default=0.0", default=0.0)   \r\n+    a("-H","--noHGRS",dest="useHG",type="int",\r\n+      help="Use local copy of UCSC snp126 database to generate real rs numbers", default=True)\r\n+    (options,args) = parser.parse_args()\r\n+    low = options.lowmaf\r\n+    try:\r\n+        os.makedirs(options.outpath)\r\n+    except:\r\n+        pass\r\n+    if options.MAFdist.upper() == \'U\':\r\n+        mafDist = makeUniformMAFdist(low=low, high=0.5)\r\n+    else:\r\n+        mafDist = makeTriangularMAFdist(low=low, high=0.5, beta=5)\r\n+    alleles,freqs, rslist, chromlist, poslist = makeMap(width=int(options.width),\r\n+                                        MAFdistribution=mafDist, useGP=False)\r\n+    fbathead = []\r\n+    s = string.whitespace+string.punctuation\r\n+    trantab = string.maketrans(s,\'_\'*len(s))\r\n+    title = string.translate(options.title,trantab)\r\n+    \r\n+    if options.outstyle == \'F\':\r\n+        fbatstyle = True\r\n+        fbathead = makeFbathead(rslist=rslist, chromlist=chromlist, poslist=poslist, width=options.width)\r\n+    else:\r\n+        fbatstyle = False\r\n+        writeMap(fprefix=defbasename, rslist=rslist, fpath=options.outpath,\r\n+                 chromlist=chromlist, poslist=poslist, width=options.width)\r\n+    if options.Naff > 0: # make case control data\r\n+        makeIndep(fprefix = defbasename, fpath=options.outpath,\r\n+                  width=options.width, Nunaff=options.Ntot-options.Naff,\r\n+                  Naff=options.Naff, MAFdistribution=mafDist,alleles=alleles, freqs=freqs,\r\n+                  fbatstyle=fbatstyle, missrate=options.missrate, missval=options.missval,\r\n+                  fbathead=fbathead)\r\n+    else:\r\n+        makePed(fprefix=defbasename, fpath=options.fpath,\r\n+            width=options.width, MAFdistribution=mafDist, nsubj=options.Ntot,\r\n+            alleles=alleles, freqs=freqs, fbatstyle=fbatstyle, missrate=options.missrate,\r\n+            mendrate=options.mendrate, missval=options.missval,\r\n+                  fbathead=fbathead)\r\n+    doImport(outfile=options.outf,outpath=options.outpath)\r\n+\r\n+\r\n+        \r\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgfakePed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgfakePed.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,112 @@
+<tool id="rgfakePed1" name="Null genotypes" version="0.02">
+  <description>for testing</description>
+  <command interpreter="python">rgfakePed.py --title '$title'
+  -o '$out_file1' -p '$out_file1.files_path' -c '$ncases' -n '$ntotal'
+  -s '$nsnp'  -w '$lowmaf' -v '$missingValue' -l '$outFormat'
+  -d '$mafdist' -m '$missingRate' -M '$mendelRate' </command>
+   <inputs>
+
+    <param name="title"
+         type="text" value="Fake_test_geno_data"
+         help="Name for outputs from this job"
+         label="Descriptive short name"/>
+    <param name="ntotal"
+         type="integer" value = "200"
+         help="N total: total number of subjects"
+         label="Create this total N subjects"/>
+    <param name="ncases" type="integer"
+         value="100"
+         help = "N cases: Independent subjects with status set to 2. Set 0 for family data (NSubj/3 trios)"
+         label="Total N Cases (0=generate family data - trios)"/>
+    <param name="nsnp"
+         type="integer" value="1000"
+         help="nsnp: total number of markers"
+         label="Total N SNP"/>
+    <param name="lowmaf" type="float"
+         value="0.01"
+         help = "Lower limit for MAF distribution"
+         label="Lower MAF limit (default=1%)"/>
+    <param name="mafdist"
+         type="select"
+         help="Choose a MAF distribution"
+         label="SNP Minor Allele Frequency distribution">
+           <option value="U" selected="true">Uniform</option>
+           <option value="T">Triangular (more low frequency SNPs)</option>
+    </param>
+    <param name="outFormat"
+         type="select"
+         help="Choose an output format"
+         label="Output format file type - linkage ped or fbat ped">
+           <option value="L" selected="true">Linkage format - separate .map file</option>
+           <option value="F">fbat style - marker names in a header row</option>
+    </param>
+    <param name="missingRate" type="float"
+         value="0.05"
+         help = "Fraction of genotypes to be randomly set missing"
+         label="Missing genotype call fraction"/>
+    <param name="mendelRate"
+         type="float" value = "0.05"
+         help="(family data) Fraction of apparently non-Mendelian transmission patterns"
+         label="Mendel error transmission rate"/>
+
+    <param name="missingValue" type="text" size="1"
+         value='0'
+         help = "Missing allele value"
+         label="Missing value for an allele for the output ped file"/>
+
+</inputs>
+
+ <outputs>
+    <data format="lped" name="out_file1" label="${title}.lped"/>
+  </outputs>
+<tests>
+ <test>
+    <param name='title' value='rgfakePedtest1' />
+    <param name="ntotal" value="40" />
+    <param name="ncases" value="20" />
+    <param name="nsnp" value="10" />
+    <param name="lowmaf" value="0" />
+    <param name="mafdist" value="T" />
+    <param name="outFormat" value="L" />
+    <param name="missingRate" value="0" />
+    <param name="mendelRate" value="0" />
+    <param name="missingValue" value="0" />
+    <output name='out_file1' file='rgtestouts/rgfakePed/rgfakePedtest1.lped' ftype='lped' compare="diff" lines_diff='5'>
+    <extra_files type="file" name='RgeneticsData.ped' value="rgtestouts/rgfakePed/rgfakePedtest1.ped" compare="diff" lines_diff='80'/>
+    <extra_files type="file" name='RgeneticsData.map' value="rgtestouts/rgfakePed/rgfakePedtest1.map" compare="diff" />
+    </output>
+ </test>
+</tests>
+<help>
+.. class:: infomark
+
+This tool allows you to generate an arbitrary (sort of)
+synthetic genotype file (no attempt at LD - the markers are independent)
+with optional missingness, Mendel errors, minor allele frequency settings and family structure.
+These might be used for testing under
+the null hypothesis of no association, and are certainly useful for
+scale testing.
+
+Note that although it runs reasonably fast given it's a script, generating a large data set takes
+a while. An hour or so should get you a reasonably sized (3GB) simulated null data set.
+
+A better simulator could easily be swapped in behind this tool's interface.
+
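+To give a feel for the underlying model: each marker is generated independently, with each allele drawn at that
+marker's minor allele frequency under random mating (Hardy-Weinberg equilibrium). A minimal sketch of the
+per-genotype draw (names are illustrative, not the tool's actual code)::
+
+  import random
+
+  def sample_genotype(maf, alleles=('1', '2')):
+      """Draw one two-allele genotype under Hardy-Weinberg equilibrium."""
+      a1 = alleles[0] if random.random() < maf else alleles[1]
+      a2 = alleles[0] if random.random() < maf else alleles[1]
+      return a1, a2
+
+  # A rare SNP (MAF 5%) yields mostly '2 2' homozygotes:
+  print(sample_genotype(0.05))
+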
+-----
+
+.. class:: warningmark
+
+This tool is very experimental
+
+.. class:: infomark
+
+**Attribution and Licensing**
+
+Designed and written for the Rgenetics Galaxy tools.
+Copyright Ross Lazarus 2007 (ross.lazarus@gmail.com).
+Licensed under the terms of the LGPL_.
+
+.. _LGPL: http://www.gnu.org/copyleft/lesser.html
+
+</help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgfakePhe.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgfakePhe.py Fri Mar 09 19:37:19 2012 -0500
[
b'@@ -0,0 +1,469 @@\n+"""\r\n+fakephe.py\r\n+ross lazarus sept 30 2007\r\n+This is available under the LGPL as defined then.\r\n+\r\n+use the pedigree data for ids\r\n+\r\n+use pythons generators to literally generate a bunch of random phenotype measurements\r\n+\r\n+Plink format at http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#pheno\r\n+is\r\n+\r\n+To specify an alternate phenotype for analysis, i.e. other than the one in the *.ped file (or, if using a binary fileset, the\r\n+*.fam file), use the --pheno option:\r\n+plink --file mydata --pheno pheno.txt\r\n+\r\n+where pheno.txt is a file that contains 3 columns (one row per individual):\r\n+\r\n+     Family ID\r\n+     Individual ID\r\n+     Phenotype\r\n+\r\n+NOTE The original PED file must still contain a phenotype in column 6 (even if this is a dummy phenotype, e.g. all missing),\r\n+unless the --no-pheno flag is given.\r\n+\r\n+If an individual is in the original file but not listed in the alternate phenotype file, that person\'s phenotype will be set to\r\n+missing. If a person is in the alternate phenotype file but not in the original file, that entry will be ignored. The order of\r\n+the alternate phenotype file need not be the same as for the original file.\r\n+\r\n+If the phenotype file contains more than one phenotype, then use the --mpheno N option to specify the Nth phenotype is the one\r\n+to be used:\r\n+plink --file mydata --pheno pheno2.txt --mpheno 4\r\n+\r\n+where pheno2.txt contains 5 different phenotypes (i.e. 7 columns in total), this command will use the 4th for analysis\r\n+(phenotype D):\r\n+\r\n+     Family ID\r\n+     Individual ID\r\n+     Phenotype A\r\n+     Phenotype B\r\n+     Phenotype C\r\n+     Phenotype D\r\n+     Phenotype E\r\n+\r\n+Alternatively, your alternate phenotype file can have a header row, in which case you can use variable names to specify which\r\n+phenotype to use. If you have a header row, the first two variables must be labelled FID and IID. All subsequent variable names\r\n+cannot have any whitespace in them. For example,\r\n+\r\n+     FID    IID      qt1   bmi    site\r\n+     F1     1110     2.3   22.22  2\r\n+     F2     2202     34.12 18.23  1\r\n+     ...\r\n+\r\n+then\r\n+plink --file mydata --pheno pheno2.txt --pheno-name bmi --assoc\r\n+\r\n+will select the second phenotype labelled "bmi", for analysis\r\n+\r\n+Finally, if there is more than one phenotype, then for basic association tests, it is possible to specify that all phenotypes\r\n+be tested, sequentially, with the output sent to different files: e.g. if bigpheno.raw contains 10,000 phenotypes, then\r\n+plink --bfile mydata --assoc --pheno bigpheno.raw --all-pheno\r\n+\r\n+will loop over all of these, one at a time testing for association with SNP, generating a lot of output. You might want to use\r\n+the --pfilter command in this case, to only report results with a p-value less than a certain value, e.g. --pfilter 1e-3.\r\n+\r\n+WARNING Currently, all phenotypes must be numerically coded, including missing values, in the alternate phenotype file. The\r\n+default missing value is -9, change this with --missing-phenotype, but it must be a numeric value still (in contrast to the\r\n+main phenotype in the PED/FAM file. 
This issue will be fixed in future releases.\r\n+Covariate files\r\n+\r\n+===========================\r\n+rgfakePhe.xml\r\n+<tool id="fakePhe1" name="Fake phenos">\r\n+  <description>for multiple null fake phenotype</description>\r\n+  <command interpreter="python2.4">rgfakePhe.py $input1 \'$title1\' $out_file1 $log_file1 $script_file</command>\r\n+   <inputs>\r\n+    <page>\r\n+    <param name="input1"\r\n+         type="library" format="lped"\r\n+         label="Pedigree from Dataset"/>\r\n+    <param name="title1" type="text"\r\n+         value="My fake phenos" size="60"\r\n+         label="Title for outputs"/>\r\n+    </page>\r\n+    <page>\r\n+    <repeat name="fakePhe" title="Phenotypes to Fake">\r\n+        <param name="pName" type="text" label="Phenotype Name">\r\n+        </param>\r\n+      <conditional name="series">\r\n+        <param name="phetype" type="select" label="Phenotype Type">\r\n+          <option v'..b')))\r\n+    outf.write("</div></body></html>\\n")\r\n+    outf.close()\r\n+\r\n+\r\n+def test():\r\n+    """test case\r\n+    need to get these out of a galaxy form - series of pages - get types\r\n+    on first screen, names on second, params on third?\r\n+    holy shit. this actually works I think\r\n+    """\r\n+    pT = [\'rnorm\',\'rnorm\',\'rnorm\',\'rnorm\',\'cat\',\'unif\']\r\n+    pN = [\'SysBP\',\'DiaBP\',\'HtCM\',\'WtKG\',\'Race\',\'age\']\r\n+    pP = [{\'Mean\':\'120\',\'SD\':\'10\'},{\'Mean\':\'90\',\'SD\':\'15\'},{\'Mean\':\'160\',\'SD\':\'20\'},{\'Mean\':\'60\',\'SD\':\'20\'}, \\\r\n+          {\'values\':\'Blink,What,Yours,green\'},{\'low\':16,\'hi\':99}]\r\n+    phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP)\r\n+    ids = []\r\n+    for i in range(10):\r\n+        ids.append([\'fid%d\' % i,\'iid%d\' % i])\r\n+    pheres = makePhe(phes=phes,ids=ids)\r\n+    res = [\'\'.join(x) for x in pheres]\r\n+    print \'\\n\'.join(res)\r\n+\r\n+\r\n+\r\n+if __name__ == "__main__":\r\n+    """\r\n+   <command interpreter="python">rgfakePhe.py \'$infile1.extra_files_path/$infile1.metadata.base_name\'\r\n+   "$title1" \'$ppheout\' \'$ppheout.files_path\' \'$script_file \'\r\n+   </command>\r\n+    The xml file for this tool is complex, and allows any arbitrary number of\r\n+    phenotype columns to be specified from a couple of optional types - rnorm, cat\r\n+    are working now.\r\n+\r\n+    Note that we create new files in their respective library directories and link to them in the output file\r\n+    so they can be displayed and downloaded separately\r\n+\r\n+    """\r\n+    killme = string.punctuation + string.whitespace\r\n+    trantab = string.maketrans(killme,\'_\'*len(killme))\r\n+    progname = os.path.basename(sys.argv[0])\r\n+    cl = \'## at %s, %s got cl= %s\' % (timenow(),progname,\' \'.join(sys.argv))\r\n+    print >> sys.stdout,cl\r\n+    if len(sys.argv) < 5:\r\n+        test()\r\n+    else:\r\n+        inped = sys.argv[1]\r\n+        title = sys.argv[2].translate(trantab)\r\n+        ppheout = sys.argv[3]\r\n+        pphe_path = sys.argv[4]\r\n+        scriptfile = sys.argv[5]\r\n+        ind = file(scriptfile,\'r\').readlines()\r\n+        mylog = []\r\n+        s = \'## %s starting at %s<br/>\\n\' % (progname,timenow())\r\n+        mylog.append(s)\r\n+        mylog.append(cl)\r\n+        s = \'## params = %s<br/>\\n\' % (\' \'.join(sys.argv[1:]))\r\n+        mylog.append(s)\r\n+        s = \'\\n\'.join(ind)\r\n+        mylog.append(\'Script file %s contained %s<br/>\\n\' % (scriptfile,s))\r\n+        pT = []\r\n+        pN = []\r\n+     
   pP = []\r\n+        for l in ind:\r\n+            l = l.strip()\r\n+            if len(l) > 1:\r\n+                adict = eval(l)\r\n+                pT.append(adict.get(\'pT\',None))\r\n+                pN.append(adict.get(\'pN\',None))\r\n+                pP.append(eval(adict.get(\'pP\',None)))\r\n+        s = \'## pt,pn,pp=%s,%s,%s<br/>\\n\' % (str(pT),str(pN),str(pP))\r\n+        mylog.append(s)\r\n+        phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP)\r\n+        ids = getIds(indir=inped) # for labelling rows\r\n+        pheres = makePhe(phes=phes,ids=ids) # random values from given distributions\r\n+        try:\r\n+            os.makedirs(pphe_path)\r\n+        except:\r\n+            pass\r\n+        outname = os.path.join(pphe_path,title)\r\n+        pphefname = \'%s.pphe\' % outname\r\n+        f = file(pphefname, \'w\')\r\n+        f.write(\'\\n\'.join(pheres))\r\n+        f.write(\'\\n\')\r\n+        f.close()\r\n+        if doFbatphe:\r\n+            try:\r\n+                os.makedirs(fphe_path)\r\n+            except:\r\n+                pass\r\n+            outname = os.path.join(fphe_path,title)\r\n+            fphefname = \'%s.phe\' % outname\r\n+            f = file(fphefname, \'w\')\r\n+            header = pheres[0].split()\r\n+            pheres[0] = \' \'.join(header[2:])# remove iid fid from header for fbat\r\n+            f.write(\'\\n\'.join(pheres))\r\n+            f.close()\r\n+            doImport(outfile=fpheout,flist=[fphefname,],expl=\'(FBAT phenotype format)\',mylog=mylog)\r\n+        #doImport(outfile=\'test\',flist=[],expl=\'\',mylog=[]):\r\n+        doImport(outfile=ppheout,flist=[pphefname,],expl=\'(Plink phenotype format)\',mylog=mylog)\r\n+\r\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgfakePhe.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgfakePhe.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,131 @@
+<tool id="fakePhe1" name="Null phenotypes">
+  <description>for testing</description>
+   <command interpreter="python">rgfakePhe.py '$infile1.extra_files_path/$infile1.metadata.base_name'
+   "$title1" '$ppheout' '$ppheout.files_path' '$script_file'
+   </command>
+   <inputs>
+    <page>
+    <param name="infile1"
+         type="data" format="pbed,lped"
+         label="Pedigree from Dataset" />
+        <param name="title1" type="text"
+         value="My null phenos" size="60"
+         label="Title for outputs"/>
+        <param name="dbkey" type="hidden" value='hg18' />
+    </page>
+    <page>
+    <repeat name="fakePhe" title="Phenotypes to simulate under the Null">
+        <param name="pName" type="text" label="Phenotype Name">
+        </param>
+      <conditional name="series">
+        <param name="phetype" type="select" label="Phenotype Distribution">
+          <option value="rnorm" selected="true">Random Normal variate</option>
+          <option value="unif">Random Uniform variate</option>
+          <option value="rgamma">Random Gamma variate</option>
+          <option value="weibull">Random Weibull variate</option>
+          <option value="exponential">Random exponential variate</option>
+          <option value="poisson">Random Poisson variate</option>
+          <option value="cat">Random categorical choice</option>
+        </param>
+        <when value="poisson">
+          <param name="lamb" type="integer" value="2" label="Lambda (mean and variance)" />
+        </when>
+        <when value="rnorm">
+          <param name="Mean" type="float" value="0.0" label="Mean" />
+          <param name="SD" type="float" label="SD" value="1.0"/>
+        </when>
+        <when value="exponential">
+          <param name="Mean" type="float" value="1.0" label="Mean" help="lambda for the exponential will be 1.0/Mean" />=
+        </when>
+        <when value="rgamma">
+          <param name="Alpha" type="float" value="10" label="Alpha">
+          </param>
+          <param name="Beta" type="float" label="Beta" value="1.0">
+          </param>
+        </when>
+        <when value="weibull">
+          <param name="Alpha" type="float" value="10" label="Alpha">
+          </param>
+          <param name="Beta" type="float" label="Beta" value="1.0">
+          </param>
+        </when>
+        <when value="unif">
+          <param name="low" type="float" value="0.0" label="Lowest uniform value">
+          </param>
+          <param name="hi" type="float" label="Highest uniform value" value="1.0"
+           help="A uniform value will be generated from the range specified (low to high) - eg 0.0 to 1.0">
+          </param>
+        </when>
+        <when value="cat">
+          <param name="values" type="text" value="A,B,C" label="Comma separated values to choose from"
+         help = "Each of the comma separated values will have an equal probability of being chosen - eg 'A1,A2,B1,B2'">
+          </param>
+        </when>
+      </conditional>
+    </repeat>
+    </page>
+</inputs>
+<outputs>
+       <data format="pphe" name="ppheout"  metadata_source="infile1" />
+</outputs>
+<configfiles>
+<configfile name="script_file">
+#for $n, $f in enumerate($fakePhe)
+#if $f.series.phetype=='rnorm'
+{'pN':'$f.pName','pT':'rnorm','pP':"{'Mean':'$f.series.Mean', 'SD':'$f.series.SD'}"}
+#elif $f.series.phetype=='rgamma'
+{'pN':'$f.pName','pT':'rgamma','pP':"{'Alpha':'$f.series.Alpha', 'Beta':'$f.series.Beta'}"}
+#elif $f.series.phetype=='poisson'
+{'pN':'$f.pName','pT':'poisson','pP':"{'lamb':'$f.series.lamb',}"}
+#elif $f.series.phetype=='exponential'
+{'pN':'$f.pName','pT':'exponential','pP':"{'Mean':'$f.series.Mean',}"}
+#elif $f.series.phetype=='weibull'
+{'pN':'$f.pName','pT':'weibull','pP':"{'Alpha':'$f.series.Alpha', 'Beta':'$f.series.Beta'}"}
+#elif $f.series.phetype=='cat'
+{'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'values':'$f.series.values'}"}
+#elif $f.series.phetype=='unif'
+{'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'low':'$f.series.low','hi':'$f.series.hi'}"}
+#end if
+#end for
+</configfile>
+</configfiles>
+<help>
+.. class:: infomark
+
+This tool allows you to generate an arbitrary (sort of)
+synthetic phenotype file with measurements drawn from normal,
+gamma, Weibull, exponential, uniform or categorical distributions. These are for testing under
+the null hypothesis of no association - the values are random, but
+drawn from user-specified distributions.
+
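+For example, a 'rnorm' phenotype with the Mean and SD you set is simply drawn with python's standard
+random module, one value per pedigree id. A minimal sketch (the function name and dispatch are
+illustrative, not the tool's actual code)::
+
+  import random
+
+  def draw_phenotype(phetype, **params):
+      """Return one random value from the requested null distribution."""
+      if phetype == 'rnorm':
+          return random.normalvariate(float(params['Mean']), float(params['SD']))
+      if phetype == 'unif':
+          return random.uniform(float(params['low']), float(params['hi']))
+      if phetype == 'cat':
+          return random.choice(params['values'].split(','))
+      raise ValueError('unknown phenotype type: %s' % phetype)
+
+  print(draw_phenotype('rnorm', Mean='120', SD='10'))
+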
+Two output files will appear - one for FBAT and the other for Plink - since, unfortunately,
+they have slightly different requirements for the header row.
+
+-----
+
+.. class:: warningmark
+
+This tool is very experimental
+
+-----
+
+- **Pedigree** is a library pedigree file - the IDs will be used in the synthetic null phenotypes
+- **Title** is a name to give to the output phenotype file
+
+On the next page, you can add an unlimited number of phenotypes of various kinds, including choices for
+categorical ones or distributions with specific parameters.
+
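+Behind the scenes, each phenotype you specify becomes one python dictionary literal per line of a
+generated script file which the tool reads back - illustrative lines matching the form fields above::
+
+  {'pN':'SysBP','pT':'rnorm','pP':"{'Mean':'120', 'SD':'10'}"}
+  {'pN':'Race','pT':'cat','pP':"{'values':'red,green,blue'}"}
+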
+Just keep using the "Add new phenotype" button to add new specifications until you're done.
+Use the Execute button to run the program and generate the null phenotype data.
+The new files will be available on the drop-down lists for appropriate tools - eg the
+FBAT format one will be available if you run the FBAT modelling tool.
+
+**Attribution**
+
+Originally designed and written for the Rgenetics
+series of Galaxy tools.
+Copyright Ross Lazarus 2007 (ross period lazarus at gmail period com).
+Licensed under the terms of the LGPL,
+as documented at http://www.gnu.org/licenses/lgpl.html
+
+</help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgtest.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgtest.sh Fri Mar 09 19:37:19 2012 -0500
[
b'@@ -0,0 +1,201 @@\n+#!/bin/sh\n+# script to generate all functional test outputs for each rgenetics tool\n+# could be run at installation to ensure all dependencies are in place?\n+if test $# -lt 2\n+then\n+   echo "We need to agree on 2 parameters - GalaxyRoot and OutRoot - use paths to galaxy and galaxy to re-create all test outputs"\n+   echo "or more prudently, galaxy and /tmp/foo for checking without updating all your test-data"\n+   echo "Exiting with no changes"\n+   exit 1\n+fi\n+if [ $1 ]\n+then\n+  GALAXYROOT=$1\n+else\n+  GALAXYROOT=`pwd`\n+fi\n+if [ $2 ]\n+then\n+  OUTROOT=$2\n+else\n+  OUTROOT=`pwd`\n+  OUTROOT="$OUTROOT/test-data"\n+fi\n+echo "using $GALAXYROOT as galaxyroot and $OUTROOT as outroot"\n+# change this as needed for your local install\n+INPATH="${GALAXYROOT}/test-data"\n+JARPATH="${GALAXYROOT}/tool-data/shared/jars"\n+TOOLPATH="${GALAXYROOT}/tools/rgenetics"\n+OROOT="${OUTROOT}/test-data/rgtestouts"\n+NORMALOROOT="${OUTROOT}/test-data"\n+mkdir -p $OROOT\n+rm -rf $OROOT/*\n+# needed for testing - but tool versions should be bumped if this is rerun?\n+TOOL="rgManQQ"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+mkdir $OUTPATH\n+CL="python $TOOLPATH/$TOOL.py "$INPATH/smallwgaP.xls" $NPRE ${OUTPATH}/${NPRE}.html $OUTPATH 1 2 7 0"\n+# rgManQQ.py \'$input_file\' "$name" \'$out_html\' \'$out_html.files_path\' \'$chrom_col\' \'$offset_col\' \n+# \'$pval_col\'\n+#python /opt/galaxy/tools/rgenetics/rgManQQ.py /opt/galaxy/test-data/smallwgaP.xls rgManQQtest1 \n+#/opt/galaxy/test-data/rgtestouts/rgManQQ/rgManQQtest1.html /opt/galaxy/test-data/rgtestouts/rgManQQ 1 2 5,7 \n+echo "Testing $TOOL using $CL"\n+python $TOOLPATH/$TOOL.py "$INPATH/smallwgaP.xls" $NPRE ${OUTPATH}/${NPRE}.html $OUTPATH 1 2 7 0\n+TOOL="rgfakePhe"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+mkdir $OUTPATH\n+PSSCRIPT="$OUTPATH/script_file"\n+echo "{\'pN\':\'normtest\',\'pT\':\'rnorm\',\'pP\':\\"{\'Mean\':\'100\', \'SD\':\'10\'}\\"}" > $PSSCRIPT\n+echo "{\'pN\':\'cattest\',\'pT\':\'cat\',\'pP\':\\"{\'values\':\'red,green,blue\'}\\"}" >> $PSSCRIPT\n+echo "{\'pN\':\'uniftest\',\'pT\':\'$f.series.phetype\',\'pP\':\\"{\'low\':\'1\',\'hi\':\'100\'}\\"}" >> $PSSCRIPT\n+echo "{\'pN\':\'gammatest\',\'pT\':\'rgamma\',\'pP\':\\"{\'Alpha\':\'1\', \'Beta\':\'0.1\'}\\"}" >> $PSSCRIPT\n+echo "{\'pN\':\'poissontest\',\'pT\':\'poisson\',\'pP\':\\"{\'lamb\':\'1.0\',}\\"}" >> $PSSCRIPT\n+echo "{\'pN\':\'exptest\',\'pT\':\'exponential\',\'pP\':\\"{\'Mean\':\'100.0\',}\\"}" >> $PSSCRIPT\n+echo "{\'pN\':\'weibtest\',\'pT\':\'weibull\',\'pP\':\\"{\'Alpha\':\'1.0\', \'Beta\':\'0.1\'}\\"}" >> $PSSCRIPT\n+echo "now doing $TOOL"\n+python $TOOLPATH/$TOOL.py ${INPATH}/tinywga $NPRE $NPRE.pphe $OUTPATH $PSSCRIPT\n+#   <command interpreter="python">rgfakePhe.py \'$infile1.extra_files_path/$infile1.metadata.base_name\'\n+#   "$title1" \'$ppheout\' \'$ppheout.files_path\' \'$script_file\'\n+#\n+#\n+TOOL="rgQC"\n+NPRE=${TOOL}test1\n+echo "now doing $TOOL"\n+OUTPATH="$OROOT/$TOOL"\n+mkdir $OUTPATH\n+python $TOOLPATH/$TOOL.py -i "$INPATH/tinywga" -o $NPRE -s ${OUTPATH}/${NPRE}.html -p $OUTPATH\n+# rgQC.py -i \'$input_file.extra_files_path/$input_file.metadata.base_name\' -o "$out_prefix" \n+# -s \'$html_file\' -p \'$html_file.files_path\' \n+#\n+TOOL="rgGRR"\n+NPRE=${TOOL}test1\n+echo "now doing $TOOL"\n+OUTPATH="$OROOT/$TOOL"\n+mkdir $OUTPATH\n+CMD="python $TOOLPATH/$TOOL.py "$INPATH/tinywga" "tinywga" $OUTPATH/${NPRE}.html $OUTPATH "$NPRE" 100 6 true" \n+echo "doing $CMD"\n+$CMD\n+# rgGRR.py $i.extra_files_path/$i.metadata.base_name 
"$i.metadata.base_name"\n+#\'$out_file1\' \'$out_file1.files_path\' "$title"  \'$n\' \'$Z\' \'$force\'\n+#\n+TOOL="rgLDIndep"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+mkdir $OUTPATH\n+python $TOOLPATH/$TOOL.py "$INPATH" "tinywga" "$NPRE" 1 1 0 0 1 1 $OUTPATH/${NPRE}.pbed $OUTPATH 10000 5000 0.1 \n+#rgLDIndep.py \'$input_file.extra_files_path\' \'$input_file.metadata.base_name\' \'$title\' \'$mind\'\n+# \'$geno\' \'$hwe\' \'$maf\' \'$mef\' \'$mei\' \'$out_file1\'\n+#\'$out_file1.files_path\'  \'$window\' \'$step\' \'$r2\' \n+TOOL="rgPedSub"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+mkdir $OUTPATH\n+PSSCRIPT="$OUTPATH/pedsub.script"\n+echo "title~~~~$NPRE" > $PSSCRIPT\n+echo "output1~~~~${OUTPATH}/${NPRE}.lped" >> $P'..b':\'weibtest\',\'pT\':\'weibull\',\'pP\':\\"{\'Alpha\':\'1.0\', \'Beta\':\'0.1\'}\\"}" >> $PSSCRIPT\n+echo "now doing $TOOL"\n+python $TOOLPATH/$TOOL.py $PSSCRIPT\n+#\n+echo "Now doing rgclean"\n+TOOL="rgClean"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+mkdir $OUTPATH\n+python $TOOLPATH/$TOOL.py $INPATH "tinywga" "$NPRE" 1 1 0 0 1 1 $OUTPATH/${NPRE}.pbed $OUTPATH 0 0 0 0\n+# rgClean.py \'$input_file.extra_files_path\' \'$input_file.metadata.base_name\' \'$title\' \'$mind\'\n+#        \'$geno\' \'$hwe\' \'$maf\' \'$mef\' \'$mei\' \'$out_file1\' \'$out_file1.files_path\'\n+#        \'${GALAXY_DATA_INDEX_DIR}/rg/bin/plink\' \'$relfilter\' \'$afffilter\' \'$sexfilter\' \'$fixaff\'\n+#\n+echo "Now doing rgEigPCA"\n+TOOL="rgEigPCA"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+mkdir $OUTPATH\n+python $TOOLPATH/$TOOL.py "$INPATH/tinywga" "$NPRE" ${OUTPATH}/${NPRE}.html $OUTPATH 4 2 2 2 $OUTPATH/rgEigPCAtest1.txt\n+#    rgEigPCA.py "$i.extra_files_path/$i.metadata.base_name" "$title" "$out_file1"\n+#    "$out_file1.files_path" "$k" "$m" "$t" "$s" "$pca" \n+#\n+TOOL="rgfakePed"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+mkdir $OUTPATH\n+echo "now doing $TOOL"\n+python $TOOLPATH/$TOOL.py --title "$NPRE" -o $OUTPATH/${NPRE}.lped -p $OUTPATH -c "20" -n "40" -s "10" -w "0" -v "0" -l "pbed" -d "T" -m "0" -M "0"\n+#rgfakePed.py --title \'$title1\' \n+#  -o \'$out_file1\' -p \'$out_file1.extra_files_path\' -c \'$ncases\' -n \'$ntotal\'\n+#  -s \'$nsnp\'  -w \'$lowmaf\' -v \'$missingValue\' -l \'$outFormat\'\n+#  -d \'$mafdist\' -m \'$missingRate\' -M \'$mendelRate\'\n+#\n+TOOL="rgHaploView"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+mkdir $OUTPATH\n+echo "now doing $TOOL"\n+python $TOOLPATH/$TOOL.py ""  "rs2283802Xrs2267000Xrs16997606Xrs4820537Xrs3788347Xrs756632Xrs4820539Xrs2283804Xrs2267006Xrs4822363X" \\\n+"$NPRE" $OUTPATH/${NPRE}.html  "$INPATH" "tinywga" 0.0 200000 "RSQ" "lo" "2048" "$OUTPATH" "noinfo" "0.8" "YRI" $JARPATH/haploview.jar\n+#  rgHaploView.py "$ucsc_region" "$rslist" "$title" "$output1"  \n+#  "$lhistIn.extra_files_path" "$lhistIn.metadata.base_name"\n+#  "$minmaf" "$maxdist" "$ldtype" "$hires" "$memsize" "$output1.files_path" \n+#  "$infoTrack" "$tagr2" "$hmpanel" ${GALAXY_DATA_INDEX_DIR}/rg/bin/haploview.jar\n+# note these statistical tools do NOT generate composite outputs\n+TOOL="rgGLM"\n+NPRE=${TOOL}test1\n+OUTPATH=$NORMALOROOT\n+echo "now doing $TOOL"\n+python $TOOLPATH/$TOOL.py "$INPATH/tinywga" $INPATH/tinywga "$NPRE" "c1" "" $OUTPATH/${NPRE}_GLM.xls \\\n+$OUTPATH/${NPRE}_GLM_log.txt "tinywga" "" "" "" 1 1 0 0 $OUTPATH/${NPRE}_GLM_topTable.gff \n+##        rgGLM.py \'$i.extra_files_path/$i.metadata.base_name\' \'$phef.extra_files_path/$phef.metadata.base_name\'\n+##        "$title1" \'$predvar\' \'$covar\' \'$out_file1\' \'$logf\' \'$dbkey\' 
\'$i.metadata.base_name\'\n+##        \'$inter\' \'$cond\' \'$gender\' \'$mind\' \'$geno\' \'$maf\' \'$logistic\' \'$gffout\'\n+#\n+TOOL="rgTDT"\n+NPRE=${TOOL}test1\n+OUTPATH=$NORMALOROOT\n+echo "now doing $TOOL"\n+python $TOOLPATH/$TOOL.py -i "$INPATH/tinywga"  -o "$NPRE" -r $OUTPATH/${NPRE}_TDT.xls \\\n+-l $OUTPATH/${NPRE}_TDT_log.txt -g $OUTPATH/${NPRE}_TDT_topTable.gff\n+##        rgTDT.py -i \'$infile.extra_files_path/$infile.metadata.base_name\' -o \'$title\'\n+##        -r \'$out_file1\' -l \'$logf\' -x \'${GALAXY_DATA_INDEX_DIR}/rg/bin/plink\'\n+##        -g \'$gffout\'\n+#\n+TOOL="rgCaCo"\n+NPRE=${TOOL}test1\n+OUTPATH=$NORMALOROOT\n+echo "now doing $TOOL"\n+python $TOOLPATH/rgCaCo.py $INPATH/tinywga "$NPRE" $OUTPATH/${NPRE}_CaCo.xls $OUTPATH/${NPRE}_CaCo_log.txt $OUTPATH $OUTPATH/${NPRE}_CaCo_topTable.gff\n+# rgCaCo.py \'$i.extra_files_path/$i.metadata.base_name\' "$name"  \'$out_file1\' \'$logf\' \'$logf.files_path\' \'$gffout\'\n+#\n+TOOL="rgQQ"\n+echo "now doing $TOOL"\n+NPRE=${TOOL}test1\n+OUTPATH=$NORMALOROOT\n+CL="python $TOOLPATH/$TOOL.py "$INPATH/tinywga.pphe" "$NPRE" 1 3 $OUTPATH/$NPRE.pdf 8 10 "false" 1 $OUTPATH"\n+echo "running $TOOL using $CL"\n+python $TOOLPATH/$TOOL.py "$INPATH/tinywga.pphe" "$NPRE" 1 3 $OUTPATH/$NPRE.pdf 8 10 "false" 1 $OUTPATH\n+# rgQQ.py "$input1" "$name" $sample "$cols" $allqq $height $width $log $allqq.id $__new_file_path__ \n+#\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgtest_one_tool.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgtest_one_tool.sh Fri Mar 09 19:37:19 2012 -0500
[
b'@@ -0,0 +1,224 @@\n+#!/bin/sh\n+# script to generate all functional test outputs for each rgenetics tool\n+# could be run at installation to ensure all dependencies are in place?\n+case $# in 0) echo "USAGE: ${0##*/} TooltoTest galaxyRoot outRoot"; exit 1;;\n+           [1-2]*) echo "Need ToolToTest and paths for galaxyRoot outRoot as parameters"; exit 2;;\n+           [5-10]*) echo "Too many arguments - ToolToTest and paths for galaxyRoot outRoot as parameters"; exit 2;;\n+           *)\n+esac\n+GALAXYROOT=$2\n+OUTROOT=$3\n+echo "using $GALAXYROOT"\n+# change this as needed for your local install\n+INPATH="${GALAXYROOT}/test-data"\n+JARPATH="${GALAXYROOT}/tool-data/shared/jars"\n+TOOLPATH="${GALAXYROOT}/tools/rgenetics"\n+OROOT="${OUTROOT}/test-data/rgtestouts"\n+NORMALOROOT="${OUTROOT}/test-data"\n+case "$1" in\n+\'rgManQQ\')\n+\n+TOOL="rgManQQ"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+rm -rf $OUTPATH/*\n+CL="python $TOOLPATH/$TOOL.py "$INPATH/smallwgaP.xls" $NPRE ${OUTPATH}/${NPRE}.html $OUTPATH 1 2 7 0"\n+# rgManQQ.py \'$input_file\' "$name" \'$out_html\' \'$out_html.files_path\' \'$chrom_col\' \'$offset_col\' \n+# \'$pval_col\'\n+#python /opt/galaxy/tools/rgenetics/rgManQQ.py /opt/galaxy/test-data/smallwgaP.xls rgManQQtest1 \n+#/opt/galaxy/test-data/rgtestouts/rgManQQ/rgManQQtest1.html /opt/galaxy/test-data/rgtestouts/rgManQQ 1 2 5,7 \n+echo "Testing $TOOL using $CL"\n+python $TOOLPATH/$TOOL.py "$INPATH/smallwgaP.xls" $NPRE ${OUTPATH}/${NPRE}.html $OUTPATH 1 2 7 0\n+;;\n+\n+\'rgfakePhe\')\n+TOOL="rgfakePhe"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+rm -rf $OUTPATH/*\n+PSSCRIPT="$OUTPATH/script_file"\n+echo "{\'pN\':\'normtest\',\'pT\':\'rnorm\',\'pP\':\\"{\'Mean\':\'100\', \'SD\':\'10\'}\\"}" > $PSSCRIPT\n+echo "{\'pN\':\'cattest\',\'pT\':\'cat\',\'pP\':\\"{\'values\':\'red,green,blue\'}\\"}" >> $PSSCRIPT\n+echo "{\'pN\':\'uniftest\',\'pT\':\'$f.series.phetype\',\'pP\':\\"{\'low\':\'1\',\'hi\':\'100\'}\\"}" >> $PSSCRIPT\n+echo "{\'pN\':\'gammatest\',\'pT\':\'rgamma\',\'pP\':\\"{\'Alpha\':\'1\', \'Beta\':\'0.1\'}\\"}" >> $PSSCRIPT\n+echo "{\'pN\':\'poissontest\',\'pT\':\'poisson\',\'pP\':\\"{\'lamb\':\'1.0\',}\\"}" >> $PSSCRIPT\n+echo "{\'pN\':\'exptest\',\'pT\':\'exponential\',\'pP\':\\"{\'Mean\':\'100.0\',}\\"}" >> $PSSCRIPT\n+echo "{\'pN\':\'weibtest\',\'pT\':\'weibull\',\'pP\':\\"{\'Alpha\':\'1.0\', \'Beta\':\'0.1\'}\\"}" >> $PSSCRIPT\n+echo "now doing $TOOL"\n+python $TOOLPATH/$TOOL.py ${INPATH}/tinywga $NPRE $NPRE.pphe $OUTPATH $PSSCRIPT\n+#   <command interpreter="python">rgfakePhe.py \'$infile1.extra_files_path/$infile1.metadata.base_name\'\n+#   "$title1" \'$ppheout\' \'$ppheout.files_path\' \'$script_file\'\n+#\n+;;\n+\'rgQC\')\n+\n+TOOL="rgQC"\n+NPRE=${TOOL}test1\n+echo "now doing $TOOL"\n+OUTPATH="$OROOT/$TOOL"\n+rm -rf $OUTPATH/*\n+CMD="python $TOOLPATH/$TOOL.py -i $INPATH/tinywga -o $NPRE -s ${OUTPATH}/${NPRE}.html -p $OUTPATH"\n+echo "doing $CMD"\n+$CMD\n+# rgQC.py -i \'$input_file.extra_files_path/$input_file.metadata.base_name\' -o "$out_prefix" \n+# -s \'$html_file\' -p \'$html_file.files_path\' \n+#\n+;;\n+\n+\'rgGRR\')\n+TOOL="rgGRR"\n+NPRE=${TOOL}test1\n+echo "now doing $TOOL"\n+OUTPATH="$OROOT/$TOOL"\n+rm -rf $OUTPATH/*\n+cmd="$TOOLPATH/$TOOL.py "$INPATH/tinywga" "tinywga" $OUTPATH/${NPRE}.html $OUTPATH "$NPRE" \'100\' \'6\' \'true\'"\n+echo "Doing $cmd"\n+python $TOOLPATH/$TOOL.py "$INPATH/tinywga" "tinywga" $OUTPATH/${NPRE}.html $OUTPATH "$NPRE" \'100\' \'6\' \n+# rgGRR.py $i.extra_files_path/$i.metadata.base_name 
"$i.metadata.base_name"\n+#\'$out_file1\' \'$out_file1.files_path\' "$title"  \'$n\' \'$Z\' \n+;;\n+\'rgLDIndep\')\n+TOOL="rgLDIndep"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+rm -rf $OUTPATH/*\n+python $TOOLPATH/$TOOL.py "$INPATH" "tinywga" "$NPRE" 1 1 0 0 1 1 $OUTPATH/${NPRE}.pbed $OUTPATH 10000 5000 0.1 \n+#rgLDIndep.py \'$input_file.extra_files_path\' \'$input_file.metadata.base_name\' \'$title\' \'$mind\'\n+# \'$geno\' \'$hwe\' \'$maf\' \'$mef\' \'$mei\' \'$out_file1\'\n+#\'$out_file1.files_path\'  \'$window\' \'$step\' \'$r2\' \n+;;\n+\n+\'rgPedSub\')\n+TOOL="rgPedSub"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+rm -rf $OUTPATH/*\n+PSSCRIPT="$OUTPATH/pedsub.script"\n+echo "title~~~~$NPRE" > $PSSCRIPT\n+echo "output1~~~~${OUTPATH}/${NPRE}.lped" >> $PSSCRIPT\n+echo "outf'..b'ATH 0 0 0 0\n+# rgClean.py \'$input_file.extra_files_path\' \'$input_file.metadata.base_name\' \'$title\' \'$mind\'\n+#        \'$geno\' \'$hwe\' \'$maf\' \'$mef\' \'$mei\' \'$out_file1\' \'$out_file1.files_path\'\n+#        \'${GALAXY_DATA_INDEX_DIR}/rg/bin/plink\' \'$relfilter\' \'$afffilter\' \'$sexfilter\' \'$fixaff\'\n+#\n+;;\n+\n+\'rgEigPCA\')\n+\n+TOOL="rgEigPCA"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+rm -rf $OUTPATH/*\n+python $TOOLPATH/$TOOL.py "$INPATH/tinywga" "$NPRE" ${OUTPATH}/${NPRE}.html $OUTPATH 4 2 2 2 $OUTPATH/rgEigPCAtest1.txt\n+#    rgEigPCA.py "$i.extra_files_path/$i.metadata.base_name" "$title" "$out_file1"\n+#    "$out_file1.files_path" "$k" "$m" "$t" "$s" "$pca" \n+#\n+;;\n+\n+\'rgfakePed\')\n+TOOL="rgfakePed"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+rm -rf $OUTPATH/*\n+echo "now doing $TOOL"\n+python $TOOLPATH/$TOOL.py --title "$NPRE" -o $OUTPATH/${NPRE}.lped -p $OUTPATH -c "20" -n "40" -s "10" -w "0" -v "0" -l "pbed" -d "T" -m "0" -M "0"\n+#rgfakePed.py --title \'$title1\' \n+#  -o \'$out_file1\' -p \'$out_file1.extra_files_path\' -c \'$ncases\' -n \'$ntotal\'\n+#  -s \'$nsnp\'  -w \'$lowmaf\' -v \'$missingValue\' -l \'$outFormat\'\n+#  -d \'$mafdist\' -m \'$missingRate\' -M \'$mendelRate\'\n+;;\n+\n+\'rgHaploView\')\n+\n+TOOL="rgHaploView"\n+NPRE=${TOOL}test1\n+OUTPATH="$OROOT/$TOOL"\n+rm -rf $OUTPATH/*\n+CL="python $TOOLPATH/$TOOL.py \'\'  \'rs2283802 rs2267000 rs16997606 rs4820537 rs3788347 rs756632 rs4820539 rs2283804 rs2267006 rs4822363\' \'$NPRE\' $OUTPATH/${NPRE}.html  \'$INPATH\' \'tinywga\' 0.0 200000 \'RSQ\' \'lo\' \'2048\' \'$OUTPATH\' \'noinfo\' \'0.8\' \'YRI\' $JARPATH/haploview.jar"\n+echo "Testing $TOOL using $CL"\n+python $TOOLPATH/$TOOL.py ""  "rs2283802 rs2267000 rs16997606 rs4820537 rs3788347 rs756632 rs4820539 rs2283804 rs2267006 rs4822363" \\\n+"$NPRE" $OUTPATH/${NPRE}.html  "$INPATH" "tinywga" 0.0 200000 "RSQ" "lo" "2048" "$OUTPATH" "noinfo" "0.8" "YRI" $JARPATH/haploview.jar\n+#  rgHaploView.py "$ucsc_region" "$rslist" "$title" "$output1"  \n+#  "$lhistIn.extra_files_path" "$lhistIn.metadata.base_name"\n+#  "$minmaf" "$maxdist" "$ldtype" "$hires" "$memsize" "$output1.files_path" \n+#  "$infoTrack" "$tagr2" "$hmpanel" ${GALAXY_DATA_INDEX_DIR}/rg/bin/haploview.jar\n+# note these statistical tools do NOT generate composite outputs\n+;;\n+\n+\'rgGLM\')\n+TOOL="rgGLM"\n+NPRE=${TOOL}test1\n+OUTPATH=$NORMALOROOT\n+python $TOOLPATH/$TOOL.py "$INPATH/tinywga" $INPATH/tinywga "$NPRE" "c1" "" $OUTPATH/${NPRE}_GLM.xls \\\n+$OUTPATH/${NPRE}_GLM_log.txt "tinywga" "" "" "" 1 1 0 0 $OUTPATH/${NPRE}_GLM_topTable.gff \n+##        rgGLM.py \'$i.extra_files_path/$i.metadata.base_name\' \'$phef.extra_files_path/$phef.metadata.base_name\'\n+##        "$title1" 
\'$predvar\' \'$covar\' \'$out_file1\' \'$logf\' \'$dbkey\' \'$i.metadata.base_name\'\n+##        \'$inter\' \'$cond\' \'$gender\' \'$mind\' \'$geno\' \'$maf\' \'$logistic\' \'$gffout\'\n+;;\n+\n+\'rgTDT\')\n+TOOL="rgTDT"\n+NPRE=${TOOL}test1\n+OUTPATH=$NORMALOROOT\n+python $TOOLPATH/$TOOL.py -i "$INPATH/tinywga"  -o "$NPRE" -r $OUTPATH/${NPRE}_TDT.xls \\\n+-l $OUTPATH/${NPRE}_TDT_log.txt -g $OUTPATH/${NPRE}_TDT_topTable.gff\n+##        rgTDT.py -i \'$infile.extra_files_path/$infile.metadata.base_name\' -o \'$title\'\n+##        -r \'$out_file1\' -l \'$logf\' -x \'${GALAXY_DATA_INDEX_DIR}/rg/bin/plink\'\n+##        -g \'$gffout\'\n+;;\n+\n+\'rgCaCo\')\n+TOOL="rgCaCo"\n+NPRE=${TOOL}test1\n+OUTPATH=$NORMALOROOT\n+echo "now doing $TOOL"\n+python $TOOLPATH/rgCaCo.py $INPATH/tinywga "$NPRE" $OUTPATH/${NPRE}_CaCo.xls $OUTPATH/${NPRE}_CaCo_log.txt $OUTPATH $OUTPATH/${NPRE}_CaCo_topTable.gff\n+# rgCaCo.py \'$i.extra_files_path/$i.metadata.base_name\' "$name"  \'$out_file1\' \'$logf\' \'$logf.files_path\' \'$gffout\'\n+;;\n+\n+\'rgQQ\')\n+TOOL="rgQQ"\n+echo "now doing $TOOL"\n+NPRE=${TOOL}test1\n+OUTPATH=$NORMALOROOT\n+CL="python $TOOLPATH/$TOOL.py "$INPATH/tinywga.pphe" "$NPRE" 1 3 $OUTPATH/$NPRE.pdf 8 10 "false" 1 $OUTPATH"\n+echo "running $TOOL using $CL"\n+python $TOOLPATH/$TOOL.py "$INPATH/tinywga.pphe" "$NPRE" 1 3 $OUTPATH/$NPRE.pdf 8 10 "false" 1 $OUTPATH\n+# rgQQ.py "$input1" "$name" $sample "$cols" $allqq $height $width $log $allqq.id $__new_file_path__ \n+;;\n+esac\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgutils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgutils.py Fri Mar 09 19:37:19 2012 -0500
[
b'@@ -0,0 +1,618 @@\n+# utilities for rgenetics\n+#\n+# copyright 2009 ross lazarus\n+# released under the LGPL\n+#\n+\n+import subprocess, os, sys, time, tempfile,string,plinkbinJZ\n+import datetime\n+\n+galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>\n+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n+<head>\n+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n+<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />\n+<title></title>\n+<link rel="stylesheet" href="/static/style/base.css" type="text/css" />\n+</head>\n+<body>\n+<div class="document">\n+"""\n+galhtmlattr = """<h3><a href="http://rgenetics.org">Rgenetics</a> tool %s run at %s</h3>"""\n+galhtmlpostfix = """</div></body></html>\\n"""\n+\n+plinke = \'plink\' # changed jan 2010 - all exes must be on path\n+rexe = \'R\'       # to avoid cluster/platform dependencies\n+smartpca = \'smartpca.perl\'\n+\n+def timenow():\n+    """return current time as a string\n+    """\n+    return time.strftime(\'%d/%m/%Y %H:%M:%S\', time.localtime(time.time()))\n+\n+def timestamp():\n+    return datetime.datetime.now().strftime(\'%Y%m%d%H%M%S\')\n+\n+def fail( message ):\n+    print >> sys.stderr, message\n+    return -1\n+\n+def whereis(program):\n+    for path in os.environ.get(\'PATH\', \'\').split(\':\'):\n+        if os.path.exists(os.path.join(path, program)) and \\\n+           not os.path.isdir(os.path.join(path, program)):\n+            return os.path.join(path, program)\n+    return None\n+\n+\n+def bedToPicInterval(infile=None):\n+    """\n+    Picard tools requiring targets want\n+    a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order:\n+\n+    @SQ     SN:chrM LN:16571\n+    @SQ     SN:chr1 LN:247249719\n+    @SQ     SN:chr2 LN:242951149\n+    @SQ     SN:chr3 LN:199501827\n+    @SQ     SN:chr4 LN:191273063\n+    added to the start of what looks like a bed style file\n+    chr1    67052400        67052451        -       CCDS635.1_cds_0_0_chr1_67052401_r\n+    chr1    67060631        67060788        -       CCDS635.1_cds_1_0_chr1_67060632_r\n+    chr1    67065090        67065317        -       CCDS635.1_cds_2_0_chr1_67065091_r\n+    chr1    67066082        67066181        -       CCDS635.1_cds_3_0_chr1_67066083_r\n+\n+\n+    see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1\n+    we need to add 1 to start coordinates on the way through - but length calculations are easier\n+    """\n+    # bedToPicard.py\n+    # ross lazarus October 2010\n+    # LGPL\n+    # for Rgenetics\n+\n+    def getFlen(bedfname=None):\n+        """\n+        find all features in a BED file and sum their lengths\n+        """\n+        features = {}\n+        try:\n+            infile = open(bedfname,\'r\')\n+        except:\n+            print \'###ERROR: getFlen unable to open bedfile %s\' % bedfname\n+            sys.exit(1)\n+        for i,row in enumerate(infile):\n+            if row[0] == \'@\': # shouldn\'t happen given a bed file!\n+                print \'row %d=%s - should NOT start with @!\' % (i,row)\n+                sys.exit(1)\n+        row = row.strip()\n+        if len(row) > 0:\n+            srow = row.split(\'\\t\')\n+            f = srow[0]\n+            spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!)\n+            epos = srow[2] # see 
http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1\n+            flen = int(epos) - int(spos)\n+            features.setdefault(f,0)\n+            features[f] += flen\n+        infile.close()\n+        return features\n+\n+    def keynat(string):\n+        \'\'\'\n+        borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/\n+        A natural sort helper function for sort() and sorted()\n+        without using regular expressions or exceptions.\n+\n+        >>> items = (\'Z\', \'a\', \'10th\', \'1st\', \'9\')\n+        >>> sorted(items)\n+        [\'10th\', \'1st\', \''..b'necessary to actually perform the pruning.\n+\n+The VIF pruning routine is performed:\n+plink --file data --indep 50 5 2\n+\n+will create files\n+\n+     plink.prune.in\n+     plink.prune.out\n+\n+Each is a simlpe list of SNP IDs; both these files can subsequently be specified as the argument for\n+a --extract or --exclude command.\n+\n+The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs to shift the\n+window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other \n+SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent \n+near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of \n+all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window \n+size is too large, too many SNPs may be removed.\n+\n+The second procedure is performed:\n+plink --file data --indep-pairwise 50 5 0.5\n+\n+This generates the same output files as the first version; the only difference is that a\n+simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2 \n+threshold. Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic \n+correlation, i.e. 
it does not involve phasing.\n+\n+To give a concrete example: the command above that specifies 50 5 0.5 would a) consider a\n+window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, b) remove one of a pair of SNPs if the LD is greater than 0.5, c) shift the window 5 \n+SNPs forward and repeat the procedure.\n+\n+To make a new, pruned file, then use something like (in this example, we also convert the\n+standard PED fileset to a binary one):\n+plink --file data --extract plink.prune.in --make-bed --out pruneddata\n+    """\n+    fplog,plog = tempfile.mkstemp()\n+    alog = []\n+    alog.append(\'## Rgenetics: http://rgenetics.org Galaxy Tools rgQC.py Plink pruneLD runner\\n\')\n+    for task in plinktasks: # each is a list\n+        vcl = vclbase + task\n+        sto = file(plog,\'w\')\n+        x = subprocess.Popen(\' \'.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)\n+        retval = x.wait()\n+        sto.close()\n+        try:\n+            lplog = file(plog,\'r\').readlines()\n+            lplog = [x for x in lplog if x.find(\'Pruning SNP\') == -1]\n+            alog += lplog\n+            alog.append(\'\\n\')\n+            os.unlink(plog) # no longer needed\n+        except:\n+            alog.append(\'### %s Strange - no std out from plink when running command line\\n%s\\n\' % (timenow(),\' \'.join(vcl)))\n+    return alog\n+\n+def readMap(mapfile=None,allmarkers=False,rsdict={},c=None,spos=None,epos=None):\n+    """abstract out - keeps reappearing\n+    """\n+    mfile = open(mapfile, \'r\')\n+    markers = []\n+    snpcols = {}\n+    snpIndex = 0 # in case empty or comment lines\n+    for rownum,row in enumerate(mfile):\n+        line = row.strip()\n+        if not line or line[0]==\'#\': continue\n+        chrom, snp, genpos, abspos = line.split()[:4] # just in case more cols\n+        try:\n+            abspos = int(abspos)\n+        except:\n+            abspos = 0 # stupid framingham data grumble grumble\n+        if allmarkers or rsdict.get(snp,None) or (chrom == c and (spos <= abspos <= epos)):\n+            markers.append((chrom,abspos,snp)) # decorate for sort into genomic\n+            snpcols[snp] = snpIndex # so we know which col to find genos for this marker\n+            snpIndex += 1\n+    markers.sort()\n+    rslist = [x[2] for x in markers] # drop decoration\n+    rsdict = dict(zip(rslist,rslist))\n+    mfile.close()\n+    return markers,snpcols,rslist,rsdict\n+\n+\n'
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/rgutils.pyc
b
Binary file tools/rgenetics/rgutils.pyc has changed
b
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/test.eps
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/test.eps Fri Mar 09 19:37:19 2012 -0500
[
b"@@ -0,0 +1,851 @@\n+%!PS-Adobe-3.0 EPSF-3.0\n+%%Title:        Sequence Logo: sdfs\n+%%Creator:      WebLogo 3.1 (2011-02-16)\n+%%CreationDate: 2011-10-15 16:47:38.275112\n+%%BoundingBox:  0  0  281  92 \n+%%Pages: 0\n+%%DocumentFonts: \n+%%EndComments\n+\n+\n+% ---- VARIABLES ----\n+\n+/True   true def\n+/False  false def\n+\n+/debug              False        def\n+\n+/logo_height        92  def\n+/logo_width         281  def\n+/logo_title         (sdfs) def\n+/show_title         True def\n+\n+/logo_margin        2 def\n+/xaxis_label_height 6.0 def\n+/title_height       12 def\n+/stroke_width       0.5 def\n+/tic_length         5 def\n+\n+/lines_per_logo     1 def\n+/line_width         277.6 def\n+/line_height        70.0 def\n+/line_margin_left   30.0 def\n+/line_margin_right  10 def\n+/line_margin_bottom 12.0 def\n+/line_margin_top    4 def\n+\n+/stack_width         10.8 def\n+/stack_height        54.0 def\n+/stacks_per_line     22 def\n+/stack_margin        0.5 def\n+\n+/show_yaxis             True def      \n+/show_yaxis_label       True def\n+/yaxis_label            (bits) def\n+/yaxis_scale          2.0 def              % height in units \n+/yaxis_tic_interval     1.0 def           % in units\n+/yaxis_minor_tic_interval 0.2 def   % in units\n+\n+/show_xaxis_label       False def             % True or False\n+/show_xaxis             True def                   % True or False\n+/xaxis_label            () def\n+/xaxis_tic_interval     1 def\n+/rotate_numbers         False def               % True or False\n+/number_interval        5 def\n+/show_ends              False def          \n+/end_type               (-) def          % d: DNA, p: PROTEIN, -: none\n+\n+/show_fineprint         True def\n+/fineprint              (WebLogo 3.1) def\n+/logo_label             () def\n+\n+/show_boxes             False def    % True or False\n+/shrink                 false def    % True or False\n+/shrink_fraction        0.5 def               \n+\n+/show_errorbars         True def      % True or False\n+/errorbar_fraction      0.9 def\n+/errorbar_width_fraction  0.25 def\n+/errorbar_gray          0.75 def\n+\n+/fontsize               10 def\n+/small_fontsize         6 def\n+/title_fontsize         12 def\n+/number_fontsize        8 def\n+\n+\n+/UseCIEColor true def       % Fix for issue 4\n+/default_color [ 0.0 0.0 0.0 ] def \n+/color_dict << \n+  (T) [ 1.0 0.549019607843 0.0 ]\n+  (A) [ 1.0 0.549019607843 0.0 ]\n+  (U) [ 1.0 0.549019607843 0.0 ]\n+  (G) [ 0.0 0.0 1.0 ]\n+  (C) [ 0.0 0.0 1.0 ]\n+>> def\n+\n+\n+\n+% ---- DERIVED PARAMETERS ----\n+\n+/char_width stack_width 2 stack_margin mul sub def\n+/char_width2 char_width 2 div def\n+/char_width4 char_width 4 div def\n+\n+% movements to place 5'/N and 3'/C symbols\n+/leftEndDeltaX  fontsize neg         def\n+/leftEndDeltaY  fontsize 1.25 mul neg def\n+/rightEndDeltaX fontsize 0.25 mul     def\n+/rightEndDeltaY leftEndDeltaY        def\n+\n+\n+% ---- PROCEDURES ----\n+\n+\n+/SetTitleFont {/ArialMT findfont title_fontsize scalefont setfont} bind def\n+/SetLogoFont  {/Arial-BoldMT findfont char_width  scalefont setfont} bind def\n+/SetStringFont{/ArialMT findfont fontsize scalefont setfont} bind def\n+/SetPrimeFont {/Symbol findfont fontsize scalefont setfont} bind def\n+/SetSmallFont {/ArialMT findfont small_fontsize scalefont setfont} bind def\n+/SetNumberFont {/ArialMT findfont number_fontsize scalefont setfont} bind def\n+\n+/DrawBox { % width height \n+    /hh exch def\n+    /ww exch def\n+    gsave\n+        0.2 setlinewidth\n+        %0.5 
setgray\n+        \n+        %0 0 moveto \n+        hh 0 rlineto\n+        0 ww rlineto\n+        hh neg 0 rlineto\n+        0 ww neg rlineto\n+        stroke\n+    grestore\n+} bind def\n+\n+\n+/StartLogo { \n+  %save \n+  gsave \n+\n+  \n+  debug { \n+    logo_margin logo_margin moveto\n+    logo_height logo_margin 2 mul sub\n+    logo_width logo_margin 2 mul sub\n+    DrawBox } if\n+    \n+  show_title { DrawTitle } if\n+  show_xaxis_label { DrawXaxisLable } if\n+  show_fineprint { DrawFineprint } if\n+  DrawLogoLabel\n+  \n+  \n+  MoveToFirstLine\n+} bind def\n+\n+\n+/DrawLogoLabel {\n+  gsave \n"..b'owSymbol\n+ 1.000000 0.037386 (G) ShowSymbol\n+ 1.000000 0.074773 (T) ShowSymbol\n+ 1.000000 0.074773 (A) ShowSymbol\n+ 0.169220 0.169220 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (A) ShowSymbol\n+ 1.000000 0.038953 (G) ShowSymbol\n+ 1.000000 0.155812 (C) ShowSymbol\n+ 1.000000 0.759583 (T) ShowSymbol\n+ 0.326656 0.326656 DrawErrorbar\n+EndStack\n+\n+(5) StartStack\n+ 1.000000 0.019459 (C) ShowSymbol\n+ 1.000000 0.038917 (A) ShowSymbol\n+ 1.000000 0.116752 (T) ShowSymbol\n+ 1.000000 0.778345 (G) ShowSymbol\n+ 0.350333 0.350333 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (G) ShowSymbol\n+ 1.000000 0.021020 (A) ShowSymbol\n+ 1.000000 0.168160 (C) ShowSymbol\n+ 1.000000 0.840802 (T) ShowSymbol\n+ 0.325915 0.325915 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (C) ShowSymbol\n+ 1.000000 0.083359 (T) ShowSymbol\n+ 1.000000 0.083359 (A) ShowSymbol\n+ 1.000000 0.854432 (G) ShowSymbol\n+ 0.347959 0.347959 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (T) ShowSymbol\n+ 1.000000 0.070036 (G) ShowSymbol\n+ 1.000000 0.070036 (C) ShowSymbol\n+ 1.000000 1.003846 (A) ShowSymbol\n+ 0.356819 0.356819 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.026714 (A) ShowSymbol\n+ 1.000000 0.040070 (G) ShowSymbol\n+ 1.000000 0.044523 (C) ShowSymbol\n+ 1.000000 0.106855 (T) ShowSymbol\n+ 0.196056 0.196056 DrawErrorbar\n+EndStack\n+\n+(10) StartStack\n+ 1.000000 0.014496 (A) ShowSymbol\n+ 1.000000 0.016107 (G) ShowSymbol\n+ 1.000000 0.020939 (T) ShowSymbol\n+ 1.000000 0.027382 (C) ShowSymbol\n+ 0.078924 0.106593 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.018308 (G) ShowSymbol\n+ 1.000000 0.032954 (A) ShowSymbol\n+ 1.000000 0.054923 (C) ShowSymbol\n+ 1.000000 0.073231 (T) ShowSymbol\n+ 0.164679 0.164679 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.015253 (C) ShowSymbol\n+ 1.000000 0.021790 (T) ShowSymbol\n+ 1.000000 0.032685 (A) ShowSymbol\n+ 1.000000 0.037043 (G) ShowSymbol\n+ 0.106770 0.125094 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.022457 (C) ShowSymbol\n+ 1.000000 0.028072 (T) ShowSymbol\n+ 1.000000 0.028072 (A) ShowSymbol\n+ 1.000000 0.058950 (G) ShowSymbol\n+ 0.137551 0.153378 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.013621 (G) ShowSymbol\n+ 1.000000 0.021404 (C) ShowSymbol\n+ 1.000000 0.029188 (T) ShowSymbol\n+ 1.000000 0.031133 (A) ShowSymbol\n+ 0.095346 0.115803 DrawErrorbar\n+EndStack\n+\n+(15) StartStack\n+ 1.000000 0.033669 (C) ShowSymbol\n+ 1.000000 0.067338 (A) ShowSymbol\n+ 1.000000 0.078561 (G) ShowSymbol\n+ 1.000000 0.370360 (T) ShowSymbol\n+ 0.303054 0.303054 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (G) ShowSymbol\n+ 1.000000 0.056955 (T) ShowSymbol\n+ 1.000000 0.132896 (A) ShowSymbol\n+ 1.000000 0.740420 (C) ShowSymbol\n+ 0.331433 0.331433 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.014884 (C) ShowSymbol\n+ 1.000000 
0.044653 (T) ShowSymbol\n+ 1.000000 0.148844 (G) ShowSymbol\n+ 1.000000 0.520953 (A) ShowSymbol\n+ 0.310748 0.310748 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (G) ShowSymbol\n+ 1.000000 0.088853 (T) ShowSymbol\n+ 1.000000 0.126932 (A) ShowSymbol\n+ 1.000000 0.406183 (C) ShowSymbol\n+ 0.268423 0.268423 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.043760 (C) ShowSymbol\n+ 1.000000 0.065640 (T) ShowSymbol\n+ 1.000000 0.065640 (G) ShowSymbol\n+ 1.000000 0.361019 (A) ShowSymbol\n+ 0.304415 0.304415 DrawErrorbar\n+EndStack\n+\n+(20) StartStack\n+ 1.000000 0.021502 (G) ShowSymbol\n+ 1.000000 0.027646 (C) ShowSymbol\n+ 1.000000 0.036861 (A) ShowSymbol\n+ 1.000000 0.064506 (T) ShowSymbol\n+ 0.150515 0.158545 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (G) ShowSymbol\n+ 1.000000 0.015671 (C) ShowSymbol\n+ 1.000000 0.282073 (A) ShowSymbol\n+ 1.000000 0.470122 (T) ShowSymbol\n+ 0.247172 0.247172 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.022864 (G) ShowSymbol\n+ 1.000000 0.040011 (C) ShowSymbol\n+ 1.000000 0.080022 (A) ShowSymbol\n+ 1.000000 0.137181 (T) ShowSymbol\n+ 0.209363 0.209363 DrawErrorbar\n+EndStack\n+\n+EndLine\n+\n+EndLogo\n+\n+\n+%%EOF\n+\n'
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/test.pdf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/test.pdf Fri Mar 09 19:37:19 2012 -0500
b"@@ -0,0 +1,851 @@\n+%!PS-Adobe-3.0 EPSF-3.0\n+%%Title:        Sequence Logo: rgWebLogo3\n+%%Creator:      WebLogo 3.1 (2011-02-16)\n+%%CreationDate: 2011-10-15 16:48:55.480094\n+%%BoundingBox:  0  0  281  92 \n+%%Pages: 0\n+%%DocumentFonts: \n+%%EndComments\n+\n+\n+% ---- VARIABLES ----\n+\n+/True   true def\n+/False  false def\n+\n+/debug              False        def\n+\n+/logo_height        92  def\n+/logo_width         281  def\n+/logo_title         (rgWebLogo3) def\n+/show_title         True def\n+\n+/logo_margin        2 def\n+/xaxis_label_height 6.0 def\n+/title_height       12 def\n+/stroke_width       0.5 def\n+/tic_length         5 def\n+\n+/lines_per_logo     1 def\n+/line_width         277.6 def\n+/line_height        70.0 def\n+/line_margin_left   30.0 def\n+/line_margin_right  10 def\n+/line_margin_bottom 12.0 def\n+/line_margin_top    4 def\n+\n+/stack_width         10.8 def\n+/stack_height        54.0 def\n+/stacks_per_line     22 def\n+/stack_margin        0.5 def\n+\n+/show_yaxis             True def      \n+/show_yaxis_label       True def\n+/yaxis_label            (bits) def\n+/yaxis_scale          2.0 def              % height in units \n+/yaxis_tic_interval     1.0 def           % in units\n+/yaxis_minor_tic_interval 0.2 def   % in units\n+\n+/show_xaxis_label       False def             % True or False\n+/show_xaxis             True def                   % True or False\n+/xaxis_label            () def\n+/xaxis_tic_interval     1 def\n+/rotate_numbers         False def               % True or False\n+/number_interval        5 def\n+/show_ends              False def          \n+/end_type               (-) def          % d: DNA, p: PROTEIN, -: none\n+\n+/show_fineprint         True def\n+/fineprint              (WebLogo 3.1) def\n+/logo_label             () def\n+\n+/show_boxes             False def    % True or False\n+/shrink                 false def    % True or False\n+/shrink_fraction        0.5 def               \n+\n+/show_errorbars         True def      % True or False\n+/errorbar_fraction      0.9 def\n+/errorbar_width_fraction  0.25 def\n+/errorbar_gray          0.75 def\n+\n+/fontsize               10 def\n+/small_fontsize         6 def\n+/title_fontsize         12 def\n+/number_fontsize        8 def\n+\n+\n+/UseCIEColor true def       % Fix for issue 4\n+/default_color [ 0.0 0.0 0.0 ] def \n+/color_dict << \n+  (T) [ 1.0 0.549019607843 0.0 ]\n+  (A) [ 1.0 0.549019607843 0.0 ]\n+  (U) [ 1.0 0.549019607843 0.0 ]\n+  (G) [ 0.0 0.0 1.0 ]\n+  (C) [ 0.0 0.0 1.0 ]\n+>> def\n+\n+\n+\n+% ---- DERIVED PARAMETERS ----\n+\n+/char_width stack_width 2 stack_margin mul sub def\n+/char_width2 char_width 2 div def\n+/char_width4 char_width 4 div def\n+\n+% movements to place 5'/N and 3'/C symbols\n+/leftEndDeltaX  fontsize neg         def\n+/leftEndDeltaY  fontsize 1.25 mul neg def\n+/rightEndDeltaX fontsize 0.25 mul     def\n+/rightEndDeltaY leftEndDeltaY        def\n+\n+\n+% ---- PROCEDURES ----\n+\n+\n+/SetTitleFont {/ArialMT findfont title_fontsize scalefont setfont} bind def\n+/SetLogoFont  {/Arial-BoldMT findfont char_width  scalefont setfont} bind def\n+/SetStringFont{/ArialMT findfont fontsize scalefont setfont} bind def\n+/SetPrimeFont {/Symbol findfont fontsize scalefont setfont} bind def\n+/SetSmallFont {/ArialMT findfont small_fontsize scalefont setfont} bind def\n+/SetNumberFont {/ArialMT findfont number_fontsize scalefont setfont} bind def\n+\n+/DrawBox { % width height \n+    /hh exch def\n+    /ww exch def\n+    gsave\n+        0.2 setlinewidth\n+        
%0.5 setgray\n+        \n+        %0 0 moveto \n+        hh 0 rlineto\n+        0 ww rlineto\n+        hh neg 0 rlineto\n+        0 ww neg rlineto\n+        stroke\n+    grestore\n+} bind def\n+\n+\n+/StartLogo { \n+  %save \n+  gsave \n+\n+  \n+  debug { \n+    logo_margin logo_margin moveto\n+    logo_height logo_margin 2 mul sub\n+    logo_width logo_margin 2 mul sub\n+    DrawBox } if\n+    \n+  show_title { DrawTitle } if\n+  show_xaxis_label { DrawXaxisLable } if\n+  show_fineprint { DrawFineprint } if\n+  DrawLogoLabel\n+  \n+  \n+  MoveToFirstLine\n+} bind def\n+\n+\n+/DrawLogoLabel "..b'owSymbol\n+ 1.000000 0.037386 (G) ShowSymbol\n+ 1.000000 0.074773 (T) ShowSymbol\n+ 1.000000 0.074773 (A) ShowSymbol\n+ 0.169220 0.169220 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (A) ShowSymbol\n+ 1.000000 0.038953 (G) ShowSymbol\n+ 1.000000 0.155812 (C) ShowSymbol\n+ 1.000000 0.759583 (T) ShowSymbol\n+ 0.326656 0.326656 DrawErrorbar\n+EndStack\n+\n+(5) StartStack\n+ 1.000000 0.019459 (C) ShowSymbol\n+ 1.000000 0.038917 (A) ShowSymbol\n+ 1.000000 0.116752 (T) ShowSymbol\n+ 1.000000 0.778345 (G) ShowSymbol\n+ 0.350333 0.350333 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (G) ShowSymbol\n+ 1.000000 0.021020 (A) ShowSymbol\n+ 1.000000 0.168160 (C) ShowSymbol\n+ 1.000000 0.840802 (T) ShowSymbol\n+ 0.325915 0.325915 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (C) ShowSymbol\n+ 1.000000 0.083359 (T) ShowSymbol\n+ 1.000000 0.083359 (A) ShowSymbol\n+ 1.000000 0.854432 (G) ShowSymbol\n+ 0.347959 0.347959 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (T) ShowSymbol\n+ 1.000000 0.070036 (G) ShowSymbol\n+ 1.000000 0.070036 (C) ShowSymbol\n+ 1.000000 1.003846 (A) ShowSymbol\n+ 0.356819 0.356819 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.026714 (A) ShowSymbol\n+ 1.000000 0.040070 (G) ShowSymbol\n+ 1.000000 0.044523 (C) ShowSymbol\n+ 1.000000 0.106855 (T) ShowSymbol\n+ 0.196056 0.196056 DrawErrorbar\n+EndStack\n+\n+(10) StartStack\n+ 1.000000 0.014496 (A) ShowSymbol\n+ 1.000000 0.016107 (G) ShowSymbol\n+ 1.000000 0.020939 (T) ShowSymbol\n+ 1.000000 0.027382 (C) ShowSymbol\n+ 0.078924 0.106593 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.018308 (G) ShowSymbol\n+ 1.000000 0.032954 (A) ShowSymbol\n+ 1.000000 0.054923 (C) ShowSymbol\n+ 1.000000 0.073231 (T) ShowSymbol\n+ 0.164679 0.164679 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.015253 (C) ShowSymbol\n+ 1.000000 0.021790 (T) ShowSymbol\n+ 1.000000 0.032685 (A) ShowSymbol\n+ 1.000000 0.037043 (G) ShowSymbol\n+ 0.106770 0.125094 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.022457 (C) ShowSymbol\n+ 1.000000 0.028072 (T) ShowSymbol\n+ 1.000000 0.028072 (A) ShowSymbol\n+ 1.000000 0.058950 (G) ShowSymbol\n+ 0.137551 0.153378 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.013621 (G) ShowSymbol\n+ 1.000000 0.021404 (C) ShowSymbol\n+ 1.000000 0.029188 (T) ShowSymbol\n+ 1.000000 0.031133 (A) ShowSymbol\n+ 0.095346 0.115803 DrawErrorbar\n+EndStack\n+\n+(15) StartStack\n+ 1.000000 0.033669 (C) ShowSymbol\n+ 1.000000 0.067338 (A) ShowSymbol\n+ 1.000000 0.078561 (G) ShowSymbol\n+ 1.000000 0.370360 (T) ShowSymbol\n+ 0.303054 0.303054 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (G) ShowSymbol\n+ 1.000000 0.056955 (T) ShowSymbol\n+ 1.000000 0.132896 (A) ShowSymbol\n+ 1.000000 0.740420 (C) ShowSymbol\n+ 0.331433 0.331433 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.014884 (C) ShowSymbol\n+ 1.000000 0.044653 (T) 
ShowSymbol\n+ 1.000000 0.148844 (G) ShowSymbol\n+ 1.000000 0.520953 (A) ShowSymbol\n+ 0.310748 0.310748 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (G) ShowSymbol\n+ 1.000000 0.088853 (T) ShowSymbol\n+ 1.000000 0.126932 (A) ShowSymbol\n+ 1.000000 0.406183 (C) ShowSymbol\n+ 0.268423 0.268423 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.043760 (C) ShowSymbol\n+ 1.000000 0.065640 (T) ShowSymbol\n+ 1.000000 0.065640 (G) ShowSymbol\n+ 1.000000 0.361019 (A) ShowSymbol\n+ 0.304415 0.304415 DrawErrorbar\n+EndStack\n+\n+(20) StartStack\n+ 1.000000 0.021502 (G) ShowSymbol\n+ 1.000000 0.027646 (C) ShowSymbol\n+ 1.000000 0.036861 (A) ShowSymbol\n+ 1.000000 0.064506 (T) ShowSymbol\n+ 0.150515 0.158545 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.000000 (G) ShowSymbol\n+ 1.000000 0.015671 (C) ShowSymbol\n+ 1.000000 0.282073 (A) ShowSymbol\n+ 1.000000 0.470122 (T) ShowSymbol\n+ 0.247172 0.247172 DrawErrorbar\n+EndStack\n+\n+() StartStack\n+ 1.000000 0.022864 (G) ShowSymbol\n+ 1.000000 0.040011 (C) ShowSymbol\n+ 1.000000 0.080022 (A) ShowSymbol\n+ 1.000000 0.137181 (T) ShowSymbol\n+ 0.209363 0.209363 DrawErrorbar\n+EndStack\n+\n+EndLine\n+\n+EndLogo\n+\n+\n+%%EOF\n+\n'
diff -r 000000000000 -r 9071e359b9a3 tools/rgenetics/test.png
Binary file tools/rgenetics/test.png has changed
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/bam_to_sam.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/bam_to_sam.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+"""
+Converts BAM data to sorted SAM data.
+usage: bam_to_sam.py [options]
+   --input1: BAM file to be converted
+   --output1: output dataset in SAM format
+"""
+
+import optparse, os, sys, subprocess, tempfile, shutil
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+#from galaxy import util
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '', '--input1', dest='input1', help='The input BAM dataset' )
+    parser.add_option( '', '--output1', dest='output1', help='The output SAM dataset' )
+    parser.add_option( '', '--header', dest='header', action='store_true', default=False, help='Write SAM Header' )
+    ( options, args ) = parser.parse_args()
+
+    # output version # of tool
+    try:
+        tmp = tempfile.NamedTemporaryFile().name
+        tmp_stdout = open( tmp, 'wb' )
+        proc = subprocess.Popen( args='samtools 2>&1', shell=True, stdout=tmp_stdout )
+        tmp_stdout.close()
+        returncode = proc.wait()
+        stdout = None
+        for line in open( tmp_stdout.name, 'rb' ):
+            if line.lower().find( 'version' ) >= 0:
+                stdout = line.strip()
+                break
+        if stdout:
+            sys.stdout.write( 'Samtools %s\n' % stdout )
+        else:
+            raise Exception
+    except:
+        sys.stdout.write( 'Could not determine Samtools version\n' )
+
+    tmp_dir = tempfile.mkdtemp()
+
+    try:
+        # exit if input file empty
+        if os.path.getsize( options.input1 ) == 0:
+            raise Exception, 'Initial BAM file empty'
+        # Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created. This command
+        # may also create temporary files <out.prefix>.%d.bam when the whole alignment cannot be fitted
+        # into memory ( controlled by option -m ).
+        tmp_sorted_aligns_file = tempfile.NamedTemporaryFile( dir=tmp_dir )
+        tmp_sorted_aligns_file_base = tmp_sorted_aligns_file.name
+        tmp_sorted_aligns_file_name = '%s.bam' % tmp_sorted_aligns_file.name
+        tmp_sorted_aligns_file.close()
+        command = 'samtools sort %s %s' % ( options.input1, tmp_sorted_aligns_file_base )
+        tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+        tmp_stderr = open( tmp, 'wb' )
+        proc = subprocess.Popen( args=command, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open( tmp, 'rb' )
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+        # exit if sorted BAM file empty
+        if os.path.getsize( tmp_sorted_aligns_file_name) == 0:
+            raise Exception, 'Intermediate sorted BAM file empty'
+    except Exception, e:
+        #clean up temp files
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+        stop_err( 'Error sorting alignments from (%s), %s' % ( options.input1, str( e ) ) )
+
+
+    try:
+        # Extract all alignments from the input BAM file to SAM format ( since no region is specified, all the alignments will be extracted ).
+        if options.header:
+            view_options = "-h"
+        else:
+            view_options = ""
+        command = 'samtools view %s -o %s %s' % ( view_options, options.output1, tmp_sorted_aligns_file_name )
+        tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+        tmp_stderr = open( tmp, 'wb' )
+        proc = subprocess.Popen( args=command, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open( tmp, 'rb' )
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        #clean up temp files
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+        stop_err( 'Error extracting alignments from (%s), %s' % ( options.input1, str( e ) ) )
+    #clean up temp files
+    if os.path.exists( tmp_dir ):
+        shutil.rmtree( tmp_dir )
+    # check that there are results in the output file
+    if os.path.getsize( options.output1 ) > 0:
+        sys.stdout.write( 'BAM file converted to SAM' )
+    else:
+        stop_err( 'The output file is empty, there may be an error with your input file.' )
+
+if __name__=="__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/bam_to_sam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/bam_to_sam.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,60 @@
+<tool id="bam_to_sam" name="BAM-to-SAM" version="1.0.3">
+  <requirements>
+    <requirement type="package">samtools</requirement>
+  </requirements>
+  <description>converts BAM format to SAM format</description>
+  <command interpreter="python">
+    bam_to_sam.py
+      --input1=$input1
+      --output1=$output1
+      $header
+  </command>
+  <inputs>
+    <param name="input1" type="data" format="bam" label="BAM File to Convert" />
+    <param name="header" type="boolean" truevalue="--header" falsevalue="" checked="False" label="Include header in output" />
+  </inputs>
+  <outputs>
+    <data format="sam" name="output1" label="${tool.name} on ${on_string}: converted SAM" />
+  </outputs>
+  <tests>
+    <test>
+      <!--
+      Bam-to-Sam command:
+      samtools view -o bam_to_sam_out1.sam test-data/bam_to_sam_in1.bam
+      bam_to_sam_in1.bam can be created from bam_to_sam_in1.sam
+      -->
+      <param name="input1" value="bam_to_sam_in1.bam" ftype="bam" />
+      <param name="header" value="" />
+      <output name="output1" file="bam_to_sam_out1.sam" sorted="True" />
+    </test>
+    <test>
+      <!--
+      Bam-to-Sam command:
+      samtools view -o bam_to_sam_out2.sam test-data/bam_to_sam_in2.bam
+      bam_to_sam_in2.bam can be created from bam_to_sam_in2.sam
+      -->
+      <param name="input1" value="bam_to_sam_in2.bam" ftype="bam" />
+      <param name="header" value="" />
+      <output name="output1" file="bam_to_sam_out2.sam" sorted="True" />
+    </test>
+    <test>
+      <!--
+      Bam-to-Sam command:
+      samtools view -h -o bam_to_sam_out3.sam test-data/bam_to_sam_in1.bam
+      bam_to_sam_in1.bam can be created from bam_to_sam_in1.sam
+      -->
+      <param name="input1" value="bam_to_sam_in1.bam" ftype="bam" />
+      <param name="header" value="--header" />
+      <output name="output1" file="bam_to_sam_out3.sam" sorted="True" lines_diff="6" /><!-- header param not working in func tests so won't produce correct 6-line header (fine in browser) -->
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool uses the SAMTools_ toolkit to produce a SAM file from a BAM file.
+
+.. _SAMTools: http://samtools.sourceforge.net/samtools.shtml
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/pileup_interval.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/pileup_interval.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+"""
+Condenses pileup format into ranges of bases.
+
+usage: %prog [options]
+   -i, --input=i: Input pileup file
+   -o, --output=o: Output file of condensed intervals
+   -c, --coverage=c: Coverage
+   -f, --format=f: Pileup format
+   -b, --base=b: Base to select
+   -s, --seq_column=s: Sequence column
+   -l, --loc_column=l: Base location column
+   -r, --base_column=r: Reference base column
+   -C, --cvrg_column=C: Coverage column
+"""
+
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+import sys
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def __main__():
+    strout = ''
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    coverage = int(options.coverage)
+    fin = file(options.input, 'r')
+    fout = file(options.output, 'w')
+    inLine = fin.readline()
+    if options.format == 'six':
+        seqIndex = 0
+        locIndex = 1
+        baseIndex = 2
+        covIndex = 3
+    elif options.format == 'ten':
+        seqIndex = 0
+        locIndex = 1
+        if options.base == 'first':
+            baseIndex = 2
+        else:
+            baseIndex = 3
+        covIndex = 7
+    else:
+        seqIndex = int(options.seq_column) - 1
+        locIndex = int(options.loc_column) - 1
+        baseIndex = int(options.base_column) - 1
+        covIndex = int(options.cvrg_column) - 1
+    lastSeq = ''
+    lastLoc = -1
+    locs = []
+    startLoc = -1
+    bases = []
+    while inLine.strip() != '':
+        lineParts = inLine.split('\t')
+        try:
+            seq, loc, base, cov = lineParts[seqIndex], int(lineParts[locIndex]), lineParts[baseIndex], int(lineParts[covIndex])
+        except IndexError, ei:
+            if options.format == 'ten':
+                stop_err( 'It appears that you have selected 10 columns while your file has 6. Make sure that the number of columns you specify matches the number in your file.\n' + str( ei ) )
+            else:
+                stop_err( 'There appears to be something wrong with your column index values.\n' + str( ei ) )
+        except ValueError, ev:
+            if options.format == 'six':
+                stop_err( 'It appears that you have selected 6 columns while your file has 10. Make sure that the number of columns you specify matches the number in your file.\n' + str( ev ) )
+            else:
+                stop_err( 'There appears to be something wrong with your column index values.\n' + str( ev ) )
+#        strout += str(startLoc) + '\n'
+#        strout += str(bases) + '\n'
+#        strout += '%s\t%s\t%s\t%s\n' % (seq, loc, base, cov)
+        if loc == lastLoc+1 or lastLoc == -1:
+            if cov >= coverage:
+                if seq == lastSeq or lastSeq == '':
+                    if startLoc == -1:
+                        startLoc = loc
+                    locs.append(loc)
+                    bases.append(base)
+                else:
+                    if len(bases) > 0:
+                        fout.write('%s\t%s\t%s\t%s\n' % (lastSeq, startLoc-1, lastLoc, ''.join(bases)))
+                    startLoc = loc
+                    locs = [loc]
+                    bases = [base]
+            else:
+                if len(bases) > 0:
+                    fout.write('%s\t%s\t%s\t%s\n' % (lastSeq, startLoc-1, lastLoc, ''.join(bases)))
+                startLoc = -1
+                locs = []
+                bases = []
+        else:
+            if len(bases) > 0:
+                fout.write('%s\t%s\t%s\t%s\n' % (lastSeq, startLoc-1, lastLoc, ''.join(bases)))
+            if cov >= coverage:
+                startLoc = loc
+                locs = [loc]
+                bases = [base]
+            else:
+                startLoc = -1
+                locs = []
+                bases = []
+        lastSeq = seq
+        lastLoc = loc
+        inLine = fin.readline()
+    if len(bases) > 0:
+        fout.write('%s\t%s\t%s\t%s\n' % (lastSeq, startLoc-1, lastLoc, ''.join(bases)))
+    fout.close()
+    fin.close()
+    
+#    import sys
+#    strout += file(fout.name,'r').read()
+#    sys.stderr.write(strout)
+
+if __name__ == "__main__" : __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/pileup_interval.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/pileup_interval.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,189 @@
+<tool id="pileup_interval" name="Pileup-to-Interval" version="1.0.0">
+  <description>condenses pileup format into ranges of bases</description>
+  <requirements>
+    <requirement type="package">samtools</requirement>
+  </requirements>
+  <command interpreter="python">
+    pileup_interval.py 
+      --input=$input 
+      --output=$output 
+      --coverage=$coverage
+      --format=$format_type.format
+      #if $format_type.format == "ten":
+       --base=$format_type.which_base
+       --seq_column="None"
+       --loc_column="None"
+       --base_column="None"
+       --cvrg_column="None"
+      #elif $format_type.format == "manual":
+       --base="None"
+       --seq_column=$format_type.seq_column
+       --loc_column=$format_type.loc_column
+       --base_column=$format_type.base_column
+       --cvrg_column=$format_type.cvrg_column
+      #else:
+       --base="None"
+       --seq_column="None"
+       --loc_column="None"
+       --base_column="None"
+       --cvrg_column="None"
+      #end if
+  </command>
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Choose a pileup file to condense:" />
+    <conditional name="format_type">
+      <param name="format" type="select" label="which contains:" help="See &quot;Types of pileup datasets&quot; below for examples">
+        <option value="six" selected="true">Pileup with six columns (simple)</option>
+        <option value="ten">Pileup with ten columns (with consensus)</option>
+        <option value="manual">Set columns manually</option>
+      </param>
+      <when value="six" />
+      <when value="ten">
+        <param name="which_base" type="select" label="Which base do you want to concatenate">
+          <option value="first" selected="true">Reference base (first)</option>
+          <option value="second">Consensus base (second)</option>
+        </param>
+      </when>
+      <when value="manual">
+        <param name="seq_column" label="Select column with sequence name" type="data_column" numerical="false" data_ref="input" />
+        <param name="loc_column" label="Select column with base location" type="data_column" numerical="false" data_ref="input" />
+        <param name="base_column" label="Select column with base to concatenate" type="data_column" numerical="false" data_ref="input" />
+        <param name="cvrg_column" label="Select column with coverage" type="data_column" numerical="true" data_ref="input" />
+      </when>
+    </conditional>
+    <param name="coverage" type="integer" value="3" label="Do not report bases with coverage less than:" />
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="pileup_interval_in1.tabular" />
+      <param name="format" value="six" />
+      <param name="coverage" value="3" />
+      <output name="output" file="pileup_interval_out1.tabular" />
+    </test>
+    <test>
+      <param name="input" value="pileup_interval_in2.tabular" />
+      <param name="format" value="ten" />
+      <param name="which_base" value="first" />
+      <param name="coverage" value="3" />
+      <output name="output" file="pileup_interval_out2.tabular" />
+    </test>
+    <test>
+      <param name="input" value="pileup_interval_in2.tabular" />
+      <param name="format" value="manual" />
+      <param name="seq_column" value="1" />
+      <param name="loc_column" value="2" />
+      <param name="base_column" value="3" />
+      <param name="cvrg_column" value="8" />
+      <param name="coverage" value="3" />
+      <output name="output" file="pileup_interval_out2.tabular" />
+    </test>
+  </tests> 
+  <help>
+    
+**What it does**
+
+Reduces the size of a result set by taking a pileup file and producing a condensed version that shows consecutive runs of bases meeting the coverage criterion. The tool works on the six- and ten-column pileup formats produced by the *samtools pileup* command; column assignments can also be set manually for other tabular layouts.
+
+--------
+
+**Types of pileup datasets**
+
+The description of the pileup format below is largely based on information from the SAMTools_ documentation page. The 6- and 10-column variants are described below.
+
+.. _SAMTools: http://samtools.sourceforge.net/pileup.shtml
+
+**Six column pileup**::
+
+    1    2  3  4        5        6
+ ---------------------------------   
+ chrM  412  A  2       .,       II
+ chrM  413  G  4     ..t,     IIIH
+ chrM  414  C  4     ...a     III2
+ chrM  415  C  4     TTTt     III7
+   
+where::
+
+ Column Definition
+ ------ ----------------------------
+      1 Chromosome
+      2 Position (1-based)
+      3 Reference base at that position
+      4 Coverage (# reads aligning over that position)
+      5 Bases within reads covering this position (see Galaxy wiki for more info)
+      6 Quality values (phred33 scale, see Galaxy wiki for more)
+       
+**Ten column pileup**
+
+The `ten-column`__ pileup incorporates additional consensus information generated with the *-c* option of the *samtools pileup* command::
+
+
+    1    2  3  4   5   6   7   8       9       10
+ ------------------------------------------------
+ chrM  412  A  A  75   0  25  2       .,       II
+ chrM  413  G  G  72   0  25  4     ..t,     IIIH
+ chrM  414  C  C  75   0  25  4     ...a     III2
+ chrM  415  C  T  75  75  25  4     TTTt     III7
+
+where::
+
+  Column Definition
+ ------- ----------------------------
+       1 Chromosome
+       2 Position (1-based)
+       3 Reference base at that position
+       4 Consensus bases
+       5 Consensus quality
+       6 SNP quality
+       7 Maximum mapping quality
+       8 Coverage (# reads aligning over that position)
+       9 Bases within reads covering this position (see Galaxy wiki for more info)
+      10 Quality values (phred33 scale, see Galaxy wiki for more)
+
+
+.. __: http://samtools.sourceforge.net/cns0.shtml
+
+------
+
+**The output format**
+
+The output file condenses the information in the pileup file so that consecutive bases are listed together as sequences. The starting and ending points of the sequence range are listed, with the starting value converted to a 0-based value. 
+
+Given the following input with minimum coverage set to 3::
+
+    1    2  3  4        5        6
+ ---------------------------------   
+ chr1  112  G  3     ..Ta     III6
+ chr1  113  T  2     aT..     III5
+ chr1  114  A  5     ,,..     IIH2
+ chr1  115  C  4      ,.,      III
+ chrM  412  A  2       .,       II
+ chrM  413  G  4     ..t,     IIIH
+ chrM  414  C  4     ...a     III2
+ chrM  415  C  4     TTTt     III7
+ chrM  490  T  3        a        I

+the following would be the output::

+    1    2    3  4
+ -------------------
+ chr1  111  112  G
+ chr1  113  115  AC
+ chrM  412  415  GCC
+ chrM  489  490  T
+
+where::
+
+  Column Definition
+ ------- ----------------------------
+       1 Chromosome
+       2 Starting position (0-based)
+       3 Ending position (1-based)
+       4 Sequence of bases

+  </help>
+</tool>
+
+
b
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/pileup_parser.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/pileup_parser.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,121 @@
+#! /usr/bin/perl -w
+
+use strict;
+use POSIX;
+
+
+die "Usage: pileup_parser.pl <in_file> <ref_base_column> <read_bases_column> <base_quality_column> <coverage column> <qv cutoff> <coverage cutoff> <SNPs only?> <output bed?> <coord_column> <out_file> <total_diff> <print_qual_bases>\n" unless @ARGV == 13;
+
+my $in_file = $ARGV[0];
+my $ref_base_column = $ARGV[1]-1; # 1 based
+my $read_bases_column = $ARGV[2]-1; # 1 based
+my $base_quality_column = $ARGV[3]-1; # 1 based
+my $cvrg_column = $ARGV[4]-1; # 1 based
+my $quality_cutoff = $ARGV[5]; # phred scale integer
+my $cvrg_cutoff = $ARGV[6]; # unsigned integer
+my $SNPs_only = $ARGV[7]; # set to "Yes" to print only positions with SNPs; set to "No" to print everything
+my $bed = $ARGV[8]; #set to "Yes" to convert coordinates to bed format (0-based start, 1-based end); set to "No" to leave as is
+my $coord_column = $ARGV[9]-1; #1 based 
+my $out_file = $ARGV[10];
+my $total_diff = $ARGV[11]; # set to "Yes" to print total number of deviant bases
+my $print_qual_bases = $ARGV[12]; #set to "Yes" to print quality and read base columns
+
+my $invalid_line_counter = 0;
+my $first_skipped_line = "";
+my %SNPs = ('A',0,'T',0,'C',0,'G',0);
+my $above_qv_bases = 0;
+my $SNPs_exist = 0;
+my $out_string = "";
+my $diff_count = 0;
+
+open (IN, "<$in_file") or die "Cannot open $in_file $!\n";
+open (OUT, ">$out_file") or die "Cannot open $out_file $!\n";
+
+while (<IN>) {
+ chop;
+ next if m/^\#/;
+ my @fields = split /\t/;
+ next if $fields[ $ref_base_column ] eq "*"; # skip indel lines
+  my $read_bases   = $fields[ $read_bases_column ];
+  die "Coverage column" . ($cvrg_column+1) . " contains non-numeric values. Check your input parameters as well as format of input dataset." if ( not isdigit $fields[ $cvrg_column ] );
+    next if $fields[ $cvrg_column ] < $cvrg_cutoff;
+ my $base_quality = $fields[ $base_quality_column ];
+ if ($read_bases =~ m/[\$\^\+-]/) {
+ $read_bases =~ s/\^.//g; #removing the start of the read segment mark
+ $read_bases =~ s/\$//g; #removing end of the read segment mark
+ while ($read_bases =~ m/[\+-]{1}(\d+)/g) {
+ my $indel_len = $1;
+ $read_bases =~ s/[\+-]{1}$indel_len.{$indel_len}//; # remove indel info from read base field
+ }
+ }
+ if ( length($read_bases) != length($base_quality) ) {
+        $first_skipped_line = $. if $first_skipped_line eq "";
+        ++$invalid_line_counter;
+        next;
+ }
+ # after removing read block and indel data the length of the read_bases
+ # field should be identical to the length of the base_quality field
+
+ my @bases = split //, $read_bases;
+ my @qv    = split //, $base_quality;
+
+ for my $base ( 0 .. @bases - 1 ) {
+ if ( ord( $qv[ $base ] ) - 33 >= $quality_cutoff and $bases[ $base ] ne '*')
+ {
+ ++$above_qv_bases;
+
+ if ( $bases[ $base ] =~ m/[ATGC]/i )
+ {
+ $SNPs_exist = 1;
+ $SNPs{ uc( $bases[ $base ] ) } += 1;
+ $diff_count += 1;
+ } elsif ( $bases[ $base ] =~ m/[\.,]/ ) {
+     $SNPs{ uc( $fields[ $ref_base_column ] ) } += 1;
+     }  
+ }
+ } 
+
+ if ($bed eq "Yes") {
+        my $start = $fields[ $coord_column ] - 1;
+        my $end   = $fields[ $coord_column ];
+        $fields[ $coord_column ] = "$start\t$end";
+ } 
+
+ if ($print_qual_bases ne "Yes") {
+        $fields[ $base_quality_column ] = "";
+        $fields[ $read_bases_column ] = "";
+ }
+        
+
+ $out_string = join("\t", @fields); # \t$read_bases\t$base_quality";
+ foreach my $SNP (sort keys %SNPs) {
+ $out_string .= "\t$SNPs{$SNP}";
+ }
+
+ if ($total_diff eq "Yes") {
+    $out_string .= "\t$above_qv_bases\t$diff_count\n";
+ } else {
+    $out_string .= "\t$above_qv_bases\n";
+ }
+
+ $out_string =~ s/\t+/\t/g;
+
+ if ( $SNPs_only eq "Yes" ) {
+ print OUT $out_string if $SNPs_exist == 1;
+ } else {
+ print OUT $out_string;
+ }
+
+
+ %SNPs = ();
+ %SNPs = ('A',0,'T',0,'C',0,'G',0);
+ $above_qv_bases = 0;
+ $SNPs_exist = 0;
+ $diff_count = 0;
+
+
+}
+
+print "Skipped $invalid_line_counter invalid line(s) beginning with line $first_skipped_line\n" if $invalid_line_counter > 0;
+close IN;
+close OUT;
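The trickiest step above is cleaning the read-bases string before the per-base loop: read-start markers (^ plus one mapping-quality character), read-end markers ($), and indel runs (+N/-N followed by N bases) must be stripped so that the bases line up one-to-one with the quality string. A Python rendering of that cleanup, offered as a sketch rather than part of the tool::

 import re

 def clean_read_bases(read_bases):
     # '^' is followed by one mapping-quality character: drop both (read start)
     read_bases = re.sub(r'\^.', '', read_bases)
     # '$' marks the end of a read segment
     read_bases = read_bases.replace('$', '')
     # '+N' or '-N' is followed by N inserted/deleted bases: drop the whole run
     indel = re.compile(r'[+-](\d+)')
     m = indel.search(read_bases)
     while m:
         n = int(m.group(1))
         read_bases = read_bases[:m.start()] + read_bases[m.end() + n:]
         m = indel.search(read_bases)
     return read_bases

 # e.g. a read-start marker and a 2-base insertion are removed:
 print(clean_read_bases('^I..+2AG,,$'))   # -> '..,,'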
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/pileup_parser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/pileup_parser.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,387 @@
+<tool id="pileup_parser" name="Filter pileup" version="1.0.2">
+  <description>on coverage and SNPs</description>
+  <command interpreter="perl">
+    #if   $pileup_type.type_select == "six"    #pileup_parser.pl $input "3" "5" "6" "4" $qv_cutoff $cvrg_cutoff $snps_only $interval "2" $out_file1 $diff $qc_base
+    #elif $pileup_type.type_select == "ten"    #pileup_parser.pl $input "3" "9" "10" "8" $qv_cutoff $cvrg_cutoff $snps_only $interval "2" $out_file1 $diff $qc_base
+    #elif $pileup_type.type_select == "manual" #pileup_parser.pl $input $pileup_type.ref_base_column $pileup_type.read_bases_column $pileup_type.read_qv_column $pileup_type.cvrg_column $qv_cutoff $cvrg_cutoff $snps_only $interval $pileup_type.coord_column $out_file1 $diff $qc_base
+    #end if#
+  </command>
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Select dataset"/>
+    <conditional name="pileup_type">
+        <param name="type_select" type="select" label="which contains" help="See &quot;Types of pileup datasets&quot; below for examples">
+            <option value="six" selected="true">Pileup with six columns (simple)</option>
+            <option value="ten">Pileup with ten columns (with consensus)</option>
+            <option value="manual">Set columns manually</option>
+        </param>
+        <when value="manual">
+            <param name="ref_base_column" label="Select column with reference base" type="data_column" numerical="false" data_ref="input" />
+            <param name="read_bases_column" label="Select column with read bases" type="data_column" numerical="false" data_ref="input" help="something like this: ..,a.."/>
+            <param name="read_qv_column" label="Select column with base qualities" type="data_column" numerical="false" data_ref="input" help="something like this: IIIGIAI"/>
+            <param name="cvrg_column" label="Select column with coverage" type="data_column" numerical="true" data_ref="input" />
+            <param name="coord_column" label="Select coordinate column" type="data_column" numerical="true" data_ref="input" />
+        </when>
+        <when value="six">
+        </when>
+        <when value="ten">
+        </when>
+    </conditional>
+    <param name="qv_cutoff" label="Do not consider read bases with quality lower than" type="integer" value="20" help="No variants with quality below this value will be reported"/>
+    <param name="cvrg_cutoff" label="Do not report positions with coverage lower than" type="integer" value="3" help="Pileup lines with coverage lower than this value will be skipped"/>
+    <param name="snps_only" label="Only report variants?" type="select" help="See &quot;Examples 1 and 2&quot; below for explanation">
+        <option value="No">No</option>
+        <option value="Yes" selected="true">Yes</option>
+    </param>
+    <param name="interval" label="Convert coordinates to intervals?" type="select" help="See &quot;Output format&quot; below for explanation">
+        <option value="No" selected="true">No</option>
+        <option value="Yes">Yes</option>
+    </param>
+    <param name="diff" label="Print total number of differences?" type="select" help="See &quot;Example 3&quot; below for explanation">
+        <option value="No" selected="true">No</option>
+        <option value="Yes">Yes</option>
+    </param>
+    <param name="qc_base" label="Print quality and base string?" type="select" help="See &quot;Example 4&quot; below for explanation">
+        <option value="No">No</option>
+        <option value="Yes" selected="true">Yes</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1">
+        <change_format>
+            <when input="interval" value="Yes" format="interval" />
+        </change_format>
+   </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="pileup_parser.6col.pileup"/>
+      <output name="out_file1" file="pileup_parser.6col.20-3-yes-yes.pi
[... remainder of the tests and the beginning of the help section are truncated in this dump ...]
ts (if Convert coordinates to intervals? is set to yes)
+
+Note that in this case the coordinates of SNPs were converted to intervals, where the start coordinate is 0-based and the end coordinate is 1-based, following the UCSC Table Browser convention.
+
+Although three positions have variants in the original file (413, 414, and 415), only 413 and 415 are reported because the quality values associated with these two SNPs are above the threshold of 20. In the case of 414 the **a** allele has a quality value of 17 ( ord("2")-33 ), and is therefore not reported. Note that five columns have been added to each of the reported lines::
+
+  chrM  413  G  4  ..t,  IIIH  0  0  2  1  3
+
+Here, there is one variant, **t**. Because the fourth column represents **T** counts, it is incremented by 1. The last column shows that at this position, three reads have bases above the quality threshold of 20.
+
+-----
+
+**Example 1**: Just variants
+
+In this mode, the tool only outputs the lines from the input datasets where at least one read contains a sequence variant with quality above the threshold set by the **Do not consider read bases with quality lower than** option. For example, suppose one has a pileup dataset like the following::
+
+ chrM  412  A  2       .,       II
+ chrM  413  G  4     ..t,     III2
+ chrM  414  C  4     ..Ta     III2
+ chrM  415  C  4     TTTt     III7
+
+To call all variants (with no restriction by coverage) with quality above a phred value of 20, we will need to set the parameters as follows:
+
+.. image:: ./static/images/pileup_parser_help1.png
+
+Running the tool with these parameters will return::
+
+ chrM  413  G  4  ..t,  IIIH  0  0  0  1  3
+ chrM  414  C  4  ..Ta  III2  0  2  0  1  3
+ chrM  415  C  4  TTTt  III7  0  0  0  4  4
+
+**Note** that position 414 is not reported because the *a* variant has an associated quality value of 17 (because ord('2')-33 = 17), which is below the phred threshold of 20 set by the **Do not consider read bases with quality lower than** parameter.
+
+-----
+
+**Example 2**: Report everything
+
+In addition to calling variants, it is often useful to know the quality adjusted coverage. Running the tool with these parameters:
+
+.. image:: ./static/images/pileup_parser_help2.png
+
+will report everything from the original file::
+
+ chrM  412  A  2  .,    II    2  0  0  0  2
+ chrM  413  G  4  ..t,  III2  0  0  2  1  3
+ chrM  414  C  4  ..Ta  III2  0  2  0  1  3
+ chrM  415  C  4  TTTt  III7  0  0  0  4  4
+
+Here, you can see that although the total coverage at position 414 is 4 (column 4), the quality adjusted coverage is 3 (last column). This is because only three out of four reads have bases with quality above the set threshold of 20 (the actual qualities are III2 or, after conversion, 40, 40, 40, 17).
+
+One can use the last column of this dataset to filter out (using Galaxy's **Filter** tool) positions where the quality adjusted coverage (last column) is below a set threshold.
+
+------
+
+**Example 3**: Report everything and print total number of differences
+
+If you set **Print total number of differences?** to **Yes**, the tool will print an additional column with the total number of reads where a deviant base is above the quality threshold. Setting parameters like this:
+
+.. image:: ./static/images/pileup_parser_help3.png
+
+will produce this::
+
+ chrM  412  A  2  .,    II    2  0  0  0  2  0
+ chrM  413  G  4  ..t,  III2  0  0  2  1  3  1
+ chrM  414  C  4  ..Ta  III2  0  2  0  1  3  1
+ chrM  415  C  4  TTTt  III7  0  0  0  4  4  0
+
+-----
+
+**Example 4**: Report everything, print total number of differences, and ignore qualities and read bases
+
+Setting **Print quality and base string?** to **No** as shown here:
+
+.. image:: ./static/images/pileup_parser_help4.png
+
+will produce this::
+
+ chrM  412  A  2  2  0  0  0  2  0
+ chrM  413  G  4  0  0  2  1  3  1
+ chrM  414  C  4  0  2  0  1  3  1
+ chrM  415  C  4  0  0  0  4  4  0
+
+</help>
+</tool>
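The quality arithmetic used throughout these examples is plain phred+33 decoding, which is easy to verify directly; a quick sketch using the III2 quality string from Example 2::

 # phred+33 decoding of the quality string 'III2'
 quals = [ord(c) - 33 for c in 'III2']
 print(quals)                        # [40, 40, 40, 17]
 print(sum(q >= 20 for q in quals))  # quality-adjusted coverage: 3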
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/sam2interval.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/sam2interval.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+import sys
+import optparse
+import re
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():
+    usage = """%prog [options]
+    
+options (listed below) default to 'None' if omitted
+    """
+    parser = optparse.OptionParser(usage=usage)
+
+    parser.add_option(
+        '-f','--input_sam_file',
+        metavar="INPUT_SAM_FILE",
+        dest='input_sam',
+        default = False,
+        help='Name of the SAM file to be filtered. STDIN is default')
+            
+    parser.add_option(
+        '-c','--flag_column',
+        dest='flag_col',
+        default = '2',
+        help='Column containing SAM bitwise flag. 1-based')
+        
+    parser.add_option(
+        '-s','--start_column',
+        dest='start_col',
+        default = '4',
+        help='Column containing position. 1-based')
+
+    parser.add_option(
+        '-g','--cigar_column',
+        dest='cigar_col',
+        default = '6',
+        help='Column containing CIGAR or extended CIGAR string')
+
+    parser.add_option(
+        '-r','--ref_column',
+        dest='ref_col',
+        default = '3',
+        help='Column containing name of the reference sequence coordinate. 1-based')
+        
+    parser.add_option(
+        '-e','--read_column',
+        dest='read_col',
+        default = '1',
+        help='Column containing read name. 1-based')
+
+    parser.add_option(
+        '-p','--print_all',
+        dest='prt_all',
+        action='store_true',
+        default = False,
+        help='Print coordinates and original SAM?')
+    
+    options, args = parser.parse_args()
+
+    if options.input_sam:
+        infile = open ( options.input_sam, 'r')
+    else:
+        infile = sys.stdin
+
+    # CIGAR operations this tool counts toward the end coordinate: M, N, D and P
+    cigar = re.compile( '\d+M|\d+N|\d+D|\d+P' )
+
+    print '#chrom\tstart\tend\tstrand\tread_name' # provide a (partial) header so that strand is automatically set in metadata
+
+    for line in infile:
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
+            fields = line.split( '\t' )
+            start = int( fields[ int( options.start_col ) - 1 ] ) - 1
+            end = 0
+            for op in cigar.findall( fields[ int( options.cigar_col) - 1 ] ):
+                end += int( op[ 0:len( op ) - 1 ] )
+                
+            strand = '+' 
+            if bool( int( fields[ int( options.flag_col ) - 1 ] ) & 0x0010 ):
+                strand = '-'
+            read_name = fields[ int( options.read_col ) - 1 ]
+            ref_name  = fields[ int( options.ref_col ) - 1 ]
+            
+            if ref_name != '*':
+                # Do not print lines with unmapped reads that contain '*' instead of chromosome name        
+                if options.prt_all: 
+                    print '%s\t%s\t%s\t%s\t%s' % (ref_name, str(start), str(end+start), strand, line)
+                else:
+                    print '%s\t%s\t%s\t%s\t%s' % (ref_name, str(start), str(end+start), strand, read_name)
+
+if __name__ == "__main__": main()
+
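The interval end above is obtained by adding the lengths of the CIGAR operations the script matches (M, N, D and P) to the 0-based start. A compact sketch of that computation (the regex mirrors the one in the script; names are illustrative)::

 import re

 # mirrors the pattern in sam2interval.py: M, N, D and P operations
 REF_OPS = re.compile(r'\d+[MNDP]')

 def interval_from_sam(pos_1based, cigar):
     """Return (start, end) with 0-based start, 1-based end, like the tool."""
     start = pos_1based - 1
     span = sum(int(op[:-1]) for op in REF_OPS.findall(cigar))
     return start, start + span

 # e.g. r001 at pos 7 with CIGAR 8M2I4M1D3M -> (6, 22), matching the help below
 print(interval_from_sam(7, '8M2I4M1D3M'))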
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/sam2interval.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/sam2interval.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,72 @@
+<tool id="sam2interval" name="Convert SAM" version="1.0.1">
+  <description>to interval</description>
+  <command interpreter="python">sam2interval.py --input_sam_file=$input1 $print_all > $out_file1
+  </command>
+  <inputs>
+    <param format="sam" name="input1" type="data" label="Select dataset to convert"/>
+    <param name="print_all" type="select" label="Print all?" help="Do you want to retain original SAM fields? See example below.">
+        <option value="-p">Yes</option>
+        <option value="">No</option>
+    </param>
+  </inputs>
+ <outputs>
+    <data format="interval" name="out_file1" label="Converted Interval" />
+  </outputs>
+<tests>
+    <test>          
+        <param name="input1" value="sam_bioinf_example.sam" ftype="sam"/>
+        <param name="print_all" value="Yes"/>
+        <output name="out_file1" file="sam2interval_printAll.dat" ftype="interval"/>
+    </test>
+    <test>          
+        <param name="input1" value="sam_bioinf_example.sam" ftype="sam"/>
+        <param name="print_all" value="No"/>
+        <output name="out_file1" file="sam2interval_noprintAll.dat" ftype="interval"/>
+    </test>
+    <test>
+        <param name="input1" value="sam2interval-test3.sam" ftype="sam"/>
+        <param name="print_all" value="No"/>
+        <output name="out_file1" file="sam2interval_with_unmapped_reads_noprintAll.dat" ftype="interval"/>
+    </test>
+
+</tests>
+  <help>
+
+**What it does**
+
+Converts positional information from a SAM dataset into interval format with a 0-based start and a 1-based end. The CIGAR string of the SAM record is used to compute the end coordinate.
+
+-----
+
+**Example**
+
+Converting the following dataset::
+
+ r001 163 ref  7 30 8M2I4M1D3M = 37  39 TTAGATAAAGGATACTA *
+ r002   0 ref  9 30 3S6M1P1I4M *  0   0 AAAAGATAAGGATA    *
+ r003   0 ref  9 30       5H6M *  0   0 AGCTAA            * NM:i:1
+ r004   0 ref 16 30    6M14N5M *  0   0 ATAGCTTCAGC       *
+ r003  16 ref 29 30       6H5M *  0   0 TAGGC             * NM:i:0
+ r001  83 ref 37 30         9M =  7 -39 CAGCGCCAT         *
+
+into Interval format will produce the following if *Print all?* is set to **Yes**::
+
+ ref  6 22 + r001 163 ref  7 30 8M2I4M1D3M = 37  39 TTAGATAAAGGATACTA *
+ ref  8 19 + r002   0 ref  9 30 3S6M1P1I4M *  0   0 AAAAGATAAGGATA    *
+ ref  8 14 + r003   0 ref  9 30 5H6M       *  0   0 AGCTAA            * NM:i:1
+ ref 15 40 + r004   0 ref 16 30 6M14N5M    *  0   0 ATAGCTTCAGC       *
+ ref 28 33 - r003  16 ref 29 30 6H5M       *  0   0 TAGGC             * NM:i:0
+ ref 36 45 - r001  83 ref 37 30 9M         =  7 -39 CAGCGCCAT         *

+Setting *Print all?* to **No** will generate the following::
+
+ ref  6 22 + r001
+ ref  8 19 + r002
+ ref  8 14 + r003
+ ref 15 40 + r004
+ ref 28 33 - r003
+ ref 36 45 - r001
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/sam_bitwise_flag_filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/sam_bitwise_flag_filter.py Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+# Refactored on 11/13/2010 by Kanwei Li
+
+import sys
+import optparse
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():
+    usage = """%prog [options]
+    
+options (listed below) default to 'None' if omitted
+    """
+    parser = optparse.OptionParser(usage=usage)
+    
+    parser.add_option(
+        '--0x0001','--is_paired',
+        choices = ( '0','1' ),
+        dest='is_paired',
+        metavar="<0|1>",
+        help='The read is paired in sequencing')
+
+    parser.add_option(
+        '--0x0002','--is_proper_pair',
+        choices = ( '0','1' ),
+        metavar="<0|1>",
+        dest='is_proper_pair',
+        help='The read is mapped in a proper pair')
+
+    parser.add_option(
+        '--0x0004','--is_unmapped',
+        choices = ( '0','1' ),
+        metavar="<0|1>",
+        dest='is_unmapped',
+        help='The query sequence itself is unmapped')
+
+    parser.add_option(
+        '--0x0008','--mate_is_unmapped',
+        choices = ( '0','1' ),
+        metavar="<0|1>",
+        dest='mate_is_unmapped',
+        help='The mate is unmapped')
+
+    parser.add_option(
+        '--0x0010','--query_strand',
+        dest='query_strand',
+        metavar="<0|1>",
+        choices = ( '0','1' ),
+        help='Strand of the query: 0 = forward, 1 = reverse.')
+
+    parser.add_option(
+        '--0x0020','--mate_strand',
+        dest='mate_strand',
+        metavar="<0|1>",
+        choices = ('0','1'),
+        help='Strand of the mate: 0 = forward, 1 = reverse.')
+
+    parser.add_option(
+        '--0x0040','--is_first',
+        choices = ( '0','1' ),
+        metavar="<0|1>",
+        dest='is_first',
+        help='The read is the first read in a pair')
+
+    parser.add_option(
+        '--0x0080','--is_second',
+        choices = ( '0','1' ),
+        metavar="<0|1>",
+        dest='is_second',
+        help='The read is the second read in a pair')
+
+    parser.add_option(
+        '--0x0100','--is_not_primary',
+        choices = ( '0','1' ),
+        metavar="<0|1>",
+        dest='is_not_primary',
+        help='The alignment for the given read is not primary')
+
+    parser.add_option(
+        '--0x0200','--is_bad_quality',
+        choices = ( '0','1' ),
+        metavar="<0|1>",
+        dest='is_bad_quality',
+        help='The read fails platform/vendor quality checks')
+
+    parser.add_option(
+        '--0x0400','--is_duplicate',
+        choices = ( '0','1' ),
+        metavar="<0|1>",
+        dest='is_duplicate',
+        help='The read is either a PCR or an optical duplicate')
+        
+    parser.add_option(
+        '-f','--input_sam_file',
+        metavar="INPUT_SAM_FILE",
+        dest='input_sam',
+        default = False,
+        help='Name of the SAM file to be filtered. STDIN is default')
+            
+    parser.add_option(
+        '-c','--flag_column',
+        dest='flag_col',
+        default = '2',
+        help='Column containing SAM bitwise flag. 1-based')
+
+    options, args = parser.parse_args()
+
+    if options.input_sam:
+        infile = open ( options.input_sam, 'r')
+    else:
+        infile = sys.stdin
+        
+    opt_ary = [
+        options.is_paired,
+        options.is_proper_pair,
+        options.is_unmapped,
+        options.mate_is_unmapped,
+        options.query_strand,
+        options.mate_strand,
+        options.is_first,
+        options.is_second,
+        options.is_not_primary,
+        options.is_bad_quality,
+        options.is_duplicate
+    ]
+    
+    opt_map = { '0': False, '1': True }
+    used_indices = [(index, opt_map[opt]) for index, opt in enumerate(opt_ary) if opt is not None]
+    flag_col = int( options.flag_col ) - 1
+    
+    for line in infile:
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
+            fields = line.split( '\t' )
+            flags = int( fields[flag_col] )
+            
+            valid_line = True
+            for index, opt_bool in used_indices:
+                if bool(flags & 0x0001 << index) != opt_bool:
+                    valid_line = False
+                    break
+                    
+            if valid_line:
+                print line
+
+if __name__ == "__main__": main()
+
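Each option above maps to one bit of the SAM FLAG column, and a line is kept only if every requested bit matches; the test is a single shift-and-mask per flag. A small illustration of the same decomposition (a sketch; flag 163 is taken from the r001 record in the example dataset below)::

 # Decompose a SAM FLAG the same way the filter tests it: one bit per property.
 BITS = [
     (0x0001, 'is_paired'),       (0x0002, 'is_proper_pair'),
     (0x0004, 'is_unmapped'),     (0x0008, 'mate_is_unmapped'),
     (0x0010, 'query_strand'),    (0x0020, 'mate_strand'),
     (0x0040, 'is_first'),        (0x0080, 'is_second'),
     (0x0100, 'is_not_primary'),  (0x0200, 'is_bad_quality'),
     (0x0400, 'is_duplicate'),
 ]

 def decompose(flag):
     return dict((name, bool(flag & mask)) for mask, name in BITS)

 # r001 below carries FLAG 163 = 0x0001 | 0x0002 | 0x0020 | 0x0080
 print(decompose(163))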
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/sam_bitwise_flag_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/sam_bitwise_flag_filter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,97 @@
+<tool id="sam_bw_filter" name="Filter SAM" version="1.0.0">
+  <description>on bitwise flag values</description>
+  <parallelism method="basic"></parallelism>
+  <command interpreter="python">
+    sam_bitwise_flag_filter.py  
+      --input_sam_file=$input1
+      --flag_column=2
+      #for $bit in $bits
+       '${bit.flags}=${bit.states}'
+      #end for
+      > $out_file1
+  </command>
+  <inputs>
+    <param format="sam" name="input1" type="data" label="Select dataset to filter"/>
+    <repeat name="bits" title="Flag">
+      <param name="flags" type="select" label="Type">
+        <option value="--0x0001">Read is paired</option>
+        <option value="--0x0002">Read is mapped in a proper pair</option>
+        <option value="--0x0004">The read is unmapped</option>
+        <option value="--0x0008">The mate is unmapped</option>
+        <option value="--0x0010">Read strand</option>
+        <option value="--0x0020">Mate strand</option>
+        <option value="--0x0040">Read is the first in a pair</option>
+        <option value="--0x0080">Read is the second in a pair</option>
+        <option value="--0x0100">The alignment or this read is not primary</option>
+        <option value="--0x0200">The read fails platform/vendor quality checks</option>
+        <option value="--0x0400">The read is a PCR or optical duplicate</option>
+      </param>
+      <param name="states" type="select" display="radio" label="Set the states for this flag">
+         <option value="0">No</option>
+         <option value="1">Yes</option>
+       </param>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="sam" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="sam_bw_filter.sam" ftype="sam"/>
+      <param name="flags" value="Read is mapped in a proper pair"/>
+      <param name="states" value="1"/>
+      <output name="out_file1" file="sam_bw_filter_0002-yes.sam" ftype="sam"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+Allows filtering of SAM datasets on the bitwise flag (the second column). The bits in the flag are defined as follows::
+
+    Bit Info
+ ------ --------------------------------------------------------------------------
+ 0x0001 the read is paired in sequencing, no matter whether it is mapped in a pair
+ 0x0002 the read is mapped in a proper pair (depends on the protocol, normally
+        inferred during alignment)
+ 0x0004 the query sequence itself is unmapped
+ 0x0008 the mate is unmapped
+ 0x0010 strand of the query (0 for forward; 1 for reverse strand)
+ 0x0020 strand of the mate
+ 0x0040 the read is the first read in a pair (see below)
+ 0x0080 the read is the second read in a pair (see below)
+ 0x0100 the alignment is not primary (a read having split hits may
+        have multiple primary alignment records)
+ 0x0200 the read fails platform/vendor quality checks
+ 0x0400 the read is either a PCR duplicate or an optical duplicate
+
+Note the following:
+
+- Flags 0x02, 0x08, 0x20, 0x40 and 0x80 are only meaningful when flag 0x01 is present.
+- If in a read pair the information on which read is the first in the pair is lost in the upstream analysis, flag 0x01 should be set, while 0x40 and 0x80 should both be zero.
+
+-----
+
+**Example**
+
+Suppose the following dataset was generated with the BWA mapper::
+
+ r001 163 ref  7 30 8M2I4M1D3M = 37  39 TTAGATAAAGGATACTA *
+ r002   0 ref  9 30 3S6M1P1I4M *  0   0 AAAAGATAAGGATA    *
+ r003   0 ref  9 30       5H6M *  0   0 AGCTAA            * NM:i:1
+ r004   0 ref 16 30    6M14N5M *  0   0 ATAGCTTCAGC       *
+ r003  16 ref 29 30       6H5M *  0   0 TAGGC             * NM:i:0
+ r001  83 ref 37 30         9M =  7 -39 CAGCGCCAT         *
+
+To select properly mapped pairs, click the **Add new Flag** button and set *Read is mapped in a proper pair* to **Yes**. The following two reads will be returned::
+
+ r001 163 ref  7 30 8M2I4M1D3M = 37  39 TTAGATAAAGGATACTA *
+ r001  83 ref 37 30         9M =  7 -39 CAGCGCCAT         *
+
+For more information, please consult the `SAM format description`__.
+
+.. __: http://www.ncbi.nlm.nih.gov/pubmed/19505943
+
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/sam_merge.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/sam_merge.py Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+"""
+Merges any number of BAM files
+usage: %prog [options]
+    input1
+    output1
+    input2
+    [input3[,input4[,input5[,...]]]]
+"""
+
+import os, subprocess, sys, tempfile
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def __main__():
+    infile =  sys.argv[1]
+    outfile = sys.argv[2]
+    if len( sys.argv ) < 3:
+        stop_err( 'There are not enough files to merge' )
+    filenames = sys.argv[3:]
+    # output version # of tool
+    try:
+        tmp = tempfile.NamedTemporaryFile().name
+        tmp_stdout = open( tmp, 'wb' )
+        proc = subprocess.Popen( args='samtools 2>&1', shell=True, stdout=tmp_stdout )
+        tmp_stdout.close()
+        returncode = proc.wait()
+        stdout = None
+        for line in open( tmp_stdout.name, 'rb' ):
+            if line.lower().find( 'version' ) >= 0:
+                stdout = line.strip()
+                break
+        if stdout:
+            sys.stdout.write( 'Samtools %s\n' % stdout )
+        else:
+            raise Exception
+    except:
+        sys.stdout.write( 'Could not determine Samtools version\n' )
+    cmd = 'samtools merge %s %s %s' % ( outfile, infile, ' '.join( filenames ) )
+    tmp = tempfile.NamedTemporaryFile().name
+    try:
+        tmp_stderr = open( tmp, 'wb' )
+        proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open( tmp, 'rb' )
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+        if os.path.exists( tmp ):
+            os.unlink( tmp )
+    except Exception, e:
+        if os.path.exists( tmp ):
+            os.unlink( tmp )
+        stop_err( 'Error running SAMtools merge tool\n' + str( e ) )
+    if os.path.getsize( outfile ) > 0:
+        sys.stdout.write( '%s files merged.' % ( len( sys.argv ) - 2 ) )
+    else:
+        stop_err( 'The output file is empty, there may be an error with one of your input files.' )
+
+if __name__ == "__main__" : __main__()
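
sam_merge.py shells out twice: once to scrape the samtools version banner and
once for the merge itself, spooling stderr through a temporary file. A minimal
sketch of the same merge step using only the standard library (assumes a
``samtools`` binary on PATH; the helper name is illustrative)::

    import subprocess

    def merge_bams(outfile, infiles):
        # equivalent of: samtools merge <out> <in1> <in2> ...
        cmd = ['samtools', 'merge', outfile] + list(infiles)
        try:
            subprocess.check_output(cmd, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            raise RuntimeError('samtools merge failed: %r' % e.output)

    merge_bams('merged.bam', ['in1.bam', 'in2.bam', 'in3.bam'])
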
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/sam_merge.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/sam_merge.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,61 @@
+<tool id="sam_merge2" name="Merge BAM Files" version="1.1.2">
+  <description>merges BAM files together</description>
+  <requirements>
+    <requirement type="package">picard</requirement>
+  </requirements>
+  <command>
+java -Xmx2G -jar ${GALAXY_DATA_INDEX_DIR}/shared/jars/MergeSamFiles.jar MSD=$mergeSD VALIDATION_STRINGENCY=LENIENT O=$output1 I=$input1 I=$input2 
+      #for $i in $inputs
+        I=${i.input}
+      #end for 
+    2&gt; $outlog
+  </command>
+  <inputs>
+    <param name="title" label="Name for the output merged bam file" type="text" default="Merged.bam"
+       help="This name will appear in your history so use it to remember what the new file in your history contains" />
+    <param name="mergeSD" value="true" type="boolean"  label="Merge all component bam file headers into the merged bam file"
+      truevalue="true" falsevalue="false" checked="yes" 
+      help="Control the MERGE_SEQUENCE_DICTIONARIES flag for Picard MergeSamFiles. Default (true) correctly propagates read groups and other important metadata" />
+    <param name="input1" label="First file" type="data" format="bam" />
+    <param name="input2" label="with file" type="data" format="bam" help="Need to add more files? Use controls below." />
+    <repeat name="inputs" title="Input Files">
+      <param name="input" label="Add file" type="data" format="bam" />
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="bam" name="output1" label="${title}.bam" />
+    <data format="txt" name="outlog" label="${title}_${tool.name}.log" />
+  </outputs>
+  <tests>
+    <!-- TODO: add ability to test framework to test without at least 
+         one repeat element value
+    -->
+    <test>
+      <param name="title" value="test1" />
+      <param name="mergeSD" value="true" />
+      <param name="input1" value="sam_merge_in1.bam" ftype="bam" /> 
+      <param name="input2" value="sam_merge_in2.bam" ftype="bam" />
+      <output name="output1" file="sam_merge_out1.bam" ftype="bam" />
+      <output name="outlog" file="sam_merge_out1.log" ftype="txt" lines_diff="10"/>
+    </test>
+    <test>
+      <param name="title" value="test2" />
+      <param name="mergeSD" value="true" />
+      <param name="input1" value="sam_merge_in1.bam" ftype="bam" /> 
+      <param name="input2" value="sam_merge_in2.bam" ftype="bam" />
+      <param name="input" value="sam_merge_in3.bam" ftype="bam" />
+      <output name="output1" file="sam_merge_out2.bam" ftype="bam" />
+      <output name="outlog" file="sam_merge_out2.log" ftype="txt" lines_diff="10"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool uses the Picard_ merge command to merge any number of BAM files into one BAM file while
+preserving BAM metadata such as read groups.
+
+.. _Picard: http://picard.sourceforge.net/command-line-overview.shtml#MergeSamFiles
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/sam_merge_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/sam_merge_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,34 @@
+from galaxy.tools.parameters import DataToolParameter
+
+def validate_input( trans, error_map, param_values, page_param_map ):
+    dbkeys = set()
+    data_param_names = set()
+    data_params = 0
+    for name, param in page_param_map.iteritems():
+        if isinstance( param, DataToolParameter ):
+            # for each dataset parameter
+            if param_values.get(name, None) != None:
+                dbkeys.add( param_values[name].dbkey )
+                data_params += 1
+                # check meta data
+#                try:
+#                    param = param_values[name]
+#                    startCol = int( param.metadata.startCol )
+#                    endCol = int( param.metadata.endCol )
+#                    chromCol = int( param.metadata.chromCol )
+#                    if param.metadata.strandCol is not None:
+#                        strandCol = int ( param.metadata.strandCol )
+#                    else:
+#                        strandCol = 0
+#                except:
+#                    error_msg = "The attributes of this dataset are not properly set. " + \
+#                    "Click the pencil icon in the history item to set the chrom, start, end and strand columns."
+#                    error_map[name] = error_msg
+            data_param_names.add( name )
+    if len( dbkeys ) > 1:
+        for name in data_param_names:
+            error_map[name] = "All datasets must belong to same genomic build, " \
+                "this dataset is linked to build '%s'" % param_values[name].dbkey
+    if data_params != len(data_param_names):
+        for name in data_param_names:
+            error_map[name] = "A dataset of the appropriate type is required"
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/sam_pileup.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/sam_pileup.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+
+"""
+Creates a pileup file from a bam file and a reference.
+
+usage: %prog [options]
+   -p, --input1=p: bam file
+   -o, --output1=o: Output pileup
+   -R, --ref=R: Reference file type
+   -n, --ownFile=n: User-supplied fasta reference file
+   -d, --dbkey=d: dbkey of user-supplied file
+   -x, --indexDir=x: Index directory
+   -b, --bamIndex=b: BAM index file
+   -s, --lastCol=s: Print the mapping quality as the last column
+   -i, --indels=i: Only output lines containing indels
+   -M, --mapCap=M: Cap mapping quality
+   -c, --consensus=c: Call the consensus sequence using the MAQ consensus model
+   -T, --theta=T: Theta parameter (error dependency coefficient)
+   -N, --hapNum=N: Number of haplotypes in sample
+   -r, --fraction=r: Expected fraction of differences between a pair of haplotypes
+   -I, --phredProb=I: Phred probability of an indel in sequencing/prep
+
+"""
+
+import os, shutil, subprocess, sys, tempfile
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR ):
+    seqFile = '%s/sam_fa_indices.loc' % GALAXY_DATA_INDEX_DIR
+    seqPath = ''
+    for line in open( seqFile ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ) and line.startswith( 'index' ):
+            fields = line.split( '\t' )
+            if len( fields ) < 3:
+                continue
+            if fields[1] == dbkey:
+                seqPath = fields[2].strip()
+                break
+    return seqPath
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    seqPath = check_seq_file( options.dbkey, options.indexDir )
+    # output version # of tool
+    try:
+        tmp = tempfile.NamedTemporaryFile().name
+        tmp_stdout = open( tmp, 'wb' )
+        proc = subprocess.Popen( args='samtools 2>&1', shell=True, stdout=tmp_stdout )
+        tmp_stdout.close()
+        returncode = proc.wait()
+        stdout = None
+        for line in open( tmp_stdout.name, 'rb' ):
+            if line.lower().find( 'version' ) >= 0:
+                stdout = line.strip()
+                break
+        if stdout:
+            sys.stdout.write( 'Samtools %s\n' % stdout )
+        else:
+            raise Exception
+    except:
+        sys.stdout.write( 'Could not determine Samtools version\n' )
+    #prepare file names 
+    tmpDir = tempfile.mkdtemp()
+    tmpf0 = tempfile.NamedTemporaryFile( dir=tmpDir )
+    tmpf0_name = tmpf0.name
+    tmpf0.close()
+    tmpf0bam_name = '%s.bam' % tmpf0_name
+    tmpf0bambai_name = '%s.bam.bai' % tmpf0_name
+    tmpf1 = tempfile.NamedTemporaryFile( dir=tmpDir )
+    tmpf1_name = tmpf1.name
+    tmpf1.close()
+    tmpf1fai_name = '%s.fai' % tmpf1_name
+    #link bam and bam index to working directory (can't move because need to leave original)
+    os.symlink( options.input1, tmpf0bam_name )
+    os.symlink( options.bamIndex, tmpf0bambai_name )
+    #get parameters for pileup command
+    if options.lastCol == 'yes':
+        lastCol = '-s'
+    else:
+        lastCol = ''
+    if options.indels == 'yes':
+        indels = '-i'
+    else:
+        indels = ''
+    opts = '%s %s -M %s' % ( lastCol, indels, options.mapCap )
+    if options.consensus == 'yes':
+        opts += ' -c -T %s -N %s -r %s -I %s' % ( options.theta, options.hapNum, options.fraction, options.phredProb )
+    #prepare basic pileup command
+    cmd = 'samtools pileup %s -f %s %s > %s'
+    try:
+        # have to nest try-except in try-finally to handle 2.4
+        try:
+            #index reference if necessary and prepare pileup command
+            if options.ref == 'indexed':
+                if not os.path.exists( "%s.fai" % seqPath ):
+                    raise Exception, "No sequences are available for '%s', request them by reporting this error." % options.dbkey
+                cmd = cmd % ( opts, seqPath, tmpf0bam_name, options.output1 )
+            elif options.ref == 'history':
+                os.symlink( options.ownFile, tmpf1_name )
+                cmdIndex = 'samtools faidx %s' % ( tmpf1_name )
+                tmp = tempfile.NamedTemporaryFile( dir=tmpDir ).name
+                tmp_stderr = open( tmp, 'wb' )
+                proc = subprocess.Popen( args=cmdIndex, shell=True, cwd=tmpDir, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                # get stderr, allowing for case where it's very large
+                tmp_stderr = open( tmp, 'rb' )
+                stderr = ''
+                buffsize = 1048576
+                try:
+                    while True:
+                        stderr += tmp_stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
+                tmp_stderr.close()
+                #did index succeed?
+                if returncode != 0:
+                    raise Exception, 'Error creating index file\n' + stderr
+                cmd = cmd % ( opts, tmpf1_name, tmpf0bam_name, options.output1 )
+            #perform pileup command
+            tmp = tempfile.NamedTemporaryFile( dir=tmpDir ).name
+            tmp_stderr = open( tmp, 'wb' )
+            proc = subprocess.Popen( args=cmd, shell=True, cwd=tmpDir, stderr=tmp_stderr.fileno() )
+            returncode = proc.wait()
+            tmp_stderr.close()
+            #did it succeed?
+            # get stderr, allowing for case where it's very large
+            tmp_stderr = open( tmp, 'rb' )
+            stderr = ''
+            buffsize = 1048576
+            try:
+                while True:
+                    stderr += tmp_stderr.read( buffsize )
+                    if not stderr or len( stderr ) % buffsize != 0:
+                        break
+            except OverflowError:
+                pass
+            tmp_stderr.close()
+            if returncode != 0:
+                raise Exception, stderr
+        except Exception, e:
+            stop_err( 'Error running Samtools pileup tool\n' + str( e ) )
+    finally:
+        #clean up temp files
+        if os.path.exists( tmpDir ):
+            shutil.rmtree( tmpDir )
+    # check that there are results in the output file
+    if os.path.getsize( options.output1 ) > 0:
+        sys.stdout.write( 'Converted BAM to pileup' )
+    else:
+        stop_err( 'The output file is empty. Your input file may have had no matches, or there may be an error with your input file or settings.' )
+
+if __name__ == "__main__" : __main__()
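
The wrapper assembles the pileup options from its parameters: ``-s`` to append
mapping quality, ``-i`` for indel-only lines, ``-M`` for the quality cap, and
the four MAQ consensus options only when consensus calling is requested. A
hedged sketch of that assembly (function and argument names are illustrative)::

    def build_pileup_opts(last_col, indels, map_cap, consensus=False,
                          theta=0.85, hap_num=2, fraction=0.001, phred_prob=40):
        opts = []
        if last_col:
            opts.append('-s')       # print mapping quality as last column
        if indels:
            opts.append('-i')       # only lines containing indels
        opts += ['-M', str(map_cap)]
        if consensus:               # MAQ consensus model parameters
            opts += ['-c', '-T', str(theta), '-N', str(hap_num),
                     '-r', str(fraction), '-I', str(phred_prob)]
        return opts

    print(' '.join(build_pileup_opts(False, False, 60, consensus=True)))
    # -M 60 -c -T 0.85 -N 2 -r 0.001 -I 40
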
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/sam_pileup.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/sam_pileup.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,184 @@
+<tool id="sam_pileup" name="Generate pileup" version="1.1.1">
+  <description>from BAM dataset</description>
+  <requirements>
+    <requirement type="package">samtools</requirement>
+  </requirements>
+  <command interpreter="python">
+    sam_pileup.py
+      --input1=$input1
+      --output1=$output1
+      --ref=$refOrHistory.reference
+      #if $refOrHistory.reference == "history":
+        --ownFile=$refOrHistory.ownFile
+      #else:
+        --ownFile="None"
+      #end if
+       --dbkey=${input1.metadata.dbkey}
+       --indexDir=${GALAXY_DATA_INDEX_DIR}
+       --bamIndex=${input1.metadata.bam_index}
+       --lastCol=$lastCol
+       --indels=$indels
+       --mapCap=$mapCap
+       --consensus=$c.consensus
+      #if $c.consensus == "yes":
+        --theta=$c.theta
+        --hapNum=$c.hapNum
+        --fraction=$c.fraction
+        --phredProb=$c.phredProb
+       #else:
+        --theta="None"
+        --hapNum="None"
+        --fraction="None"
+        --phredProb="None"
+      #end if
+  </command>
+  <inputs>
+    <conditional name="refOrHistory">
+      <param name="reference" type="select" label="Will you select a reference genome from your history or use a built-in index?">
+        <option value="indexed">Use a built-in index</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="indexed">
+        <param name="input1" type="data" format="bam" label="Select the BAM file to generate the pileup file for">
+           <validator type="unspecified_build" />
+           <validator type="dataset_metadata_in_file" filename="sam_fa_indices.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." line_startswith="index" />
+        </param>
+      </when>
+      <when value="history">
+        <param name="input1" type="data" format="bam" label="Select the BAM file to generate the pileup file for" />
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference genome" />
+      </when>
+    </conditional>
+    <param name="lastCol" type="select" label="Whether or not to print the mapping quality as the last column" help="Makes the output easier to parse, but is space inefficient">
+      <option value="no">Do not print the mapping quality as the last column</option>
+      <option value="yes">Print the mapping quality as the last column</option>
+    </param>
+    <param name="indels" type="select" label="Whether or not to print only output pileup lines containing indels">
+      <option value="no">Print all lines</option>
+      <option value="yes">Print only lines containing indels</option>
+    </param>
+    <param name="mapCap" type="integer" value="60" label="Where to cap mapping quality" />
+    <conditional name="c">
+      <param name="consensus" type="select" label="Call consensus according to MAQ model?">
+        <option selected="true" value="no">No</option>
+        <option value="yes">Yes</option>
+      </param> 
+      <when value="no" />
+      <when value="yes">
+        <param name="theta" type="float" value="0.85" label="Theta parameter (error dependency coefficient) in the MAQ consensus calling model" />
+        <param name="hapNum" type="integer" value="2" label="Number of haplotypes in the sample" help="Greater than or equal to 2" />
+        <param name="fraction" type="float" value="0.001" label="Expected fraction of differences between a pair of haplotypes" />
+        <param name="phredProb" type="integer" value="40" label="Phred probability of an indel in sequencing/prep" />
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output1" label="${tool.name} on ${on_string}: converted pileup" />
+  </outputs>
+  <tests>
+    <test>
+      <!--
+      Bam to pileup command:
+      samtools faidx chr_m.fasta
+      samtools pileup -M 60 -f chr_m.fasta test-data/sam_pileup_in1.bam > sam_pileup_out1.pileup
+      chr_m.fasta is the prefix of the index
+      -->
+      <param name="reference" value="history" />
+      <param name="input1" value="sam_pileup_in1.bam" ftype="bam" />
+      <param name="ownFile" value="chr_m.fasta" ftype="fasta" dbkey="equCab2" />
+      <param name="lastCol" value="no" />
+      <param name="indels" value="no" />
+      <param name="mapCap" value="60" />
+      <param name="consensus" value="no" />
+      <output name="output1" file="sam_pileup_out1.pileup" />
+    </test>
+    <test>
+      <!--
+      Bam to pileup command:
+      samtools pileup -M 60 -c -T 0.85 -N 2 -r 0.001 -I 40 -f chr_m.fasta test-data/sam_pileup_in1.bam > sam_pileup_out2.pileup
+      chr_m.fasta is the prefix of the index
+      -->
+      <param name="reference" value="indexed" />
+      <param name="input1" value="sam_pileup_in1.bam" ftype="bam" dbkey="equCab2" />
+      <param name="lastCol" value="no" />
+      <param name="indels" value="no" />
+      <param name="mapCap" value="60" />
+      <param name="consensus" value="yes" />
+      <param name="theta" value="0.85" />
+      <param name="hapNum" value="2" />
+      <param name="fraction" value="0.001" />
+      <param name="phredProb" value="40" />
+      <output name="output1" file="sam_pileup_out2.pileup" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+Uses the SAMTools_ pileup command to produce a pileup dataset from a provided BAM dataset. It generates two types of pileup datasets depending on the specified options. If the *Call consensus according to MAQ model?* option is set to **No**, the tool produces a simple pileup. If the option is set to **Yes**, a ten-column pileup dataset with consensus information is generated. Both types of datasets are briefly summarized below.
+
+.. _SAMTools: http://samtools.sourceforge.net/samtools.shtml
+
+------
+
+**Types of pileup datasets**
+
+The description of the pileup format below is largely based on information found on the SAMTools Pileup_ documentation page. The 6- and 10-column variants are described below.
+
+.. _Pileup: http://samtools.sourceforge.net/pileup.shtml
+
+**Six column pileup**::
+
+    1    2  3  4        5        6
+ ---------------------------------
+ chrM  412  A  2       .,       II
+ chrM  413  G  4     ..t,     IIIH
+ chrM  414  C  4     ...a     III2
+ chrM  415  C  4     TTTt     III7
+   
+where::
+
+  Column Definition
+ ------- ----------------------------
+       1 Chromosome
+       2 Position (1-based)
+       3 Reference base at that position
+       4 Coverage (# reads aligning over that position)
+       5 Bases within reads covering this position (see Galaxy wiki for more info)
+       6 Quality values (phred33 scale, see Galaxy wiki for more)
+       
+**Ten column pileup**
+
+The `ten-column` (consensus_) pileup incorporates additional consensus information generated with the *-c* option of the *samtools pileup* command::
+
+
+    1    2  3  4   5   6   7   8       9       10
+ ------------------------------------------------
+ chrM  412  A  A  75   0  25  2       .,       II
+ chrM  413  G  G  72   0  25  4     ..t,     IIIH
+ chrM  414  C  C  75   0  25  4     ...a     III2
+ chrM  415  C  T  75  75  25  4     TTTt     III7
+
+where::
+
+  Column Definition
+ ------- --------------------------------------------------------
+       1 Chromosome
+       2 Position (1-based)
+       3 Reference base at that position
+       4 Consensus bases
+       5 Consensus quality
+       6 SNP quality
+       7 Maximum mapping quality
+       8 Coverage (# reads aligning over that position)
+       9 Bases within reads covering this position (see Galaxy wiki for more info)
+      10 Quality values (phred33 scale, see Galaxy wiki for more)
+
+
+.. _consensus: http://samtools.sourceforge.net/cns0.shtml
+
+
+  </help>
+</tool>
+
+
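
For downstream use, the six-column layout documented above splits cleanly on
tabs. A small hedged parser (field names are illustrative; the meanings are
taken from the table in the help text)::

    def parse_pileup6(line):
        chrom, pos, ref, cov, bases, quals = line.rstrip('\r\n').split('\t')
        return {
            'chrom': chrom,
            'pos': int(pos),        # 1-based position
            'ref': ref,             # reference base
            'coverage': int(cov),   # reads covering this position
            'bases': bases,         # '.' and ',' denote reference matches
            'quals': quals,         # phred33-scaled base qualities
        }

    print(parse_pileup6('chrM\t412\tA\t2\t.,\tII'))
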
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/sam_to_bam.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/sam_to_bam.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+"""
+Converts SAM data to sorted BAM data.
+usage: sam_to_bam.py [options]
+   --input1: SAM file to be converted
+   --dbkey: dbkey value
+   --ref_file: Reference file if choosing from history
+   --output1: output dataset in bam format
+   --index_dir: GALAXY_DATA_INDEX_DIR
+"""
+
+import optparse, os, sys, subprocess, tempfile, shutil, gzip
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+from galaxy import util
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def check_seq_file( dbkey, cached_seqs_pointer_file ):
+    seq_path = ''
+    for line in open( cached_seqs_pointer_file ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ) and line.startswith( 'index' ):
+            fields = line.split( '\t' )
+            if len( fields ) < 3:
+                continue
+            if fields[1] == dbkey:
+                seq_path = fields[2].strip()
+                break
+    return seq_path
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '', '--input1', dest='input1', help='The input SAM dataset' )
+    parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' )
+    parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' )
+    parser.add_option( '', '--output1', dest='output1', help='The output BAM dataset' )
+    parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' )
+    ( options, args ) = parser.parse_args()
+
+    # output version # of tool
+    try:
+        tmp = tempfile.NamedTemporaryFile().name
+        tmp_stdout = open( tmp, 'wb' )
+        proc = subprocess.Popen( args='samtools 2>&1', shell=True, stdout=tmp_stdout )
+        tmp_stdout.close()
+        returncode = proc.wait()
+        stdout = None
+        for line in open( tmp_stdout.name, 'rb' ):
+            if line.lower().find( 'version' ) >= 0:
+                stdout = line.strip()
+                break
+        if stdout:
+            sys.stdout.write( 'Samtools %s\n' % stdout )
+        else:
+            raise Exception
+    except:
+        sys.stdout.write( 'Could not determine Samtools version\n' )
+
+    cached_seqs_pointer_file = '%s/sam_fa_indices.loc' % options.index_dir
+    if not os.path.exists( cached_seqs_pointer_file ):
+        stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file )
+    # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa,
+    # and the equCab2.fa file will contain fasta sequences.
+    seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file )
+    tmp_dir = tempfile.mkdtemp()
+    if not options.ref_file or options.ref_file == 'None':
+        # We're using locally cached reference sequences( e.g., /galaxy/data/equCab2/sam_index/equCab2.fa ).
+        # The indexes for /galaxy/data/equCab2/sam_index/equCab2.fa will be contained in
+        # a file named /galaxy/data/equCab2/sam_index/equCab2.fa.fai
+        fai_index_file_base = seq_path
+        fai_index_file_path = '%s.fai' % seq_path
+        if not os.path.exists( fai_index_file_path ):
+            #clean up temp files
+            if os.path.exists( tmp_dir ):
+                shutil.rmtree( tmp_dir )
+            stop_err( 'No sequences are available for build (%s), request them by reporting this error.' % options.dbkey )
+    else:
+        try:
+            # Create indexes for history reference ( e.g., ~/database/files/000/dataset_1.dat ) using samtools faidx, which will:
+            # - index reference sequence in the FASTA format or extract subsequence from indexed reference sequence
+            # - if no region is specified, faidx will index the file and create <ref.fasta>.fai on the disk
+            # - if regions are
[...]
+            if returncode != 0:
+                raise Exception, stderr
+            if os.path.getsize( fai_index_file_path ) == 0:
+                raise Exception, 'Index file empty, there may be an error with your reference file or settings.'
+        except Exception, e:
+            #clean up temp files
+            if os.path.exists( tmp_dir ):
+                shutil.rmtree( tmp_dir )
+            stop_err( 'Error creating indexes from reference (%s), %s' % ( options.ref_file, str( e ) ) )
+    try:
+        # Extract all alignments from the input SAM file to BAM format ( since no region is specified, all the alignments will be extracted ).
+        tmp_aligns_file = tempfile.NamedTemporaryFile( dir=tmp_dir )
+        tmp_aligns_file_name = tmp_aligns_file.name
+        tmp_aligns_file.close()
+        command = 'samtools view -bt %s -o %s %s' % ( fai_index_file_path, tmp_aligns_file_name, options.input1 )
+        tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+        tmp_stderr = open( tmp, 'wb' )
+        proc = subprocess.Popen( args=command, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open( tmp, 'rb' )
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        #clean up temp files
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+        stop_err( 'Error extracting alignments from (%s), %s' % ( options.input1, str( e ) ) )
+    try:
+        # Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created. This command
+        # may also create temporary files <out.prefix>.%d.bam when the whole alignment cannot be fitted
+        # into memory ( controlled by option -m ).
+        tmp_sorted_aligns_file = tempfile.NamedTemporaryFile( dir=tmp_dir )
+        tmp_sorted_aligns_file_name = tmp_sorted_aligns_file.name
+        tmp_sorted_aligns_file.close()
+        command = 'samtools sort %s %s' % ( tmp_aligns_file_name, tmp_sorted_aligns_file_name )
+        tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+        tmp_stderr = open( tmp, 'wb' )
+        proc = subprocess.Popen( args=command, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open( tmp, 'rb' )
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        #clean up temp files
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+        stop_err( 'Error sorting alignments from (%s), %s' % ( tmp_aligns_file_name, str( e ) ) )
+    # Move tmp_aligns_file_name to our output dataset location
+    sorted_bam_file = '%s.bam' % tmp_sorted_aligns_file_name
+    shutil.move( sorted_bam_file, options.output1 )
+    #clean up temp files
+    if os.path.exists( tmp_dir ):
+        shutil.rmtree( tmp_dir )
+    # check that there are results in the output file
+    if os.path.getsize( options.output1 ) > 0:
+        sys.stdout.write( 'SAM file converted to BAM' )
+    else:
+        stop_err( 'Error creating sorted version of BAM file.' )
+
+if __name__=="__main__": __main__()
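
The conversion above is two samtools calls: ``view -bt <ref>.fai`` to emit BAM,
then ``sort`` to order alignments by leftmost coordinate. A compact sketch of
the same pipeline (assumes the samtools 0.1.x syntax used here, where ``sort``
takes an output *prefix*; names are illustrative)::

    import subprocess

    def sam_to_sorted_bam(sam_path, fai_path, out_prefix):
        unsorted_bam = out_prefix + '.unsorted.bam'
        # SAM -> BAM, taking the reference names from the fasta index
        subprocess.check_call(['samtools', 'view', '-bt', fai_path,
                               '-o', unsorted_bam, sam_path])
        # coordinate-sort; old samtools appends '.bam' to the prefix
        subprocess.check_call(['samtools', 'sort', unsorted_bam, out_prefix])
        return out_prefix + '.bam'

    sam_to_sorted_bam('input.sam', 'ref.fa.fai', 'converted')
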
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/sam_to_bam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/sam_to_bam.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,91 @@
+<tool id="sam_to_bam" name="SAM-to-BAM" version="1.1.2">
+  <description>converts SAM format to BAM format</description>
+  <requirements>
+    <requirement type="package">samtools</requirement>
+  </requirements>
+  <command interpreter="python">
+    sam_to_bam.py
+      --input1=$source.input1
+      #if $source.index_source == "history":
+        --dbkey=${ref_file.metadata.dbkey} 
+        --ref_file=$source.ref_file
+      #else
+        --dbkey=${input1.metadata.dbkey} 
+      #end if
+      --output1=$output1
+      --index_dir=${GALAXY_DATA_INDEX_DIR}
+  </command>
+  <inputs>
+    <conditional name="source">
+      <param name="index_source" type="select" label="Choose the source for the reference list">
+        <option value="cached">Locally cached</option>
+        <option value="history">History</option>
+      </param>
+      <when value="cached">
+        <param name="input1" type="data" format="sam" metadata_name="dbkey" label="SAM File to Convert">
+           <validator type="unspecified_build" />
+           <validator type="dataset_metadata_in_file" filename="sam_fa_indices.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." line_startswith="index" />
+        </param>
+      </when>
+      <when value="history">
+        <param name="input1" type="data" format="sam" label="Convert SAM file" />
+        <param name="ref_file" type="data" format="fasta" metadata_name="dbkey" label="Using reference file" />
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="bam" name="output1" label="${tool.name} on ${on_string}: converted BAM">
+      <actions>
+        <conditional name="source.index_source">
+          <when value="cached">
+            <action type="metadata" name="dbkey">
+              <option type="from_param" name="source.input1" param_attribute="dbkey" />
+            </action>
+          </when>
+          <when value="history">
+            <action type="metadata" name="dbkey">
+              <option type="from_param" name="source.ref_file" param_attribute="dbkey" />
+            </action>
+          </when>
+        </conditional>
+      </actions>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <!--
+      Sam-to-Bam command:
+      cp test-data/chr_m.fasta .
+      samtools faidx chr_m.fasta
+      samtools view -hbt chr_m.fasta.fai -o unsorted.bam test-data/sam_to_bam_in1.sam
+      samtools sort unsorted.bam sam_to_bam_out1
+      chr_m.fasta is the reference file (chrM from equCab2)
+      -->
+      <param name="index_source" value="history" /> 
+      <param name="input1" value="sam_to_bam_in1.sam" ftype="sam" />
+      <param name="ref_file" value="chr_m.fasta" ftype="fasta" dbkey="equCab2" />
+      <output name="output1" file="sam_to_bam_out1.bam" ftype="bam" />
+    </test>
+    <test>
+      <!--
+      Sam-to-Bam command:
+      samtools view -hbt chr_m.fasta.fai -o unsorted.bam test-data/sam_to_bam_in1.sam
+      samtools sort unsorted.bam sam_to_bam_out2
+      chr_m.fasta is the reference file and the index chr_m.fasta.fai 
+      these should be in the same directory, and chrM is from equCab2
+      -->
+      <param name="index_source" value="cached" />
+      <param name="input1" value="sam_to_bam_in1.sam" ftype="sam" dbkey="chrM" />
+      <output name="output1" file="sam_to_bam_out2.bam" ftype="bam" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool uses the SAMTools_ toolkit to convert an input SAM file into a coordinate-sorted BAM file.
+
+.. _SAMTools: http://samtools.sourceforge.net/samtools.shtml
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/samtools/samtools_flagstat.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/samtools/samtools_flagstat.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,29 @@
+<tool id="samtools_flagstat" name="flagstat" version="1.0.0">
+  <requirements>
+    <requirement type="package">samtools</requirement>
+  </requirements>
+  <description>provides simple stats on BAM files</description>
+  <command>samtools flagstat $input1 > $output1
+  </command>
+  <inputs>
+    <param name="input1" type="data" format="bam" label="BAM File to Convert" />
+  </inputs>
+  <outputs>
+    <data name="output1" format="txt" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="3unsorted.bam" ftype="bam" />
+      <output name="output1" file="samtools_flagstat_out1.txt" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool uses the SAMTools_ toolkit to produce simple stats on a BAM file.
+
+.. _SAMTools: http://samtools.sourceforge.net/samtools.shtml
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/solid_tools/maq_cs_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/solid_tools/maq_cs_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,270 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+#MAQ mapper for SOLiD colourspace-reads
+
+import sys, os, zipfile, tempfile, subprocess
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+
+    out_fname = sys.argv[1].strip()
+    out_f2 = open(sys.argv[2].strip(),'r+')
+    ref_fname = sys.argv[3].strip()
+    f3_read_fname = sys.argv[4].strip()
+    f3_qual_fname = sys.argv[5].strip()
+    paired = sys.argv[6]
+    if paired == 'yes':
+        r3_read_fname = sys.argv[7].strip()
+        r3_qual_fname = sys.argv[8].strip()
+    min_mapqual = int(sys.argv[9].strip())
+    max_mismatch = int(sys.argv[10].strip())
+    out_f3name = sys.argv[11].strip()
+    subprocess_dict = {}
+
+    ref_csfa = tempfile.NamedTemporaryFile()
+    ref_bfa = tempfile.NamedTemporaryFile()
+    ref_csbfa = tempfile.NamedTemporaryFile()
+    cmd2_1 = 'maq fasta2csfa %s > %s 2>&1' %(ref_fname,ref_csfa.name)
+    cmd2_2 = 'maq fasta2bfa %s %s 2>&1' %(ref_csfa.name,ref_csbfa.name)
+    cmd2_3 = 'maq fasta2bfa %s %s 2>&1' %(ref_fname,ref_bfa.name)
+    try:
+        os.system(cmd2_1)
+        os.system(cmd2_2)
+        os.system(cmd2_3)
+    except Exception, erf:
+        stop_err(str(erf)+"Error processing reference sequence")
+
+    if paired == 'yes': #paired end reads
+        tmpf = tempfile.NamedTemporaryFile()    #forward reads
+        tmpr = tempfile.NamedTemporaryFile()    #reverse reads
+        tmps = tempfile.NamedTemporaryFile()    #single reads
+        tmpffastq = tempfile.NamedTemporaryFile()
+        tmprfastq = tempfile.NamedTemporaryFile()
+        tmpsfastq = tempfile.NamedTemporaryFile()
+
+        cmd1 = "solid2fastq_modified.pl 'yes' %s %s %s %s %s %s %s 2>&1" %(tmpf.name,tmpr.name,tmps.name,f3_read_fname,f3_qual_fname,r3_read_fname,r3_qual_fname)
+        try:
+            os.system(cmd1)
+            os.system('gunzip -c %s >> %s' %(tmpf.name,tmpffastq.name))
+            os.system('gunzip -c %s >> %s' %(tmpr.name,tmprfastq.name))
+            os.system('gunzip -c %s >> %s' %(tmps.name,tmpsfastq.name))
+
+        except Exception, eq:
+            stop_err("Error converting data to fastq format." + str(eq))
+
+        #make a temp directory where the split fastq files will be stored
+        try:
+            split_dir = tempfile.mkdtemp()
+            split_file_prefix_f = tempfile.mktemp(dir=split_dir)
+            split_file_prefix_r = tempfile.mktemp(dir=split_dir)
+            splitcmd_f = 'split -a 2 -l %d %s %s' %(32000000,tmpffastq.name,split_file_prefix_f) #32M lines correspond to 8M reads
+            splitcmd_r = 'split -a 2 -l %d %s %s' %(32000000,tmprfastq.name,split_file_prefix_r) #32M lines correspond to 8M reads
+
+            os.system(splitcmd_f)
+            os.system(splitcmd_r)
+            os.chdir(split_dir)
+            ii = 0
+            for fastq in os.listdir(split_dir):
+                if not fastq.startswith(split_file_prefix_f.split("/")[-1]):
+                    continue
+                fastq_r = split_file_prefix_r + fastq.split(split_file_prefix_f.split("/")[-1])[1] #find the reverse strand fastq corresponding to forward strand fastq
+                tmpbfq_f = tempfile.NamedTemporaryFile()
+                tmpbfq_r = tempfile.NamedTemporaryFile()
+                cmd3 = 'maq fastq2bfq %s %s 2>&1; maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' %(fastq,tmpbfq_f.name,fastq_r,tmpbfq_r.name,fastq,ref_csbfa.name,tmpbfq_f.name,tmpbfq_r.name,fastq,fastq)
+                subprocess_dict['sp'+str(ii+1)] = subprocess.Popen([cmd3],shell=True,stdout=subprocess.PIPE)
+                ii += 1
+            while True:
+                all_done = True
+                for j,k in enumerate(subprocess_dict.keys()):
+                    if subprocess_dict['sp'+str(j+1)].wait() != 0:
+                        err = subprocess_dict['sp'+str(j+1)].communicate()[1]
[...]ax_mismatch,min_mapqual,ref_bfa.name,tmpcsmap.name,tmppileup.name)
+            os.system(cmdpileup)
+            tmppileup.seek(0)
+            print >> out_f2, "#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count"
+            for line in file(tmppileup.name):
+                elems = line.strip().split()
+                ref_nt = elems[2].capitalize()
+                read_nt = elems[4]
+                coverage = int(elems[3])
+                a,t,g,c = 0,0,0,0
+                ref_nt_count = 0
+                for ch in read_nt:
+                    ch = ch.capitalize()
+                    if ch not in ['A','T','G','C',',','.']:
+                        continue
+                    if ch in [',','.']:
+                        ch = ref_nt
+                        ref_nt_count += 1
+                    try:
+                        nt_ind = ['A','T','G','C'].index(ch)
+                        if nt_ind == 0:
+                            a+=1
+                        elif nt_ind == 1:
+                            t+=1
+                        elif nt_ind == 2:
+                            g+=1
+                        else:
+                            c+=1
+                    except:
+                        pass
+                print >> out_f2, "%s\t%s\t%s\t%s\t%s\t%s" %("\t".join(elems[:4]),coverage-ref_nt_count,a,t,g,c)
+        except Exception, er2:
+            stop_err("Encountered error while mapping: %s" %(str(er2)))
+
+    #Build custom track from pileup
+    chr_list=[]
+    out_f2.seek(0)
+    fcov = tempfile.NamedTemporaryFile()
+    fout_a = tempfile.NamedTemporaryFile()
+    fout_t = tempfile.NamedTemporaryFile()
+    fout_g = tempfile.NamedTemporaryFile()
+    fout_c = tempfile.NamedTemporaryFile()
+    fcov.write('''track type=wiggle_0 name="Coverage track" description="Coverage track (from Galaxy)" color=0,0,0 visibility=2\n''')
+    fout_a.write('''track type=wiggle_0 name="Track A" description="Track A (from Galaxy)" color=255,0,0 visibility=2\n''')
+    fout_t.write('''track type=wiggle_0 name="Track T" description="Track T (from Galaxy)" color=0,255,0 visibility=2\n''')
+    fout_g.write('''track type=wiggle_0 name="Track G" description="Track G (from Galaxy)" color=0,0,255 visibility=2\n''')
+    fout_c.write('''track type=wiggle_0 name="Track C" description="Track C (from Galaxy)" color=255,0,255 visibility=2\n''')
+
+    for line in out_f2:
+        if line.startswith("#"):
+            continue
+        elems = line.split()
+        chr = elems[0]
+
+        if chr not in chr_list:
+            chr_list.append(chr)
+            if not (chr.startswith('chr') or chr.startswith('scaffold')):
+                chr = 'chr'
+            header = "variableStep chrom=%s" %(chr)
+            fcov.write("%s\n" %(header))
+            fout_a.write("%s\n" %(header))
+            fout_t.write("%s\n" %(header))
+            fout_g.write("%s\n" %(header))
+            fout_c.write("%s\n" %(header))
+        try:
+            pos = int(elems[1])
+            cov = int(elems[3])
+            a = int(elems[5])
+            t = int(elems[6])
+            g = int(elems[7])
+            c = int(elems[8])
+        except:
+            continue
+        fcov.write("%s\t%s\n" %(pos,cov))
+        try:
+            a_freq = a*100./cov
+            t_freq = t*100./cov
+            g_freq = g*100./cov
+            c_freq = c*100./cov
+        except ZeroDivisionError:
+            a_freq=t_freq=g_freq=c_freq=0
+        fout_a.write("%s\t%s\n" %(pos,a_freq))
+        fout_t.write("%s\t%s\n" %(pos,t_freq))
+        fout_g.write("%s\t%s\n" %(pos,g_freq))
+        fout_c.write("%s\t%s\n" %(pos,c_freq))
+
+    fcov.seek(0)
+    fout_a.seek(0)
+    fout_g.seek(0)
+    fout_t.seek(0)
+    fout_c.seek(0)
+    os.system("cat %s %s %s %s %s | cat > %s" %(fcov.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_f3name))

+if __name__=="__main__":
+    __main__()
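
In the pileup post-processing above, ``.`` and ``,`` in the read-base column
stand for matches to the reference (forward and reverse strand), so they are
mapped back to the reference base before tallying. A self-contained sketch of
that per-position count (names are illustrative)::

    def count_bases(ref_nt, read_bases):
        counts = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
        ref_nt = ref_nt.upper()
        matches = 0
        for ch in read_bases.upper():
            if ch in ',.':          # read agrees with the reference
                ch = ref_nt
                matches += 1
            if ch in counts:        # skip indel/other pileup symbols
                counts[ch] += 1
        return counts, matches

    # reference 'a' with read bases '..t,': three matches (A) and one T
    print(count_bases('a', '..t,'))
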
diff -r 000000000000 -r 9071e359b9a3 tools/solid_tools/maq_cs_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/solid_tools/maq_cs_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,120 @@
+<tool id="maq_cs_wrapper" name="MAQ for SOLiD" version="1.0.0">
+    <description> </description>
+    <command interpreter="python">
+    maq_cs_wrapper.py 
+    $output1 
+    $output2 
+    $ref 
+    $library_type.f3_reads 
+    $library_type.f3_qual 
+    $library_type.is_paired
+    #if $library_type.is_paired == "yes":  
+     $library_type.r3_reads 
+     $library_type.r3_qual 
+    #else:
+     "None"
+     "None"
+    #end if
+    $min_mapqual
+    $max_mismatch
+    $output3
+    
+    </command>
+
+    <inputs>
+        <param name="ref" type="data" format="fasta" label="Target Genome"/> 
+        <conditional name="library_type">
+          <param name="is_paired" type="select" label="Is the library mate-paired?" multiple="false">
+             <option value="no">No</option>
+             <option value="yes">Yes</option>
+         </param>
+         <when value="no">
+           <param name="f3_reads" type="data" format="csfasta" label="F3 reads file"/> 
+           <param format="qualsolid" name="f3_qual" type="data" label="F3 quality file" help="If your dataset doesn't show up in the menu, click the pencil icon next to your dataset and set the datatype to 'qualsolid'" /> 
+          </when>
+          <when value="yes">
+           <param name="f3_reads" type="data" format="csfasta" label="F3 reads file"/> 
+           <param format="qualsolid" name="f3_qual" type="data" label="F3 quality file" help="If your dataset doesn't show up in the menu, click the pencil icon next to your dataset and set the datatype to 'qualsolid'" /> 
+           <param name="r3_reads" type="data" format="csfasta" label="R3 reads file"/> 
+           <param format="qualsolid" name="r3_qual" type="data" label="R3 quality file" help="If your dataset doesn't show up in the menu, click the pencil icon next to your dataset and set the datatype to 'qualsolid'" /> 
+          </when>
+      </conditional>
+      <param name="min_mapqual" type="integer" size="3" value="0" label="Minimum mapping quality allowed for a read to be used" help="Reads below the specified mapping quality will not be considered in coverage and SNP analysis."/> 
+      <param name="max_mismatch" type="integer" size="3" value="7" label="Maximum number of mismatches allowed for a read to be used" help="Reads above the specified threshold will not be considered in coverage and SNP analysis."/> 
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output1" metadata_source="ref" />
+        <data format="tabular" name="output2" metadata_source="ref" />
+        <data format="customtrack" name="output3" metadata_source="ref" />
+    </outputs>
+    
+    <!--  "ToolTestCase does not deal with multiple outputs properly yet."
+    <tests>
+        
+        <test>
+            <param name="ref" value="phiX_mod.fasta" />
+            <param name="is_paired" value="no" />
+            <param name="f3_reads" value="phiX_solid.csfasta" />
+            <param name="f3_qual" value="phiX_solid.qualsolid" />
+            <param name="min_mapqual" value="0" />
+            <param name="max_mismatch" value="7" />
+            <output name="output1" file="phiX_solid_maq.map" />
+            <output name="output2" file="phiX_solid_maq.pileup" />
+            <output name="output3" file="phiX_solid_maq.ctrack" />
+            
+        </test>
+    </tests>
+    -->
+<help>
+
+.. class:: infomark
+
+**What it does**
+
+This tool maps SOLiD color-space reads against the target genome using MAQ. It produces three output datasets: 
+
+
+**ALIGNMENT INFO** : contains the read alignment information, 
+
+**PILEUP** : contains the coverage and SNP statistics for every nucleotide of the target genome,
+
+**CUSTOM TRACK** : contains the coverage and SNP statistics as custom tracks displayable in the UCSC browser. 
+
+-----
+
+**The ALIGNMENT INFO dataset will contain the following fields:**
+
+* column 1  = read name
+* column 2  = chromosome
+* column 3  = position
+* column 4  = strand
+* column 5  = insert size from the outer coordinates of a pair
+* column 6  = paired flag
+* column 7  = mapping quality
+* column 8  = single-end mapping quality
+* column 9  = alternative mapping quality
+* column 10 = number of mismatches of the best hit
+* column 11 = sum of qualities of mismatched bases of the best hit
+* column 12 = number of 0-mismatch hits of the first 24bp
+* column 13 = number of 1-mismatch hits of the first 24bp on the reference
+* column 14 = length of the read
+* column 15 = read sequence
+* column 16 = read quality
+
+
+**The PILEUP dataset will contain the following fields:**
+
+* column 1  = chromosome
+* column 2  = position
+* column 3  = reference nucleotide
+* column 4  = coverage (number of reads that cover this position)
+* column 5  = number of SNPs
+* column 6  = number of As
+* column 7  = number of Ts
+* column 8  = number of Gs
+* column 9  = number of Cs
+
+</help>
+<code file="maq_cs_wrapper_code.py"/>
+
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/solid_tools/maq_cs_wrapper_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/solid_tools/maq_cs_wrapper_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,5 @@
+def exec_before_job(app, inp_data, out_data, param_dict, tool):
+    out_data['output1'].name = out_data['output1'].name + " [ ALIGNMENT INFO ]"
+    out_data['output2'].name = out_data['output2'].name + " [ PILEUP ]"
+    out_data['output3'].name = out_data['output3'].name + " [ CUSTOM TRACK ]"
+    
diff -r 000000000000 -r 9071e359b9a3 tools/solid_tools/qualsolid_boxplot_graph.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/solid_tools/qualsolid_boxplot_graph.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,94 @@
+#!/bin/sh
+
+#    Modified fastq_quality_boxplot_graph.sh from FASTX-toolkit - FASTA/FASTQ preprocessing tools.
+#    Copyright (C) 2009  A. Gordon (gordon@cshl.edu)
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as
+#   published by the Free Software Foundation, either version 3 of the
+#   License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#    You should have received a copy of the GNU Affero General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+function usage()
+{
+ echo "SOLiD-Quality BoxPlot plotter"
+ echo "Generates a SOLiD quality score box-plot graph "
+ echo
+ echo "Usage: $0 [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]"
+ echo
+ echo "  [-p]           - Generate PostScript (.PS) file. Default is PNG image."
+ echo "  [-i INPUT.TXT] - Input file. Should be the output of \"solid_qual_stats\" program."
+ echo "  [-o OUTPUT]    - Output file name. default is STDOUT."
+ echo "  [-t TITLE]     - Title (usually the solid file name) - will be plotted on the graph."
+ echo
+ exit 
+}
+
+#
+# Input Data columns: #pos cnt min max sum        mean Q1 med Q3 IQR lW rW
+#  As produced by "solid_qual_stats" program
+
+TITLE="" # default title is empty
+FILENAME=""
+OUTPUTTERM="set term png size 800,600"
+OUTPUTFILE="/dev/stdout"    # Default output file is simply "stdout"
+while getopts ":t:i:o:ph" Option
+ do
+ case $Option in
+ # w ) CMD=$OPTARG; FILENAME="PIMSLogList.txt"; TARGET="logfiles"; ;;
+ t ) TITLE="for $OPTARG" ;;
+ i ) FILENAME=$OPTARG ;;
+ o ) OUTPUTFILE="$OPTARG" ;;
+ p ) OUTPUTTERM="set term postscript enhanced color \"Helvetica\" 4" ;;
+ h ) usage ;;
+ * ) echo "unrecognized argument. use '-h' for usage information."; exit -1 ;;
+ esac
+done
+shift $(($OPTIND - 1)) 
+
+
+if [ "$FILENAME" == "" ]; then
+ usage
+fi
+
+if [ ! -r "$FILENAME" ]; then
+ echo "Error: can't open input file ($1)." >&2
+ exit 1
+fi
+
+#Read number of cycles from the stats file (each line is a cycle, minus the header line)
+#But for the graph, I want xrange to reach (num_cycles+1), so I don't subtract 1 now.
+NUM_CYCLES=$(wc -l < "$FILENAME")
+
+GNUPLOTCMD="
+$OUTPUTTERM
+set boxwidth 0.8 
+set size 1,1
+set key Left inside
+set xlabel \"read position\"
+set ylabel \"Quality Score \"
+set title  \"Quality Scores $TITLE\"
+#set auto x
+set bars 4.0
+set xrange [ 0: $NUM_CYCLES ]
+set yrange [-2:45]
+set y2range [-2:45]
+set xtics 1 
+set x2tics 1
+set ytics 2
+set y2tics 2
+set tics out
+set grid ytics
+set style fill empty
+plot '$FILENAME' using 1:7:11:12:9 with candlesticks lt 1  lw 1 title 'Quartiles' whiskerbars, \
+      ''         using 1:8:8:8:8 with candlesticks lt -1 lw 2 title 'Medians'
+"
+
+echo "$GNUPLOTCMD" | gnuplot > "$OUTPUTFILE"
diff -r 000000000000 -r 9071e359b9a3 tools/solid_tools/solid_qual_boxplot.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/solid_tools/solid_qual_boxplot.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,40 @@
+<tool id="solid_qual_boxplot" name="Draw quality score boxplot" version="1.0.0">
+ <description>for SOLiD data</description>
+
+ <command interpreter="bash">qualsolid_boxplot_graph.sh -t '$input.name' -i $input -o $output</command>
+
+ <inputs>
+ <param format="txt" name="input" type="data" label="Statistics report file (output of 'Quality Statistics for SOLiD data' tool)" />
+ </inputs>
+
+ <outputs>
+ <data format="png" name="output" metadata_source="input" />
+ </outputs>
+<help>
+
+**What it does**
+
+Creates a boxplot graph for the quality scores in the library.
+
+.. class:: infomark
+
+**TIP:** Use the **Quality Statistics for SOLiD data** tool to generate the report file needed for this tool.
+
+-----
+
+**Output Example**
+
+* Black horizontal lines are medians
+* Rectangular red boxes show the Inter-quartile Range (IQR) (top value is Q3, bottom value is Q1)
+* Whiskers show outliers at max. 1.5*IQR
+
+
+.. image:: ./static/images/solid_qual.png
+
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/solid_tools/solid_qual_stats.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/solid_tools/solid_qual_stats.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+
+import sys, os, zipfile, tempfile
+
+QUAL_UPPER_BOUND = 41
+QUAL_LOWER_BOUND = 1
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def unzip( filename ):
+    zip_file = zipfile.ZipFile( filename, 'r' )
+    tmpfilename = tempfile.NamedTemporaryFile().name
+    tmpfile = open( tmpfilename, 'ab' )
+    for name in zip_file.namelist():
+        tmpfile.write( zip_file.read( name ) )
+    tmpfile.close()
+    zip_file.close()
+    return tmpfilename
+   
+def __main__():
+
+    infile_score_name = sys.argv[1].strip()
+    fout = open(sys.argv[2].strip(),'w')
+
+    infile_is_zipped = False
+    if zipfile.is_zipfile( infile_score_name ):
+        infile_is_zipped = True
+        infile_name = unzip( infile_score_name )
+    else:
+        infile_name = infile_score_name
+    
+    readlen = None
+    invalid_lines = 0
+    j = 0
+    # sample the first few reads to determine the expected read length
+    for line in file( infile_name ):
+        line = line.strip()
+        if not(line) or line.startswith("#") or line.startswith(">"):
+            continue
+        elems = line.split()
+        try:
+            for item in elems:
+                int(item)
+            if not readlen:
+                readlen = len(elems)
+            if len(elems) != readlen:
+                print "Note: Reads in the input dataset are of variable lengths."
+            j += 1
+        except ValueError:
+            invalid_lines += 1
+        if j > 10:
+            break
+        
+    position_dict = {}
+    print >>fout, "column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW"
+    for k,line in enumerate(file( infile_name )):
+        line = line.strip()
+        if not(line) or line.startswith("#") or line.startswith(">"):
+            continue
+        elems = line.split()
+        if position_dict == {}:
+            for pos in range(readlen):
+                position_dict[pos] = [0]*QUAL_UPPER_BOUND
+        if len(elems) != readlen:
+            invalid_lines += 1
+            continue
+        for ind,item in enumerate(elems):
+            try:
+                item = int(item)
+                position_dict[ind][item]+=1
+            except (ValueError, IndexError):
+                pass # skip values that are non-numeric or outside the expected quality range
+    
+    invalid_positions = 0
+    for pos in position_dict:
+        carr = position_dict[pos] #count array for position pos
+        total = sum(carr) #number of bases found in this column.
+        med_elem = int(round(total/2.0))
+        lowest = None   #Lowest quality score value found in this column.
+        highest = None  #Highest quality score value found in this column.
+        median = None   #Median quality score value found in this column.
+        qsum = 0.0      #Sum of quality score values for this column.
+        q1 = None       #1st quartile quality score.
+        q3 = None       #3rd quartile quality score.
+        q1_elem = int(round((total+1)/4.0))
+        q3_elem = int(round((total+1)*3/4.0))
+        
+        try:
+            for ind,cnt in enumerate(carr):
+                qsum += ind*cnt
+                
+                if cnt!=0:
+                    highest = ind
+                
+                if lowest==None and cnt!=0:  #first non-zero count
+                    lowest = ind
+                
+                if q1==None:
+                    if sum(carr[:ind+1]) >= q1_elem:
+                        q1 = ind
+                           
+                if median==None:
+                    if sum(carr[:ind+1]) < med_elem:
+                        continue
+                    median = ind
+                    if total%2 == 0: #even number of elements
+                        median2 = median
+                        if sum(carr[:ind+1]) < med_elem+1:
+                            for ind2,elem in enumerate(carr[ind+1:]):
+                                if elem != 0:
+                                    median2 = ind+ind2+1
+                                    break
+                        median = (median + median2)/2.0
+    
+                
+                if q3==None:
+                    if sum(carr[:ind+1]) >= q3_elem:
+                        q3 = ind
+                 
+                
+            mean = qsum/total    #Mean quality score value for this column.
+            iqr = q3-q1
+            left_whisker = max(q1 - 1.5*iqr,lowest)
+            right_whisker = min(q3 + 1.5*iqr,highest)
+            
+            print >>fout,"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(pos+1,total,lowest,highest,qsum,mean,q1,median,q3,iqr,left_whisker,right_whisker)
+        except Exception:
+            invalid_positions += 1
+            nullvals = ['NA']*11
+            print >>fout,"%s\t%s" %(pos+1,'\t'.join(nullvals))
+
+    if invalid_lines:
+        print "Skipped %d reads as invalid." %invalid_lines
+    if invalid_positions:
+        print "Skipped stats computation for %d read positions." %invalid_positions
+        
+if __name__=="__main__":
+    __main__()
+        
+    
diff -r 000000000000 -r 9071e359b9a3 tools/solid_tools/solid_qual_stats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/solid_tools/solid_qual_stats.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,69 @@
+<tool id="solid_qual_stats" name="Compute quality statistics" version="1.0.0">
+    <description>for SOLiD data</description>
+    <command interpreter="python">solid_qual_stats.py $input $output1</command>
+
+    <inputs>
+        <param format="qualsolid" name="input" type="data" label="SOLiD qual file" help="If your dataset doesn't show up in the menu, click the pencil icon next to your dataset and set the datatype to 'qualsolid'" />
+    </inputs>
+    <outputs>
+        <data format="txt" name="output1" metadata_source="input" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="qualscores.qualsolid" />
+            <output name="output1" file="qualsolid.stats" />
+        </test>
+    </tests>
+
+<help>
+
+**What it does**
+
+Creates a quality statistics report for the given SOLiD quality score file.
+
+.. class:: infomark
+
+**TIP:** This statistics report can be used as input for **Quality Boxplot for SOLiD data** tool.
+
+-----
+
+**The output file will contain the following fields:**
+
+* column    = column number (position on the read)
+* count   = number of bases found in this column.
+* min     = Lowest quality score value found in this column.
+* max     = Highest quality score value found in this column.
+* sum     = Sum of quality score values for this column.
+* mean    = Mean quality score value for this column.
+* Q1    = 1st quartile quality score.
+* med   = Median quality score.
+* Q3    = 3rd quartile quality score.
+* IQR   = Inter-Quartile range (Q3-Q1).
+* lW    = 'Left-Whisker' value (for boxplotting).
+* rW    = 'Right-Whisker' value (for boxplotting).
+
+
+
+
+
+**Output Example**::
+
+    column  count   min max sum mean    Q1  med Q3  IQR lW  rW
+    1   6362991 2  32  250734117   20.41   5  9  28  23   2  31
+    2   6362991 2  32  250531036   21.37  10  26 30  20   5  31
+    3   6362991 2  34  248722469   19.09  10  26 30  20   5  31
+    4   6362991 2  34  247654797   18.92  10  26 30  20   5  31
+    .
+    .    
+    32  6362991 2  31  143436943   16.54   3  10  25  22  2  31
+    33  6362991 2  32  114269843   16.96   3  10  25  22  2  31
+    34  6362991 2  29  140638447   12.10   3  10  25  22  2  29
+    35  6362991 2  29  138910532   11.83   3  10  25  22  2  29
+    
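+The quartile columns can be reproduced from a per-position histogram of
+quality scores. A simplified sketch (it assumes at least one count per
+position; quartile conventions vary slightly, and solid_qual_stats.py
+uses a cumulative-count variant)::
+
+    def column_stats( counts ):
+        # counts[q] = number of reads with quality value q at this position
+        scores = [ q for q, n in enumerate( counts ) for i in range( n ) ]
+        total = len( scores )
+        lo, hi = scores[0], scores[-1]
+        mean = sum( scores ) / float( total )
+        med = ( scores[ (total - 1) // 2 ] + scores[ total // 2 ] ) / 2.0
+        q1 = scores[ max( int( round( (total + 1) / 4.0 ) ) - 1, 0 ) ]
+        q3 = scores[ min( int( round( (total + 1) * 3 / 4.0 ) ) - 1, total - 1 ) ]
+        iqr = q3 - q1
+        lW = max( q1 - 1.5 * iqr, lo )  # left whisker, clamped to the data
+        rW = min( q3 + 1.5 * iqr, hi )  # right whisker, clamped to the data
+        return total, lo, hi, mean, q1, med, q3, iqr, lW, rW
+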
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/sr_assembly/velvetg.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_assembly/velvetg.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,301 @@
+<tool id="velvetg" name="velvetg" version="1.0.0">
+  <description>Velvet sequence assembler for very short reads</description>
+  <command interpreter="python">
+    velvetg_wrapper.py
+           '$input.extra_files_path'
+           '$contigs' '$stats' '$LastGraph' '$velvet_asm' '$unused_reads_fasta'
+           #if $generate_amos.afg  == "yes":
+               -amos_file $generate_amos.afg
+           #end if
+           #if $unused_reads.generate_unused  == "yes":
+               -unused_reads $unused_reads.generate_unused
+           #end if
+           $read_trkg
+           #if $coverage.cutoff == "auto":
+               -cov_cutoff auto
+           #elif $coverage.cutoff == "value":
+               -cov_cutoff $coverage.cov_cutoff
+           #end if
+           #if $expected.coverage == "auto":
+               -exp_cov auto
+           #elif $expected.coverage == "value":
+               -exp_cov $expected.cov_cutoff
+           #end if
+           #if $contig_lgth.use_contig_lgth == "yes":
+               -min_contig_lgth $contig_lgth.min_contig_lgth
+           #end if
+           #if $reads.paired == "yes":
+               #if int($reads.ins_length) > 0:
+                   -ins_length $reads.ins_length
+               #end if
+               #if $reads.options.advanced == "yes":
+                   #if int($reads.options.ins_length_sd) > 0:
+                       -ins_length_sd $reads.options.ins_length_sd
+                   #end if
+                   #if int($reads.options.ins_length2) > 0:
+                       -ins_length2 $reads.options.ins_length2
+                   #end if
+                   #if int($reads.options.ins_length2_sd) > 0:
+                       -ins_length2_sd $reads.options.ins_length2_sd
+                   #end if
+                   #if int($reads.options.ins_length_long) > 0:
+                       -ins_length_long $reads.options.ins_length_long
+                   #end if
+                   #if int($reads.options.ins_length_long_sd) > 0:
+                       -ins_length_long_sd $reads.options.ins_length_long_sd
+                   #end if
+                   #if int($reads.options.max_branch_length) > 0:
+                       -max_branch_length $reads.options.max_branch_length
+                   #end if
+                   #if int($reads.options.max_divergence) > 0:
+                       -max_divergence $reads.options.max_divergence
+                   #end if
+                   #if int($reads.options.max_gap_count) > 0:
+                       -max_gap_count $reads.options.max_gap_count
+                   #end if
+                   #if int($reads.options.min_pair_count) > 0:
+                       -min_pair_count $reads.options.min_pair_count
+                   #end if
+                   #if int($reads.options.max_coverage) > 0:
+                       -max_coverage $reads.options.max_coverage
+                   #end if
+                   #if int($reads.options.long_mult_cutoff) > 0:
+                       -long_mult_cutoff $reads.options.long_mult_cutoff
+                   #end if
+                   $reads.options.scaffolding
+               #end if
+           #end if
+  </command>
+  <inputs>
+    <param name="input" type="data" format="velvet" label="Velvet Dataset" help="Prepared by velveth."/>
+    <conditional name="generate_amos">
+      <param name="afg" type="select" label="Generate an AMOS.afg file">
+        <option value="no">No</option>
+        <option value="yes">Yes</option>
+      </param>
+      <when value="no"/>
+      <when value="yes"/>
+    </conditional>
+
+    <conditional name="unused_reads">
+      <param name="generate_unused" type="select" label="Generate a UnusedReads fasta file">
+        <option value="no">No</option>
+        <option value="yes">Yes</option>
+      </param>
+      <when value="no"/>
+      <when value="yes"/>
+    </conditional>
+
+    <conditional name="last_graph">
+      <param name="ge
[...]
+...at node lengths are given in k-mers. To obtain the length in nucleotides of each node you simply need to add k - 1, where k is the word-length used in velveth.
+The in and out columns correspond to the number of arcs on the 5' and 3' ends of the contig respectively.
+The coverages in columns short1 cov, short1 Ocov, short2 cov, and short2 Ocov are provided in k-mer coverage (5.1).
+Also, the difference between # cov and # Ocov is the way these values are computed. In the first count, slightly divergent sequences are added to the coverage tally. However, in the second, stricter count, only the sequences which map perfectly onto the consensus sequence are taken into account.
+
+**LastGraph**
+
+The *LastGraph* file.
+This file describes in its entirety the graph produced by Velvet.
+
+**AMOS.afg**
+
+The *velvet_asm.afg* file.
+This file is mainly designed to be read by the open-source AMOS genome assembly package. Nonetheless, a number of programs are available to transform this kind of file into other assembly file formats (namely ACE, TIGR, Arachne and Celera). See http://amos.sourceforge.net/ for more information.
+The file describes all the contigs contained in the contigs.fa file (cf 4.2.1).
+
+------
+
+**Velvet parameter list**
+
+This is a list of implemented Velvetg options::
+
+  Standard options:
+        -cov_cutoff  floating-point|auto : removal of low coverage nodes AFTER tour bus or allow the system to infer it
+                (default: no removal)
+        -ins_length  integer             : expected distance between two paired end reads (default: no read pairing)
+        -read_trkg  yes|no               : tracking of short read positions in assembly (default: no tracking)
+        -min_contig_lgth  integer        : minimum contig length exported to contigs.fa file (default: hash length * 2)
+        -amos_file  yes|no               : export assembly to AMOS file (default: no export)
+        -exp_cov  floating point|auto    : expected coverage of unique regions or allow the system to infer it
+                (default: no long or paired-end read resolution)
+
+  Advanced options:
+        -ins_length2  integer            : expected distance between two paired-end reads in the second short-read dataset (default: no read pairing)
+        -ins_length_long  integer        : expected distance between two long paired-end reads (default: no read pairing)
+        -ins_length*_sd  integer         : est. standard deviation of respective dataset (default: 10% of corresponding length)
+                [replace '*' by nothing, '2' or '_long' as necessary]
+        -scaffolding  yes|no             : scaffolding of contigs using paired-end information (default: on)
+        -max_branch_length  integer      : maximum length in base pair of bubble (default: 100)
+        -max_divergence  floating-point  : maximum divergence rate between two branches in a bubble (default: 0.2)
+        -max_gap_count  integer          : maximum number of gaps allowed in the alignment of the two branches of a bubble (default: 3)
+        -min_pair_count  integer         : minimum number of paired end connections to justify the scaffolding of two long contigs (default: 10)
+        -max_coverage  floating point    : removal of high coverage nodes AFTER tour bus (default: no removal)
+        -long_mult_cutoff  int           : minimum number of long reads required to merge contigs (default: 2)
+        -unused_reads  yes|no            : export unused reads in UnusedReads.fa file (default: no)
+
+  Output:
+        directory/contigs.fa             : fasta file of contigs longer than twice hash length
+        directory/stats.txt              : stats file (tab-spaced) useful for determining appropriate coverage cutoff
+        directory/LastGraph              : special formatted file with all the information on the final graph
+        directory/velvet_asm.afg         : (if requested) AMOS compatible assembly file
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/sr_assembly/velvetg_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_assembly/velvetg_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+"""
+Galaxy wrapper for the velvetg assembler.
+James E Johnson - University of Minnesota
+"""
+import pkg_resources
+import logging, os, string, sys, tempfile, glob, shutil, types, urllib
+import shlex, subprocess
+from optparse import OptionParser, OptionGroup
+from stat import *
+
+
+log = logging.getLogger( __name__ )
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    s = 'velvetg_wrapper.py:  argv = %s\n' % (sys.argv)
+    # print >> sys.stderr, s # so will appear as blurb for file
+    argcnt = len(sys.argv)
+    working_dir = sys.argv[1]
+    contigs = sys.argv[2]
+    stats = sys.argv[3]
+    LastGraph = sys.argv[4]
+    afgFile = sys.argv[5]
+    unusedReadsFile = sys.argv[6]
+    inputs = string.join(sys.argv[7:],' ')
+    cmdline = 'velvetg %s %s > /dev/null' % (working_dir, inputs)
+    # print >> sys.stderr, cmdline # so will appear as blurb for file
+    try:
+        proc = subprocess.Popen( args=cmdline, shell=True, stderr=subprocess.PIPE )
+        returncode = proc.wait()
+        # get stderr, allowing for case where it's very large
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += proc.stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        stop_err( 'Error running velvetg ' + str( e ) )
+    out = open(contigs,'w')
+    contigs_path = os.path.join(working_dir,'contigs.fa')
+    for line in open( contigs_path ):
+        out.write( "%s" % (line) )
+    out.close()
+    out = open(stats,'w')
+    stats_path = os.path.join(working_dir,'stats.txt')
+    for line in open( stats_path ):
+        out.write( "%s" % (line) )
+    out.close()
+    if LastGraph != 'None':
+        out = open(LastGraph,'w')
+        LastGraph_path = os.path.join(working_dir,'LastGraph')
+        for line in open( LastGraph_path ):
+            out.write( "%s" % (line) )
+        out.close()
+    if afgFile != 'None':
+        out = open(afgFile,'w')
+        afgFile_path = os.path.join(working_dir,'velvet_asm.afg')
+        try:
+            for line in open( afgFile_path ):
+                out.write( "%s" % (line) )
+        except IOError:
+            logging.warn( 'error reading %s' % (afgFile_path) )
+        out.close()
+    if unusedReadsFile != 'None':
+        out = open(unusedReadsFile,'w')
+        unusedReadsFile_path = os.path.join(working_dir,'UnusedReads.fa')
+        try:
+            for line in open( unusedReadsFile_path ):
+                out.write( "%s" % (line) )
+        except IOError:
+            logging.info( 'error reading %s' % (unusedReadsFile_path) )
+        out.close()
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/sr_assembly/velveth.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_assembly/velveth.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,128 @@
+<tool id="velveth" name="velveth" version="1.0.0">
+  <description>Prepare a dataset for the Velvet velvetg Assembler</description>
+  <command interpreter="python">
+    velveth_wrapper.py 
+           '$out_file1' '$out_file1.extra_files_path'
+           $hash_length
+           $strand_specific
+           #for $i in $inputs
+                ${i.file_format}
+                ${i.read_type}
+                ${i.input}
+           #end for
+  </command>
+  <inputs>
+    <param label="Hash Length" name="hash_length" type="select" help="k-mer length in base pairs of the words being hashed.">
+      <option value="11">11</option>
+      <option value="13">13</option>
+      <option value="15">15</option>
+      <option value="17">17</option>
+      <option value="19">19</option>
+      <option value="21" selected="yes">21</option>
+      <option value="23">23</option>
+      <option value="25">25</option>
+      <option value="27">27</option>
+      <option value="29">29</option>
+    </param>
+    <param name="strand_specific" type="boolean" checked="false" truevalue="-strand_specific" falsevalue="" label="Use strand specific transcriptome sequencing" help="If you are using a strand specific transcriptome sequencing protocol, you may wish to use this option for better results."/>
+    <repeat name="inputs" title="Input Files">
+      <param label="file format" name="file_format" type="select">
+        <option value="-fasta" selected="yes">fasta</option>
+        <option value="-fastq">fastq</option>
+        <option value="-eland">eland</option>
+        <option value="-gerald">gerald</option>
+      </param>
+      <param label="read type" name="read_type" type="select">
+        <option value="-short" selected="yes">short reads</option>
+        <option value="-shortPaired">shortPaired reads</option>
+        <option value="-short2">short2 reads</option>
+        <option value="-shortPaired2">shortPaired2 reads</option>
+        <option value="-long">long reads</option>
+        <option value="-longPaired">longPaired reads</option>
+      </param>
+
+      <param name="input" type="data" format="fasta,fastq,eland,gerald" label="Dataset"/>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="velvet" name="out_file1" />
+  </outputs>
+  <requirements>
+    <requirement type="package">velvet</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="hash_length" value="21" />
+      <param name="read_type" value="-shortPaired" />
+      <!-- <repeat name="inputs"> -->
+      <param name="file_format" value="fasta" />
+      <param name="read_type" value="shortPaired reads" />
+      <param name="input" value="velvet_test_reads.fa" ftype="fasta" />
+      <!-- </repeat> -->
+      <param name="strand_specific" value="" />
+      <output name="out_file1" file="velveth_test1/output.html" lines_diff="4">
+        <extra_files type="file" name='Sequences' value="velveth_test1/Sequences" compare="diff" />
+        <extra_files type="file" name='Roadmaps' value="velveth_test1/Roadmaps" compare="diff" />
+      </output>
+    </test>
+  </tests>
+  <help>
+**Velvet Overview**
+
+Velvet_ is a de novo genomic assembler specially designed for short read sequencing technologies, such as Solexa or 454, developed by Daniel Zerbino and Ewan Birney at the European Bioinformatics Institute (EMBL-EBI), near Cambridge, in the United Kingdom.
+
+Velvet currently takes in short read sequences, removes errors, then produces high-quality unique contigs. It then uses paired-end read and long read information, when available, to retrieve the repeated areas between contigs.
+
+Read the Velvet `documentation`__ for details on using the Velvet Assembler.
+
+.. _Velvet: http://www.ebi.ac.uk/~zerbino/velvet/
+
+.. __: http://www.ebi.ac.uk/~zerbino/velvet/Manual.pdf
+
+------
+
+**Velveth**
+
+Velveth takes in a number of sequence files, produces a hashtable, then outputs two files in an output directory (creating it if necessary), Sequences and Roadmaps, which are needed by velvetg.
+
+------
+
+**Hash Length**
+
+The hash length, also known as k-mer length, corresponds to the length, in base pairs, of the words being hashed. 
+
+The hash length is the length of the k-mers being entered in the hash table. Firstly, you must observe three technical constraints::
+
+# it must be an odd number, to avoid palindromes. If you put in an even number, Velvet will just decrement it and proceed.
+# it must be below or equal to MAXKMERLENGTH (cf. 2.3.3, by default 31bp), because it is stored on 64 bits
+# it must be strictly less than the read length, otherwise you simply will not observe any overlaps between reads, for obvious reasons.
+
+Now you still have quite a lot of possibilities. As is often the case, it's a trade-off between specificity and sensitivity. Longer k-mers bring you more specificity (i.e. fewer spurious overlaps) but lower coverage (cf. below)... so there's a sweet spot to be found with time and experience.
+We like to think in terms of "k-mer coverage", i.e. how many times a k-mer has been seen among the reads. The relation between k-mer coverage Ck and standard (nucleotide-wise) coverage C is Ck = C * (L - k + 1)/L where k is your hash length and L your read length.
+Experience shows that this k-mer coverage should be above 10 to start getting decent results. If Ck is above 20, you might be "wasting" coverage. Experience also shows that empirical tests with different values for k are not that costly to run!
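+
+For example, with 50 bp reads at 30x nucleotide coverage and a hash length of 21 (illustrative numbers)::
+
+  Ck = C * (L - k + 1)/L = 30 * (50 - 21 + 1)/50 = 18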
+
+**Input Files**
+
+Velvet works mainly with fasta and fastq formats. For paired-end reads, the assumption is that each read is next to its mate
+read. In other words, if the reads are indexed from 0, then reads 0 and 1 are paired, 2 and 3, 4 and 5, etc.
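+
+For example, a paired-end fasta file is laid out like this (read names are illustrative)::
+
+  >read0/1
+  ACCAGTTGA...
+  >read0/2
+  TTGCAATGC...
+  >read1/1
+  GGATCCTAA...
+  >read1/2
+  CATGGCATT...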
+
+Supported file formats are::
+
+  fasta
+  fastq 
+  fasta.gz 
+  fastq.gz 
+  eland
+  gerald
+
+Read categories are::
+
+  short (default)
+  shortPaired
+  short2 (same as short, but for a separate insert-size library)
+  shortPaired2 (see above)
+  long (for Sanger, 454 or even reference sequences)
+  longPaired
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/sr_assembly/velveth_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_assembly/velveth_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+"""
+Galaxy wrapper for the velveth program.
+James E Johnson - University of Minnesota
+"""
+import pkg_resources
+import logging, os, string, sys, tempfile, glob, shutil, types, urllib
+import shlex, subprocess
+from optparse import OptionParser, OptionGroup
+from stat import *
+
+
+log = logging.getLogger( __name__ )
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    s = 'velveth_wrapper.py:  argv = %s\n' % (sys.argv)
+    argcnt = len(sys.argv)
+    html_file = sys.argv[1]
+    working_dir = sys.argv[2]
+    try: # create the job working directory (tests rely on this)
+        os.makedirs(working_dir)
+    except Exception, e:
+        stop_err( 'Error running velveth ' + str( e ) )
+    hash_length = sys.argv[3]
+    inputs = string.join(sys.argv[4:],' ')
+    cmdline = 'velveth %s %s %s > /dev/null' % (working_dir, hash_length, inputs)
+    try:
+        proc = subprocess.Popen( args=cmdline, shell=True, stderr=subprocess.PIPE )
+        returncode = proc.wait()
+        # get stderr, allowing for case where it's very large
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += proc.stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        stop_err( 'Error running velveth ' + str( e ) )
+    sequences_path = os.path.join(working_dir,'Sequences')
+    roadmaps_path = os.path.join(working_dir,'Roadmaps')
+    rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
+    rval.append('<div>%s<p/></div>' % (cmdline) )
+    rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
+    rval.append( '<li><a href="%s" type="text/plain">%s </a>%s</li>' % (sequences_path,'Sequences','Sequences' ) )
+    rval.append( '<li><a href="%s" type="text/plain">%s </a>%s</li>' % (roadmaps_path,'Roadmaps','Roadmaps' ) )
+    rval.append( '</ul></div></html>' )
+    f = file(html_file,'w')
+    f.write("\n".join( rval ))
+    f.write('\n')
+    f.close()
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/PerM.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/PerM.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,369 @@
+<tool id="PerM" name="Map with PerM" version="1.1.2">
+  <description>for SOLiD and Illumina</description>
+  <!-- works with PerM version 0.2.6 -->
+  <requirements>
+      <requirement type="package">perm</requirement>
+  </requirements>
+  <command>
+    echo -n "PerM "; PerM 2>&amp;1 | grep "Version";
+    PerM
+      #if $s.sourceOfRef.refSource == "history"
+        $s.sourceOfRef.ref
+      #else
+        #if $s.space == "color"
+          "${ filter( lambda x: str( x[0] ) == str( $s.sourceOfRef.index ), $__app__.tool_data_tables[ 'perm_color_indexes' ].get_fields() )[0][-1] }"
+        #elif $s.space == "base"
+          "${ filter( lambda x: str( x[0] ) == str( $s.sourceOfRef.index ), $__app__.tool_data_tables[ 'perm_base_indexes' ].get_fields() )[0][-1] }"
+        #end if
+      #end if
+      #if $s.mate.singleOrPairs == "single":
+        $s.mate.reads
+      #else:
+        -1 $s.mate.reads1 -2 $s.mate.reads2
+        -U $s.mate.upperbound
+        -L $s.mate.lowerbound
+        $s.mate.excludeAmbiguousPairs
+      #end if
+      #if $s.space == "color":
+        --readFormat "csfastq"
+      #else:
+        --readFormat "fastq"
+      #end if
+      #if $int($str($valAlign)) &gt;= 0
+        -v $valAlign
+      #end if
+      #if $align.options == "full":
+        --seed $align.seed
+        -$align.alignments
+        #if $str($align.delimiter) != "None"
+          --delimiter $align.delimiter
+        #end if
+        -T $align.sTrimL
+        $align.includeReadsWN
+        $align.statsOnly
+        $align.ignoreQS
+      #end if
+      #if $str($bUnmappedRead) == "true" and $s.space == "color"
+        -u $unmappedReadOutCS
+      #elif $str($bUnmappedRead) == "true" and $s.space == "base"
+        -u $unmappedReadOut
+      #end if
+      -o $output
+      --outputFormat sam
+      --noSamHeader | tr '\r' '\n' | tr -cd "[:print:]\t\n " | grep "Reads\|Sub0\|Pairs\|single" | sed 's/.*Reads:,//' | sed 's/\/.*dat,_ Sub0/Sub0/'
+  </command>
+  <inputs>
+    <conditional name="s">
+      <param name="space" label="Is your data color space (SOLiD) or base space (Illumina)?" type="select">
+        <option value="color">Color space</option>
+        <option value="base">Base space</option>
+      </param>
+      <when value="color">
+        <conditional name="sourceOfRef">
+          <param name="refSource" label="Will you provide your own reference file from the history or use a built-in index?" type="select">
+            <option value="indexed">Built-in index</option>
+            <option value="history">Fasta file from history</option>
+          </param>
+          <when value="indexed">
+            <param name="index" type="select" label="Select a reference genome (with seed and read length)" help="if your genome of interest is not listed - contact Galaxy team">
+              <options from_data_table="perm_color_indexes"/>
+            </param>
+          </when>
+          <when value="history">
+            <param name="ref" format="fasta" type="data" label="Reference" />
+          </when>
+        </conditional>
+        <conditional name="mate">
+          <param name="singleOrPairs" label="Mate-paired?" type="select">
+            <option value="single">Single-end</option>
+            <option value="paired">Mate pairs</option>
+          </param>
+          <when value="single">
+            <param format="fastqcssanger" name="reads" type="data" label="Reads" />
+          </when>
+          <when value="paired">
+            <param name="reads1" format="fastqcssanger" label="Forward FASTQ file" type="data" />
+            <param name="reads2" format="fastqcssanger" label="Reverse FASTQ file" type="data" />
+            <param label="Upperbound of pairs separation (-U)" name="upperbound" type="integer" size="8" value="100000" />
+            <param label="Lowerbound of pairs separation (-L)" name="lowerbound" type="integer" size="8" value="0" />
+            <param label="Exclude
[...]
+...------------------------------------
+  0x0001  the read is paired in sequencing
+  0x0002  the read is mapped in a proper pair
+  0x0004  the query sequence itself is unmapped
+  0x0008  the mate is unmapped
+  0x0010  strand of the query (1 for reverse)
+  0x0020  strand of the mate
+  0x0040  the read is the first read in a pair
+  0x0080  the read is the second read in a pair
+  0x0100  the alignment is not primary
+
+Here is some sample output::
+
+  Qname   FLAG  Rname  POS     MAPQ  CIGAR  MRNM  MPOS  ISIZE  SEQ  QUAL  NM  CS  CQ
+  491_28_332_F3   16      ref-1   282734  255     35M     *       0       0       AGTCAAACTCCGAATGCCAATGACTTATCCTTAGG    #%%%%%%%!!%%%!!%%%%%%%%!!%%%%%%%%%%      NM:i:3  CS:Z:C0230202330012130103100230121001212        CQ:Z:###################################
+  491_28_332_F3   16      ref-1   269436  255     35M     *       0       0       AGTCAAACTCCGAATGCCAATGACTTATCCTTAGG    #%%%%%%%!!%%%!!%%%%%%%%!!%%%%%%%%%%      NM:i:3  CS:Z:C0230202330012130103100230121001212        CQ:Z:###################################
+
+The user can check a checkbox for optional output containing the unmapped reads in fastqsanger or fastqcssanger. The default is to produce it.
+
+**PerM parameter list**
+
+Below is a list of PerM command-line options. Not all of these are relevant to Galaxy's implementation, but they are included for completeness.
+
+The command for single-end::
+
+  PerM [ref_or_index] [read] [options]
+
+The command for paired-end::
+
+  PerM [ref_or_index] -1 [read1] -2 [read2] [options]
+
+The command-line options::
+
+  -A                Output all alignments within the given mismatch threshold, end-to-end.
+  -B                Output best alignments in terms of mismatches in the given mismatch threshold. [Default]
+  -E                Output only the uniquely mapped reads in the given mismatch threshold.
+  -m                Create the reference index, without reusing the saved index.
+  -s PATH           Save the reference index to accelerate the mapping in the future. If PATH is not specified, the default path will be used.
+  -v INT            Where INT is the number of mismatches allowed in one end. [Default=2]
+  -T INT            Where INT is the length to truncate read length to, so 30 means use only first 30 bases (signals). Leave blank if the full read is meant to be used.
+  -o PATH           Where PATH is for output the mapping of one read set. PerM's output are in .mapping or .sam format, determined by the ext name of PATH. Ex: -o out.sam will output in SAM format; -o out.mapping will output in .mapping format.
+  -d PATH           Where PATH is the directory for multiple read sets.
+  -u PATH           Print the fastq file of those unmapped reads to the file in PATH.
+  --noSamHeader     Print no SAM header so it is convenient to concatenate multiple SAM output files.
+  --includeReadsWN  Encodes N or "." with A or 3, respectively.
+  --statsOnly       Output the mapping statistics in stdout only, without saving alignments to files.
+  --ignoreQS        Ignore the quality scores in fastq or QUAL files.
+  --seed {F2 | S11 | F3 | F4}    Specify the seed pattern, which has a specific full sensitivity. Check the algorithm page (link below) for seed patterns to balance the sensitivity and running time.
+  --readFormat {fasta | fastq | csfasta | csfastq}    Read in reads in the specified format, instead of guessing according to the extension name.
+  --delimiter CHAR  Which is a character used as the delimiter to separate the read id and the additional info in the line with ">" in fasta or csfasta.
+
+Paired reads options::
+
+  -e        Exclude ambiguous pairs.
+  -L INT    Mate-paired separate lower bound.
+  -U INT    Mate-paired separate upper bound.
+  -1 PATH   The forward reads file path.
+  -2 PATH   The reversed reads file path.
+
+See the PerM `algorithm page`__ for information on algorithms and seeds.
+
+ .. __: http://code.google.com/p/perm/wiki/Algorithms
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/bfast_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/bfast_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,344 @@
+#!/usr/bin/env python
+
+"""
+Runs BFAST on single-end or paired-end data.
+TODO: more documentation
+
+TODO:
+    - auto-detect gzip or bz2
+    - split options (?)
+    - queue lengths (?)
+    - assumes reference always has been indexed
+    - main and secondary indexes
+    - scoring matrix file ?
+    - read group file ?
+
+usage: bfast_wrapper.py [options]
+    -r, --ref=r: The reference genome to use or index
+    -f, --fastq=f: The fastq file to use for the mapping
+    -F, --output=u: The file to save the output (SAM format)
+    -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history)
+    -p, --params=p: Parameter setting to use (pre_set or full)
+    -n, --numThreads=n: The number of threads to use
+    -A, --space=A: The encoding space (0: base 1: color)
+    -o, --offsets=o: The offsets for 'match'
+    -l, --loadAllIndexes=l: Load all indexes into memory
+    -k, --keySize=k: truncate key size in 'match'
+    -K, --maxKeyMatches=K: the maximum number of matches to allow before a key is ignored
+    -M, --maxNumMatches=M: the maximum number of matches to allow before the read is discarded
+    -w, --whichStrand=w: the strands to consider (0: both 1: forward 2: reverse)
+    -t, --timing=t: output timing information to stderr
+    -u, --ungapped=u: perform ungapped local alignment
+    -U, --unconstrained=U: perform local alignment without mask constraints
+    -O, --offset=O: the number of bases before and after each hit to consider in local alignment
+    -q, --avgMismatchQuality=q: average mismatch quality
+    -a, --algorithm=a: post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all)
+    -P, --disallowPairing=P: do not choose alignments based on pairing
+    -R, --reverse=R: paired end reads are given on reverse strands
+    -z, --random=z: output a random best scoring alignment
+    -D, --dbkey=D: Dbkey for reference genome
+    -H, --suppressHeader=H: Suppress the sam header
+"""
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to index and use' )
+    parser.add_option( '-f', '--fastq', dest='fastq', help='The fastq file to use for the mapping' )
+    parser.add_option( '-F', '--output', dest='output', help='The file to save the output (SAM format)' )
+    parser.add_option( '-A', '--space', dest='space', type="choice", default='0', choices=('0','1' ), help='The encoding space (0: base 1: color)' )
+    parser.add_option( '-H', '--suppressHeader', action="store_true", dest='suppressHeader', default=False, help='Suppress header' )
+    parser.add_option( '-n', '--numThreads', dest='numThreads', type="int", default="1", help='The number of threads to use' )
+    parser.add_option( '-t', '--timing', action="store_true", default=False, dest='timing', help='output timing information to stderr' )
+    parser.add_option( '-l', '--loadAllIndexes', action="store_true", default=False, dest='loadAllIndexes', help='Load all indexes into memory' )
+    parser.add_option( '-m', '--indexMask', dest='indexMask', help='String containing info on how to build custom indexes' )
+    parser.add_option( "-b", "--buildIndex", action="store_true", dest="buildIndex", default=False, help='String containing info on how to build custom indexes' )
+    parser.add_option( "--indexRepeatMasker", action="store_true", dest="indexRepeatMasker", default=False, help='Do not index lower case sequences. Such as those created by RepeatMasker' )
+    parser.add_option( '--indexContigOptions', dest='indexContigOptions', default="", help='The contig range options to use for the indexing' )
+    parser.add_option( '--indexExonsFileName', dest='indexExonsFile
[...]
+...read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
+                tmp_stderr.close()
+                if returncode != 0:
+                    raise Exception, stderr
+            except Exception, e:
+                raise Exception, 'Error in \'bfast match\'. \n' + str( e )
+            # bfast 'localalign'
+            try:
+                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                tmp_stderr = open( tmp, 'wb' )
+                proc = subprocess.Popen( args=bfast_localalign_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                # get stderr, allowing for case where it's very large
+                tmp_stderr = open( tmp, 'rb' )
+                stderr = ''
+                try:
+                    while True:
+                        stderr += tmp_stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
+                tmp_stderr.close()
+                if returncode != 0:
+                    raise Exception, stderr
+            except Exception, e:
+                raise Exception, 'Error in \'bfast localalign\'. \n' + str( e )
+            # bfast 'postprocess'
+            try:
+                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                tmp_stderr = open( tmp, 'wb' )
+                proc = subprocess.Popen( args=bfast_postprocess_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                # get stderr, allowing for case where it's very large
+                tmp_stderr = open( tmp, 'rb' )
+                stderr = ''
+                try:
+                    while True:
+                        stderr += tmp_stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
+                tmp_stderr.close()
+                if returncode != 0:
+                    raise Exception, stderr
+            except Exception, e:
+                raise Exception, 'Error in \'bfast postprocess\'. \n' + str( e )
+            # remove header if necessary
+            if options.suppressHeader:
+                tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
+                tmp_out_name = tmp_out.name
+                tmp_out.close()
+                try:
+                    shutil.move( options.output, tmp_out_name )
+                except Exception, e:
+                    raise Exception, 'Error moving output file before removing headers. \n' + str( e )
+                fout = file( options.output, 'w' )
+                for line in file( tmp_out.name, 'r' ):
+                    if len( line ) < 3 or line[0:3] not in [ '@HD', '@SQ', '@RG', '@PG', '@CO' ]:
+                        fout.write( line )
+                fout.close()
+            # check that there are results in the output file
+            if os.path.getsize( options.output ) > 0:
+                if "0" == options.space:
+                    sys.stdout.write( 'BFAST run on Base Space data' )
+                else:
+                    sys.stdout.write( 'BFAST run on Color Space data' )
+            else:
+                raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
+        except Exception, e:
+            stop_err( 'The alignment failed.\n' + str( e ) )
+    finally:
+        # clean up temp dir
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+
+if __name__=="__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/bfast_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/bfast_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,379 @@
+<tool id="bfast_wrapper" name="Map with BFAST" version="0.1.3">
+  <description></description>
+  <command interpreter="python">bfast_wrapper.py
+    --numThreads="4" ##HACK: hardcode numThreads for now, should come from a location file
+    --fastq="$input1"
+    #if $input1.extension.startswith( "fastqcs" ):
+        ##if extension starts with fastqcs, then we have a color space file
+        --space="1" ##color space
+    #else
+        --space="0"
+    #end if
+    --output="$output"
+    $suppressHeader
+
+    #if $refGenomeSource.refGenomeSource_type == "history":
+      ##build indexes on the fly
+      --buildIndex
+      --ref="${refGenomeSource.ownFile}"
+      --indexMask="${",".join( [ "%s:%s" % ( str( custom_index.get( 'mask' ) ).strip(), str( custom_index.get( 'hash_width' ) ).strip() ) for custom_index in $refGenomeSource.custom_index ] )}"
+      ${refGenomeSource.indexing_repeatmasker}
+      #if $refGenomeSource.indexing_option.indexing_option_selector == "contig_offset":
+        --indexContigOptions="${refGenomeSource.indexing_option.start_contig},${refGenomeSource.indexing_option.start_pos},${refGenomeSource.indexing_option.end_contig},${refGenomeSource.indexing_option.end_pos}"
+      #elif $refGenomeSource.indexing_option.indexing_option_selector == "exons_file":
+        --indexExonsFileName="${refGenomeSource.indexing_option.exons_file}"
+      #end if
+    #else:
+      ##use precomputed indexes
+      --ref="${ refGenomeSource.indices.fields.path }"
+    #end if
+
+    #if $params.source_select == "full":
+      --offsets="$params.offsets"
+      --keySize="$params.keySize"
+      --maxKeyMatches="$params.maxKeyMatches"
+      --maxNumMatches="$params.maxNumMatches"
+      --whichStrand="$params.whichStrand"
+
+      #if str( $params.scoringMatrixFileName ) != 'None':
+        --scoringMatrixFileName="$params.scoringMatrixFileName"
+      #end if
+      ${params.ungapped}
+      ${params.unconstrained}
+      --offset="${params.offset}"
+      --avgMismatchQuality="${params.avgMismatchQuality}"
+
+      --algorithm="${params.localalign_params.algorithm}"
+      ${params.unpaired}
+      ${params.reverseStrand}
+      #if $params.localalign_params.algorithm == "3":
+        ${params.localalign_params.pairedEndInfer}
+        ${params.localalign_params.randomBest}
+      #end if
+    #end if
+  </command>
+  <inputs>
+    <param name="input1" type="data" format="fastqsanger,fastqcssanger" label="FASTQ file" help="Must have Sanger-scaled quality values with ASCII offset 33"/>
+    <conditional name="refGenomeSource">
+      <param name="refGenomeSource_type" type="select" label="Will you select a reference genome from your history or use a built-in index?">
+        <option value="indexed">Use a built-in index</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="indexed">
+        <param name="indices" type="select" label="Select a reference genome index set">
+          <options from_data_table="bfast_indexes">
+            <filter type="multiple_splitter" column="2" separator=","/>
+            <filter type="param_value" column="2" ref="input1" ref_attribute="extension"/>
+            <filter type="sort_by" column="3"/>
+            <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
+        <repeat name="custom_index" title="Custom index" min="1" >
+            <param name="mask" type="text" value="" label="Specify the mask" size="20">
+              <!-- <validator type="no_options" message="No indexes are available for the selected input dataset"/> need an int validator here, or a regex for all 0/1s -->
+            </param>
+            <param name="hash_widt
[...]
+...umber of bases before and after the match to
+  include in the reference genome
+  -M  INT    Specifies the maximum total number of matches to consider
+  before the read is discarded [384]
+  -q  INT    Specifies the average mismatch quality
+  -n  INT   Specifies the number of threads to use [1]
+  -t         Specifies to output timing information
+
+For **postprocess**::
+
+  -a  INT    Specifies the algorithm to choose the alignment for each end of the read:
+
+    0: No filtering will occur.
+    1: All alignments that pass the filters will be output
+    2: Only consider reads that have been aligned uniquely
+    3: Choose uniquely the alignment with the best score
+    4: Choose all alignments with the best score
+
+  -A  INT    0: NT space 1: Color space [0]
+  -U      Specifies that pairing should not be performed
+  -R          Specifies that paired reads are on opposite strands
+  -q   INT    Specifies the average mismatch quality
+  -x  FILE  Specifies the file name storing the scoring matrix
+  -z          Specifies to output a random best scoring alignment (with -a 3)
+  -r   FILE  Specifies to add the RG in the specified file to the SAM
+  header and updates the RG tag (and LB/PU tags if present) in
+  the reads (SAM only)
+  -n  INT   Specifies the number of threads to use [1]
+  -t         Specifies to output timing information
+
+  </help>
+  <requirements>
+    <requirement type="package">bfast</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger" />
+      <param name="refGenomeSource_type" value="history" />
+      <param name="ownFile" ftype="fasta" value="phiX.fasta" />
+      <param name="mask" value="111111111111111111" />
+      <param name="hash_width" value="14" />
+      <param name="source_select" value="pre_set" />
+      <param name="indexing_repeatmasker" value="False" />
+      <param name="indexing_option_selector" value="default" />
+      <param name="suppressHeader" value="" />
+      <output name="output" ftype="sam" file="bfast_out1.sam" />
+    </test>
+    <test>
+      <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger"/>
+      <param name="refGenomeSource_type" value="history" />
+      <param name="ownFile" ftype="fasta" value="phiX.fasta" />
+      <param name="mask" value="111111111111111111" />
+      <param name="hash_width" value="14" />
+      <param name="source_select" value="pre_set" />
+      <param name="indexing_repeatmasker" value="False" />
+      <param name="indexing_option_selector" value="default" />
+      <param name="suppressHeader" value="--suppressHeader" />
+      <output name="output" ftype="sam" file="bfast_out1.sam" lines_diff="3" /><!-- 3 headers exist in compare file, but headers are suppressed -->
+    </test>
+    <test>
+      <param name="input1" ftype="fastqcssanger" value="random_phiX_1.fastqcssanger" />
+      <param name="refGenomeSource_type" value="history" />
+      <param name="ownFile" ftype="fasta" value="phiX.fasta" />
+      <param name="mask" value="111111111111111111" />
+      <param name="hash_width" value="14" />
+      <param name="source_select" value="pre_set" />
+      <param name="indexing_repeatmasker" value="False" />
+      <param name="indexing_option_selector" value="default" />
+      <param name="suppressHeader" value="" />
+      <output name="output" ftype="sam" file="bfast_out2.sam" />
+    </test>
+    <!-- test of pre-indexed data now -->
+    <test>
+      <param name="input1" ftype="fastqsanger" value="random_phiX_1.fastqsanger" />
+      <param name="refGenomeSource_type" value="indexed" />
+      <param name="indices" value="phiX_nt_50" />
+      <param name="source_select" value="pre_set" />
+      <param name="suppressHeader" value="" />
+      <output name="output" ftype="sam" file="bfast_out3.sam" lines_diff="2" /><!-- MD:Z:11T38 instead of MD:Z:50 on one line -->
+    </test>
+  </tests>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/bowtie_color_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/bowtie_color_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,714 @@
+<tool id="bowtie_color_wrapper" name="Map with Bowtie for SOLiD" version="1.1.2">
+  <requirements><requirement type='package'>bowtie</requirement></requirements>
+  <description></description>
+  <command interpreter="python">
+    bowtie_wrapper.py
+    ## Hackish setting of number of threads
+    --threads="4"
+    ## Outputs
+      --output=$output
+      #if str( $singlePaired.sPaired ) == "single"
+        #if $output_unmapped_reads_l
+          --output_unmapped_reads=$output_unmapped_reads_l
+        #end if
+        #if $output_suppressed_reads_l
+          --output_suppressed_reads=$output_suppressed_reads_l
+        #end if
+      #else
+        #if $output_unmapped_reads_l and $output_unmapped_reads_r
+          --output_unmapped_reads_l=$output_unmapped_reads_l
+          --output_unmapped_reads_r=$output_unmapped_reads_r
+        #end if
+        #if $output_suppressed_reads_l and $output_suppressed_reads_r
+          --output_suppressed_reads_l=$output_suppressed_reads_l
+          --output_suppressed_reads_r=$output_suppressed_reads_r
+        #end if
+      #end if
+    ## Inputs
+    --dataType="solid"
+    --suppressHeader=$suppressHeader
+    --genomeSource=$refGenomeSource.genomeSource
+    #if $refGenomeSource.genomeSource == "history":
+      ##index already exists
+      #if $refGenomeSource.ownFile.extension.startswith( 'bowtie_' ):
+        ##user previously built
+        --ref="${refGenomeSource.ownFile.extra_files_path}/${refGenomeSource.ownFile.metadata.base_name}"
+        --do_not_build_index
+      #else:
+        ##build index on the fly
+        --ref=$refGenomeSource.ownFile
+        --indexSettings=$refGenomeSource.indexParams.indexSettings
+        #if $refGenomeSource.indexParams.indexSettings == "indexFull":
+          --iautoB=$refGenomeSource.indexParams.autoBehavior.autoB
+          #if $refGenomeSource.indexParams.autoBehavior.autoB == "set":
+            --ipacked=$refGenomeSource.indexParams.autoBehavior.packed
+            --ibmax=$refGenomeSource.indexParams.autoBehavior.bmax
+            --ibmaxdivn=$refGenomeSource.indexParams.autoBehavior.bmaxdivn
+            --idcv=$refGenomeSource.indexParams.autoBehavior.dcv
+          #end if
+          --inodc=$refGenomeSource.indexParams.nodc
+          --inoref=$refGenomeSource.indexParams.noref
+          --ioffrate=$refGenomeSource.indexParams.offrate
+          --iftab=$refGenomeSource.indexParams.ftab
+          --intoa=$refGenomeSource.indexParams.ntoa
+          --iendian=$refGenomeSource.indexParams.endian
+          --iseed=$refGenomeSource.indexParams.seed
+          --icutoff=$refGenomeSource.indexParams.cutoff
+        #end if
+      #end if
+    #else
+      ##use pre-built index
+      --ref="${ filter( lambda x: str( x[0] ) == str( $refGenomeSource.index ), $__app__.tool_data_tables[ 'bowtie_indexes_color' ].get_fields() )[0][-1] }"
+    #end if
+    --paired=$singlePaired.sPaired
+    #if $singlePaired.sPaired == "single":
+      --input1=$singlePaired.sInput1
+      --params=$singlePaired.sParams.sSettingsType
+      #if $singlePaired.sParams.sSettingsType == "full":
+        --skip=$singlePaired.sParams.sSkip
+        --alignLimit=$singlePaired.sParams.sAlignLimit
+        --trimH=$singlePaired.sParams.sTrimH
+        --trimL=$singlePaired.sParams.sTrimL
+        --mismatchSeed=$singlePaired.sParams.sMismatchSeed
+        --mismatchQual=$singlePaired.sParams.sMismatchQual
+        --seedLen=$singlePaired.sParams.sSeedLen
+        --rounding=$singlePaired.sParams.sRounding
+        --maqSoapAlign=$singlePaired.sParams.sMaqSoapAlign
+        --tryHard=$singlePaired.sParams.sTryHard
+        --valAlign=$singlePaired.sParams.sValAlign
+        --allValAligns=$singlePaired.sParams.sAllValAligns
+        --suppressAlign=$singlePaired.sParams.sSuppressAlign
+        --best=$singlePaired.sParams.sBestOption.sBest
+        #if $singlePaired.sParams.sBestOption.sBest == "doBest":
+          --maxBacktracks=...
[...]
+  ...ality. Maximum permitted total of quality values at mismatched
+                   read positions. Bowtie rounds quality values to the nearest 10 and saturates
+                   at 30. [70]
+  -l INT           Seed length. The number of bases on the high-quality end of the read to
+                   which the -n ceiling applies. Must be at least 5. [28]
+  --nomaqround     Suppress MAQ rounding. Values are internally rounded to the nearest 10 and
+                   saturate at 30. This option turns off that rounding. [off]
+  -v INT           MAQ- or SOAP-like alignment policy. This option turns off the default
+                   MAQ-like alignment policy in favor of a SOAP-like one: end-to-end alignments
+                   with at most INT mismatches. [off]
+  -I INT           Minimum insert. The minimum insert size for valid paired-end alignments.
+                   Does checking on untrimmed reads if -5 or -3 is used. [0]
+  -X INT           Maximum insert. The maximum insert size for valid paired-end alignments.
+                   Does checking on untrimmed reads if -5 or -3 is used. [250]
+  --fr             Mate orientation. The upstream/downstream mate orientations for a valid
+                   paired-end alignment against the forward reference strand. [--fr]
+  --rf             Mate orientation. [off]
+  --ff             Mate orientation. [off]
+  --pairtries INT  Maximum alignment attempts for paired-end data. [100]
+  --nofw           No forward aligning. Choosing this option means that Bowtie will not attempt
+                   to align against the forward reference strand. [off]
+  --norc           No reverse-complement aligning. Setting this means that Bowtie will not
+                   attempt to align against the reverse-complement reference strand. [off]
+  --maxbts INT     Maximum backtracks. The maximum number of backtracks permitted when aligning
+                   a read in -n 2 or -n 3 mode. [125 without --best] [800 with --best]
+  -y               Try hard. Try as hard as possible to find valid alignments when they exist,
+                   including paired-end alignments. [off]
+  --chunkmbs INT   Thread memory. The number of megabytes of memory a given thread is given to
+                   store path descriptors in --best mode. [32]
+  -k INT           Valid alignments. The number of valid alignments per read or pair. [off]
+  -a               All valid alignments. Choosing this means that all valid alignments per read
+                   or pair will be reported. [off]
+  -m INT           Suppress alignments. Suppress all alignments for a particular read or pair
+                   if more than INT reportable alignments exist for it. [no limit]
+  --best           Best mode. Make Bowtie guarantee that reported singleton alignments are
+                   "best" in terms of stratum (the number of mismatches) and quality values at
+                   mismatched positions. [off]
+  --strata         Best strata. When running in best mode, report only those alignments that
+                   fall into the best stratum when alignments fall into more than one. [off]
+  -o INT           Offrate override. Override the offrate of the index with INT. Some row
+                   markings are discarded when the index is read into memory. INT must be
+                   greater than the value used to build the index (default: 5). [off]
+  --seed INT       Random seed. Use INT as the seed for the pseudo-random number generator. [off]
+  --snpphred INT   Use INT as the SNP penalty for decoding colorspace alignments; it should
+                   correspond to the true ratio of SNPs per base in the subject genome.
+                   [see --snpfrac]
+  --snpfrac DEC    Use DEC as the estimated ratio of SNPs per base when decoding colorspace
+                   alignments. [0.001]
+  --col-keepends   Keep the extreme-end nucleotides and qualities when decoding colorspace
+                   alignments. [off]
+
+  </help>
+</tool>
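For orientation, the Cheetah command template above ultimately reduces to a single bowtie invocation built by bowtie_wrapper.py. A minimal sketch of the kind of call that results (the index base name 'hg18_color' and read file name are hypothetical, and the exact flag set depends on the form settings; only the documented colorspace and MAQ-policy flags are shown)::

    import subprocess

    # Hypothetical SOLiD single-end run: -C selects colorspace, -p the thread
    # count set via --threads="4", -n/-l/-e the MAQ-like policy values above
    cmd = 'bowtie -C -p 4 -n 2 -l 28 -e 70 --best --strata hg18_color reads.fastqcssanger > aligned.out'
    subprocess.check_call( cmd, shell=True )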
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/bowtie_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/bowtie_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,469 @@
+#!/usr/bin/env python
+
+"""
+Runs Bowtie on single-end or paired-end data.
+For use with Bowtie v. 0.12.7.
+
+usage: bowtie_wrapper.py [options]
+    -t, --threads=t: The number of threads to run
+    -o, --output=o: The output file
+    --output_unmapped_reads=: File name for unmapped reads (single-end)
+    --output_unmapped_reads_l=: File name for unmapped reads (left, paired-end)
+    --output_unmapped_reads_r=: File name for unmapped reads (right, paired-end)
+    --output_suppressed_reads=: File name for suppressed reads because of max setting (single-end)
+    --output_suppressed_reads_l=: File name for suppressed reads because of max setting (left, paired-end)
+    --output_suppressed_reads_r=: File name for suppressed reads because of max setting (right, paired-end)
+    -i, --input1=i: The (forward or single-end) reads file in Sanger FASTQ format
+    -I, --input2=I: The reverse reads file in Sanger FASTQ format
+    -4, --dataType=4: The type of data (SOLiD or Solexa)
+    -2, --paired=2: Whether the data is single- or paired-end
+    -g, --genomeSource=g: The type of reference provided
+    -r, --ref=r: The reference genome to use or index
+    -s, --skip=s: Skip the first n reads
+    -a, --alignLimit=a: Only align the first n reads
+    -T, --trimH=T: Trim n bases from the high-quality (left) end of each read before alignment
+    -L, --trimL=L: Trim n bases from the low-quality (right) end of each read before alignment
+    -m, --mismatchSeed=m: Maximum number of mismatches permitted in the seed
+    -M, --mismatchQual=M: Maximum permitted total of quality values at mismatched read positions
+    -l, --seedLen=l: Seed length
+    -n, --rounding=n: Whether or not to round to the nearest 10 and saturate at 30
+    -P, --maqSoapAlign=P: Choose MAQ- or SOAP-like alignment policy
+    -w, --tryHard=: Whether or not to try as hard as possible to find valid alignments when they exist
+    -v, --valAlign=v: Report up to n valid alignments per read
+    -V, --allValAligns=V: Whether or not to report all valid alignments per read
+    -G, --suppressAlign=G: Suppress all alignments for a read if more than n reportable alignments exist
+    -b, --best=b: Whether or not to make Bowtie guarantee that reported singleton alignments are 'best' in terms of stratum and in terms of the quality values at the mismatched positions
+    -B, --maxBacktracks=B: Maximum number of backtracks permitted when aligning a read
+    -R, --strata=R: Whether or not to report only those alignments that fall in the best stratum if many valid alignments exist and are reportable
+    -j, --minInsert=j: Minimum insert size for valid paired-end alignments
+    -J, --maxInsert=J: Maximum insert size for valid paired-end alignments
+    -O, --mateOrient=O: The upstream/downstream mate orientation for valid paired-end alignment against the forward reference strand
+    -A, --maxAlignAttempt=A: Maximum number of attempts Bowtie will make to match an alignment for one mate with an alignment for the opposite mate
+    -f, --forwardAlign=f: Whether or not to attempt to align the forward reference strand
+    -E, --reverseAlign=E: Whether or not to attempt to align the reverse-complement reference strand
+    -F, --offrate=F: Override the offrate of the index to n
+    -8, --snpphred=8: SNP penalty on the Phred scale
+    -6, --snpfrac=6: Fraction of sites expected to be SNP sites
+    -7, --keepends=7: Keep extreme-end nucleotides and qualities
+    -S, --seed=S: Seed for the pseudo-random number generator
+    -C, --params=C: Whether to use default or specified parameters
+    -u, --iautoB=u: Automatic or specified behavior
+    -K, --ipacked=K: Whether or not to use a packed representation for DNA strings
+    -Q, --ibmax=Q: Maximum number of suffixes allowed in a block
+    -Y, --ibmaxdivn=Y: Maximum number of suffixes allowed in a block as a fraction of the length of the reference
+    -D, --idcv=D: The period for the difference-cover sample
+    -U, ...
[...]
+  ...smatchSeed, mismatchQual, seedLen, rounding, minInsert,
+                              maxAlignAttempt, forwardAlign, reverseAlign, maxBacktracks,
+                              tryHard, valAlign, allValAligns, suppressAlign, best,
+                              strata, offrate, seed, snpphred, snpfrac, keepends,
+                              output_unmapped_reads, output_suppressed_reads,
+                              quality_score_encoding )
+        except ValueError, e:
+            # clean up temp dir
+            if os.path.exists( tmp_index_dir ):
+                shutil.rmtree( tmp_index_dir )
+            stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) )
+    try:
+        # have to nest try-except in try-finally to handle 2.4
+        try:
+            # prepare actual mapping commands
+            if options.paired == 'paired':
+                cmd2 = 'bowtie %s %s -1 %s -2 %s > %s' % ( aligning_cmds, ref_file_name, options.input1, options.input2, options.output )
+            else:
+                cmd2 = 'bowtie %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.input1, options.output )
+            # align
+            tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
+            tmp_stderr = open( tmp, 'wb' )
+            proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
+            returncode = proc.wait()
+            tmp_stderr.close()
+            # get stderr, allowing for case where it's very large
+            tmp_stderr = open( tmp, 'rb' )
+            stderr = ''
+            buffsize = 1048576
+            try:
+                while True:
+                    stderr += tmp_stderr.read( buffsize )
+                    if not stderr or len( stderr ) % buffsize != 0:
+                        break
+            except OverflowError:
+                pass
+            tmp_stderr.close()
+            if returncode != 0:
+                raise Exception, stderr
+            # get suppressed and unmapped reads output files in place if appropriate
+            if options.paired == 'paired' and tmp_suppressed_file_name and \
+                               options.output_suppressed_reads_l and options.output_suppressed_reads_r:
+                try:
+                    left = tmp_suppressed_file_name.replace( '.fastq', '_1.fastq' )
+                    right = tmp_suppressed_file_name.replace( '.fastq', '_2.fastq' )
+                    shutil.move( left, options.output_suppressed_reads_l )
+                    shutil.move( right, options.output_suppressed_reads_r )
+                except Exception, e:
+                    sys.stdout.write( 'Error producing the suppressed output file.\n' )
+            if options.paired == 'paired' and tmp_unmapped_file_name and \
+                               options.output_unmapped_reads_l and options.output_unmapped_reads_r:
+                try:
+                    left = tmp_unmapped_file_name.replace( '.fastq', '_1.fastq' )
+                    right = tmp_unmapped_file_name.replace( '.fastq', '_2.fastq' )
+                    shutil.move( left, options.output_unmapped_reads_l )
+                    shutil.move( right, options.output_unmapped_reads_r )
+                except Exception, e:
+                    sys.stdout.write( 'Error producing the unmapped output file.\n' )
+            # check that there are results in the output file
+            if os.path.getsize( options.output ) == 0:
+                raise Exception, 'The output file is empty, there may be an error with your input file or settings.'
+        except Exception, e:
+            stop_err( 'Error aligning sequence. ' + str( e ) )
+    finally:
+        # clean up temp dir
+        if os.path.exists( tmp_index_dir ):
+            shutil.rmtree( tmp_index_dir )
+    stdout += 'Sequence file aligned.\n'
+    sys.stdout.write( stdout )
+
+if __name__=="__main__": __main__()
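The stderr-handling idiom above (point the child's stderr at a temporary file, then read it back in fixed-size blocks) avoids pipe deadlocks and memory spikes when an aligner is very chatty. A simplified standalone restatement of the same pattern, with the block-reading loop's stopping condition tightened to "stop on an empty read" (the command shown is hypothetical, and this sketch assumes Python 2.6+ for NamedTemporaryFile's delete flag)::

    import subprocess, tempfile

    tmp = tempfile.NamedTemporaryFile( delete=False )
    proc = subprocess.Popen( 'bowtie --version', shell=True, stderr=tmp.fileno() )
    returncode = proc.wait()
    tmp.close()
    # read stderr back in 1 MB blocks so a huge log cannot exhaust memory
    stderr = ''
    fh = open( tmp.name, 'rb' )
    while True:
        block = fh.read( 1048576 )
        if not block:
            break
        stderr += block
    fh.close()
    if returncode != 0:
        raise Exception( stderr )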
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/bowtie_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/bowtie_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,838 @@
+<tool id="bowtie_wrapper" name="Map with Bowtie for Illumina" version="1.1.2">
+  <requirements><requirement type='package'>bowtie</requirement></requirements>
+  <description></description>
+  <parallelism method="basic"></parallelism>
+  <command interpreter="python">
+    bowtie_wrapper.py
+      ## Hackish setting of number of threads
+      --threads="4"
+      ## Outputs
+      --output=$output
+      #if str( $singlePaired.sPaired ) == "single"
+        #if $output_unmapped_reads_l
+          --output_unmapped_reads=$output_unmapped_reads_l
+        #end if
+        #if $output_suppressed_reads_l
+          --output_suppressed_reads=$output_suppressed_reads_l
+        #end if
+        --galaxy_input_format="${singlePaired.sInput1.ext}"
+      #else
+        #if $output_unmapped_reads_l and $output_unmapped_reads_r
+          --output_unmapped_reads_l=$output_unmapped_reads_l
+          --output_unmapped_reads_r=$output_unmapped_reads_r
+        #end if
+        #if $output_suppressed_reads_l and $output_suppressed_reads_r
+          --output_suppressed_reads_l=$output_suppressed_reads_l
+          --output_suppressed_reads_r=$output_suppressed_reads_r
+        #end if
+        --galaxy_input_format="${singlePaired.pInput1.ext}"
+      #end if
+      ## Inputs
+      --dataType="solexa" ##this indicates that nucleotide base space is used in the wrapper
+      --suppressHeader=$suppressHeader
+      --genomeSource=$refGenomeSource.genomeSource
+      #if $refGenomeSource.genomeSource == "history":
+        ##index already exists
+        #if $refGenomeSource.ownFile.extension.startswith( 'bowtie_' ):
+          ##user previously built
+          --ref="${refGenomeSource.ownFile.extra_files_path}/${refGenomeSource.ownFile.metadata.base_name}"
+          --do_not_build_index
+        #else:
+          ##build index on the fly
+          --ref=$refGenomeSource.ownFile
+          --indexSettings=$refGenomeSource.indexParams.indexSettings
+          #if $refGenomeSource.indexParams.indexSettings == "indexFull":
+            --iautoB=$refGenomeSource.indexParams.autoBehavior.autoB
+            #if $refGenomeSource.indexParams.autoBehavior.autoB == "set":
+              --ipacked=$refGenomeSource.indexParams.autoBehavior.packed
+              --ibmax=$refGenomeSource.indexParams.autoBehavior.bmax
+              --ibmaxdivn=$refGenomeSource.indexParams.autoBehavior.bmaxdivn
+              --idcv=$refGenomeSource.indexParams.autoBehavior.dcv
+            #end if
+            --inodc=$refGenomeSource.indexParams.nodc
+            --inoref=$refGenomeSource.indexParams.noref
+            --ioffrate=$refGenomeSource.indexParams.offrate
+            --iftab=$refGenomeSource.indexParams.ftab
+            --intoa=$refGenomeSource.indexParams.ntoa
+            --iendian=$refGenomeSource.indexParams.endian
+            --iseed=$refGenomeSource.indexParams.seed
+            --icutoff=$refGenomeSource.indexParams.cutoff
+          #end if
+        #end if
+      #else
+        ##use pre-built index
+        ##--ref="${ filter( lambda x: str( x[0] ) == str( $refGenomeSource.index ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1] }"
+        --ref="${ refGenomeSource.index.fields.path }"
+      #end if
+      --paired=$singlePaired.sPaired
+      #if $singlePaired.sPaired == "single":
+        --input1=$singlePaired.sInput1
+        --params=$singlePaired.sParams.sSettingsType
+        #if $singlePaired.sParams.sSettingsType == "full":
+          --skip=$singlePaired.sParams.sSkip
+          --alignLimit=$singlePaired.sParams.sAlignLimit
+          --trimH=$singlePaired.sParams.sTrimH
+          --trimL=$singlePaired.sParams.sTrimL
+          --mismatchSeed=$singlePaired.sParams.sMismatchSeed
+          --mismatchQual=$singlePaired.sParams.sMismatchQual
+          --seedLen=$singlePaired.sParams.sSeedLen
+          --rounding=$singlePaired.sParams.sRounding
+          --maqSoapAlign=$singlePaired.sPara...
[...]
+  ...-n ceiling applies. Must be at least 5. [28]
+  --nomaqround       Suppress MAQ rounding. Values are internally rounded to the nearest 10 and
+                     saturate at 30. This option turns off that rounding. [off]
+  -v INT             MAQ- or SOAP-like alignment policy. This option turns off the default
+                     MAQ-like alignment policy in favor of a SOAP-like one: end-to-end alignments
+                     with at most INT mismatches. [off]
+  -I INT             Minimum insert. The minimum insert size for valid paired-end alignments.
+                     Does checking on untrimmed reads if -5 or -3 is used. [0]
+  -X INT             Maximum insert. The maximum insert size for valid paired-end alignments.
+                     Does checking on untrimmed reads if -5 or -3 is used. [250]
+  --fr               Mate orientation. The upstream/downstream mate orientations for a valid
+                     paired-end alignment against the forward reference strand. [--fr]
+  --rf               Mate orientation. [off]
+  --ff               Mate orientation. [off]
+  --pairtries INT    Maximum alignment attempts for paired-end data. [100]
+  --nofw             No forward aligning. Choosing this option means that Bowtie will not attempt
+                     to align against the forward reference strand. [off]
+  --norc             No reverse-complement aligning. Setting this means that Bowtie will not
+                     attempt to align against the reverse-complement reference strand. [off]
+  --un FILENAME      Write all reads that could not be aligned to this file. [off]
+  --max FILENAME     Write all reads with a number of valid alignments exceeding the limit
+                     set with the -m option to this file. [off]
+  --maxbts INT       Maximum backtracks. The maximum number of backtracks permitted when aligning
+                     a read in -n 2 or -n 3 mode. [125 without --best] [800 with --best]
+  -y                 Try hard. Try as hard as possible to find valid alignments when they exist,
+                     including paired-end alignments. [off]
+  --chunkmbs INT     Thread memory. The number of megabytes of memory a given thread is given to
+                     store path descriptors in --best mode. [32]
+  -k INT             Valid alignments. The number of valid alignments per read or pair. [off]
+  -a                 All valid alignments. Choosing this means that all valid alignments per read
+                     or pair will be reported. [off]
+  -m INT             Suppress alignments. Suppress all alignments for a particular read or pair
+                     if more than INT reportable alignments exist for it. [no limit]
+  --best             Best mode. Make Bowtie guarantee that reported singleton alignments are
+                     "best" in terms of stratum (the number of mismatches) and quality values at
+                     mismatched positions. [off]
+  --strata           Best strata. When running in best mode, report only those alignments that
+                     fall into the best stratum when alignments fall into more than one. [off]
+  -o INT             Offrate override. Override the offrate of the index with INT. Some row
+                     markings are discarded when the index is read into memory. INT must be
+                     greater than the value used to build the index (default: 5). [off]
+  --seed INT         Random seed. Use INT as the seed for the pseudo-random number generator. [off]
+  --snpphred INT     Use INT as the SNP penalty for decoding colorspace alignments; it should
+                     correspond to the true ratio of SNPs per base in the subject genome.
+                     [see --snpfrac]
+  --snpfrac DEC      Use DEC as the estimated ratio of SNPs per base when decoding colorspace
+                     alignments. [0.001]
+  --col-keepends     Keep the extreme-end nucleotides and qualities when decoding colorspace
+                     alignments. [off]
+
+  </help>
+</tool>
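Each Galaxy form parameter in the template above becomes one bowtie flag on the wrapper's command line. A minimal sketch of that translation step (the build_aligning_cmds function and the params dict are hypothetical; the flag names are the standard bowtie 0.12 short options for skip, align limit, trimming, and the -n policy)::

    def build_aligning_cmds( params ):
        # map wrapper option names to bowtie flags, skipping unset values
        flag_map = { 'skip': '-s', 'alignLimit': '-u', 'trimH': '-5', 'trimL': '-3',
                     'mismatchSeed': '-n', 'mismatchQual': '-e', 'seedLen': '-l' }
        parts = []
        for name, flag in flag_map.items():
            value = params.get( name )
            if value not in ( None, '', 'None' ):
                parts.append( '%s %s' % ( flag, value ) )
        return ' '.join( parts )

    print build_aligning_cmds( { 'mismatchSeed': 2, 'seedLen': 28 } )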
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/bwa_color_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/bwa_color_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,482 @@
+<tool id="bwa_color_wrapper" name="Map with BWA for SOLiD" version="1.0.1">
+  <description></description>
+  <parallelism method="basic"></parallelism>
+  <command interpreter="python">
+    bwa_wrapper.py
+      --threads="4"
+      --color-space
+
+      ## reference source
+      --fileSource=$genomeSource.refGenomeSource
+      #if $genomeSource.refGenomeSource == "history":
+        ##build index on the fly
+        --ref="${genomeSource.ownFile}"
+        --dbkey=$dbkey
+      #else:
+        ##use precomputed indexes
+        --ref="${ filter( lambda x: str( x[0] ) == str( $genomeSource.indices ), $__app__.tool_data_tables[ 'bwa_indexes_color' ].get_fields() )[0][-1] }"
+        --do_not_build_index
+      #end if
+
+      ## input file(s)
+      --input1=$paired.input1
+      #if $paired.sPaired == "paired":
+        --input2=$paired.input2
+      #end if
+
+      ## output file
+      --output=$output
+
+      ## run parameters
+      --genAlignType=$paired.sPaired
+      --params=$params.source_select
+      #if $params.source_select != "pre_set":
+        --maxEditDist=$params.maxEditDist
+        --fracMissingAligns=$params.fracMissingAligns
+        --maxGapOpens=$params.maxGapOpens
+        --maxGapExtens=$params.maxGapExtens
+        --disallowLongDel=$params.disallowLongDel
+        --disallowIndel=$params.disallowIndel
+        --seed=$params.seed
+        --maxEditDistSeed=$params.maxEditDistSeed
+        --mismatchPenalty=$params.mismatchPenalty
+        --gapOpenPenalty=$params.gapOpenPenalty
+        --gapExtensPenalty=$params.gapExtensPenalty
+        --suboptAlign=$params.suboptAlign
+        --noIterSearch=$params.noIterSearch
+        --outputTopN=$params.outputTopN
+        --outputTopNDisc=$params.outputTopNDisc
+        --maxInsertSize=$params.maxInsertSize
+        --maxOccurPairing=$params.maxOccurPairing
+        #if $params.readGroup.specReadGroup == "yes"
+          --rgid="$params.readGroup.rgid"
+          --rgcn="$params.readGroup.rgcn"
+          --rgds="$params.readGroup.rgds"
+          --rgdt="$params.readGroup.rgdt"
+          --rgfo="$params.readGroup.rgfo"
+          --rgks="$params.readGroup.rgks"
+          --rglb="$params.readGroup.rglb"
+          --rgpg="$params.readGroup.rgpg"
+          --rgpi="$params.readGroup.rgpi"
+          --rgpl="$params.readGroup.rgpl"
+          --rgpu="$params.readGroup.rgpu"
+          --rgsm="$params.readGroup.rgsm"
+        #end if
+      #end if
+
+      ## suppress output SAM header
+      --suppressHeader=$suppressHeader
+  </command>
+  <requirements>
+    <requirement type="package">bwa</requirement>
+  </requirements>
+  <inputs>
+    <conditional name="genomeSource">
+      <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?">
+        <option value="indexed">Use a built-in index</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="indexed">
+        <param name="indices" type="select" label="Select a reference genome">
+          <options from_data_table="bwa_indexes_color">
+            <filter type="sort_by" column="2" />
+            <validator type="no_options" message="No indexes are available for the selected input dataset" />
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
+      </when>
+    </conditional>
+    <conditional name="paired">
+      <param name="sPaired" type="select" label="Is this library mate-paired?">
+        <option value="single">Single-end</option>
+        <option value="paired">Paired-end</option>
+      </param>
+      <when value="single">
+        <param name="input1" type="data" format="fastqcssanger" label="FASTQ file (Nucleotide-space recoded from color-space)">
+          <help>Convert color...
[...]
+  ...p towards the 3'-end [16]
+  -i INT  Disallow an indel within INT bp towards the ends [5]
+  -l INT  Take the first INT subsequence as seed. If INT is larger than the
+          query sequence, seeding will be disabled. For long reads, this option
+          typically ranges from 25 to 35 for '-k 2'. [inf]
+  -k INT  Maximum edit distance in the seed [2]
+  -t INT  Number of threads (multi-threading mode) [1]
+  -M INT  Mismatch penalty. BWA will not search for suboptimal hits with a score
+          lower than (bestScore-misMsc). [3]
+  -O INT  Gap open penalty [11]
+  -E INT  Gap extension penalty [4]
+  -c      Reverse the query but do not complement it, as is required for alignment
+          in color space.
+  -R      Proceed with suboptimal alignments even if the top hit is a repeat. By
+          default, BWA only searches for suboptimal alignments if the top hit is
+          unique. Using this option has no effect on accuracy for single-end
+          reads. It is mainly designed for improving the alignment accuracy of
+          paired-end reads. However, the pairing procedure will be slowed down,
+          especially for very short reads (~32bp).
+  -N      Disable iterative search. All hits with no more than maxDiff
+          differences will be found. This mode is much slower than the default.
+
+For **samse**::
+
+  -n INT  Maximum number of alignments to output in the XA tag for reads paired
+          properly. If a read has more than INT hits, the XA tag will not be
+          written. [3]
+  -r STR  Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null]
+
+For **sampe**::
+
+  -a INT  Maximum insert size for a read pair to be considered as being mapped
+          properly. Since version 0.4.5, this option is only used when there
+          are not enough good alignments to infer the distribution of insert
+          sizes. [500]
+  -n INT  Maximum number of alignments to output in the XA tag for reads paired
+          properly. If a read has more than INT hits, the XA tag will not be
+          written. [3]
+  -N INT  Maximum number of alignments to output in the XA tag for discordant
+          read pairs (excluding singletons). If a read has more than INT hits,
+          the XA tag will not be written. [10]
+  -o INT  Maximum occurrences of a read for pairing. A read with more
+          occurrences will be treated as a single-end read. Reducing this
+          parameter helps faster pairing. [100000]
+  -r STR  Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null]
+
+For specifying the read group in **samse** or **sampe**, use the following::
+
+  @RG   Read group. Unordered multiple @RG lines are allowed.
+  ID    Read group identifier. Each @RG line must have a unique ID. The value of
+        ID is used in the RG tags of alignment records. Must be unique among all
+        read groups in the header section. Read group IDs may be modified when
+        merging SAM files in order to handle collisions.
+  CN    Name of the sequencing center producing the read.
+  DS    Description.
+  DT    Date the run was produced (ISO8601 date or date/time).
+  FO    Flow order. The array of nucleotide bases that correspond to the
+        nucleotides used for each flow of each read. Multi-base flows are encoded
+        in IUPAC format, and non-nucleotide flows by various other characters.
+        Format: /\*|[ACMGRSVTWYHKDBN]+/
+  KS    The array of nucleotide bases that correspond to the key sequence of each read.
+  LB    Library.
+  PG    Programs used for processing the read group.
+  PI    Predicted median insert size.
+  PL    Platform/technology used to produce the reads. Valid values: CAPILLARY,
+        LS454, ILLUMINA, SOLID, HELICOS, IONTORRENT and PACBIO.
+  PU    Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for
+        SOLiD). Unique identifier.
+  SM    Sample. Use pool name where a pool is being sequenced.
+
+  </help>
+</tool>
+
+
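The single-end input above must be color-space reads recoded into nucleotide space ("double encoded"); bwa_wrapper.py, later in this changeset, verifies this with its check_is_double_encoded function. A stripped-down sketch of the same test (the function name here is hypothetical)::

    def looks_double_encoded( read ):
        # double-encoded colorspace reads look like plain bases; native
        # colorspace reads are a primer base followed by color digits 0-3
        bases = set( 'ACGTNacgtn' )
        return all( ch in bases for ch in read )

    assert looks_double_encoded( 'ACGTN' )        # recoded, accepted
    assert not looks_double_encoded( 'T0120' )    # native colorspace, rejected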
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/bwa_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/bwa_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,342 @@
+#!/usr/bin/env python
+
+"""
+Runs BWA on single-end or paired-end data.
+Produces a SAM file containing the mappings.
+Works with BWA version 0.5.9.
+
+usage: bwa_wrapper.py [options]
+
+See below for options
+"""
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def check_is_double_encoded( fastq ):
+    # check that the first read is bases, not one base followed by numbers
+    bases = [ 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N' ]
+    nums = [ '0', '1', '2', '3' ]
+    for line in file( fastq, 'rb' ):
+        if not line.strip() or line.startswith( '@' ):
+            continue
+        if len( [ b for b in line.strip() if b in nums ] ) > 0:
+            return False
+        elif line.strip()[0] in bases and len( [ b for b in line.strip() if b in bases ] ) == len( line.strip() ):
+            return True
+        else:
+            raise Exception, 'First line in first read does not appear to be a valid FASTQ read in either base-space or color-space'
+    raise Exception, 'There are no non-comment, non-blank lines in your FASTQ file'
+
+def __main__():
+    # parse the command line
+    parser = optparse.OptionParser()
+    parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' )
+    parser.add_option( '-c', '--color-space', dest='color_space', action='store_true', help='If the input files are SOLiD format' )
+    parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
+    parser.add_option( '-f', '--input1', dest='fastq', help='The (forward) fastq file to use for the mapping' )
+    parser.add_option( '-F', '--input2', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data' )
+    parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' )
+    parser.add_option( '-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)' )
+    parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
+    parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one from the history (indexed or history)' )
+    parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' )
+    parser.add_option( '-m', '--fracMissingAligns', dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' )
+    parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' )
+    parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' )
+    parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within the specified bps' )
+    parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow an indel within the specified bps' )
+    parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences as seed' )
+    parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance in the seed' )
+    parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' )
+    parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' )
+    parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' )
+    parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', help='Proceed with suboptimal alignments even if the top hit is a repeat' )
+    parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' )
+    parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Maximum number of alignments to output in the XA tag for reads paired properly' )
+    parser.add_option( '', ...
[...]
+                            ...break
+                except OverflowError:
+                    pass
+                tmp_stderr.close()
+                if returncode != 0:
+                    raise Exception, stderr
+            except Exception, e:
+                raise Exception, 'Error aligning sequence. ' + str( e )
+            # and again if paired data
+            try:
+                if cmd2b:
+                    tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                    tmp_stderr = open( tmp, 'wb' )
+                    proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                    returncode = proc.wait()
+                    tmp_stderr.close()
+                    # get stderr, allowing for case where it's very large
+                    tmp_stderr = open( tmp, 'rb' )
+                    stderr = ''
+                    try:
+                        while True:
+                            stderr += tmp_stderr.read( buffsize )
+                            if not stderr or len( stderr ) % buffsize != 0:
+                                break
+                    except OverflowError:
+                        pass
+                    tmp_stderr.close()
+                    if returncode != 0:
+                        raise Exception, stderr
+            except Exception, e:
+                raise Exception, 'Error aligning second sequence. ' + str( e )
+            # generate alignments
+            try:
+                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                tmp_stderr = open( tmp, 'wb' )
+                proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                # get stderr, allowing for case where it's very large
+                tmp_stderr = open( tmp, 'rb' )
+                stderr = ''
+                try:
+                    while True:
+                        stderr += tmp_stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
+                tmp_stderr.close()
+                if returncode != 0:
+                    raise Exception, stderr
+            except Exception, e:
+                raise Exception, 'Error generating alignments. ' + str( e )
+            # remove header if necessary
+            if options.suppressHeader == 'true':
+                tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir )
+                tmp_out_name = tmp_out.name
+                tmp_out.close()
+                try:
+                    shutil.move( options.output, tmp_out_name )
+                except Exception, e:
+                    raise Exception, 'Error moving output file before removing headers. ' + str( e )
+                fout = file( options.output, 'w' )
+                for line in file( tmp_out_name, 'r' ):
+                    if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
+                        fout.write( line )
+                fout.close()
+            # check that there are results in the output file
+            if os.path.getsize( options.output ) > 0:
+                sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType )
+            else:
+                raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
+        except Exception, e:
+            stop_err( 'The alignment failed.\n' + str( e ) )
+    finally:
+        # clean up temp dirs
+        if os.path.exists( tmp_index_dir ):
+            shutil.rmtree( tmp_index_dir )
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+
+if __name__=="__main__": __main__()
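The suppressHeader branch above filters out the five SAM header record types line by line. Pulled out as a standalone helper (the function name is hypothetical), the same logic is::

    def strip_sam_header( in_path, out_path ):
        # drop @HD/@SQ/@RG/@PG/@CO header records; keep alignment lines
        header_tags = ( '@HD', '@SQ', '@RG', '@PG', '@CO' )
        out = open( out_path, 'w' )
        for line in open( in_path ):
            if not line.startswith( header_tags ):
                out.write( line )
        out.close()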
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/bwa_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/bwa_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,474 @@
+<tool id="bwa_wrapper" name="Map with BWA for Illumina" version="1.2.2">
+  <description></description>
+  <parallelism method="basic"></parallelism>
+  <command interpreter="python">
+    bwa_wrapper.py
+      --threads="4"
+
+      #if $input1.ext == "fastqillumina":
+            --illumina1.3
+      #end if
+
+      ## reference source
+      --fileSource=$genomeSource.refGenomeSource
+      #if $genomeSource.refGenomeSource == "history":
+        ##build index on the fly
+        --ref="${genomeSource.ownFile}"
+        --dbkey=$dbkey
+      #else:
+        ##use precomputed indexes
+        --ref="${ filter( lambda x: str( x[0] ) == str( $genomeSource.indices ), $__app__.tool_data_tables[ 'bwa_indexes' ].get_fields() )[0][-1] }"
+        --do_not_build_index
+      #end if
+
+      ## input file(s)
+      --input1=$paired.input1
+      #if $paired.sPaired == "paired":
+        --input2=$paired.input2
+      #end if
+
+      ## output file
+      --output=$output
+
+      ## run parameters
+      --genAlignType=$paired.sPaired
+      --params=$params.source_select
+      #if $params.source_select != "pre_set":
+        --maxEditDist=$params.maxEditDist
+        --fracMissingAligns=$params.fracMissingAligns
+        --maxGapOpens=$params.maxGapOpens
+        --maxGapExtens=$params.maxGapExtens
+        --disallowLongDel=$params.disallowLongDel
+        --disallowIndel=$params.disallowIndel
+        --seed=$params.seed
+        --maxEditDistSeed=$params.maxEditDistSeed
+        --mismatchPenalty=$params.mismatchPenalty
+        --gapOpenPenalty=$params.gapOpenPenalty
+        --gapExtensPenalty=$params.gapExtensPenalty
+        --suboptAlign=$params.suboptAlign
+        --noIterSearch=$params.noIterSearch
+        --outputTopN=$params.outputTopN
+        --outputTopNDisc=$params.outputTopNDisc
+        --maxInsertSize=$params.maxInsertSize
+        --maxOccurPairing=$params.maxOccurPairing
+        #if $params.readGroup.specReadGroup == "yes"
+          --rgid="$params.readGroup.rgid"
+          --rgcn="$params.readGroup.rgcn"
+          --rgds="$params.readGroup.rgds"
+          --rgdt="$params.readGroup.rgdt"
+          --rgfo="$params.readGroup.rgfo"
+          --rgks="$params.readGroup.rgks"
+          --rglb="$params.readGroup.rglb"
+          --rgpg="$params.readGroup.rgpg"
+          --rgpi="$params.readGroup.rgpi"
+          --rgpl="$params.readGroup.rgpl"
+          --rgpu="$params.readGroup.rgpu"
+          --rgsm="$params.readGroup.rgsm"
+        #end if
+      #end if
+
+      ## suppress output SAM header
+      --suppressHeader=$suppressHeader
+  </command>
+  <requirements>
+    <requirement type="package">bwa</requirement>
+  </requirements>
+  <inputs>
+    <conditional name="genomeSource">
+      <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?">
+        <option value="indexed">Use a built-in index</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="indexed">
+        <param name="indices" type="select" label="Select a reference genome">
+          <options from_data_table="bwa_indexes">
+            <filter type="sort_by" column="2" />
+            <validator type="no_options" message="No indexes are available" />
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
+      </when>
+    </conditional>
+    <conditional name="paired">
+      <param name="sPaired" type="select" label="Is this library mate-paired?">
+        <option value="single">Single-end</option>
+        <option value="paired">Paired-end</option>
+      </param>
+      <when value="single">
+        <param name="input1" type="data" format="fastqsanger,fastqillumina" label="FASTQ file" help="FASTQ with either Sanger-scaled quali...
[...]
+  ...wards the 3'-end [16]
+  -i INT  Disallow an indel within INT bp towards the ends [5]
+  -l INT  Take the first INT subsequence as seed. If INT is larger than the
+          query sequence, seeding will be disabled. For long reads, this option
+          typically ranges from 25 to 35 for '-k 2'. [inf]
+  -k INT  Maximum edit distance in the seed [2]
+  -t INT  Number of threads (multi-threading mode) [1]
+  -M INT  Mismatch penalty. BWA will not search for suboptimal hits with a score
+          lower than (bestScore-misMsc). [3]
+  -O INT  Gap open penalty [11]
+  -E INT  Gap extension penalty [4]
+  -c      Reverse the query but do not complement it, as is required for alignment
+          in color space.
+  -R      Proceed with suboptimal alignments even if the top hit is a repeat. By
+          default, BWA only searches for suboptimal alignments if the top hit is
+          unique. Using this option has no effect on accuracy for single-end
+          reads. It is mainly designed for improving the alignment accuracy of
+          paired-end reads. However, the pairing procedure will be slowed down,
+          especially for very short reads (~32bp).
+  -N      Disable iterative search. All hits with no more than maxDiff
+          differences will be found. This mode is much slower than the default.
+
+For **samse**::
+
+  -n INT  Maximum number of alignments to output in the XA tag for reads paired
+          properly. If a read has more than INT hits, the XA tag will not be
+          written. [3]
+  -r STR  Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null]
+
+For **sampe**::
+
+  -a INT  Maximum insert size for a read pair to be considered as being mapped
+          properly. Since version 0.4.5, this option is only used when there
+          are not enough good alignments to infer the distribution of insert
+          sizes. [500]
+  -n INT  Maximum number of alignments to output in the XA tag for reads paired
+          properly. If a read has more than INT hits, the XA tag will not be
+          written. [3]
+  -N INT  Maximum number of alignments to output in the XA tag for discordant
+          read pairs (excluding singletons). If a read has more than INT hits,
+          the XA tag will not be written. [10]
+  -o INT  Maximum occurrences of a read for pairing. A read with more
+          occurrences will be treated as a single-end read. Reducing this
+          parameter helps faster pairing. [100000]
+  -r STR  Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null]
+
+For specifying the read group in **samse** or **sampe**, use the following::
+
+  @RG   Read group. Unordered multiple @RG lines are allowed.
+  ID    Read group identifier. Each @RG line must have a unique ID. The value of
+        ID is used in the RG tags of alignment records. Must be unique among all
+        read groups in the header section. Read group IDs may be modified when
+        merging SAM files in order to handle collisions.
+  CN    Name of the sequencing center producing the read.
+  DS    Description.
+  DT    Date the run was produced (ISO8601 date or date/time).
+  FO    Flow order. The array of nucleotide bases that correspond to the
+        nucleotides used for each flow of each read. Multi-base flows are encoded
+        in IUPAC format, and non-nucleotide flows by various other characters.
+        Format: /\*|[ACMGRSVTWYHKDBN]+/
+  KS    The array of nucleotide bases that correspond to the key sequence of each read.
+  LB    Library.
+  PG    Programs used for processing the read group.
+  PI    Predicted median insert size.
+  PL    Platform/technology used to produce the reads. Valid values: CAPILLARY,
+        LS454, ILLUMINA, SOLID, HELICOS, IONTORRENT and PACBIO.
+  PU    Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for
+        SOLiD). Unique identifier.
+  SM    Sample. Use pool name where a pool is being sequenced.
+
+  </help>
+</tool>
+
+
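The --rg* options above are stitched into the single read-group string that bwa samse/sampe takes via -r, in the '@RG\tID:foo\tSM:bar' format documented in the help. A minimal sketch of such a builder (the function is hypothetical; only the @RG format itself comes from the help text)::

    def build_read_group( rgid, rgsm, **extra ):
        # ID and SM are the minimum most downstream tools expect
        fields = [ '@RG', 'ID:%s' % rgid, 'SM:%s' % rgsm ]
        for tag, value in sorted( extra.items() ):
            if value:
                fields.append( '%s:%s' % ( tag.upper(), value ) )
        # join with a literal backslash-t sequence, as the -r examples show
        return '\\t'.join( fields )

    # e.g. bwa sampe -r '@RG\tID:foo\tSM:bar\tPL:ILLUMINA' ...
    print build_read_group( 'foo', 'bar', pl='ILLUMINA' )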
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/fastq_statistics.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/fastq_statistics.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,89 @@
+<tool id="cshl_fastq_statistics" name="FASTQ Statistics">
+  <description>for Solexa files</description>
+  <command>cat $input | solexa_quality_statistics -o $output</command>
+  <inputs>
+    <param format="fastqsolexa" name="input" type="data" label="Library to analyze" />
+  </inputs>
+  <outputs>
+    <data format="txt" name="output" />
+  </outputs>
+  <help>
+
+**What it does**
+
+Creates a quality statistics report for the given Solexa/FASTQ library.
+
+-----
+
+**The output file will contain the following fields:**
+
+* column  = column number (1 to 36 for a 36-cycle Solexa read file)
+* count   = number of bases found in this column
+* min     = lowest quality score value found in this column
+* max     = highest quality score value found in this column
+* sum     = sum of quality score values for this column
+* mean    = mean quality score value for this column
+* Q1      = 1st quartile quality score
+* med     = median quality score
+* Q3      = 3rd quartile quality score
+* IQR     = inter-quartile range (Q3-Q1)
+* lW      = 'left whisker' value (for boxplotting)
+* rW      = 'right whisker' value (for boxplotting)
+* A_Count = count of 'A' nucleotides found in this column
+* C_Count = count of 'C' nucleotides found in this column
+* G_Count = count of 'G' nucleotides found in this column
+* T_Count = count of 'T' nucleotides found in this column
+* N_Count = count of 'N' nucleotides found in this column
+
+
+.. class:: infomark
+
+**TIP:** This statistics report can be used as input for **Quality Score** and **Nucleotides Distribution** tools.
+
+
+**Output Example**::
+
+    column count min max sum mean Q1 med Q3 IQR lW rW A_Count C_Count G_Count T_Count N_Count
+    1 6362991 -4 40 250734117 39.41 40 40 40 0 40 40 1396976 1329101 678730 2958184 0
+    2 6362991 -5 40 250531036 39.37 40 40 40 0 40 40 1786786 1055766 1738025 1782414 0
+    3 6362991 -5 40 248722469 39.09 40 40 40 0 40 40 2296384 984875 1443989 1637743 0
+    4 6362991 -5 40 247654797 38.92 40 40 40 0 40 40 1683197 1410855 1722633 1546306 0
+    5 6362991 -4 40 248214827 39.01 40 40 40 0 40 40 2536861 1167423 1248968 1409739 0
+    6 6362991 -5 40 248499903 39.05 40 40 40 0 40 40 1598956 1236081 1568608 1959346 0
+    7 6362991 -4 40 247719760 38.93 40 40 40 0 40 40 1692667 1822140 1496741 1351443 0
+    8 6362991 -5 40 245745205 38.62 40 40 40 0 40 40 2230936 1343260 1529928 1258867 0
+    9 6362991 -5 40 245766735 38.62 40 40 40 0 40 40 1702064 1306257 1336511 2018159 0
+    10 6362991 -5 40 245089706 38.52 40 40 40 0 40 40 1519917 1446370 1450995 1945709 0
+    11 6362991 -5 40 242641359 38.13 40 40 40 0 40 40 1717434 1282975 1387804 1974778 0
+    12 6362991 -5 40 242026113 38.04 40 40 40 0 40 40 1662872 1202041 1519721 1978357 0
+    13 6362991 -5 40 238704245 37.51 40 40 40 0 40 40 1549965 1271411 1973291 1566681 1643
+    14 6362991 -5 40 235622401 37.03 40 40 40 0 40 40 2101301 1141451 1603990 1515774 475
+    15 6362991 -5 40 230766669 36.27 40 40 40 0 40 40 2344003 1058571 1440466 1519865 86
+    16 6362991 -5 40 224466237 35.28 38 40 40 2 35 40 2203515 1026017 1474060 1651582 7817
+    17 6362991 -5 40 219990002 34.57 34 40 40 6 25 40 1522515 1125455 2159183 1555765 73
+    18 6362991 -5 40 214104778 33.65 30 40 40 10 15 40 1479795 2068113 1558400 1249337 7346
+    19 6362991 -5 40 212934712 33.46 30 40 40 10 15 40 1432749 1231352 1769799 1920093 8998
+    20 6362991 -5 40 212787944 33.44 29 40 40 11 13 40 1311657 1411663 2126316 1513282 73
+    21 6362991 -5 40 211369187 33.22 28 40 40 12 10 40 1887985 1846300 1300326 1318380 10000
+    22 6362991 -5 40 213371720 33.53 30 40 40 10 15 40 542299 3446249 516615 1848190 9638
+    23 6362991 -5 40 221975899 34.89 36 40 40 4 30 40 347679 1233267 926621 3855355 69
+    24 6362991 -5 40 194378421 30.55 21 40 40 19 -5 40 433560 674358 3262764 1992242 67
+    25 6362991 -5 40 199773985 31.40 23 40 40 17 -2 40 944760 325595 1322800 3769641 195
+    26 6362991 -5 40 179404759 28.20 17 34 40 23 -5 40 3457922 156013 1494664 1254293 99
+    27 6362991 -5 40 163386668 25.68 13 28 40 27 -5 40 1392177 281250 3867895 821491 178
+    28 6362991 -5 40 156230534 24.55 12 25 40 28 -5 40 907189 981249 4174945 299437 171
+    29 6362991 -5 40 163236046 25.65 13 28 40 27 -5 40 1097171 3418678 1567013 280008 121
+    30 6362991 -5 40 151309826 23.78 12 23 40 28 -5 40 3514775 2036194 566277 245613 132
+    31 6362991 -5 40 141392520 22.22 10 21 40 30 -5 40 1569000 4571357 124732 97721 181
+    32 6362991 -5 40 143436943 22.54 10 21 40 30 -5 40 1453607 4519441 38176 351107 660
+    33 6362991 -5 40 114269843 17.96 6 14 30 24 -5 40 3311001 2161254 155505 734297 934
+    34 6362991 -5 40 140638447 22.10 10 20 40 30 -5 40 1501615 1637357 18113 3205237 669
+    35 6362991 -5 40 138910532 21.83 10 20 40 30 -5 40 1532519 3495057 23229 1311834 352
+    36 6362991 -5 40 117158566 18.41 7 15 30 23 -5 40 4074444 1402980 63287 822035 245
+    
+
+</help>
+</tool>
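For a sense of how figures like those above are derived, here is a rough sketch that recomputes a few of the per-column fields (count, min, max, mean) from a FASTQ file. It assumes Sanger/Phred+33 quality encoding for simplicity; the actual tool works on Solexa-scaled scores and also tallies the quartile, whisker, and nucleotide-count columns::

    def column_quality_stats( fastq_path ):
        columns = {}
        for i, line in enumerate( open( fastq_path ) ):
            if i % 4 == 3:                      # the quality line of each 4-line record
                for col, ch in enumerate( line.rstrip( '\n' ) ):
                    columns.setdefault( col, [] ).append( ord( ch ) - 33 )
        for col in sorted( columns ):
            scores = columns[ col ]
            n = len( scores )
            print col + 1, n, min( scores ), max( scores ), sum( scores ) / float( n )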
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/lastz_paired_reads_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/lastz_paired_reads_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,847 @@
+#!/usr/bin/env python
+
+"""
+Runs Lastz paired read alignment process
+Written for Lastz v. 1.02.00.
+
+# Author(s): based on various scripts written by Bob Harris (rsharris@bx.psu.edu),
+# then tweaked to this form by Greg Von Kuster (greg@bx.psu.edu)
+
+This tool takes the following input:
+a. A collection of 454 paired end reads ( a fasta file )
+b. A linker sequence ( a very small fasta file )
+c. A reference genome ( nib, 2bit or fasta )
+
+and uses the following process:
+1. Split reads into mates: the input to this step is the read file XXX.fasta, and the output is three
+   files; XXX.short.fasta, XXX.long.fasta and XXX.mapping.  The mapping file records the information necessary
+   to convert mate coordinates back into the original read, which is needed later in the process.
+
+2. Align short mates to the reference: this runs lastz against every chromosome.  The input is XXX.short.fasta
+   and the reference genome, and the output is a SAM file, XXX.short.sam.
+
+3. Align long mates to the reference: this runs lastz against every chromosome.  The input is XXX.long.fasta
+   and the reference genome, and the output is a SAM file, XXX.long.sam.
+
+4. Combine, and convert mate coordinates back to read coordinates.  The input is XXX.mapping, XXX.short.sam and
+   XXX.long.sam, and the output is XXX.sam.
+
+usage: lastz_paired_reads_wrapper.py [options]
+    --ref_name: The reference name to change all output matches to
+    --ref_source: The reference is cached or from the history
+    --source_select: Use pre-set or cached reference file
+    --input1: The name of the reference file if using history or reference base name if using cached
+    --input2: The reads file to align
+    --input3: The sequencing linker file
+    --input4: The base quality score 454 file
+    --ref_sequences: The number of sequences in the reference file if using one from history
+    --output: The name of the output file
+    --lastz_seqs_file_dir: Directory of local lastz_seqs.loc file
+"""
+import optparse, os, subprocess, shutil, sys, tempfile, time
+from string import maketrans
+
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( 'bx-python' )
+from bx.seq.twobit import *
+from bx.seq.fasta import FastaReader
+from galaxy.util.bunch import Bunch
+from galaxy.util import string_as_bool
+
+# Column indexes for SAM required fields
+SAM_QNAME_COLUMN = 0
+SAM_FLAG_COLUMN  = 1
+SAM_RNAME_COLUMN = 2
+SAM_POS_COLUMN   = 3
+SAM_MAPQ_COLUMN  = 4
+SAM_CIGAR_COLUMN = 5
+SAM_MRNM_COLUMN  = 6
+SAM_MPOS_COLUMN  = 7
+SAM_ISIZE_COLUMN = 8
+SAM_SEQ_COLUMN   = 9
+SAM_QUAL_COLUMN  = 10
+SAM_MIN_COLUMNS  = 11
+# SAM bit-encoded flags
+BAM_FPAIRED      =    1    # the read is paired in sequencing, no matter whether it is mapped in a pair
+BAM_FPROPER_PAIR =    2    # the read is mapped in a proper pair
+BAM_FUNMAP       =    4    # the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
+BAM_FMUNMAP      =    8    # the mate is unmapped
+BAM_FREVERSE     =   16    # the read is mapped to the reverse strand
+BAM_FMREVERSE    =   32    # the mate is mapped to the reverse strand
+BAM_FREAD1       =   64    # this is read1
+BAM_FREAD2       =  128    # this is read2
+BAM_FSECONDARY   =  256    # not primary alignment
+BAM_FQCFAIL      =  512    # QC failure
+BAM_FDUP         = 1024    # optical or PCR duplicate
+
+# Keep track of all created temporary files so they can be deleted
+global tmp_file_names
+tmp_file_names = []
+# The values in the skipped_lines dict are tuples consisting of:
+# - the number of skipped lines for that error
+# If not a sequence error:
+# - the 1st line number on which the error was found
+# - the text of the 1st line on which the error was found
+# If a sequence error:
+# - The number of the sequence in the file
+# - the sequence name on which the error occurred
+# We may need to improve dealing with file position and text as
+# much of it comes from temporary files that are created from
[...]uence in the dataset ( this may not be necessary ).
+            error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes."
+            ref_sequences = int( options.ref_sequences )
+            if ref_sequences < 1:
+                stop_err( error_msg )
+        except:
+            stop_err( error_msg )
+    else:
+        ref_sequences = 0
+    tmp_w12_name = get_tmp_file_name( suffix='vs_linker.W12' )
+    tmp_T1_name = get_tmp_file_name( suffix='vs_linker.T1' )
+    # Run lastz twice ( with different options ) on the linker sequence and paired end reads,
+    # looking for the linker ( each run finds some the other doesn't )
+    command = 'lastz %s %s W=12 --notrans --exact=18 --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
+        ( options.input3, options.input2, tmp_w12_name )
+    run_command( command )
+    command = 'lastz %s %s T=1 --match=1,2 O=1 E=2 X=15 K=10 Y=15 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
+        ( options.input3, options.input2, tmp_T1_name )
+    run_command( command )
+    # Combine the alignment output from the two lastz runs
+    tmp_combined_linker_file_name = get_tmp_file_name( suffix='vs_linker' )
+    command = 'cat %s %s | sort -u > %s' % ( tmp_w12_name, tmp_T1_name, tmp_combined_linker_file_name )
+    run_command( command )
+    # Use the alignment info to split reads into left and right mates
+    tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name = split_paired_reads( options.input2, tmp_combined_linker_file_name )
+    # Align mates to the reference - tmp_align_file_names is a list of file names created by align_mates()
+    tmp_align_file_name_list = align_mates( options.input1, options.ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name )
+    # Combine and convert mate coordinates back to read coordinates
+    paired_mate_unmapper( options.input2, options.input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, options.output )
+    # Delete all temporary files
+    for file_name in tmp_file_names:
+        os.remove( file_name )
+    # Handle any invalid lines in the input data
+    if total_skipped_lines:
+        msgs = dict( bad_interval="Bad interval in line",
+                     inconsistent_read_lengths="Inconsistent read/quality lengths for seq #",
+                     inconsistent_reads="Inconsistent reads for seq #",
+                     inconsistent_sizes="Inconsistent sizes for seq #",
+                     missing_mate="Mapping file does not include mate on line",
+                     missing_quals="Missing quality values for name on line",
+                     missing_seq="Missing sequence for name on line",
+                     multiple_seqs="Multiple names for seq #",
+                     no_header="First quality sequence has no header",
+                     num_fields="Must have 4 fields in line",
+                     reads_paired="SAM flag indicates reads already paired on line",
+                     sam_flag="Bad SAM flag on line",
+                     sam_headers="SAM headers on line",
+                     sam_min_columns="Need 11 columns on line",
+                     two_mate_names="Mate name already seen, line",
+                     wrong_seq_len="Size differs from length of seq #" )
+        print "Skipped %d invalid lines: " % total_skipped_lines
+        msg = ""
+        for k, v in skipped_lines.items():
+            if v[0]:
+                # v[0] is the number of times the error occurred
+                # v[1] is the position of the line or sequence in the file
+                # v[2] is the name of the sequence or the text of the line
+                msg += "(%d)%s %d:%s. " % ( v[0], msgs[k], v[1], v[2] )
+        print msg
+
+if __name__=="__main__": __main__()
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/lastz_paired_reads_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/lastz_paired_reads_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,278 @@
+<tool id="lastz_paired_reads_wrapper" name="Lastz paired reads" version="1.1.1">
+    <description> map short paired reads against reference sequence</description>
+    <command interpreter="python">lastz_paired_reads_wrapper.py
+      #if $seq_name.how_to_name=="yes":
+        --ref_name=$seq_name.ref_name 
+      #end if
+      --ref_source=$source.ref_source
+      --input2=$input2
+      --input3=$input3
+      --input4=$input4
+      #if $source.ref_source=="history":
+        --input1=$source.input1
+        --ref_sequences=$input1.metadata.sequences 
+      #else:
+        --input1="${ filter( lambda x: str( x[0] ) == str( $source.input1_2bit ), $__app__.tool_data_tables[ 'lastz_seqs' ].get_fields() )[0][-1] }"
+      #end if
+      --output=$output1
+      --lastz_seqs_file_dir=${GALAXY_DATA_INDEX_DIR}
+    </command>
+    <inputs>
+        <param name="input2" format="fasta" type="data" label="Align sequencing reads in" />
+        <conditional name="source">
+            <param name="ref_source" type="select" label="Against reference sequences that are">
+                <option value="cached">locally cached</option>
+                <option value="history">in your history</option>
+            </param>
+            <when value="cached">
+                <param name="input1_2bit" type="select" label="Using reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
+                    <options from_data_table="lastz_seqs" />
+                </param>
+            </when>
+            <when value="history">
+                <param name="input1" type="data" format="fasta" label="Select a reference dataset" />
+            </when>
+        </conditional>
+        <param name="input3" format="fasta" type="data" label="Linker file" />
+        <param name="input4" format="qual454" type="data" label="Select a base quality score 454 dataset" />
+        <conditional name="seq_name">
+            <param name="how_to_name" type="select" label="Do you want to modify the reference name?">
+                <option value="no">No</option>
+                <option value="yes">Yes</option>
+            </param>
+            <when value="yes">
+                <param name="ref_name" type="text" size="25" value="Type sequence name here" label="Enter name for the Reference sequence"/>
+            </when>
+            <when value="no" />
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format="sam" name="output1" label="${tool.name} on ${on_string}: mapped reads" />
+    </outputs>
+    <requirements>
+        <requirement type="package">lastz</requirement>
+    </requirements>
+    <tests>
+        <test>
+            <!--
+                input1: a reference genome ( 2bit or fasta )
+                input2: a collection of 454 paired end reads ( a fasta file )
+                input3: a linker sequence ( a very small fasta file )
+                input4: a base quality score 454 file ( qual454 )
+            -->
+            <param name="input2" value="lastz_paired_input2.fasta" ftype="fasta" />
+            <param name="ref_source" value="cached" />
+            <param name="input1_2bit" value="/galaxy/data/hg18/seq/chr21.2bit" />
+            <param name="input3" value="lastz_paired_input3.fasta" ftype="fasta" />
+            <param name="input4" value="lastz_paired_input4.qual454" ftype="qual454" />
+            <param name="how_to_name" value="no" />
+            <output name="output1" file="lastz_paired_out1.sam" />
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
+**LASTZ** is a high performance pairwise sequence aligner derived from BLASTZ. It is written by Bob Harris in Webb Miller's laboratory at Penn State University. Special scoring sets were derived to improve runtime performance and quality. This Galaxy version of LASTZ is geared towards aligning short (Illumina/Solexa, AB/SOLiD) and medium (Roche/454) paired read
[...] of words 'in' a quantum ball
+  --[no]entropy           involve entropy in filtering high scoring pairs
+                          (default is "entropy")
+  --[no]mirror            report/use mirror image of all gap-free alignments
+                          (default is "mirror" for self-alignments only)
+  --traceback=&lt;bytes&gt;     space for trace-back information
+                          (default is 80.0M)
+  --masking=&lt;count&gt;       mask any position in target hit this many times
+                          zero indicates no masking
+                          (default is no masking)
+  --targetcapsule=&lt;capsule_file&gt;   the target seed word position table and seed
+                          (as well as the target sequence) are read from specified file
+  --segments=&lt;segment_file&gt;   read segments from a file, instead of discovering
+                          them via seeding. Replaces other seeding or gap-free extension
+                          options
+  --[no]census[=&lt;file&gt;]     count/report how many times each target base aligns
+                          (default is to not report census)
+  --identity=&lt;min&gt;[..&lt;max&gt;]   filter alignments by percent identity
+                          0&lt;=min&lt;=max&lt;=100;  blocks (or HSPs) outside min..max
+                          are discarded
+                          (default is no identity filtering)
+  --coverage=&lt;min&gt;[..&lt;max&gt;]   filter alignments by percentage of query covered
+                          0&lt;=min&lt;=max&lt;=100;  blocks (or HSPs) outside min..max
+                          are discarded
+                          (default is no query coverage filtering)
+  --notrivial             do not output trivial self-alignment block if the target and query 
+                          sequences are identical. Using --self enables this option automatically
+  --output=&lt;output_file&gt;  write the alignments to the specified file name instead of stdout
+  --code=&lt;file&gt;           give quantum code for query sequence (only for display)
+  --format=&lt;type&gt;         specify output format; one of lav, axt, maf, maf+, maf-, text,
+                          lav+text, cigar, rdplot, general, or general:&lt;fields&gt;
+                          (by default output is LAV)
+  --rdotplot=&lt;file&gt;       create an additional output file suitable for plotting the alignments 
+                          with the R statistical package.
+  --markend               Just before normal completion, write "# lastz end-of-file" to output file
+  --census[=&lt;output_file&gt;]    count and report how many times each target base aligns, up 
+                          to 255. Ns are included in the count
+  --census16[=&lt;output_file&gt;]  count and report how many times each target base aligns, up
+                          to 65 thousand
+  --census32[=&lt;output_file&gt;]  count and report how many times each target base aligns, up
+                          to 4 billion
+  --writecapsule=&lt;capsule_file&gt;    just write out a target capsule file and quit; don't 
+                          search for seeds or perform subsequent stages
+  --verbosity=&lt;level&gt;     set info level (0 is minimum, 10 is everything)
+                          (default is 0)
+  --[no]runtime           report runtime in the output file
+                          (default is to not report runtime)
+  --tableonly[=count]     just produce the target position table, don't
+                          search for seeds
+  --[no]stats[=&lt;file&gt;]    show search statistics (or don't)
+                          (not available in this build)
+  --version               report the program version and quit
+  --help                  list all options
+  --help=files            list information about file specifiers
+  --help=short[cuts]      list blastz-compatible shortcuts
+  --help=yasra            list yasra-specific shortcuts
+
+    </help>
+</tool>
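The `--input1` lookup in the command template above is the standard Galaxy idiom for resolving a cached genome: scan the rows of the `lastz_seqs` tool data table for the one whose value column matches the selected option, and take the last column as the sequence path. A rough Python equivalent with a made-up row for illustration (real rows come from the lastz_seqs.loc file)::

    # Hypothetical data table row layout: ( value, name, path )
    fields = [ ( 'hg18chr21', 'Human chr21', '/galaxy/data/hg18/seq/chr21.2bit' ) ]
    input1_2bit = 'hg18chr21'
    path = filter( lambda x: str( x[0] ) == str( input1_2bit ), fields )[0][-1]
    print path   # /galaxy/data/hg18/seq/chr21.2bit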
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/lastz_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/lastz_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,290 @@
+#!/usr/bin/env python
+
+"""
+Runs Lastz
+Written for Lastz v. 1.01.88.
+
+usage: lastz_wrapper.py [options]
+    --ref_name: The reference name to change all output matches to
+    --ref_source: Whether the reference is cached or from the history
+    --source_select: Whether to use pre-set or cached reference file
+    --input1: The name of the reference file if using history or reference base name if using cached
+    --input2: The reads file to align 
+    --ref_sequences: The number of sequences in the reference file if using one from history 
+    --pre_set_options: Which of the pre set options to use, if using pre-sets
+    --strand: Which strand of the read to search, if specifying all parameters
+    --seed: Seeding settings, if specifying all parameters
+    --gfextend: Whether to perform gap-free extension of seed hits to HSPs (high scoring segment pairs), if specifying all parameters
+    --chain: Whether to perform chaining of HSPs, if specifying all parameters
+    --transition: Number of transitions to allow in each seed hit, if specifying all parameters
+    --O: Gap opening penalty, if specifying all parameters
+    --E: Gap extension penalty, if specifying all parameters
+    --X: X-drop threshold, if specifying all parameters
+    --Y: Y-drop threshold, if specifying all parameters
+    --K: Threshold for HSPs, if specifying all parameters
+    --L: Threshold for gapped alignments, if specifying all parameters
+    --entropy: Whether to involve entropy when filtering HSPs, if specifying all parameters
+    --identity_min: Minimum identity (don't report matches under this identity)
+    --identity_max: Maximum identity (don't report matches above this identity)
+    --coverage: The minimum coverage value (don't report matches covering less than this) 
+    --unmask: Whether to convert lowercase bases to uppercase
+    --out_format: The format of the output file (sam, diffs, or tabular (general))
+    --output: The name of the output file
+    --lastzSeqsFileDir: Directory of local lastz_seqs.loc file
+"""
+import optparse, os, subprocess, shutil, sys, tempfile, threading, time
+from Queue import Queue
+
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( 'bx-python' )
+from bx.seq.twobit import *
+from bx.seq.fasta import FastaReader
+from galaxy.util.bunch import Bunch
+
+STOP_SIGNAL = object()
+WORKERS = 4
+SLOTS = 128
+
+def stop_err( msg ):
+    sys.stderr.write( "%s" % msg )
+    sys.exit()
+
+def stop_queues( lastz, combine_data ):
+    # This method should only be called if an error has been encountered.
+    # Send STOP_SIGNAL to all worker threads
+    for t in lastz.threads:
+        lastz.put( STOP_SIGNAL, True )
+    combine_data.put( STOP_SIGNAL, True )
+
+class BaseQueue( object ):
+    def __init__( self, num_threads, slots=-1 ):
+        # Initialize the queue and worker threads
+        self.queue = Queue( slots )
+        self.threads = []
+        for i in range( num_threads ):
+            worker = threading.Thread( target=self.run_next )
+            worker.start()
+            self.threads.append( worker )
+    def run_next( self ):
+        # Run the next job, waiting until one is available if necessary
+        while True:
+            job = self.queue.get()
+            if job is STOP_SIGNAL:
+                return self.shutdown()
+            self.run_job( job )
+            time.sleep( 1 )
+    def run_job( self, job ):
+        stop_err( 'Not Implemented' )
+    def put( self, job, block=False ):
+        # Add a job to the queue
+        self.queue.put( job, block )
+    def shutdown( self ):
+        return
+
+class LastzJobQueue( BaseQueue ):
+    """
+    A queue that runs commands in parallel.  Blocking is done so the queue will
+    not consume much memory.
+    """
+    def run_job( self, job ):
+        # Execute the job's command
+        proc = subprocess.Popen( args=job.command, shell=True, stderr=subprocess.PIPE, )
+        proc.
[...]       stop_err( error_msg )
+        except:
+            stop_queues( lastz_job_queue, combine_data_queue )
+            stop_err( error_msg )
+        seqs = 0
+        fasta_reader = FastaReader( open( options.input1 ) )
+        while True:
+            # Read the next sequence from the reference dataset
+            seq = fasta_reader.next()
+            if not seq:
+                break
+            seqs += 1
+            # Create a temporary file to contain the current sequence as input to lastz
+            tmp_in_fd, tmp_in_name = tempfile.mkstemp( suffix='.in' )
+            tmp_in = os.fdopen( tmp_in_fd, 'wb' )
+            # Write the current sequence to the temporary input file
+            tmp_in.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
+            tmp_in.close()
+            # Create a 2nd temporary file to contain the output from lastz execution on the current sequence
+            tmp_out_fd, tmp_out_name = tempfile.mkstemp( suffix='.out' )
+            os.close( tmp_out_fd )
+            # Generate the command line for calling lastz on the current sequence
+            command = 'lastz %s%s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s > %s' % \
+                ( tmp_in_name, unmask, ref_name, input2, set_options, options.identity_min, 
+                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
+            # Create a job object
+            job = Bunch()
+            job.command = command
+            job.output = tmp_out_name
+            job.cleanup = [ tmp_in_name, tmp_out_name ]
+            job.combine_data_queue = combine_data_queue
+            # Add another job to the lastz_job_queue. Execution 
+            # will wait at this point if the queue is full.
+            lastz_job_queue.put( job, block=True )
+        # Make sure the value of sequences in the metadata is the same as the
+        # number of sequences read from the dataset ( this may not be necessary ).
+        if ref_sequences != seqs:
+            stop_queues( lastz_job_queue, combine_data_queue )
+            stop_err( "The value of metadata.sequences (%d) differs from the number of sequences read from the reference (%d)." % ( ref_sequences, seqs ) )
+    else:
+        # Reference is a locally cached 2bit file, split job across number of chroms in 2bit file
+        tbf = TwoBitFile( open( options.input1, 'r' ) )
+        for chrom in tbf.keys():
+            # Create a temporary file to contain the output from lastz execution on the current chrom
+            tmp_out_fd, tmp_out_name = tempfile.mkstemp( suffix='.out' )
+            os.close( tmp_out_fd )
+            command = 'lastz %s/%s%s%s %s %s --ambiguousn --nolaj --identity=%s..%s --coverage=%s --format=%s%s >> %s' % \
+                ( options.input1, chrom, unmask, ref_name, input2, set_options, options.identity_min, 
+                  options.identity_max, options.coverage, format, tabular_fields, tmp_out_name )
+            # Create a job object
+            job = Bunch()
+            job.command = command
+            job.output = tmp_out_name
+            job.cleanup = [ tmp_out_name ]
+            job.combine_data_queue = combine_data_queue
+            # Add another job to the lastz_job_queue. Execution 
+            # will wait at this point if the queue is full.
+            lastz_job_queue.put( job, block=True )
+
+    # Stop the lastz_job_queue
+    for t in lastz_job_queue.threads:
+        lastz_job_queue.put( STOP_SIGNAL, True )
+    # Although all jobs are submitted to the queue, we can't shut down the combine_data_queue
+    # until we know that all jobs have been submitted to its queue.  We do this by checking
+    # whether all of the threads in the lastz_job_queue have terminated.
+    while threading.activeCount() > 2:
+        time.sleep( 1 )
+    # Now it's safe to stop the combine_data_queue
+    combine_data_queue.put( STOP_SIGNAL )
+
+if __name__=="__main__": __main__()
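The queue machinery above is a plain producer/consumer pool: the main thread produces Bunch jobs, WORKERS threads consume them, and one STOP_SIGNAL sentinel per worker ends each loop. A stripped-down, standalone sketch of the same pattern (names mirror the wrapper; jobs here are simple callables)::

    import sys, threading
    from Queue import Queue

    STOP_SIGNAL = object()

    def run_next( queue ):
        # Consume jobs until this worker's STOP_SIGNAL arrives
        while True:
            job = queue.get()
            if job is STOP_SIGNAL:
                return
            job()

    queue = Queue( 128 )   # bounded, so put( job, True ) blocks when full
    threads = [ threading.Thread( target=run_next, args=( queue, ) ) for i in range( 4 ) ]
    for t in threads:
        t.start()
    for i in range( 10 ):
        queue.put( lambda i=i: sys.stdout.write( 'job %d\n' % i ), True )
    for t in threads:
        queue.put( STOP_SIGNAL, True )   # one sentinel per worker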
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/lastz_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/lastz_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,524 @@
+<tool id="lastz_wrapper_2" name="Lastz" version="1.2.2">
+    <description> map short reads against reference sequence</description>
+    <command interpreter="python">lastz_wrapper.py
+      #if $seq_name.how_to_name=="yes":
+        --ref_name=$seq_name.ref_name 
+      #end if
+      --ref_source=$source.ref_source
+      --source_select=$params.source_select
+      --out_format=$out_format
+      --input2=$input2 
+      #if $source.ref_source=="history":
+        --input1=$source.input1
+        --ref_sequences=$input1.metadata.sequences 
+      #else:
+        --input1="${ filter( lambda x: str( x[0] ) == str( $source.input1_2bit ), $__app__.tool_data_tables[ 'lastz_seqs' ].get_fields() )[0][-1] }"
+        --ref_sequences="None" 
+      #end if
+      #if $params.source_select=="pre_set":
+        --pre_set_options=${params.pre_set_options}
+      #else:
+        --strand=$params.strand
+        --seed=$params.seed
+        --gfextend=$params.gfextend
+        --chain=$params.chain
+        --transition="$params.transition"
+        --O=$params.O
+        --E=$params.E
+        --X=$params.X
+        --Y=$params.Y
+        --K=$params.K
+        --L=$params.L
+        --entropy=$params.entropy 
+      #end if
+      --identity_min=$min_ident
+      --identity_max=$max_ident
+      --coverage=$min_cvrg
+      --output=$output1
+      --unmask=$unmask
+      --lastzSeqsFileDir=${GALAXY_DATA_INDEX_DIR}
+    </command>
+    <inputs>
+        <param name="input2" format="fasta" type="data" label="Align sequencing reads in" />
+        <conditional name="source">
+            <param name="ref_source" type="select" label="Against reference sequences that are">
+                <option value="cached">locally cached</option>
+                <option value="history">in your history</option>
+            </param>
+            <when value="cached">
+                <param name="input1_2bit" type="select" label="Using reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
+                    <options from_data_table="lastz_seqs" />
+                </param>
+            </when>
+            <when value="history">
+                <param name="input1" type="data" format="fasta" label="Select a reference dataset" />
+            </when>
+        </conditional>
+        <param name="out_format" type="select" label="Output format">
+            <option value="sam">SAM</option>
+            <option value="diffs">Polymorphisms</option>
+            <option value="tabular">Tabular</option>
+        </param>
+        <conditional name="params">
+            <param name="source_select" type="select" label="Lastz settings to use" help="For most mapping needs use Commonly used settings. If you want full control use Full List">
+                <option value="pre_set">Commonly used</option>
+                <option value="full">Full Parameter List</option>
+            </param>
+            <when value="pre_set">
+                <param name="pre_set_options" type="select" label="Select mapping mode">
+                    <option value="yasra98">Roche-454 98% identity</option>
+                    <option value="yasra95">Roche-454 95% identity</option>
+                    <option value="yasra90">Roche-454 90% identity</option>
+                    <option value="yasra85">Roche-454 85% identity</option>
+                    <option value="yasra75">Roche-454 75% identity</option>
+                    <option value="yasra95short">Illumina 95% identity</option>
+                    <option value="yasra85short">Illumina 85% identity</option>
+                </param>
+            </when>
+            <when value="full">
+                <param name="strand" type="select" label="Which strand to search?">
+                    <option value="both">Both</option>
+                    <option value="plus">Search forward strand only (the one in the reference)</option>
+                    <option v
[...]d of words 'in' a quantum ball
+  --[no]entropy           involve entropy in filtering high scoring pairs
+                          (default is "entropy")
+  --[no]mirror            report/use mirror image of all gap-free alignments
+                          (default is "mirror" for self-alignments only)
+  --traceback=&lt;bytes&gt;     space for trace-back information
+                          (default is 80.0M)
+  --masking=&lt;count&gt;       mask any position in target hit this many times
+                          zero indicates no masking
+                          (default is no masking)
+  --targetcapsule=&lt;capsule_file&gt;   the target seed word position table and seed
+                          (as well as the target sequence) are read from specified file
+  --segments=&lt;segment_file&gt;   read segments from a file, instead of discovering
+                          them via seeding. Replaces other seeding or gap-free extension
+                          options
+  --[no]census[=&lt;file&gt;]     count/report how many times each target base aligns
+                          (default is to not report census)
+  --identity=&lt;min&gt;[..&lt;max&gt;]   filter alignments by percent identity
+                          0&lt;=min&lt;=max&lt;=100;  blocks (or HSPs) outside min..max
+                          are discarded
+                          (default is no identity filtering)
+  --coverage=&lt;min&gt;[..&lt;max&gt;]   filter alignments by percentage of query covered
+                          0&lt;=min&lt;=max&lt;=100;  blocks (or HSPs) outside min..max
+                          are discarded
+                          (default is no query coverage filtering)
+  --notrivial             do not output trivial self-alignment block if the target and query 
+                          sequences are identical. Using --self enables this option automatically
+  --output=&lt;output_file&gt;  write the alignments to the specified file name instead of stdout
+  --code=&lt;file&gt;           give quantum code for query sequence (only for display)
+  --format=&lt;type&gt;         specify output format; one of lav, axt, maf, maf+, maf-, text,
+                          lav+text, cigar, rdplot, general, or general:&lt;fields&gt;
+                          (by default output is LAV)
+  --rdotplot=&lt;file&gt;       create an additional output file suitable for plotting the alignments 
+                          with the R statistical package.
+  --markend               Just before normal completion, write "# lastz end-of-file" to output file
+  --census[=&lt;output_file&gt;]    count and report how many times each target base aligns, up 
+                          to 255. Ns are included in the count
+  --census16[=&lt;output_file&gt;]  count and report how many times each target base aligns, up
+                          to 65 thousand
+  --census32[=&lt;output_file&gt;]  count and report how many times each target base aligns, up
+                          to 4 billion
+  --writecapsule=&lt;capsule_file&gt;    just write out a target capsule file and quit; don't 
+                          search for seeds or perform subsequent stages
+  --verbosity=&lt;level&gt;     set info level (0 is minimum, 10 is everything)
+                          (default is 0)
+  --[no]runtime           report runtime in the output file
+                          (default is to not report runtime)
+  --tableonly[=count]     just produce the target position table, don't
+                          search for seeds
+  --[no]stats[=&lt;file&gt;]    show search statistics (or don't)
+                          (not available in this build)
+  --version               report the program version and quit
+  --help                  list all options
+  --help=files            list information about file specifiers
+  --help=short[cuts]      list blastz-compatible shortcuts
+  --help=yasra            list yasra-specific shortcuts
+
+    </help>
+</tool>
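The `--identity` and `--coverage` options described above are exactly where the tool's min/max identity and coverage parameters end up: lastz_wrapper.py splices them into every per-chromosome command as range filters. For example (format string adapted from the wrapper; values arbitrary)::

    opts = '--ambiguousn --nolaj --identity=%s..%s --coverage=%s' % ( 95, 100, 90 )
    # -> --ambiguousn --nolaj --identity=95..100 --coverage=90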
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/mosaik.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/mosaik.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,120 @@
+<?xml version="1.0"?>
+<tool id="mosaik_wrapper" name="Map with Mosaik" version="1.1.1">
+  <description/>
+  <requirements><requirement type="package">mosaik</requirement></requirements>
+  <command>
+    #set $processors = '-p 4'
+    #set $lm = ''
+    #if $paired.kind == 'single':
+        #set $mfl = ''
+        #set $ls  = ''
+    #else:
+        #set $mfl = '-mfl ' + str( $paired.mfl )
+        #set $ls  = '-ls ' + str( $paired.ls )
+    #end if
+    MosaikBuild -fr
+    #if $genomeSource.refGenomeSource == 'indexed':
+        ${ filter( lambda x: str( x[0] ) == str( $genomeSource.indexReference ), $__app__.tool_data_tables[ 'mosaik_indexes' ].get_fields() )[0][-1] }
+    #else:
+        $genomeSource.historyReference
+    #end if
+        -oa mosaik_ref_file;
+    MosaikBuild  -q $reads $mfl -st $st -out mosaik_reads_file;
+    MosaikAligner -ia mosaik_ref_file -in mosaik_reads_file -out mosaik_aligned_file $ls -mm $mm -mhp $mhp -act $act -bw $bw $processors $lm -hs 15;
+    MosaikText -in mosaik_aligned_file -$outFormat sam_bam_file;
+    #if str($outFormat) == 'bam':
+        samtools sort sam_bam_file sorted_bam;
+        mv sorted_bam.bam $output
+    #else:
+        gunzip sam_bam_file.gz;
+        mv sam_bam_file $output
+    #end if
+  </command>
+  <inputs>
+    <conditional name="genomeSource">
+      <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?">
+        <option value="indexed">Use a built-in index</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="indexed">
+        <param name="indexReference" type="select" label="Select a reference genome">
+          <options from_data_table="mosaik_indexes">
+            <filter type="sort_by" column="2"/>
+            <validator type="no_options" message="No indexes are available" />
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param format="fasta" name="historyReference" type="data" metadata_name="dbkey" label="Select a reference from history"/>
+      </when>
+    </conditional>
+    <param format="fastq" name="reads" type="data" label="Fastq Reads File"/>
+    <param name="outFormat" type="select" label="Output Format">
+      <option value="sam">Sam</option>
+      <option value="bam">Bam</option>
+    </param>
+    <param name="st" type="select" label="Sequencing Technology Used">
+      <option value="454">454</option>
+      <option value="illumina">Illumina</option>
+      <option value="solid">Solid</option>
+      <option value="sanger">Sanger</option>
+      <option value="helicos">Helicos</option>
+    </param>
+    <conditional name="paired">
+      <param name="kind" type="select" label="Is this library mate-paired?">
+        <option value="single">Single-end</option>
+        <option value="paired">Paired-end</option>
+      </param>
+      <when value="single"/>
+      <when value="paired">
+        <param name="mfl" type="integer" value="200" label="Insert Size" help="the length between the paired reads"/>
+        <param name="ls" type="integer" value="50" label="Realignment Window" help="Window size to realign mate pairs that are out of position. Large values slow down performance"/>
+      </when>
+    </conditional>
+    <param name="mm" size="5" type="integer" value="6" label="Mismatches allowed" help="mismatches allowed per sequence"/>
+    <param name="act" size="5" type="integer" value="35" label="Alignment Candidate Threshold" help="determines which hash regions will be aligned with Smith Waterman"/>
+    <param name="bw" size="5" type="integer" value="19" label="Smith-Waterman band width"/>
+    <param name="mhp" size="5" type="integer" value="100" label="Maximum # Of Positions Stored Per Seed" help="number of places in the reference the aligner will try to place a particular hash"/>
+  </inputs>
+  <outputs>
+    <data format="sam" name="output">
+      <change_format>
+        <when input="outFormat" value="bam" format="bam" />
+      </change_format>
+      <actions>
+        <conditional name="genomeSource.refGenomeSource">
+          <when value="indexed">
+            <action type="metadata" name="dbkey">
+              <option type="from_data_table" name="mosaik_indexes" column="1">
+                <filter type="param_value" column="0" value="#" compare="startswith" keep="False" />
+                <filter type="param_value" ref="genomeSource.indexReference" column="0" />
+              </option>
+            </action>
+          </when>
+          <when value="history">
+            <action type="metadata" name="dbkey">
+              <option type="from_param" name="genomeSource.historyReference" param_attribute="dbkey" />
+            </action>
+          </when>
+        </conditional>
+      </actions>
+   </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="refGenomeSource" value="history"/>
+      <param name="historyReference" ftype="fasta" value="mosaik_test_ref.fasta"/>
+      <param name="reads" ftype="fastq" value="mosaik_test_input.fastq"/>
+      <param name="outFormat" value="sam"/>
+      <param name="st" value="454"/>
+      <param name="kind" value="single"/>
+      <param name="mm" value="6"/>
+      <param name="act" value="35"/>
+      <param name="bw" value="19"/>
+      <param name="mhp" value="100"/>
+      <output name="output" file="mosaik_test_out.sam" compare="sim_size" delta="0"/>
+    </test>
+  </tests>
+  <help>
+This tool uses Mosaik to align reads to a reference sequence.
+  </help>
+</tool>
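For reference, the command block above chains five steps: build the reference archive, build the read archive, align, convert to SAM/BAM text, then sort (BAM) or gunzip (SAM). Roughly what it renders for a single-end SAM run, as a sketch (`ref.fasta` and `reads.fastq` are placeholder paths, and the Mosaik binaries must be on the PATH)::

    import subprocess
    commands = [
        'MosaikBuild -fr ref.fasta -oa mosaik_ref_file',
        'MosaikBuild -q reads.fastq -st illumina -out mosaik_reads_file',
        'MosaikAligner -ia mosaik_ref_file -in mosaik_reads_file -out mosaik_aligned_file'
        ' -mm 6 -mhp 100 -act 35 -bw 19 -p 4 -hs 15',
        'MosaikText -in mosaik_aligned_file -sam sam_bam_file',
        'gunzip sam_bam_file.gz',   # MosaikText writes gzipped SAM
    ]
    for command in commands:
        subprocess.check_call( command, shell=True )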
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/srma_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/srma_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,195 @@
+#!/usr/bin/env python
+
+"""
+Runs SRMA on a SAM/BAM file;
+TODO: more documentation
+
+usage: srma_wrapper.py [options]
+
+See below for options
+"""
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def parseRefLoc( refLoc, refUID ):
+    for line in open( refLoc ):
+        if not line.startswith( '#' ):
+            fields = line.strip().split( '\t' )
+            if len( fields ) >= 3:
+                if fields[0] == refUID:
+                    return fields[1]
+    return None
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to index and use' )
+    parser.add_option( '-u', '--refUID', dest='refUID', help='The pre-indexed reference genome unique identifier' )
+    parser.add_option( '-L', '--refLocations', dest='refLocations', help='The filepath to the srma indices location file' )
+    parser.add_option( '-i', '--input', dest='input', help='The SAM/BAM input file' )
+    parser.add_option( '-I', '--inputIndex', dest='inputIndex', help='The SAM/BAM input index file' )
+    parser.add_option( '-o', '--output', dest='output', help='The SAM/BAM output file' )
+    parser.add_option( '-O', '--offset', dest='offset', help='The alignment offset' )
+    parser.add_option( '-Q', '--minMappingQuality', dest='minMappingQuality', help='The minimum mapping quality' )
+    parser.add_option( '-P', '--minAlleleProbability', dest='minAlleleProbability', help='The minimum allele probability conditioned on coverage (for the binomial quantile).' )
+    parser.add_option( '-C', '--minAlleleCoverage', dest='minAlleleCoverage', help='The minimum haploid coverage for the consensus' )
+    parser.add_option( '-R', '--range', dest='range', help='A range to examine' )
+    parser.add_option( '-c', '--correctBases', dest='correctBases', help='Correct bases ' )
+    parser.add_option( '-q', '--useSequenceQualities', dest='useSequenceQualities', help='Use sequence qualities ' )
+    parser.add_option( '-M', '--maxHeapSize', dest='maxHeapSize', help='The maximum number of nodes on the heap before re-alignment is ignored' )
+    parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one from history (indexed or history)' )
+    parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
+    parser.add_option( '-j', '--jarBin', dest='jarBin', default='', help='The path to where jars are stored' )
+    parser.add_option( '-f', '--jarFile', dest='jarFile', help='The file name of the jar file to use')
+    (options, args) = parser.parse_args()
+
+    # make temp directory for srma
+    tmp_dir = tempfile.mkdtemp()
+    buffsize = 1048576
+
+    # set up reference filenames
+    reference_filepath_name = None
+    # need to create SRMA dict and Samtools fai files for custom genome
+    if options.fileSource == 'history':
+        try:
+            reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' )
+            reference_filepath_name = reference_filepath.name
+            reference_filepath.close()
+            fai_filepath_name = '%s.fai' % reference_filepath_name
+            dict_filepath_name = reference_filepath_name.replace( '.fa', '.dict' )
+            os.symlink( options.ref, reference_filepath_name )
+            # create fai file using Samtools
+            index_fai_cmd = 'samtools faidx %s' % reference_filepath_name
+            try:
+                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                tmp_stderr = open( tmp, 'wb' )
+                proc = subprocess.Popen( args=index_fai_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                # get stderr, allowing for case
[...]eption, e:
+            # clean up temp dir
+            if os.path.exists( tmp_dir ):
+                shutil.rmtree( tmp_dir )
+            stop_err( 'Problem handling SRMA index (dict file) for custom genome file: %s\n' % str( e ) )
+    # using built-in dict/index files
+    else:
+        if options.ref:
+            reference_filepath_name = options.ref
+        else:
+            reference_filepath_name = parseRefLoc( options.refLocations, options.refUID )
+    if reference_filepath_name is None:
+        raise ValueError( 'A valid genome reference was not provided.' )
+
+    # set up aligning and generate aligning command options
+    if options.params == 'pre_set':
+        srma_cmds = ''
+    else:
+        if options.useSequenceQualities == 'true':
+            useSequenceQualities = 'true'
+        else:
+            useSequenceQualities = 'false'
+        ranges = 'null'
+        if options.range == 'None':
+            range = 'null'
+        else:
+            range = options.range
+        srma_cmds = "OFFSET=%s MIN_MAPQ=%s MINIMUM_ALLELE_PROBABILITY=%s MINIMUM_ALLELE_COVERAGE=%s RANGES=%s RANGE=%s CORRECT_BASES=%s USE_SEQUENCE_QUALITIES=%s MAX_HEAP_SIZE=%s" % ( options.offset, options.minMappingQuality, options.minAlleleProbability, options.minAlleleCoverage, ranges, range, options.correctBases, options.useSequenceQualities, options.maxHeapSize )
+
+    # perform alignments
+    buffsize = 1048576
+    try:
+        #symlink input bam and index files due to the naming conventions required by srma here
+        input_bam_filename = os.path.join( tmp_dir, '%s.bam' % os.path.split( options.input )[-1] )
+        os.symlink( options.input, input_bam_filename )
+        input_bai_filename = "%s.bai" % os.path.splitext( input_bam_filename )[0]
+        os.symlink( options.inputIndex, input_bai_filename )
+
+        #create a temp output name, ending in .bam due to required naming conventions? unknown if required
+        output_bam_filename = os.path.join( tmp_dir, "%s.bam" % os.path.split( options.output )[-1] )
+        # generate commandline
+        cmd = 'java -jar %s I=%s O=%s R=%s %s' % ( os.path.join( options.jarBin, options.jarFile ), input_bam_filename, output_bam_filename, reference_filepath_name, srma_cmds )
+
+        # need to nest try-except in try-finally to handle 2.4
+        try:
+            try:
+                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                tmp_stderr = open( tmp, 'wb' )
+                proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                # get stderr, allowing for case where it's very large
+                tmp_stderr = open( tmp, 'rb' )
+                stderr = ''
+                try:
+                    while True:
+                        stderr += tmp_stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
+                tmp_stderr.close()
+                if returncode != 0:
+                    raise Exception, stderr
+            except Exception, e:
+                raise Exception, 'Error executing SRMA. ' + str( e )
+            # move file from temp location (with .bam name) to provided path
+            shutil.move( output_bam_filename, options.output )
+            # check that there are results in the output file
+            if os.path.getsize( options.output ) <= 0:
+                raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
+        except Exception, e:
+            stop_err( 'The re-alignment failed.\n' + str( e ) )
+    finally:
+        # clean up temp dir
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+
+if __name__=="__main__": __main__()
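Two conventions in this wrapper are worth noting: parseRefLoc() does the usual tab-separated .loc lookup (first column the ID, second the path, comment lines skipped), and SRMA expects the BAM index to sit next to the BAM as `<name>.bai`, which is why both Galaxy datasets are symlinked into the temp directory under matching names. A sketch of the symlink step with placeholder dataset paths::

    import os, tempfile
    tmp_dir = tempfile.mkdtemp()
    # placeholder Galaxy dataset paths
    input, input_index = '/path/to/dataset_1.dat', '/path/to/dataset_1_index.dat'
    input_bam_filename = os.path.join( tmp_dir, '%s.bam' % os.path.split( input )[-1] )
    os.symlink( input, input_bam_filename )
    # the index must be named like the BAM, with .bai in place of .bam
    os.symlink( input_index, '%s.bai' % os.path.splitext( input_bam_filename )[0] )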
diff -r 000000000000 -r 9071e359b9a3 tools/sr_mapping/srma_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/sr_mapping/srma_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,218 @@
+<tool id="srma_wrapper" name="Re-align with SRMA" version="0.2.5">
+  <description></description>
+  <command interpreter="python">srma_wrapper.py 
+    #if $refGenomeSource.refGenomeSource_type == "history":
+      --ref=$refGenomeSource.ownFile
+    #else:
+      --ref="${ filter( lambda x: str( x[0] ) == str( $refGenomeSource.ref ), $__app__.tool_data_tables[ 'srma_indexes' ].get_fields() )[0][-1] }"
+      --refUID=$refGenomeSource.ref
+      --refLocations=${GALAXY_DATA_INDEX_DIR}/srma_index.loc
+    #end if
+    --input=$input
+    --inputIndex=${input.metadata.bam_index}
+    --output=$output
+    --params=$params.source_select
+    --fileSource=$refGenomeSource.refGenomeSource_type
+    --jarBin="${GALAXY_DATA_INDEX_DIR}/shared/jars"
+    #if $params.source_select == "full":
+      --offset=$params.offset
+      --minMappingQuality=$params.minMappingQuality
+      --minAlleleProbability=$params.minAlleleProbability
+      --minAlleleCoverage=$params.minAlleleCoverage
+      --range=$params.range
+      --correctBases=$params.correctBases
+      --useSequenceQualities=$params.useSequenceQualities
+      --maxHeapSize=$params.maxHeapSize
+    #end if
+    --jarFile="srma.jar"
+  </command>
+  <inputs>
+    <conditional name="refGenomeSource">
+      <param name="refGenomeSource_type" type="select" label="Will you select a reference genome from your history or use a built-in reference?">
+        <option value="built-in">Use a built-in reference</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="built-in">
+        <param name="ref" type="select" label="Select a reference genome">
+          <options from_data_table="srma_indexes">
+            <filter type="sort_by" column="2" />
+            <validator type="no_options" message="No indexes are available" />
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
+      </when>
+    </conditional>
+    <param name="input" type="data" format="bam" label="Input BAM file" help="The input BAM file to re-align"/>
+    <conditional name="params">
+      <param name="source_select" type="select" label="SRMA settings to use" help="For most re-alignment needs, use Commonly Used settings. If you want full control use Full Parameter List">
+        <option value="pre_set">Commonly Used</option>
+        <option value="full">Full Parameter List</option>
+      </param>
+      <when value="pre_set" />
+      <when value="full">
+        <param name="offset" type="integer" value="20" label="Offset" help="The alignment offset" />
+        <param name="minMappingQuality" type="integer" value="0" label="Minimum mapping quality" help="The minimum mapping quality" />
+        <param name="minAlleleProbability" type="float" value="0.1" label="Minimum allele probability" help="The minimum allele probability conditioned on coverage (for the binomial quantile)." />
+        <param name="minAlleleCoverage" type="integer" value="2" label="Minimum allele coverage" help="The minimum haploid coverage for the consensus. Default value: 3. This option can be set to 'null' to clear the default value." />
+        <param name="range" type="text" value="null" label="Range" help="A range to examine" />
+        <param name="correctBases" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Correct bases" help="Correct bases " />
+        <param name="useSequenceQualities" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Use sequence qualities" help="Use sequence qualities " />
+        <param name="maxHeapSize" type="integer" value="8192" label="Maximum heap size" help="The maximum number of nodes on the heap before re-alignment is ignored" />
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="bam" name="output" label="${tool.name} on ${on_strin
[...] name="maxHeapSize" value="8192" />
+          <output name="output" file="srma_out2.bam" ftype="bam" lines_diff="2" /><!-- allows tag with version number to be different -->
+      </test>
+  </tests>
+  <help>
+**What it does**
+
+SRMA is a short read micro re-aligner for next-generation high throughput sequencing data.
+
+Sequence alignment algorithms examine each read independently. When indels occur towards the ends of reads, the alignment can lead to false SNPs as well as improperly placed indels. This tool aims to perform a re-alignment of each read to a graphical representation of all alignments within a local region to provide a better overall base-resolution consensus.
+
+Currently this tool works well with and has been tested on 30x diploid coverage genome sequencing data from Illumina and ABI SOLiD technology. This tool may not work well with 454 data, as indels are a significant error mode for 454 data. 
+
+------
+
+Please cite the website "http://srma.sourceforge.net" as well as:
+
+Homer N, and Nelson SF.  SRMA: short read micro re-aligner. 2010.
+
+------
+
+**Know what you are doing**
+
+.. class:: warningmark
+
+There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
+
+.. __: http://srma.sourceforge.net/
+
+------
+
+**Input formats**
+
+SRMA accepts a BAM input file. Note that this file should have been generated from a SAM file which contains the header.
+
+------
+
+**Outputs**
+
+The output is in BAM format, see http://samtools.sourceforge.net for more details.
+
+------
+
+**SRMA settings**
+
+All of the options have a default value. You can change any of them. Most of the options in SRMA have been implemented here.
+
+------
+
+**SRMA parameter list**
+
+This is an exhaustive list of SRMA options:
+
+For **SRMA**::
+
+  INPUT=File
+  I=File                        The input SAM or BAM file. Required. 
+  
+  OUTPUT=File
+  O=File                        The output SAM or BAM file. Default value: null. 
+  
+  REFERENCE=File
+  R=File                        The reference FASTA file. Required. 
+  
+  OFFSET=Integer                The alignment offset. Default value: 20. This option can be set to 'null' to clear the 
+                                default value. 
+  
+  MIN_MAPQ=Integer              The minimum mapping quality. Default value: 0. This option can be set to 'null' to clear 
+                                the default value. 
+  
+  MINIMUM_ALLELE_PROBABILITY=Double
+                                The minimum allele probability conditioned on coverage (for the binomial quantile). 
+                                Default value: 0.1. This option can be set to 'null' to clear the default value. 
+  
+  MINIMUM_ALLELE_COVERAGE=Integer
+                                The minimum haploid coverage for the consensus. Default value: 3. This option can be set 
+                                to 'null' to clear the default value. 
+  
+  RANGE=String                  A range to examine. Default value: null. 
+  
+  CORRECT_BASES=Boolean         Correct bases. Default value: false. This option can be set to 'null' to clear the 
+                                default value. Possible values: {true, false} 
+  
+  USE_SEQUENCE_QUALITIES=Boolean
+                                Use sequence qualities. Default value: true. This option can be set to 'null' to clear the 
+                                default value. Possible values: {true, false} 
+  
+  MAX_HEAP_SIZE=Integer         The maximum number of nodes on the heap before re-alignment is ignored. Default value: 
+                                8192. This option can be set to 'null' to clear the default value. 
+
+  </help>
+</tool>
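When "Full Parameter List" is selected, the XML hands each value to srma_wrapper.py, which renders them as the Picard-style KEY=value arguments documented in the help above. Roughly (jar and file paths are placeholders)::

    srma_cmds = 'OFFSET=%s MIN_MAPQ=%s CORRECT_BASES=%s' % ( 20, 0, 'false' )
    cmd = 'java -jar %s I=%s O=%s R=%s %s' % ( '/path/to/srma.jar', 'input.bam', 'output.bam', 'ref.fa', srma_cmds )
    # -> java -jar /path/to/srma.jar I=input.bam O=output.bam R=ref.fa OFFSET=20 MIN_MAPQ=0 CORRECT_BASES=false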
diff -r 000000000000 -r 9071e359b9a3 tools/stats/aggregate_binned_scores_in_intervals.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/aggregate_binned_scores_in_intervals.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,113 @@
+<tool id="aggregate_scores_in_intervals2" description="such as phastCons, GERP, binCons, and others for a set of genomic intervals" name="Aggregate datapoints" version="1.1.3">
+  <description>Appends the average, min, max of datapoints per interval</description>
+  <command interpreter="python">
+    #if $score_source_type.score_source == "user" #aggregate_scores_in_intervals.py $score_source_type.input2 $input1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} $out_file1 --chrom_buffer=3
+    #else                                         #aggregate_scores_in_intervals.py $score_source_type.datasets $input1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} $out_file1 -b
+    #end if#
+  </command>
+  <inputs>
+    <param format="interval" name="input1" type="data" label="Interval file"/>
+    <conditional name="score_source_type">
+      <param name="score_source" type="select" label="Score Source">
+        <option value="cached" selected="true">Locally Cached Scores</option>
+        <option value="user">Scores in Your History</option>
+      </param>
+      <when value="cached">
+        <param name="datasets" type="select" label="Available datasets" display="radio">
+          <options from_file="binned_scores.loc">
+            <column name="name" index="1"/>
+            <column name="value" index="2"/>
+            <column name="dbkey" index="0"/>
+            <filter type="data_meta" ref="input1" key="dbkey" column="0" />
+          </options>
+        </param>
+      </when>
+      <when value="user">
+        <param format="wig" name="input2" type="data" label="Score file">
+          <options>
+            <filter type="data_meta" ref="input1" key="dbkey" />
+          </options>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="interval" name="out_file1" metadata_source="input1"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="6.bed" dbkey="hg17" ftype="bed"/>
+      <param name="score_source" value="cached"/>
+      <param name="datasets" value="/galaxy/data/binned_scores/hg17/phastcons_encode_sep2005_tba" />
+      <output name="out_file1" file="aggregate_binned_scores_in_intervals.out" />
+    </test>
+    <test>
+      <param name="input1" value="9_hg18.bed" dbkey="hg18" ftype="bed"/>
+      <param name="score_source" value="cached"/>
+      <param name="datasets" value="/galaxy/data/binned_scores/hg18/phastCons17way/ba" />
+      <output name="out_file1" file="aggregate_binned_scores_in_intervals2.interval" />
+    </test>
+    <test>
+      <param name="input1" value="6.bed" dbkey="hg17" ftype="bed"/>
+      <param name="score_source" value="user"/>
+      <param name="input2" value="aggregate_binned_scores_3.wig" dbkey="hg17" ftype="wig"/>
+      <output name="out_file1" file="aggregate_binned_scores_in_intervals3.out"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+This tool currently only has cached data for genome builds hg16, hg17 and hg18. However, you may use your own data point (wiggle) data, such as those available from UCSC. If you are trying to use your own data point file and it is not appearing as an option, make sure that the builds for your history items are the same.
+
+.. class:: warningmark
+
+This tool assumes that the input dataset is in interval format and contains at least a chrom column, a start column and an end column.  These 3 columns can be dispersed throughout any number of other data columns. 
+
+-----
+
+.. class:: infomark
+
+**TIP:** Computing summary information may throw exceptions if the data type (e.g., string, integer) in every line of the columns is not appropriate for the computation (e.g., attempting numerical calculations on strings).  If an exception is thrown when computing summary information for a line, that line is skipped as invalid for the computation.  The number of invalid skipped lines is documented in the resulting history item as a "Data issue".
+
+-----
+
+**Syntax**
+
+This tool appends columns of summary information for each interval matched against a selected dataset.  For each interval, the average, minimum and maximum for the data falling within the interval is computed.
+
+- Several quantitative scores are provided for the ENCODE regions.
+
+  - Various Scores
+      - Regulatory Potential
+      - Neutral rate (Ancestral Repeats)
+      - GC fraction
+  - Conservation Scores
+      - PhastCons
+      - binCons
+      - GERP
+
+-----
+
+**Example**
+
+If your original data has the following format:
+
++------+-----+-----+---+------+
+|other1|chrom|start|end|other2|
++------+-----+-----+---+------+
+
+and you choose to aggregate phastCons scores, your output will look like this:
+
++------+-----+-----+---+------+---+---+---+
+|other1|chrom|start|end|other2|avg|min|max|
++------+-----+-----+---+------+---+---+---+
+
+where:
+
+* **avg** - average phastCons score for each region
+* **min** - minimum phastCons score for each region
+* **max** - maximum phastCons score for each region
+
+  </help>
+</tool>
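The appended summary is a straight per-position scan, as implemented in the matching wrapper script (aggregate_scores_in_intervals.py, below): sum, count, minimum and maximum over every base in [start, end). A toy sketch with made-up scores::

    scores = { 'chr1': [ 0.2, 0.5, 0.9, 0.3 ] }   # hypothetical per-base scores
    chrom, start, end = 'chr1', 1, 4
    total, count = 0.0, 0
    min_score, max_score = 100000000, -100000000
    for j in range( start, end ):
        score = scores[chrom][j]
        total += score
        count += 1
        min_score = min( min_score, score )
        max_score = max( max_score, score )
    print total / count, min_score, max_score   # prints 0.566... 0.3 0.9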
diff -r 000000000000 -r 9071e359b9a3 tools/stats/aggregate_scores_in_intervals.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/aggregate_scores_in_intervals.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,243 @@
+#!/usr/bin/env python
+# Greg Von Kuster
+"""
+usage: %prog score_file interval_file chrom start stop [out_file] [options] 
+    -b, --binned: 'score_file' is actually a directory of binned array files
+    -m, --mask=FILE: bed file containing regions not to consider valid
+    -c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user supplied score file
+"""
+
+from __future__ import division
+from galaxy import eggs
+import pkg_resources 
+pkg_resources.require( "bx-python" )
+pkg_resources.require( "lrucache" )
+try:
+    pkg_resources.require( "python-lzo" )
+except:
+    pass
+
+import psyco_full
+import sys
+import os, os.path
+from UserDict import DictMixin
+import bx.wiggle
+from bx.binned_array import BinnedArray, FileBinnedArray
+from bx.bitset import *
+from bx.bitset_builders import *
+from fpconst import isNaN
+from bx.cookbook import doc_optparse
+from galaxy.tools.exception_handling import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+import tempfile, struct
+class PositionalScoresOnDisk:
+    fmt = 'f'
+    fmt_size = struct.calcsize( fmt )
+    default_value = float( 'nan' )
+    
+    def __init__( self ):
+        self.file = tempfile.TemporaryFile( 'w+b' )
+        self.length = 0
+    def __getitem__( self, i ):
+        if i < 0: i = self.length + i
+        if i < 0 or i >= self.length: return self.default_value
+        try:
+            self.file.seek( i * self.fmt_size )
+            return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
+        except Exception, e:
+            raise IndexError, e
+    def __setitem__( self, i, value ):
+        if i < 0: i = self.length + i
+        if i < 0: raise IndexError, 'Negative assignment index out of range'
+        if i >= self.length:
+            self.file.seek( self.length * self.fmt_size )
+            self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
+            self.length = i + 1
+        self.file.seek( i * self.fmt_size )
+        self.file.write( struct.pack( self.fmt, value ) )
+    def __len__( self ):
+        return self.length
+    def __repr__( self ):
+        i = 0
+        repr = "[ "
+        for i in xrange( self.length ):
+            repr = "%s %s," % ( repr, self[i] )
+        return "%s ]" % ( repr )
+
+class FileBinnedArrayDir( DictMixin ):
+    """
+    Adapter that makes a directory of FileBinnedArray files look like
+    a regular dict of BinnedArray objects. 
+    """
+    def __init__( self, dir ):
+        self.dir = dir
+        self.cache = dict()
+    def __getitem__( self, key ):
+        value = None
+        if key in self.cache:
+            value = self.cache[key]
+        else:
+            fname = os.path.join( self.dir, "%s.ba" % key )
+            if os.path.exists( fname ):
+                value = FileBinnedArray( open( fname ) )
+                self.cache[key] = value
+        if value is None:
+            raise KeyError( "File does not exist: " + fname )
+        return value
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+    
+def load_scores_wiggle( fname, chrom_buffer_size = 3 ):
+    """
+    Read a wiggle file and return a dict of BinnedArray objects keyed 
+    by chromosome.
+    """ 
+    scores_by_chrom = dict()
+    try:
+        for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ):
+            if chrom not in scores_by_chrom:
+                if chrom_buffer_size:
+                    scores_by_chrom[chrom] = BinnedArray()
+                    chrom_buffer_size -= 1
+                else:
+                    scores_by_chrom[chrom] = PositionalScoresOnDisk()
+            scores_by_chrom[chrom][pos] = val
+    except UCSCLimitException:
+        # Wiggle data was truncated, at the very least need to warn the user.
+        print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.'
+
[...]name == 'None':
+        stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18.  Click the pencil icon in your history item to set the genome build if appropriate.' )
+    
+    try:
+        chrom_col = int(chrom_col) - 1
+        start_col = int(start_col) - 1
+        stop_col = int(stop_col) - 1
+    except:
+        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
+
+    if chrom_col < 0 or start_col < 0 or stop_col < 0:
+        stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
+        
+    if binned:
+        scores_by_chrom = load_scores_ba_dir( score_fname )
+    else:
+        try:
+            chrom_buffer = int( options.chrom_buffer )
+        except:
+            chrom_buffer = 3
+        scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )
+
+    if mask_fname:
+        masks = binned_bitsets_from_file( open( mask_fname ) )
+    else:
+        masks = None
+
+    skipped_lines = 0
+    first_invalid_line = 0
+    invalid_line = ''
+
+    for i, line in enumerate( open( interval_fname )):
+        valid = True
+        line = line.rstrip('\r\n')
+        if line and not line.startswith( '#' ):
+            fields = line.split()
+            
+            try:
+                chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
+            except:
+                valid = False
+                skipped_lines += 1
+                if not invalid_line:
+                    first_invalid_line = i + 1
+                    invalid_line = line
+            if valid:
+                total = 0
+                count = 0
+                min_score = 100000000
+                max_score = -100000000
+                for j in range( start, stop ):
+                    if chrom in scores_by_chrom:
                   try:\n+                            # Skip if base is masked\n+                            if masks and chrom in masks:\n+                                if masks[chrom][j]:\n+                                    continue\n+                            # Get the score, only count if not \'nan\'\n+                            score = scores_by_chrom[chrom][j]\n+                            if not isNaN( score ):\n+                                total += score\n+                                count += 1\n+                                max_score = max( score, max_score )\n+                                min_score = min( score, min_score )\n+                        except:\n+                            continue\n+                if count > 0:\n+                    avg = total/count\n+                else:\n+                    avg = "nan"\n+                    min_score = "nan"\n+                    max_score = "nan"\n+                \n+                # Build the resulting line of data\n+                out_line = []\n+                for k in range(0, len(fields)):\n+                    out_line.append(fields[k])\n+                out_line.append(avg)\n+                out_line.append(min_score)\n+                out_line.append(max_score)\n+                \n+                print >> out_file, "\\t".join( map( str, out_line ) )\n+            else:\n+                skipped_lines += 1\n+                if not invalid_line:\n+                    first_invalid_line = i + 1\n+                    invalid_line = line\n+        elif line.startswith( \'#\' ):\n+            # We\'ll save the original comments\n+            print >> out_file, line\n+            \n+    out_file.close()\n+\n+    if skipped_lines > 0:\n+        print \'Data issue: skipped %d invalid lines starting at line #%d which is "%s"\' % ( skipped_lines, first_invalid_line, invalid_line )\n+        if skipped_lines == i:\n+            print \'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.\'\n+\n+if __name__ == "__main__": main()\n'
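
A note on the tool above: the core of aggregate_scores_in_intervals.py is the per-interval loop that averages the non-NaN, unmasked per-base scores and tracks their min and max. A minimal standalone sketch of just that aggregation step (hypothetical aggregate() helper; it assumes scores in a plain dict keyed by (chrom, position) instead of bx-python BinnedArray objects, and math.isnan in place of fpconst.isNaN)::

    import math

    def aggregate(scores, chrom, start, stop):
        """Return (avg, min, max) of the non-NaN scores in [start, stop)."""
        vals = []
        for pos in range(start, stop):
            score = scores.get((chrom, pos))
            if score is not None and not math.isnan(score):
                vals.append(score)
        if not vals:
            return ('nan', 'nan', 'nan')
        return (sum(vals) / len(vals), min(vals), max(vals))

    scores = {('chr1', 10): 0.5, ('chr1', 11): 0.7, ('chr1', 12): float('nan')}
    print(aggregate(scores, 'chr1', 10, 13))   # (0.6, 0.5, 0.7)
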
diff -r 000000000000 -r 9071e359b9a3 tools/stats/column_maker.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/column_maker.py Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+# This tool takes a tab-delimited textfile as input and creates another column in the file which is the result of
+# a computation performed on every row in the original file.  The tool will skip over invalid lines within the file,
+# informing the user about the number of lines skipped.  
+import sys, re, os.path
+from galaxy import eggs
+from galaxy.tools import validation
+from galaxy.datatypes import metadata
+from math import log,exp,sqrt,ceil,floor
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+inp_file = sys.argv[1]
+out_file = sys.argv[2]
+expr = sys.argv[3]
+round_result = sys.argv[4]
+try:
+    in_columns = int( sys.argv[5] )
+except:
+    stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
+if in_columns < 2:
+    # To be considered tabular, data must fulfill requirements of the sniff.is_column_based() method.
+    stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
+try:
+    in_column_types = sys.argv[6].split( ',' )
+except:
+    stop_err( "Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
+if len( in_column_types ) != in_columns:
+    stop_err( "The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
+    
+# Unescape if input has been escaped
+mapped_str = {
+    '__lt__': '<',
+    '__le__': '<=',
+    '__eq__': '==',
+    '__ne__': '!=',
+    '__gt__': '>',
+    '__ge__': '>=',
+    '__sq__': '\'',
+    '__dq__': '"',
+}
+for key, value in mapped_str.items():
+    expr = expr.replace( key, value )
+
+# Prepare the column variable names and wrappers for column data types
+cols, type_casts = [], []
+for col in range( 1, in_columns + 1 ):
+    col_name = "c%d" % col
+    cols.append( col_name )
+    col_type = in_column_types[ col - 1 ].strip()
+    if round_result == 'no' and col_type == 'int':
+        col_type = 'float'
+    type_cast = "%s(%s)" % ( col_type, col_name )
+    type_casts.append( type_cast )
+        
+col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
+type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
+assign = "%s = line.split( '\\t' )" % col_str
+wrap = "%s = %s" % ( col_str, type_cast_str )
+skipped_lines = 0
+first_invalid_line = 0
+invalid_line = None
+lines_kept = 0
+total_lines = 0
+out = open( out_file, 'wt' )
+
+# Read input file, skipping invalid lines, and perform computation that will result in a new column
+code = '''
+for i, line in enumerate( file( inp_file ) ):
+    total_lines += 1
+    line = line.rstrip( '\\r\\n' )
+    if not line or line.startswith( '#' ):
+        skipped_lines += 1
+        if not invalid_line:
+            first_invalid_line = i + 1
+            invalid_line = line
+        continue
+    try:
+        %s
+        %s
+        new_val = %s
+        if round_result == "yes":
+            new_val = int( round( new_val ) )
+        new_line = line + '\\t' + str( new_val )
+        print >> out, new_line
+        lines_kept += 1
+    except:
+        skipped_lines += 1
+        if not invalid_line:
+            first_invalid_line = i + 1
+            invalid_line = line
+''' % ( assign, wrap, expr )
+
+valid_expr = True
+try:
+    exec code
+except Exception, e:
+    out.close()
+    if str( e ).startswith( 'invalid syntax' ):
+        valid_expr = False
+        stop_err( 'Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr )
+    else:
+        stop_err( str( e ) )
+
+if valid_expr:
+    out.close()
+    valid_lines = total_lines - skipped_lines
+    print 'Creating column %d with expression %s' % ( in_columns + 1, expr )
+    if valid_lines > 0:
+        print 'kept %4.2f%% of %d valid lines (%d total lines).' % ( 100.0*lines_kept/valid_lines, valid_lines, total_lines )
+    else:
+        print 'Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr
+    if skipped_lines > 0:
+        print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
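
For clarity: column_maker.py works by assembling a small Python program as a string (split each line into c1..cN, cast every column to its metadata type, evaluate the user expression, append the result) and running it with exec. The same per-line pattern, condensed into a sketch that uses eval and explicit cast functions (hypothetical compute_column() helper, not the tool's actual entry point)::

    def compute_column(line, expr, column_types):
        """Evaluate expr (e.g. 'c3-c2') against one tab-separated line."""
        fields = line.rstrip('\r\n').split('\t')
        env = {}
        for i, (value, cast) in enumerate(zip(fields, column_types), 1):
            env['c%d' % i] = cast(value)   # c1, c2, ... bound to typed values
        return eval(expr, {}, env)

    types = [str, int, int, int, int, str]
    row = 'chr1\t151077881\t151077918\t2\t200\t-'
    print(compute_column(row, 'c3-c2', types))   # prints 37
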
diff -r 000000000000 -r 9071e359b9a3 tools/stats/column_maker.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/column_maker.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,83 @@
+<tool id="Add_a_column1" name="Compute" version="1.1.0">
+  <description>an expression on every row</description>
+  <command interpreter="python">
+    column_maker.py $input $out_file1 "$cond" $round ${input.metadata.columns} "${input.metadata.column_types}"
+  </command>
+  <inputs>
+    <param name="cond" size="40" type="text" value="c3-c2" label="Add expression"/>
+    <param format="tabular" name="input" type="data" label="as a new column to" help="Dataset missing? See TIP below"/>
+    <param name="round" type="select" label="Round result?">
+      <option value="no">NO</option>
+      <option value="yes">YES</option>
+    </param>    
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="cond" value="c3-c2"/>
+      <param name="input" value="1.bed"/>
+      <param name="round" value="no"/>
+      <output name="out_file1" file="column_maker_out1.interval"/>
+    </test>
+    <test>
+      <param name="cond" value="c4*1"/>
+      <param name="input" value="1.interval"/>
+      <param name="round" value="no"/>
+      <output name="out_file1" file="column_maker_out2.interval"/>
+    </test>
+    <test>
+      <param name="cond" value="c4*1"/>
+      <param name="input" value="1.interval"/>
+      <param name="round" value="yes"/>
+      <output name="out_file1" file="column_maker_out3.interval"/>
+    </test>
+  </tests>
+  <help>
+
+ .. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**What it does**
+
+This tool computes an expression for every row of a dataset and appends the result as a new column (field).
+
+- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file
+
+- **c3-c2** will add a length column to the dataset if **c2** and **c3** are start and end position
+
+-----
+
+**Example**
+
+If this is your input::
+
+   chr1  151077881  151077918  2  200  -
+   chr1  151081985  151082078  3  500  +
+
+computing "c4*c5" will produce::
+
+   chr1  151077881  151077918  2  200  -   400.0
+   chr1  151081985  151082078  3  500  +  1500.0
+    
+if, at the same time, "Round result?" is set to **YES**, results will look like this::
+
+   chr1  151077881  151077918  2  200  -   400
+   chr1  151081985  151082078  3  500  +  1500
+
+You can also use this tool to evaluate expressions. For example, computing "c3>=c2" for the input above will result in the following::
+
+   chr1  151077881  151077918  2  200  -  True
+   chr1  151081985  151082078  3  500  +  True
+
+or computing "type(c2)==type('') for Input will return::
+
+   chr1  151077881  151077918  2  200  -  False
+   chr1  151081985  151082078  3  500  +  False
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/stats/cor.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/cor.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+#Greg Von Kuster
+"""
+Calculate correlations between numeric columns in a tab delim file.
+usage: %prog infile output.txt columns method
+"""
+
+import sys
+from rpy import *
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+    
+def main():
+    method = sys.argv[4]
+    assert method in ( "pearson", "kendall", "spearman" )
+
+    try:
+        columns = map( int, sys.argv[3].split( ',' ) )
+    except:
+        stop_err( "Problem determining columns, perhaps your query does not contain a column of numerical data." )
+    
+    matrix = []
+    skipped_lines = 0
+    first_invalid_line = 0
+    invalid_value = ''
+    invalid_column = 0
+
+    for i, line in enumerate( file( sys.argv[1] ) ):
+        valid = True
+        line = line.rstrip('\n\r')
+
+        if line and not line.startswith( '#' ): 
+            # Extract values and convert to floats
+            row = []
+            for column in columns:
+                column -= 1
+                fields = line.split( "\t" )
+                if len( fields ) <= column:
+                    valid = False
+                else:
+                    val = fields[column]
+                    if val.lower() == "na": 
+                        row.append( float( "nan" ) )
+                    else:
+                        try:
+                            row.append( float( fields[column] ) )
+                        except:
+                            valid = False
+                            skipped_lines += 1
+                            if not first_invalid_line:
+                                first_invalid_line = i+1
+                                invalid_value = fields[column]
+                                invalid_column = column+1
+        else:
+            valid = False
+            skipped_lines += 1
+            if not first_invalid_line:
+                first_invalid_line = i+1
+
+        if valid:
+            matrix.append( row )
+
+    if skipped_lines < i:
+        try:
+            out = open( sys.argv[2], "w" )
+        except:
+            stop_err( "Unable to open output file" )
+
+        # Run correlation
+        try:
+            value = r.cor( array( matrix ), use="pairwise.complete.obs", method=method )
+        except Exception, exc:
+            out.close()
+            stop_err("%s" %str( exc ))
+        for row in value:
+            print >> out, "\t".join( map( str, row ) )
+        out.close()
+
+    if skipped_lines > 0:
+        msg = "..Skipped %d lines starting with line #%d. " %( skipped_lines, first_invalid_line )
+        if invalid_value and invalid_column > 0:
+            msg += "Value '%s' in column %d is not numeric." % ( invalid_value, invalid_column )
+        print msg
+
+if __name__ == "__main__":
+    main()
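
cor.py hands the actual computation to R through rpy (r.cor with use="pairwise.complete.obs", so each pair of columns drops only the rows where either value is NaN). For the Pearson case the same pairwise-complete behaviour can be sketched with numpy alone (hypothetical pairwise_pearson() helper; not what the tool ships)::

    import numpy as np

    def pairwise_pearson(matrix):
        """Pearson correlation matrix with pairwise removal of NaN rows."""
        data = np.asarray(matrix, dtype=float)
        n_cols = data.shape[1]
        result = np.ones((n_cols, n_cols))
        for a in range(n_cols):
            for b in range(a + 1, n_cols):
                # Keep only rows where both columns are non-NaN
                ok = ~(np.isnan(data[:, a]) | np.isnan(data[:, b]))
                r = np.corrcoef(data[ok, a], data[ok, b])[0, 1]
                result[a, b] = result[b, a] = r
        return result

    m = [[68, 4.1], [71, 4.6], [62, 3.8], [75, 4.4], [58, 3.2]]
    print(pairwise_pearson(m))
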
diff -r 000000000000 -r 9071e359b9a3 tools/stats/cor.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/cor.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,101 @@
+<tool id="cor2" name="Correlation">
+  <description>for numeric columns</description>
+  <command interpreter="python">cor.py $input1 $out_file1 $numeric_columns $method</command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Dataset" help="Dataset missing? See TIP below"/>
+    <param name="numeric_columns" label="Numerical columns" type="data_column" numerical="True" multiple="True" data_ref="input1" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" />
+    <param name="method" type="select" label="Method">
+      <option value="pearson">Pearson</option>
+      <option value="kendall">Kendall rank</option>
+      <option value="spearman">Spearman rank</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="txt" name="out_file1" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+    <!--
+    Test a tabular input with the first line being a comment without a # character to start
+    -->
+    <test>
+      <param name="input1" value="cor.tabular" />
+      <param name="numeric_columns" value="2,3" />
+      <param name="method" value="pearson" />
+      <output name="out_file1" file="cor_out.txt" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+.. class:: warningmark
+
+Missing data ("nan") removed from each pairwise comparison
+
+-----
+
+**Syntax**
+
+This tool computes the matrix of correlation coefficients between numeric columns.
+
+- All invalid, blank and comment lines are skipped when performing computations.  The number of skipped lines is displayed in the resulting history item.
+
+- **Pearson's Correlation** reflects the degree of linear relationship between two variables. It ranges from +1 to -1. A correlation of +1 means that there is a perfect positive linear relationship between variables. The formula for Pearson's correlation is:
+
+    .. image:: ./static/images/pearson.png
+
+    where n is the number of items
+
+- **Kendall's rank correlation** is used to measure the degree of correspondence between two rankings and assessing the significance of this correspondence. The formula for Kendall's rank correlation is:
+
+    .. image:: ./static/images/kendall.png
+
+    where n is the number of items, and P is the sum, over all items, of the number of items ranked after the given item by both rankings (equivalently, the number of concordant pairs).
+
+- **Spearman's rank correlation** assesses how well an arbitrary monotonic function could describe the relationship between two variables, without making any assumptions about the frequency distribution of the variables. The formula for Spearman's rank correlation is
+
+    .. image:: ./static/images/spearman.png
+
+    where D is the difference between the ranks of corresponding values of X and Y, and N is the number of pairs of values.
+
+-----
+
+**Example**
+
+- Input file::
+
+    #Person Height Self Esteem
+    1  68  4.1
+    2  71  4.6
+    3  62  3.8
+    4  75  4.4
+    5  58  3.2
+    6  60  3.1
+    7  67  3.8
+    8  68  4.1
+    9  71  4.3
+    10  69  3.7
+    11  68  3.5
+    12  67  3.2
+    13  63  3.7
+    14  62  3.3
+    15  60  3.4
+    16  63  4.0
+    17  65  4.1
+    18  67  3.8
+    19  63  3.4
+    20  61  3.6
+
+- Computing the correlation coefficients between columns 2 and 3 of the above file (using Pearson's Correlation), the output is::
+
+    1.0 0.730635686279
+    0.730635686279 1.0
+
+  So the correlation for our twenty cases is .73, which is a fairly strong positive relationship.
+  </help>
+</tool>
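
The formula images referenced in the help above (pearson.png, kendall.png, spearman.png) are not reproduced in this changeset. Assuming they show the standard textbook forms, the formulas matching the variable names used in the help are, for two columns x and y:

.. math::

   r = \frac{n\sum xy - \sum x \sum y}{\sqrt{\left(n\sum x^{2} - (\sum x)^{2}\right)\left(n\sum y^{2} - (\sum y)^{2}\right)}}

.. math::

   \tau = \frac{4P}{n(n-1)} - 1

.. math::

   \rho = 1 - \frac{6\sum D^{2}}{N(N^{2} - 1)}
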
diff -r 000000000000 -r 9071e359b9a3 tools/stats/correlation.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/correlation.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,84 @@
+#!/usr/bin/perl
+
+###########################################################################
+# Purpose: To calculate the correlation of two sets of scores in one file.
+# Usage: correlation.pl infile.bed output.txt column1 column2
+#        (column start from 1)
+# Written by: Yi Zhang  (June, 2005)
+###########################################################################
+if (!$ARGV[0] || !$ARGV[1] || !defined($ARGV[2]) || !defined($ARGV[3]) ) {
+   print STDERR "Usage: correlation.pl infile.bed output.txt column1 column2\n";
+   print STDERR "       (column start from 1)\n"; 
+   exit;
+}
+my $file = $ARGV[0];
+my $out = $ARGV[1];
+
+die "<font color=\"yellow\">The input columns contain numerical values: $ARGV[2], $ARGV[3]</font>.\n" if ($ARGV[2] =~ /[a-zA-Z]+/ || $ARGV[3] =~ /[a-zA-Z]+/);
+
+my $col1 = $ARGV[2] - 1;
+my $col2 = $ARGV[3] - 1;
+
+my ($f, $o);
+my (@a, @b);
+
+my $n_t = 0;
+open($f, $file) or die "Could't open $file, $!\n";
+while(<$f>) {
+  chomp;
+  my @t = split(/\t/);
+  if ($n_t == 0) { 
+     $n_t = scalar(@t) - 1; 
+     die "<font color=\"yellow\">The input column number exceeds the size of the file: $col1, $col2, $n_t</font>\n" if ( $col1 > $n_t || $col2 > $n_t );
+  }
+  die "<font color=\"yellow\">The columns you have selected contain non numeric characters:$t[$col1] and $t[$col2] \n</font>" if ($t[$col1] =~ /[a-zA-Z]+/ || $t[$col2] =~ /[a-zA-Z]+/);  
+  push(@a, $t[$col1]);
+  push(@b, $t[$col2]);
+}
+close($f);
+
+my $result = correlation(\@a, \@b);
+
+open($o, ">$out") or die "Couldn't open $out, $!\n";
+$col1 = $col1 + 1;
+$col2 = $col2 + 1;
+print $o "The correlation of column $col1 and $col2 is $result\n";
+close($o);
+print "The correlation of column $col1 and $col2 is $result\n";
+
+sub correlation {
+   my ($array1ref, $array2ref) = @_;
+   my ($sum1, $sum2);
+   my ($sum1_squared, $sum2_squared); 
+   foreach (@$array1ref) { $sum1 += $_;  $sum1_squared += $_**2; }
+   foreach (@$array2ref) { $sum2 += $_;  $sum2_squared += $_**2; }
+   my $numerator = (@$array1ref**2) * covariance($array1ref, $array2ref);
+   my $denominator = sqrt(((@$array1ref * $sum1_squared) - ($sum1**2)) *
+                          ((@$array1ref * $sum2_squared) - ($sum2**2)));
+   my $r;
+   if ($denominator == 0) {
+     print STDERR "The denominator is 0.\n";
+  exit 0; 
+   } else {
+      $r = $numerator / $denominator;
+   }
+    return $r;
+}
+
+sub covariance {
+   my ($array1ref, $array2ref) = @_;
+   my ($i, $result);
+   for ($i = 0; $i < @$array1ref; $i++) {
+       $result += $array1ref->[$i] * $array2ref->[$i];
+   }
+   $result /= @$array1ref;
+   $result -= mean($array1ref) * mean($array2ref);
+}
+
+sub mean {
+  my ($arrayref) = @_;
+  my $result;
+  foreach (@$arrayref) { $result += $_; }
+  return $result/@$arrayref;
+}
+
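
A note on the numerator in the correlation() subroutine above: multiplying the covariance by n squared (the @$array1ref**2 term, where the array in scalar context is its length n) recovers the usual computational form of Pearson's r, since

.. math::

   n^{2}\,\mathrm{cov}(x,y) = n^{2}\left(\frac{\sum xy}{n} - \bar{x}\,\bar{y}\right) = n\sum xy - \sum x \sum y

which is the same numerator as in the Pearson formula used by cor.py above.
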
diff -r 000000000000 -r 9071e359b9a3 tools/stats/correlation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/correlation.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,15 @@
+<tool id="Pearson_and_apos_Correlation1" name="Pearson and apos Correlation">
+  <description>between any two numeric columns</description>
+  <command interpreter="perl">correlation.pl $input $out_file1 $col1 $col2</command>
+  <inputs>
+<!--    <display>on column $col1 and column $col2 of $input</display> -->
+    <param name="col1" size="3" type="text" value="5" label="Correlate data in column"/>
+    <param name="col2" size="3" type="text" value="6" label="with data in column"/>
+    <param format="txt" name="input" type="data" label="in Query"/>
+  </inputs>
+  <outputs>
+    <data format="txt" name="out_file1" />
+  </outputs>
+  <help>Computes Pearson's correlation coefficient between any two numerical columns. Column numbers start at 1.
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/stats/count_gff_features.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/count_gff_features.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# This tool takes a gff file as input and counts the number of features in it.
+
+import sys, fileinput
+from galaxy import eggs
+from galaxy.datatypes.util.gff_util import GFFReaderWrapper
+from bx.intervals.io import GenomicInterval
+
+# Get args.
+input_file = sys.argv[1:]
+
+# Count features.
+count = 0
+for feature in GFFReaderWrapper( fileinput.FileInput( input_file ), fix_strand=True ):
+    if isinstance( feature, GenomicInterval ):
+        count += 1
+
+print count
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/stats/count_gff_features.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/count_gff_features.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,26 @@
+<tool id="count_gff_features" name="Count GFF Features" version="0.1">
+    <description></description>
+    <command interpreter="python">
+        count_gff_features.py $input &gt; $output
+    </command>
+    <inputs>
+        <param format="gff" name="input" type="data" label="GFF Dataset to Filter"/>
+    </inputs>
+    <outputs>
+        <data format="txt" name="output"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="gff2bed_in2.gff"/>
+            <output name="output" file="count_gff_features_out1.txt"/>
+        </test>
+        <test>
+            <param name="input" value="gff_filter_by_feature_count_out1.gff"/>
+            <output name="output" file="count_gff_features_out2.txt"/>
+        </test>
+    </tests>
+    <help>
+        Counts the number of features in a GFF dataset. GFF features are often spread across multiple lines; this tool counts the number of
+        features in the dataset rather than the number of lines.
+    </help>
+</tool>
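
As the help above explains, a single GFF feature can span several lines, which is why the tool iterates GFFReaderWrapper objects instead of counting rows. A rough standalone approximation of that grouping, keying on the ID/Parent attribute (hypothetical count_features() helper; the real GFFReaderWrapper grouping logic may differ)::

    import re

    def count_features(path):
        """Count distinct features by grouping rows on their ID/Parent attribute."""
        ids = set()
        for line in open(path):
            fields = line.rstrip('\n').split('\t')
            if line.startswith('#') or len(fields) < 9:
                continue
            match = re.search(r'(?:ID|Parent)=([^;]+)', fields[8])
            ids.add(match.group(1) if match else fields[8])
        return len(ids)

    print(count_features('my_features.gff'))
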
diff -r 000000000000 -r 9071e359b9a3 tools/stats/dna_filtering.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/dna_filtering.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,216 @@
+#!/usr/bin/env python
+
+"""
+This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
+
+usage: %prog [options]
+    -i, --input=i: tabular input file
+    -o, --output=o: filtered output file
+    -c, --cond=c: conditions to filter on
+    -n, --n_handling=n: how to handle N and X
+    -l, --columns=l: columns
+    -t, --col_types=t: column types
+
+"""
+
+#from __future__ import division
+import os.path, re, string, string, sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+# Older py compatibility
+try:
+    set()
+except:
+    from sets import Set as set
+
+#assert sys.version_info[:2] >= ( 2, 4 )
+
+def get_operands( filter_condition ):
+    # Note that the order of all_operators is important
+    items_to_strip = [ '==', '!=', ' and ', ' or ' ]
+    for item in items_to_strip:
+        if filter_condition.find( item ) >= 0:
+            filter_condition = filter_condition.replace( item, ' ' )
+    operands = set( filter_condition.split( ' ' ) )
+    return operands
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    input = options.input
+    output = options.output
+    cond = options.cond
+    n_handling = options.n_handling
+    columns = options.columns
+    col_types = options.col_types
+
+    try:
+        in_columns = int( columns )
+        assert col_types  #check to see that the column types variable isn't null
+        in_column_types = col_types.split( ',' )
+    except:
+        stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )
+
+    # Unescape if input has been escaped
+    cond_text = cond.replace( '__eq__', '==' ).replace( '__ne__', '!=' ).replace( '__sq__', "'" )
+    orig_cond_text = cond_text
+    # Expand to allow for DNA codes
+    dot_letters = [ letter for letter in string.uppercase if letter not in \
+                   [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ] ]
+    dot_letters.append( '.' )
+    codes = {'A': [ 'A', 'D', 'H', 'M', 'R', 'V', 'W' ],
+             'C': [ 'C', 'B', 'H', 'M', 'S', 'V', 'Y' ],
+             'G': [ 'G', 'B', 'D', 'K', 'R', 'S', 'V' ],
+             'T': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
+             'U': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
+             'K': [ 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'R', 'S', 'V', 'W', 'Y' ],
+             'M': [ 'A', 'C', 'B', 'D', 'H', 'M', 'R', 'S', 'V', 'W', 'Y' ],
+             'R': [ 'A', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
+             'Y': [ 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'S', 'V', 'W', 'Y' ],
+             'S': [ 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'Y' ],
+             'W': [ 'A', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'V', 'W', 'Y' ],
+             'B': [ 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
+             'V': [ 'A', 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
+             'H': [ 'A', 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
+             'D': [ 'A', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
+             '.': dot_letters,
+             '-': [ '-' ]}
+    # Add handling for N and X
+    if n_handling == "all":
+        codes[ 'N' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
+        codes[ 'X' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
+        for code in codes.keys():
+            if code != '.' and code != '-':
+                codes[code].append( 'N' )
+                codes[code].append( 'X' )
+
[...middle of this diff elided in the source rendering; it resumes mid-line:]
...right.find( '"' ) >= 0:
+                test = right.replace( "'", '' ).replace( '"', '' )
+                assert test in string.uppercase or test.find( '+' ) >= 0 or test.find( '.' ) >= 0 or test.find( '-' ) >= 0\
+                        or test.startswith( 'chr' ) or test.startswith( 'scaffold' ), \
+                        'The value to search for should be a valid base, code, plus sign, chromosome (like "chr1") or scaffold (like "scaffold5"). ' \
+                        'Use the general filter tool to filter on anything else first'
+            else:
+                assert right.startswith( 'c' ), 'The column names should start with c (lowercase)'
+            match_replace[match] = new_match
+        if len( match_replace.keys() ) == 0:
+            raise Exception, 'There do not appear to be any valid conditions'
+        for match in match_replace.keys():
+            cond_text = cond_text.replace( match, match_replace[match] )
+    except Exception, e:
+        stop_err( "At least one of your conditions is invalid. Make sure to use only '!=' or '==', valid column numbers, and valid base values.\n" + str(e) )
+
+    # Attempt to determine if the condition includes executable stuff and, if so, exit
+    secured = dir()
+    operands = get_operands( cond_text )
+    for operand in operands:
+        try:
+            check = int( operand )
+        except:
+            if operand in secured:
+                stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
+
+    # Prepare the column variable names and wrappers for column data types
+    cols, type_casts = [], []
+    for col in range( 1, in_columns + 1 ):
+        col_name = "c%d" % col
+        cols.append( col_name )
+        col_type = in_column_types[ col - 1 ]
+        type_cast = "%s(%s)" % ( col_type, col_name )
+        type_casts.append( type_cast )
+
+    col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
+    type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
+    assign = "%s = line.split( '\\t' )" % col_str
+    wrap = "%s = %s" % ( col_str, type_cast_str )
+    skipped_lines = 0
+    first_invalid_line = 0
+    invalid_line = None
+    lines_kept = 0
+    total_lines = 0
+    out = open( output, 'wt' )
+    # Read and filter input file, skipping invalid lines
+    code = '''
+for i, line in enumerate( file( input ) ):
+    total_lines += 1
+    line = line.rstrip( '\\r\\n' )
+    if not line or line.startswith( '#' ):
+        skipped_lines += 1
+        if not invalid_line:
+            first_invalid_line = i + 1
+            invalid_line = line
+        continue
+    try:
+        %s = line.split( '\\t' )
+        %s = %s
+        if %s:
+            lines_kept += 1
+            print >> out, line
+    except Exception, e:
+        skipped_lines += 1
+        if not invalid_line:
+            first_invalid_line = i + 1
+            invalid_line = line
+''' % ( col_str, col_str, type_cast_str, cond_text )
+
+    valid_filter = True
+    try:
+        exec code
+    except Exception, e:
+        out.close()
+        if str( e ).startswith( 'invalid syntax' ):
+            valid_filter = False
+            stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % orig_cond_text + ' '+str(e))
+        else:
+            stop_err( str( e ) )
+
+    if valid_filter:
+        out.close()
+        valid_lines = total_lines - skipped_lines
+        print 'Filtering with %s, ' % orig_cond_text
+        if valid_lines > 0:
+            print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
+        else:
+            print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % orig_cond_text
+        if skipped_lines > 0:
+            print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+
+if __name__ == "__main__" : __main__()
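
The codes table above carries the whole matching rule: an observed base matches a target base exactly when the observed code appears in the target's expansion list. The elided middle of this diff rewrites conditions such as c4 == 'G' into membership tests over that table; the rewrite itself is not shown, but the rule it implements can be sketched as (hypothetical could_equal() helper, table abbreviated)::

    CODES = {
        'A': ['A', 'D', 'H', 'M', 'R', 'V', 'W'],
        'G': ['G', 'B', 'D', 'K', 'R', 'S', 'V'],
        # ...remaining IUPAC entries as listed in the codes dict above
    }

    def could_equal(observed, target):
        """True if the observed base could represent the target base."""
        return observed in CODES.get(target, [target])

    print(could_equal('R', 'G'))   # True: R means A or G
    print(could_equal('C', 'G'))   # False

For column-against-column comparisons like c10 == c11, the help below describes the test as symmetric, i.e. each column's code must be able to represent the other.
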
diff -r 000000000000 -r 9071e359b9a3 tools/stats/dna_filtering.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/dna_filtering.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,136 @@
+<tool id="dna_filter" name="Filter on ambiguities" version="1.0.0">
+  <description>in polymorphism datasets</description>
+  <command interpreter="python">
+    dna_filtering.py
+      --input=$input 
+      --output=$out_file1 
+      --cond="$cond" 
+      --n_handling=$n_handling
+      --columns=${input.metadata.columns} 
+      --col_types="${input.metadata.column_types}"
+  </command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Filter" help="Dataset missing? See TIP below."/>
+    <param name="cond" size="40" type="text" value="c4 == 'G'" label="With following condition" help="Double equal signs, ==, must be used as shown above. To filter for an arbitrary string, use the Select tool.">
+      <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/>
+    </param>
+    <param name="n_handling" type="select" label="What is the meaning of N" help="Everything matches everything, Unknown matches nothing">
+      <option value="all">Everything (A, T, C, G)</option>
+      <option value="none">Unknown</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" ftype="tabular" value="dna_filter_in1.tabular" />
+      <param name="cond" value="c8=='G'" />
+      <param name="n_handling" value="all" />
+      <output name="out_file1" ftype="tabular" file="dna_filter_out1.tabular" />
+    </test>
+    <test>
+      <param name="input" value="dna_filter_in1.tabular" />
+      <param name="cond" value="(c10 == c11 or c17 == c18) and c6 != 'C' and c23 == 'R'" />
+      <param name="n_handling" value="all" />
+      <output name="out_file1" file="dna_filter_out2.tabular" />
+    </test>
+    <test>
+      <param name="input" value="dna_filter_in1.tabular" />
+      <param name="cond" value="c4=='B' or c9==c10" />
+      <param name="n_handling" value="none" />
+      <output name="out_file1" file="dna_filter_out3.tabular" />
+    </test>
+    <test>
+      <param name="input" value="dna_filter_in1.tabular" />
+      <param name="cond" value="c1!='chr1' and c7!='Y' and c25!='+'" />
+      <param name="n_handling" value="none" />
+      <output name="out_file1" file="dna_filter_out4.tabular" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+.. class:: warningmark
+
+**TIP:** This tool is intended primarily for comparing column values (such as "c5==c12"), although it is also possible to filter on specific values (like "c6!='G'"). Be aware that when searching for specific values, any possible match is considered. So if you search on "c6!='G'", rows will be excluded when c6 is G, K, R, S, B, V, or D (plus N or X if you set that to equal "Everything"), because it is possible those values could indicate G. 
+
+-----
+
+**What it does**
+
+This tool is written for a very specific case related to an analysis of polymorphism data. Suppose you have a table of SNP data that looks like this::
+
+  chromosome start end patient1 patient2 patient3 patient4
+  --------------------------------------------------------
+  chr1       100   101 A        M        C        R 
+  chr1       200   201 T        K        C        C 
+  
+and you want to select all rows where patient1 has the same base as patient2. Unfortunately you cannot do this with the *Filter and Sort -> Filter* tool because it does not understand DNA ambiguity codes (see below). For example, at position 100 patient1 may be the same as patient2 because M is a mix of As and Cs. This tool is designed to make filtering on ambiguities possible.
+
+-----
+
+**Syntax**
+
+The filter tool allows you to restrict the dataset using simple conditional statements:
+
+- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file (e.g., **c4 == c5**)
+- When using 'equal-to' operator **double equal sign '==' must be used** ( e.g., **c1=='chr1'** )
+- Non-numerical values must be included in single or double quotes ( e.g., **c6=='C'** )
+- Filtering condition can include logical operators, but **make sure operators are all lower case** ( e.g., **(c1!='chrX' and c1!='chrY') or c6=='+'** )
+
+------
+
+**Allowed types of filtering**
+
+The following types of filtering are allowed:
+
+- Testing columns for equality (e.g., c2 == c4 or c2 != c4)
+- Testing that a column contains a particular base (e.g., c4 == 'C'). Only bases listed in *DNA Codes* below are allowed.
+- Testing that a column represents a plus or a minus strand (e.g., c3 == '+' or c3 != '-')
+- Testing that a column is a chromosome (c1 == 'chrX') or a scaffold (c1 == 'scaffold87976')
+
+All other types of filtering should be done with *Filter and Sort -> Filter* tool.
+
+-----
+
+**DNA Codes**
+
+The following are the DNA codes used for filtering::
+
+  Code   Meaning
+  ----   --------------------------
+   A     A
+   T     T
+   U     T
+   G     G
+   C     C
+   K     G or T
+   M     A or C
+   R     A or G
+   Y     C or T
+   S     C or G
+   W     A or T
+   B     C, G or T
+   V     A, C or G
+   H     A, C or T
+   D     A, G or T
+   X     A, C, G or T
+   N     A, C, G or T
+   .     not (A, C, G or T)
+   -     gap of indeterminate length
+
+-----
+
+**Example**
+
+- **c8=='A'** selects lines in which the eighth column is A, M, R, W, V, H, or D, or N or X if appropriate
+- **c12==c15** selects lines where the value in the twelfth column could be the same as the fifteenth and the fifteenth column could be the same as the twelfth column (based on appropriate codes)
+- **c9!=c19** selects lines where column nine could not be the same as column nineteen or column nineteen could not be the same as column nine (using appropriate codes)
+- **c4 == 'A' and c4 == c5** selects lines where column 4 and 5 are both A, M, R, W, V, H, D or N, or X if appropriate
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/stats/filtering.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/filtering.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+# This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties.
+# The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
+
+from __future__ import division
+import sys, re, os.path
+from galaxy import eggs
+
+# Older py compatibility
+try:
+    set()
+except:
+    from sets import Set as set
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def get_operands( filter_condition ):
+    # Note that the order of all_operators is important
+    items_to_strip = ['+', '-', '**', '*', '//', '/', '%', '<<', '>>', '&', '|', '^', '~', '<=', '<', '>=', '>', '==', '!=', '<>', ' and ', ' or ', ' not ', ' is ', ' is not ', ' in ', ' not in ']
+    for item in items_to_strip:
+        if filter_condition.find( item ) >= 0:
+            filter_condition = filter_condition.replace( item, ' ' )
+    operands = set( filter_condition.split( ' ' ) )
+    return operands
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+in_fname = sys.argv[1]
+out_fname = sys.argv[2]
+cond_text = sys.argv[3]
+try:
+    in_columns = int( sys.argv[4] )
+    assert sys.argv[5]  #check to see that the column types variable isn't null
+    in_column_types = sys.argv[5].split( ',' )
+except:
+    stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )
+
+# Unescape if input has been escaped
+mapped_str = {
+    '__lt__': '<',
+    '__le__': '<=',
+    '__eq__': '==',
+    '__ne__': '!=',
+    '__gt__': '>',
+    '__ge__': '>=',
+    '__sq__': '\'',
+    '__dq__': '"',
+}
+for key, value in mapped_str.items():
+    cond_text = cond_text.replace( key, value )
+    
+# Attempt to determine if the condition includes executable stuff and, if so, exit
+secured = dir()
+operands = get_operands(cond_text)
+for operand in operands:
+    try:
+        check = int( operand )
+    except:
+        if operand in secured:
+            stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
+
+# Work out which columns are used in the filter (save using 1 based counting)
+used_cols = sorted(set(int(match.group()[1:]) \
+                   for match in re.finditer('c(\d)+', cond_text))) 
+largest_col_index = max(used_cols)
+
+# Prepare the column variable names and wrappers for column data types. Only 
+# cast columns used in the filter.
+cols, type_casts = [], []
+for col in range( 1, largest_col_index + 1 ):
+    col_name = "c%d" % col
+    cols.append( col_name )
+    col_type = in_column_types[ col - 1 ]
+    if col in used_cols:
+        type_cast = "%s(%s)" % ( col_type, col_name )
+    else:
+        #If we don't use this column, don't cast it.
+        #Otherwise we get errors on things like optional integer columns.
+        type_cast = col_name
+    type_casts.append( type_cast )
+
+col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
+type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
+assign = "%s, = line.split( '\\t' )[:%i]" % ( col_str, largest_col_index )
+wrap = "%s = %s" % ( col_str, type_cast_str )
+skipped_lines = 0
+invalid_lines = 0
+first_invalid_line = 0
+invalid_line = None
+lines_kept = 0
+total_lines = 0
+out = open( out_fname, 'wt' )
+    
+# Read and filter input file, skipping invalid lines
+code = '''
+for i, line in enumerate( file( in_fname ) ):
+    total_lines += 1
+    line = line.rstrip( '\\r\\n' )
+    if not line or line.startswith( '#' ):
+        skipped_lines += 1
+        continue
+    try:
+        %s
+        %s
+        if %s:
+            lines_kept += 1
+            print >> out, line
+    except:
+        invalid_lines += 1
+        if not invalid_line:
+            first_invalid_line = i + 1
+            invalid_line = line
+''' % ( assign, wrap, cond_text )
+
+valid_filter = True
+try:
+    exec code
+except Exception, e:
+    out.close()
+    if str( e ).startswith( 'invalid syntax' ):
+        valid_filter = False
+        stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text )
+    else:
+        stop_err( str( e ) )
+
+if valid_filter:
+    out.close()
+    valid_lines = total_lines - skipped_lines
+    print 'Filtering with %s, ' % cond_text
+    if valid_lines > 0:
+        print 'kept %4.2f%% of %d valid lines (%d total lines).' % ( 100.0*lines_kept/valid_lines, valid_lines, total_lines )
+    else:
+        print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
+    if invalid_lines:
+        print 'Skipped %d invalid line(s) starting at line #%d: "%s"' % ( invalid_lines, first_invalid_line, invalid_line )
+    if skipped_lines:
+        print 'Skipped %i comment (starting with #) or blank line(s)' % skipped_lines
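
To make the exec step above concrete: for the condition c1=='chr22' (and assuming the first column's metadata type is str), used_cols is [1], largest_col_index is 1, and the code template expands to roughly::

    for i, line in enumerate( file( in_fname ) ):
        total_lines += 1
        line = line.rstrip( '\r\n' )
        if not line or line.startswith( '#' ):
            skipped_lines += 1
            continue
        try:
            c1, = line.split( '\t' )[:1]
            c1 = str(c1)
            if c1=='chr22':
                lines_kept += 1
                print >> out, line
        except:
            invalid_lines += 1
            if not invalid_line:
                first_invalid_line = i + 1
                invalid_line = line

Columns past the largest one used in the condition are neither split out nor cast, which is why optional trailing columns do not break the filter.
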
diff -r 000000000000 -r 9071e359b9a3 tools/stats/filtering.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/filtering.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,76 @@
+<tool id="Filter1" name="Filter" version="1.1.0">
+  <description>data on any column using simple expressions</description>
+  <command interpreter="python">
+    filtering.py $input $out_file1 "$cond" ${input.metadata.columns} "${input.metadata.column_types}"
+  </command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Filter" help="Dataset missing? See TIP below."/>
+    <param name="cond" size="40" type="text" value="c1=='chr22'" label="With following condition" help="Double equal signs, ==, must be used as shown above. To filter for an arbitrary string, use the Select tool.">
+      <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="1.bed"/>
+      <param name="cond" value="c1=='chr22'"/>
+      <output name="out_file1" file="filter1_test1.bed"/>
+    </test>
+    <test>
+      <param name="input" value="7.bed"/>
+      <param name="cond" value="c1=='chr1' and c3-c2>=2000 and c6=='+'"/>
+      <output name="out_file1" file="filter1_test2.bed"/>
+    </test>
+    <!-- Test filtering of file with a variable number of columns. -->
+    <test>
+      <param name="input" value="filter1_in3.sam"/>
+      <param name="cond" value="c3=='chr1' and c5>5"/>
+      <output name="out_file1" file="filter1_test3.sam"/>
+    </test>
+    <test>
+      <param name="input" value="filter1_inbad.bed"/>
+      <param name="cond" value="c1=='chr22'"/>
+      <output name="out_file1" file="filter1_test4.bed"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+Double equal signs, ==, must be used as *"equal to"* (e.g., **c1 == 'chr22'**)
+
+.. class:: infomark
+
+**TIP:** Attempting to apply a filtering condition may throw exceptions if the data type (e.g., string, integer) in every line of the columns being filtered is not appropriate for the condition (e.g., attempting certain numerical calculations on strings).  If an exception is thrown when applying the condition to a line, that line is skipped as invalid for the filter condition.  The number of invalid skipped lines is documented in the resulting history item as a "Condition/data issue".
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+The filter tool allows you to restrict the dataset using simple conditional statements.
+
+- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file
+- Make sure that multi-character operators contain no white space ( e.g., **&lt;=** is valid while **&lt; =** is not valid )
+- When using 'equal-to' operator **double equal sign '==' must be used** ( e.g., **c1=='chr1'** )
+- Non-numerical values must be included in single or double quotes ( e.g., **c6=='+'** )
+- Filtering condition can include logical operators, but **make sure operators are all lower case** ( e.g., **(c1!='chrX' and c1!='chrY') or not c6=='+'** )
+
+-----
+
+**Example**
+
+- **c1=='chr1'** selects lines in which the first column is chr1
+- **c3-c2&lt;100*c4** selects lines where the value of column 3 minus the value of column 2 is less than the value of column 4 times 100
+- **len(c2.split(',')) &lt; 4** will select lines where the second column has fewer than four comma-separated elements
+- **c2>=1** selects lines in which the value of column 2 is greater than or equal to 1
+- Numbers should not contain commas - **c2&lt;=44,554,350** will not work, but **c2&lt;=44554350** will
+- Some words in the data can be used, but must be single or double quoted ( e.g., **c3=='exon'** )
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/stats/generate_matrix_for_pca_lda.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/generate_matrix_for_pca_lda.pl Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,147 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+my $Input_Matrix = $ARGV[0];
+my $Input_Label = $ARGV[1];
+
+my %Hash_X = ();
+my %Hash_Y = ();
+my $My_Num_X = 0;
+my $My_Num_Y = 0;
+
+open (OUT, "> $ARGV[2]");
+
+open (LABEL, "< $Input_Label")     ||
+ die "Sorry, I couldn't open the escape.txt for clone: $!\n";
+
+my $Label_Index = 0;
+my $X_Label;
+my $input_Label;
+while (defined($input_Label = <LABEL>)){
+ chomp($input_Label);
+ my @cArray_Label = $input_Label =~ /(\S+)\s*/g;
+ if ($input_Label =~ /\w/){
+ if ($Label_Index == 0){
+ $Hash_X{$cArray_Label[0]} = $cArray_Label[1];
+ $X_Label = $cArray_Label[1];
+ $Label_Index = 1;
+ }else{
+ if ($cArray_Label[1] eq $X_Label){
+ $Hash_X{$cArray_Label[0]} = $cArray_Label[1];
+ }else{
+ $Hash_Y{$cArray_Label[0]} = $cArray_Label[1];
+ }
+ }
+ }
+}
+close(LABEL);
+
+open (MATRIX, "< $Input_Matrix")     ||
+ die "Sorry, I couldn't open the escape.txt for clone: $!\n";
+
+my %Hash_Matrix = ();
+my %Hash_Features = ();
+my @cArray_Features = ();
+
+my %Hash_Sum = ();
+my $Matrix_Index = 0;
+my $input_Matrix;
+while (defined($input_Matrix = <MATRIX>)){
+ chomp($input_Matrix);
+ my @cArray_Matrix = $input_Matrix =~ /(\S+)\s*/g;
+ if ($input_Matrix =~ /\w/){
+ if ($Matrix_Index == 0){
+ @cArray_Features = @cArray_Matrix;
+ my $Temp_Num_Array = scalar(@cArray_Matrix);
+ my $Temp_Index = 0;
+ for(;$Temp_Index < $Temp_Num_Array; $Temp_Index++){
+ $Hash_Features{$cArray_Matrix[$Temp_Index]} = "BOL";
+ $Hash_Sum{$cArray_Matrix[$Temp_Index]} = 0;
+ }
+ $Matrix_Index = 1;
+ }else{
+ $Hash_Matrix{$cArray_Matrix[0]} = $input_Matrix;
+ }
+ }
+}
+close(MATRIX);
+
+my $Trace_Key;
+
+foreach $Trace_Key (sort {$a cmp $b} keys %Hash_X){
+ my @cArray_Trace_X = $Hash_Matrix{$Trace_Key} =~ /(\S+)\s*/g;
+ my $Num_Array_Feature_X = scalar(@cArray_Features);
+ my $Index_Feature_X = 0;
+ for(;$Index_Feature_X < $Num_Array_Feature_X; $Index_Feature_X++){
+ if ($Hash_Features{$cArray_Features[$Index_Feature_X]} eq "BOL"){
+ $Hash_Features{$cArray_Features[$Index_Feature_X]} = $cArray_Trace_X[$Index_Feature_X + 1];
+ }else{
+ $Hash_Features{$cArray_Features[$Index_Feature_X]} = $Hash_Features{$cArray_Features[$Index_Feature_X]} . "\t" . $cArray_Trace_X[$Index_Feature_X + 1];
+ }
+
+ $Hash_Sum{$cArray_Features[$Index_Feature_X]} += $cArray_Trace_X[$Index_Feature_X + 1];
+ } 
+ $My_Num_X ++;
+}
+
+my $Append_Key;
+foreach $Append_Key (keys %Hash_Features){
+ $Hash_Features{$Append_Key} = $Hash_Features{$Append_Key} . "\t" . $Hash_Sum{$Append_Key};
+ $Hash_Sum{$Append_Key} = 0;
+}
+
+foreach $Trace_Key (sort {$a cmp $b} keys %Hash_Y){
+ my @cArray_Trace_Y = $Hash_Matrix{$Trace_Key} =~ /(\S+)\s*/g;
+ my $Num_Array_Feature_Y = scalar(@cArray_Features);
+ my $Index_Feature_Y = 0;
+ for(;$Index_Feature_Y < $Num_Array_Feature_Y; $Index_Feature_Y++){
+ if ($Hash_Features{$cArray_Features[$Index_Feature_Y]} eq "BOL"){
+ $Hash_Features{$cArray_Features[$Index_Feature_Y]} = $cArray_Trace_Y[$Index_Feature_Y + 1];
+ }else{
+ $Hash_Features{$cArray_Features[$Index_Feature_Y]} = $Hash_Features{$cArray_Features[$Index_Feature_Y]} . "\t" . $cArray_Trace_Y[$Index_Feature_Y + 1];
+ }
+
+ $Hash_Sum{$cArray_Features[$Index_Feature_Y]} += $cArray_Trace_Y[$Index_Feature_Y + 1];
+ } 
+ $My_Num_Y ++;
+}
+
+foreach $Append_Key (keys %Hash_Features){
+ $Hash_Features{$Append_Key} = $Hash_Features{$Append_Key} . "\t" . $Hash_Sum{$Append_Key} . "\t" . "EOL";
+}
+
+my $Prt_Key;
+print OUT " \t";
+foreach $Prt_Key (sort {$a cmp $b} keys %Hash_X){
+ print OUT "$Prt_Key \t";
+}
+print OUT "X(SUM) \t";
+
+foreach $Prt_Key (sort {$a cmp $b} keys %Hash_Y){
+ print OUT "$Prt_Key \t";
+}
+print OUT "Y(SUM) \t";
+print OUT "\n";
+
+my $Prt_Index = 0;
+my $Prt_Array_Num = scalar (@cArray_Features);
+for(;$Prt_Index < $Prt_Array_Num; $Prt_Index++){
+ print OUT "$cArray_Features[$Prt_Index] \t$Hash_Features{$cArray_Features[$Prt_Index]}\n";
+}
+
+print OUT " \t";
+my $My_Label_Index = 0;
+for(;$My_Label_Index < $My_Num_X; $My_Label_Index++){
+ print OUT "X \t";
+}
+print OUT " \t";
+
+$My_Label_Index = 0;
+for(;$My_Label_Index < $My_Num_Y; $My_Label_Index++){
+ print OUT "Y \t";
+}
+print OUT " \t\n";
+
+close(OUT);
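
In outline, generate_matrix_for_pca_lda.pl transposes the input matrix so features become rows, groups the sample columns by class label (the first label encountered defines class X, every other label falls into Y), appends a per-class sum column after each group, and finishes with a row of X/Y class labels. A compact sketch of the same reshaping (hypothetical transpose_with_sums() helper; the argument layout and alphabetical class ordering are assumptions)::

    def transpose_with_sums(features, matrix, labels):
        """matrix maps sample -> per-feature values; labels maps sample -> class."""
        classes = sorted(set(labels.values()))
        rows = []
        for fi, feat in enumerate(features):
            row = [feat]
            for cls in classes:
                vals = [matrix[s][fi] for s in sorted(matrix) if labels[s] == cls]
                row.extend(vals)
                row.append(sum(vals))   # per-class sum column
            rows.append(row)
        return rows

    features = ['gc', 'cpg']
    matrix = {'s1': [0.4, 12], 's2': [0.5, 30], 's3': [0.6, 7]}
    labels = {'s1': 'X', 's2': 'X', 's3': 'Y'}
    for row in transpose_with_sums(features, matrix, labels):
        print(row)
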
diff -r 000000000000 -r 9071e359b9a3 tools/stats/generate_matrix_for_pca_lda.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/generate_matrix_for_pca_lda.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,48 @@
+<tool id="generate_matrix_for_pca_and_lda1" name="Generate A Matrix">
+    <description>for use in PCA and LDA</description>
+    <command interpreter="perl">generate_matrix_for_pca_lda.pl $input_1 $input_2 $output</command>
+
+    <inputs>
+        <param format="tabular" name="input_1" type="data" label="Source file First: a matrix (samples/observations in rows and variables/features in columns)"> </param>
+        <param format="tabular" name="input_2" type="data" label="Source file Second: a table (samples/observations with response/class label)"> </param>
+    </inputs>
+
+    <outputs>
+        <data format="tabular" name="output" />
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="input_1" value="matrix_generator_for_pc_and_lda_input_1.tabular"/>
+            <param name="input_2" value="matrix_generator_for_pc_and_lda_input_2.tabular"/>
+            <output name="output" file="matrix_generator_for_pc_and_lda_output.tabular"/>
+        </test>
+    </tests>
+
+    <help>
+
+.. class:: infomark
+
+**What it does**
+
+This tool generates a matrix to be used for running Linear Discriminant Analysis as described in Carrel et al., 2006 (PMID: 17009873):
+
+*Carrel L, Park C, Tyekucheva S, Dunn J, Chiaromonte F, et al. (2006) Genomic Environment Predicts Expression Patterns on the Human Inactive X Chromosome. PLoS Genet 2(9): e151. doi:10.1371/journal.pgen.0020151*
+
+-----
+
+**Example**
+
+- Input file (Source file First)
+
+.. image:: ./static/images/tools/lda/first_matrix_generator_example_file.png
+
+
+- Input file (Source file Second)
+
+.. image:: ./static/images/tools/lda/second_matrix_generator_example_file.png
+
+
+</help>
+
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/stats/grouping.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/grouping.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+# Guruprasad Ananda
+# Refactored 2011, Kanwei Li
+# Refactored to use numpy instead of rpy
+"""
+This tool provides the SQL "group by" functionality.
+"""
+import sys, commands, tempfile, random
+try:
+    import numpy
+except:
+    from galaxy import eggs
+    eggs.require( "numpy" )
+    import numpy
+
+from itertools import groupby
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def mode(data):
+    counts = {}
+    for x in data:
+        counts[x] = counts.get(x,0) + 1
+    maxcount = max(counts.values())
+    modelist = []
+    for x in counts:
+        if counts[x] == maxcount:
+            modelist.append( str(x) )
+    return ','.join(modelist)
+    
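+# Example (illustrative): mode(['a', 'b', 'b', 'c', 'c']) returns 'b,c';
+# every value tied for the highest count is reported (order may vary).
+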
+def main():
+    inputfile = sys.argv[2]
+    ignorecase = int(sys.argv[4])
+    ops = []
+    cols = []
+    round_val = []
+    data_ary = []
+    
+    for var in sys.argv[5:]:
+        op, col, do_round = var.split()
+        ops.append(op)
+        cols.append(col)
+        round_val.append(do_round)
+    """
+    At this point, ops, cols and rounds will look something like this:
+    ops:  ['mean', 'min', 'c']
+    cols: ['1', '3', '4']
+    round_val: ['no', 'yes' 'no']
+    """
+
+    try:
+        group_col = int( sys.argv[3] )-1
+    except:
+        stop_err( "Group column not specified." )
+    
+    str_ops = ['cat', 'length', 'unique', 'random', 'cat_uniq', 'mode'] # ops that can handle string/non-numeric inputs
+    
+    tmpfile = tempfile.NamedTemporaryFile()
+    
+    try:
+        """
+        The -k option for the Posix sort command is as follows:
+        -k, --key=POS1[,POS2]
+        start a key at POS1, end it at POS2 (origin 1)
+        In other words, column positions start at 1 rather than 0, so 
+        we need to add 1 to group_col.
+        if POS2 is not specified, the newer versions of sort will consider the entire line for sorting. To prevent this, we set POS2=POS1.
+        """
+        case = ''
+        if ignorecase == 1:
+            case = '-f' 
+        command_line = "sort -t ' ' %s -k%s,%s -o %s %s" % (case, group_col+1, group_col+1, tmpfile.name, inputfile)
+    except Exception, exc:
+        stop_err( 'Initialization error -> %s' %str(exc) )
+    
+    error_code, stdout = commands.getstatusoutput(command_line)
+    
+    if error_code != 0:
+        stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))
+        
+    fout = open(sys.argv[1], "w")
+    
+    def is_new_item(line):
+        item = line.strip().split("\t")[group_col]
+        if ignorecase == 1:
+            return item.lower()
+        return item
+        
+    for key, line_list in groupby(tmpfile, key=is_new_item):
+        op_vals = [ [] for op in ops ]
+        out_str = key
+        multiple_modes = False
+        mode_index = None
+        
+        for line in line_list:
+            fields = line.strip().split("\t")
+            for i, col in enumerate(cols):
+                col = int(col)-1 # cXX from galaxy is 1-based
+                try:
+                    val = fields[col].strip()
+                    op_vals[i].append(val)
+                except IndexError:
+                    sys.stderr.write( 'Could not access the value for column %s on line: "%s". Make sure file is tab-delimited.\n' % (col+1, line) )
+                    sys.exit( 1 )
+                
+        # Generate string for each op for this group
+        for i, op in enumerate( ops ):
+            data = op_vals[i]
+            rval = ""
+            if op == "mode":
+                rval = mode( data )
+            elif op == "length":
+                rval = len( data )
+            elif op == "random":
+                rval = random.choice(data)
+            elif op in ['cat', 'cat_uniq']:
+                if op == 'cat_uniq':
+                    data = numpy.unique(data)
+                rval = ','.join(data)
+            elif op == "unique":
+                rval = len( numpy.unique(data) )
+            else:
+                # some kind of numpy fn
+                try:
+                    data = map(float, data)
+                except ValueError:
+                    sys.stderr.write( "Operation %s expected number values but got %s instead.\n" % (op, data) )
+                    sys.exit( 1 )
+                rval = getattr(numpy, op)( data )
+                if round_val[i] == 'yes':
+                    rval = round(rval)
+                else:
+                    rval = '%g' % rval
+                        
+            out_str += "\t%s" % rval
+        
+        fout.write(out_str + "\n")
+    
+    # Generate a useful info message.
+    msg = "--Group by c%d: " %(group_col+1)
+    for i, op in enumerate(ops):
+        if op == 'cat':
+            op = 'concat'
+        elif op == 'cat_uniq':
+            op = 'concat_distinct'
+        elif op == 'length':
+            op = 'count'
+        elif op == 'unique':
+            op = 'count_distinct'
+        elif op == 'random':
+            op = 'randomly_pick'
+        
+        msg += op + "[c" + cols[i] + "] "
+    
+    print msg
+    fout.close()
+    tmpfile.close()
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/stats/grouping.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/grouping.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,123 @@
+<tool id="Grouping1" name="Group" version="2.0.0">
+  <description>data by a column and perform aggregate operations on other columns.</description>
+  <command interpreter="python">
+    grouping.py 
+      $out_file1
+      $input1
+      $groupcol
+      $ignorecase
+      #for $op in $operations
+       '${op.optype}
+        ${op.opcol}
+        ${op.opround}'
+      #end for
+  </command>
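+  <!-- Illustrative expansion of the command template above for a single
+       "mean of column 2, no rounding" operation:
+       grouping.py out_file1.dat input1.dat 1 0 'mean 2 no' -->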
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
+    <param name="groupcol" label="Group by column" type="data_column" data_ref="input1" />
+    <param name="ignorecase" type="boolean" truevalue="1" falsevalue="0">
+      <label>Ignore case while grouping?</label>
+    </param>
+    <repeat name="operations" title="Operation">
+      <param name="optype" type="select" label="Type">
+        <option value="mean">Mean</option>
+        <option value="median">Median</option>
+        <option value="mode">Mode</option>
+        <option value="max">Maximum</option>
+        <option value="min">Minimum</option>
+        <option value="sum">Sum</option>
+        <option value="length">Count</option>
+        <option value="unique">Count Distinct</option>
+        <option value="cat">Concatenate</option>
+        <option value="cat_uniq">Concatenate Distinct</option>
+        <option value="random">Randomly pick</option>
+        <option value="std">Standard deviation</option>
+      </param>
+      <param name="opcol" label="On column" type="data_column" data_ref="input1" />
+      <param name="opround" type="select" label="Round result to nearest integer?">
+         <option value="no">NO</option>
+         <option value="yes">YES</option>
+       </param>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">numpy</requirement>
+  </requirements>
+  <tests>
+    <!-- Test valid data -->
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="groupcol" value="1"/>
+      <param name="ignorecase" value="true"/>
+      <param name="optype" value="mean"/>
+      <param name="opcol" value="2"/>
+      <param name="opround" value="no"/>
+      <output name="out_file1" file="groupby_out1.dat"/>
+    </test>
+    <!-- Long case but test framework doesn't allow yet
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="groupcol" value="1"/>
+      <param name="ignorecase" value="false"/>
+      <param name="operations" value='[{"opcol": "2", "__index__": 0, "optype": "mean", "opround": "no"}, {"opcol": "2", "__index__": 1, "optype": "median", "opround": "no"}, {"opcol": "6", "__index__": 2, "optype": "mode", "opround": "no"}, {"opcol": "2", "__index__": 3, "optype": "max", "opround": "no"}, {"opcol": "2", "__index__": 4, "optype": "min", "opround": "no"}, {"opcol": "2", "__index__": 5, "optype": "sum", "opround": "no"}, {"opcol": "1", "__index__": 6, "optype": "length", "opround": "no"}, {"opcol": "1", "__index__": 7, "optype": "unique", "opround": "no"}, {"opcol": "1", "__index__": 8, "optype": "cat", "opround": "no"}, {"opcol": "6", "__index__": 9, "optype": "cat_uniq", "opround": "no"}, {"opcol": "2", "__index__": 10, "optype": "random", "opround": "no"}, {"opcol": "2", "__index__": 11, "optype": "std", "opround": "no"}]'/>
+      <output name="out_file1" file="groupby_out3.tabular"/>
+    </test>
+    -->
+    <!-- Test data with an invalid value in a column. Can't do it because test framework doesn't allow testing of errors
+    <test>
+      <param name="input1" value="1.tabular"/>
+      <param name="groupcol" value="1"/>
+      <param name="ignorecase" value="true"/>
+      <param name="optype" value="mean"/>
+      <param name="opcol" value="2"/>
+      <param name="opround" value="no"/>
+      <output name="out_file1" file="groupby_out2.dat"/>
+    </test>
+     -->
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+This tool allows you to group the input dataset by a particular column and perform aggregate functions on any other column(s): Mean, Median, Mode, Maximum, Minimum, Sum, Count, Count Distinct, Concatenate, Concatenate Distinct, Randomly pick, and Standard deviation.
+
+The Concatenate function takes, for each group, each item in the specified column and builds a comma-delimited list. Concatenate Distinct does the same but builds a list of unique items, with no repetition.
+
+Count and Count Distinct are the counting analogs of Concatenate and Concatenate Distinct: they only count the items and return an integer.
+
+- If multiple modes are present, all are reported.
+
+-----
+
+**Example**
+
+- For the following input::
+
+   chr22  1000  1003  TTT
+   chr22  2000  2003  aaa
+   chr10  2200  2203  TTT
+   chr10  1200  1203  ttt
+   chr22  1600  1603  AAA
+
+- **Grouping on column 4** while ignoring case, and performing operation **Count on column 1** will return::
+
+   AAA    2
+   TTT    3
+   
+- **Grouping on column 4** while not ignoring case, and performing operation **Count on column 1** will return::
+
+   aaa    1
+   AAA    1
+   ttt    1
+   TTT    2
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/stats/gsummary.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/gsummary.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+
+import sys, re, tempfile
+from rpy import *
+# Older py compatibility
+try:
+    set()
+except:
+    from sets import Set as set
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def S3_METHODS( all="key" ):
+    Group_Math =  [ "abs", "sign", "sqrt", "floor", "ceiling", "trunc", "round", "signif",
+        "exp", "log", "cos", "sin", "tan", "acos", "asin", "atan", "cosh", "sinh", "tanh",
+        "acosh", "asinh", "atanh", "lgamma", "gamma", "gammaCody", "digamma", "trigamma",
+        "cumsum", "cumprod", "cummax", "cummin", "c" ]
+    Group_Ops = [ "+", "-", "*", "/", "^", "%%", "%/%", "&", "|", "!", "==", "!=", "<", "<=", ">=", ">", "(", ")", "~", "," ]
+    if all is "key":
+        return { 'Math' : Group_Math, 'Ops' : Group_Ops }
+
+def main():
+    try:
+        datafile = sys.argv[1]
+        outfile_name = sys.argv[2]
+        expression = sys.argv[3]
+    except: 
+        stop_err( 'Usage: python gsummary.py input_file output_file expression' )
+
+    math_allowed = S3_METHODS()[ 'Math' ]
+    ops_allowed = S3_METHODS()[ 'Ops' ]
+
+    # Check for invalid expressions
+    for word in re.compile( '[a-zA-Z]+' ).findall( expression ):
+        if word and not word in math_allowed: 
+            stop_err( "Invalid expression '%s': term '%s' is not recognized or allowed" %( expression, word ) )
+    symbols = set()
+    for symbol in re.compile( '[^a-z0-9\s]+' ).findall( expression ):
+        if symbol and not symbol in ops_allowed:
+            stop_err( "Invalid expression '%s': operator '%s' is not recognized or allowed" % ( expression, symbol ) )
+        else:
+            symbols.add( symbol )
+    if len( symbols ) == 1 and ',' in symbols:
+        # User may have entered a comma-separated list of columns
+        stop_err( "Invalid columns '%s': this tool requires a single column or expression" % expression )
+
+    # Find all column references in the expression
+    cols = []
+    for col in re.compile( 'c[0-9]+' ).findall( expression ):
+        try:
+            cols.append( int( col[1:] ) - 1 )
+        except:
+            pass
+
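+    # e.g. the expression "log(c5,10)" yields cols == [4] (0-based index of c5)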
+    tmp_file = tempfile.NamedTemporaryFile( 'w+b' )
+    # Write the R header row to the temporary file
+    hdr_str = "\t".join( "c%s" % str( col+1 ) for col in cols )
+    tmp_file.write( "%s\n" % hdr_str )
+    skipped_lines = 0
+    first_invalid_line = 0
+    i = 0
+    for i, line in enumerate( file( datafile ) ):
+        line = line.rstrip( '\r\n' )
+        if line and not line.startswith( '#' ):
+            valid = True
+            fields = line.split( '\t' )
+            # Write the R data row to the temporary file
+            for col in cols:
+                try:
+                    float( fields[ col ] )
+                except:
+                    skipped_lines += 1
+                    if not first_invalid_line:
+                        first_invalid_line = i + 1
+                    valid = False
+                    break
+            if valid:
+                data_str = "\t".join( fields[ col ] for col in cols )
+                tmp_file.write( "%s\n" % data_str )
+    tmp_file.flush()
+
+    if skipped_lines == i + 1:
+        stop_err( "Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements." )
+    else:
+        # summary function and return labels
+        summary_func = r( "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }" )
+        headings = [ 'sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%' ]
+        headings_str = "\t".join( headings )
+        
+        set_default_mode( NO_CONVERSION )
+        r_data_frame = r.read_table( tmp_file.name, header=True, sep="\t" )
+        
+        outfile = open( outfile_name, 'w' )
+
+        for col in re.compile( 'c[0-9]+' ).findall( expression ):
+            r.assign( col, r[ "$" ]( r_data_frame, col ) )
+        try:
+            summary = summary_func( r( expression ) )
+        except RException, s:
+            outfile.close()
+            stop_err( "Computation resulted in the following error: %s" % str( s ) )
+        summary = summary.as_py( BASIC_CONVERSION )
+        outfile.write( "#%s\n" % headings_str )
+        outfile.write( "%s\n" % "\t".join( [ "%g" % ( summary[ k ] ) for k in headings ] ) )
+        outfile.close()
+
+        if skipped_lines:
+            print "Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % ( skipped_lines, first_invalid_line )        
+
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/stats/gsummary.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/gsummary.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,76 @@
+<tool id="Summary_Statistics1" name="Summary Statistics" version="1.1.0">
+  <description>for any numerical column</description>
+  <command interpreter="python">gsummary.py $input $out_file1 "$cond"</command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Summary statistics on" help="Dataset missing? See TIP below"/>
+    <param name="cond" size="30" type="text" value="c5" label="Column or expression" help="See syntax below">
+      <validator type="empty_field" message="Enter a valid column or expression, see syntax below for examples"/>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input" value="1.bed"/>
+      <output name="out_file1" file="gsummary_out1.tabular"/>
+      <param name="cond" value="c2"/>
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+This tool expects input datasets consisting of tab-delimited columns (blank or comment lines beginning with a # character are automatically skipped).
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert delimiters to TAB*
+
+.. class:: infomark
+
+**TIP:** Computing summary statistics may throw exceptions if the values in the column being summarized are not all numerical.  If a line is missing a value or contains a non-numerical value in the column being summarized, that line is skipped and its value is not included in the statistical computation.  The number of skipped invalid lines is documented in the resulting history item.
+
+.. class:: infomark
+
+**USING R FUNCTIONS:** Most functions (like *abs*) take only a single expression. *log* can take one or two parameters, like *log(expression,base)*
+
+Currently, these R functions are supported: *abs, sign, sqrt, floor, ceiling, trunc, round, signif, exp, log, cos, sin, tan, acos, asin, atan, cosh, sinh, tanh, acosh, asinh, atanh, lgamma, gamma, gammaCody, digamma, trigamma, cumsum, cumprod, cummax, cummin*
+
+-----
+
+**Syntax**
+
+This tool computes basic summary statistics on a given column, or on a valid expression containing one or more columns.
+
+- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file.
+
+- For example:
+
+  - **log(c5)** calculates the summary statistics for the natural log of column 5
+  - **(c5 + c6 + c7) / 3** calculates the summary statistics on the average of columns 5-7
+  - **log(c5,10)** summary statistics of the base 10 log of column 5
+  - **sqrt(c5+c9)** summary statistics of the square root of column 5 + column 9
+
+-----
+
+**Examples**
+
+- Input Dataset::
+
+    c1      c2      c3      c4      c5              c6
+    586     chrX    161416  170887  41108_at        16990
+    73      chrX    505078  532318  35073_at        1700
+    595     chrX    1361578 1388460 33665_s_at      1960
+    74      chrX    1420620 1461919 1185_at         8600
+
+- Summary Statistics on column c6 of the above input dataset::
+
+    #sum       mean      stdev     0%        25%       50%       75%        100%
+    29250.000  7312.500  7198.636  1700.000  1895.000  5280.000  10697.500  16990.000
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/stats/gsummary.xml.groups
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/gsummary.xml.groups Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,62 @@
+<tool id="Summary Statistics1" name="Summary Statistics">
+  <description>of a column in a tab delimited file according to an expression</description>
+  <command interpreter="python">gsummary.py $input $out_file1 "$cond" "$groups"</command>
+  <inputs>
+    <param name="cond" size="40" type="text" value="c5" label="expression"/>
+    <param name="groups" size="40" type="text" value="none" label="group terms (c1,c4,etc.)"/>
+    <param format="txt" name="input" type="data" label="summary statistics on"/>
+
+  </inputs>
+  <outputs>
+    <data format="txt" name="out_file1" />
+  </outputs>
+  <help>
+
+.. class:: warningmark
+
+This tool expects input datasets to consist of tab-delimited columns (blank or comment lines beginning with a # character are automatically skipped).
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+.. class:: infomark
+
+**TIP:** Computing summary statistics may throw exceptions if the values in the column being summarized are not all numerical.  If a line is missing a value or contains a non-numerical value in the column being summarized, that line is skipped and its value is not included in the statistical computation.  The number of skipped invalid lines is documented in the resulting history item.
+
+**Syntax**
+
+This tool computes basic summary statistics on a given column, or on an expression containing those columns
+
+- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file
+- To group the summary by the values in a column or columns, specify in the **group terms** box...
+    + **c1**  *group by the values in column 1*
+    + **c1,c4** *group by the values in column 1, then by the values in column 4*
+
+
+-----
+
+**Expression examples**
+
+- **log(c5)** calculates the summary statistics for the natural log of column 5
+- **(c5 + c6 + c7) / 3** calculates the summary statistics on the average of columns 5-7
+- **log(c5,10)** summary statistics of the base 10 log of column 5
+- **sqrt(c5+c9)** summary statistics of the square root of column 5 + column 9
+
+**Group examples**
+
+- **c1**  group by the values in column 1
+- **c1,c4** group by the values in column 1, then by the values in column 4
+
+-----
+
+.. class:: infomark
+
+**TIP:** Most functions (like *abs*) take only a single expression. *log* can take one or two parameters, like *log(expression,base)* 
+
+Currently, these R functions are supported: *abs, sign, sqrt, floor, ceiling, trunc, round, signif, exp, log, cos, sin, tan, acos, asin, atan, cosh, sinh, tanh, acosh, asinh, atanh, lgamma, gamma, gammaCody, digamma, trigamma, cumsum, cumprod, cummax, cummin*
+
+.. |INFO| image:: ./static/images/icon_info_sml.gif
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/stats/lda_analy.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/lda_analy.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,285 @@
+<tool id="lda_analy1" name="Perform LDA" version="1.0.1">
+	<description>Linear Discriminant Analysis</description>
+	<command interpreter="sh">r_wrapper.sh $script_file</command>
+	<inputs>
+		<param format="tabular" name="input" type="data" label="Source file"/>
+		<param name="cond" size="30" type="integer" value="3" label="Number of principal components" help="See TIP below">
+			<validator type="empty_field" message="Enter a valid number of principal components, see syntax below for examples"/>
+		</param>
+
+	</inputs>
+	<outputs>
+		<data format="txt" name="output" />
+	</outputs>
+
+	<tests>
+		<test>
+			<param name="input" value="matrix_generator_for_pc_and_lda_output.tabular"/>
+			<output name="output" file="lda_analy_output.txt"/>
+			<param name="cond" value="2"/>
+
+		</test>
+	</tests>
+
+	<configfiles>
+        	<configfile name="script_file">
+
+        rm(list = objects() )
+
+        ############# FORMAT X DATA #########################
+        format&lt;-function(data) {
+            ind=NULL
+            for(i in 1 : ncol(data)){
+                if (is.na(data[nrow(data),i])) {
+                    ind&lt;-c(ind,i)
+                }
+            }
+            #print(is.null(ind))
+            if (!is.null(ind)) {
+                data&lt;-data[,-c(ind)]
+            }
+
+            data
+        }
+
+        ########GET RESPONSES ###############################
+        get_resp&lt;- function(data) {
+            resp1&lt;-as.vector(data[,ncol(data)])
+                resp=numeric(length(resp1))
+            for (i in 1:length(resp1)) {
+                if (resp1[i]=="Y ") {
+                    resp[i] = 0
+                }
+                if (resp1[i]=="X ") {
+                    resp[i] = 1
+                }
+            }
+                return(resp)
+        }
+
+        ######## CHARS TO NUMBERS ###########################
+        f_to_numbers&lt;- function(F) {
+            ind&lt;-NULL
+            G&lt;-matrix(0,nrow(F), ncol(F))
+            for (i in 1:nrow(F)) {
+                for (j in 1:ncol(F)) {
+                    G[i,j]&lt;-as.integer(F[i,j])
+                }
+            }
+            return(G)
+        }
+
+        ###################NORMALIZING#########################
+        norm &lt;- function(M, a=NULL, b=NULL) {
+            C&lt;-NULL
+            ind&lt;-NULL
+
+            for (i in 1: ncol(M)) {
+                if (sd(M[,i])!=0) {
+                    M[,i]&lt;-(M[,i]-mean(M[,i]))/sd(M[,i])
+                }
+                #   else {print(mean(M[,i]))}
+            }
+            return(M)
+        }
+
+        ##### LDA DIRECTIONS #################################
+        lda_dec &lt;- function(data, k){
+            priors=numeric(k)
+            grandmean&lt;-numeric(ncol(data)-1)
+            means=matrix(0,k,ncol(data)-1)
+            B = matrix(0, ncol(data)-1, ncol(data)-1)
+            N=nrow(data)
+            for (i in 1:k){
+                priors[i]=sum(data[,1]==i)/N
+                grp=subset(data,data\$group==i)
+                means[i,]=mean(grp[,2:ncol(data)])
+                #print(means[i,])
+                #print(priors[i])
+                #print(priors[i]*means[i,])
+                grandmean = priors[i]*means[i,] + grandmean
+            }
+
+            for (i in 1:k) {
+                B= B + priors[i]*((means[i,]-grandmean)%*%t(means[i,]-grandmean))
+            }
+
+            W = var(data[,2:ncol(data)])
+            svdW = svd(W)
+            inv_sqrtW =solve(svdW\$v %*% diag(sqrt(svdW\$d)) %*% t(svdW\$v))
+            B_star= t(inv_sqrtW)%*%B%*%inv_sqrtW
+            B_star_decomp = svd(B_star)
+            directions  = inv_sqrtW%*%B_star_decomp\$v
+            return( list(directions, B_star_decomp\$d) )
+        }
+
+        ################ NAIVE BAYES FOR 1D SIR OR LDA ##############
+        naive_bayes_classifier &lt;- function(resp, tr_data, test_
[...]
+        #             print(paste(c(msg, 'overall : ', (1-er)*100, "%."),collapse=" "))
+        #             print(paste(c(msg, 'within escapes : ', (1-er_esc)*100, "%."),collapse=" "))
+        #             print(paste(c(msg, 'within subjects: ', (1-er_subj)*100, "%."),collapse=" "))
+            }
+            return(c((1-er)*100, (1-er_esc)*100, (1-er_subj)*100))
+        }
+
+        ## Main Function ##
+
+	files&lt;-matrix("${input}", 1,1, byrow=T)
+
+	d&lt;-"${cond}"   # Number of PC
+
+	tau&lt;-seq(0,1, by=0.005)
+	#tau&lt;-seq(0,1, by=0.1)
+	for_curve=matrix(-10, 3,length(tau))
+
+	##############################################################
+
+	test_data_whole_X &lt;-read.delim(files[1,1], row.names=1)
+
+	#### FORMAT TRAINING DATA ####################################
+	# get only necessary columns
+
+	test_data_whole_X&lt;-format(test_data_whole_X)
+	oligo_labels&lt;-test_data_whole_X[1:(nrow(test_data_whole_X)-1),ncol(test_data_whole_X)]
+	test_data_whole_X&lt;-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)]
+
+	X_names&lt;-colnames(test_data_whole_X)[1:ncol(test_data_whole_X)]
+	test_data_whole_X&lt;-t(test_data_whole_X)
+	resp&lt;-get_resp(test_data_whole_X)
+	ldaqda_resp = resp + 1
+	a&lt;-sum(resp)		# Number of Subject
+	b&lt;-length(resp) - a	# Number of Escape
+	## FREQUENCIES #################################################
+	F&lt;-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)]
+	F&lt;-f_to_numbers(F)
+	FN&lt;-norm(F, a, b)
+	ss&lt;-svd(FN)
+	eigvar&lt;-NULL
+	eig&lt;-ss\$d^2
+
+	for ( i in 1:length(ss\$d)) {
+		eigvar[i]&lt;-sum(eig[1:i])/sum(eig)
+	}
+
+	#print(paste(c("Variance explained : ", eigvar[d]*100, "%"), collapse=""))
+
+	Z&lt;-F%*%ss\$v
+
+	ldaqda_data &lt;- data.frame(group=ldaqda_resp,Z[,1:d])
+	lda_dir&lt;-lda_dec(ldaqda_data,2)
+	train_lda_pred &lt;-Z[,1:d]%*%lda_dir[[1]]
+
+	############# NAIVE BAYES CROSS-VALIDATION #############
+	### LDA #####
+
+	y&lt;-ldaqda_resp
+	X&lt;-F
+	cv&lt;-matrix(c(rep('NA',nrow(test_data_whole_X))), nrow(test_data_whole_X), length(tau))
+	for (i in 1:nrow(test_data_whole_X)) {
+	#	print(i)
+		resp&lt;-y[-i]
+		p&lt;-matrix(X[-i,], dim(X)[1]-1, dim(X)[2])
+		testdata&lt;-matrix(X[i,],1,dim(X)[2])
+		p1&lt;-norm(p)
+		sss&lt;-svd(p1)
+		pred&lt;-(p%*%sss\$v)[,1:d]
+		test&lt;- (testdata%*%sss\$v)[,1:d]
+		lda  &lt;- lda_dec(data.frame(group=resp,pred),2)
+		pred &lt;- pred[,1:d]%*%lda[[1]][,1]
+		test &lt;- test%*%lda[[1]][,1]
+		test&lt;-matrix(test, 1, length(test))
+		for (t in 1:length(tau)) {
+			cv[i, t] &lt;- naive_bayes_classifier (resp, pred, test,k=2, tau[t])
+		}
+	}
+
+	for (t in 1:length(tau)) {
+		tr_err&lt;-ext_error_rate(cv[,t], ldaqda_resp , c("CV"), 1)
+		for_curve[1:3,t]&lt;-tr_err
+	}
+
+	dput(for_curve, file="${output}")
+
+
+		</configfile>
+	</configfiles>
+
+	<help>
+
+.. class:: infomark
+
+**TIP:** If you want to perform Principal Component Analysis (PCA) on the given numeric input data (which corresponds to "Source file First" in the "Generate A Matrix" tool), please use *Multivariate Analysis/Principal Component Analysis*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool performs the Linear Discriminant Analysis described in Carrel et al., 2006 (PMID: 17009873).
+
+*Carrel L, Park C, Tyekucheva S, Dunn J, Chiaromonte F, et al. (2006) Genomic Environment Predicts Expression Patterns on the Human Inactive X Chromosome. PLoS Genet 2(9): e151. doi:10.1371/journal.pgen.0020151*
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- Output from the "Generate A Matrix" tool is used as the input file for this tool
+- The output of this tool contains LDA classification success rates for different values of the tuning parameter tau (from 0 to 1 in steps of 0.005). This output file is used to draw the ROC plot, from which more detailed information can be obtained.
+
+
+</help>
+
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/stats/plot_from_lda.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/plot_from_lda.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,258 @@
+<tool id="plot_for_lda_output1" name="Draw ROC plot" version="1.0.1">
+	<description>on "Perform LDA" output</description>
+	<command interpreter="sh">r_wrapper.sh $script_file</command>
+
+	<inputs>
+		<param format="txt" name="input" type="data" label="Source file"> </param>
+		<param name="my_title" size="30" type="text" value="My Figure" label="Title of your plot" help="See syntax below"> </param>
+		<param name="X_axis" size="30" type="text" value="Text for X axis" label="Legend of X axis in your plot" help="See syntax below"> </param>
+		<param name="Y_axis" size="30" type="text" value="Text for Y axis" label="Legend of Y axis in your plot" help="See syntax below"> </param>
+	</inputs>
+	<outputs>
+		<data format="pdf" name="pdf_output" />
+	</outputs>
+
+	<tests>
+		<test>
+			<param name="input" value="lda_analy_output.txt"/>
+			<param name="my_title" value="Test Plot1"/>
+			<param name="X_axis" value="Test Plot2"/>
+			<param name="Y_axis" value="Test Plot3"/>
+			<output name="pdf_output" file="plot_for_lda_output.pdf"/>
+		</test>
+	</tests>
+
+    <configfiles>
+            <configfile name="script_file">
+
+        rm(list = objects() )
+
+        ############# FORMAT X DATA #########################
+        format&lt;-function(data) {
+            ind=NULL
+            for(i in 1 : ncol(data)){
+                if (is.na(data[nrow(data),i])) {
+                    ind&lt;-c(ind,i)
+                }
+            }
+            #print(is.null(ind))
+            if (!is.null(ind)) {
+                data&lt;-data[,-c(ind)]
+            }
+
+            data
+        }
+
+        ########GET RESPONSES ###############################
+        get_resp&lt;- function(data) {
+            resp1&lt;-as.vector(data[,ncol(data)])
+                resp=numeric(length(resp1))
+            for (i in 1:length(resp1)) {
+                if (resp1[i]=="Control ") {
+                    resp[i] = 0
+                }
+                if (resp1[i]=="XLMR ") {
+                    resp[i] = 1
+                }
+            }
+                return(resp)
+        }
+
+        ######## CHARS TO NUMBERS ###########################
+        f_to_numbers&lt;- function(F) {
+            ind&lt;-NULL
+            G&lt;-matrix(0,nrow(F), ncol(F))
+            for (i in 1:nrow(F)) {
+                for (j in 1:ncol(F)) {
+                    G[i,j]&lt;-as.integer(F[i,j])
+                }
+            }
+            return(G)
+        }
+
+        ###################NORMALIZING#########################
+        norm &lt;- function(M, a=NULL, b=NULL) {
+            C&lt;-NULL
+            ind&lt;-NULL
+
+            for (i in 1: ncol(M)) {
+                if (sd(M[,i])!=0) {
+                    M[,i]&lt;-(M[,i]-mean(M[,i]))/sd(M[,i])
+                }
+                #   else {print(mean(M[,i]))}
+            }
+            return(M)
+        }
+
+        ##### LDA DIRECTIONS #################################
+        lda_dec &lt;- function(data, k){
+            priors=numeric(k)
+            grandmean&lt;-numeric(ncol(data)-1)
+            means=matrix(0,k,ncol(data)-1)
+            B = matrix(0, ncol(data)-1, ncol(data)-1)
+            N=nrow(data)
+            for (i in 1:k){
+                priors[i]=sum(data[,1]==i)/N
+                grp=subset(data,data\$group==i)
+                means[i,]=mean(grp[,2:ncol(data)])
+                #print(means[i,])
+                #print(priors[i])
+                #print(priors[i]*means[i,])
+                grandmean = priors[i]*means[i,] + grandmean
+            }
+
+            for (i in 1:k) {
+                B= B + priors[i]*((means[i,]-grandmean)%*%t(means[i,]-grandmean))
+            }
+
+            W = var(data[,2:ncol(data)])
+            svdW = svd(W)
+            inv_sqrtW =solve(svdW\$v %*% diag(sqrt(svdW\$d)) %*% t(svdW\$v))
+            B_star= t(inv_sqrtW)%*%B%*%inv_sqrtW
+            B_star_decomp = svd(B_star)
+            directions  = inv_sqrtW%*%B_star_decomp\$v
[...]
+            }
+            else {
+               cl[1]=2
+               cl[2]=1
+            }
+
+            for (i in 1:length(test_data)) {
+
+                if (test_data[i] &lt;= cutoff) {
+                    predclass[i] = cl[1]
+            }
+                else {
+                    predclass[i] = cl[2]
+            }
+                }
+            #print(means)
+            #print(mean(means))
+            #X11()
+            #plot(test_data,pch=predclass, col=resp)
+            predclass
+        }
+
+        ################# EXTENDED ERROR RATES #################
+        ext_error_rate &lt;- function(predclass, actualclass,msg=c("you forgot the message"), pr=1) {
+                 er=sum(predclass != actualclass)/length(predclass)
+
+                 matr&lt;-data.frame(predclass=predclass,actualclass=actualclass)
+                 escapes = subset(matr, actualclass==1)
+                 subjects = subset(matr, actualclass==2)
+                 er_esc=sum(escapes\$predclass != escapes\$actualclass)/length(escapes\$predclass)
+                 er_subj=sum(subjects\$predclass != subjects\$actualclass)/length(subjects\$predclass)
+
+                 if (pr==1) {
+        #             print(paste(c(msg, 'overall : ', (1-er)*100, "%."),collapse=" "))
+        #             print(paste(c(msg, 'within escapes : ', (1-er_esc)*100, "%."),collapse=" "))
+        #             print(paste(c(msg, 'within subjects: ', (1-er_subj)*100, "%."),collapse=" "))
+            }
+            return(c((1-er)*100, (1-er_esc)*100, (1-er_subj)*100))
+        }
+
+        ## Main Function ##
+
+	files_alias&lt;-c("${my_title}")
+	tau=seq(0,1,by=0.005)
+	nfiles=1
+	f = c("${input}")
+
+	rez_ext&lt;-list()
+	for (i in 1:nfiles) {
+		rez_ext[[i]]&lt;-dget(paste(f[i], sep="",collapse=""))
+	}
+
+	tau&lt;-tau[1:(length(tau)-1)]
+	for (i in 1:nfiles) {
+		rez_ext[[i]]&lt;-rez_ext[[i]][,1:(length(tau)-1)]
+	}
+
+	######## OPTIMAL TAU ###########################
+
+	#rez_ext
+
+	rate&lt;-c("Optimal tau","Tr total", "Tr Y", "Tr X")
+
+	m_tr&lt;-numeric(nfiles)
+	m_xp22&lt;-numeric(nfiles)
+	m_x&lt;-numeric(nfiles)
+
+	for (i in 1:nfiles) {
+		r&lt;-rez_ext[[i]]
+		#tr
+	#	rate&lt;-rbind(rate, c(files_alias[i]," "," "," ") )
+		mm&lt;-which((r[3,])==max(r[3,]))
+
+		m_tr[i]&lt;-mm[1]
+		rate&lt;-rbind(rate,c(tau[m_tr[i]],r[,m_tr[i]]))
+	}
+	print(rate)
+
+	pdf(file= paste("${pdf_output}"))
+
+	plot(rez_ext[[i]][2,]~rez_ext[[i]][3,], xlim=c(0,100), ylim=c(0,100), xlab="${X_axis}   [1-FP(False Positive)]", ylab="${Y_axis}   [1-FP(False Positive)]", type="l", lty=1, col="blue", xaxt='n', yaxt='n')
+	for (i in 1:nfiles) {
+		lines(rez_ext[[i]][2,]~rez_ext[[i]][3,], xlab="${X_axis}   [1-FP(False Positive)]", ylab="${Y_axis}   [1-FP(False Positive)]", type="l", lty=1, col=i)
+		# pt=c(r,)
+		points(x=rez_ext[[i]][3,m_tr[i]],y=rez_ext[[i]][2,m_tr[i]], pch=16, col=i)
+	}
+
+
+	title(main="${my_title}", adj=0, cex.main=1.1)
+	axis(2, at=c(0,20,40,60,80,100), labels=c('0','20','40','60','80','100%'))
+	axis(1, at=c(0,20,40,60,80,100), labels=c('0','20','40','60','80','100%'))
+
+	#leg=c("10 kb","50 kb","100 kb")
+	#legend("bottomleft",legend=leg , col=c(1,2,3), lty=c(1,1,1))
+
+	#dev.off()
+
+		</configfile>
+	</configfiles>
+
+
+	<help>
+.. class:: infomark
+
+**What it does**
+
+This tool generates a Receiver Operating Characteristic (ROC) plot showing LDA classification success rates for different values of the tuning parameter tau, as in Figure 3 of Carrel et al., 2006 (PMID: 17009873).
+
+*Carrel L, Park C, Tyekucheva S, Dunn J, Chiaromonte F, et al. (2006) Genomic Environment Predicts Expression Patterns on the Human Inactive X Chromosome. PLoS Genet 2(9): e151. doi:10.1371/journal.pgen.0020151*
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- Output from the "Perform LDA" tool is used as the input file for this tool.
+
+</help>
+
+
+
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/stats/r_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/r_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+### Run R providing the R script in $1 as standard input and passing 
+### the remaining arguments on the command line
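+###
+### Example usage (illustrative):
+###   sh r_wrapper.sh my_script.R    # any further arguments are passed to R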
+
+# Function that writes a message to stderr and exits
+fail()
+{
+    echo "$@" >&2
+    exit 1
+}
+
+# Ensure R executable is found
+which R > /dev/null || fail "'R' is required by this tool but was not found on path" 
+
+# Extract first argument
+infile=$1; shift
+
+# Ensure the file exists
+test -f "$infile" || fail "R input file '$infile' does not exist"
+
+# Invoke R passing file named by first argument to stdin
+R --vanilla --slave "$@" < "$infile"
diff -r 000000000000 -r 9071e359b9a3 tools/stats/wiggle_to_simple.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/wiggle_to_simple.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+"""
+Read a wiggle track and print out a series of lines containing
+"chrom position score". Ignores track lines, handles bed, variableStep
+and fixedStep wiggle lines.
+"""
+import sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+import bx.wiggle
+from galaxy.tools.exception_handling import *
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def main():
+    if len( sys.argv ) > 1: 
+        in_file = open( sys.argv[1] )
+    else:
+        in_file = sys.stdin
+    
+    if len( sys.argv ) > 2:
+        out_file = open( sys.argv[2], "w" )
+    else:
+        out_file = sys.stdout
+    
+    try:
+        for fields in bx.wiggle.IntervalReader( UCSCOutWrapper( in_file ) ):
+            out_file.write( "%s\n" % "\t".join( map( str, fields ) ) )
+    except UCSCLimitException:
+        # Wiggle data was truncated, at the very least need to warn the user.
+        print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.'
+    except ValueError, e:
+        in_file.close()
+        out_file.close()
+        stop_err( str( e ) )
+
+    in_file.close()
+    out_file.close()
+
+if __name__ == "__main__": main()
diff -r 000000000000 -r 9071e359b9a3 tools/stats/wiggle_to_simple.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/wiggle_to_simple.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,84 @@
+<tool id="wiggle2simple1" name="Wiggle-to-Interval">
+  <description>converter</description>
+  <command interpreter="python">wiggle_to_simple.py $input $out_file1 </command>
+  <inputs>
+    <param format="wig" name="input" type="data" label="Convert"/>
+  </inputs>
+  <outputs>
+    <data format="interval" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="2.wig" />
+      <output name="out_file1" file="2.interval"/>
+    </test>
+  </tests>
+  <help>
+**Syntax**
+
+This tool converts wiggle data into interval type.
+
+- **Wiggle format**: The .wig format is line-oriented. Wiggle data is preceded by a UCSC track definition line.  Following the track definition line is the track data, which can be entered in three different formats described below.
+
+  - **BED format** with no declaration line and four columns of data::
+
+      chromA  chromStartA  chromEndA  dataValueA
+      chromB  chromStartB  chromEndB  dataValueB
+
+  - **variableStep** two column data; started by a declaration line and followed with chromosome positions and data values::
+
+      variableStep  chrom=chrN  [span=windowSize]
+      chromStartA  dataValueA
+      chromStartB  dataValueB
+
+  - **fixedStep** single column data; started by a declaration line and followed with data values::
+
+      fixedStep  chrom=chrN  start=position  step=stepInterval  [span=windowSize]
+      dataValue1
+      dataValue2
+
+-----
+
+**Example**
+
+- input wiggle format file::
+
+    #track type=wiggle_0 name="Bed Format" description="BED format"
+    chr19 59302000 59302300 -1.0
+    chr19 59302300 59302600 -0.75
+    chr19 59302600 59302900 -0.50
+    chr19 59302900 59303200 -0.25
+    chr19 59303200 59303500 0.0
+    #track type=wiggle_0 name="variableStep" description="variableStep format"
+    variableStep chrom=chr19 span=150
+    59304701 10.0
+    59304901 12.5
+    59305401 15.0
+    59305601 17.5
+    #track type=wiggle_0 name="fixedStep" description="fixed step" visibility=full
+    fixedStep chrom=chr19 start=59307401 step=300 span=200
+    1000
+    900
+    800
+    700
+    600
+
+- convert the above file to interval file::
+
+    chr19 59302000 59302300 + -1.0
+    chr19 59302300 59302600 + -0.75
+    chr19 59302600 59302900 + -0.5
+    chr19 59302900 59303200 + -0.25
+    chr19 59303200 59303500 + 0.0
+    chr19 59304701 59304851 + 10.0
+    chr19 59304901 59305051 + 12.5
+    chr19 59305401 59305551 + 15.0
+    chr19 59305601 59305751 + 17.5
+    chr19 59307701 59307901 + 1000.0
+    chr19 59308001 59308201 + 900.0
+    chr19 59308301 59308501 + 800.0
+    chr19 59308601 59308801 + 700.0
+    chr19 59308901 59309101 + 600.0
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/taxonomy/find_diag_hits.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/taxonomy/find_diag_hits.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,170 @@
+#!/usr/bin/env python
+
+"""
+find_diag_hits.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file>
+    finds reads that only hit one taxonomic group. For example, consider the following:
+    
+    read1   mammalia
+    read1   insecta
+    read2   insecta
+    
+    in this case only read2 will be selected because it stays within insecta
+    
+    This program takes the following options:
+    
+    file in taxonomy format - dataset that complies with Galaxy's taxonomy format
+    id column               - integer specifying the number of column containing seq id (starting with 1)
+    taxonomic ranks         - a comma separated list of ranks from this list:
+    
+         superkingdom
+         kingdom
+         subkingdom
+         superphylum
+         phylum
+         subphylum
+         superclass
+         class
+         subclass
+         superorder
+         order
+         suborder
+         superfamily
+         family
+         subfamily
+         tribe
+         subtribe
+         genus
+         subgenus
+         species
+         subspecies
+    
+    output format           - reads or counts
+
+"""
+
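+# Usage (illustrative):
+#   python find_diag_hits.py input.taxonomy 1 genus,species counts output.tabular
+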
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( 'pysqlite' )
+from pysqlite2 import dbapi2 as sqlite
+import string, sys, tempfile
+
+# This dictionary maps taxonomic ranks to fields of Taxonomy file
+taxRank = {
+        'root'        :2, 
+        'superkingdom':3, 
+        'kingdom'     :4, 
+        'subkingdom'  :5, 
+        'superphylum' :6, 
+        'phylum'      :7, 
+        'subphylum'   :8, 
+        'superclass'  :9, 
+        'class'       :10, 
+        'subclass'    :11, 
+        'superorder'  :12, 
+        'ord'         :13, 
+        'suborder'    :14, 
+        'superfamily' :15,
+        'family'      :16,
+        'subfamily'   :17,
+        'tribe'       :18,
+        'subtribe'    :19,
+        'genus'       :20,
+        'subgenus'    :21,
+        'species'     :22,
+        'subspecies'  :23,
+        'order'       :13
+    }
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+db = tempfile.NamedTemporaryFile('w')
+
+try:
+    con = sqlite.connect(db.name)
+    cur = con.cursor()
+except:
+    stop_err('Cannot connect to %s\n' % db.name)
+    
+try:
+    tax_file   = open(sys.argv[1], 'r')
+    id_col     = int(sys.argv[2]) - 1
+    taxa       = string.split(sys.argv[3].rstrip(),',')
+    
+    if sys.argv[4] == 'reads':
+        out_format = True
+    elif sys.argv[4] == 'counts':
+        out_format = False
+    else:
+        stop_err('Please specify "reads" or "counts" for output format\n')
+    out_file = open(sys.argv[5], 'w')
+    
+except:
+    stop_err('Check arguments\n')
+    
+if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n')
+
+sql = ""
+for i in range(len(taxa)):
+        if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order'
+        sql += '%s text, ' % taxa[i]
+
+sql = sql.strip(', ')
+sql = 'create table tax (name varchar(50) not null, ' + sql + ')'
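+# e.g. for ranks "order,genus" this builds (illustrative):
+#   create table tax (name varchar(50) not null, ord text, genus text)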
+
+    
+cur.execute(sql)
+
+invalid_line_number = 0
+
+try:
+    for line in tax_file:
+        fields = string.split(line.rstrip(), '\t')
+        if len(fields) < 24: 
+            invalid_line_number += 1
+            continue # Skipping malformed taxonomy lines
+        
+        val_string = '"' + fields[id_col] + '", '
+        
+        for rank in taxa:
+            taxon = fields[taxRank[rank]]
+            val_string += '"%s", ' % taxon
+                
+        val_string = val_string.strip(', ')
+        val_string = "insert into tax values(" + val_string + ")"
+        cur.execute(val_string)
+except Exception, e:
+    stop_err('%s\n' % e)
+
+tax_file.close()    
+
+try:    
+    for rank in taxa:
+        cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank  )
+        cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) )
+        cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank)
+        cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) )
+        
+        if rank == 'ord':
+            rankName = 'order'
+        else:
+            rankName = rank
+    
+        if out_format:
+            cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank)
+            for item in cur.fetchall():
+                out_string = '%s\t%s\t' % ( item[0], item[1] )
+                out_string += rankName
+                print >>out_file, out_string
+        else:
+            cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank)
+            for item in cur.fetchall():
+                out_string = '%s\t%s\t' % ( item[0], item[1] )
+                out_string += rankName
+                print >>out_file, out_string
+except Exception, e:
+    stop_err("%s\n" % e)
+    
diff -r 000000000000 -r 9071e359b9a3 tools/taxonomy/find_diag_hits.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/taxonomy/find_diag_hits.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,99 @@
+<tool id="find_diag_hits" name="Find diagnostic hits" version="1.0.0">
+    <description></description>
+    <requirements>
+        <requirement type="package">taxonomy</requirement>
+    </requirements>
+    <command interpreter="python">find_diag_hits.py $input1 $id_col $rank_list $out_format $out_file1</command>
+    <inputs>
+        <param format="taxonomy" name="input1" type="data" label="Find diagnostic hits in"/>
+        <param name="id_col" type="data_column" data_ref="input1" numerical="False" label="Select column with sequence id" />
+        <param name="rank_list" type="select" display="checkboxes" multiple="true" label="select taxonomic ranks">
+            <option value="superkingdom">Superkingdom</option>
+            <option value="kingdom">Kingdom</option>
+            <option value="subkingdom">Subkingdom</option>
+            <option value="superphylum">Superphylum</option>
+            <option value="phylum">Phylum</option>
+            <option value="subphylum">Subphylum</option>
+            <option value="superclass">Superclass</option>
+            <option value="class">Class</option>
+            <option value="subclass">Subclass</option>
+            <option value="superorder">Superorder</option>
+            <option value="order">Order</option>
+            <option value="suborder">Suborder</option>
+            <option value="superfamily">Superfamily</option>
+            <option value="family">Family</option>
+            <option value="subfamily">Subfamily</option>
+            <option value="tribe">Tribe</option>
+            <option value="subtribe">Subtribe</option>
+            <option value="genus">Genus</option>
+            <option value="subgenus">Subgenus</option>
+            <option selected="true" value="species">Species</option>
+            <option value="subspecies">Subspecies</option>
+        </param>
+        <param name="out_format" type="select" label="Select output format">
+            <option value="reads">Diagnostic read list</option>
+            <option value="counts">Number of diagnostic reads per taxonomic rank</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="out_file1" />
+    </outputs>
+      <tests>
+    <test>
+      <param name="input1" value="taxonomyGI.taxonomy" ftype="taxonomy"/>
+      <param name="id_col" value="1" />
+      <param name="rank_list" value="order,genus" />
+      <param name="out_format" value="counts" />
+      <output name="out_file1" file="find_diag_hits.tabular" />
+    </test> 
+  </tests>
+
+    
+<help>
+
+**What it does**
+
+When performing metagenomic analyses it is often necessary to identify sequence reads corresponding to a particular taxonomic group, or, in other words, diagnostic of a particular taxonomic rank. This utility performs this analysis. It takes data generated by *Taxonomy manipulation->Fetch Taxonomic Ranks* as input and outputs either a list of sequence reads unique to a particular taxonomic rank, or a list of taxonomic ranks and the count of unique reads corresponding to each rank. 
+
+------
+
+**Example**
+
+Suppose the *Taxonomy manipulation->Fetch Taxonomic Ranks* generated the following taxonomy representation::
+
+    read1 2      root Eukaryota Metazoa n n Chordata   Craniata Gnathostomata Mammalia n        Laurasiatheria   n           Ruminantia  n             Bovidae     Bovinae      n          n          Bos        n Bos taurus        n
+    read2 12585  root Eukaryota Metazoa n n Chordata   Craniata Gnathostomata Mammalia n        Euarchontoglires Primates  Haplorrhini Hominoidea    Hominidae   n            n          n          Homo       n Homo sapiens      n 
+    read1 58615  root Eukaryota Metazoa n n Arthropoda n        Hexapoda      Insecta  Neoptera Amphiesmenoptera Lepidoptera Glossata    Papilionoidea Nymphalidae Nymphalinae  Melitaeini Phyciodina Anthanassa n Anthanassa otanes n 
+    read3 56785  root Eukaryota Metazoa n n Chordata   Craniata Gnathostomata Mammalia n        Euarchontoglires Primates  Haplorrhini Hominoidea    Hominidae   n            n          n          Homo       n Homo sapiens      n   
+
+Running this tool with the following parameters:
+
+  * *Select column with sequence id* set to **c1**
+  * *Select taxonomic ranks* with **order**, and **genus** checked
+  * *Output format* set to **Diagnostic read list**
+  
+will return::
+
+    read2 Primates order
+    read3 Primates order
+    read2 Homo     genus
+    read3 Homo     genus
+    
+Changing *Output format* set to **Number of diagnostic reads per taxonomic rank** will produce::
+
+    Primates 2       order
+    Homo     2       genus
+    
+.. class:: infomark
+
+Note that **read1** is omitted because it is non-unique: it hits Mammals and Insects at the same time.    
+
+--------
+
+.. class:: warningmark
+
+This tool omits "**n**" corresponding to ranks missing from NCBI taxonomy. In the above example *Home sapiens* contains the order name (Primates) while *Bos taurus* does not.
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/taxonomy/gi2taxonomy.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/taxonomy/gi2taxonomy.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,174 @@
+import sys
+import string
+import tempfile
+import subprocess
+from os import path
+
+# -----------------------------------------------------------------------------------
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+# -----------------------------------------------------------------------------------
+def gi_name_to_sorted_list(file_name, gi_col, name_col):
+    """ Suppose input file looks like this:
+        a       2
+        b       4
+        c       5
+        d       5
+        where column 1 is gi_col and column 0 is name_col
+        output of this function will look like this:
+        [[2, 'a'], [4, 'b'], [5, 'c'], [5, 'd']]
+    """
+
+    result = []
+    try:
+        F = open( file_name, 'r' )
+        for line in F:
+            file_cols = string.split(line.rstrip(), '\t')
+            try:
+                file_cols[gi_col] = int( file_cols[gi_col] )
+                result.append( [ file_cols[gi_col], file_cols[name_col] ] )
+            except ValueError:
+                # Skip only the offending line instead of abandoning the file
+                print >>sys.stderr, 'Non numeric GI field...skipping'
+    except Exception, e:
+        stop_err('%s\n' % e)
+    F.close()
+    result.sort()
+    return result   
+
+# -----------------------------------------------------------------------------------
+
+def collapse_repeating_gis( L ):
+    """ Accepts 2-d array of gi-key pairs such as this
+        L = [
+                [gi1, 'key1'],
+                [gi1, 'key2'],
+                [gi2','key3']
+            ]
+
+         Returns this:
+         [      [gi1, 'key1', 'key2'],
+                [gi2, 'key3' ]
+         ]
+         
+         The first value in each sublist MUST be int
+    """
+    gi = []
+    i = 0
+    result = []
+    
+    try:
+        for item in L:
+            if i == 0:
+                prev = item[0]
+            
+            if prev != item[0]:
+                prev_L = []
+                prev_L.append( prev )
+                result.append( prev_L + gi )
+                prev = item[0]
+                gi =[]
+                
+            gi.append( item[1] )
+            i += 1
+            
+    except Exception, e:
+        stop_err('%s\n' % e)
+        
+    prev_L = []
+    prev_L.append( prev )
+    result.append( prev_L + gi )
+    del(L)
+    return result
+
+# -----------------------------------------------------------------------------------
+
+def get_taxId( gi2tax_file, gi_name_list, out_file ):
+    """ Maps GI numbers from gi_name_list to TaxId identifiers from gi2tax_file and
+        prints result to out_file
+
+        gi2tax_file MUST be sorted on GI column
+
+        gi_name_list is a list that looks like this:
+        [[1,'a'], [2,'b','x'], [7,'c'], [10,'d'], [90,'f']]
+        where the first element of each sublist is a GI number
+        this list MUST also be sorted on GI
+
+        This function searches through 117,000,000 rows of gi2taxId file from NCBI
+        in approximately 4 minutes. This time is not dependent on the length of
+        gi_name_list
+    """
+
+    L = gi_name_list.pop(0)
+    my_gi = L[0]
+    F = open( out_file, 'w' )
+    gi = 0
+    for line in file( gi2tax_file ):
+        line = line.rstrip()
+        gi, taxId = string.split( line, '\t' )
+        gi = int( gi )
+        
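+        # Merge-join of two GI-sorted streams: advance through the gi2tax
+        # file and through gi_name_list in lockstep, never backtracking.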
+        if gi > my_gi:
+            try:
+                while ( my_gi < gi ):
+                    L = gi_name_list.pop(0)
+                    my_gi = L[0]
+            except:
+                break
+    
+        if  gi == my_gi:
+            for i in range( 1,len( L ) ):
+                print >>F, '%s\t%s\t%d' % (L[i], taxId, gi)
+            try:
+                L = gi_name_list.pop(0)
+                my_gi = L[0]
+            except:
+                break
+
+# -----------------------------------------------------------------------------------
+
+
+try:
+    in_f          = sys.argv[1]            # input file with GIs
+    gi_col        = int( sys.argv[2] ) - 1 # column in input containing GIs
+    name_col      = int( sys.argv[3] ) - 1 # column containing sequence names
+    out_f         = sys.argv[4]            # output file
+    tool_data     = sys.argv[5]
+except:
+    stop_err('Check arguments\n')
+
+#  GI2TAX points to a file produced by concatenating:
+#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.zip
+#  and
+#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.zip
+#  and then sorting the result with this command:
+#  sort -n -k 1
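+#  For example (illustrative; assumes the two dumps were unzipped first):
+#  cat gi_taxid_nucl.dmp gi_taxid_prot.dmp | sort -n -k 1,1 > gi_taxid_sorted.txt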
+
+GI2TAX = path.join( tool_data, 'taxonomy', 'gi_taxid_sorted.txt' )
+
+#  NAME_FILE and NODE_FILE point to the names.dmp and nodes.dmp
+#  files contained within:
+#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
+
+NAME_FILE = path.join( tool_data, 'taxonomy', 'names.dmp' )
+NODE_FILE = path.join( tool_data, 'taxonomy', 'nodes.dmp' )
+
+g2n =  gi_name_to_sorted_list(in_f, gi_col, name_col)
+
+if len(g2n) == 0:
+    stop_err('No valid GI-containing fields. Please, check your column assignments.\n')
+
+tb_F = tempfile.NamedTemporaryFile('w')
+
+get_taxId( GI2TAX, collapse_repeating_gis( g2n ), tb_F.name )
+
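+# taxBuilder (the compiled helper declared as a binary requirement in
+# gi2taxonomy.xml) joins the name/taxId/GI triplets in tb_F against
+# names.dmp and nodes.dmp to produce the final taxonomy output.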
+try:
+    tb_cmd = 'taxBuilder %s %s %s %s' % ( NAME_FILE, NODE_FILE, tb_F.name, out_f )
+    retcode = subprocess.call( tb_cmd, shell=True )
+    if retcode < 0:
+        print >>sys.stderr, "Execution of taxBuilder terminated by signal", -retcode
+except OSError, e:
+    print >>sys.stderr, "Execution of taxBuilder failed:", e
diff -r 000000000000 -r 9071e359b9a3 tools/taxonomy/gi2taxonomy.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/taxonomy/gi2taxonomy.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,102 @@
+<tool id="Fetch Taxonomic Ranks" name="Fetch taxonomic representation" version="1.1.0">
+  <description></description>
+    <requirements>
+        <requirement type="package">taxonomy</requirement>
+    </requirements>
+  <command interpreter="python">gi2taxonomy.py $input $giField $idField $out_file1 ${GALAXY_DATA_INDEX_DIR}</command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Show taxonomic representation for"></param>
+    <param name="giField" label="GIs column" type="data_column" data_ref="input" numerical="True" help="select column containing GI numbers"/>
+    <param name="idField" label="Name column" type="data_column" data_ref="input" help="select column containing identifiers you want to include into output"/>
+  </inputs>
+  <outputs>
+    <data format="taxonomy" name="out_file1" />
+  </outputs>
+  <requirements>
+    <requirement type="binary">taxBuilder</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input" ftype="tabular" value="taxonomy2gi-input.tabular"/>
+      <param name="giField" value="1"/>
+      <param name="idField" value="2"/>
+      <output name="out_file1" file="taxonomy2gi-output.tabular"/>
+    </test>
+  </tests>
+
+  <help>
+
+.. class:: infomark
+
+Use *Filter and Sort->Filter* to restrict output of this tool to desired taxonomic ranks. You can also use *Text Manipulation->Cut* to remove unwanted columns from the output.
+
+------
+
+**What it does**
+
+Fetches taxonomic information for a list of GI numbers (sequence identifiers used by the National Center for Biotechnology Information http://www.ncbi.nlm.nih.gov).
+
+-------
+
+**Example**
+
+Suppose you have BLAST output that looks like this::
+  
+   +-----------------------+----------+----------+-----------------+------------+------+--------+
+   | queryId               | targetGI | identity | alignmentLength | mismatches | gaps | score  |
+   +-----------------------+----------+----------+-----------------+------------+------+--------+
+   | 1L_EYKX4VC01BXWX1_265 |  1430919 |    90.09 |             212 |         15 |    6 | 252.00 | 
+   +-----------------------+----------+----------+-----------------+------------+------+--------+
+
+and you want to obtain full taxonomic representation for GIs listed in *targetGI* column. If you set parameters as shown here:
+
+.. image:: ./static/images/fetchTax.png
+
+
+the tool will generate the following output (you may need to scroll sideways to see the entire line)::
+
+  1                     2    3    4         5       6 7 8        9        10            11       12 13               14       15         16          17        18  19  20 21  22  23           24 25
+  1L_EYKX4VC01BXWX1_265 9606 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n  Euarchontoglires Primates Haplorrhini Hominoidea Hominidae n   n   n  Homo n  Homo sapiens n  1430919
+
+In other words, the tool printed the *Name column* and the *TaxId*, appended 22 columns containing taxonomic ranks from root to subspecies, and added the *GI* as the last column. Below is a formal definition of the output columns::
+
+    Column Definition
+   ------- ------------------------------------------
+         1 Name  (specified by 'Name column' dropdown)
+         2 TaxId (the NCBI taxonomy identifier)
+         3 root
+         4 superkingdom
+         5 kingdom
+         6 subkingdom
+         7 superphylum
+         8 phylum
+         9 subphylum
+        10 superclass
+        11 class
+        12 subclass
+        13 superorder
+        14 order
+        15 suborder
+        16 superfamily
+        17 family
+        18 subfamily
+        19 tribe
+        20 subtribe
+        21 genus
+        22 subgenus
+        23 species
+        24 subspecies
+        25 GI    (specified by 'GIs column' dropdown)
+
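+.. class:: infomark
+
+If you need to work with this format outside of Galaxy, the fixed 25-column layout is straightforward to parse. Below is a minimal Python sketch (*my_output.taxonomy* is a hypothetical file name)::
+
+    RANKS = ['root', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum',
+             'phylum', 'subphylum', 'superclass', 'class', 'subclass',
+             'superorder', 'order', 'suborder', 'superfamily', 'family',
+             'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus',
+             'species', 'subspecies']
+
+    for line in open('my_output.taxonomy'):
+        fields = line.rstrip('\n').split('\t')
+        name, tax_id, gi = fields[0], fields[1], fields[24]
+        lineage = dict(zip(RANKS, fields[2:24]))
+        print name, lineage['order'], lineage['genus']   # 'n' if rank unassigned
+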
+------
+
+.. class:: warningmark
+
+**Why do I have these "n" things?** 
+
+Be aware that the NCBI taxonomy (ftp://ftp.ncbi.nih.gov/pub/taxonomy/) this tool relies upon is incomplete.  This means that for many species one or more ranks are absent and represented as "**n**". In the above example *subkingdom*, *superphylum* etc. are missing.
+
+
+</help>
+</tool>
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/taxonomy/lca.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/taxonomy/lca.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,193 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+Least Common Ancestor tool.
+"""
+import sys, commands, tempfile
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def main():
+    try:
+        inputfile = sys.argv[1]
+        outfile = sys.argv[2]
+        rank_bound = int( sys.argv[3] )
+        """
+        Mapping of ranks:
+        root        :2, 
+        superkingdom:3, 
+        kingdom     :4, 
+        subkingdom  :5, 
+        superphylum :6, 
+        phylum      :7, 
+        subphylum   :8, 
+        superclass  :9, 
+        class       :10, 
+        subclass    :11, 
+        superorder  :12, 
+        order       :13, 
+        suborder    :14, 
+        superfamily :15,
+        family      :16,
+        subfamily   :17,
+        tribe       :18,
+        subtribe    :19,
+        genus       :20,
+        subgenus    :21,
+        species     :22,
+        subspecies  :23,
+        """
+    except:
+        stop_err("Syntax error: Use correct syntax: program infile outfile")
+    
+    fin = open(inputfile,'r')
+    for j, line in enumerate( fin ):
+        elems = line.strip().split('\t')
+        if len(elems) < 24:
+            stop_err("The format of the input dataset is incorrect. Taxonomy datatype should contain at least 24 columns.")
+        if j > 30:
+            break
+        cols = range(1,len(elems))
+    fin.close()
+       
+    group_col = 0
+    tmpfile = tempfile.NamedTemporaryFile()
+
+    try:
+        """
+        The -k option for the Posix sort command is as follows:
+        -k, --key=POS1[,POS2]
+        start a key at POS1, end it at POS2 (origin 1)
+        In other words, column positions start at 1 rather than 0, so 
+        we need to add 1 to group_col.
+        if POS2 is not specified, the newer versions of sort will consider the entire line for sorting. To prevent this, we set POS2=POS1.
+        """
+        command_line = "sort -f -k " + str(group_col+1) +"," + str(group_col+1) + " -o " + tmpfile.name + " " + inputfile
+    except Exception, exc:
+        stop_err( 'Initialization error -> %s' %str(exc) )
+        
+    error_code, stdout = commands.getstatusoutput(command_line)
+    
+    if error_code != 0:
+        stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))    
+
+    prev_item = ""
+    prev_vals = []
+    remaining_vals = []
+    skipped_lines = 0
+    fout = open(outfile, "w")
+    block_valid = False
+    
+    
+    for ii, line in enumerate( file( tmpfile.name )):
+        if line and not line.startswith( '#' ) and len(line.split('\t')) >= 24: #Taxonomy datatype should have at least 24 columns
+            line = line.rstrip( '\r\n' )
+            try:
+                fields = line.split("\t")
+                item = fields[group_col]
+                if prev_item != "":
+                    # At this level, we're grouping on values (item and prev_item) in group_col
+                    if item == prev_item:
+                        # Keep iterating and storing values until a new value is encountered.
+                        if block_valid:
+                            for i, col in enumerate(cols):
+                                if col >= 3:
+                                    prev_vals[i].append(fields[col].strip())
+                                    if len(set(prev_vals[i])) > 1:
+                                        block_valid = False
+                                        break
+                            
+                    else:   
+                        """
+                        When a new value is encountered, write the previous value and the 
+                        corresponding aggregate values into the output file.  This works 
+                        due to the sort on group_col we've applied to the data above.
+                        """
+                        out_list = ['']*24
+                        out_list[0] = str(prev_item)
+                        out_list[1] = str(prev_vals[0][0])
+                        out_list[2] = str(prev_vals[1][0])
+                        
+                        for k, col in enumerate(cols):
+                            if col >= 3 and col < 24:
+                                if len(set(prev_vals[k])) == 1:
+                                    out_list[col] = prev_vals[k][0]
+                                else:
+                                    break
+                        while k < 23:
+                            out_list[k+1] = 'n' 
+                            k += 1
+                        
+                        j = 0
+                        while True:
+                            try:
+                                out_list.append(str(prev_vals[23+j][0]))
+                                j += 1
+                            except:
+                                break
+                            
+                        if rank_bound == 0:     
+                            print >>fout, '\t'.join(out_list).strip()
+                        else:
+                            if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
+                                print >>fout, '\t'.join(out_list).strip()
+                        
+                        block_valid = True
+                        prev_item = item   
+                        prev_vals = [] 
+                        for col in cols:
+                            val_list = []
+                            val_list.append(fields[col].strip())
+                            prev_vals.append(val_list)
+                        
+                else:
+                    # This only occurs once, right at the start of the iteration.
+                    block_valid = True
+                    prev_item = item    #groupby item
+                    for col in cols:    #everything else
+                        val_list = []
+                        val_list.append(fields[col].strip())
+                        prev_vals.append(val_list)
+            
+            except:
+                skipped_lines += 1
+        else:
+            skipped_lines += 1
+            
+    # Handle the last grouped value
+    out_list = ['']*24
+    out_list[0] = str(prev_item)
+    out_list[1] = str(prev_vals[0][0])
+    out_list[2] = str(prev_vals[1][0])
+    
+    for k, col in enumerate(cols):
+        if col >= 3 and col < 24:
+            if len(set(prev_vals[k])) == 1:
+                out_list[col] = prev_vals[k][0]
+            else:
+                break
+    while k < 23:
+        out_list[k+1] = 'n' 
+        k += 1
+    
+    j = 0
+    while True:
+        try:
+            out_list.append(str(prev_vals[23+j][0]))
+            j += 1
+        except:
+            break
+        
+    if rank_bound == 0:     
+        print >>fout, '\t'.join(out_list).strip()
+    else:
+        if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
+            print >>fout, '\t'.join(out_list).strip()
+        
+    if skipped_lines > 0:
+        print "Skipped %d invalid lines." % ( skipped_lines )
+    
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/taxonomy/lca.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/taxonomy/lca.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,100 @@
+<tool id="lca1" name="Find lowest diagnostic rank" version="1.0.1">
+  <description></description>
+    <requirements>
+        <requirement type="package">taxonomy</requirement>
+    </requirements>
+  <command interpreter="python">
+    lca.py $input1 $out_file1 $rank_bound
+  </command>
+  <inputs>
+    <param format="taxonomy" name="input1" type="data" label="for taxonomy dataset"/>
+    <param name="rank_bound" label="require the lowest rank to be at least" type="select">
+        <option value="0">No restriction</option>
+        <option value="3">Superkingdom</option>
+        <option value="4">Kingdom</option>
+        <option value="5">Subkingdom</option>
+        <option value="6">Superphylum</option>
+        <option value="7">Phylum</option>
+        <option value="8">Subphylum</option>
+        <option value="9">Superclass</option>
+        <option value="10">Class</option>
+        <option value="11">Subclass</option>
+        <option value="12">Superorder</option>
+        <option value="13">Order</option>
+        <option value="14">Suborder</option>
+        <option value="15">Superfamily</option>
+        <option value="16">Family</option>
+        <option value="17">Subfamily</option>
+        <option value="18">Tribe</option>
+        <option value="19">Subtribe</option>
+        <option value="20">Genus</option>
+        <option value="21">Subgenus</option>
+        <option value="22">Species</option>
+        <option value="23">Subspecies</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="taxonomy" name="out_file1" metadata_source="input1" />
+  </outputs>
+  <tests>
+     <test>
+          <param name="input1" value="lca_input.taxonomy" ftype="taxonomy"/>
+          <param name="rank_bound" value="0" />
+          <output name="out_file1" file="lca_output.taxonomy" ftype="taxonomy"/>
+     </test> 
+     <test>
+          <param name="input1" value="lca_input2.taxonomy" ftype="taxonomy"/>
+          <param name="rank_bound" value="7" />
+          <output name="out_file1" file="lca_output2.taxonomy" ftype="taxonomy"/>
+     </test> 
+     
+     <!--Test case with invalid lines -->
+     <test>
+          <param name="input1" value="lca_input3.taxonomy" ftype="taxonomy"/>
+          <param name="rank_bound" value="10" />
+          <output name="out_file1" file="lca_output3.taxonomy" ftype="taxonomy"/>
+     </test> 
+ </tests>
+
+ <help>
+
+**What it does**
+
+This tool identifies the lowest taxonomic rank at which a metagenomic sequencing read is diagnostic. It takes datasets produced by the *Fetch Taxonomic Ranks* tool (i.e., the Taxonomy format) as input.
+
+-------
+
+**Example**
+
+Suppose you have two reads, **read_1** and **read_2**, with the following taxonomic profiles (scroll sideways to see the entire dataset)::
+  
+    read_1 1 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum1 subphylum1 superclass1 class1 subclass1 superorder1 order1 suborder1 superfamily1 family1 subfamily1 tribe1 subtribe1 genus1 subgenus1 species1 subspecies1
+    read_1 2 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum1 subphylum1 superclass1 class1 subclass1 superorder1 order1 suborder1 superfamily1 family1 subfamily1 tribe1 subtribe1 genus2 subgenus2 species2 subspecies2
+    read_2 3 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum3 subphylum3 superclass3 class3 subclass3 superorder3 order3 suborder3 superfamily3 family3 subfamily3 tribe3 subtribe3 genus3 subgenus3 species3 subspecies3
+    read_2 4 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum4 subphylum4 superclass4 class4 subclass4 superorder4 order4 suborder4 superfamily4 family4 subfamily4 tribe4 subtribe4 genus4 subgenus4 species4 subspecies4
+
+For **read_1** the taxonomic labels are consistent until the genus level, where the taxonomy splits into two branches, one ending with *subspecies1* and the other with *subspecies2*. This implies **that the lowest taxonomic rank read_1 can identify is SUBTRIBE**.  Similarly, read_2 is diagnostic only up to the **superphylum** level.  As a result, the output of this tool will be::
+
+    read_1 2 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum1 subphylum1 superclass1 class1 subclass1 superorder1 order1 suborder1 superfamily1 family1 subfamily1 tribe1 subtribe1 n n n n
+    read_2 3 root superkingdom1 kingdom1 subkingdom1 superphylum1 n       n          n           n      n         n           n      n         n            n       n          n      n         n n n n
+    
+where **n** means *EMPTY*.
+
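+The reduction applied to each read can be sketched in a few lines of Python (an illustration of the idea, not the exact *lca.py* implementation)::
+
+    def lowest_diagnostic_profile(assignments):
+        # assignments: one list of 22 rank names (root..subspecies) per hit
+        consensus = []
+        for ranks in zip(*assignments):      # walk rank columns left to right
+            if len(set(ranks)) == 1:         # all hits still agree on this rank
+                consensus.append(ranks[0])
+            else:
+                break                        # first disagreement: stop here
+        # pad the remaining ranks with 'n' (= EMPTY)
+        return consensus + ['n'] * (len(assignments[0]) - len(consensus))
+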
+--------
+
+**What's up with the drop down?**
+
+Why do we need the *require the lowest rank to be at least* dropdown?  Let's look at the above example again. Suppose you need to find only those reads that are diagnostic at least at the phylum level. To do this, set *require the lowest rank to be at least* to **phylum**. As a result, your output will look like this::
+
+    read_1 2 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum1 subphylum1 superclass1 class1 subclass1 superorder1 order1 suborder1 superfamily1 family1 subfamily1 tribe1 subtribe1 n n n n
+
+.. class:: infomark
+    
+Note that **read_2** is now omitted: it matches two phyla (**phylum3** and **phylum4**) and is therefore not diagnostic (but rather cosmopolitan) at the *phylum* level.
+
+
+
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/taxonomy/poisson2test.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/taxonomy/poisson2test.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,124 @@
+#!/usr/local/bin/python
+
+import sys
+from math import *
+from rpy import *
+
+
+if ((len(sys.argv)-1) != 6):
+    print 'expected exactly 6 parameters'
+    print 'usage: inputfile, col1, col2, d-value(not 0), p-val correction method(0 or 1), outputfile'
+    sys.exit()
+    
+try:
+    lines_arr = open(sys.argv[1]).readlines()
+except IOError:
+    print 'cannot open', sys.argv[1]
+    sys.exit()  
+
+try:
+    i = int(sys.argv[2]) #first column to compare
+    j = int(sys.argv[3]) #second column to compare
+    d = float(sys.argv[4]) #correction factor
+    k = int(sys.argv[5]) #p-val correction method
+    outfile = open(sys.argv[6],'w') # output data
+    
+    if (i>j):
+        print 'column order not correct: col1 must be less than col2'
+        print 'usage: inputfile, col1, col2, d-value, p-val correction method'
+        sys.exit()      
+        
+    try:
+        a = 1 / d
+        assert k in [0,1]
+    except ZeroDivisionError:
+        print 'd cannot be 0'
+        print 'usage: inputfile, col1, col2, d-value, p-val correction method'
+        sys.exit()
+    except:
+        print ' p-val correction should be 0 or 1 (0 = "bonferroni", 1 = "fdr")'
+        print 'usage: inputfile, col1, col2, d-value, p-val correction method'
+        sys.exit()
+except ValueError:
+    print 'parameters are not integers'
+    print 'usage: inputfile, col1, col2, d-value, p-val correction method'
+    sys.exit()
+   
+
+fsize = len(lines_arr)
+
+z1 = []
+z2 = []
+pz1 = []
+pz2 = []
+field = []
+
+if d<1: # Z score calculation
+    for line in lines_arr:
+        line = line.strip()
+        field = line.split('\t')
+        
+        x = int(field[j-1]) #input column 2
+        y = int(field[i-1]) #input column 1
+        # 3.0/8 below: plain 3/8 would be truncated to 0 by integer division
+        if y>x:
+            z1.append(float((y - ((1/d)*x))/sqrt((1/d)*(x + y))))
+            z2.append(float((2*(sqrt(y+(3.0/8))-sqrt((1/d)*(x+(3.0/8)))))/sqrt(1+(1/d))))
+        else:
+            tmp_var1 = x
+            x = y
+            y = tmp_var1
+            z1.append(float((y - (d*x))/sqrt(d*(x + y))))
+            z2.append(float((2*(sqrt(y+(3.0/8))-sqrt(d*(x+(3.0/8)))))/sqrt(1+d)))
+            
+else: # d >= 1 Z score calculation
+    for line in lines_arr:
+        line = line.strip()
+        field = line.split('\t')
+        x = int(field[i-1]) #input column 1
+        y = int(field[j-1]) #input column 2
+        
+        if y>x:
+            z1.append(float((y - (d*x))/sqrt(d*(x + y))))
+            z2.append(float((2*(sqrt(y+(3.0/8))-sqrt(d*(x+(3.0/8)))))/sqrt(1+d)))
+        else:
+            tmp_var2 = x
+            x = y
+            y = tmp_var2
+            z1.append(float((y - ((1/d)*x))/sqrt((1/d)*(x + y))))
+            z2.append(float((2*(sqrt(y+(3.0/8))-sqrt((1/d)*(x+(3.0/8)))))/sqrt(1+(1/d))))
+        
+  
+   
+
+
+# P-value calculation for z1 and z2
+for p in z1:
+    
+    pz1.append(float(r.pnorm(-abs(float(p)))))
+
+for q in z2:
+    
+    pz2.append(float(r.pnorm(-abs(float(q)))))    
+
+# P-value correction for pz1 and pz2
+
+if k == 0:
+    corrz1 = r.p_adjust(pz1,"bonferroni",fsize)
+    corrz2 = r.p_adjust(pz2,"bonferroni",fsize)
+  
+   
+else:
+  
+    corrz1 = r.p_adjust(pz1,"fdr",fsize)
+    corrz2 = r.p_adjust(pz2,"fdr",fsize)
+    
+
+#printing all columns
+for n in range(fsize):
+    print >> outfile, "%s\t%4.3f\t%4.3f\t%8.6f\t%8.6f\t%8.6f\t%8.6f" %(lines_arr[n].strip(),z1[n],z2[n],pz1[n],pz2[n],corrz1[n],corrz2[n])
+
+
+      
+      
+      
+          
diff -r 000000000000 -r 9071e359b9a3 tools/taxonomy/poisson2test.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/taxonomy/poisson2test.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,113 @@
+<tool id="poisson2test" name="Poisson two-sample test" version="1.0.0">
+  <description></description>
+    <requirements>
+        <requirement type="package">taxonomy</requirement>
+    </requirements>
+  <command interpreter="python">poisson2test.py $input1 $input2 $input3 $input4 $input5 $output1 2>/dev/null </command>
+  <inputs>
+    <param name="input1" format="tabular" type="data" label="Input File"/>
+    <param name="input2" type="integer" size="5" value="2" label="First Column"/>
+    <param name="input3" type="integer" size="5" value="3" label="Second Column"/>
+    <param name="input4" type="float" size="5" value="1" label="D value"/>
+    <param name="input5" type="select" label="correction method">
+        <option value="0">Bonferroni</option>
+        <option value="1">FDR</option>
+    </param> 
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output1" />
+  </outputs> 
+  <tests>
+    <test>
+        <param name="input1" value="poisson2test1.tabular" ftype="tabular"/>
+        <param name="input2" value="2" />
+        <param name="input3" value="3" />
+        <param name="input4" value="0.44" />
+        <param name="input5" value="0" />
+        <output name="output1" file="poisson2test1_out.tabular" />    
+    </test>
+    <test>
+        <param name="input1" value="poisson2test2.tabular" ftype="tabular"/>
+        <param name="input2" value="2" />
+        <param name="input3" value="3" />
+        <param name="input4" value="0.44" />
+        <param name="input5" value="0" />
+        <output name="output1" file="poisson2test2_out.tabular" />    
+    </test>    
+  </tests>
+  <help>
+
+**What it does**
+
+Suppose you have metagenomic samples from two different locations and have classified the reads unique to various taxa. Now you want to test whether the number of reads that fall within a particular taxon in location 1 differs from the number that fall within the same taxon in location 2.
+This utility performs that analysis. It assumes that the data come from a Poisson process and calculates two Z scores (Z1 and Z2), based on the work of Shiue and Bain, 1982 (Z1) and Huffman, 1984 (Z2).
+
+-----
+
+**Z score formula**
+
+Equation 1:
+
+.. image:: ./static/images/poisson2test_eqn1.png 
+
+
+Equation 2:
+
+.. image:: ./static/images/poisson2test_eqn2.png
+
+
+X = number of reads falling in a particular taxon in location 1
+
+Y = number of reads falling in the same taxon in location 2
+
+d = correction factor that accounts for biases in sample collection, DNA concentration, read numbers etc. between the two locations. 
+
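+In terms of X, Y and d above, the two statistics as implemented in *poisson2test.py* (shown for the case where Y is larger than X; the roles of X and Y are swapped otherwise) are::
+
+    Z1 = (Y - d*X) / sqrt(d*(X + Y))
+    Z2 = 2*(sqrt(Y + 3/8) - sqrt(d*(X + 3/8))) / sqrt(1 + d)
+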
+In addition, this utility provides the corresponding p-values and corrected p-values (using Bonferroni or False Discovery Rate (FDR) corrections). It takes as input a tab-delimited file of three or more columns (taxon/category, read counts in location 1, read counts in location 2), the columns to compare, the d value, and a correction method: 0 (Bonferroni) or 1 (FDR).
+
+-----
+
+**Example**
+
+- Input File: phylum, read count in location-1, read count in location-2::
+
+    Annelida            36     2
+    Apicomplexa         17     8
+    Arthropoda        1964   928
+    Ascomycota         436    49
+    Basidiomycota       77    55
+
+- Arguments to be supplied by the user::
+
+    col_i   col_j   d-value    correction-method
+    
+    2       3       0.44       Bonferroni
+
+- Output File: phylum, readcount1, readcount2, z1, z2, p1, p2, corrected p1, corrected p2::
+
+    Annelida            36     2   3.385   4.276  0.000356  0.000010  0.00463  0.00012
+    Apicomplexa         17     8  -0.157  -0.156  0.437707  0.438103  1.00000  1.00000
+    Arthropoda        1964   928  -1.790  -1.777  0.036755  0.037744  0.47782  0.49067
+    Ascomycota         436    49   9.778  11.418  0.000000  0.000000  0.00000  0.00000
+    Basidiomycota       77    55  -2.771  -2.659  0.002792  0.003916  0.03629  0.05091
+
+-----
+
+**Note**
+
+- Input file should be Tab delimited
+- i &lt; j
+- d cannot be 0
+- k = Bonferroni or FDR
+
+-----
+
+**References**
+
+- Shiue, W. and Bain, L. (1982). Experiment Size and Power Comparisons for Two-Sample Poisson Tests. Applied Statistics 31, 130-134.
+
+- Huffman, M. D. (1984). An Improved Approximate Two-Sample Poisson Test. Applied Statistics 33, 224-226.
+
+  </help>
+</tool>
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/taxonomy/t2ps_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/taxonomy/t2ps_wrapper.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,67 @@
+"""
+Wrapper for tree2PS-fast
+Requires ps2pdf (a part of ghostscript package) to be installed
+
+t2ps_wrapper.py <taxonomy file> <output PDF file> <max_tree_level> <font_size> <max_leaves> <count_duplicate_tax_id>
+
+    taxonomy file    - taxonomy dataset that is first converted to a Newick tree by the taxonomy2tree program written by Sergei Kosakovsky Pond
+    output PDF file  - tree image
+    max_tree_level   - integer from 0 to 21; 0 = show all levels
+    font_size        - integer from 2 to 255 (8 is the best)
+    max_leaves       - integer from 0 to infinity (0 = show all)
+    count_duplicate  - 0 (do not count) or 1 (count)
+    
+anton nekrutenko | anton@bx.psu.edu
+tree2PS-fast is written by Sergei Kosakovsky Pond | sergeilkp@mac.com
+"""
+
+import sys, tempfile, subprocess
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+try:
+    tree_file = sys.argv[1]
+    pdf_file  = sys.argv[2]
+    max_tree_level = sys.argv[3]
+    font_size = sys.argv[4]
+    max_leaves = sys.argv[5]
+    dups = sys.argv[6]
+except:
+    stop_err('Check arguments\n')
+
+newick_file = tempfile.NamedTemporaryFile('w')    
+ps_file = tempfile.NamedTemporaryFile('w')
+
+# Execute taxonomy2tree
+    
+try:
+    t2t_cmd = 'taxonomy2tree %s %s %s /dev/null 1 > /dev/null 2>&1' % ( tree_file, max_tree_level, newick_file.name )
+    retcode = subprocess.call( t2t_cmd, shell=True )
+    if retcode < 0:
+        print >>sys.stderr, "Execution of taxonomy2tree terminated by signal", -retcode
+except OSError, e:
+    print >>sys.stderr, "Execution of taxonomy2tree failed:", e
+
+
+# Execute tree2PS-fast
+    
+try:
+    t2ps_cmd = 'tree2PS-fast %s %s %s %s %s %s' % ( newick_file.name, ps_file.name, max_tree_level, font_size, max_leaves, dups )
+    retcode = subprocess.call( t2ps_cmd, shell=True )
+    if retcode < 0:
+        print >>sys.stderr, "Execution of tree2PS-fast terminated by signal", -retcode
+except OSError, e:
+    print >>sys.stderr, "Execution of tree2PS-fast failed:", e
+    
+# Convert PS to PDF
+
+try:
+    ps2pdf_cmd = 'ps2pdf %s %s' % ( ps_file.name, pdf_file )
+    retcode = subprocess.call( ps2pdf_cmd, shell=True )
+    if retcode < 0:
+        print >>sys.stderr, "Execution of ps2pdf terminated by signal", -retcode
+except OSError, e:
+    print >>sys.stderr, "Execution of ps2pdf failed:", e
diff -r 000000000000 -r 9071e359b9a3 tools/taxonomy/t2ps_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/taxonomy/t2ps_wrapper.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,129 @@
+<tool id="Draw_phylogram" name="Draw phylogeny" version="1.0.0">
+  <description></description>
+    <requirements>
+        <requirement type="package">taxonomy</requirement>
+    </requirements>
+  <command interpreter="python">t2ps_wrapper.py $input $out_file1 $max_tree_level $font_size $max_leaves 1</command>
+  <inputs>
+    <param format="taxonomy" name="input" type="data" label="Draw phylogram for"></param>
+    <param name="max_tree_level" label="show ranks from root to" type="select" help="Choosing to show entire tree may produce very large PDF file disabling your viewer">
+        <option value="8">Class</option> 
+        <option value="0">Show entire tree</option>
+        <option value="1">Superkingdom</option>
+        <option value="2">Kingdom</option>
+        <option value="3">Subkingdom</option>
+        <option value="4">Superphylum</option>
+        <option value="5">Phylum</option>
+        <option value="6">Subphylum</option>
+        <option value="7">Superclass</option>
+        <option value="9">Subclass</option>
+        <option value="10">Superorder</option>
+        <option value="11">Order</option>
+        <option value="12">Suborder</option>
+        <option value="13">Superfamily</option>
+        <option value="14">Family</option>
+        <option value="15">Subfamily</option>
+        <option value="16">Tribe</option>
+        <option value="17">Subtribe</option>
+        <option value="18">Genus</option>
+        <option value="19">Subgenus</option>
+        <option value="20">Species</option>
+        <option value="21">Subspecies</option>
+    </param>
+    <param name="font_size" type="select" label="select font size">
+        <option value="8">Normal</option>
+        <option value="4">Tiny</option>
+        <option value="12">Large</option>
+    </param>
+    <param name="max_leaves" type="text" size="5" value="0" label="maximum number of leaves" help="set to 0 to show all"/>
+  </inputs>
+  <outputs>
+    <data format="pdf" name="out_file1" />
+  </outputs>
+  <requirements>
+    <requirement type="binary">tree2PS-fast</requirement>
+  </requirements>
+  <help>
+
+**What it does**
+
+Given a taxonomy representation (produced by the *Taxonomy manipulation->Fetch Taxonomic Ranks* tool) this utility produces a graphical representation of the phylogenetic tree in PDF format.
+
+--------
+
+**Example 1: Fake data**
+
+Suppose you have the following dataset::
+
+    Species_1 1 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum1 subphylum1 superclass1 class1 subclass1 superorder1 order1 suborder1 superfamily1 family1 subfamily1 tribe1 subtribe1 genus1 subgenus1 species1 subspecies1
+    Species_2 2 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum1 subphylum1 superclass1 class1 subclass1 superorder1 order1 suborder1 superfamily1 family1 subfamily1 tribe1 subtribe1 genus2 subgenus2 species2 subspecies2
+    Species_3 3 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum3 subphylum3 superclass3 class3 subclass3 superorder3 order3 suborder3 superfamily3 family3 subfamily3 tribe3 subtribe3 genus3 subgenus3 species3 subspecies3
+    Species_4 4 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum4 subphylum4 superclass4 class4 subclass4 superorder4 order4 suborder4 superfamily4 family4 subfamily4 tribe4 subtribe4 genus4 subgenus4 species4 subspecies4
+
+Drawing the tree with default parameters (without changing anything in the interface) will produce this tree:
+
+.. image:: ./static/images/t2ps_ideal.png 
+   :width: 500
+
+(for explanation of colors and numbers on the tree scroll to the bottom of this help section)
+
+Here the *Class* rank represents the terminal nodes (leaves) of the tree, because that is the default setting of the "*show ranks from root to*" drop-down.  Changing the drop-down to "*Subspecies*" will produce this:
+
+.. image:: ./static/images/t2ps_ideal_ssp.png 
+   :width: 1000
+
+--------
+
+**Example 2: Fake data with missing nodes**
+
+Real taxonomic datasets almost always contain empty nodes.  These are represented with "**n**" as shown below::
+
+    Species_1 1 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum1 subphylum1 superclass1 class1 subclass1 superorder1 order1 suborder1 superfamily1 family1 subfamily1 tribe1 subtribe1 genus1 subgenus1 species1 subspecies1
+    Species_2 2 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum1 subphylum1 superclass1 class1 subclass1 superorder1 order1 suborder1 superfamily1 family1 subfamily1 tribe1 subtribe1 genus2 n         species2 subspecies2
+    Species_3 3 root superkingdom1 kingdom1 subkingdom1 superphylum1 n       subphylum3 superclass3 class3 subclass3 superorder3 order3 suborder3 superfamily3 family3 subfamily3 tribe3 subtribe3 genus3 subgenus3 species3 subspecies3
+    Species_4 4 root superkingdom1 kingdom1 subkingdom1 superphylum1 phylum4 subphylum4 superclass4 class4 subclass4 superorder4 order4 suborder4 superfamily4 family4 subfamily4 tribe4 subtribe4 genus4 subgenus4 species4 subspecies4
+    
+(here *phylum* for Species_3 and *subgenus* for Species_2 are unassigned)
+
+A full tree for this dataset will look like this:
+
+.. image:: ./static/images/t2ps_missing_nodes.png 
+   :width: 1000
+
+Missing nodes are simply omitted from the tree (there are no gray boxes corresponding to "n"), but branch lengths are maintained so that taxa belonging to the same taxonomic rank are always aligned with each other.
+
+--------
+
+**Autoscaling the tree**
+
+You can use the "*maximum number of leaves*" to restrict the tree to a specified number of leaves (external nodes).  Using the following settings on the above dataset (note that *show ranks from root to* is set to *show entire tree* and *maximum number of leaves* is set to *3*):
+
+.. image:: ./static/images/t2ps_autoscale.png 
+
+will produce this tree:
+
+.. image:: ./static/images/t2ps_autoscale_tree.png 
+   :width: 1000
+
+Here the tree is automatically trimmed at a taxonomic rank that leaves only 3 outer nodes.  This is very useful for the initial evaluation of very large trees, where you may want to see only, say, 1,000 outer nodes at once.
+
+-------
+
+**Explanation of phylogenetic tree markup** 
+
+Branches of the tree are colored according to the heatmap below.  The "bluer" the branch, the fewer leaves it leads to, and vice versa.
+
+.. image:: ./static/images/t2ps_heatmap.png 
+
+Each node is labeled with taxonomic name and the number of tree leaves belonging to this taxonomic group:
+
+.. image:: ./static/images/t2ps_node_label.png 
+
+
+
+
+
+  </help>
+</tool>
+
+
diff -r 000000000000 -r 9071e359b9a3 tools/taxonomy/t2t_report.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/taxonomy/t2t_report.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,72 @@
+<tool id="t2t_report" name="Summarize taxonomy" version="1.0.0">
+    <description></description>
+    <requirements>
+        <requirement type="package">taxonomy</requirement>
+    </requirements>
+    <command>taxonomy2tree $input 0 /dev/null $out_file1 0</command>
+    <inputs>
+        <param format="taxonomy" name="input" type="data" label="Summarize taxonomic representation for"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="out_file1" />
+    </outputs>
+  <requirements>
+    <requirement type="binary">taxonomy2tree</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input" value="taxonomyGI.taxonomy" ftype="taxonomy"/>
+      <output name="out_file1" file="t2t_report.tabular"/>
+    </test>
+  </tests>
+
+    
+<help>
+
+**What it does**
+
+Given taxonomy representation (produced by *Taxonomy manipulation->Fetch Taxonomic Ranks* tool) this utility computes a summary of all taxonomic ranks. 
+
+------
+
+**Example**
+
+Suppose the *Taxonomy manipulation->Fetch Taxonomic Ranks* tool generated the following taxonomy representation::
+
+    9916 2      root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Laurasiatheria   n        Ruminantia  n           Bovidae  Bovinae n n Bos  n Bos taurus   n
+    9606 12585 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Euarchontoglires Primates Haplorrhini Hominoidea Hominidae n       n n Homo n Homo sapiens n
+
+Running this tool will generate the following output::
+    
+    Rank         Rank Name          Count
+    -------------------------------------
+    root         root               2
+    superkingdom Eukaryota          2
+    kingdom      Metazoa            2
+    phylum       Chordata           2
+    subphylum    Craniata           2
+    superclass   Gnathostomata      2
+    class        Mammalia           2
+    superorder   Euarchontoglires   1
+    superorder   Laurasiatheria     1
+    order        Primates           1
+    suborder     Haplorrhini        1
+    suborder     Ruminantia         1
+    superfamily  Hominoidea         1
+    family       Bovidae            1
+    family       Hominidae          1
+    subfamily    Bovinae            1
+    genus        Bos                1
+    genus        Homo               1
+    species      Bos taurus         1
+    species      Homo sapiens       1
+    
+The output is sorted on Rank and then on Rank Name.  
+
+.. class:: warningmark
+
+**Note** that this tool omits "**n**" corresponding to ranks missing from the NCBI taxonomy. In the above example *Homo sapiens* contains the order name (Primates) while *Bos taurus* does not.
+
+
+</help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/tool_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/tool_conf.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,701 @@
+<?xml version="1.0"?>
+<toolbox>
+<label text="Sharp lab tools" id="sharplab" />
+  <section name="Single interval manipulation" id="singleinterval">
+    <tool file="mytools/bedclean.xml"/>
+    <tool file="mytools/bedsort.xml"/>
+    <tool file="mytools/collapseBed.xml" />
+        <tool file="mytools/makewindow.xml" />
+    <tool file="mytools/resize.xml" />
+    <tool file="mytools/random_interval.xml"/>
+    <tool file="mytools/shuffleBed.xml"/>
+    <tool file="mytools/genomeView.xml"/>
+  </section>
+    <section name="Meta-Analysis" id="xuebing">
+    <tool file="mytools/genomeView.xml"/>
+    <tool file="mytools/intersectbed.xml"/>
+    <tool file="mytools/closestBed.xml"/>
+    <tool file="mytools/spatial_proximity.xml"/>
+    <tool file="mytools/bwBinavg.xml"/>
+    <tool file="mytools/metaintv.xml" />
+    <tool file="mytools/metaintv_ext.xml" />
+    <tool file="mytools/alignr.xml" />
+    <tool file="mytools/align2multiple.xml" />
+    <tool file="mytools/align2database.xml" />
+    <tool file="mytools/intersectSig.xml" />
+    <tool file="mytools/bigWigAverageOverBed.xml" />
+    <tool file="mytools/endbias.xml" />
+  </section>
+    <section name="Statistics/Visualization" id="sharpvis">
+    <tool file="mytools/genomeView.xml"/>
+    <tool file="mytools/intervalSize.xml" />
+    <tool file="mytools/intersectSig.xml" />
+    <tool file="mytools/cdf.xml" />
+    <tool file="mytools/binaverage.xml" />
+    <tool file="mytools/alignvis.xml" />
+    <tool file="mytools/plotmatrix.xml" />
+    <tool file="mytools/venn.xml"/>
+  </section>
+  <section name="Text/Format manipulation" id="sharptext">
+     <tool file="mytools/collapseTab.xml" />
+     <tool file="mytools/fastqdump.xml" />
+    <tool file="mytools/bowtie2bed.xml" />
+    <tool file="mytools/sampline.xml" />
+    <tool file="mytools/headtail.xml" />
+    <tool file="mytools/convertEnsembl.xml" />
+    <tool file="mytools/removeDuplicate.xml" />
+    <tool file="mytools/bed_to_bam.xml" />
+     <tool file="mytools/makebigwig.xml" />
+
+  </section>
+    <section name="Sequence/Motif" id="sharpsequence">
+    <tool file="extract/extract_genomic_dna.xml" />
+    <tool file="mytools/revcompl.xml"  />
+    <tool file="mytools/fastashuffle1.xml"  />
+    <tool file="mytools/fastashuffle2.xml"  />
+    <tool file="mytools/iupac2meme.xml" />
+    <tool file="mytools/seq2meme.xml" />
+    <tool file="mytools/memelogo.xml" />
+    <tool file="mytools/fastamarkov.xml" />
+    <tool file="mytools/meme.xml"/>
+    <tool file="mytools/dreme.xml"/>
+    <tool file="mytools/fimo2.xml"/>
+        <tool file="mytools/fimo2bed.xml"/>
+    <tool file="rgenetics/rgWebLogo3.xml" />
+    <tool file="mytools/splicesite.xml" />
+  </section>
+
+    <section name="Conservation/Other scores" id="score">
+    <tool file="mytools/phastCons.xml"  />
+  </section>
+
+<label text="selected tools" id="selectedtools" />
+  <section name="Get Data" id="sharpgetext">
+    <tool file="data_source/upload.xml"/>
+    <tool file="data_source/ucsc_tablebrowser.xml" />
+    <tool file="data_source/biomart.xml" />
+  </section>
+  <section name="Operate on Genomic Intervals" id="sharpbxops">
+    <tool file="new_operations/intersect.xml" />
+    <tool file="new_operations/subtract.xml" />
+    <tool file="new_operations/merge.xml" />
+    <tool file="new_operations/concat.xml" />
+
+    <tool file="mytools/closestBed.xml" />
+    <tool file="mytools/flankBed.xml" />
+    <tool file="mytools/shuffleBed.xml" />
+    <tool file="mytools/sortBed.xml" />
+
+    <tool file="new_operations/basecoverage.xml" />
+    <tool file="new_operations/coverage.xml" />
+    <tool file="new_operations/complement.xml" />
+    <tool file="new_operations/cluster.xml" id="cluster" />
+    <tool file="new_operations/join.xml" />
+    <tool file="new_operations/get_flanks.xml" />
+    <tool file="new_operations/flanking_features.xml" />
+    <tool file="annotation_profiler/annotat
[... middle of this 701-line diff truncated in the source rendering ...]
tool file="emboss_5/emboss_dreg.xml" />
+    <tool file="emboss_5/emboss_einverted.xml" />
+    <tool file="emboss_5/emboss_epestfind.xml" />
+    <tool file="emboss_5/emboss_equicktandem.xml" />
+    <tool file="emboss_5/emboss_est2genome.xml" />
+    <tool file="emboss_5/emboss_etandem.xml" />
+    <tool file="emboss_5/emboss_extractfeat.xml" />
+    <tool file="emboss_5/emboss_extractseq.xml" />
+    <tool file="emboss_5/emboss_freak.xml" />
+    <tool file="emboss_5/emboss_fuzznuc.xml" />
+    <tool file="emboss_5/emboss_fuzzpro.xml" />
+    <tool file="emboss_5/emboss_fuzztran.xml" />
+    <tool file="emboss_5/emboss_garnier.xml" />
+    <tool file="emboss_5/emboss_geecee.xml" />
+    <tool file="emboss_5/emboss_getorf.xml" />
+    <tool file="emboss_5/emboss_helixturnhelix.xml" />
+    <tool file="emboss_5/emboss_hmoment.xml" />
+    <tool file="emboss_5/emboss_iep.xml" />
+    <tool file="emboss_5/emboss_infoseq.xml" />
+    <tool file="emboss_5/emboss_isochore.xml" />
+    <tool file="emboss_5/emboss_lindna.xml" />
+    <tool file="emboss_5/emboss_marscan.xml" />
+    <tool file="emboss_5/emboss_maskfeat.xml" />
+    <tool file="emboss_5/emboss_maskseq.xml" />
+    <tool file="emboss_5/emboss_matcher.xml" />
+    <tool file="emboss_5/emboss_megamerger.xml" />
+    <tool file="emboss_5/emboss_merger.xml" />
+    <tool file="emboss_5/emboss_msbar.xml" />
+    <tool file="emboss_5/emboss_needle.xml" />
+    <tool file="emboss_5/emboss_newcpgreport.xml" />
+    <tool file="emboss_5/emboss_newcpgseek.xml" />
+    <tool file="emboss_5/emboss_newseq.xml" />
+    <tool file="emboss_5/emboss_noreturn.xml" />
+    <tool file="emboss_5/emboss_notseq.xml" />
+    <tool file="emboss_5/emboss_nthseq.xml" />
+    <tool file="emboss_5/emboss_octanol.xml" />
+    <tool file="emboss_5/emboss_oddcomp.xml" />
+    <tool file="emboss_5/emboss_palindrome.xml" />
+    <tool file="emboss_5/emboss_pasteseq.xml" />
+    <tool file="emboss_5/emboss_patmatdb.xml" />
+    <tool file="emboss_5/emboss_pepcoil.xml" />
+    <tool file="emboss_5/emboss_pepinfo.xml" />
+    <tool file="emboss_5/emboss_pepnet.xml" />
+    <tool file="emboss_5/emboss_pepstats.xml" />
+    <tool file="emboss_5/emboss_pepwheel.xml" />
+    <tool file="emboss_5/emboss_pepwindow.xml" />
+    <tool file="emboss_5/emboss_pepwindowall.xml" />
+    <tool file="emboss_5/emboss_plotcon.xml" />
+    <tool file="emboss_5/emboss_plotorf.xml" />
+    <tool file="emboss_5/emboss_polydot.xml" />
+    <tool file="emboss_5/emboss_preg.xml" />
+    <tool file="emboss_5/emboss_prettyplot.xml" />
+    <tool file="emboss_5/emboss_prettyseq.xml" />
+    <tool file="emboss_5/emboss_primersearch.xml" />
+    <tool file="emboss_5/emboss_revseq.xml" />
+    <tool file="emboss_5/emboss_seqmatchall.xml" />
+    <tool file="emboss_5/emboss_seqret.xml" />
+    <tool file="emboss_5/emboss_showfeat.xml" />
+    <tool file="emboss_5/emboss_shuffleseq.xml" />
+    <tool file="emboss_5/emboss_sigcleave.xml" />
+    <tool file="emboss_5/emboss_sirna.xml" />
+    <tool file="emboss_5/emboss_sixpack.xml" />
+    <tool file="emboss_5/emboss_skipseq.xml" />
+    <tool file="emboss_5/emboss_splitter.xml" />
+    <tool file="emboss_5/emboss_supermatcher.xml" />
+    <tool file="emboss_5/emboss_syco.xml" />
+    <tool file="emboss_5/emboss_tcode.xml" />
+    <tool file="emboss_5/emboss_textsearch.xml" />
+    <tool file="emboss_5/emboss_tmap.xml" />
+    <tool file="emboss_5/emboss_tranalign.xml" />
+    <tool file="emboss_5/emboss_transeq.xml" />
+    <tool file="emboss_5/emboss_trimest.xml" />
+    <tool file="emboss_5/emboss_trimseq.xml" />
+    <tool file="emboss_5/emboss_twofeat.xml" />
+    <tool file="emboss_5/emboss_union.xml" />
+    <tool file="emboss_5/emboss_vectorstrip.xml" />
+    <tool file="emboss_5/emboss_water.xml" />
+    <tool file="emboss_5/emboss_wobble.xml" />
+    <tool file="emboss_5/emboss_wordcount.xml" />
+    <tool file="emboss_5/emboss_wordmatch.xml" />
+  </section>
+-->
+</toolbox>
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._awk_tool.xml
Binary file tools/unix_tools/._awk_tool.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._awk_wrapper.sh
Binary file tools/unix_tools/._awk_wrapper.sh has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._cut_tool.xml
Binary file tools/unix_tools/._cut_tool.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._cut_wrapper.sh
Binary file tools/unix_tools/._cut_wrapper.sh has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._find_and_replace.pl
Binary file tools/unix_tools/._find_and_replace.pl has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._find_and_replace.xml
Binary file tools/unix_tools/._find_and_replace.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._grep_tool.xml
Binary file tools/unix_tools/._grep_tool.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._grep_wrapper.sh
Binary file tools/unix_tools/._grep_wrapper.sh has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._grep_wrapper_old.sh
Binary file tools/unix_tools/._grep_wrapper_old.sh has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._join_tool.sh
Binary file tools/unix_tools/._join_tool.sh has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._join_tool.xml
Binary file tools/unix_tools/._join_tool.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._remove_ending.sh
Binary file tools/unix_tools/._remove_ending.sh has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._remove_ending.xml
Binary file tools/unix_tools/._remove_ending.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._sed_tool.xml
Binary file tools/unix_tools/._sed_tool.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._sed_wrapper.sh
Binary file tools/unix_tools/._sed_wrapper.sh has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._sort_tool.xml
Binary file tools/unix_tools/._sort_tool.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._uniq_tool.xml
Binary file tools/unix_tools/._uniq_tool.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._word_list_grep.pl
Binary file tools/unix_tools/._word_list_grep.pl has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/._word_list_grep.xml
Binary file tools/unix_tools/._word_list_grep.xml has changed
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/awk_tool.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/awk_tool.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,138 @@
+<tool id="cshl_awk_tool" name="awk">
+  <description></description>
+  <command interpreter="sh">awk_wrapper.sh $input $output '$file_data' '$FS' '$OFS'</command>
+  <inputs>
+    <param format="txt" name="input" type="data" label="File to process" />
+
+    <param name="FS" type="select" label="Input field-separator">
+ <option value=",">comma (,)</option>
+ <option value=":">colons (:) </option>
+ <option value=" ">single space</option>
+ <option value=".">dot (.)</option>
+ <option value="-">dash (-)</option>
+ <option value="|">pipe (|)</option>
+ <option value="_">underscore (_)</option>
+ <option selected="True" value="tab">tab</option>
+    </param>
+
+    <param name="OFS" type="select" label="Output field-separator">
+ <option value=",">comma (,)</option>
+ <option value=":">colons (:)</option>
+ <option value=" ">space ( )</option>
+ <option value="-">dash (-)</option>
+ <option value=".">dot (.)</option>
+ <option value="|">pipe (|)</option>
+ <option value="_">underscore (_)</option>
+ <option selected="True" value="tab">tab</option>
+    </param>
+
+
+    <!-- Note: the parameter name MUST BE 'url_paste' -
+         This is a hack in the galaxy library (see ./lib/galaxy/util/__init__.py line 142)
+  If the name is 'url_paste' the string won't be sanitized, and all the non-alphanumeric characters 
+  will be passed to the shell script -->
+    <param name="file_data" type="text" area="true" size="5x35" label="AWK Program" help=""> 
+     <validator type="expression" message="Invalid Program!">value.find('\'')==-1</validator>
+    </param>
+
+  </inputs>
+  <tests>
+   <test>
+   <param name="input" value="unix_awk_input1.txt" />
+   <output name="output" file="unix_awk_output1.txt" />
+   <param name="FS" value="tab" />
+   <param name="OFS" value="tab" />
+   <param name="file_data"  value="$2>0.5 { print $2*9, $1 }" />
+   </test>
+  </tests>
+  <outputs>
+    <data format="input" name="output" metadata_source="input" />
+  </outputs>
+<help>
+
+**What it does**
+
+This tool runs the unix **awk** command on the selected data file.
+
+.. class:: infomark
+
+**TIP:** This tool uses the **extended regular expression** syntax (not the Perl syntax).
+
+
+**Further reading**
+
+- Awk by Example (http://www.ibm.com/developerworks/linux/library/l-awk1.html)
+- Long AWK tutorial (http://www.grymoire.com/Unix/Awk.html)
+- Learn AWK in 1 hour (http://www.selectorweb.com/awk.html)
+- awk cheat-sheet (http://cbi.med.harvard.edu/people/peshkin/sb302/awk_cheatsheets.pdf)
+- Collection of useful awk one-liners (http://student.northpark.edu/pemente/awk/awk1line.txt)
+
+-----
+
+**AWK programs**
+
+Most AWK programs consist of **patterns** (i.e. rules that match lines of text) and **actions** (i.e. commands to execute when a pattern matches a line).
+
+The basic form of AWK program is::
+
+    pattern { action 1; action 2; action 3; }
+
+
+
+
+
+**Pattern Examples**
+
+- **$2 == "chr3"**  will match lines whose second column is the string 'chr3'
+- **$5-$4>23**  will match lines where the value of the fifth column minus the value of the fourth column is larger than 23.
+- **/AG..AG/** will match lines that contain the regular expression **AG..AG** (meaning the characters AG followed by any two characters followed by AG). (This is the way to specify regular expressions on the entire line, similar to GREP.)
+- **$7 ~ /A{4}U/**  will match lines whose seventh column contains 4 consecutive A's followed by a U. (This is the way to specify regular expressions on a specific field.)
+- **10000 &lt; $4 &amp;&amp; $4 &lt; 20000** will match lines whose fourth column value is larger than 10,000 but smaller than 20,000
+- If no pattern is specified, all lines match (meaning the **action** part will be executed on all lines).
+
+
+
+**Action Examples**
+
+- **{ print }** or **{ print $0 }**   will print the entire input line (the line that matched in **pattern**). **$0** is a special marker meaning 'the entire line'.
+- **{ print $1, $4, $5 }** will print only the first, fourth and fifth fields of the input line.
+- **{ print $4, $5-$4 }** will print the fourth column and the difference between the fifth and fourth column. (If the fourth column was start-position in the input file, and the fifth column was end-position - the output file will contain the start-position, and the length).
+- If no action part is specified (not even the curly brackets) - the default action is to print the entire line.
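+
+**A complete example**
+
+Combining a pattern with an action (this is the AWK program used in this tool's test case)::
+
+    $2>0.5 { print $2*9, $1 }
+
+For every input line whose second column is larger than 0.5, this prints nine times the value of the second column, followed by the first column.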
+
+
+
+
+
+
+
+
+
+**AWK's Regular Expression Syntax**
+
+This tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern describing a certain amount of text.
+
+- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for.
+- **^** matches the beginning of a string (but not an internal line).
+- **(** .. **)** groups a particular pattern.
+- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern.
+
+  - **{n}** The preceding item is matched exactly n times.
+  - **{n,}** The preceding item is matched n or more times. 
+  - **{n,m}** The preceding item is matched at least n times but not more than m times. 
+
+- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**.
+- **.** Matches any single character except a newline.
+- ***** The preceding item will be matched zero or more times.
+- **?** The preceding item is optional and matched at most once.
+- **+** The preceding item will be matched one or more times.
+- **^** has two meanings:
+
+  - matches the beginning of a line or string. 
+  - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets.
+
+- **$** matches the end of a line or string.
+- **\|** Separates alternate possibilities. 
+
+
+**Note**: AWK uses extended regular expression syntax, not Perl syntax. **\\d**, **\\w**, **\\s** etc. are **not** supported.
+
+</help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/awk_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/awk_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+##
+## Galaxy wrapper for AWK command
+##
+
+##
+## command line arguments:
+##   input_file
+##   output_file
+##   awk-program
+##   input-field-separator
+##   output-field-separator
+
+INPUT="$1"
+OUTPUT="$2"
+PROG="$3"
+FS="$4"
+OFS="$5"
+
+shift 5
+
+if [ -z "$OFS" ]; then
+ echo usage: $0 INPUTFILE OUTPUTFILE AWK-PROGRAM FS OFS>&2
+ exit 1
+fi
+
+if [ ! -r "$INPUT" ]; then
+ echo "error: input file ($INPUT) not found!" >&2
+ exit 1
+fi
+
+if [ "$FS" == "tab" ]; then
+ FS="\t"
+fi
+if [ "$OFS" == "tab" ]; then
+ OFS="\t"
+fi
+
+# Messages printed to STDOUT will be displayed in the "INFO" field in the galaxy dataset.
+# This way the user can tell what was the command
+echo "awk" "$PROG"
+
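+# --sandbox (GNU awk) blocks system() and I/O redirection from within the
+# user's program; --re-interval enables {n,m} interval expressions.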
+awk --sandbox -v OFS="$OFS" -v FS="$FS" --re-interval "$PROG" "$INPUT" > "$OUTPUT"
+if (( $? ));  then exit; fi
+
+exit 0
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/cut_tool.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/cut_tool.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,94 @@
+<tool id="cshl_cut_tool" name="cut">
+  <description>columns from files</description>
+  <command interpreter="sh">
+   cut_wrapper.sh '$complement' '$cutwhat' '$list' '$input' '$output'
+  </command>
+
+  <inputs>
+ <param format="txt" name="input" type="data" label="file to cut" />
+
+     <param name="complement" type="select" label="Operation">
+       <option value="">Keep</option>
+       <option value="--complement">Discard</option>
+ </param>
+
+     <param name="cutwhat" type="select" label="Cut by">
+       <option value="-f">fields</option>
+       <option value="-c">characters</option>
+ </param>
+
+ <param name="list" type="text" size="20" label="List of Fields/Characters/Bytes" help="These will be kept/discarded (depending on 'operation'). &lt;BR /&gt; Examples: 1,3,4 or 2-5" value = "" />
+  </inputs>
+
+  <tests>
+   <test>
+   <param name="input" value="unix_cut_input1.txt" />
+   <output name="output" file="unix_cut_output1.txt" />
+   <param name="complement" value="Keep" />
+   <param name="cutwhat" value="fields" />
+   <param name="list"  value="1,3,4" />
+   </test>
+   <test>
+   <param name="input" value="unix_cut_input1.txt" />
+   <output name="output" file="unix_cut_output1.txt" />
+   <param name="complement" value="Discard" />
+   <param name="cutwhat" value="fields" />
+   <param name="list"  value="2" />
+   </test>
+  </tests>
+
+  <outputs>
+    <data format="input" name="output" metadata_source="input"/>
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool runs the **cut** unix command, which extracts or deletes columns from a file.
+
+-----
+
+Field List Example:
+
+**1,3,7** - Cut specific fields/characters.
+
+**3-**    - Cut from the third field/character to the end of the line.
+
+**2-5**   - Cut from the second to the fifth field/character.
+
+**-8**    - Cut from the first to the eighth field/character.
+
+
+
+
+Input Example::
+
+    fruit color price weight
+    apple red 1.4 0.5
+    orange orange 1.5 0.3
+    banana yellow 0.9 0.3
+
+
+Output Example ( **Keeping fields 1,3,4** )::
+
+    fruit price weight
+    apple 1.4 0.5
+    orange 1.5 0.3
+    banana 0.9 0.3
+
+Output Example ( **Discarding field 2** )::
+
+    fruit price weight
+    apple 1.4 0.5
+    orange 1.5 0.3
+    banana 0.9 0.3
+
+Output Example ( **Keeping 3 characters** )::
+
+    fru
+    app
+    ora
+    ban
+
+  </help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/cut_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/cut_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,52 @@
+#!/bin/sh
+
+##
+## Galaxy wrapper for cut command.
+##
+
+##
+## command line arguments:
+##   complement flag (might be empty string)
+##   what to cut (fields or characters)
+##   cut list (e.g. 1,2,3,4)
+##   input_file
+##   output_file
+
+COMPLEMENT="$1"
+CUTWHAT="$2"
+CUTLIST="$3"
+INPUT="$4"
+OUTPUT="$5"
+
+if [ -z "$OUTPUT" ]; then
+ echo "This script should be run from inside galaxy!" >&2
+ exit 1
+fi
+
+if [ ! -r "$INPUT" ]; then
+ echo "error: input file ($INPUT) not found!" >&2
+ exit 1
+fi
+
+# Messages printed to STDOUT will be displayed in the "INFO" field in the galaxy dataset.
+# This way the user can tell what the command was
+if [ -z "$COMPLEMENT" ]; then
+ echo -n "Extracting " 
+else
+ echo "Deleting "
+fi
+
+case $CUTWHAT in
+ -f) echo -n "field(s) "
+ ;;
+
+ -c) echo -n "character(s) "
+ ;;
+esac
+
+echo "$CUTLIST"
+
+
+cut $COMPLEMENT $CUTWHAT $CUTLIST < "$INPUT" > "$OUTPUT"
+
+exit 
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/find_and_replace.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/find_and_replace.pl Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,202 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Getopt::Std;
+
+sub parse_command_line();
+sub build_regex_string();
+sub usage();
+
+my $input_file ;
+my $output_file;
+my $find_pattern ;
+my $replace_pattern ;
+my $find_complete_words ;
+my $find_pattern_is_regex ;
+my $find_in_specific_column ;
+my $find_case_insensitive ;
+my $replace_global ;
+my $skip_first_line ;
+
+
+##
+## Program Start
+##
+usage() if @ARGV<2;
+parse_command_line();
+my $regex_string = build_regex_string() ;
+
+# Allow first line to pass without filtering?
+if ( $skip_first_line ) {
+ my $line = <$input_file>;
+ print $output_file $line ;
+}
+
+
+##
+## Main loop
+##
+
+## I LOVE PERL (and hate it, at the same time...)
+##
+## So what's going on with the self-compiling perl code?
+##
+## 1. The program gets the find-pattern and the replace-pattern from the user (as strings).
+## 2. If both the find-pattern and replace-pattern are simple strings (not regex), 
+##    it would be possible to pre-compile a regex (with qr//) and use it in a 's///'
+## 3. If the find-pattern is a regex but the replace-pattern is a simple text string (with out back-references)
+##    it is still possible to pre-compile the regex and use it in a 's///'
+## However,
+## 4. If the replace-pattern contains back-references, pre-compiling is not possible.
+##    (in perl, you can't precompile a substitute regex).
+##    See these examples:
+##    http://www.perlmonks.org/?node_id=84420
+##    http://stackoverflow.com/questions/125171/passing-a-regex-substitution-as-a-variable-in-perl
+##
+##    The solution:
+##    we build the regex string as valid perl code (in 'build_regex()', stored in $regex_string ),
+##    Then eval() a new perl code that contains the substitution regex as inlined code.
+##    Gotta love perl!
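+##
+## For example, with find-pattern 'hsa-mir-([^ ]+)' and replace-pattern
+## 'mir \1' (both in regex mode), build_regex_string() returns the string
+##        s/hsa-mir-([^ ]+)/mir \1/
+## which is inlined into the generated program below and then eval()'ed.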
+
+my $perl_program ;
+if ( $find_in_specific_column ) {
+ # Find & replace in specific column
+
+ $perl_program = <<EOF;
+ while ( <STDIN> ) {
+ chomp ;
+ my \@columns = split ;
+
+ #not enough columns in this line - skip it
+ next if ( \@columns < $find_in_specific_column ) ;
+
+ \$columns [ $find_in_specific_column - 1 ] =~ $regex_string ;
+
+ print STDOUT join("\t", \@columns), "\n" ;
+ }
+EOF
+
+} else {
+ # Find & replace the entire line
+ $perl_program = <<EOF;
+ while ( <STDIN> ) {
+ $regex_string ;
+ print STDOUT;
+ }
+EOF
+}
+
+
+# The dynamic perl code reads from STDIN and writes to STDOUT,
+# so connect these handles (if the user didn't specify input / output
+# file names, these might already be STDIN/OUT, so the whole thing could be a no-op).
+*STDIN = $input_file ;
+*STDOUT = $output_file ;
+eval $perl_program ;
+
+
+##
+## Program end
+##
+
+
+sub parse_command_line()
+{
+ my %opts ;
+ getopts('grsiwc:o:', \%opts) or die "$0: Invalid option specified\n";
+
+ die "$0: missing Find-Pattern argument\n" if (@ARGV==0); 
+ $find_pattern = $ARGV[0];
+ die "$0: missing Replace-Pattern argument\n" if (@ARGV==1); 
+ $replace_pattern = $ARGV[1];
+
+ $find_complete_words = ( exists $opts{w} ) ;
+ $find_case_insensitive = ( exists $opts{i} ) ;
+ $skip_first_line = ( exists $opts{s} ) ;
+ $find_pattern_is_regex = ( exists $opts{r} ) ;
+ $replace_global = ( exists $opts{g} ) ;
+
+ # Search in specific column ?
+ if ( defined $opts{c} ) {
+ $find_in_specific_column = $opts{c};
+
+ die "$0: invalid column number ($find_in_specific_column).\n"
+ unless $find_in_specific_column =~ /^\d+$/ ;
+
+ die "$0: invalid column number ($find_in_specific_column).\n"
+ if $find_in_specific_column <= 0; 
+ }
+ else {
+ $find_in_specific_column = 0 ;
+ }
+
+ # Output File specified (instead of STDOUT) ?
+ if ( defined $opts{o} ) {
+ my $filename = $opts{o};
+ open $output_file, ">$filename" or die "$0: Failed to create output file '$filename': $!\n" ;
+ } else {
+ $output_file = *STDOUT ;
+ }
+
+
+ # Input file Specified (instead of STDIN) ?
+ if ( @ARGV>2 ) {
+ my $filename = $ARGV[2];
+ open $input_file, "<$filename" or die "$0: Failed to open input file '$filename': $!\n" ;
+ } else {
+ $input_file = *STDIN;
+ }
+}
+
+sub build_regex_string()
+{
+ my $find_string ;
+ my $replace_string ;
+
+ if ( $find_pattern_is_regex ) {
+ $find_string = $find_pattern ;
+ $replace_string = $replace_pattern ;
+ } else {
+ $find_string = quotemeta $find_pattern ;
+ $replace_string = quotemeta $replace_pattern;
+ }
+
+ if ( $find_complete_words ) {
+ $find_string = "\\b($find_string)\\b"; 
+ }
+
+ my $regex_string = "s/$find_string/$replace_string/";
+
+ $regex_string .= "i" if ( $find_case_insensitive );
+ $regex_string .= "g" if ( $replace_global ) ;
+
+
+ return $regex_string;
+}
+
+sub usage()
+{
+print <<EOF;
+
+Find and Replace
+Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu )
+
+Usage: $0 [-o OUTPUT] [-g] [-r] [-w] [-i] [-c N] [-s] FIND-PATTERN REPLACE-PATTERN [INPUT-FILE]
+
+   -g   - Global replace - replace all occurrences in line/column. 
+          Default - replace just the first instance.
+   -w   - search for complete words (not partial sub-strings).
+   -i   - case insensitive search.
+   -c N - check only column N, instead of entire line (line split by whitespace).
+   -s   - skip first line (don't replace anything in it)
+   -r   - FIND-PATTERN and REPLACE-PATTERN are perl regular expressions,
+          usable inside a 's///' statement.
+          By default, they are used as verbatim text strings.
+   -o OUT - specify output file (default = STDOUT).
+   INPUT-FILE - (optional) read from file (default = from STDIN).
+
+
+EOF
+
+ exit;
+}
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/find_and_replace.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/find_and_replace.xml Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,154 @@
+<tool id="cshl_find_and_replace" name="Find and Replace">
+  <description>text</description>
+  <command interpreter="perl">
+ find_and_replace.pl
+ #if $searchwhere.choice == "column":
+ -c $searchwhere.column
+ #end if
+ -o $output 
+ $caseinsensitive 
+ $wholewords 
+ $skip_first_line
+ $is_regex
+ '$url_paste'
+ '$file_data'
+ '$input'
+  </command>
+  <inputs>
+    <param format="txt" name="input" type="data" label="File to process" />
+
+    <!-- Note: the parameter name MUST BE 'url_paste' -
+         This is a hack in the galaxy library (see ./lib/galaxy/util/__init__.py line 142)
+  If the name is 'url_paste' the string won't be sanitized, and all the non-alphanumeric characters 
+  will be passed to the shell script -->
+  <param name="url_paste" type="text" size="20" label="Find pattern" help="Use simple text, or a valid regular expression (without backslashes // ) " > 
+     <validator type="expression" message="Invalid Program!">value.find('\'')==-1</validator>
+ </param>
+
+  <param name="file_data" type="text" size="20" label="Replace with" help="Use simple text, or &amp; (ampersand) and \\1 \\2 \\3 to refer to matched text. See examples below." >
+     <validator type="expression" message="Invalid Program!">value.find('\'')==-1</validator>
+ </param>
+
+ <param name="is_regex" type="boolean" checked="false" truevalue="-r" falsevalue="" label="Find-Pattern is a regular expression" 
+ help="see help section for details." />
+
+ <param name="caseinsensitive" type="boolean" checked="false" truevalue="-i" falsevalue="" label="Case-Insensitive search" 
+ help="" />
+
+ <param name="wholewords" type="boolean" checked="false" truevalue="-w" falsevalue="" label="find whole-words" 
+ help="ignore partial matches (e.g. 'apple' will not match 'snapple') " />
+
+ <param name="skip_first_line" type="boolean" checked="false" truevalue="-s" falsevalue="" label="Ignore first line" 
+ help="Select this option if the first line contains column headers. Text in the line will not be replaced. " />
+
+ <conditional name="searchwhere">
+ <param name="choice" type="select" label="Replace text in">
+ <option value="line" selected="true">entire line</option>
+ <option value="column">specific column</option>
+ </param>
+
+ <when value="line">
+ </when>
+
+ <when value="column">
+     <param name="column" label="in column" type="data_column" data_ref="input" accept_default="true" />
+ </when>
+ </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="input" name="output" metadata_source="input" />
+  </outputs>
+
+<help>
+
+**What it does**
+
+This tool finds &amp; replaces text in an input dataset.
+
+.. class:: infomark
+
+The **pattern to find** can be a simple text string, or a perl **regular expression** string (depending on *pattern is a regex* check-box).
+
+.. class:: infomark
+
+When using regular expressions, the **replace pattern** can contain back-references ( e.g. \\1 )
+
+.. class:: infomark
+
+This tool uses Perl regular expression syntax.
+
+-----
+
+**Examples of *regular-expression* Find Patterns**
+
+- **HELLO**     The word 'HELLO' (case sensitive).
+- **AG.T**      The letters A,G followed by any single character, followed by the letter T.
+- **A{4,}**     Four or more consecutive A's.
+- **chr2[012]\\t**       The words 'chr20' or 'chr21' or 'chr22' followed by a tab character.
+- **hsa-mir-([^ ]+)**        The text 'hsa-mir-' followed by one-or-more non-space characters. When using parentheses, the matched content of the parentheses can be accessed with **\\1** in the **replace** pattern.
+
+
+**Examples of Replace Patterns**
+
+- **WORLD**  The word 'WORLD' will be placed wherever the find pattern was found.
+- **FOO-&amp;-BAR**  Each time the find pattern is found, it will be surrounded with 'FOO-' at the beginning and '-BAR' at the end. **&amp;** (ampersand) represents the matched find pattern.
+- **\\1**   The text which matched the first parenthesis in the Find Pattern.
+
+
+-----
+
+**Example 1**
+
+**Find Pattern:** HELLO
+**Replace Pattern:** WORLD
+**Regular Expression:** no
+**Replace what:** entire line
+
+Every time the word HELLO is found, it will be replaced with the word WORLD. 
+
+-----
+
+**Example 2**
+
+**Find Pattern:** ^chr 
+**Replace Pattern:** (empty)
+**Regular Expression:** yes
+**Replace what:** column 11
+
+If column 11 (of any line) begins with the letters 'chr', those letters will be removed. Effectively, it'll turn "chr4" into "4" and "chrXHet" into "XHet".
+
+
+-----
+
+**Perl's Regular Expression Syntax**
+
+The Find &amp; Replace tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern descibing a certain amount of text. 
+
+- **( ) { } [ ] . * ? + \\ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for.
+- **^** matches the beginning of a string (but not an internal line).
+- **(** .. **)** groups a particular pattern.
+- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern.
+
+  - **{n}** The preceding item is matched exactly n times.
+  - **{n,}** The preceding item is matched n or more times. 
+  - **{n,m}** The preceding item is matched at least n times but not more than m times. 
+
+- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**.
+- **.** Matches any single character except a newline.
+- ***** The preceding item will be matched zero or more times.
+- **?** The preceding item is optional and matched at most once.
+- **+** The preceding item will be matched one or more times.
+- **^** has two meanings:
+
+  - matches the beginning of a line or string. 
+  - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets.
+
+- **$** matches the end of a line or string.
+- **\\|** Separates alternate possibilities. 
+- **\\d** matches a single digit
+- **\\w** matches a single letter or digit or an underscore.
+- **\\s** matches a single white-space (space or tabs).
+
+
+</help>
+
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/grep_tool.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/grep_tool.xml Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,130 @@
+<tool id="cshl_grep_tool" name="grep">
+  <description></description>
+  <command interpreter="sh">grep_wrapper.sh $input $output '$url_paste' $color -A $lines_after -B $lines_before $invert $case_sensitive</command>
+  <inputs>
+    <param format="txt" name="input" type="data" label="Select lines from" />
+
+    <param name="invert" type="select" label="that">
+      <option value="">Match</option>
+      <option value="-v">Don't Match</option>
+    </param>
+
+    <!-- Note: the parameter name MUST BE 'url_paste' -
+         This is a hack in the galaxy library (see ./lib/galaxy/util/__init__.py line 142)
+  If the name is 'url_paste' the string won't be sanitized, and all the non-alphanumeric characters 
+  will be passed to the shell script -->
+    <param name="url_paste" type="text" size="40" label="Regular Expression" help=""> 
+     <validator type="expression" message="Invalid Program!">value.find('\'')==-1</validator>
+    </param>
+
+    <param name="case_sensitive" type="select"  label="Match type"> 
+      <option value="-i">case insensitive</option>
+      <option value="">case sensitive</option>
+    </param>
+
+    <param name="lines_before" type="integer"  label="Show lines preceding the matched line" help="(same as grep -B, leave it at zero unless you know what you're doing)" value="0" /> 
+    <param name="lines_after" type="integer"  label="Show lines trailing the matched line" help="(same as grep -A, leave it at zero unless you know what you're doing)" value="0" /> 
+
+    <param name="color" type="select"  label="Output"> 
+      <option value="NOCOLOR">text file (for further processing)</option>
+      <option value="COLOR">Highlighted HTML (for easier viewing)</option>
+    </param>
+
+  </inputs>
+  <tests>
+   <test>
+   <!-- grep a FASTA file for sequences with specific motif -->
+   <param name="input" value="unix_grep_input1.txt" />
+   <output name="output" file="unix_grep_output1.txt" />
+   <param name="case_sensitive" value="case sensitive" />
+   <param name="invert" value="" />
+   <param name="url_paste" value="AA.{2}GT" />
+   <param name="lines_before" value="1" />
+   <param name="lines_after" value="0" />
+   <param name="color" value="NOCOLOR" />
+   </test>
+   <test>
+   <!-- grep a FASTA file for sequences with specific motif -
+   show highlighed output -->
+   <param name="input" value="unix_grep_input1.txt" />
+   <output name="output" file="unix_grep_output2.html" />
+   <param name="case_sensitive" value="case sensitive" />
+   <param name="invert" value="" />
+   <param name="url_paste" value="AA.{2}GT" />
+   <param name="lines_before" value="0" />
+   <param name="lines_after" value="0" />
+   <param name="color" value="COLOR" />
+   </test>
+  </tests>
+  <outputs>
+   <data format="input" name="output" metadata_source="input" >
+ <change_format>
+ <when input="color" value="COLOR" format="HTML" />
+ </change_format>
+    </data>
+  </outputs>
+<help>
+
+**What it does**
+
+This tool runs the unix **grep** command on the selected data file.
+
+.. class:: infomark
+
+**TIP:** This tool uses the **perl** regular expression syntax (same as running 'grep -P'). This is **NOT** the POSIX or POSIX-extended syntax (unlike the awk/sed tools).
+
+
+**Further reading**
+
+- Wikipedia's Regular Expression page (http://en.wikipedia.org/wiki/Regular_expression)
+- Regular Expressions cheat-sheet (PDF) (http://www.addedbytes.com/cheat-sheets/download/regular-expressions-cheat-sheet-v2.pdf)
+- Grep Tutorial (http://www.panix.com/~elflord/unix/grep.html)
+
+-----
+
+**Grep Examples**
+
+- **AGC.AAT** would match lines with AGC followed by any character, followed by AAT (e.g. **AGCQAAT**, **AGCPAAT**, **AGCwAAT**)
+- **C{2,5}AGC** would match lines with 2 to 5 consecutive Cs followed by AGC
+- **TTT.{4,10}AAA** would match lines with 3 Ts, followed by 4 to 10 characters (any characters), followed by 3 As.
+- **^chr([0-9A-Za-z])+** would match lines that begin with chromosome names, such as lines in a BED format file.
+- **(ACGT){1,5}** would match at least 1 "ACGT" and at most 5 "ACGT" consecutively.
+- **hsa|mmu** would match lines containing "hsa" or "mmu" (or both).
+
+-----
+
+**Regular Expression Syntax**
+
+This tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern describing a certain amount of text.
+
+- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for.
+- **^** matches the beginning of a string (but not an internal line).
+- **\\d** matches a digit, same as [0-9].
+- **\\D** matches a non-digit.
+- **\\s** matches a whitespace character.
+- **\\S** matches anything BUT a whitespace.
+- **\\t** matches a tab.
+- **\\w** matches an alphanumeric character ( A to Z, 0 to 9 and underscore )
+- **\\W** matches anything but an alphanumeric character.
+- **(** .. **)** groups a particular pattern.
+- **\\Z** matches the end of a string (but not an internal line).
+- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern.
+
+  - **{n}** The preceding item is matched exactly n times.
+  - **{n,}** The preceding item is matched n or more times. 
+  - **{n,m}** The preceding item is matched at least n times but not more than m times. 
+
+- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**.
+- **.** Matches any single character except a newline.
+- ***** The preceding item will be matched zero or more times.
+- **?** The preceding item is optional and matched at most once.
+- **+** The preceding item will be matched one or more times.
+- **^** has two meanings:
+
+  - matches the beginning of a line or string. 
+  - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets.
+
+- **$** matches the end of a line or string.
+- **\|** Separates alternate possibilities. 
+
+
+</help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/grep_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/grep_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,62 @@
+#!/bin/sh
+
+##
+## Galaxy wrapper for GREP command.
+##
+
+##
+## command line arguments:
+##   input_file
+##   output_file
+##   regex
+##   COLOR or NOCOLOR
+##   [other parameters passed on to grep]
+
+INPUT="$1"
+OUTPUT="$2"
+REGEX="$3"
+COLOR="$4"
+
+shift 4
+
+if [ -z "$COLOR" ]; then
+ echo usage: $0 INPUTFILE OUTPUTFILE REGEX COLOR\|NOCOLOR [other grep parameters] >&2
+ exit 1
+fi
+
+if [ ! -r "$INPUT" ]; then
+ echo "error: input file ($INPUT) not found!" >&2
+ exit 1
+fi
+
+# Messages printed to STDOUT will be displayed in the "INFO" field in the galaxy dataset.
+# This way the user can tell what the command was
+echo "grep" "$@" "$REGEX"
+
+if [ "$COLOR" == "COLOR" ]; then
+ #
+ # What the heck is going on here???
+ # 1. "GREP_COLORS" is an environment variable, telling GREP which ANSI colors to use.
+ # 2. "--colors=always" tells grep to actually use colors (according to the GREP_COLORS variable)
+ # 3. first sed command translates the ANSI color to a <FONT> tag with blue color (and a <B> tag, too)
+ # 4. second sed command translates the no-color ANSI command to a </FONT> tag (and a </B> tag, too)
+ # 5. the htmlize_pre script takes a text input and wraps it in <HTML><BODY><PRE> tags, making it a fixed-font HTML file.
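+ #
+ # For example, a match on "ACGT" comes out of grep wrapped in ANSI color
+ # escape sequences, and leaves this pipeline as
+ # <font color="blue"><b>ACGT</b></font> in the final HTML.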
+
+ GREP_COLORS="ms=31" grep --color=always "$@" -- "$REGEX" "$INPUT" | \
+ grep -v "^\[36m\[K--\[m\[K$" | \
+ sed -r 's/\[[0123456789;]+m\[K?/<font color="blue"><b>/g' | \
+ sed -r 's/\[m\[K?/<\/b><\/font>/g' | \
+ htmlize_pre.sh > "$OUTPUT"
+
+
+ if (( $? ));  then exit; fi
+
+elif [ "$COLOR" == "NOCOLOR" ]; then
+ grep "$@" -- "$REGEX" "$INPUT" | grep -v "^--$" > "$OUTPUT"
+ if (( $? ));  then exit; fi
+else
+ echo Error: third parameter must be "COLOR" or "NOCOLOR" >&2
+ exit 1
+fi
+
+exit 0
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/grep_wrapper_old.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/grep_wrapper_old.sh Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,62 @@
+#!/bin/sh
+
+##
+## Galaxy wrapper for GREP command.
+##
+
+##
+## command line arguments:
+##   input_file
+##   output_file
+##   regex
+##   COLOR or NOCOLOR
+##   [other parameters passed on to grep]
+
+INPUT="$1"
+OUTPUT="$2"
+REGEX="$3"
+COLOR="$4"
+
+shift 4
+
+if [ -z "$COLOR" ]; then
+ echo usage: $0 INPUTFILE OUTPUTFILE REGEX COLOR\|NOCOLOR [other grep parameters] >&2
+ exit 1
+fi
+
+if [ ! -r "$INPUT" ]; then
+ echo "error: input file ($INPUT) not found!" >&2
+ exit 1
+fi
+
+# Messages printed to STDOUT will be displayed in the "INFO" field in the galaxy dataset.
+# This way the user can tell what the command was
+echo "grep" "$@" "$REGEX"
+
+if [ "$COLOR" == "COLOR" ]; then
+ #
+ # What the heck is going on here???
+ # 1. "GREP_COLORS" is an environment variable, telling GREP which ANSI colors to use.
+ # 2. "--colors=always" tells grep to actually use colors (according to the GREP_COLORS variable)
+ # 3. first sed command translates the ANSI color to a <FONT> tag with blue color (and a <B> tag, too)
+ # 4. second sed command translates the no-color ANSI command to a </FONT> tag (and a </B> tag, too)
+ # 5. the htmlize_pre script takes a text input and wraps it in <HTML><BODY><PRE> tags, making it a fixed-font HTML file.
+
+ GREP_COLORS="ms=31" grep --color=always -P "$@" -- "$REGEX" "$INPUT" | \
+ grep -v "^\[36m\[K--\[m\[K$" | \
+ sed -r 's/\[[0123456789;]+m\[K?/<font color="blue"><b>/g' | \
+ sed -r 's/\[m\[K?/<\/b><\/font>/g' | \
+ htmlize_pre.sh > "$OUTPUT"
+
+
+ if (( $? ));  then exit; fi
+
+elif [ "$COLOR" == "NOCOLOR" ]; then
+ grep -P "$@" -- "$REGEX" "$INPUT" | grep -v "^--$" > "$OUTPUT"
+ if (( $? ));  then exit; fi
+else
+ echo Error: third parameter must be "COLOR" or "NOCOLOR" >&2
+ exit 1
+fi
+
+exit 0
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/join_tool.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/join_tool.sh Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+#
+# NOTE:
+#  This is a wrapper for GNU's join under galaxy
+#  not meant to be used from the command line (if you're using the command line, simply run 'join' directly...)
+#
+# All parameters must be supplied.
+# the join_tool.xml file takes care of that.
+
+JOINTYPE="$1"
+OUTPUT_FORMAT="$2"
+EMPTY_STRING="$3"
+DELIMITER="$4"
+IGNORE_CASE="$5"
+
+INPUT1="$6"
+COLUMN1="$7"
+INPUT2="$8"
+COLUMN2="$9"
+OUTPUT="${10}"
+
+if [ "$OUTPUT" == "" ]; then
+ echo "This script is part of galaxy. Don't run it manually.\n" >&2
+ exit 1;
+fi
+
+#This is a TAB hack for galaxy (which can't transfer a "\t" as a parameter)
+[ "$DELIMITER" == "tab" ] && DELIMITER="$(printf '\t')"
+
+#Remove spaces from the output format (if the user entered any)
+OUTPUT_FORMAT=${OUTPUT_FORMAT// /}
+[ "$OUTPUT_FORMAT" != "" ] && OUTPUT_FORMAT="-o $OUTPUT_FORMAT"
+
+echo join $OUTPUT_FORMAT -t "$DELIMITER" -e "$EMPTY_STRING" $IGNORE_CASE $JOINTYPE -1 "$COLUMN1" -2 "$COLUMN2" 
+#echo join $OUTPUT_FORMAT -t "$DELIMITER" -e "$EMPTY_STRING" $IGNORE_CASE $JOINTYPE -1 "$COLUMN1" -2 "$COLUMN2" "$INPUT1" "$INPUT2" \> "$OUTPUT" 
+join $OUTPUT_FORMAT -t "$DELIMITER" -e "$EMPTY_STRING" $IGNORE_CASE $JOINTYPE -1 "$COLUMN1" -2 "$COLUMN2" "$INPUT1" "$INPUT2" > "$OUTPUT" || exit 1
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/join_tool.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/join_tool.xml Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,54 @@
+<tool id="cshl_join_tool" name="join">
+  <description>two files</description>
+  <command interpreter="sh">join_tool.sh "$jointype" "$output_format" 
+   "$empty_string_filler" "$delimiter"
+ "$ignore_case"
+ "$input1" "$column1"
+ "$input2" "$column2"
+ "$output"
+  </command>
+  
+  <inputs>
+ <param format="txt" name="input1" type="data" label="1st file" />
+ <param name="column1" label="Column to use from 1st file" type="data_column" data_ref="input1" accept_default="true" />
+
+ <param format="txt" name="input2" type="data" label="2nd File" />
+ <param name="column2" label="Column to use from 2nd file" type="data_column" data_ref="input2" accept_default="true" />
+
+ <param name="jointype" type="select" label="Output lines appearing in">
+       <option value=" ">BOTH 1st &amp; 2nd file.</option>
+       <option value="-v 1">1st but not in 2nd file. [-v 1]</option>
+       <option value="-v 2">2nd but not in 1st file. [-v 2]</option>
+       <option value="-a 1">both 1st &amp; 2nd file, plus unpairable lines from 1st file. [-a 1]</option>
+       <option value="-a 2">both 1st &amp; 2nd file, plus unpairable lines from 2st file. [-a 2]</option>
+       <option value="-a 1 -a 2">All Lines [-a 1 -a 2]</option>
+ </param>
+
+     <param name="delimiter" type="select" label="field-separator [-t]">
+ <option value=",">comma (,)</option>
+ <option value=":">colons (:) </option>
+ <option value=" ">single space</option>
+ <option value=".">dot (.)</option>
+ <option value="-">dash (-)</option>
+ <option value="|">pipe (|)</option>
+ <option value="_">underscore (_)</option>
+ <option selected="True" value="tab">tab</option>
+     </param>
+
+ <param name="ignore_case" type="select" label="Case sensitivity">
+       <option value="">Case sensitive</option>
+       <option value="-i">Case INsensitive [-i]</option>
+ </param>
+
+ <param name="empty_string_filler" type="text" size="20" label="String replacement for empty fields [-e EMPTY]" help="Leave empty unless you know what you're doing. Use this when specifing output format" /> 
+
+ <param name="output_format" type="text" size="30" label="Output line format [-o FORMAT]" help="Leave empty unless you know what you're doing. Example: 1.1,2.1,2.1" /> 
+
+  </inputs>
+  <outputs>
+    <data name="output" format="input" metadata_source="input1" />
+  </outputs>
+  
+<help>
+</help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/remove_ending.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/remove_ending.sh Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,69 @@
+#!/bin/sh
+
+# Version 0.1 ,  15aug08
+# Written by Assaf Gordon (gordon@cshl.edu)
+#
+
+LINES="$1"
+INFILE="$2"
+OUTFILE="$3"
+
+if [ "$LINES" == "" ]; then
+ cat >&2 <<EOF 
+Remove Ending Lines
+
+Usage: $0 LINES [INFILE] [OUTFILE]
+
+   LINES - number of lines to remove from the end of the file
+   [INFILE] - input file (if not specified - defaults to STDIN)
+   [OUTFILE]- output file (if not specified - defaults to STDOUT)
+
+Input Example:
+
+#Chr Start End
+chr1 10 15
+chr1 40 20
+chr1 21 14
+total   3 chromosomes
+
+Removing 1 line (the last line) produces:
+
+#Chr Start End
+chr1 10 15
+chr1 40 20
+chr1 21 14
+
+Usage Example:
+   
+   \$ $0 1 < my_input_file.txt > my_output_file.txt
+
+EOF
+
+ exit 1
+fi
+
+#Validate line argument - remove non-digits characters
+LINES=${LINES//[^[:digit:]]/}
+
+#Make sure the LINES string isn't empty
+#(after the substitution above, it will either contain digits or be empty)
+if [ -z "$LINES" ]; then
+ echo "Error: bad line value (must be numeric)" >&2
+ exit 1
+fi
+
+# Use default (stdin/out) values if infile / outfile not specified
+[ -z "$INFILE" ] && INFILE="/dev/stdin"
+[ -z "$OUTFILE" ] && OUTFILE="/dev/stdout"
+
+#Make sure the input file (if specified) exists.
+if [ ! -r "$INFILE" ]; then
+ echo "Error: input file ($INFILE) not found!" >&2
+ exit 1
+fi
+
+
+# The "gunzip -f" trick allows
+# piping a file (gzip or plain text, real file name or "/dev/stdin") to sed 
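+# The sed expression keeps a sliding window of lines in the pattern space:
+# each cycle prints (P) and drops (D) the oldest buffered line, so the
+# final $LINES lines never reach the output.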
+gunzip -f <"$INFILE" | sed -n -e :a -e "1,${LINES}!{P;N;D;};N;ba" > "$OUTFILE"
+
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/remove_ending.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/remove_ending.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,43 @@
+<tool id="Remove ending" name="Remove ending">
+  <description>of a file</description>
+  <command interpreter="sh">remove_ending.sh $num_lines $input $out_file1</command>
+  <inputs>
+    <param name="num_lines" size="5" type="integer" value="1" label="Remove last" help="lines"/>
+    <param format="txt" name="input" type="data" label="from"/>
+  </inputs>
+  <tests>
+   <test>
+   <param name="input" value="remove_ending_input1.txt" />
+   <output name="out_file1" file="remove_ending_output1.txt" />
+   <param name="num_lines" value="2" />
+   </test>
+  </tests>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool removes a specified number of lines from the end of a dataset.
+
+-----
+
+**Example**
+
+Input File::
+
+    chr7  56632  56652   D17003_CTCF_R6  310  +
+    chr7  56736  56756   D17003_CTCF_R7  354  +
+    chr7  56761  56781   D17003_CTCF_R4  220  +
+    chr7  56772  56792   D17003_CTCF_R7  372  +
+    chr7  56775  56795   D17003_CTCF_R4  207  +
+
+After removing the last 2 lines the dataset will look like this::
+
+    chr7  56632  56652   D17003_CTCF_R6  310  +
+    chr7  56736  56756   D17003_CTCF_R7  354  +
+    chr7  56761  56781   D17003_CTCF_R4  220  +
+
+</help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/sed_tool.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/sed_tool.xml Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,92 @@
+<tool id="cshl_sed_tool" name="sed">
+  <description></description>
+  <!-- NOTE
+     'sandbox' is a patched SED program,
+   which blocks executing shell commands and file reading/writing.
+
+   Hopefully, it is safe enough to allow users to execute their own SED commands
+   -->
+  <command interpreter="sh">sed_wrapper.sh $silent $input $output '$url_paste'</command>
+  <inputs>
+    <param format="txt" name="input" type="data" label="File to process" />
+
+    <!-- Note: the parameter name MUST BE 'url_paste' -
+         This is a hack in the galaxy library (see ./lib/galaxy/util/__init__.py line 142)
+  If the name is 'url_paste' the string won't be sanitized, and all the non-alphanumeric characters 
+  will be passed to the shell script -->
+    <param name="url_paste" type="text" area="true" size="5x35" label="SED Program" help=""> 
+     <validator type="expression" message="Invalid Program!">value.find('\'')==-1</validator>
+    </param>
+
+    <param name="silent" type="select"  label="operation mode" help="(Same as 'sed -n', leave at 'normal' unless you know what you're doing)" > 
+      <option value="">normal</option>
+      <option value="-n">silent</option>
+    </param>
+
+  </inputs>
+  <outputs>
+    <data format="input" name="output" metadata_source="input" />
+  </outputs>
+<help>
+
+**What it does**
+
+This tool runs the unix **sed** command on the selected data file.
+
+.. class:: infomark
+
+**TIP:** This tool uses the **extended regular** expression syntax (same as running 'sed -r').
+
+
+
+**Further reading**
+
+- Short sed tutorial (http://www.linuxhowtos.org/System/sed_tutorial.htm)
+- Long sed tutorial (http://www.grymoire.com/Unix/Sed.html)
+- sed faq with good examples (http://sed.sourceforge.net/sedfaq.html)
+- sed cheat-sheet (http://www.catonmat.net/download/sed.stream.editor.cheat.sheet.pdf)
+- Collection of useful sed one-liners (http://student.northpark.edu/pemente/sed/sed1line.txt)
+
+-----
+
+**Sed commands**
+
+The most useful sed command is **s** (substitute).
+
+**Examples**
+
+- **s/hsa//**  will remove the first instance of 'hsa' in every line.
+- **s/hsa//g**  will remove all instances (because of the **g**) of 'hsa' in every line.
+- **s/A{4,}/--&amp;--/g**  will find sequences of 4 or more consecutive A's, and once found, will surround them with two dashes on each side. The **&amp;** marker is a placeholder for 'whatever matched the regular expression'.
+- **s/hsa-mir-([^ ]+)/short name: \\1 full name: &amp;/**  will find strings such as 'hsa-mir-43a' (the regular expression is 'hsa-mir-' followed by non-space characters) and will replace them with a string such as 'short name: 43a full name: hsa-mir-43a'. The **\\1** marker is a placeholder for 'whatever matched the first parenthesis' (similar to perl's **$1**).
+
+
+**sed's Regular Expression Syntax**
+
+This tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern describing a certain amount of text.
+
+- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for.
+- **^** matches the beginning of a string (but not an internal line).
+- **(** .. **)** groups a particular pattern.
+- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern.
+
+  - **{n}** The preceding item is matched exactly n times.
+  - **{n,}** The preceding item is matched n or more times. 
+  - **{n,m}** The preceding item is matched at least n times but not more than m times. 
+
+- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**.
+- **.** Matches any single character except a newline.
+- ***** The preceding item will be matched zero or more times.
+- **?** The preceding item is optional and matched at most once.
+- **+** The preceding item will be matched one or more times.
+- **^** has two meanings:
+
+  - matches the beginning of a line or string. 
+  - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets.
+
+- **$** matches the end of a line or string.
+- **\|** Separates alternate possibilities. 
+
+
+**Note**: SED uses extended regular expression syntax, not Perl syntax. **\\d**, **\\w**, **\\s** etc. are **not** supported.
+
+</help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/sed_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/sed_wrapper.sh Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+##
+## Galaxy wrapper for SED command
+##
+
+##
+## command line arguments:
+##   input_file
+##   output_file
+##   sed-program
+##   [other parameters passed on to sed]
+
+INPUT="$1"
+OUTPUT="$2"
+PROG="$3"
+
+shift 3
+
+if [ -z "$PROG" ]; then
+ echo usage: $0 INPUTFILE OUTPUTFILE SED-PROGRAM [other sed parameters] >&2
+ exit 1
+fi
+
+if [ ! -r "$INPUT" ]; then
+ echo "error: input file ($INPUT) not found!" >&2
+ exit 1
+fi
+
+# Messages printed to STDOUT will be displayed in the "INFO" field in the galaxy dataset.
+# This way the user can tell what the command was
+echo "sed" "$@" "$PROG"
+
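+# -r enables extended regular expressions; --sandbox comes from the patched
+# sed described in sed_tool.xml and blocks shell commands and file I/O.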
+sed -r --sandbox "$@" "$PROG" "$INPUT" > "$OUTPUT"
+if (( $? ));  then exit; fi
+
+exit 0
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/sort_tool.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/sort_tool.xml Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,134 @@
+<tool id="cshl_sort_tool" name="Sort">
+  <!-- 
+    note 1:
+   the 'version' sort (or natural order sort)
+   requires GNU Coreutils 7.1 or later
+
+ note 2:
+   for greater efficiency, the sort buffer size is set very large.
+   If your Galaxy server doesn't have that much memory (or the
+   sorts you run don't require it), you can decrease the buffer size
+   (the argument is "-S 2G")
+  -->
+  <command>sort -S 2G $unique 
+      #for $key in $sortkeys
+       '-k ${key.column},${key.column}${key.order}${key.style}'
+      #end for
+   $input > $out_file1
+  </command>
+
+  <inputs>
+ <param format="txt" name="input" type="data" label="Sort Query" />
+
+ <param name="unique" type="select" label="Output only unique values?">
+ <option value="">No</option>
+ <option value="-u">Yes</option>
+ </param>
+
+ <repeat name="sortkeys" title="sort key">
+     <param name="column" label="on column" type="data_column" data_ref="input" accept_default="true" />
+     <param name="order" type="select" display="radio" label="in">
+       <option value="r">Descending order</option>
+       <option value="">Ascending order</option>
+     </param>
+     <param name="style" type="select" display="radio" label="Flavor">
+       <option value="n">Fast numeric sort ([-n])</option>
+       <option value="g">General numeric sort ( scientific notation [-g])</option>
+       <option value="V">Natural/Version sort ([-V]) </option>
+       <option value="">Alphabetical sort</option>
+     </param>
+ </repeat>
+  </inputs>
+  <tests>
+   <test>
+   <!-- Sort Descending numerical order,
+        with scientific notation -->
+   <param name="input" value="unix_sort_input1.txt" />
+   <output name="output" file="unix_sort_output1.txt" />
+   <param name="unique" value="No" />
+   <param name="column" value="2" />
+   <param name="order"  value="r" />
+   <param name="style"  value="g" />
+   </test>
+   <test>
+   <!-- Sort Ascending numerical order,
+   with scientific notation - outputing unique values only 
+
+   The catch:
+    chr15 appears twice, with the same value (0.0314 and 3.14e-2).
+ In the output, it should appear only once because of the unique flag
+   -->
+   <param name="input" value="unix_sort_input1.txt" />
+   <output name="output" file="unix_sort_output2.txt" />
+   <param name="unique" value="Yes" />
+   <param name="column" value="2" />
+   <param name="order"  value="" />
+   <param name="style"  value="g" />
+   </test>
+   <test>
+   <!-- Sort Ascending 'natural' order -->
+   <param name="input" value="unix_sort_input1.txt" />
+   <output name="output" file="unix_sort_output3.txt" />
+   <param name="unique" value="No" />
+   <param name="column" value="1" />
+   <param name="order"  value="" />
+   <param name="style"  value="V" />
+   </test>
+  </tests>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input"/>
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool runs the unix **sort** command on the selected data file.
+
+-----
+
+**Sorting Styles**
+
+* **Fast Numeric**: sort by numeric values. Handles integer values (e.g. 43, 134) and decimal-point values (e.g. 3.14). *Does not* handle scientific notation (e.g. -2.32e2).
+* **General Numeric**: sort by numeric values. Handles all numeric notations (including scientific notation). Slower than *fast numeric*, so use only when necessary.
+* **Natural Sort**: Sort in 'natural' order (natural to humans, not to computers). See example below.
+* **Alphabetical sort**: Sort in strict alphabetical order. See example below.
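+
+For example, sorting this column of values in ascending order::
+
+    5
+    3.14e-2
+    0.2
+
+*General numeric* sort gives **3.14e-2, 0.2, 5**, while *fast numeric* reads '3.14e-2' as 3.14 and gives **0.2, 3.14e-2, 5**.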
+
+
+
+
+**Sorting Examples**
+
+Given the following list::
+
+    chr4
+    chr13
+    chr1
+    chr10
+    chr20
+    chr2
+
+**Alphabetical sort** would produce the following sorted list::
+
+    chr1
+    chr10
+    chr13
+    chr2
+    chr20
+    chr4
+
+**Natural Sort** would produce the following sorted list::
+
+    chr1
+    chr2
+    chr4
+    chr10
+    chr13
+    chr20
+
+
+.. class:: infomark
+
+If you're planning to use the file with another tool that expects sorted files (such as *join*), you should use the **Alphabetical sort**, not the **Natural Sort**. Natural sort order is easier for humans, but is unnatural for computer programs.
+
+  </help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/uniq_tool.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/uniq_tool.xml Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,25 @@
+<tool id="cshl_uniq_tool" name="uniq">
+  <command>
+   uniq -f $skipfields $count $repeated $ignorecase $uniqueonly $input $output
+  </command>
+
+  <inputs>
+ <param format="txt" name="input" type="data" label="file to scan for unique values" />
+
+ <param name="count" type="boolean" label="count [-c]" help="prefix lines by the number of occurrences" truevalue="-c" falsevalue="" />
+
+ <param name="repeated" type="boolean" label="repeated [-d]" help="only print duplicate lines" truevalue="-d" falsevalue="" />
+
+ <param name="ignorecase" type="boolean" label="ignore case [-i]" help="ignore differences in case when comparing" truevalue="-i" falsevalue="" />
+
+ <param name="uniqueonly" type="boolean" label="unique only [-u]" help="only print unique lines" truevalue="-u" falsevalue="" />
+
+ <param name="skipfields" type="integer" label="skip fields [-f]" help="avoind comparing the first N fields. (use zero to start from the first field)" size="2" value="0" />
+  </inputs>
+
+  <outputs>
+    <data format="input" name="output" metadata_source="input"/>
+  </outputs>
+  <help>
+  </help>
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/word_list_grep.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/word_list_grep.pl Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,182 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Getopt::Std;
+
+sub parse_command_line();
+sub load_word_list();
+sub compile_regex(@);
+sub usage();
+
+my $word_list_file;
+my $input_file ;
+my $output_file;
+my $find_complete_words ;
+my $find_inverse; 
+my $find_in_specific_column ;
+my $find_case_insensitive ;
+my $skip_first_line ;
+
+
+##
+## Program Start
+##
+usage() if @ARGV==0;
+parse_command_line();
+
+my @words = load_word_list();
+
+my $regex = compile_regex(@words);
+
+# Allow first line to pass without filtering?
+if ( $skip_first_line ) {
+ my $line = <$input_file>;
+ print $output_file $line ;
+}
+
+
+##
+## Main loop
+##
+while ( <$input_file> ) {
+ my $target = $_;
+
+
+ # If searching in a specific column (and not in the entire line)
+ # extract the content of that one column
+ if ( $find_in_specific_column ) {
+ my @columns = split ;
+
+ #not enough columns in this line - skip it
+ next if ( @columns < $find_in_specific_column ) ;
+
+ $target = $columns [ $find_in_specific_column - 1 ] ;
+ }
+
+ # Match ?
+ if ( ($target =~ $regex) ^ ($find_inverse) ) {
+ print $output_file $_ ;
+ }
+}
+
+close $input_file;
+close $output_file;
+
+##
+## Program end
+##
+
+
+sub parse_command_line()
+{
+ my %opts ;
+ getopts('siwvc:o:', \%opts) or die "$0: Invalid option specified\n";
+
+ die "$0: missing word-list file name\n" if (@ARGV==0); 
+
+ $word_list_file = $ARGV[0];
+ die "$0: Word-list file '$word_list_file' not found\n" unless -e $word_list_file ;
+
+ $find_complete_words = ( exists $opts{w} ) ;
+ $find_inverse = ( exists $opts{v} ) ;
+ $find_case_insensitive = ( exists $opts{i} ) ;
+ $skip_first_line = ( exists $opts{s} ) ;
+
+
+ # Search in specific column ?
+ if ( defined $opts{c} ) {
+ $find_in_specific_column = $opts{c};
+
+ die "$0: invalid column number ($find_in_specific_column).\n"
+ unless $find_in_specific_column =~ /^\d+$/ ;
+
+ die "$0: invalid column number ($find_in_specific_column).\n"
+ if $find_in_specific_column <= 0; 
+ }
+ else {
+ $find_in_specific_column = 0 ;
+ }
+
+
+ # Output File specified (instead of STDOUT) ?
+ if ( defined $opts{o} ) {
+ my $filename = $opts{o};
+ open $output_file, ">$filename" or die "$0: Failed to create output file '$filename': $!\n" ;
+ } else {
+ $output_file = *STDOUT ;
+ }
+
+
+
+ # Input file Specified (instead of STDIN) ?
+ if ( @ARGV>1 ) {
+ my $filename = $ARGV[1];
+ open $input_file, "<$filename" or die "$0: Failed to open input file '$filename': $!\n" ;
+ } else {
+ $input_file = *STDIN;
+ }
+}
+
+sub load_word_list()
+{
+ open WORDLIST, "<$word_list_file" or die "$0: Failed to open word-list file '$word_list_file'\n" ;
+ my @words ;
+ while ( <WORDLIST> ) {
+ chomp ;
+ s/^\s+//;
+ s/\s+$//;
+ next if length==0;
+ push @words,quotemeta $_;
+ }
+ close WORDLIST;
+
+ die "$0: Error: word-list file '$word_list_file' is empty!\n" 
+        unless @words;
+
+ return @words;
+}
+
+sub compile_regex(@)
+{
+ my @words = @_;
+
+ my $regex_string = join ( '|', @words ) ;
+ if ( $find_complete_words ) {
+ $regex_string = "\\b($regex_string)\\b"; 
+ }
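+ # e.g. a word list containing PROTOP and STALKER yields 'PROTOP|STALKER',
+ # or '\b(PROTOP|STALKER)\b' when -w (whole words) was given.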
+ my $regex;
+
+ if ( $find_case_insensitive ) {
+ $regex = qr/$regex_string/i ;
+ } else {
+ $regex = qr/$regex_string/;
+ }
+
+ return $regex;
+}
+
+sub usage()
+{
+print <<EOF;
+
+Word-List Grep
+Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu )
+
+Usage: $0 [-o OUTPUT] [-s] [-w] [-i] [-c N] [-v] WORD-LIST-FILE [INPUT-FILE]
+
+   -s   - do not filter first line - always output the first line from the input file.
+   -w   - search for complete words (not partial sub-strings).
+   -i   - case insensitive search.
+   -v   - inverse - output lines NOT matching the word list.
+   -c N - check only column N, instead of entire line (line split by whitespace).
+   -o OUT - specify output file (default = STDOUT).
+   WORD-LIST-FILE - file containing one word per line. These will be used
+          for the search. 
+   INPUT-FILE - (optional) read from file (default = from STDIN).
+
+
+
+EOF
+
+ exit;
+}
b
diff -r 000000000000 -r 9071e359b9a3 tools/unix_tools/word_list_grep.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/unix_tools/word_list_grep.xml Fri Mar 09 19:37:19 2012 -0500
b
@@ -0,0 +1,106 @@
+<tool id="cshl_word_list_grep" name="Select lines">
+<description>by word list</description>
+<command interpreter="perl">
+ word_list_grep.pl 
+ #if $searchwhere.choice == "column":
+ -c $searchwhere.column
+ #end if
+ -o $output 
+ $inverse 
+ $caseinsensitive 
+ $wholewords 
+ $skip_first_line
+ $wordlist 
+ $input
+</command>
+
+<inputs>
+ <param name="input" format="txt" type="data" label="input file" />
+ <param name="wordlist" format="txt" type="data" label="word list file" />
+
+
+ <param name="inverse" type="boolean" checked="false" truevalue="-v" falsevalue="" label="Inverse filter" 
+ help="Report lines NOT matching the word list" />
+
+ <param name="caseinsensitive" type="boolean" checked="false" truevalue="-i" falsevalue="" label="Case-Insensitive search" 
+ help="" />
+
+ <param name="wholewords" type="boolean" checked="false" truevalue="-w" falsevalue="" label="find whole-words" 
+ help="ignore partial matches (e.g. 'apple' will not match 'snapple') " />
+
+ <param name="skip_first_line" type="boolean" checked="false" truevalue="-s" falsevalue="" label="Ignore first line" 
+ help="Select this option if the first line contains column headers. First line will not be filtered. " />
+
+ <conditional name="searchwhere">
+ <param name="choice" type="select" label="Search words in">
+ <option value="line" selected="true">entire line</option>
+ <option value="column">specific column</option>
+ </param>
+
+ <when value="line">
+ </when>
+
+ <when value="column">
+     <param name="column" label="in column" type="data_column" data_ref="input" accept_default="true" />
+ </when>
+ </conditional>
+
+</inputs>
+
+<outputs>
+ <data name="output" format="input" metadata_source="input" />
+</outputs>
+
+<help>
+**What it does**
+
+This tool selects lines that match words from a word list.
+
+--------
+
+**Example**
+
+Input file (UCSC's rmsk track from dm3)::
+
+    585 787 66 241 11 chrXHet 2860 3009 -201103 - DNAREP1_DM LINE Penelope 0 594 435 1
+    585 1383 78 220 0 chrXHet 3012 3320 -200792 - DNAREP1_DM LINE Penelope -217 377 2 1
+    585 244 103 0 0 chrXHet 3737 3776 -200336 - DNAREP1_DM LINE Penelope -555 39 1 1
+    585 2270 83 144 0 chrXHet 7907 8426 -195686 + DNAREP1_DM LINE Penelope 1 594 0 1
+    585 617 189 73 68 chrXHet 10466 10671 -193441 + DNAREP1_DM LINE Penelope 368 573 -21 1
+    586 1122 71 185 0 chrXHet 173138 173322 -30790 - PROTOP DNA P -4033 447 230 1
+    ...
+    ...
+
+
+Word list file::
+
+  STALKER
+  PROTOP
+
+
+
+Output sequence (searching in column 11)::
+
+    586 1122 71 185 0 chrXHet 173138 173322 -30790         - PROTOP DNA P -4033 447 230 1
+    586 228 162 0 0 chrXHet 181026 181063 -23049         + STALKER4_I LTR Gypsy 9 45 -6485 1
+    585 245 105 26 0 chr3R 41609 41647 -27863406 + PROTOP_B DNA P 507 545 -608 4
+    586 238 91 0 0 chr3R 140224 140257 -27764796 - PROTOP_B DNA P -617 536 504 4
+    ...
+    ...
+
+(With **find whole-words** not selected, *PROTOP* matched *PROTOP_B* and *STALKER* matched *STALKER4_I*.)
+
+
+
+
+Output sequence (searching in column 11, and whole-words only)::
+
+    586 670 90 38 57 chrXHet 168356 168462 -35650 - PROTOP DNA P -459 4021 3918 1
+    586 413 139 70 0 chrXHet 168462 168548 -35564 - PROTOP DNA P -3406 1074 983 1
+    586 1122 71 185 0 chrXHet 173138 173322 -30790 - PROTOP DNA P -4033 447 230 1
+    ...
+    ...
+
+</help>
+
+</tool>
b
diff -r 000000000000 -r 9071e359b9a3 tools/validation/fix_errors.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/validation/fix_errors.py Fri Mar 09 19:37:19 2012 -0500
[
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+"""
+Fix errors in a dataset.
+For now, only removing erroneous lines is supported.
+
+usage: %prog input errorsfile output
+    -x, --ext: dataset extension (type)
+    -m, --methods=N: comma separated list of repair methods
+"""
+
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+from galaxy import util
+
+def main():
+    options, args = doc_optparse.parse( __doc__ )
+    methods = []
+    try:
+        if options.methods: methods = options.methods.split(",")
+    except:
+        pass
+    
+    ext = options.ext
+
+    in_file = open(args[0], "r")
+    error_file = open(args[1], "r")
+    out_file = open(args[2], "w")
+
+    # string_to_object errors
+    error_list = util.string_to_object(error_file.read())
+    # index by error type and then by line number
+    error_lines = {}
+    error_types = {}
+    for error in error_list:
+        if error.linenum:
+            if error.linenum in error_lines:
+                error_lines[error.linenum].append(error)
+            else:
+                error_lines[error.linenum] = [error]
+        error_type = error.__class__.__name__
+        if error_type in error_types:
+            error_types[error_type].append(error)
+        else:
+            error_types[error_type] = [error]
+
+    linenum = 0
+    for line in in_file:
+        linenum += 1
+        # write the line, unless the "lines" repair method removed it
+        if "lines" in methods:
+            if linenum in error_lines:
+                line = None
+            # other processing here?
+        if line:
+            out_file.write(line)
+    
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/validation/fix_errors.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/validation/fix_errors.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<tool name="Fix errors" id="fix_errors" hidden="true">
+
+ <description>in data validation</description>
+
+ <command interpreter="python">
+ fix_errors.py $input $errorsfile $output -x $ext --methods=$methods
+ </command>
+
+ <inputs>
+ <param name="errorsfile" type="text" />
+ <param type="data" name="input" />
+ <param name="ext" type="text" />
+ <param name="methods" type="text" />
+ </inputs>
+
+ <code file="fix_errors_code.py"/>
+
+ <outputs>
+ <data name="output" format="input" metadata_source="input" />
+ </outputs>
+
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/validation/fix_errors_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/validation/fix_errors_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,49 @@
+# runs after the job (and after the default post-filter)
+
+import pkg_resources
+pkg_resources.require( "bx-python" )
+
+from galaxy import datatypes, jobs, util
+# needed to reference ParseError types, is this bad?
+from bx.tabular.io import *
+from bx.intervals.io import *
+import sys, tempfile, os
+
+def validate(incoming):
+    """Validator"""
+    #raise Exception, 'not quite right'
+    pass
+
+def exec_before_job( app, inp_data, out_data, param_dict, tool=None):
+    """Build a temp file with errors in it"""
+    errors = []
+    for name, data in inp_data.items():
+        validation_errors = data.validation_errors
+        for error in validation_errors:
+            # build dummy class
+            try:
+                temp = eval(error.err_type)()
+            except:
+                temp = object()
+            # stuff attributes
+            temp.__dict__ = util.string_to_object( error.attributes )
+            errors.append(temp)
+    # There *should* only be 1 input, so we assume there is and continue
+    # base64 pickle
+    errors_str = util.object_to_string( errors )
+    # write the encoded errors to a temp file the job can read
+    database_tmp = "./database/tmp" # globally visible path
+    error_file = tempfile.NamedTemporaryFile(mode="w", dir=database_tmp, suffix=".b64")
+    error_file_name = error_file.name
+    error_file.close()
+    error_file = open(error_file_name, "w")
+    error_file.write(errors_str)
+    error_file.close()
+    param_dict["errorsfile"] = error_file_name
+    
+    
+def exec_after_process( app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None):
+    # in a perfect world, changes to param_dict would persist
+    # for now, unlink from tool
+    # os.unlink(param_dict["errorsfile"])
+    pass
diff -r 000000000000 -r 9071e359b9a3 tools/validation/validate.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/validation/validate.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+
+"""
+Validate a dataset based on the extension and metadata passed in on the
+command line.  Outputs a base64-encoded representation of the exceptions.
+
+usage: %prog input output
+    -m, --metadata=N: base64 pickled metadata
+    -x, --ext=N: extension as understood by galaxy
+"""
+
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+from galaxy import model
+from fileinput import FileInput
+from galaxy import util
+
+def main():
+    options, args = doc_optparse.parse( __doc__ )
+
+    try:
+        extension = options.ext
+    except:
+        doc_optparse.exception()
+
+    # create datatype
+    data = model.Dataset( extension=extension, id=int( args[0] ) )
+    data.file_path = "/home/ian/trunk/database/files/"
+    
+    if options.metadata:
+        data.metadata = util.string_to_object( options.metadata )
+
+    errors = data.datatype.validate( data )
+    print util.object_to_string(errors)
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/annotate.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/annotate.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,163 @@
+#!/usr/bin/python
+
+import os.path
+import sys
+import optparse
+
+import vcfClass
+from vcfClass import *
+
+import tools
+from tools import *
+
+# Check that the reference and alternate in the dbsnp vcf file match those
+# from the input vcf file.
+def checkRefAlt(vcfRef, vcfAlt, dbsnpRef, dbsnpAlt, ref, position, annotation):
+  text = "WARNING: ref and alt alleles differ between vcf and " + annotation + " " + ref + ":" + str(position) + " vcf: " + \
+         vcfRef + "/" + vcfAlt + ", dbsnp: " + dbsnpRef + "/" + dbsnpAlt
+
+  allelesAgree = True
+  if vcfRef.lower() != dbsnpRef.lower():
+    if vcfRef.lower() != dbsnpAlt.lower():
+      #print >> sys.stderr, text
+      allelesAgree = False
+  else:
+    if vcfAlt.lower() != dbsnpAlt.lower():
+      #print >> sys.stderr, text
+      allelesAgree = False
+
+  return allelesAgree
+
+# Annotate the vcf file with records from the dbsnp/hapmap vcf file.
+# It is assumed that the two files are sorted by genomic coordinates
+# and the reference sequences are in the same order.
+def annotateVcf(v, d, outputFile, annotation):
+  success1 = v.getRecord()
+  success2 = d.getRecord()
+  currentReferenceSequence = v.referenceSequence
+
+# Finish when the end of the first file has been reached.
+  while success1:
+
+# If the end of the dbsnp vcf file is reached, write out the
+# remaining records from the vcf file.
+    if not success2:
+      outputFile.write(v.record)
+      success1 = v.getRecord()
+
+    if v.referenceSequence == d.referenceSequence and v.referenceSequence == currentReferenceSequence:
+      if v.position == d.position:
+        allelesAgree = checkRefAlt(v.ref, v.alt, d.ref, d.alt, v.referenceSequence, v.position, annotation)
+        if annotation == "dbsnp": v.rsid = d.getDbsnpInfo()
+        elif annotation == "hapmap":
+          if allelesAgree: v.info += ";HM3"
+          else: v.info += ";HM3A"
+        record = v.buildRecord(False)
+        outputFile.write(record)
+
+        success1 = v.getRecord()
+        success2 = d.getRecord()
+      elif d.position > v.position: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile)
+      elif v.position > d.position: success2 = d.parseVcf(v.referenceSequence, v.position, False, None)
+    else:
+      if v.referenceSequence == currentReferenceSequence: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile)
+      elif d.referenceSequence == currentReferenceSequence: success2 = d.parseVcf(v.referenceSequence, v.position, False, None)
+
+# If the last record for a reference sequence is the same for both vcf
+# files, they will both have referenceSequences different from the
+# current reference sequence.  Change the reference sequence to reflect
+# this and proceed.
+      else:
+        if v.referenceSequence != d.referenceSequence:
+          print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different."
+          print >> sys.stderr, "Check that both files contain records for the following reference sequences:"
+          print >> sys.stderr, "\t", v.referenceSequence, " and ", d.referenceSequence
+          exit(1)
+      currentReferenceSequence = v.referenceSequence
+
+def main():
+
+# Parse the command line options
+  usage = "Usage: vcfPytools.py annotate [options]"
+  parser = optparse.OptionParser(usage = usage)
+  parser.add_option("-i", "--in",
+                    action="store", type="string",
+                    dest="vcfFile", help="input vcf files")
+  parser.add_option("-d", "--dbsnp",
+                    action="store", type="string",
+                    dest="dbsnpFile", help="input dbsnp vcf file")
+  parser.add_option("-m", "--hapmap",
+                    action="store", type="string",
+                    dest="hapmapFile", help="input hapmap vcf file")
+  parser.add_option("-o", "--out",
+                    action="store", type="string",
+                    dest="output", help="output vcf file")
+
+  (options, args) = parser.parse_args()
+
+# Check that an input vcf file is given.
+  if options.vcfFile == None:
+    parser.print_help()
+    print >> sys.stderr, "\nInput vcf file (--in, -i) is required for dbsnp annotation."
+    exit(1)
+
+# Check that either a hapmap or a dbsnp vcf file is included.
+  if options.dbsnpFile == None and options.hapmapFile == None:
+    parser.print_help()
+    print >> sys.stderr, "\nA dbSNP or hapmap vcf file is required (--dbsnp, -d, --hapmap, -m)."
+    exit(1)
+  elif options.dbsnpFile != None and options.hapmapFile != None:
+    parser.print_help()
+    print >> sys.stderr, "\nEither a dbSNP or a hapmap vcf file is required, not both (--dbsnp, -d, --hapmap, -m)."
+    exit(1)
+
+# Set the output file to stdout if no output file was specified.
+  outputFile, writeOut = setOutput(options.output) # tools.py
+
+  v = vcf() # Define vcf object.
+  d = vcf() # Define dbsnp/hapmap vcf object.
+  if options.dbsnpFile:
+    d.dbsnpVcf = True
+    annotationFile = options.dbsnpFile
+    annotation = "dbsnp"
+  elif options.hapmapFile:
+    d.hapmapVcf = True
+    annotationFile = options.hapmapFile
+    annotation = "hapmap"
+
+# Open the vcf files.
+  v.openVcf(options.vcfFile)
+  d.openVcf(annotationFile)
+
+# Read in the header information.
+  v.parseHeader(options.vcfFile, writeOut)
+  d.parseHeader(annotationFile, writeOut)
+
+# Add an extra line to the vcf header to indicate the file used for
+# performing dbsnp annotation.
+  taskDescriptor = "##vcfPytools=annotated vcf file with "
+  if options.dbsnpFile: taskDescriptor += "dbSNP file " + options.dbsnpFile
+  elif options.hapmapFile:
+    taskDescriptor += "hapmap file " + options.hapmapFile
+    v.infoHeaderString["HM3"] = "##INFO=<ID=HM3,Number=0,Type=Flag,Description=\"Hapmap3.2 membership determined from file " + \
+                                options.hapmapFile + "\">"
+    v.infoHeaderString["HM3A"] = "##INFO=<ID=HM3A,Number=0,Type=Flag,Description=\"Hapmap3.2 membership (with different alleles)" + \
+                                 ", determined from file " + options.hapmapFile + "\">"
+  writeHeader(outputFile, v, False, taskDescriptor) # tools.py
+
+# Annotate the vcf file.
+  annotateVcf(v, d, outputFile, annotation)
+
+# Check that the input files had the same list of reference sequences.
+# If not, it is possible that there were some problems.
+  checkReferenceSequenceLists(v.referenceSequenceList, d.referenceSequenceList) # tools.py
+
+# Close the vcf files.
+  v.closeVcf(options.vcfFile)
+  d.closeVcf(annotationFile)
+
+# End the program.
+  return 0
+
+if __name__ == "__main__":
+  main()
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/annotate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/annotate.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,62 @@
+<tool id="vcf_annotate" name="Annotate" version="1.0.0">
+  <description>a VCF file (dbSNP, hapmap)</description>
+  <command interpreter="python">
+    vcfPytools.py
+      annotate 
+      --in=$input1
+      #if $annotation_options.annotate == "dbsnp"
+      --dbsnp=$input2
+      #elif $annotation_options.annotate == "hapmap"
+      --hapmap=$input2
+      #end if
+      --out=$output1
+  </command>
+  <inputs>
+    <param name="input1" label="VCF file to annotate" type="data" format="vcf" />
+    <conditional name="annotation_options">
+      <param name="annotate" type="select" label="annotation source">
+        <option value="dbsnp">dbSNP vcf file</option>
+        <option value="hapmap">hapmap vcf file</option>
+      </param>
+      <when value="dbsnp">
+        <param name="input2" label="dbSNP vcf file" type="data" format="vcf" help="This option will annotate the vcf file with dbSNP rsid values.  The input dbSNP file must also be in vcf v4.0 format.  Only dbSNP entries with VC=SNP are included."/>
+      </when>
+      <when value="hapmap">
+        <param name="input2" label="hapmap vcf file" type="data" format="vcf" help="This option will annotate the vcf file info string to include HM3 if the record is included in hapmap.  If the ref/alt values do not match the hapmap file, the info string will be populated with HM3A."/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="vcf" name="output1" label="${tool.name} ${on_string}" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="test.small.vcf" ftype="vcf" />
+      <param name="annotate" value="dbsnp" />
+      <param name="input2" value="dbsnp.small.vcf" ftype="vcf" />
+      <output name="output" file="test_annotated_dbsnp.vcf" lines_diff="6" ftype="vcf" />
+    </test>
+    <test>
+      <param name="input1" value="test.small.vcf" ftype="vcf" />
+      <param name="annotate" value="hapmap" />
+      <param name="input2" value="hapmap.small.vcf" ftype="vcf" />
+      <output name="output" file="test_annotated_hapmap.vcf" lines_diff="6" ftype="vcf" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool uses vcfPytools_' annotate command to annotate a VCF file
+
+.. _vcfPytools: https://github.com/AlistairNWard/vcfPytools
+
+Currently, either a hapmap or a dbsnp file should be provided, not both.
+
+dbSNP option will annotate the VCF file with dbSNP rsid values.  The input dbSNP file must also be in VCF v4.0 format.  Only dbSNP entries with VC=SNP are included.
+
+hapmap option will annotate the VCF file info string to include HM3 if the record is included in hapmap.  If the ref/alt values do not match the hapmap file, the info string will be populated with HM3A.
+
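+A minimal sketch (plain Python, mirroring the hapmap branch in annotate.py) of
+how a hapmap record at the same position updates the info string::
+
+  allelesAgree = True      # do ref/alt match the hapmap record?
+  info = "NS=3;DP=14"      # illustrative info string
+  if allelesAgree: info += ";HM3"
+  else: info += ";HM3A"    # position matches but alleles differ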
+
+  </help>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/bedClass.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/bedClass.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,80 @@
+#!/usr/bin/python
+
+import os.path
+import sys
+
+class bed:
+  def __init__(self):
+    self.numberTargets = 0
+    self.referenceSequences = {}
+    self.referenceSequenceList = []
+
+  def openBed(self, filename):
+    if filename == "stdin": self.filehandle = sys.stdin
+    else:
+      try: self.filehandle = open(filename,"r")
+      except IOError:
+        print >> sys.stderr, "Failed to find file: ",filename
+        exit(1)
+
+# Get a bed record.
+  def getRecord(self):
+    self.record = self.filehandle.readline()
+    if not self.record: return False
+
+    self.numberTargets = self.numberTargets + 1
+    self.ref = ""
+    self.start = 0
+    self.end = 0
+
+# bed file should be 0-based, half-open, so the start coordinate
+# must be that in the bed file plus one.
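+# For example, the bed line "chr1  99  200" describes the 1-based,
+# closed interval chr1:100..200.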
+    entries = self.record.rstrip("\n").split("\t")
+    self.referenceSequence = entries[0]
+
+# Add the reference sequence to the dictionary.  If it didn't previously
+# exist append the reference sequence to the end of the list as well. 
+# This ensures that the order in which the reference sequences appeared
+# in the header can be preserved.
+    if self.referenceSequence not in self.referenceSequences:
+      self.referenceSequences[self.referenceSequence] = True
+      self.referenceSequenceList.append(self.referenceSequence)
+
+    try: self.start = int(entries[1]) + 1
+    except:
+      text = "start position is not an integer"
+      self.generalError(text, "start", entries[1])
+
+    try: self.end = int(entries[2])
+    except:
+      text = "end position is not an integer"
+      self.generalError(text, "end", entries[2])
+
+# Check that the record is a valid interval.
+    if self.end - self.start < 0:
+      print >> sys.stderr, "Invalid target interval:\n\t", self.record
+      exit(1)
+
+    return True
+
+# Parse through the bed file until the correct reference sequence is
+# encountered and the end position is greater than or equal to that requested.
+  def parseBed(self, referenceSequence, position):
+    success = True
+    if self.referenceSequence != referenceSequence:
+      while self.referenceSequence != referenceSequence and success: success = self.getRecord()
+
+    while self.referenceSequence == referenceSequence and self.end < position and success: success = self.getRecord()
+
+    return success
+
+# Close the bed file.
+  def closeBed(self, filename):
+    self.filehandle.close()
+
+# Define error messages for different handled errors.
+  def generalError(self, text, field, fieldValue):
+    print >> sys.stderr, "\nError encountered when attempting to read:"
+    if field != "": print >> sys.stderr, "\t", field, ":             ", fieldValue
+    print >> sys.stderr,  "\n", text
+    exit(1)
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/extract.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/extract.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,155 @@
+#!/usr/bin/python
+
+import os.path
+import sys
+import optparse
+
+import vcfClass
+from vcfClass import *
+
+import tools
+from tools import *
+
+def main():
+
+# Parse the command line options
+  usage = "Usage: vcfPytools.py extract [options]"
+  parser = optparse.OptionParser(usage = usage)
+  parser.add_option("-i", "--in",
+                    action="store", type="string",
+                    dest="vcfFile", help="input vcf file (stdin for piped vcf)")
+  parser.add_option("-o", "--out",
+                    action="store", type="string",
+                    dest="output", help="output vcf file")
+  parser.add_option("-s", "--reference-sequence",
+                    action="store", type="string",
+                    dest="referenceSequence", help="extract records from this reference sequence")
+  parser.add_option("-r", "--region",
+                    action="store", type="string",
+                    dest="region", help="extract records from this region")
+  parser.add_option("-q", "--keep-quality",
+                    action="append", type="string", nargs=2,
+                    dest="keepQuality", help="keep records containing this quality")
+  parser.add_option("-k", "--keep-info",
+                    action="append", type="string",
+                    dest="infoKeep", help="keep records containing this info field")
+  parser.add_option("-d", "--discard-info",
+                    action="append", type="string",
+                    dest="infoDiscard", help="discard records containing this info field")
+  parser.add_option("-p", "--pass-filter",
+                    action="store_true", default=False,
+                    dest="passFilter", help="discard records whose filter field is not PASS")
+
+  (options, args) = parser.parse_args()
+
+# Check that a vcf file is given.
+  if options.vcfFile == None:
+    parser.print_help()
+    print >> sys.stderr, "\nInput vcf file (--in, -i) is required."
+    exit(1)
+
+# Check that either a reference sequence or a region is specified
+# (unless filtering on info fields, quality or the filter field),
+# but not both.
+  if not options.infoKeep and not options.infoDiscard and not options.passFilter and not options.keepQuality:
+    if not options.referenceSequence and not options.region:
+      parser.print_help()
+      print >> sys.stderr, "\nA region (--region, -r) or reference sequence (--reference-sequence, -s) must be supplied"
+      print >> sys.stderr, "if not extracting records based on info strings."
+      exit(1)
+  if options.referenceSequence and options.region:
+    parser.print_help()
+    print >> sys.stderr, "\nEither a region (--region, -r) or reference sequence (--reference-sequence, -s) can be supplied, but not both."
+    exit(1)
+
+# If a region was supplied, check the format.
+  if options.region:
+    if options.region.find(":") == -1 or options.region.find("..") == -1:
+      print >> sys.stderr, "\nIncorrect format for region string.  Required: ref:start..end."
+      exit(1)
+    regionList = options.region.split(":",1)
+    referenceSequence = regionList[0]
+    try: start = int(regionList[1].split("..")[0])
+    except ValueError:
+      print >> sys.stderr, "region start coordinate is not an integer"
+      exit(1)
+    try: end = int(regionList[1].split("..")[1])
+    except ValueError:
+      print >> sys.stderr, "region end coordinate is not an integer"
+      exit(1)
+
+# Ensure that discard-info and keep-info haven't both been defined.
+  if options.infoKeep and options.infoDiscard:
+    print >> sys.stderr, "Cannot specify fields to keep and discard simultaneously."
+    exit(1)
+
+# If the --keep-quality argument is used, check that a value and a logical
+# argument are supplied and that the logical argument is valid.
+
+  if options.keepQuality:
+    for value, logic in options.keepQuality:
+      if logic != "eq" and logic != "lt" and logic != "le" and logic != "gt" and logic != "ge":
+        print >> sys.stderr, "Error with --keep-quality (-q) argument.  Must take the following form:"
+        print >> sys.stderr, "\npython vcfPytools extract --in <VCF> --keep-quality <value> <logic>"
+        print >> sys.stderr, "\nwhere logic is one of: eq, le, lt, ge or gt"
+        exit(1)
+    try: qualityValue = float(value)
+    except ValueError:
+      print >> sys.stderr, "Error with --keep-quality (-q) argument.  Must take the following form:"
+      print >> sys.stderr, "Quality value must be an integer or float value."
+      exit(1)
+    qualityLogic = logic
+
+# Set the output file to stdout if no output file was specified.
+  outputFile, writeOut = setOutput(options.output)
+
+  v = vcf() # Define vcf object.
+
+# Set process info to True if info strings need to be parsed.
+  if options.infoKeep or options.infoDiscard: v.processInfo = True
+
+# Open the file.
+  v.openVcf(options.vcfFile)
+
+# Read in the header information.
+  v.parseHeader(options.vcfFile, writeOut)
+  taskDescriptor = "##vcfPytools=extract data"
+  writeHeader(outputFile, v, False, taskDescriptor) # tools.py
+
+# Read through all the entries and write out records in the correct
+# reference sequence.
+  while v.getRecord():
+    writeRecord = True
+    if options.referenceSequence and v.referenceSequence != options.referenceSequence: writeRecord = False
+    elif options.region:
+      if v.referenceSequence != referenceSequence: writeRecord = False
+      elif v.position < start or v.position > end: writeRecord = False
+
+# Only consider these fields if the record is contained within the
+# specified region.
+    if options.infoKeep and writeRecord:
+      for tag in options.infoKeep:
+        if v.infoTags.has_key(tag):
+          writeRecord = True
+          break
+        if not v.infoTags.has_key(tag): writeRecord = False
+    if options.infoDiscard and writeRecord:
+      for tag in options.infoDiscard:
+        if v.infoTags.has_key(tag): writeRecord = False
+    if options.passFilter and v.filters != "PASS" and writeRecord: writeRecord = False
+    if options.keepQuality:
+      if qualityLogic == "eq" and v.quality != qualityValue: writeRecord = False
+      if qualityLogic == "le" and v.quality > qualityValue: writeRecord = False
+      if qualityLogic == "lt" and v.quality >= qualityValue: writeRecord = False
+      if qualityLogic == "ge" and v.quality < qualityValue: writeRecord = False
+      if qualityLogic == "gt" and v.quality <= qualityValue: writeRecord = False
+
+    if writeRecord: outputFile.write(v.record)
+
+# Close the file.
+  v.closeVcf(options.vcfFile)
+
+# Terminate the program cleanly.
+  return 0
+
+if __name__ == "__main__":
+  main()
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/extract.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/extract.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,111 @@
+<tool id="vcf_extract" name="Extract" version="1.0.0">
+  <description>records from a specified region</description>
+  <command interpreter="python">
+    vcfPytools.py
+      extract 
+      --in=$input1
+      --out=$output1
+      #if $reference_sequence.value.strip()
+        --reference-sequence=$reference_sequence
+      #end if
+      #if $region.value.strip()
+        --region=$region
+      #end if
+      #if $keep_quality.value.strip()
+        --keep-quality=$keep_quality
+      #end if
+      #if $keep_info.value.strip()
+      --keep-info=$keep_info
+      #end if
+      #if $discard_info.value.strip()
+        --discard-info=$discard_info
+      #end if
+      $pass_filter
+  </command>
+  <inputs>
+    <param name="input1" label="VCF file" type="data" format="vcf" />
+    <param name="reference_sequence" label="Extract records from this reference sequence" type="text" value='' />
+    <param name="region" label="Extract records from this region" type="text" value='' help="The format of the region is ref:start..end, where the start and end coordinates are 1-based"/>
+    <param name="keep_quality" label="Keep records containing this quality" type="text" value='' help="This requires two arguments: the quality value and a logical operator (eq - equals, le - less than or equal to, lt - less than, ge - greater than or equal to , gt - greater than) to determine which records to keep.  For example: '90 ge' will retain all records that have a quality of 90 or greater"/>
+    <param name="keep_info" label="Keep records containing this info field" type="text" value='' />
+    <param name="discard_info" label="Discard records containing this info field" type="text" value='' />
+    <param name="pass_filter" label="Discard records whose filter field is not PASS" type="boolean" truevalue="--pass-filter" falsevalue="" checked="False"/>
+  </inputs>
+  <tests>
+    <test>
+      <param name="input1" value="test_filter_quality_9_DP_2000_lt.vcf" ftype="vcf" />
+      <param name="reference_sequence" value='' />
+      <param name="region" value='' />
+      <param name="keep_quality" value='' />
+      <param name="keep_info" value='' />
+      <param name="discard_info" value='' />
+      <param name="pass_filter" value='true' />
+      <output name="output" file="test_extract_pass_filter_quality_9_DP_2000_lt.vcf" lines_diff="6" ftype="vcf" />
+    </test>
+    <test>
+      <param name="input1" value="test.small.vcf" ftype="vcf" />
+      <param name="reference_sequence" value='' />
+      <param name="region" value='20:80000..100000' />
+      <param name="keep_quality" value='' />
+      <param name="keep_info" value='' />
+      <param name="discard_info" value='' />
+      <param name="pass_filter" value='false' />      
+      <output name="output" file="test_extract_region_80000_100000.vcf" ftype="vcf" />
+    </test>
+    <test>
+      <param name="input1" value="test.small.vcf" ftype="vcf" />
+      <param name="reference_sequence" value='' />
+      <param name="region" value='' />
+      <param name="keep_quality" value='90 ge' />
+      <param name="keep_info" value='' />
+      <param name="discard_info" value='' />
+      <param name="pass_filter" value='false' />      
+      <output name="output" file="test_extract_quality_90_ge.vcf" ftype="vcf" />
+    </test>
+    <test>
+      <param name="input1" value="test.small.vcf" ftype="vcf" />
+      <param name="reference_sequence" value='' />
+      <param name="region" value='' />
+      <param name="keep_quality" value='' />
+      <param name="keep_info" value='TV' />
+      <param name="discard_info" value='' />
+      <param name="pass_filter" value='false' />      
+      <output name="output" file="test_extract_keep_info_TV.vcf" ftype="vcf" />
+    </test>
+    <test>
+      <param name="input1" value="test.small.vcf" ftype="vcf" />
+      <param name="reference_sequence" value='' />
+      <param name="region" value='' />
+      <param name="keep_quality" value='' />
+      <param name="keep_info" value='' />
+      <param name="discard_info" value='TV' />
+      <param name="pass_filter" value='false' />      
+      <output name="output" file="test_extract_discard_info_TV.vcf" ftype="vcf" />
+    </test>
+  </tests>
+  <outputs>
+    <data format="vcf" name="output1" label="${tool.name} from ${on_string}" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool uses vcfPytools_' extract command to extract records from a specified region of a VCF file
+
+.. _vcfPytools: https://github.com/AlistairNWard/vcfPytools
+
+Option **Extract records from this reference sequence** outputs all records from the specified reference sequence from the input vcf file into the output vcf file.
+
+Option **Extract records from this region** outputs all records from the specified region from the input vcf file into the output vcf file.  The format of the region is ref:start..end, where the start and end coordinates are 1-based.
+
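+For example::
+
+  20:80000..100000
+
+extracts records on reference sequence 20 at positions 80000 through 100000.
+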
+Option **Keep records containing this quality** allows only records with specified quality values to be retained.  This requires two arguments: the quality value and a logical operator (eq - equals, le - less than or equal to, lt - less than, ge - greater than or equal to , gt - greater than) to determine which records to keep.  For example: **90 ge** will retain all records that have a quality of 90 or greater.
+
+Option **Keep records containing this info field** allows all records to be removed unless they contain this value in the info field.
+
+Option **Discard records containing this info field** ensures that all records containing this value in the info field will not be included in the output file.  This cannot be used in conjunction with the **Keep records containing this info field** option.
+
+Option **Discard records whose filter field is not PASS** will only output records that have the filter field populated with PASS.  All filtered records or records that haven't undergone filtering will be discarded.
+
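+A minimal sketch (plain Python, not part of the tool) of how the logical
+operator decides which records are retained::
+
+  import operator
+
+  logic = {"eq": operator.eq, "le": operator.le, "lt": operator.lt,
+           "ge": operator.ge, "gt": operator.gt}
+
+  quality = 93.5                       # quality of the current record
+  keep = logic["ge"](quality, 90.0)    # '90 ge' keeps records with quality >= 90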
+
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/filter.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,162 @@
+#!/usr/bin/python
+
+import os.path
+import sys
+import optparse
+
+import vcfClass
+from vcfClass import *
+
+import tools
+from tools import *
+
+def filterFail(text, file):
+  print >> sys.stderr, text
+  if file != None: os.remove(file)
+  exit(1)
+
+def main():
+
+# Parse the command line options
+  usage = "Usage: vcfPytools.py filter [options]"
+  parser = optparse.OptionParser(usage = usage)
+  parser.add_option("-i", "--in",
+                    action="store", type="string",
+                    dest="vcfFile", help="input vcf file")
+  parser.add_option("-o", "--out",
+                    action="store", type="string",
+                    dest="output", help="output vcf file")
+  parser.add_option("-q", "--quality",
+                    action="store", type="int",
+                    dest="quality", help="filter out SNPs with qualities lower than selected value")
+  parser.add_option("-n", "--info",
+                    action="append", type="string", nargs=3,
+                    dest="infoFilters", help="filter based on entries in the info string")
+  parser.add_option("-r", "--remove-genotypes",
+                    action="store_true", default=False,
+                    dest="removeGeno", help="remove the genotype strings from the vcf file")
+  parser.add_option("-m", "--mark-as-pass",
+                    action="store_true", default=False,
+                    dest="markPass", help="Mark all records as having passed filters")
+
+  (options, args) = parser.parse_args()
+
+# Check that a single vcf file is given.
+  if options.vcfFile == None:
+    parser.print_help()
+    print >> sys.stderr, "\nInput vcf file (-i, --in) is required for vcf filtering."
+    exit(1)
+
+# The --mark-as-pass option can only be used if no actual filters
+# have been specified.
+  if options.markPass and options.infoFilters:
+    print >> sys.stderr, "--mark-as-pass cannot be used in conjunction with filters."
+    exit(1)
+
+# Set the output file to stdout if no output file was specified.
+  outputFile, writeOut = setOutput(options.output) # tools.py
+
+  v = vcf() # Define vcf object.
+
+# Open the vcf file.
+  v.openVcf(options.vcfFile)
+
+# Read in the header information.
+  v.parseHeader(options.vcfFile, writeOut)
+  taskDescriptor = "##vcfPytools="
+  if options.infoFilters:
+    taskDescriptor += "filtered using the following filters: "
+    for filter, value, logic in options.infoFilters: taskDescriptor += str(filter) + str(value) + ","
+    taskDescriptor = taskDescriptor.rstrip(",")
+  if options.markPass: taskDescriptor += "marked all records as PASS"
+    
+  writeHeader(outputFile, v, options.removeGeno, taskDescriptor)
+
+# Check that specified filters from the info field are either integers or floats.
+  if options.infoFilters:
+    v.processInfo = True # Process the info string
+    filters = {}
+    filterValues = {}
+    filterLogic = {}
+    for filter, value, logic in options.infoFilters:
+      filterName = str(filter) + str(value)
+      if "-" in filter or "-" in value or "-" in logic:
+        print >> sys.stderr, "\n--info (-n) requires three arguments, for example:"
+        print >> sys.stderr, "\t--info DP 5 lt: filter records with DP less than (lt) 5.\n"
+        print >> sys.stderr, "allowed logic arguments:\n\tgt: greater than\n\tlt: less than."
+        print >> sys.stderr, "\nError in:", filter
+        exit(1)
+      if logic != "gt" and logic != "lt":
+        print >> sys.stderr, "\nfilter logic not recognised."
+        print >> sys.stderr, "allowed logic arguments:\n\tgt: greater than\n\tlt: less than."
+        print >> sys.stderr, "\nError in:", filter
+        exit(1)
+      if v.infoHeaderTags.has_key(filter):
+        if v.infoHeaderTags[filter][1].lower() == "integer":
+          try:
+            filters[filterName] = filter
+            filterValues[filterName] = int(value)
+            filterLogic[filterName] = logic
+            #filterLogic[filterName] = logic
+          except ValueError:
+            text = "Filter " + filter + " requires an integer entry, not " + str(type(value))
+            filterFail(text, options.output)
+
+        if v.infoHeaderTags[filter][1].lower() == "float":
+          try:
+            filters[filterName] = filter
+            filterValues[filterName] = float(value)
+            filterLogic[filterName] = logic
+            #filters[filterName] = float(value)
+            #filterLogic[filterName] = logic
+          except ValueError:
+            text = "Filter " + filter + " requires a float entry, not " + str(type(value))
+            filterFail(text, options.output)
+
+      else:
+        text = "Filter " + filter + " has no explanation in the header.  Unknown type for the entry."
+        filterFail(text, options.output)
+
+# Parse the vcf file and check if any of the filters are failed.  If
+# so, build up a string of failed filters.
+  while v.getRecord():
+    filterString = ""
+
+# Mark the record as "PASS" if --mark-as-pass was applied.
+    if options.markPass: v.filters = "PASS"
+
+# Check for quality filtering.
+    if options.quality != None:
+      if v.quality < options.quality:
+        filterString = filterString + ";" + "Q" + str(options.quality) if filterString != "" else "Q" + str(options.quality)
+
+# Check for filtering on info string filters.
+    if options.infoFilters:
+      for filterName, filter in filters.iteritems():
+        value = filterValues[filterName]
+        logic = filterLogic[filterName]
+        if v.infoTags.has_key(filter):
+          if type(value) == int:
+            if logic == "lt" and int(v.infoTags[filter]) < value:
+              filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value)
+            if logic == "gt" and int(v.infoTags[filter]) > value:
+              filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value)
+          elif type(value) == float:
+            if logic == "lt" and float(v.infoTags[filter]) < value:
+              filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value)
+            if logic == "gt" and float(v.infoTags[filter]) > value:
+              filterString = filterString + ";" + filter + str(value) if filterString != "" else filter + str(value)
+
+    filterString = "PASS" if filterString == "" else filterString
+    v.filters = filterString
+    record = v.buildRecord(options.removeGeno)
+    outputFile.write(record)
+
+# Close the vcf files.
+  v.closeVcf(options.vcfFile)
+
+# Terminate the program.
+  return 0
+
+if __name__ == "__main__":
+  main()
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/filter.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,64 @@
+<tool id="vcf_filter" name="Filter" version="1.0.0">
+  <description>a VCF file</description>
+  <command interpreter="python">
+    vcfPytools.py
+      filter 
+      --in=$input1
+      --out=$output1
+      --quality=$quality
+      #for $i in $info_filter:
+        --info ${i.info}
+      #end for
+      $remove_genotypes
+      $mark_as_pass
+  </command>
+  <inputs>
+    <param name="input1" label="VCF file" type="data" format="vcf" />
+    <param name="quality" label="Filter by quality" type="integer" value='' help="Filter out SNPs with qualities lower than selected value" />
+    <repeat name="info_filter" title="Filter based on entries in the info string">
+      <param name="info" label="Filter" type="text" value='' help='This option takes three values: the info string tag, the cutoff value and whether to filter out those records with less than (lt) or greater than (gt) this value.  For example: DP 10 lt ' />
+    </repeat>
+    <param name="remove_genotypes" label="Remove the genotype strings" type="boolean" truevalue="--remove-genotypes" falsevalue="" checked="False" />
+    <param name="mark_as_pass" label="Mark all records as having passed filters" type="boolean" truevalue="--mark-as-pass" falsevalue="" checked="False" />
+  </inputs>
+  <tests>
+    <test>
+      <param name="input1" value="test.small.vcf" ftype="vcf" />
+      <param name="quality" value="9" />
+      <param name="info" value="NS 360 gt"/>
+      <param name="remove_genotypes" value="" />
+      <param name="mark_as_pass" value="" />
+      <output name="output" file="test_filter_quality_9_NS_360_gt.vcf" lines_diff="6" ftype="vcf" />
+    </test>
+    <test>
+      <param name="input1" value="test.small.vcf" ftype="vcf" />
+      <param name="quality" value="9" />
+      <param name="info" value="DP 2000 lt"/>
+      <param name="remove_genotypes" value="" />
+      <param name="mark_as_pass" value="" />
+      <output name="output" file="test_filter_quality_9_DP_2000_lt.vcf" lines_diff="6" ftype="vcf" />
+    </test>
+  </tests>
+  <outputs>
+    <data format="vcf" name="output1" label="${tool.name} ${on_string}" />
+  </outputs>
+  <help>
+
+**What it does**
+
+This tool uses vcfPytools_' filter command to filter a VCF file
+
+.. _vcfPytools: https://github.com/AlistairNWard/vcfPytools
+
+Quality option will check the variant quality for each record and if it is below the defined value, the filter field will be populated with the filter entry Q[value].
+
+Any value in the info string can be used for filtering by using the 'Filter by info' option.  This option takes three values: the info string tag, the cutoff value and whether to filter out those records with less than (lt) or greater than (gt) this value.  For example:
+
+  DP 10 lt 
+
+would filter out all variants with a depth (DP) less than 10 and the filter field would be populated with DP10.
+
+This option can be defined as many times as required.
+
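+A minimal sketch (plain Python, following the logic in filter.py; the values
+are illustrative) of how failed filters accumulate in the FILTER column::
+
+  quality, dp = 7.5, 1500
+  failed = []
+  if quality < 9: failed.append("Q9")       # from quality filter 9
+  if dp < 2000: failed.append("DP2000")     # from info filter DP 2000 lt
+  filterField = ";".join(failed) if failed else "PASS"   # "Q9;DP2000"
+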
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/intersect.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/intersect.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,181 @@
+#!/usr/bin/python
+
+import os.path
+import sys
+import optparse
+
+import bedClass
+from bedClass import *
+
+import vcfClass
+from vcfClass import *
+
+import tools
+from tools import *
+
+# Intersect two vcf files.  It is assumed that the two files are
+# sorted by genomic coordinates and the reference sequences are
+# in the same order.
+def intersectVcf(v1, v2, priority, outputFile):
+  success1 = v1.getRecord()
+  success2 = v2.getRecord()
+  currentReferenceSequence = v1.referenceSequence
+
+# As soon as the end of either file is reached, there can be no
+# more intersecting SNPs, so terminate.
+  while success1 and success2:
+    if v1.referenceSequence == v2.referenceSequence and v1.referenceSequence == currentReferenceSequence:
+      if v1.position == v2.position:
+        writeVcfRecord(priority, v1, v2, outputFile)
+        success1 = v1.getRecord()
+        success2 = v2.getRecord()
+      elif v2.position > v1.position: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None)
+      elif v1.position > v2.position: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None)
+    else:
+      if v1.referenceSequence == currentReferenceSequence: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None)
+      elif v2.referenceSequence == currentReferenceSequence: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None)
+
+# If the last record for a reference sequence is the same for both vcf
+# files, they will both have referenceSequences different from the
+# current reference sequence.  Change the reference sequence to reflect
+# this and proceed.
+      else:
+        if v1.referenceSequence != v2.referenceSequence:
+          print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different."
+          print >> sys.stderr, "Check that both files contain records for the following reference sequences:"
+          print >> sys.stderr, "\t", v1.referenceSequence, " and ", v2.referenceSequence
+          exit(1)
+      currentReferenceSequence = v1.referenceSequence
+
+# Intersect a vcf file and a bed file.  It is assumed that the 
+# two files are sorted by genomic coordinates and the reference
+# sequences are in the same order.
+def intersectVcfBed(v, b, outputFile):
+  successb = b.getRecord()
+  successv = v.getRecord()
+  currentReferenceSequence = v.referenceSequence
+
+# As soon as the end of the first file is reached, there are no
+# more intersections and the program can terminate.
+  while successv:
+    if v.referenceSequence == b.referenceSequence:
+      if v.position < b.start: successv = v.parseVcf(b.referenceSequence, b.start, False, None)
+      elif v.position > b.end: successb = b.parseBed(v.referenceSequence, v.position)
+      else:
+        outputFile.write(v.record)
+        successv = v.getRecord()
+    else:
+      if v.referenceSequence == currentReferenceSequence: successv = v.parseVcf(b.referenceSequence, b.start, False, None)
+      if b.referenceSequence == currentReferenceSequence: successb = b.parseBed(v.referenceSequence, v.position)
+      currentReferenceSequence = v.referenceSequence
+
+def main():
+
+# Parse the command line options
+  usage = "Usage: vcfPytools.py intersect [options]"
+  parser = optparse.OptionParser(usage = usage)
+  parser.add_option("-i", "--in",
+                    action="append", type="string",
+                    dest="vcfFiles", help="input vcf files")
+  parser.add_option("-b", "--bed",
+                    action="store", type="string",
+                    dest="bedFile", help="input bed file")
+  parser.add_option("-o", "--out",
+                    action="store", type="string",
+                    dest="output", help="output vcf file")
+  parser.add_option("-f", "--priority-file",
+                    action="store", type="string",
+                    dest="priorityFile", help="output records from this vcf file")
+
+  (options, args) = parser.parse_args()
+
+# Check that a single  vcf file is given.
+  if options.vcfFiles == None:
+    parser.print_help()
+    print >> sys.stderr, "\nAt least one vcf file (--in, -i) is required for performing intersection."
+    exit(1)
+  elif len(options.vcfFiles) > 2:
+    parser.print_help()
+    print >> sys.stderr, "\nAt most, two vcf files (--in, -i) can be submitted for performing intersection."
+    exit(1)
+  elif len(options.vcfFiles) == 1 and not options.bedFile:
+    parser.print_help()
+    print >> sys.stderr, "\nIf only one vcf file (--in, -i) is specified, a bed file is also required for performing intersection."
+    exit(1)
+
+# Set the output file to stdout if no output file was specified.
+  outputFile, writeOut = setOutput(options.output) # tools.py
+
+# If intersecting with a bed file, call the bed intersection routine.
+  if options.bedFile:
+    v = vcf() # Define vcf object.
+    b = bed() # Define bed object.
+
+# Open the files.
+    v.openVcf(options.vcfFiles[0])
+    b.openBed(options.bedFile)
+
+# Read in the header information.
+    v.parseHeader(options.vcfFiles[0], writeOut)
+    taskDescriptor = "##vcfPytools=intersect " + options.vcfFiles[0] + ", " + options.bedFile
+    writeHeader(outputFile, v, False, taskDescriptor) # tools.py
+
+# Intersect the vcf file with the bed file.
+    intersectVcfBed(v, b, outputFile)
+
+# Check that the input files had the same list of reference sequences.
+# If not, it is possible that there were some problems.
+    checkReferenceSequenceLists(v.referenceSequenceList, b.referenceSequenceList) # tools.py
+
+# Close the files.
+    v.closeVcf(options.vcfFiles[0])
+    b.closeBed(options.bedFile)
+
+  else:
+    priority = setVcfPriority(options.priorityFile, options.vcfFiles)
+    v1 = vcf() # Define vcf object.
+    v2 = vcf() # Define vcf object.
+
+# Open the vcf files.
+    v1.openVcf(options.vcfFiles[0])
+    v2.openVcf(options.vcfFiles[1])
+
+# Read in the header information.
+    v1.parseHeader(options.vcfFiles[0], writeOut)
+    v2.parseHeader(options.vcfFiles[1], writeOut)
+    if priority == 3:
+      v3 = vcf() # Generate a new vcf object that will contain the header information of the new file.
+      mergeHeaders(v1, v2, v3) # tools.py
+      v1.processInfo = True
+      v2.processInfo = True
+    else: checkDataSets(v1, v2)
+    
+    #print v1.samplesList
+    #print v2.samplesList
+
+# Check that the header for the two files contain the same samples.
+    if v1.samplesList != v2.samplesList:
+      print >> sys.stderr, "vcf files contain different samples (or sample order)."
+      exit(1)
+    else:
+      taskDescriptor = "##vcfPytools=intersect " + v1.filename + ", " + v2.filename
+      if priority == 3: writeHeader(outputFile, v3, False, taskDescriptor)
+      elif (priority == 2 and v2.hasHeader) or not v1.hasHeader: writeHeader(outputFile, v2, False, taskDescriptor) # tools.py
+      else: writeHeader(outputFile, v1, False, taskDescriptor) # tools.py
+
+# Intersect the two vcf files.
+    intersectVcf(v1, v2, priority, outputFile)
+
+# Check that the input files had the same list of reference sequences.
+# If not, it is possible that there were some problems.
+    checkReferenceSequenceLists(v1.referenceSequenceList, v2.referenceSequenceList) # tools.py
+
+# Close the vcf files.
+    v1.closeVcf(options.vcfFiles[0])
+    v2.closeVcf(options.vcfFiles[1])
+
+# End the program.
+  return 0
+
+if __name__ == "__main__":
+  main()
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/intersect.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/intersect.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,80 @@
+<tool id="vcf_intersect" name="Intersect" version="1.0.0">
+  <description>Generate the intersection of two VCF files</description>
+  <command interpreter="python">
+    vcfPytools.py
+      intersect 
+      --in=$input1
+      #if $format_type.format == "vcf"
+      --in=$input2
+      #elif $format_type.format == "bed"
+      --bed=$input2
+      #end if
+      #if $priority_file.value == "first_file"
+      --priority-file=$input1
+      #elif $priority_file.value == "second_file"
+      --priority-file=$input2
+      #end if
+      --out=$output1
+  </command>
+  <inputs>
+    <param name="input1" label="First VCF file" type="data" format="vcf" />
+    <conditional name="format_type">
+      <param name="format" type="select" label="intersect with file of format">
+        <option value="vcf">VCF</option>
+        <option value="bed">BED</option>
+      </param>
+      <when value="vcf">
+        <param name="input2" label="second VCF file" type="data" format="vcf"/>
+      </when>
+      <when value="bed">
+        <param name="input2" label="second BED file" type="data" format="bed"/>
+      </when>
+    </conditional>
+    <param name="priority_file" type="select" label="Priority file" help="If the priority file argument is set (this must be equal to one of the input vcf files), then the record written to the output will come from this file.  If this argument is not set, the record with the highest quality is written out.">
+      <option value="none">None</option>
+      <option value="first_file">First file</option>
+      <option value="second_file">Second file</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="vcf" name="output1" label="${tool.name} on ${on_string}" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="1.vcf" ftype="vcf" />
+      <param name="format" value="vcf" />
+      <param name="input2" value="2.vcf" ftype="vcf" />
+      <param name="priority_file" value="none" />
+      <output name="output" file="1_2_intersect_priority_0.vcf" lines_diff="2" ftype="vcf" />
+    </test>
+    <test>
+      <param name="input1" value="1.vcf" ftype="vcf" />
+      <param name="format" value="vcf" />
+      <param name="input2" value="2.vcf" ftype="vcf" />
+      <param name="priority_file" value="first_file" />
+      <output name="output" file="1_2_intersect_priority_1.vcf" lines_diff="2" ftype="vcf" />
+    </test>
+    <test>
+      <param name="input1" value="1.vcf" ftype="vcf" />
+      <param name="format" value="vcf" />
+      <param name="input2" value="2.vcf" ftype="vcf" />
+      <param name="priority_file" value="second_file" />
+      <output name="output" file="1_2_intersect_priority_2.vcf" lines_diff="2" ftype="vcf" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool uses vcfPytools_' intersect command to generate the intersection of two VCF files
+
+.. _vcfPytools: https://github.com/AlistairNWard/vcfPytools
+
+Two input files are required and their intersection is generated and sent to the output.  These files must be sorted by genomic coordinate to function correctly, although the reference sequence order is not important.
+
+The intersection can be calculated on two VCF files or a VCF and a BED file.
+
+If the priority file argument is set (this must be equal to one of the input VCF files), then the record written to the output will come from this file.  If this argument is not set, the record with the highest quality is written out.
+
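+A minimal sketch (plain Python, following writeVcfRecord in tools.py) of how
+the record to write is chosen when two files intersect at a position::
+
+  def chooseRecord(record1, quality1, record2, quality2, priority):
+      if priority == 1: return record1   # first file has priority
+      if priority == 2: return record2   # second file has priority
+      # no priority file: keep the higher-quality record
+      return record1 if quality1 >= quality2 else record2
+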
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/tools.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/tools.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,188 @@
+#!/usr/bin/python
+
+import os.path
+import sys
+import vcfPytools
+from vcfPytools import __version__
+
+# Determine whether to output to a file or stdout.
+def setOutput(output):
+  if output == None:
+    outputFile = sys.stdout
+    writeOut = False
+  else:
+    output = os.path.abspath(output)
+    outputFile = open(output, 'w')
+    writeOut = True
+
+  return outputFile, writeOut
+
+# Determine which file has priority for writing out records.
+def setVcfPriority(priorityFile, vcfFiles):
+  if priorityFile == None: priority = 0
+  elif priorityFile == vcfFiles[0]: priority = 1
+  elif priorityFile == vcfFiles[1]: priority = 2
+  elif priorityFile.lower() == "merge": priority = 3
+  else:
+    print >> sys.stderr, "The vcf file given priority must be one of the two input vcf files or merge."
+    exit(1)
+
+  return priority
+
+# If the union or intersection of two vcf files is being performed
+# and the output vcf file is to contain the information from both
+# files, the headers need to be merged to ensure that all info and
+# format entries have an explanation.
+def mergeHeaders(v1, v2, v3):
+
+# If either file does not have a header, terminate the program.
+# In order to merge the headers, the different fields must be
+# checked to ensure the files are compatible.
+  if not v1.hasHeader or not v2.hasHeader:
+    print >> sys.stderr, "Both vcf files must have a header in order to merge data sets."
+    exit(1)
+
+  v3.infoHeaderTags = v1.infoHeaderTags.copy()
+  v3.formatHeaderTags = v1.formatHeaderTags.copy()
+  v3.numberDataSets = v1.numberDataSets
+  v3.includedDataSets = v1.includedDataSets.copy()
+  v3.headerText = v1.headerText
+  v3.headerTitles = v1.headerTitles
+  v3.infoHeaderString = v1.infoHeaderString.copy()
+  v3.formatHeaderString = v1.formatHeaderString.copy()
+
+# Merge the info field descriptions.
+  for tag in v2.infoHeaderTags:
+    if v1.infoHeaderTags.has_key(tag):
+      if v1.infoHeaderTags[tag][0] != v2.infoHeaderTags[tag][0] or \
+         v1.infoHeaderTags[tag][1] != v2.infoHeaderTags[tag][1]:
+        print v1.infoHeaderTags[tag][0]
+        print v1.infoHeaderTags[tag][1]
+        print v1.infoHeaderTags[tag][2]
+        print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
+        exit(1)
+    else: v3.infoHeaderTags[tag] = v2.infoHeaderTags[tag]
+
+# Merge the format field descriptions.
+  for tag in v2.formatHeaderTags:
+    if v1.formatHeaderTags.has_key(tag):
+      if v1.formatHeaderTags[tag][0] != v2.formatHeaderTags[tag][0] or \
+         v1.formatHeaderTags[tag][1] != v2.formatHeaderTags[tag][1]:
+        print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
+        exit(1)
+    else: v3.formatHeaderTags[tag] = v2.formatHeaderTags[tag]
+
+# Now check to see if the vcf files contain information from multiple
+# records themselves and create an ordered list in which the data
+# will appear in the file.  For instance, if the first file has
+# already got two sets of data and is being intersected with a file
+# with one set of data, the order of data in the new vcf file will be
+# the two sets from the first file followed by the second, e.g.
+# AB=3/2/4, where the 3 and 2 are from the first file and the 4 is the
+# value of AB from the second vcf.  The header will have a ##FILE for
+# each of the three files, so the origin of the data can be recovered.
+  if v1.numberDataSets == 0:
+    v3.includedDataSets[v3.numberDataSets + 1] = v1.filename
+    v3.numberDataSets += 1
+  if v2.numberDataSets == 0:
+    v3.includedDataSets[v3.numberDataSets + 1] = v2.filename
+    v3.numberDataSets += 1
+  else:
+    for i in range(1, v2.numberDataSets + 1):
+      v3.includedDataSets[v3.numberDataSets + 1] = v2.includedDataSets[i]
+      v3.numberDataSets += 1
+
+# If either of the input files contain multiple data sets (e.g. multiple
+# vcf files have undergone intersection or union calculations and all
+# inform
[... middle of this hunk truncated in the source dump; content resumes mid-line below ...]
OR:"
+    print >> sys.stderr, "input vcf file(s) contain data sets from multiple vcf files."
+    print >> sys.stderr, "Further intersection or union operations must include --priority-file merge"
+    print >> sys.stderr, "Other tools may be incompatible with this format."
+    exit(1)
+
+# Write the header to file.
+def writeHeader(outputFile, v, removeGenotypes, taskDescriptor):
+  if not v.hasHeader:
+    v.headerText = "##fileformat=VCFv4.0\n##source=vcfPytools " + __version__ + "\n"
+    v.headerTitles = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
+  if v.headerText != "": outputFile.write(v.headerText)
+  print >> outputFile, taskDescriptor
+  for tag in v.infoHeaderString: print >> outputFile, v.infoHeaderString[tag]
+  for tag in v.formatHeaderString: print >> outputFile, v.formatHeaderString[tag]
+
+# Write out a list of files indicating which data set belongs to which file.
+  if v.numberDataSets != 0:
+    for i in range(1, v.numberDataSets + 1):
+      print >> outputFile, "##FILE=<ID=" + str(i) + ",\"" + v.includedDataSets[i] + "\">"
+
+  if removeGenotypes:
+    line = v.headerTitles.rstrip("\n").split("\t")
+    newHeaderTitles = line[0]
+    for i in range(1,8):
+      newHeaderTitles = newHeaderTitles + "\t" + line[i]
+    newHeaderTitles = newHeaderTitles + "\n"
+    outputFile.write( newHeaderTitles )
+  else:
+    outputFile.write( v.headerTitles )
+
+# Check that the two reference sequence lists are identical.
+# If there are a different number or order, the results may
+# not be as expected.
+def checkReferenceSequenceLists(list1, list2):
+  errorMessage = False
+  if len(list1) != len(list2):
+    print >> sys.stderr, "WARNING: Input files contain a different number of reference sequences."
+    errorMessage = True
+  elif list1 != list2:
+    print >> sys.stderr, "WARNING: Input files contain different or differently ordered reference sequences."
+    errorMessage = True
+  if errorMessage:
+    print >> sys.stderr, "Results may not be as expected."
+    print >> sys.stderr, "Ensure that input files have the same reference sequences in the same order."
+    print >> sys.stderr, "Reference sequence lists observed were:\n\t", list1, "\n\t", list2
+
+# Write out a vcf record to file.  The record written depends on the
+# value of 'priority' and could therefore be the record from either
+# of the vcf files, or a combination of them.
+def writeVcfRecord(priority, v1, v2, outputFile):
+  if priority == 0:
+    if v1.quality >= v2.quality: outputFile.write(v1.record)
+    else: outputFile.write(v2.record)
+  elif priority == 1: outputFile.write(v1.record)
+  elif priority == 2: outputFile.write(v2.record)
+  elif priority == 3:
+
+# Define the missing entry values (depends on the number of data sets
+# in the file).
+    info = ""
+    missingEntry1 = missingEntry2 = "."
+    for i in range(1, v1.numberDataSets): missingEntry1 += "/."
+    for i in range(1, v2.numberDataSets): missingEntry2 += "/."
+    secondList = v2.infoTags.copy()
+
+# Build up the info field.
+    for tag in v1.infoTags:
+      if secondList.has_key(tag):
+        if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + v2.infoTags[tag] + ";"
+        del secondList[tag]
+      else:
+        if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + missingEntry2 + ";"
+
+# Now include the info tags that are not populated in the first vcf file.
+    for tag in secondList:
+      if v2.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + missingEntry1 + "/" + v2.infoTags[tag] + ";"
+
+# Build the complete record.
+    info = info.rstrip(";")
+    record = v1.referenceSequence + "\t" + str(v1.position) + "\t" + v1.rsid + "\t" + v1.ref + "\t" + \
+             v1.alt + "/" + v2.alt + "\t" + v1.quality + "/" + v2.quality + "\t.\t" + info
+    print >> outputFile, record
+  else:
+    print >> sys.stderr, "Unknown file priority."
+    exit(1)
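The slash-joined INFO layout described in the comments above (e.g. AB=3/2/4) is easier to see on a toy case. The sketch below uses hypothetical tag values and reproduces only the joining convention of the priority == 3 branch, not the full writeVcfRecord logic:

    # Hypothetical tag values: file 1 already carries two data sets, file 2 one.
    info1 = {"AB": "3/2"}
    info2 = {"AB": "4"}

    merged = []
    for tag in info1:
        # "." would mark an entry missing from the second file
        merged.append("%s=%s/%s" % (tag, info1[tag], info2.get(tag, ".")))
    print ";".join(merged)    # prints: AB=3/2/4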
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/vcfClass.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/vcfClass.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,422 @@
+#!/usr/bin/python
+
+import os.path
+import sys
+import re
+
+class vcf:
+  def __init__(self):
+
+# Header info.
+    self.filename = ""
+    self.hasHeader = True
+    self.headerText = ""
+    self.headerTitles = ""
+    #self.headerInfoText = ""
+    #self.headerFormatText = ""
+
+# Store the info and format tags as well as the lines that describe
+# them in a dictionary.
+    self.numberDataSets = 0
+    self.includedDataSets = {}
+    self.infoHeaderTags = {}
+    self.infoHeaderString = {}
+    self.formatHeaderTags = {}
+    self.formatHeaderString = {}
+
+# Genotype information.
+    self.genotypes = False
+    self.infoField = {}
+
+# Reference sequence information.
+    self.referenceSequences = {}
+    self.referenceSequenceList = []
+    self.referenceSequence = ""
+
+# Record information.
+    self.position = -1
+    self.samplesList = []
+
+# Determine which fields to process.
+    self.processInfo = False
+    self.processGenotypes = False
+    self.dbsnpVcf = False
+    self.hapmapVcf = False
+
+# Open a vcf file.
+  def openVcf(self, filename):
+    if filename == "stdin":
+      self.filehandle = sys.stdin
+      self.filename = "stdin"
+    else:
+      try: self.filehandle = open(filename,"r")
+      except IOError:
+        print >> sys.stderr, "Failed to find file: ",filename
+        exit(1)
+      self.filename = os.path.abspath(filename)
+
+# Parse the vcf header.
+  def parseHeader(self, filename, writeOut):
+    while self.getHeaderLine(filename, writeOut):
+      continue
+
+# Determine the type of information in the header line.
+  def getHeaderLine(self, filename, writeOut):
+    self.headerLine = self.filehandle.readline().rstrip("\n")
+    if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info")
+    elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format")
+    elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut)
+    elif self.headerLine.startswith("##"): success = self.headerAdditional()
+    elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut)
+    else: success = self.noHeader(filename, writeOut)
+
+    return success
+
+# Read information on an info field from the header line.
+  def headerInfo(self, writeOut, lineType):
+    tag = self.headerLine.split("=",1)
+    tagID = (tag[1].split("ID=",1))[1].split(",",1)
+
+# Check if this info field has already been defined.
+    if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])):
+      print >> sys.stderr, "Info tag \"", tagID[0], "\" is defined multiple times in the header."
+      exit(1)
+
+# Determine the number of entries, entry type and description.
+    tagNumber = (tagID[1].split("Number=",1))[1].split(",",1)
+    tagType = (tagNumber[1].split("Type=",1))[1].split(",",1)
+    try: tagDescription = ( ( (tagType[1].split("Description=\"",1))[1] ).split("\">") )[0]
+    except IndexError: tagDescription = ""
+    tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0]
+
+# Check that the number of fields associated with the tag is either
+# an integer or a '.' to indicate variable number of entries.
+    if tagNumber == ".": tagNumber = "variable"
+    else:
+      try: tagNumber = int(tagNumber)
+      except ValueError:
+        print >> sys.stderr, "\nError parsing header.  Problem with info tag:", tagID
+        print >> sys.stderr, "Number of fields associated with this tag is not an integer or '.'"
+        exit(1)
+
+    if lineType == "info":
+      self.infoHeaderTags[tagID] = tagNumber, tagType, tagDescription
+      self.infoHeaderString[tagID] = self.headerLine
+    if lineType == "format":
+      self.formatHeaderTags[tagID] = tagNumber, tagType, tagDescription
+      self.formatHeaderString[tagID] = self.headerLine
+
+    return True
+
+# Check to see if the records contain information fro
[... middle of this hunk truncated in the source dump; content resumes mid-line below ...]
ues != 0 and infoType == "Flag":
+          print >> sys.stderr, "ERROR"
+          exit(1)
+        else:
+          fields = self.infoTags[tag].split(",")
+          if len(fields) != numberValues:
+            text = "Unexpected number of entries"
+            self.generalError(text, "information tag", tag)
+
+          for i in range(infoNumber):
+            try: result.append(fields[i])
+            except IndexError:
+              text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0]
+              self.generalError(text, "tag:", tag)
+      else: numberValues = 0
+
+    else:
+      text = "information field does not have a definition in the header"
+      self.generalError(text, "tag", tag)
+
+    return numberValues, infoType, result
+
+# Get the genotype information.
+  def getGenotypeInfo(self, sample, tag):
+    result = []
+    if self.formatHeaderTags.has_key(tag):
+      infoNumber = self.formatHeaderTags[tag][0]
+      infoType = self.formatHeaderTags[tag][1]
+      numberValues = infoNumber
+
+      if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1:
+        numberValues = 0
+        result = "."
+      else:
+        if self.genotypeFields[sample].has_key(tag):
+          if tag == "GT":
+            if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1:
+              text = "Unexpected number of characters in genotype (GT) field"
+              self.generalError(text, "sample", sample)
+
+# If a diploid call, check whether or not the genotype is phased.
+            elif len(self.genotypeFields[sample][tag]) == 3:
+              self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False
+              result.append( self.genotypeFields[sample][tag][0] )
+              result.append( self.genotypeFields[sample][tag][2] )
+            elif len(self.genotypeFields[sample][tag]) == 1:
+              result.append( self.genotypeFields[sample][tag][0] )
+          else:
+            fields = self.genotypeFields[sample][tag].split(",")
+            if len(fields) != numberValues:
+              text = "Unexpected number of characters in " + tag + " field"
+              self.generalError(text, "sample", sample)
+
+            for i in range(infoNumber): result.append(fields[i])
+    else:
+      text = "genotype field does not have a definition in the header"
+      self.generalError(text, "tag", tag)
+
+    return numberValues, result
+
+# Parse the dbsnp entry.  If the entry conforms to the required variant type,
+# return the dbsnp rsid value, otherwise ".".
+  def getDbsnpInfo(self):
+
+# First check that the variant class (VC) is listed as SNP.
+    vc = self.info.split("VC=",1)
+    if vc[1].find(";") != -1: snp = vc[1].split(";",1)
+    else:
+      snp = []
+      snp.append(vc[1])
+
+    if snp[0].lower() == "snp": rsid = self.rsid
+    else: rsid = "."
+
+    return rsid
+
+# Build a new vcf record.
+  def buildRecord(self, removeGenotypes):
+    record = self.referenceSequence + "\t" + \
+                str(self.position) + "\t" + \
+                self.rsid + "\t" + \
+                self.ref + "\t" + \
+                self.alt + "\t" + \
+                str(self.quality) + "\t" + \
+                self.filters + "\t" + \
+                self.info
+
+    if self.hasGenotypes and not removeGenotypes: record += self.genotypeString
+
+    record += "\n"
+
+    return record
+
+# Close the vcf file.
+  def closeVcf(self, filename):
+    self.filehandle.close()
+
+# Define error messages for different handled errors.
+  def generalError(self, text, field, fieldValue):
+    print >> sys.stderr, "\nError encountered when attempting to read:"
+    print >> sys.stderr, "\treference sequence :\t", self.referenceSequence
+    print >> sys.stderr, "\tposition :\t\t", self.position
+    if field != "": print >> sys.stderr, "\t", field, ":\t", fieldValue
+    print >> sys.stderr,  "\n", text
+    exit(1)
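The split chain in vcf.headerInfo() above is easiest to follow on a concrete header line. A minimal sketch in the same Python 2 style as the class; the ##INFO line itself is a made-up example:

    line = '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">'
    tag = line.split("=", 1)
    tagID = (tag[1].split("ID=", 1))[1].split(",", 1)
    tagNumber = (tagID[1].split("Number=", 1))[1].split(",", 1)
    tagType = (tagNumber[1].split("Type=", 1))[1].split(",", 1)
    tagDescription = ((tagType[1].split('Description="', 1))[1]).split('">')[0]
    print tagID[0], tagNumber[0], tagType[0], tagDescription
    # prints: DP 1 Integer Total Depth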
diff -r 000000000000 -r 9071e359b9a3 tools/vcf_tools/vcfPytools.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/vcfPytools.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,82 @@
+#!/usr/bin/python
+
+import os.path
+import sys
+
+__author__ = "alistair ward"
+__version__ = "version 0.26"
+__date__ = "february 2011"
+
+def main():
+  usage = "Usage: vcfPytools.py [tool] [options]\n\n" + \
+          "Available tools:\n" + \
+          "  annotate:\n\tAnnotate the vcf file with membership in other vcf files.\n" + \
+          "  extract:\n\tExtract vcf records from a region.\n" + \
+          "  filter:\n\tFilter the vcf file.\n" + \
+          "  intersect:\n\tGenerate the intersection of two vcf files.\n" + \
+          "  merge:\n\tMerge a list of vcf files.\n" + \
+          "  multi:\n\tFind the intersections and unique fractions of multiple vcf files.\n" + \
+          "  sort:\n\tSort a vcf file.\n" + \
+          "  stats:\n\tGenerate statistics from a vcf file.\n" + \
+          "  union:\n\tGenerate the union of two vcf files.\n" + \
+          "  unique:\n\tGenerate the unique fraction from two vcf files.\n" + \
+          "  validate:\n\tValidate the input vcf file.\n\n" + \
+          "vcfPytools.py [tool] --help for information on a specific tool."
+
+# Determine the requested tool.
+
+  if len(sys.argv) > 1:
+    tool = sys.argv[1]
+  else:
+    print >> sys.stderr, usage
+    exit(1)
+
+  if tool == "annotate":
+    import annotate
+    success = annotate.main()
+  elif tool == "extract":
+    import extract
+    success = extract.main()
+  elif tool == "filter":
+    import filter
+    success = filter.main()
+  elif tool == "intersect":
+    import intersect
+    success = intersect.main()
+  elif tool == "multi":
+    import multi
+    success = multi.main()
+  elif tool == "merge":
+    import merge
+    success = merge.main()
+  elif tool == "sort":
+    import sort
+    success = sort.main()
+  elif tool == "stats":
+    import stats
+    success = stats.main()
+  elif tool == "union":
+    import union
+    success = union.main()
+  elif tool == "unique":
+    import unique
+    success = unique.main()
+  elif tool == "test":
+    import test
+    success = test.main()
+  elif tool == "validate":
+    import validate
+    success = validate.main()
+  elif tool == "--help" or tool == "-h" or tool == "?":
+    print >> sys.stderr, usage
+    exit(0)
+  else:
+    print >> sys.stderr, "Unknown tool: ",tool
+    print >> sys.stderr, "\n", usage
+    exit(1)
+
+# If program completed properly, terminate.
+
+  if success == 0: exit(0)
+
+if __name__ == "__main__":
+  main()
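The elif chain above is a one-to-one mapping from subcommand name to a module exposing main(). A dict-free, table-driven equivalent is sketched below; this is illustrative only, not the shipped code, and it assumes the per-tool modules (annotate.py, extract.py, and so on) are importable from sys.path:

    import importlib

    TOOLS = ("annotate", "extract", "filter", "intersect", "multi", "merge",
             "sort", "stats", "union", "unique", "test", "validate")

    def dispatch(tool):
        # unknown names fail the same way the elif chain does
        if tool not in TOOLS:
            raise SystemExit("Unknown tool: %s" % tool)
        # e.g. dispatch("intersect") imports intersect.py and runs its main()
        return importlib.import_module(tool).main()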
diff -r 000000000000 -r 9071e359b9a3 tools/visualization/GMAJ.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/visualization/GMAJ.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+"""
+Script that creates a zip file for use by GMAJ
+"""
+import sys, zipfile
+
+def __main__():
+    #create a new zip file
+    out_file  = zipfile.ZipFile( sys.argv[1], "w" )
+    #add info files
+    out_file.write( sys.argv[3], "input.gmaj" ) #THIS FILE MUST BE ADDED FIRST
+    out_file.write( sys.argv[2], "input.maf" )
+    
+    #add annotation files
+    for line in open( sys.argv[4] ):
+        try:
+            #each line is expected to look like "source_path = archive_name"
+            out_file.write( *[ field.strip() for field in line.split( "=", 1 ) ] )
+        except:
+            #skip lines that do not parse into a (source path, archive name) pair
+            continue
+    out_file.close()
+
+if __name__ == "__main__": __main__()
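The loop above pairs each annotation dataset with its name inside the zip by splitting each line of sys.argv[4] on the first "=". A hypothetical annotation list file (paths and archive names invented for illustration) would look like:

    /tmp/dataset_12.dat = 0.exons.bed
    /tmp/dataset_13.dat = 0.underlays.bed

so each line becomes out_file.write("/tmp/dataset_12.dat", "0.exons.bed"), storing the Galaxy dataset in the archive under the short name that the GMAJ config file references.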
diff -r 000000000000 -r 9071e359b9a3 tools/visualization/GMAJ.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/visualization/GMAJ.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,200 @@
+<tool id="gmaj_1" name="GMAJ" version="2.0.1">
+<description>Multiple Alignment Viewer</description>
+  <command interpreter="python">GMAJ.py $out_file1 $maf_input $gmaj_file $filenames_file</command>
+  <inputs>
+      <param name="maf_input" type="data" format="maf" label="Alignment File" optional="False">
+        <validator type="metadata" check="species_chromosomes" message="Metadata missing, click the pencil icon in the history item and use the auto-detect feature to correct this issue."/>
+      </param>
+      <param name="refseq" label="Reference Sequence" type="select">
+        <option value="first" selected="true">First sequence in each block</option>
+        <option value="any">Any sequence</option>
+      </param>
+      <repeat name="annotations" title="Annotations">
+        <conditional name="annotation_style">
+          <param name="style" type="select" label="Annotation Style" help="If your data is not in a style similar to what is available from Galaxy (and the UCSC table browser), choose 'Basic'.">
+            <option value="galaxy" selected="true">Galaxy</option>
+            <option value="basic">Basic</option>
+          </param>
+          <when value="galaxy">
+            <param name="species" type="select" label="Species" multiple="False">
+              <options>
+                <filter type="data_meta" ref="maf_input" key="species" />
+              </options>
+            </param>
+            <param name="exons_file" type="data" format="bed,gff" label="Exons File" optional="True"/>
+            <param name="highlights_file" type="data" format="bed,gff" label="Highlights File" optional="True"/>
+            <param name="underlays_file" type="data" format="bed,gff" label="Underlays File" optional="True"/>
+            <param name="repeats_file" type="data" format="bed,gff" label="Repeats File" optional="True"/>
+            <param name="links_file" type="data" format="bed,gff" label="Links File" optional="True"/>
+          </when>
+          <when value="basic">
+            <param name="seq_name" label="Full Sequence Name" value="" type="text">
+              <validator type="empty_field" message="You must supply the sequence name"/>
+            </param>
+            <param name="exons_file" type="data" format="bed,gff" label="Exons File" optional="True"/>
+            <param name="highlights_file" type="data" format="bed,gff" label="Highlights File" optional="True"/>
+            <param name="underlays_file" type="data" format="bed,gff" label="Underlays File" optional="True"/>
+            <param name="repeats_file" type="data" format="bed,gff" label="Repeats File" optional="True"/>
+            <param name="links_file" type="data" format="bed,gff" label="Links File" optional="True"/>
+            <param name="offset" label="Offset" value="0" type="integer"/>
+          </when>
+        </conditional>
+      </repeat>
+      <param name="nowarn" type="drill_down" display="checkbox" hierarchy="recurse" multiple="true" label="Choose Warnings to Suppress" separator=" " help="These do not affect behavior, only suppress warning messages.">
+        <options>
+          <option name="All" value="all">
+            <option name="MAF File" value="maf">
+              <option name="Invalid MAF version (maf_version)" value="maf_version"/>
+              <option name="Skipping unsupported paragraph (maf_paragraph)" value="maf_paragraph"/>
+              <option name="Unrecognized character found in alignment (bad_char_all)" value="bad_char_all"/>
+              <option name="Skipping all reconstruction scores: no species specified (recon_noseq)" value="recon_noseq"/>
+              <option name="Skipping reconstruction scores in blocks with missing row (recon_missing)" value="recon_missing"/>
+              <option name="The first row in some blocks is not the specified reference sequence (refseq_not_fi
[... middle of this hunk truncated in the source dump; content resumes mid-line below ...]
unt}.exons.${annotation.annotation_style['exons_file'].extension}
+#end if
+#if $annotation.annotation_style['repeats_file'].dataset:
+repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}
+#end if
+#if $annotation.annotation_style['links_file'].dataset:
+links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}
+#end if
+#if $annotation.annotation_style['underlays_file'].dataset:
+underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}
+#end if
+#if $annotation.annotation_style['highlights_file'].dataset:
+highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}
+#end if
+#if $annotation.annotation_style.style == "basic":
+offset = $annotation.annotation_style['offset']
+#end if
+
+#set $seq_count = $seq_count + 1
+#end for
+#end for
+</configfile>
+    <configfile name="filenames_file">
+#for $annotation_count, $annotation in $enumerate( $annotations ):
+#if $annotation.annotation_style['exons_file'].dataset:
+$annotation.annotation_style['exons_file'] = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}
+#end if
+#if $annotation.annotation_style['repeats_file'].dataset:
+$annotation.annotation_style['repeats_file'] = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}
+#end if
+#if $annotation.annotation_style['links_file'].dataset:
+$annotation.annotation_style['links_file'] = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}
+#end if
+#if $annotation.annotation_style['underlays_file'].dataset:
+$annotation.annotation_style['underlays_file'] = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}
+#end if
+#if $annotation.annotation_style['highlights_file'].dataset:
+$annotation.annotation_style['highlights_file'] = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}
+#end if
+#end for
+</configfile>
+  </configfiles>
+  <outputs>
+    <data name="out_file1" format="gmaj.zip"/>
+  </outputs>
+<help>
+.. class:: infomark
+
+**Reference Sequence:**
+The default option, &quot;First sequence in each block&quot;, is the correct choice for the vast majority of MAF alignments.  The alternative, &quot;Any sequence&quot;, will allow you to flip the blocks to view them with any of the MAF sequences as the reference, but this is only appropriate if the file was generated by a sequence-symmetric alignment program such as TBA_.  Using &quot;Any sequence&quot; with an ordinary MAF will **not** give the same results as if that alignment had been run with a different reference sequence.
+
+.. class:: infomark
+
+**Annotation Style:**
+The default style, &quot;Galaxy&quot;, specifies one set of annotations for each species in the MAF file; it assumes that if you have, say, exons for several chromosomes of one species, they are all together in one file.  The other style, &quot;Basic&quot;, is more flexible but cumbersome: a separate set of files is specified for each sequence (e.g. chromosome), and you must fill in the full sequence name as it appears in the MAF.  The Basic style also allows you to provide a display offset that GMAJ will add to all of the position labels for that sequence.  With either style, specifying more than one set of annotations for the same sequence will result in an error message from GMAJ.
+
+----
+
+**What it does**
+
+GMAJ is an interactive viewer for MAF alignments, with support for optional annotation data.  In addition to browsing the alignments, you can select and export them according to a variety of criteria and send the output back to your Galaxy history.
+
+For detailed information on GMAJ, click here_.
+
+.. _here: /static/gmaj/docs/gmaj_readme.html
+.. _TBA: http://www.bx.psu.edu/miller_lab/
+  </help>
+</tool>
diff -r 000000000000 -r 9071e359b9a3 tools/visualization/LAJ.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/visualization/LAJ.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+
+"""
+Copies LAV file over to new file for use with LAJ
+"""
+import sys, shutil
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+shutil.copyfile(sys.argv[1],sys.argv[2])
diff -r 000000000000 -r 9071e359b9a3 tools/visualization/LAJ.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/visualization/LAJ.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,32 @@
+<tool id="laj_1" name="LAJ">
+<description>Pairwise Alignment Viewer</description>
+  <command interpreter="python">LAJ.py $maf_input $out_file1</command>
+  <inputs>
+      <param name="maf_input" type="data" format="lav" label="Alignment File" optional="False"/>
+      <param name="seq_file1" type="data" format="fasta" label="First Sequence File" optional="True"/>
+      <param name="seq_file2" type="data" format="fasta" label="Second Sequence File" optional="True"/>
+      <param name="exonfile" type="data" format="txt" label="Exon File" optional="True"/>
+      <param name="repeatfile" type="data" format="txt" label="Repeat File" optional="True"/>
+      <param name="annotationfile" type="data" format="txt" label="Annotation File" optional="True"/>
+      <param name="underlayfile" type="data" format="txt" label="Underlay File" optional="True"/>
+      <param name="highlightfile" type="data" format="txt" label="Highlight File" optional="True"/>
+  </inputs>
+  <outputs>
+    <data name="out_file1" format="laj"/>
+  </outputs>
+<help>
+You can use this tool to view a set of LAV alignments.  You may include FASTA formatted sequences for both species.
+
+For detailed information on LAJ, click here_.
+
+.. _here: http://globin.cse.psu.edu/dist/laj/
+
+Laj is a tool for viewing and manipulating the output from pairwise alignment programs such as blastz. It can display interactive dotplot, pip, and text representations of the alignments, a diagram showing the locations of exons and repeats, and annotation links to other web sites containing additional information about particular regions.
+
+.. class:: infomark
+
+**Note:** If you save output from the applet, you will need to manually refresh your history. 
+
+  </help>
+  <code file="LAJ_code.py"/>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/visualization/LAJ_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/visualization/LAJ_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,40 @@
+#post processing, add sequence and additional annotation info if available
+from urllib import urlencode
+from galaxy.datatypes.images import create_applet_tag_peek
+
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+    primary_data = out_data.items()[0][1]
+    
+    #default params for LAJ type
+    params = {
+    "alignfile1": "display?id=%s" % primary_data.id,
+    "buttonlabel": "Launch LAJ",
+    "title": "LAJ in Galaxy",
+    "posturl": "history_add_to?%s" % urlencode( { 'history_id': primary_data.history_id, 'ext': 'lav', 'name': 'LAJ Output', 'info': 'Added by LAJ', 'dbkey': primary_data.dbkey } )
+    }
+    for name,data in inp_data.items():
+        if name == "maf_input":
+            params["alignfile1"] = "display?id=%s" % data.id
+        elif name == "seq_file1" and data.state == data.states.OK and data.has_data():
+            params["file1seq1"] = "display?id=%s" % data.id
+        elif name == "seq_file2" and data.state == data.states.OK and data.has_data():
+            params["file1seq2"] = "display?id=%s" % data.id
+        elif name == "exonfile" and data.state == data.states.OK and data.has_data():
+            params["exonfile"] = "display?id=%s" % data.id
+        elif name == "repeatfile" and data.state == data.states.OK and data.has_data():
+            params["repeatfile"] = "display?id=%s" % data.id
+        elif name == "annotationfile" and data.state == data.states.OK and data.has_data():
+            params["annotationfile"] = "display?id=%s" % data.id
+        elif name == "underlayfile" and data.state == data.states.OK and data.has_data():
+            params["underlayfile"] = "display?id=%s" % data.id
+        elif name == "highlightfile" and data.state == data.states.OK and data.has_data():
+            params["highlightfile"] = "display?id=%s" % data.id
+    
+    if "file1seq1" not in params and "file1seq2" not in params:
+        params["noseq"] = "true"
+    
+    class_name = "edu.psu.cse.bio.laj.LajApplet.class"
+    archive = "/static/laj/laj.jar"
+    primary_data.peek = create_applet_tag_peek( class_name, archive, params )
+    app.model.context.add( primary_data )
+    app.model.context.flush()
diff -r 000000000000 -r 9071e359b9a3 tools/visualization/build_ucsc_custom_track.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/visualization/build_ucsc_custom_track.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+"""
+Build a UCSC genome browser custom track file
+"""
+
+import sys, os
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit( 1 )
+
+args = sys.argv[1:]
+
+out_fname = args.pop(0)
+out = open( out_fname, "w" )
+
+num_tracks = 0
+skipped_lines = 0
+first_invalid_line = 0
+while args:
+    # Suck in one dataset worth of arguments
+    in_fname = args.pop(0)
+    type = args.pop(0)
+    colspec = args.pop(0)
+    name = args.pop(0)
+    description = args.pop(0)
+    color = args.pop(0).replace( '-', ',' )
+    visibility = args.pop(0)
+    # Do the work
+    if type == "wig":
+        print >> out, '''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s''' \
+                      % ( name, description, color, visibility )
+        for i, line in enumerate( file( in_fname ) ):
+            print >> out, line,
+        print >> out
+    elif type == "bed":
+        print >> out, '''track name="%s" description="%s" color=%s visibility=%s''' \
+                      % ( name, description, color, visibility )
+        for i, line in enumerate( file( in_fname ) ):
+            print >> out, line,
+        print >> out
+    else:
+        # Assume type is interval (don't pass this script anything else!)
+        try:
+            c, s, e, st = [ int( x ) - 1 for x in colspec.split( "," ) ]
+        except:
+            try:
+                c, s, e = [ int( x ) - 1 for x in colspec.split( "," )[:3] ]
+                st = -1    #strand column is absent
+            except:
+                stop_err( "Columns in interval file invalid for UCSC custom track." )
+        
+        print >> out, '''track name="%s" description="%s" color=%s visibility=%s''' \
+                      % ( name, description, color, visibility )
+        i = 0
+        for i, line in enumerate( file( in_fname ) ):
+            line = line.rstrip( '\r\n' )
+            if line and not line.startswith( '#' ):
+                fields = line.split( "\t" )
+                if st > 0:
+                    #strand column is present
+                    try:
+                        print >> out, "%s\t%s\t%s\t%d\t0\t%s" % ( fields[c], fields[s], fields[e], i, fields[st] )
+                    except:
+                        skipped_lines += 1
+                        if not first_invalid_line:
+                            first_invalid_line = i+1
+                else:
+                    try:
+                        print >> out, "%s\t%s\t%s" % ( fields[c], fields[s], fields[e] )
+                    except:
+                        skipped_lines += 1
+                        if not first_invalid_line:
+                            first_invalid_line = i+1
+        print >> out
+    num_tracks += 1
+    
+out.close()
+
+print "Generated a custom track containing %d subtracks." % num_tracks
+if skipped_lines:
+    print "Skipped %d invalid lines starting at #%d" % ( skipped_lines, first_invalid_line )
+
+
+
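For reference, the script consumes its positional arguments in groups of seven per track after the output file name. A hypothetical invocation (file names, labels, and column numbers invented for illustration):

    build_ucsc_custom_track.py out.customtrack \
        scores.wig    wig      NA      "Scores" "Score track" 255-0-0 2 \
        hits.interval interval 1,2,3,6 "Hits"   "Hit regions" 0-0-255 1

For interval data the third argument is chromCol,startCol,endCol,strandCol (1-based, as supplied by the XML wrapper's metadata); the dashes in the color value are rewritten to commas for the UCSC track line.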
diff -r 000000000000 -r 9071e359b9a3 tools/visualization/build_ucsc_custom_track.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/visualization/build_ucsc_custom_track.xml Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,82 @@
+<tool id="build_ucsc_custom_track_1" name="Build custom track">
+  <description>for UCSC genome browser</description>    
+  <command interpreter="python">
+    build_ucsc_custom_track.py 
+      "$out_file1"
+      #for $t in $tracks
+        "${t.input.file_name}"
+        "${t.input.ext}"
+        #if $t.input.ext == "interval"
+          ${t.input.metadata.chromCol},${t.input.metadata.startCol},${t.input.metadata.endCol},${t.input.metadata.strandCol}
+        #else
+          "NA"
+        #end if
+        "${t.name}"
+        "${t.description}"
+        "${t.color}"
+        "${t.visibility}"
+      #end for
+  </command>
+  <inputs>
+    <repeat name="tracks" title="Track">
+      <param name="input" type="data" format="interval,wig" label="Dataset"/>
+      <param name="name" type="text" size="15" value="User Track">
+        <validator type="length" max="15"/>
+      </param>
+      <param name="description" type="text" value="User Supplied Track (from Galaxy)">
+        <validator type="length" max="60"/>
+      </param>
+   <param label="Color" name="color" type="select">
+   <option selected="yes" value="0-0-0">Black</option>
+   <option value="255-0-0">Red</option>
+   <option value="0-255-0">Green</option>
+   <option value="0-0-255">Blue</option>
+   <option value="255-0-255">Magenta</option>
+   <option value="0-255-255">Cyan</option>
+   <option value="255-215-0">Gold</option>
+   <option value="160-32-240">Purple</option>
+   <option value="255-140-0">Orange</option>
+   <option value="255-20-147">Pink</option>
+   <option value="92-51-23">Dark Chocolate</option>
+   <option value="85-107-47">Olive green</option>
+   </param>
+   <param label="Visibility" name="visibility" type="select">
+   <option selected="yes" value="1">Dense</option>
+   <option value="2">Full</option>
+   <option value="3">Pack</option>
+   <option value="4">Squish</option>
+   <option value="0">Hide</option>
+   </param>
+    </repeat>
+  </inputs>
+ <outputs>
+   <data format="customtrack" name="out_file1" />
+ </outputs>
+    <tests>
+        <!--TODO: add a 2nd test here that includes 2 tracks -->
+        <test>
+       <param name="input" value="customTrack1.bed" />
+       <param name="name" value="User Track" />
+       <param name="description" value="User Supplied Track (from Galaxy)" />
+       <param name="color" value="0-0-0" />
+       <param name="visibility" value="1" />
+       <output name="out_file1" file="build_ucsc_custom_track_out1.customtrack" />
+        </test>
+ </tests>
+<help>
+
+.. class:: infomark
+
+This tool allows you to build custom tracks using datasets in your history for the UCSC genome browser. You can view these custom tracks on the UCSC genome browser by clicking on **display at UCSC main/test** link in the history panel of the output dataset.
+
+-----
+
+.. class:: warningmark
+
+Please note that this tool requires **all input datasets (tracks) to have the same genome build**. The tool throws an error when this requirement is not met. You may then have to choose a valid dataset or remove invalid tracks.
+
+</help>
+
+<code file="build_ucsc_custom_track_code.py" />
+  
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 9071e359b9a3 tools/visualization/build_ucsc_custom_track_code.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/visualization/build_ucsc_custom_track_code.py Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,21 @@
+# runs after the job (and after the default post-filter)
+
+# Older py compatibility
+try:
+    set()
+except:
+    from sets import Set as set
+
+def validate_input( trans, error_map, param_values, page_param_map ):
+    dbkeys = set()
+    tracks = param_values['tracks']
+    for track in tracks:
+        if track['input']:
+            dbkeys.add( track['input'].dbkey )
+    if len( dbkeys ) > 1:
+        # FIXME: Should be able to assume error map structure is created
+        if 'tracks' not in error_map:
+            error_map['tracks'] = [ dict() for t in tracks ]
+        for i in range( len( tracks ) ):
+            error_map['tracks'][i]['input'] = \
+                "All datasets must belong to same genomic build"
\ No newline at end of file
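A quick way to see what this validation hook does, illustrative only: FakeData stands in for a Galaxy dataset object, and the trans and page_param_map arguments are unused by the check, so placeholders suffice.

    # Two tracks with different dbkeys should trigger the per-track error.
    class FakeData(object):
        def __init__(self, dbkey):
            self.dbkey = dbkey

    error_map = {}
    param_values = {'tracks': [{'input': FakeData('hg18')},
                               {'input': FakeData('mm9')}]}
    validate_input(None, error_map, param_values, {})
    print error_map['tracks'][0]['input']
    # prints: All datasets must belong to same genomic build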