# HG changeset patch # User osiris_phylogenetics # Date 1394565553 25200 # Node ID 5b9a38ec4a39ab77060794484d6ef2e06e6809a7 First commit of old repositories diff -r 000000000000 -r 5b9a38ec4a39 alignment/alignment.tool_conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/alignment.tool_conf Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,24 @@ +<<<<<<< /home/galaxy/galaxy-dist/tools/osiris/alignment/alignment.tool_conf +
+======= +
+>>>>>>> /tmp/alignment.tool_conf~other.3Q5xSY +
+ diff -r 000000000000 -r 5b9a38ec4a39 alignment/fasconcat.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/fasconcat.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,37 @@ +#!/usr/bin/perl + +use strict; + +my $fasconcatPath = '/home/galaxy/galaxy_dist/tools/fasconcat/FASconCAT_v1.0.pl'; + +my $outputFormat = $ARGV[0]; +my $limit = $ARGV[1]; +my $outFormat; +my @inputFiles; + +for(my $i = 2; $i <= $limit; $i++) { + $inputFiles[$i] = " -f "; + $inputFiles[$i] = $inputFiles[$i].$ARGV[$i]; +} + +if($outputFormat == "0") { + $outFormat = ""; +} +elsif($outputFormat == "1") { + $outFormat = " -p -p"; +} +else { + $outFormat = " -n -n"; +} + +my $run = qx/$fasconcatPath -s -i "@inputfiles" $outFormat /; + +if($outputFormat == "0") { + qx/cp FcC_smatrix.fas output/; +} +elsif($outputFormat == "1") { + qx/cp FcC_smatrix.phy output/; +} +else { + qx/cp FcC_smatrix.nex output/; +} diff -r 000000000000 -r 5b9a38ec4a39 alignment/fasconcat.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/fasconcat.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,59 @@ + + Appends all input sequence files into one file + fasconcat.pl $outputFormat $totalNum + #for $file in $sourceFiles + ${file.input} + #end for + + + + + + + + + + + + + + + + + + +**What it does** + +FASConcat appends all input sequences into one file. + +------ + +**Inputs** + +FASTA, Phylip, NEXUS + +------ + +**Outputs** + +FASTA, Phylip, NEXUS + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the +osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this +tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/fasta_concatenate_by_species.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/fasta_concatenate_by_species.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,40 @@ +#!/usr/bin/env python +#Dan Blankenberg +""" +Takes a Multiple Alignment FASTA file and concatenates +sequences for each species, resulting in one sequence +alignment per species. +""" + +import sys, tempfile +from galaxy import eggs +from galaxy.tools.util.maf_utilities import iter_fasta_alignment +from galaxy.util.odict import odict + +def __main__(): + input_filename = sys.argv[1] + output_filename = sys.argv[2] + species = odict() + cur_size = 0 + for components in iter_fasta_alignment( input_filename ): + species_not_written = species.keys() + for component in components: + if component.species not in species: + species[component.species] = tempfile.TemporaryFile() + species[component.species].write( "-" * cur_size ) + species[component.species].write( component.text ) + try: + species_not_written.remove( component.species ) + except ValueError: + #this is a new species + pass + for spec in species_not_written: + species[spec].write( "-" * len( components[0].text ) ) + cur_size += len( components[0].text ) + out = open( output_filename, 'wb' ) + for spec, f in species.iteritems(): + f.seek( 0 ) + out.write( ">%s\n%s\n" % ( spec, f.read() ) ) + out.close() + +if __name__ == "__main__" : __main__() diff -r 000000000000 -r 5b9a38ec4a39 alignment/fasta_concatenate_by_species.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/fasta_concatenate_by_species.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,89 @@ + + FASTA alignment by species + fasta_concatenate_by_species.py $input1 $out_file1 + + + + + + + + + + + + + +**What it does** + +This tools attempts to parse FASTA headers to determine the species for each sequence in a multiple 
FASTA alignment. +It then linearly concatenates the sequences for each species in the file, creating one sequence per determined species. + +------- + +**Example** + +Starting FASTA:: + + >hg18.chr1(+):10016339-10016341|hg18_0 + GT + >panTro2.chr1(+):10195380-10195382|panTro2_0 + GT + >rheMac2.chr1(+):13119747-13119749|rheMac2_0 + GT + >mm8.chr4(-):148269679-148269681|mm8_0 + GT + >canFam2.chr5(+):66213635-66213637|canFam2_0 + GT + + >hg18.chr1(-):100323677-100323679|hg18_1 + GT + >panTro2.chr1(-):101678671-101678673|panTro2_1 + GT + >rheMac2.chr1(-):103154011-103154013|rheMac2_1 + GT + >mm8.chr3(+):116620616-116620618|mm8_1 + GT + >canFam2.chr6(+):52954092-52954094|canFam2_1 + GT + + + +becomes:: + + >hg18 + GTGT + >panTro2 + GTGT + >rheMac2 + GTGT + >mm8 + GTGT + >canFam2 + GTGT + + +.. class:: warningmark + + This tool will only work properly on files with Galaxy style FASTA headers. + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at +bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please +consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/gap-rem.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/gap-rem.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,57 @@ + + Removes gap in sequences + seqfill.pl $file $question_mark $hyphen $N $usePart $pfile + + + + + + + + + + + + + + + +**What it does** +Sequence Gap Remover + +Gaps across all species are introduced by phylocatenator when selecting pre-aligned datasets + +Takes an input phylip file and removes any specified gap characters that exist in the same columns of containing sequences. 
+ +------ + +**Inputs** + +Phylip + +------ + +**Outputs** + +Text file + +------ + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at +bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider +citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/gblocks.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/gblocks.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,30 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; +#gblocks.pl [fasta file] + +my $infile=shift(@ARGV); +my $datatype=shift(@ARGV); +my $gaps=shift(@ARGV); +my $size=shift(@ARGV); +my $outfileloc=shift(@ARGV); +my $htmlfileloc=shift(@ARGV); + + + + +##For debugging command line pass, uncomment next +#for (my $i=0; $i < @ARGV; $i++){ +# print "Parameter #$i ".$ARGV[$i]."\n\n"; +#} + +system "Gblocks $infile $datatype $gaps -b4=$size"; + +#Gblocks requires output from $input.fas to be written to $input.fas-gb +#Copy that file to gout where galaxy expects to find the output +my $outfile = $infile."-gb"; +my $htmlfile = $outfile.".htm"; +system "cat $outfile > $outfileloc"; +system "cat $htmlfile > $htmlfileloc"; +exit; diff -r 000000000000 -r 5b9a38ec4a39 alignment/gblocks.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/gblocks.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,93 @@ + + Convert Aligned FASTA to phylip Extended + + gblocks.pl $input $datatype $gaps $Block $out_file $html_file > $screen + + + + + + + + + + + + + + + + + + + + + +**What it does** + +About GBlocks + +Version 0.91b, January 2002 + +Copyright Jose Castresana + +Gblocks is a computer program written in ANSI C language that eliminates poorly 
aligned +positions and divergent regions of an alignment of DNA or protein sequences. These +positions may not be homologous or may have been saturated by multiple substitutions and it +is convenient to eliminate them prior to phylogenetic analysis. Gblocks selects blocks in a +similar way as it is usually done by hand but following a reproducible set of conditions. +The selected blocks must fulfill certain requirements with respect to the lack of large +segments of contiguous nonconserved positions, lack of gap positions and high conservation +of flanking positions, making the final alignment more suitable for phylogenetic analysis. +Gblocks outputs several files to visualize the selected blocks. The use of a program such +as Gblocks reduces the necessity of manually editing multiple alignments, makes the +automation of phylogenetic analysis of large data sets feasible and, finally, facilitates +the reproduction of the alignments and subsequent phylogenetic analysis by other +researchers. Gblocks is very fast in processing alignments and it is therefore highly +suitable for large-scale phylogenetic analyses. + +Several parameters can be modified to make the selection of blocks more or less stringent. +In general, a relaxed selection of blocks is better for short alignments, whereas a +stringent selection is more adequate for longer ones. Be aware that the default options of +Gblocks are stringent. + +------ + +**Inputs** + +FASTA + +------ + +**Outputs** + +Phylip extended + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at +bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider +citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Talavera, G., and Castresana, J. (2007). Improvement of phylogenies after removing +divergent and ambiguously aligned blocks from protein sequence alignments. Systematic +Biology 56, 564-577. + +Castresana, J. (2000). Selection of conserved blocks from multiple alignments for their +use in phylogenetic analysis. Molecular Biology and Evolution 17, 540-552. + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/jmodeltest.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/jmodeltest.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,57 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Cwd; + +my $jmodeltest_path = '/home/galaxy/pkgs/jmodeltest'; +my $jmodeltest_tool = '/home/galaxy/galaxy-dist/tools/jmodeltest'; + +my $galaxyPath = getcwd(); + +my $input = $ARGV[0]; +my $likelihoodStyle = $ARGV[1]; +my $likelihood = $ARGV[2]; +my $criterion = $ARGV[3]; +my $extension = $ARGV[4]; + +# open increment file +open my $file, '<', $jmodeltest_tool."\/increment.txt"; + my $increment = <$file>; + $increment = int($increment); +close $file; + +# get the current increment +my $temp = $increment; + +# update the increment +open(UPDATE, '>'.$jmodeltest_tool."\/increment.txt"); + $increment = $increment + 1; + print UPDATE $increment; +close(UPDATE); + +chdir("$jmodeltest_path"); + +if($likelihoodStyle eq "-t") { + #only need to copy input file + qx/cp $input input.$temp.$extension/; + + #print qx/ls/; + + qx/java -jar jModelTest.jar -d input.$temp.$extension $likelihoodStyle $likelihood -$criterion > $galaxyPath\/output.txt 2> $galaxyPath\/err_log.txt/; + qx/rm input.$temp.*/; +} +elsif($likelihoodStyle eq "-u") { + #copy input file + qx/cp $input input.$temp.$extension/; + #copy likelihood tree + qx/cp $likelihood likelihood.$temp.tre/; + qx/java -jar jModelTest.jar -d input.$temp.$extension $likelihoodStyle likelihood.$temp.tre 
-$criterion > $galaxyPath\/output.txt 2> $galaxyPath\/err_log.txt/; + + # clean up + qx/rm input.$temp.*/; + qx/rm likelihood.$temp.tre/; + +} + +chdir("$galaxyPath"); \ No newline at end of file diff -r 000000000000 -r 5b9a38ec4a39 alignment/jmodeltest.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/jmodeltest.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,101 @@ + + Statistical selection of best-fit models of nucleotide substitution + + #if $likelihoodInput.likelihoodStyle == "-t": + jmodeltest.pl $input -t ${likelihoodInput.likelihoodPARAM} $criterion $inputType + #elif $likelihoodInput.likelihoodStyle == "-u": + jmodeltest.pl $input -u ${likelihoodInput.likelihoodFILE} $criterion $inputType + #else: + #end if# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +"jModelTest is a new program for the statistical selection of models of nucleotide substitution based on “Phyml” (Guindon and Gascuel 2003. +A simple, fast, and accurate algorithm to estimate large phylogenies by maximum likelihood. Syst Biol. 52:696–704.). It implements 5 +different selection strategies, including “hierarchical and dynamical likelihood ratio tests,” the “Akaike information criterion,” the +“Bayesian information criterion,” and a “decision-theoretic performance-based” approach. This program also calculates the relative +importance and model-averaged estimates of substitution parameters, including a model-averaged estimate of the phylogeny." + +(Posada, 2008; see full citation below) + +------ + +**Inputs** + +Phylip, FASTA, NEXUS + +------ + +**Outputs** + +Text file + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at +bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. 
If you make extensive use of this tool in a publication, please +consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +jModelTest: Posada, D. 2008. jModelTest: Phylogenetic Model Averaging. Mol Biol Evol 25 (7): 1253-1256. + +Phyml: Guindon S and Gascuel O (2003). A simple, fast and accurate method to estimate large phylogenies by maximum-likelihood". Systematic +Biology 52: 696-704. + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/mafft.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/mafft.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,32 @@ +#!/usr/bin/perl + +my $strategy = $ARGV[0]; +my $input = $ARGV[1]; +my $output = "seqs_aligned.fasta"; + +if($strategy eq "Auto") { + my $run = qx/mafft --auto $input > $output 2>log.txt/; +} +elsif($strategy eq "FFT-NS-1") { + my $run = qx/mafft --retree 1 $input > $output 2>log.txt/; +} +elsif($strategy eq "FFT-NS-2") { + my $run = qx/mafft --retree 2 $input > $output 2>log.txt/; +} +elsif($strategy eq "FFT-NS-i") { + my $run = qx/mafft-fftnsi $input > $output 2>log.txt/; +} +elsif($strategy eq "E-INS-i") { + my $run = qx/mafft-einsi $input > $output 2>log.txt/; +} +elsif($strategy eq "L-INS-i") { + my $run = qx/mafft-linsi $input > $output 2>log.txt/; +} +elsif($strategy eq "G-INS-i") { + my $run = qx/mafft-ginsi $input > $output 2>log.txt/; +} +elsif($strategy eq "Q-INS-i") { + my $run = qx/mafft-qinsi $input > $output 2>log.txt/; +} + +print $run; diff -r 000000000000 -r 5b9a38ec4a39 alignment/mafft.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/mafft.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,79 @@ + + Multiple Sequence Alignment + + mafft + + + mafft.pl $Strategy $input1 + + + + + + + + + + + + + + + + + + + + +**What it does** + +MAFFT is a multiple sequence alignment program. It offers a range of multiple alignment methods. 
+From the MAFFT website: http://mafft.cbrc.jp/alignment/software/ + +------ + +**Inputs** + Auto (FFT-NS-1, FFT-NS-2, FFT-NS-i or L-INS-i; depends on data size). + + FFT-NS-1 (Very fast; recommended for more than 2,000 sequences; progressive method). + + FFT-NS-2 (Fast; progressive method). + + FFT-NS-i (Slow; iterative refinement method). + + E-INS-i (Very slow; recommended for less than 200 sequences with multiple conserved domains and long gaps). + + L-INS-i (Very slow; recommended for less than 200 sequences with one conserved domain and long gaps). + + G-INS-i (Very slow; recommended for less than 200 sequences with global homology). + + Q-INS-i (Extremely slow; secondary structure of RNA is considered; recommended for a global alignment of highly diverged ncRNAs with less than 200 sequences × less than 1,000 nucleotides). + +------ + +**Output** + +FASTA + +------ + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +MAFFT: Katoh, Toh 2008 (Briefings in Bioinformatics 9:286-298) +Recent developments in the MAFFT multiple sequence alignment program. 
+ + diff -r 000000000000 -r 5b9a38ec4a39 alignment/mview.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/mview.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,12 @@ +#!/usr/bin/perl + +my $input = $ARGV[0]; +my $dna = $ARGV[1]; + +if ($dna eq 'dna'){ + $dna = '-DNA'; +}else{ + $dna = ''; +} +my $run = qx/mview -in pearson $dna -bold -coloring group -html head $input/; +print $run; diff -r 000000000000 -r 5b9a38ec4a39 alignment/mview.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/mview.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,59 @@ + + View multiple sequence alignment in HTML + mview + mview.pl $input $dna > $output + + + + + + + + + + + + +**What it does** + +From the MView website: http://bio-mview.sourceforge.net/ + +MView reformats the results of a sequence database search or a multiple alignment adding optional HTML markup to control coloring and webpage layout. + +------ + +**Inputs** + +FASTA + +------ + +**Outputs** + +HTML + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider +citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Citation for MView: + +Brown, N.P., Leroy C., Sander C. (1998). MView: A Web compatible database search or multiple alignment viewer. Bioinformatics. 14(4):380-381. + + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/phylocatenator.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phylocatenator.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,413 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; +#phylocatenator.pl +#Written by Todd H. 
Oakley UCSB +#Version 1.0 May, 2012 +# +#phyloconcatenator.pl [infile] [also requires arguments 1-8 detailed below] +# +#Version 1.0.1 Sept 2012 -- fixed a bug that led sometimes to species with no data being retained. +#This bug would not affect the data that was retained, so if raxml ran all was fine. However, +#raxml will not run with species that have completely missing data. Added extra section to remove +#those species. + +#For debugging command line pass, uncomment next +#for(my $i=0; $i < @ARGV; $i++){ +# print "Arg $i: $ARGV[$i] \n\n"; +#} +#exit; + +die "Check arguments" unless @ARGV == 9; + +#Obtain Arguments +my $infile1=shift(@ARGV); #0 input file +my $mingf_persp=shift(@ARGV); #1 minimum number of genefamiles to keep species +my $minsp_pergf=shift(@ARGV); #2 minimum number of species to keep genefamily +my $min_gf_len =shift(@ARGV); #3 minimum gene family length +my $speciesfile=shift(@ARGV); #4 optional species file 'None' if false +my $modelsfile=shift(@ARGV); #5 models file +my $outfile=shift(@ARGV); #6 data outfile +my $partfile=shift(@ARGV); #7 partition file +my $htmlfile=shift(@ARGV); #8 html file + +#Open 2 output files +open (OUT, ">$outfile") or die "Cannot create $outfile \n"; +open (PART, ">$partfile") or die "Cannot create $partfile\n"; +open (TABLE, ">$htmlfile") or die "Cannot create $htmlfile\n"; + +my %HoAdatatable; +my %HoGF; +my %lengthof; +my %ModelFor; +my $line = ""; + +my @specieslist; +my @genelist; +my @nsp; +my $nsl=1; +my @uniquemodels; +my @nullmodels; +my @retainedspecies; + +# read file with species +unless($speciesfile eq 'None'){ + open SPFILE, $speciesfile or die "ERROR: Cannot open file $speciesfile\n\n"; + $nsl=0; +while () { + chomp; + my $currentinput = "$_"; + if($currentinput =~m /\w/){ #must have a word otherwise empty + push(@nsp, $currentinput); + }else{ + die "ERROR: Fewer than 4 species meet your specified criteria.\n"; + } +# if(@nsp < 4){ +# die "ERROR: Species file must have more than 4 species.\n"; +# } 
+} +#print "\n\n"; +} #end of unless + +#Determine models used for each partition, make hash +unless($modelsfile eq 'None'){ + open MODFILE, $modelsfile or die "ERROR: Cannot open file $modelsfile\n\n"; + while () { + chomp; + my $currentinput = "$_"; + if($currentinput =~m /\t/){ #must have a tab otherwise wrong file format + if($currentinput =~m /\t\t/){ + print OUT "ERROR: file contains 2 tabs in a row. Check phytab format.\n"; + die; + }else{ + my @genemodel = split(/\t/, $currentinput); + my $genefamily=$genemodel[0]; + my $curmodel = $genemodel[1]; + if (exists $ModelFor{$genefamily}) { + print OUT "ERROR: Model specification for $genefamily is duplicated\n"; + die; + }else{ + $ModelFor{$genefamily}=$curmodel; + push(@uniquemodels,$curmodel); + } + } + }else{ + die "ERROR: Model LUT must be genefamily\tmodel and contain no blank lines\n"; + } + } +} +@uniquemodels = uniq(@uniquemodels); #remove redundant models - uniq is subroutine at end of script + +#Now check that models are valid raxml models NOT DONE YET +checkraxmlmodel(); + +#INPUT ALL DATA +open(INFILE1, $infile1); +foreach $line() { + chop($line); + my $getline = $line; + my @column = split(/\t/, $getline); + my $species = $column[0]; + my $genefamily = $column[1]; + my $genename = $column[2]; + my $sequence = $column[3]; + + if (exists $HoAdatatable{$species}{$genefamily}) { + print OUT "ERROR: $species $genefamily is duplicated\n"; + die; + }else{ + $HoAdatatable{$species}{$genefamily} = $sequence; + $HoGF{$genefamily}{$species}=$sequence; + } +} + +#If no models file selected, set every gene family to GTR model +if($modelsfile eq 'None'){ + foreach my $gfkey (sort keys %HoGF){ + $ModelFor{$gfkey} = 'GTR'; + } + push(@uniquemodels, 'GTR'); +} + +#First, keep all species with enough total partitions present +foreach my $specieskey (sort keys %HoAdatatable) +{ + #Count species with minimum gfs + my $ngf_persp=0; + foreach my $genefamilykey (sort keys %{$HoAdatatable{$specieskey}}) + { + $ngf_persp++; 
+ } + unless($ngf_persp < $mingf_persp){ #too few genes for this species, delete the sp + if($nsl){ #No species list supplied, push all species into list + push(@specieslist,$specieskey); + }else{ + #See if current specieskey is in inputted species list + my $nsp; + foreach $nsp(@nsp){ + if (index($nsp,$specieskey) ge 0){ + push(@specieslist,$specieskey); + } + } + } + } +} +print OUT "\n"; +unless (@specieslist){ + print OUT "ERROR: No species with more than $mingf_persp genes\n"; + die; +} +if(@specieslist < 4){ + print OUT "ERROR: Less than 4 species with more than $mingf_persp genes\n"; + die; +} + + +my $oldgenelen = 0; +my $currentseqlen = 0; +foreach my $gfkey (sort keys %HoGF) +{ + $oldgenelen=0; + #Count gfs with minimum species + my $nsp_pergf=0; + for(my $j=0; $j<@specieslist; $j++){ + if(exists $HoGF{$gfkey}{$specieslist[$j]}){ + $nsp_pergf++ ; ###if exists $HoGF{$gfkey}{$specieslist[$j]}; + + + ##get length of gene and check it is consistent + if(exists $lengthof{$gfkey}){ + $currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]}); + # $lengthof{$gfkey} = $currentseqlen; + if($currentseqlen == $oldgenelen){ + #$oldgenelen = $lengthof{$gfkey}; + #okay + }else{ + print OUT "ERROR: $specieslist[$j] $gfkey sequences ". + "different lengths than previous. Sequences must be aligned. If ". + "sequences are aligned, check that the line ". + "does not have an extra data column before the ". + "sequence.\n\n"; + die "ERROR: $specieslist[$j] $gfkey sequences ". + "different lengths than previous. Sequences must be aligned. If ". + "sequences are aligned, check that the line ". + "does not have an extra data column before the ". 
+ "sequence.\n\n"; + } + }else{ + $currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]}); + if($currentseqlen == 0){ + die "ERROR: Zero length sequence in file.\n" + } + $lengthof{$gfkey} = $currentseqlen; + $oldgenelen = $lengthof{$gfkey}; + } + } + } + if($nsp_pergf < $minsp_pergf){ + #too few species for this gene family + }else{ + if(exists $lengthof{$gfkey}){ + if($lengthof{$gfkey}>$min_gf_len){ + push(@genelist,$gfkey); + } + } + } +} +if (@genelist==0){ + print OUT "ERROR: No gene families/partitions meet the specified criteria\n"; + die "ERROR: No gene families/partitions meet the specified criteria\n"; +} + +#Now must delete species that lack any *retained* partitions, ie some species may be all missing +foreach(@specieslist) +{ + my $curspecies=$_; + #Count species with minimum gfs + my $ngf_persp=0; + my $nonmissing=0; + foreach (@genelist) + { + if (exists $HoAdatatable{$curspecies}{$_}){ + my $cursequence = $HoAdatatable{$curspecies}{$_}; + $cursequence =~ s/\?//g; + $cursequence =~ s/\-//g; + #Will not remove N's assumes those are not missing data. Revisit? + my $curlen = length($cursequence); + $nonmissing = $nonmissing + $curlen; + } + } + if($nonmissing > 0) #must remove species as it contains none of the genes retained + { + print "$curspecies has $nonmissing Non-Missing characters\n"; + push(@retainedspecies, $curspecies); + } +} +@specieslist=@retainedspecies; + +#Remove blankspecies from +#print phylip file +#calculate n characters +my $nchar=0; #total characters +my $ncharPs =0; #start count for characters in current partition +my $ncharPe =0; #end count for characters in current partition + +#First count total characters for first line +for(my $k=0; $k<@genelist; $k++){ + if (exists $lengthof{$genelist[$k]}){ #check that length was calculated + $nchar = $nchar + $lengthof{$genelist[$k]}; + }else{ + die "ERROR: $genelist[$k] LENGTH MISSING\n"; + } +} +print OUT @specieslist." 
".$nchar."\n"; +htmlheader(); + +#Need to determine gene list order, which will change due to partitioning +#then write header line of gene names, hopefully in correct order +#print TABLE ""; #Blank line in species column +print TABLE "Partition:"; +for(my $part=0; $part < @uniquemodels; $part++){ + for(my $k=0; $k<@genelist; $k++){ + #First check if current gf matches current partition + if(exists $ModelFor{$genelist[$k]}){ + if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ + if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ + print TABLE "$genelist[$k]"; + } + } + } + } +} +print TABLE ""; + +#print TABLE ""; #Blank line in species column +print TABLE "Model:"; +for(my $part=0; $part < @uniquemodels; $part++){ + for(my $k=0; $k<@genelist; $k++){ + #First check if current gf matches current partition + if(exists $ModelFor{$genelist[$k]}){ + if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ + if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ + print TABLE "$uniquemodels[$part]"; + } + } + } + } +} +#End of htmlheader printing + +for(my $j=0; $j<@specieslist; $j++){ + print OUT "$specieslist[$j]\t"; + print TABLE " + + $specieslist[$j]"; + for(my $part=0; $part < @uniquemodels; $part++){ + for(my $k=0; $k<@genelist; $k++){ + #First check if current gf matches current partition + if(exists $ModelFor{$genelist[$k]}){ + if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ + if (exists $HoAdatatable{$specieslist[$j]}{$genelist[$k]}){ + print OUT $HoAdatatable{$specieslist[$j]}{$genelist[$k]}; + print TABLE ""; + $ncharPe = $ncharPe + $lengthof{$genelist[$k]}; + }else{ + if (exists $lengthof{$genelist[$k]}){ + for(my $gap=0; $gap<$lengthof{$genelist[$k]}; $gap++){ + print OUT "?"; + } + $ncharPe = $ncharPe + $lengthof{$genelist[$k]}; + print TABLE ""; + #print TABLE "$genelist[$k]"; + }else{ + die "ERROR: BUG!! $genelist[$k] LENGTH MISSING\n"; + } + } + } + }else{ + die "ERROR: $genelist[$k] is not assigned a model. 
Check model LUT input.\n"; + } + + } + if($j==0){ #print partitions first time through gene lists + if(($ncharPe==0) || ($ncharPs > $ncharPe)){ + push(@nullmodels, $uniquemodels[$part]); + #print "NOTE: no partitions under the model $uniquemodels[$part] made final dataset\n"; + }else{ + print PART "$uniquemodels[$part], $uniquemodels[$part] = $ncharPs - $ncharPe \n"; + } + $ncharPs=$ncharPe + 1; + } + } + print TABLE ""; + print OUT "\n"; +} +if($ncharPe/@genelist != $nchar){ + # Have to account for multiple times through gene lists -- ncharPe is summed multiple times but printed once + #die "ERROR: BUG!! Last partition number doesn't match total n characters\n"; +} + +#print Statistics to screen can be redirected for Log File +print "\nSPECIES:\n"; +unless($speciesfile eq 'None'){ + print "Used species file to select species. \n"; +} +print "Number of species with $mingf_persp or more genefamilies/partitions: ".@specieslist."\n"; +print "Species list: @specieslist\n\n\n"; +print "\nPARTITIONS/GENE FAMILIES:\n"; +print "Number genefamilies/partitions longer than $min_gf_len characters and present in at least $minsp_pergf genefamilies/partitions: ".@genelist."\n"; +for(my $i=0; $i < @genelist; $i++){ + print "$genelist[$i] Length:$lengthof{$genelist[$i]}\n"; +} + + #Printing Model stats +print "\nMODELS:\n"; +if($modelsfile eq 'None'){ + print "All partitions set to GTR (no model LUT supplied)\n\n"; +}else{ + print "A LUT of models was supplied.\n"; + print "The following models were present in the model LUT file:\n"; + print join(" ", @uniquemodels), "\n"; + print "\n"; + for(my $i=0; $i < @nullmodels; $i++){ + print "NOTE: no partitions under the model $nullmodels[$i] made final dataset\n"; + } +} +close PART; +close OUT; +close SPFILE; +close TABLE; + +sub uniq { + return keys %{{ map { $_ => 1 } @_ }}; +} + +sub checkraxmlmodel { + my @raxmlmodels = ("DNA","BIN","MULTI","DAYHOFF", "DCMUT", "JTT", "MTREV", "WAG", "RTREV", "CPREV", "VT", "BLOSUM62", "MTMAM", "LG", 
"GTR", "MTART", "MTZOA", "FLU","PMB", "HIVB","HIVW","JTTDCMUT"); + return(1); #Not yet implemented +} +sub htmlheader { + print TABLE ' + + + Dataset Presences and Absences + + +
+ + "; + print TABLE ""; + } + print TABLE " + "; +} +sub htmltail { +print TABLE ' + +'; +} diff -r 000000000000 -r 5b9a38ec4a39 alignment/phylocatenator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phylocatenator.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,130 @@ + + Produces concatenated sequence file from phytab file of aligned sequences + + phylocatenator.pl $input1 $genes $species $mingene $species_file $models_file $out_file1 $partition_file $html_file > $phylocat_log + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool produces a concatenated data set for phylogenetics when not all genes are sampled for all species. + +------ + +**Basic Example** + +The input data must be in phytab column format. Column 1 is species name, C2 is genefamily, C3 individual gene name, C4 is sequence. +Sequences of each gene family must be aligned:: + + species1 gene1 genenameA acgttagcgcgctatagc + species2 gene1 genenameB acgttag--cgctataaa + species3 gene1 genenameC acgttagcgcgctatagc + species4 gene1 genenameD acgttagcgcgctatagc + species1 gene2 genenameE --gttagtttgcta + species3 gene2 genenameF gtgttagtttgcta + +Two variables are $gene and $species. These set thresholds for +inclusion of data. $species is the minimum number of species that +contain a particular gene. $gene sets a minimum number of gene families +that a species must have to be included in the dataset. + +Running phylocatenator on the above data with 0 for genes and 0 for species yields:: + + 4 32 + species1 acgttagcgcgctatagc--gttagtttgcta + species2 acgttag--cgctataaa?????????????? + species3 acgttagcgcgctatagcgtgttagtttgcta + species4 acgttagcgcgctatagc?????????????? + +**Optional Functionality** + +I. You may enter a list of species. +Species not in this list will not be written to the output file. 
+For example, a species list of:: + + species1 + species2 + + +Would change the above output to:: + + species1 acgttagcgcgctatagc--gttagtttgcta + species2 acgttag--cgctataaa?????????????? + +II. Table of partition models + +You may enter a table of models for each gene family/partition. Phylocatenator will then sort all the data to put all data +for the same models together. It will then create the appropriate partition file, which will specify each model in raxml. +Currently, it is only possible to partition data into valid raxml models. + +The format is a tab-delimited file as follows:: + + gene1 WAG + gene2 JTT + gene3 DNA + gene4 WAG + +Valid models include the following:: + + BIN = binary morphological data + MULTI = multistate morphological data + DNA = DNA data + WAG = one of several protein models listed in raxml help documents + +III. Attribute + +You may enter a table with an attribute/value for each gene family/partition. Phylocatenator will then select the data based +on that value. + +The format is a tab-delimited file as follows:: + + gene1 3.1 + gene2 2.2 + gene3 0.9 + gene4 6.5 + +You can choose gene partitions based on the attribute value. +For example, if the numbers above represent rate of evolution, you could +choose to include 'slow' genes with a rate less than 2.5 + +------ + +**Additional Information** + +http://osiris-phylogenetics.blogspot.com/2012/10/phylocatenator.html +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------ + +**Citation** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +First used in this paper + +Oakley, Todd H, Joanna M Wolfe, Annie R Lindgren, and Alexander K Zaharoff. 2012. 
Phylotranscriptomics to Bring the Understudied into the Fold: Monophyletic Ostracoda, Fossil Placement, and Pancrustacean Phylogeny. Molecular Biology and Evolution. doi:10.1093/molbev/mss216. + + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_aliscorecut.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_aliscorecut.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,144 @@ +import os +import optparse +import subprocess +from multiprocessing import Pool +import shutil + +results_dir = "./data" +results = "results.data" +fasta_extension = ".afa" +alicut_prefix = "ALICUT_" +familyList = [] +galaxy_tool_dir = "/home/galaxy/bin/" +forbidden_chars = { + '(': '__rb__', + ')': '__lb__', + ':': '__co__', + ';': '__sc__', + ',': '__cm__', + '--': '__dd__', + '*': '__st__', + '|': '__pi__', + ' ': '__sp__' +} + + +def unescape(string): + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__' + } + + for key, value in mapped_chars.iteritems(): + string = string.replace(value, key) + + return string + + +def unpackData(families): + with open(families) as f: + for line in f: + seq = Sequence(line) + with open(results_dir + os.sep + seq.family + fasta_extension, "a") as p: + p.write(seq.printFASTA()) + + +class Sequence: + def __init__(self, string): + lis = string.split('\t') + self.species = lis[0] + self.family = lis[1] + self.name = lis[2] + self.header = ' '.join(lis[:-1]) + self.sequence = lis[-1] + self.string = string + + def escapedHeader(self): + string = self.header + for key, value in forbidden_chars.iteritems(): + string = string.replace(key, value) + return string + + def printFASTA(self): + return '>' + self.escapedHeader() + '\n' + self.sequence + '\n' + + +def unescapeHeader(header): + string = header + for key, value in forbidden_chars.iteritems(): + string = 
string.replace(value, key) + return string + + +def toData(text): + text = text.split('\n') + result = '' + for line in text: + if '>' in line: + line = '\n' + unescapeHeader(line.replace('>', "")) + '\t' + line = line.replace(" ", "\t") + result += line + return result[1:] # Index past the first newline char + + +def aliscore(input): + file_name = results_dir + os.sep + input + # print file_name + pop = subprocess.Popen(["perl", "-I", galaxy_tool_dir, galaxy_tool_dir + "Aliscore.02.pl", "-i", file_name]) + pop.wait() + + +def main(): + usage = """%prog [options] +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-i', '--input', + dest='families', + action='store', + type='string', + metavar="FILE", + help='Name of input sequences.') + + options, args = parser.parse_args() + + families = unescape(options.families) + + os.mkdir(results_dir) + + unpackData(families) + + list_of_files = [file for file in os.listdir(results_dir) if file.lower().endswith(fasta_extension)] + + pool = Pool() + pool.map(aliscore, list_of_files) + + alicut = "ALICUT_V2.0_modified.pl" + shutil.copy(galaxy_tool_dir + alicut, results_dir + os.sep + alicut) + os.chdir(results_dir) + pop = subprocess.Popen(["perl", "./" + alicut]) + pop.wait() + os.chdir("../") + + result = [file for file in os.listdir(results_dir) if file.startswith(alicut_prefix)] + with open(results_dir + os.sep + results, "a") as f: + for file in result: + if file.endswith(fasta_extension): + with open(results_dir + os.sep + file, "r") as r: + f.write(toData(r.read()) + "\n") + +if __name__ == '__main__': + main() diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_aliscorecut.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_aliscorecut.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,60 @@ + + Runs Aliscore then Alicut on an aligned sequence in phytab format. 
+ + aliscore and alicut + + + aliscorecut.py -i $sequence > $aliscorecut_stdout 2>&1 + + + + + + + + + + + +**What it does** + +Aliscore identifies ambiguously aligned regions of a multiple sequence alignment. Alicut deletes sites marked by aliscore. This tool combines both into one tool. + +------ + +**Inputs** + +Aligned sequences in phytab format + +------ + +**Outputs** + +phytab format + +------ + +**Additional information** + +For information on phytab format see: http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at +bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider +citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Aliscore: Kuck P, Meusemann K, Dambach J, Thormann B, von Reumont BM, et al. (2010) Parametric and non-parametric masking of randomness in sequence alignments can be improved and leads to better +resolved trees. Front Zool 7: 10. 
+ + diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_mafft.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_mafft.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,216 @@ +#!/usr/bin/env python + +import os +import optparse +import subprocess +from multiprocessing import Pool + +directory = "" +results = "results.data" +extension = ".fs" +aligned_extension = ".afa" + + +def unescape(string): + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__' + } + + for key, value in mapped_chars.iteritems(): + string = string.replace(value, key) + + return string + + +def isTabular(file): + with open(file) as f: + for line in f: + if line[0] == '>': + return False + return True + + +def toData(text): + text = text.split('\n') + result = '' + for line in text: + if '>' in line: + line = '\n' + line.replace('> ', "") + '\t' + line = line.replace(" ", "\t") + result += line + return result[1:] # Index past the first newline char + +def toDataSingle(text): + text = text.split('\n') + result = '' + for line in text: + line = line + '\n' + result += line + return result[1:] # Index past the first newline char + +def mafftauto(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft', '--auto', '--out', aln, file_name]) + +def mafft1(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft', '--retree', '1', '--out', aln, file_name]) + +def mafft2(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft', '--retree', '2', '--out', aln, file_name]) + +def maffti(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft-fftnsi', 
'--out', aln, file_name]) + +def maffteinsi(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft-einsi', '--out', aln, file_name]) + +def mafftlinsi(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft-linsi', '--out', aln, file_name]) + +def mafftginsi(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft-ginsi', '--out', aln, file_name]) + +def mafftqinsi(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft-qinsi', '--out', aln, file_name]) + + +class Sequence: + def __init__(self, string): + lis = string.split() + self.species = lis[0] + self.family = lis[1] + self.name = lis[2] + self.header = ' '.join(lis[:-1]) + self.sequence = lis[-1] + self.string = string + + def printFASTA(self): + return '> ' + self.header + '\n' + self.sequence + '\n' + + +def saveMulti(tabFile): + with open(tabFile) as f: + for line in f: + seq = Sequence(line) + with open(directory + os.sep + seq.family + extension, "a") as p: + p.write(seq.printFASTA()) + + +def saveSingle(fastaFile): + with open(fastaFile) as f: + for line in f: + with open(directory + os.sep + "fasta" + extension, "a") as p: + p.write(line) + + +def main(): + usage = """%prog [options] +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-d', '--directory', + metavar="PATH", + dest='path', + default='.', + help='Path to working directory.') + + parser.add_option( + '-i', '--in', + dest='input', + action='store', + type='string', + metavar="FILE", + help='Name of input data.') + + parser.add_option( + '-s', '--strat', + dest='strategy', + action='store', + type='string', + help='Alignement algorithm to use.') + + options, args = parser.parse_args() + + global directory + 
inputFile = unescape(options.input) + directory = unescape(options.path) + os.sep + "data" + strategy = unescape(options.strategy) + + os.mkdir(directory) + + if isTabular(inputFile): + saveMulti(inputFile) + else: + saveSingle(inputFile) + + pool = Pool() + list_of_files = [file for file in os.listdir(directory) if file.lower().endswith(extension)] + list_of_files = sorted(list_of_files) + if strategy == 'Auto': + pool.map(mafftauto, list_of_files) + + elif strategy == 'FFT-NS-1': + pool.map(mafft1, list_of_files) + + elif strategy == 'FFT-NS-2': + pool.map(mafft2, list_of_files) + + elif strategy == 'FFT-NS-i': + pool.map(maffti, list_of_files) + + elif strategy == 'E-INS-i': + pool.map(maffteinsi, list_of_files) + + elif strategy == 'L-INS-i': + pool.map(mafftlinsi, list_of_files) + + elif strategy == 'G-INS-i': + pool.map(mafftginsi, list_of_files) + + elif strategy == 'Q-INS-i': + pool.map(mafftqinsi, list_of_files) + + result = [file for file in os.listdir(directory) if file.lower().endswith(aligned_extension)] + if isTabular(inputFile): + with open(directory + os.sep + results, "a") as f: + for file in result: + with open(directory + os.sep + file, "r") as r: + f.write(toData(r.read()) + "\n") + else: + with open(directory + os.sep + results, "a") as f: + for file in result: + with open(directory + os.sep + file, "r") as r: + f.write(toDataSingle(r.read()) + "\n") + +if __name__ == '__main__': + main() + diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_mafft.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_mafft.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,86 @@ + + Multiple Sequence Alignment + + mafft + + + phytab_mafft.py -s $Strategy -i $input > $mafft_stdout 2>&1 + + + + + + + + + + + + + + + + + + + + + +**What it does** +This tool runs MAFFT alignment algorithm on when given a single-gene (fasta or phytab) or a multi-gene (phytab) dataset. +Output for single genes is fasta; output for multi-genes is phytab. 
+ +------ + +**Inputs** + +FASTA (single-gene) or phytab (single- or multi-gene). + +*Algorithm strategies:* + + Auto (FFT-NS-1, FFT-NS-2, FFT-NS-i or L-INS-i; depends on data size; may vary between gene partitions in cases of multi-gene input datasets). + + FFT-NS-1 (Very fast; recommended for more than 2,000 sequences; progressive method). + + FFT-NS-2 (Fast; progressive method). + + FFT-NS-i (Slow; iterative refinement method). + + E-INS-i (Very slow; recommended for fewer than 200 sequences with multiple conserved domains and long gaps). + + L-INS-i (Very slow; recommended for fewer than 200 sequences with one conserved domain and long gaps). + + G-INS-i (Very slow; recommended for fewer than 200 sequences with global homology). + + Q-INS-i (Extremely slow; secondary structure of RNA is considered; recommended for a global alignment of highly diverged ncRNAs with fewer than 200 sequences × fewer than 1,000 nucleotides). + +------ + +**Outputs** + +FASTA (when the input is FASTA) or phytab (when the input is phytab) + +------ + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +MAFFT: Katoh, Toh 2008 (Briefings in Bioinformatics 9:286-298) +Recent developments in the MAFFT multiple sequence alignment program. 
+ +MAFFT website: http://mafft.cbrc.jp/alignment/server/index.html + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_muscle.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_muscle.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,150 @@ +import os +import optparse +import subprocess +from multiprocessing import Pool + +directory = "" +results = "results.data" +extension = ".fs" +aligned_extension = ".afa" + + +def unescape(string): + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__' + } + + for key, value in mapped_chars.iteritems(): + string = string.replace(value, key) + + return string + + +def isTabular(file): + with open(file) as f: + for line in f: + if line[0] == '>': + return False + return True + + +def toData(text): + text = text.split('\n') + result = '' + for line in text: + if '>' in line: + line = '\n' + line.replace('> ', "") + '\t' + line = line.replace(" ", "\t") + result += line + return result[1:] # Index past the first newline char + +def toDataSingle(text): + text = text.split('\n') + result = '' + for line in text: + line = line + '\n' + result += line + return result[1:] # Index past the first newline char + +def muscle(input): + file_name = directory + os.sep + input + popen = subprocess.Popen(['muscle', "-in", file_name, "-out", file_name + aligned_extension]) # ./muscle + popen.wait() + + popen = subprocess.Popen(['pwd']) # ./muscle + popen.wait() + + +class Sequence: + def __init__(self, string): + lis = string.split() + self.species = lis[0] + self.family = lis[1] + self.name = lis[2] + self.header = ' '.join(lis[:-1]) + self.sequence = lis[-1] + self.string = string + + def printFASTA(self): + return '> ' + self.header + '\n' + self.sequence + '\n' + + +def saveMulti(tabFile): + with open(tabFile) as f: + for line in f: + seq 
= Sequence(line) + with open(directory + os.sep + seq.family + extension, "a") as p: + p.write(seq.printFASTA()) + + +def saveSingle(fastaFile): + with open(fastaFile) as f: + for line in f: + with open(directory + os.sep + "fasta" + extension, "a") as p: + p.write(line) + + +def main(): + usage = """%prog [options] +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-d', '--directory', + metavar="PATH", + dest='path', + default='.', + help='Path to working directory.') + + parser.add_option( + '-i', '--in', + dest='input', + action='store', + type='string', + metavar="FILE", + help='Name of input data.') + + options, args = parser.parse_args() + + global directory + inputFile = unescape(options.input) + directory = unescape(options.path) + os.sep + "data" + + os.mkdir(directory) + + if isTabular(inputFile): + saveMulti(inputFile) + else: + saveSingle(inputFile) + + pool = Pool() + list_of_files = [file for file in os.listdir(directory) if file.lower().endswith(extension)] + pool.map(muscle, list_of_files) + + result = [file for file in os.listdir(directory) if file.lower().endswith(aligned_extension)] + if isTabular(inputFile): + with open(directory + os.sep + results, "a") as f: + for file in result: + with open(directory + os.sep + file, "r") as r: + f.write(toData(r.read()) + "\n") + else: + with open(directory + os.sep + results, "a") as f: + for file in result: + with open(directory + os.sep + file, "r") as r: + f.write(toDataSingle(r.read()) + "\n") + +if __name__ == '__main__': + main() diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_muscle.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_muscle.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,60 @@ + + MUSCLE: Multiple sequence alignment. Input can be FASTA or phytab format. 
+ + muscle + + + phytab_muscle.py -i $data > $muscle_stdout 2>&1 + + + + + + + + + + + +**What it does** + +Uses MUSCLE to perform multiple sequence alignment on multiple gene families in parallel. + +------ + +**Inputs** + +Can take as input fasta or phytab format. Phytab allows alignment in parallel of multiple separate gene families/paralogs. See: +http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +------ + +**Outputs** + +Either a fasta file or phytab file of aligned sequences + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Runs MUSCLE 3.8 Multiple Sequence Alignment +See MUSCLE help: http://www.drive5.com/muscle/muscle_userguide3.8.html + + + + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_mview.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_mview.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,138 @@ +#!/usr/bin/env python +## usage: ./phytab_mview.py -i -d +## splits up an aligned phytab file containing multiple genes into +## individual files to run mview + +import sys, os, os.path, tempfile, shutil, re, shlex, subprocess +import optparse +from multiprocessing import Pool + +#define some variables to call later: + +directory = "" +extension = ".fs" +html_header = """ + + + + + +

PHYTAB MVIEW ALIGNMENT VIEWER

+
Select from below to view aligned sequence as HTML (left) or FASTA (right) in browser.
+
+
Species
'; + for(my $i=1; $i < @genelist+1; $i++){ +#Genes are printing in wrong order +# print TABLE "
$genelist[$i]$i
+ + + + """ +html_close = """ +

MView
+ +""" + +#define some functions to call in 'main': +# first, sanitize problematic characters +def unescape(string): + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__' + } + + for key, value in mapped_chars.iteritems(): + string = string.replace(value, key) + + return string +# next, define tabular --> fasta conversion +class Sequence: + def __init__(self, string): + lis = string.split() + self.species = lis[0] + self.family = lis[1] + self.name = lis[2] + self.header = ' '.join(lis[:-1]) + self.sequence = lis[-1] + self.string = string + + def printFASTA(self): + return '> ' + self.header + '\n' + self.sequence + '\n' + +# then define function to apply preceding conversion method to all genes +# (creates separate file for each gene) +def saveMulti(tabFile): + with open(tabFile) as f: + for line in f: + seq = Sequence(line) + with open(seq.family + extension, "a") as p: + p.write(seq.printFASTA()) + +#subroutine to write main HTML output containing valid urls to mview htmls +def resultsto_output_html(html_mainoutput,basepath): + htmllist = [f for f in os.listdir(basepath) if 'html' in f] + sortedhtmllist = sorted(htmllist) + html = open(html_mainoutput, 'w') + html.write(html_header) + for f in sortedhtmllist: + f_path = os.path.join(basepath,f) + htmllink = '

\n' + html.write(htmllink) + html.write(html_close) + html.close() + +def main(): +#the command line arguments from the xml: + """ + ##params for galaxy wrapper + $input + $dna + $output + "$output.extra_files_path" #save the htmlfiles here + """ + inputphytabfile = sys.argv[1] + dnaorprotein = sys.argv[2] + output = sys.argv[3] + extra_files_path = sys.argv[4] + + inputFile = unescape(inputphytabfile) + ##make the fasta files + saveMulti(inputFile) + + #prepare to put mview htmls into valid path + + if not os.path.isdir(extra_files_path): #make filepath for alns to go with galaxy info + os.makedirs(extra_files_path) + + # execute mview on each fasta, storing in extra_files_path as .html + list_of_fastafiles = [f for f in os.listdir(os.getcwd()) if 'fs' in f] + sortedfileorder = sorted(list_of_fastafiles) + for gene_aln in sortedfileorder: + result_htmlfile = gene_aln + '.html' + result_path = os.path.join(extra_files_path,result_htmlfile) #puts the htmls in permanent Galaxy directory + if dnaorprotein is 'dna': + cmd = subprocess.Popen(['mview','-in','pearson','-DNA','-bold','-coloring','group','-html','head', gene_aln],stdout=subprocess.PIPE) + else: + cmd = subprocess.Popen(['mview','-in','pearson','-bold','-coloring','group','-html','head', gene_aln],stdout=subprocess.PIPE) + cmd.wait() + out = cmd.communicate()[0] + + with open(result_path, 'wb') as fileout: + fileout.write(out) + ##now have # of gene htmls in extra_files_path/ + + #write main html output + resultsto_output_html(output,extra_files_path) + + +if __name__ == '__main__': + main() + diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_mview.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_mview.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,69 @@ + + View multiple phytab sequence alignments in HTML + mview + phytab_mview.py $phytabinput $dna $output "$output.extra_files_path" + + + + + + + + + + + + + + +**What it does** + +This tool uses the MVIEW package to convert an 
aligned sequences file in phytab format (ie, phytab_PRANK or phytab_MUSCLE output) to HTML for visualizing each gene's alignment directly in Galaxy. + +------ + +**Input** + +ALIGNED phytab example input (fields are tab-delimited):: + + Species_1 GeneA UniqueID AAAATGCCTA-GTC + Species_2 GeneA UniqueID AAATTGCCTA-GTG + Species_3 GeneA UniqueID AAAATGGCTAGGAC + Species_1 GeneB UniqueID TACGTAG-CTTGATCCTATAA + Species_2 GeneB UniqueID TACGTAGTCTAGATCCTATAA + Species_3 GeneB UniqueID TACGTAGTCTTGATGGTATAA + +------ + +**Output** + +links to html files for viewing multiple sequence alignments + +------ + +**Additional Information** + +For large datasets, this tool can be slow, as this version runs serially. For a +parallel version that sends different genes to different processors, contact +ucsb_phylogenetics@lifesci.ucsb.edu + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at +bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Citation for MView: +Brown, N.P., Leroy C., Sander C. (1998). MView: A Web compatible database search or multiple alignment viewer. Bioinformatics. 14(4):380-381. + + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_pal2nal.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_pal2nal.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,63 @@ +#!/usr/bin/env python + +""" +pal2nal.py +---------- +Runs pal2nal.pl on GENENAME protein/DNA file pairings. Each file with protein + information must have a corresponding file with DNA information for its + gene. 
Depends on the Splitter class from splitterforpal2nal.py, which must + be in the same path as this script Designed for use with the Galaxy + bioinformatics platform. +""" + +import os +import sys +import zipfile +import shutil +from splitter import Splitter + +__author__ = 'William Chen' +__license__ = 'BSD 2-clause' +__status__ = 'In development' + +# declare working directory +CDIR = '/home/galaxy/galaxy-dist/tools/Chen_dev/splitforpal2nal/' + +# get inputs for pal2nal.pl +protFile = sys.argv[1] # Phytab file with protein info +dnaFile = sys.argv[2] # Phytab file with DNA info +outputType = sys.argv[3] +blockonly = sys.argv[4] +nogap = sys.argv[5] +nomismatch = sys.argv[6] +codontable = sys.argv[7] +outFormat = sys.argv[8] + +# remove whatever is at the file location marked as the output, if exists +try: + os.remove(sys.argv[9]) +except OSError: + pass +# change working directory to where we want +os.chdir(CDIR) +# call Splitter from splitterforpal2nal.py and split the Phytab files into +# FASTA files +sp = Splitter(dnaFile=dnaFile, protFile=protFile) +sp.generateFasta() +# create zip file to collect pal2nal.pl output +zip = zipfile.ZipFile('myzip.zip', 'w') +# run FASTA output through pal2nal.pl and write pal2nal.pl's output into a zip +# file +# TODO: Use Popen instead of os.system +for key in sp.getProteinInfo(): + os.system('perl pal2nal.pl ' + key + 'Prot.fas ' + key + 'DNA.fas' + ' ' + + outputType + ' ' + blockonly + ' ' + nogap + ' ' + nomismatch + + ' ' + codontable + ' -output ' + outFormat + ' 1> out' + key + + ' 2>/dev/null') + zip.write('out' + key) + +# move the zip file to the output location +os.system('mv myzip.zip ' + sys.argv[9]) + +# print the output location for reference +print(sys.argv[9]) diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_pal2nal.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_pal2nal.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,96 @@ + + Convert tab-delimited multiple sequence alignment of 
proteins and nucleotides into codon alignments + phytab_pal2nal.py $alnfile $nucfile fasta $blockonly $nogap $nomismatch $codontable $format $output1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +"PAL2NAL is a program that converts a multiple sequence alignment of proteins and the corresponding DNA (or mRNA) sequences into a codon alignment. The program automatically assigns the corresponding codon sequence even if the input DNA sequence has mismatches with the input protein sequence, or contains UTRs, polyA tails. It can also deal with frame shifts in the input alignment, which is suitable for the analysis of pseudogenes. The resulting codon alignment can further be subjected to the calculation of synonymous (dS) and non-synonymous (dN) substitution rates." (PAL2NAL web server; http://www.bork.embl.de/pal2nal) + +Phytab PAL2NAL v. 14 generates codon alignments for phytab input files (phytab protein alignment and corresponding phytab DNA or mRNA alignment). + +------ + +**Inputs** + +Input file 1: An alignment of protein sequences in phytab format. +Input file 2: An alignment of corresponding (to Input file 1) DNA or mRNA sequences in phytab format. + +Introduction to phytab format: http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +------ + +**Outputs** + +The output is a zipped file of individual codon alignments for each protein in FASTA format. + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. 
If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Mikita Suyama, David Torrents, and Peer Bork (2006) +PAL2NAL: robust conversion of protein sequence alignments into the corresponding codon alignments. +Nucleic Acids Res. 34, W609-W612. + + + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_prank.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_prank.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,135 @@ +import os +import optparse +import subprocess +from multiprocessing import Pool + +directory = "" +results = "results.data" +extension = ".fs" +aligned_extension = ".afa" +output_extension = ".afa.2.fas" + + +def unescape(string): + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__' + } + + for key, value in mapped_chars.iteritems(): + string = string.replace(value, key) + + return string + + +def isTabular(file): + with open(file) as f: + for line in f: + if line[0] == '>': + return False + return True + + +def toData(text): + text = text.split('\n') + result = '' + for line in text: + if '>' in line: + line = '\n' + line.replace('>__XX__', "") + '\t' + line = line.replace("__XX__", "\t") + result += line + return result[1:] # Index past the first newline char + + +def prank(input): + file_name = directory + os.sep + input + popen = subprocess.Popen(['pwd']) + popen.wait() + popen = subprocess.Popen(['prank', "-d=" + file_name, "-o=" + file_name + aligned_extension, "-quiet"]) + popen.wait() + +class Sequence: + def __init__(self, string): + lis = string.split() + self.species = lis[0] + self.family = lis[1] + self.name = lis[2] + self.header = '__XX__'.join(lis[:-1]) #prank 
replaces space with _ so can't join with spaces like muscle does + self.sequence = lis[-1] + self.string = string + + def printFASTA(self): + return '>__XX__' + self.header + '\n' + self.sequence + '\n' + + +def saveMulti(tabFile): + with open(tabFile) as f: + for line in f: + seq = Sequence(line) + with open(directory + os.sep + seq.family + extension, "a") as p: + p.write(seq.printFASTA()) + + +def saveSingle(fastaFile): + with open(fastaFile) as f: + for line in f: + with open(directory + os.sep + "fasta" + extension, "a") as p: + p.write(line) + + +def main(): + usage = """%prog [options] +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-d', '--directory', + metavar="PATH", + dest='path', + default='.', + help='Path to working directory.') + + parser.add_option( + '-i', '--in', + dest='input', + action='store', + type='string', + metavar="FILE", + help='Name of input data.') + + options, args = parser.parse_args() + + global directory + inputFile = unescape(options.input) + directory = unescape(options.path) + os.sep + "data" + + os.mkdir(directory) + + if isTabular(inputFile): + saveMulti(inputFile) + else: + saveSingle(inputFile) + + pool = Pool() + list_of_files = [file for file in os.listdir(directory) if file.lower().endswith(extension)] + pool.map(prank, list_of_files) + result = [file for file in os.listdir(directory) if file.lower().endswith(output_extension)] + with open(directory + os.sep + results, "a") as f: + for file in result: + with open(directory + os.sep + file, "r") as r: + f.write(toData(r.read()) + "\n") + +if __name__ == '__main__': + main() diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_prank.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_prank.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,64 @@ + + PRANK: Probabilistic Alignment Kit. Input can be FASTA or phytab format. 
+ + prank + + + phytab_prank.py -i $data > $prank_stdout 2>&1 + + + + + + + + + + + +**What it does** + +PRANK is a probabilistic multiple alignment program for DNA, codon and amino-acid sequences. It's based on a novel algorithm that treats insertions correctly and avoids over-estimation of the number of deletion events. In addition, PRANK borrows ideas from maximum likelihood methods used in phylogenetics and correctly takes into account the evolutionary distances between sequences. Lastly, PRANK allows for defining a potential structure for sequences to be aligned and then, simultaneously with the alignment, predicts the locations of structural units in the sequences. From the PRANK website: http://code.google.com/p/prank-msa/wiki/PRANK?tm=6 + +------ + +**Inputs** + +FASTA or phytab file format. + +------ + +**Outputs** + +phytab file format. Introduction to phytab: +http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the +osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a +publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +PRANK was developed by Ari Löytynoja and Nick Goldman and is maintained at http://code.google.com/p/prank-msa/ + +A citation for PRANK is: +Löytynoja A, Goldman N (2008) Phylogeny-Aware Gap Placement Prevents Errors in Sequence Alignment and Evolutionary Analysis. Science 320:1632-1635. 
+ + + + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_ssr.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_ssr.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,170 @@ +#!/usr/bin/env python + +import os +import optparse +import subprocess +from multiprocessing import Pool +from functools import partial + +directory = "" +results = "results.data" +extension = ".fs" +slim_extension = ".slim" +jarpath='/home/galaxy/pkgs/SSR/SimilarSequenceRemover.jar' + +def unescape(string): + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__' + } + + for key, value in mapped_chars.iteritems(): + string = string.replace(value, key) + + return string + + +def isTabular(file): + with open(file) as f: + for line in f: + if line[0] == '>': + return False + return True + + +def toData(text): + text = text.split('\n') + result = '' + for line in text: + if '>' in line: + line = '\n' + line.replace('> ', "") + '\t' + line = line.replace(" ", "\t") + result += line + return result[1:] # Index past the first newline char + +def toDataSingle(text): + text = text.split('\n') + result = '' + for line in text: + line = line + '\n' + result += line + return result[1:] # Index past the first newline char + +def runSimilarSequenceRemover(input): + input_name = directory + os.sep + input + outfasta = input_name + slim_extension + call = subprocess.call(['java','-jar', jarpath, '-i', input_name, '-o', outfasta, 's=' + percentage, '-h', alignallornot ]) + +class Sequence: + def __init__(self, string): + lis = string.split() + self.species = lis[0] + self.family = lis[1] + self.name = lis[2] + self.header = ' '.join(lis[:-1]) + self.sequence = lis[-1] + self.string = string + + def printFASTA(self): + return '> ' + self.header + '\n' + self.sequence + '\n' + + +def saveMulti(tabFile): + 
with open(tabFile) as f: + for line in f: + seq = Sequence(line) + with open(directory + os.sep + seq.family + extension, "a") as p: + p.write(seq.printFASTA()) + + +def saveSingle(fastaFile): + with open(fastaFile) as f: + for line in f: + with open(directory + os.sep + "fasta" + extension, "a") as p: + p.write(line) + + +def main(): + usage = """%prog [options] +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-d', '--directory', + metavar="PATH", + dest='path', + default='.', + help='Path to working directory.') + + parser.add_option( + '-i', '--in', + dest='input', + action='store', + type='string', + metavar="FILE", + help='Name of input data.') + + parser.add_option( + '-a', '--alignall', + dest='alignall', + action='store', + type='string', help='t or f. t only aligns first 100 sites instead of entire seq') + + parser.add_option( + '-s', '--similarity', + dest='similarity', + action='store', + type='float', + help='Percentage similarity as cutoff (eg 0.99).') + + options, args = parser.parse_args() + + global directory + global percentage + global alignallornot + inputFile = unescape(options.input) + directory = unescape(options.path) + os.sep + "data" + percentage = str(options.similarity) + alignallornot = options.alignall + + os.mkdir(directory) + + if isTabular(inputFile): + saveMulti(inputFile) + else: + saveSingle(inputFile) + + pool = Pool() + list_of_files = [file for file in os.listdir(directory) if file.lower().endswith(extension)] + list_of_files = sorted(list_of_files) + + pool.map(runSimilarSequenceRemover, list_of_files) + + result = [file for file in os.listdir(directory) if file.lower().endswith(slim_extension)] + if isTabular(inputFile): + with open(directory + os.sep + results, "a") as f: + for file in result: + with open(directory + os.sep + file, "r") as r: + f.write(toData(r.read()) + "\n") + else: + with open(directory + os.sep + results, "a") as f: + for 
file in result: + with open(directory + os.sep + file, "r") as r: + f.write(r.read() +'\n') + +if __name__ == '__main__': + main() + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/phytab_ssr.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_ssr.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,53 @@ + + Removes redundant sequences + + java + + phytab_ssr.py -i $in -s $percentage -a $h + + + + + + + + + + + + +**What it does** + +phytab Similar Sequence Remover will take a set of sequences in either FASTA or phytab format and remove redundant sequences based on the minimum similarity percentage specified. + +------ + +**Inputs** + +FASTA, phytab + +------ + +**Outputs** + +FASTA, phytab + +------- + +**Additional Information** + +Introduction to phytab format: http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/prank.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/prank.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,15 @@ +#!/usr/bin/perl + +my $input = $ARGV[0]; +my $format = $ARGV[1]; +my $missing = $ARGV[2]; +my $output = "output"; +my $fparam; + +if($missing eq 'yes'){ + $fparam = "-F"; +}else{ + $fparam = ""; +} +my $run = qx/prank -d=$input -o=$output -f=$format $fparam/; +print $run; diff -r 000000000000 -r 5b9a38ec4a39 alignment/prank.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/prank.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,68 @@ + + Phylogeny Aware Multiple Sequence Alignment + + prank + + + prank.pl $input1 $format $missing + + + + + + + + + + + + + + + +**What it does** + +"PRANK is a probabilistic multiple alignment program for DNA, codon and amino-acid sequences. It's based on a novel +algorithm that treats insertions correctly and avoids over-estimation of the number of deletion events. In addition, +PRANK borrows ideas from maximum likelihood methods used in phylogenetics and correctly takes into account the +evolutionary distances between sequences. Lastly, PRANK allows for defining a potential structure for sequences to be +aligned and then, simultaneously with the alignment, predicts the locations of structural units in the sequences." +From the PRANK website: http://code.google.com/p/prank-msa/wiki/PRANK?tm=6 + +------ + +**Inputs** + +FASTA + +------ + +**Outputs** + +FASTA + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +PRANK was developed by Ari Löytynoja and Nick Goldman and is maintained at http://code.google.com/p/prank-msa/ + +The WebCite online citation for PRANK is available here: +http://www.webcitation.org/query.php?url=http://tinyurl.com/prank-msa&refdoi=10.1186/1471-2105-11-579 + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/profilemafft.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/profilemafft.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,9 @@ + +#!/usr/bin/perl + +my $new_sequences = $ARGV[0]; +my $existing_alignment = $ARGV[1]; +my $output = "seqs_aligned.fasta"; + +system "mafft --add $new_sequences --reorder $existing_alignment > $output 2>log.txt "; + diff -r 000000000000 -r 5b9a38ec4a39 alignment/profilemafft.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/profilemafft.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,61 @@ + + Add Sequence(s) to existing Multiple Sequence Alignment + + mafft + + + profilemafft.pl $new_sequences $existing_alignment + + + + + + + + + + + + +**What it does** + +This tool runs MAFFT profile alignment to add new sequences to an existing alignment + +------ + +**Inputs** + +FASTA -- aligned sequences +FASTA -- new sequences to add to alignment + +------ + +**Outputs** + +aligns new sequences into existing alignment outputting fasta + +------ + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +MAFFT: Katoh, Toh 2008 (Briefings in Bioinformatics 9:286-298) +Recent developments in the MAFFT multiple sequence alignment program. + +MAFFT website: http://mafft.cbrc.jp/alignment/server/index.html + + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/prottest.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/prottest.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,59 @@ + + Selection of best-fit models of protein evolution. + + prottest + + + prottest_wrapper.pl -i $input -o $output + + + + + + + + + + +**What it does** + +ProtTest is a bioinformatic tool for the selection of the most appropriate model of protein evolution (among the set of candidate models) for the data at hand. ProtTest makes this selection by finding the model with the smallest Akaike Information Criterion (AIC) or Bayesian Information Criterion (BIC) score. At the same time, ProtTest obtains model-averaged estimates of different parameters (Posada and Buckley 2004) and calculates the importance of each of these parameters. ProtTest differs from its nucleotide homolog Modeltest (Posada and Crandall 1998) in that it does not include likelihood ratio tests (many models implemented in ProtTest are not nested). + +------ + +**Inputs** + +Phylip recommended + +------ + +**Outputs** + +Text file + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at +bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please +consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +ProtTest 2.4 link: http://darwin.uvigo.es/software/prottest.html + +Citation: +Abascal F, Zardoya R, Posada, D. 2005. ProtTest: Selection of best-fit models of protein evolution. Bioinformatics: 21(9):2104-2105. + + diff -r 000000000000 -r 5b9a38ec4a39 alignment/prottest_wrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/prottest_wrapper.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,16 @@ +#!/usr/bin/perl + +use warnings; +use strict; +use Cwd; + +my $dir=getcwd(); + +#protest directory placed in main user path. Also, changed runProttest +#script to include full path of jar file +my $prottestPath='/home/galaxy/pkgs/ProtTest2.4'; + +my $input=$ARGV[1]; +my $output=$ARGV[3]; + +system "$prottestPath/runProtTest -i $input -o $output" ; diff -r 000000000000 -r 5b9a38ec4a39 alignment/seqfill.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/seqfill.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,139 @@ +#!/usr/bin/perl + +my $file = $ARGV[0]; +my $q_mark = $ARGV[1]; +my $hyphen = $ARGV[2]; +my $N = $ARGV[3]; +my $usePartFile = $ARGV[4]; +my $partFile = $ARGV[5]; + +my $out = "out.phylipnon"; # output file + +open(FILE, $file); + my @speciesNames; + my @sequenceLines; + + my @currentLineContent; + + my $i = 0; + while($currentLine = ) { + chomp($currentLine); + @currentLineContent = split(/\t/, $currentLine); + $speciesNames[$i] = $currentLineContent[0]; + $sequenceLines[$i] = $currentLineContent[1]; + $i++; + } + + my $dataInfo = $speciesNames[1]; # gets num of species and sequence length + my @numbers = split(/ /, $dataInfo); + + my $numberOfSpecies = $numbers[0]; + my $sequenceLength = $numbers[1]; + +close(FILE); + +open(OUT, '>'.$out); + my @columnData; # this will have $sequenceLength elements + for($j = 0; $j < $numberOfSpecies+2; $j++) { + for($k = 0; $k < $sequenceLength; $k++) { + $currChar 
= substr($sequenceLines[$j], $k, 1); + $columnData[$k] = $columnData[$k].$currChar; + } + } + + # mark locations that will be removed + my @flagMap; + for($i = 0; $i < $sequenceLength; $i++) { + $flagMap[$i] = 0; + } + my $index = 0; + foreach $el(@columnData) { + my $tot = 0; + my $q_mark_occur = 0; + my $hyphen_occur = 0; + my $N_occur = 0; + + if($q_mark eq "true") { + $q_mark_occur = ($el =~ tr/?//); + } + if($hyphen eq "true") { + $hyphen_occur = ($el =~ tr/-//); + } + if($N eq "true") { + $N_occur = ($el =~ tr/N//); + } + + $tot = $q_mark_occur + $hyphen_occur + $N_occur; + if($tot == $numberOfSpecies) { + $flagMap[$index] = 1; + } + $index++; + } + + my $newSequenceLength = $sequenceLength; + foreach $el(@flagMap) { + if($el == 1) { + $newSequenceLength--; + } + } + + print OUT $speciesNames[0]."\n"; + print OUT $numberOfSpecies." ".$newSequenceLength."\n"; + for($i = 2; $i < $numberOfSpecies+3; $i++) { + print OUT $speciesNames[$i]."\t"; + for($j = 0; $j < $sequenceLength; $j++) { + if($flagMap[$j] == 0) { + my $character = substr($sequenceLines[$i], $j, 1); + print OUT $character; + } + } + print OUT "\n"; + } + +close(OUT); + +my $partOut = "partOut.txt"; + +if($usePartFile eq "true") { + # update the partition file + open(PART, $partFile); + my @data; + my @ranges; + my @names; + $i = 0; + while($currentLine = ) { + @data = split(/=/, $currentLine); + $names[$i] = $data[0]; + $ranges[$i] = $data[1]; + $i++; + } + close(PART); + + my $firstFlag = 1; + open(PARTOUT, '>'.$partOut); + $j = 0; + my $newLower; + foreach $el(@ranges) { + print PARTOUT $names[$j]." = "; + @lowerUpper = split(/-/, $el); + if($firstFlag == 1) { + $newLower = $lowerUpper[0]; + $firstFlag = 0; + } + my $currUpper = $lowerUpper[1]; + my $newUpper = $currUpper; + + + + for($i = $currLower; $i < $currUpper; $i++) { + if($flagMap[$i] == 1) { + $newUpper--; + } + } + + print PARTOUT $newLower." 
- ".$newUpper."\n"; + $newLower = $newUpper + 1; + $j++; + } + close(PARTOUT); +} diff -r 000000000000 -r 5b9a38ec4a39 getdata/gb_gene_summary.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/gb_gene_summary.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,57 @@ + + Summarizes gene names in a GenBank flatfile + + genbankstrip.pl "-f"$data_file "-l"$length "-ts"$species $sp > $logfile + + + + + + + + + + + + +**What it does** + +Summarizes gene names in a genbank flatfile. + +------ + +**Inputs** + +A genbank flatfile. + +------ + +**Outputs** + +A summary of gene names, and how many species are have data available for each gene. + +------- + +**Additional Information** + +The gene names can be used to pull genes using genbankstrip. +Often, the same gene will have many names in genbank. These names can be synonymized by altering the genbankstrip.pl script. + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + + + + + + diff -r 000000000000 -r 5b9a38ec4a39 getdata/genbankstrip.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/genbankstrip.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,1708 @@ +#!/usr/bin/perl -w +# +# Modified for Osiris by THO. +# +# +# GenBankStrip.pl v2.0 +# Last modified July 25, 2005 14:29 +# (c) Olaf R.P. 
Bininda-Emonds +# +# Input: +# 1) A GenBank output file +# 2) A file containing gene names, each on a separate line +# +# Generates: +# 1) A Se-Al formatted (and optionally a nexus-formatted) datafile containing stripped sequences +# 2) A summary file giving status of each entry in GenBank file +# +# Usage: GenBankStrip.pl -f [-g] [-k] [-l] [-o] [-s] [-t] [-h] [-v] +# options: -f = file containing sequences in GenBank format +# -g = file containing specific genes to be stripped +# -i = strip a single gene +# -k = number of (longest) sequences to retain per species for a given gene (default = all) +# -l = minimum length required for all non-tRNA genes (default = none) +# -o = provide output in nexus (n) phytab (p) and/or Se-Al (s) format in addition to fasta format +# -s = only process sequences with valid species names (i.e., no species with sp. or cf. in names) +# -t|s> = minimum number of sequences (g; default = 0) or species (s; default = 0) a gene must have to be retained; both can be specified simultaneously +# -h = print this message and quit +# -v = verbose output +# -a = Accession number appears first in stripped Fasta File -- Added by THO +use strict; + +# Set default values + # Genes and sequences + my %synonym; + my %complement = ('A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A', + 'M' => 'K', 'R' => 'Y', 'W' => 'S', 'S' => 'W', 'Y' => 'R', 'K' => 'M', + 'B' => 'V', 'D' => 'H', 'H' => 'D', 'V' => 'B', 'N' => 'N', '-' => '-', '?' 
=> '?'); + my %maxLength; + + # I/O + my $gbFile; + my ($countFile, $rejectFile, $stripFile); + my $geneFile; + my %userGene; + my $singleGene; + my $keepGene = 0; + my $minLength; + my $minLengthTRNA; + my $speciesBlock = 0; + my $seqThreshold = 0; + my $speciesThreshold = 0; + my $sealPrint = 0; + my $nexusPrint = 0; + my $phytabPrint = 0; + + # Global stats + my (@globalGeneList, %globalGenePresent, %geneStatus, @rejectList, $stripCount); + my (@speciesList, %speciesPresent, %speciesGenePresent, %quickSpeciesCount, %speciesCount, %quickSequenceCount, %sequenceCount); + $sequenceCount{"all"} = 0; + + # Miscellaneous + my $debug = 0; + my $verbose = 0; + my $accessionFirst = 0; #Added by THO as option to have accession # first in FASTA output + +for (my $i = 0; $i <= $#ARGV; $i++) + { +#Original stripped out directory structure, required by Galaxy +# if ($ARGV[$i] =~ /^-f([\w+\.?\w+?]*)/) + if ($ARGV[$i] =~ /^-f(.+)/) + { + $gbFile = $1; + +# (my $baseFile = $gbFile) =~ s/.\w+$//; +# $countFile .= $baseFile . "_geneCount.txt"; +# $rejectFile .= $baseFile . "_gbs.rejectlist.txt"; +# $stripFile .= $baseFile . 
"_gbs.striplist.txt"; +# } +#for Galaxy, easier to have same file name each time it'r run + (my $baseFile = $gbFile) =~ s/.\w+$//; + $countFile .= "geneCount.txt"; + $rejectFile .= "rejectlist.txt"; + $stripFile .= "striplist.txt"; + } + elsif ($ARGV[$i] =~ /^-g([\w+\.?\w+?]*)/) + { + $geneFile = $1; + +#This is a hack to write the name of a single gene into a new genefile based on the Galaxy call + +open (GENE,">$geneFile") or die "Cannot open file $geneFile containing gene names.\n"; +print GENE "$geneFile\n"; +close GENE; + } + elsif ($ARGV[$i] =~ /^-i([\w+\.?\w+?]*)/) + { + $singleGene = $1; + } + elsif ($ARGV[$i] =~ /^-k(\d+)/) + { + $keepGene = $1; + } + elsif ($ARGV[$i] =~ /^-l(\d+)/) + { + $minLength = $1; + $minLengthTRNA = 50; + } + elsif ($ARGV[$i] =~ /^-on/) + { + $nexusPrint = 1; + } + elsif ($ARGV[$i] =~ /^-op/) + { + $phytabPrint = 1; + } + elsif ($ARGV[$i] =~ /^-os/) + { + $sealPrint = 1; + } + elsif ($ARGV[$i] =~ /^-s/) + { + $speciesBlock = 1; + } + elsif ($ARGV[$i] =~ /^-tg(\d+)/) + { + $seqThreshold = $1; + } + elsif ($ARGV[$i] =~ /^-ts(\d+)/) + { + $speciesThreshold = $1; + } + elsif ($ARGV[$i] =~ /^-v/) + { + $verbose = 1; + } + elsif ($ARGV[$i] =~ /^-a/) + { + $accessionFirst = 1; + } + elsif ($ARGV[$i] =~ /^-x/) + { + $debug = 1; + $verbose = 1; + } + elsif ($ARGV[$i] =~ /^-h/) + { + print "Usage: GenBankStrip.pl -f [-g] [-k] [-l] [-o] [-s] [-t] [-h] [-v]\n"; + print "Options: -f = file containing sequences in GenBank format\n"; + print " -g = file containing specific genes to be stripped\n"; + print " -k = number of (longest) sequences to retain per species for a given gene (default = all)\n"; + print " -l = minimum length required for all non-tRNA genes (default = none)\n"; + print " -o = provide output in nexus (n) phytab (p) and/or Se-Al (s) format in addition to fasta format\n"; + print " -s = only process sequences for valid species names (i.e., no species with sp. or cf. 
in names)\n"; + print " -t|s> = minimum number of sequences (g; default = 0) or species (s; default = 0)\n"; + print " a gene must have to be retained; both can be specified simultaneously\n"; + print " -h = print this message and quit\n"; + print " -v = verbose output\n"; + exit(0); + } + else + { + print "Don't understand argument: $ARGV[$i]\n"; + print "Usage: GenBankStrip.pl -f [-g] [-k] [-l] [-o] [-s] [-t] [-h] [-v]\n"; + exit(1); + } + } + +die "ERROR: Must supply name of GenBank output file using flag -f.\n" if (not $gbFile); + +# Load in hardwired gene synonyms + geneSynonyms(); + +# Get list of target genes (if desired) + if ($geneFile) + { + my $userGeneCount = 0; + my %userGenePresent; + setLineBreak($geneFile); + open (GENE,"<$geneFile") or die "Cannot open file $geneFile containing gene names.\n"; + print "Gene(s) to be stripped:\n"; + while () + { + chomp; + next unless ($_); + my $gene = $_; + $gene = $synonym{$gene} if (defined $synonym{$gene}); + $userGene{$gene} = 1; + unless ($userGenePresent{$gene}) + { + $userGeneCount++; + $userGenePresent{$gene}++; + print "\t$gene\n"; + } + } + close GENE; + +# die "ERROR: No genes read in from file $geneFile\n"; + } +#THO added -h command to easilly pass a single gene for Galaxy + if($singleGene) + { + my $userGeneCount = 1; + my %userGenePresent; + print "Gene(s) to be stripped:\n"; + my $gene = $singleGene; + $gene = $synonym{$gene} if (defined $synonym{$gene}); + print "\t$gene\n"; + $geneFile = $singleGene; + } + +# Print parameter summary + print "The following parameters have been set by the user:\n"; + print "\tFile containing GenBank sequence data: $gbFile\n"; + print "\tFile containing target genes to be stripped: $geneFile\n" if ($geneFile); + print "\tUser-defined constraints(s):\n"; + print "\t\tNumber of sequences: $seqThreshold\n" if ($seqThreshold); + print "\t\tNumber of species: $speciesThreshold\n" if ($speciesThreshold); + print "\t\tMinimum sequence length: global - $minLength bp; tRNAs 
- $minLengthTRNA bp\n" if (defined $minLength); + print "\t\tOnly using species with valid names\n" if ($speciesBlock); + print "\t\tNone\n" if (not $seqThreshold and not $speciesThreshold and not defined $minLength and not $speciesBlock); + print "\tNumber of sequences to keep per species for each gene: "; + if ($keepGene) + { + print "$keepGene\n"; + } + else + { + print "all\n"; + } + print "\tOutput format(s): fasta"; + print ", Se-Al" if ($sealPrint); + print ", nexus" if ($nexusPrint); + print ", phytab" if ($phytabPrint); + print "\n"; + +# Do quick gene count if thresholds indicated; takes longer but 1) saves many disk operations and 2) less memory intensive + geneCount($gbFile) if (not defined $geneFile and ($seqThreshold or $speciesThreshold)); # Don't bother if user gene list given; will usually be small enough so that benefits won't come into play + +# Read in GenBank file and strip genes + my $stripZero = time; + print "\nProcessing GenBank file $gbFile ...\n"; + setLineBreak($gbFile); + open (DATA, "<$gbFile") or die "Cannot open GenBank output file $gbFile\n"; + my @allAccNum; + my ($accNum, %accRead, $duplEntry, $organism, %species, $geneName); + my $speciesFlag = 0; + my (%genePresent, @geneList); + my (@startList, @stopList, $complementFlag, $typeStatus, %pseudoStatus, %seqType, $joinLine, @joinSegments, %geneStart, %geneStop, %compStatus, $fullSeq); + my $nameFlag = 0; + my $joinFlag = 0; + my $pseudoFlag = 0; + my $readSeqs = 0; + + while () + { + chomp; + my $readLine = $_; + next if ($readLine =~ /^\s*LOCUS/ or $readLine =~ /^\s*DEFINITION/ or $readLine =~ /^\s*VERSION/ or $readLine =~ /^\s*KEYWORDS/ or $readLine =~ /^\s*SOURCE/ or $readLine =~ /^\s*ORGANISM/ or $readLine =~ /^\s*REFERENCE/ or $readLine =~ /^\s*AUTHORS/ or $readLine =~ /^\s*TITLE/ or $readLine =~ /^\s*JOURNAL/ or $readLine =~ /^\s*MEDLINE/ or $readLine =~ /^\s*PUBMED/ or $readLine =~ /^\s*FEATURES/ or $readLine =~ /^\s*COMMENT/ or $readLine =~ /^\s*BASE COUNT/ or $readLine =~ 
/^\s*source/ or $readLine =~ /^\s*\/codon/ or $readLine =~ /^\s*\/transl/ or $readLine =~ /^\s*\/db_/ or $readLine =~ /^\s*CONTIG/); + + # Get accession number + if ($readLine =~ /^\s*ACCESSION\s+(.+)/) + { + $accNum = $1; + print "$readLine\n" if ($debug); + # Clear variables + undef @geneList; + undef %genePresent; + undef $fullSeq; + undef @startList; + undef @stopList; + undef %geneStart; + undef %geneStop; + undef %pseudoStatus; + undef $geneName; + $speciesFlag = $nameFlag = $joinFlag = $pseudoFlag = $readSeqs = $duplEntry = 0; + + if (not $accRead{$accNum}) # Check for duplicate entries + { + $accRead{$accNum} = 1; + push @allAccNum, $accNum; + if (scalar(@allAccNum) == int(scalar(@allAccNum)/10000)*10000 and $verbose) + { + print "\tSequences read in: ".scalar(@allAccNum)."\n"; + } + print "\tAccession number: $accNum\n" if ($debug); + } + else + { +print "*****NOTE -- DUPLICATE ENTRY Accession $accNum\n"; + $duplEntry = 1; + } + } + + # Get organism name + if ($readLine =~ /^\s*\/organism=\"(.+)\"/) + { + $organism = $1; + $organism =~ s/\s/_/g; + print "$readLine\n" if ($debug); + $species{$accNum} = $organism; + $speciesFlag = 1 if ($organism =~ /sp\./ or $organism =~ /cf\./ or $organism =~ /_X_/i); + if ($debug) + { + print "\t\tOrganism: $organism"; + print " (blocked)" if ($speciesFlag and $speciesBlock); + print "\n"; + } + } + next if ($speciesFlag and $speciesBlock); # Entry pertains to invalid species name; skip parsing rest of entry + + + # Get gene boundaries; process previous set of CDs + if ($readLine =~ /\?(\d+)\>?/ or $joinFlag == 1 or $readLine =~ /^\s*ORIGIN/) # ORIGIN will process last set of CDs + { + next if ($readLine =~ /^\s+\/\w+/); # Prevents spurious matches with lines beginning with "/feature" + $readSeqs = 1 if ($readLine =~ /^\s*ORIGIN/); # Indicates that remaining lines will contain sequence information + + # Process previous gene; need to do here to account for a posteriori declarations of pseudogene status + if ($geneName and 
$nameFlag == 2 and @startList and @stopList) # Process complete gene + { + print "\t\t\t\tParsed name: $geneName (type: $typeStatus)\n" if ($debug); + + # Clean up gene name and misleading punctuation + $geneName = geneClean($geneName); + $pseudoStatus{$geneName} = 1 if ($pseudoFlag); + + if (defined @ { $geneStart{$geneName} } and ((defined $pseudoStatus{$geneName} and $pseudoStatus{$geneName} == 1) or $typeStatus =~ /intron/i or $typeStatus =~ /UTR/i)) # Gene has previously stored CDs that might not have been recognized as a pseudogene or non-coding region + { + print "\t\t\t\t\tSubsequent occurrence of now recognized pseudogene or non-coding region; comparing new and stored CDs\n" if ($debug); + for (my $i = 0; $i < scalar(@startList); $i++) # Check each occurrence in new CDs for matches in stored ones + { + my $newStart = $startList[$i]; + my $newStop = $stopList[$i]; + print "\t\t\t\t\t\tChecking new CDs $newStart to $newStop\n" if ($debug); + for (my $j = 0; $j < scalar(@ { $geneStart{$geneName} }); $j++) + { + if ($newStart == $geneStart{$geneName}[$j] and $newStop == $geneStop{$geneName}[$j]) + { + print "\t\t\t\t\t\t\tMatch with stored CDs (no. 
$j); deleted\n" if ($debug); + splice(@ { $geneStart{$geneName} }, $j, 1); + splice(@ { $geneStop{$geneName} }, $j, 1); + } + } + } + if ($debug) + { + print "\n\t\t\t\t\tCurrent gene boundaries after pseudogene / non-coding check (type = $seqType{$geneName}):\n"; + if (scalar(@ { $geneStart{$geneName} }) < 1) + { + print "\t\t\t\t\t\tnone\n"; + } + else + { + for (my $i = 0; $i < scalar(@ { $geneStart{$geneName} }); $i++) + { + print "\t\t\t\t\t\t$geneStart{$geneName}[$i]\t$geneStop{$geneName}[$i]\n"; + } + } + } + } + + # Only process coding regions of user-desired genes (if applicable), genes with sensible CDs, non-blocked genes, and genes that are not obvious singletons or pseudogenes + unless (($geneFile and not defined $userGene{$geneName}) or + (defined $geneStatus{$geneName} and $geneStatus{$geneName} eq "rejected") or + (defined $pseudoStatus{$geneName} and $pseudoStatus{$geneName} == 1) or + $geneName =~ /hypothetical/i or $geneName =~ /open reading frame/i or $geneName =~ /similar/i or $geneName =~ /homolog/i or $geneName =~ /putative/i or $geneName =~ /unknown/i or $geneName =~ /unnamed/i or $geneName =~ /\d+rik/i or $geneName =~ /possible/i or $geneName =~ /pseudo/i + or $typeStatus =~ /UTR/i or $typeStatus =~ /intron/i or $typeStatus =~ /misc_feature/i) + { + if (not defined $genePresent{$geneName}) # Process first occurrence of gene in entry + { + if ($debug) + { + print "\t\t\t\t\tFirst occurrence of $geneName\n" if ($debug); + for (my $i = 0; $i < scalar(@startList); $i++) + { + print "\t\t\t\t\t\t$startList[$i]\t$stopList[$i]\n"; + } + } + $genePresent{$geneName} = 1; + push @geneList, $geneName; + push @ { $geneStart{$geneName} }, $_ foreach (@startList); + push @ { $geneStop{$geneName} }, $_ foreach (@stopList); + + $seqType{$geneName} = $typeStatus; + $compStatus{$geneName} = 0; # Note whether gene is complemented or not + $compStatus{$geneName} = 1 if ($complementFlag); + } + else # Attempt to add secondary occurrences + { + my $storedSegments 
= scalar(@ { $geneStop{$geneName} }) - 1; + my $lastStop = $geneStop{$geneName}[$storedSegments]; + $lastStop = 0 if (not defined $lastStop); + my $newStart = $startList[0]; + + if ($debug) + { + print "\t\t\t\t\tSecondary occurrence of $geneName with boundaries\n" if ($debug); + for (my $i = 0; $i < scalar(@startList); $i++) + { + print "\t\t\t\t\t\t$startList[$i]\t$stopList[$i]\n"; + } + } + if ($seqType{$geneName} eq "gene" and $typeStatus ne "gene") # New information probably more precise and better accounts for structure + { + print "\n\t\t\t\t\t\tNew segment more precisely defined by type; replaced\n" if ($debug); + undef @ { $geneStart{$geneName} }; + undef @ { $geneStop{$geneName} }; + push @ { $geneStart{$geneName} }, $_ foreach (@startList); + push @ { $geneStop{$geneName} }, $_ foreach (@stopList); + $seqType{$geneName} = $typeStatus; + } + + elsif ($newStart > $lastStop) # New segment occurs distal to last stored segment (could also be contiguous); append to boundaries + { + print "\n\t\t\t\t\t\tContiguous with or occurs after last stored segment; appended\n" if ($debug); + push @ { $geneStart{$geneName} }, $_ foreach (@startList); + push @ { $geneStop{$geneName} }, $_ foreach (@stopList); + $seqType{$geneName} = "composite"; + } + elsif (scalar(@ { $geneStart{$geneName} }) == 1 and scalar(@startList) > 1) # Replace single stored segment with new segments derived from join statement; probably better accounts for intron/exon structure + { + print "\n\t\t\t\t\t\tNew segments subdivide single stored segment; replaced\n" if ($debug); + undef @ { $geneStart{$geneName} }; + undef @ { $geneStop{$geneName} }; + push @ { $geneStart{$geneName} }, $_ foreach (@startList); + push @ { $geneStop{$geneName} }, $_ foreach (@stopList); + $seqType{$geneName} = "composite"; + } + else + { + print "\n\t\t\t\t\t\tNew segment overlaps stored segments; rejected\n" if ($debug); + } + } + if ($debug) + { + print "\n\t\t\t\t\tCurrent gene boundaries after CDS processing (type = 
$seqType{$geneName}):\n"; + if (scalar(@ { $geneStart{$geneName} }) < 1) + { + print "\t\t\t\t\t\tnone\n"; + } + else + { + for (my $i = 0; $i < scalar(@ { $geneStart{$geneName} }); $i++) + { + print "\t\t\t\t\t\t$geneStart{$geneName}[$i]\t$geneStop{$geneName}[$i]\n"; + } + } + } + } + + # Clear entries for gene + $nameFlag = $complementFlag = 0; + undef @startList; # Prevents double listing of genes if CDS use both /gene and /product tags + undef @stopList; + undef $typeStatus; + $pseudoStatus{$geneName} = 0; # Clear pseudogene status for previous set of CDs + } + next if ($readLine =~ /^\s*ORIGIN/); # No more CDs to worry about + + print "$readLine\n" if ($debug); + + # Reset gene name and information for current set of CDs + undef $geneName; + $nameFlag = $pseudoFlag = $complementFlag = 0; + $complementFlag = 1 if ($readLine =~ /complement/); #DISABLED BY THO BECAUSE ONLY COMPLEMENTS AND DOESN'T REVERSE + + # Get new CD information + if ($readLine =~ /^\s+\S+\s+join\(/i or $readLine =~ /^\s+\S+\s+order\(/) + { + $joinFlag = 1; + undef @startList; + undef @stopList; + undef $joinLine; + } + if ($readLine =~ /^\s+(\S+)\s+/i) + { + $typeStatus = $1; + } + + if ($joinFlag) + { + $joinLine .= $readLine; + if ($readLine =~ /\)$/) # Have accounted for multiline join statements; process + { + $joinFlag = 0; + $complementFlag = 1 if ($joinLine =~ /complement/); + + # Clean up join statement + $joinLine =~ s/complement\(//; + $joinLine =~ s/>//g; + $joinLine =~ s/ $stop); + push @startList, $start; + push @stopList, $stop; + } + + undef $joinLine; + $typeStatus = "join"; + } + } + else + { + my ($start, $stop); + if ($readLine =~ /\?(\d+)\>?/) + { + $start = $1; + $stop = $2; + } + next unless ($start and $stop); + next if ($start > $stop); + undef @startList; + push @startList, $start; + undef @stopList; + push @stopList, $stop; + print "\t\tParsed boundaries: $1\t$2\n" if ($debug); + } + } + + + # Check for pseudogene status + if ($readLine =~ /\s+\/pseudo/ or $readLine 
=~ /pseudogene/ or $readLine =~ /putative/) + { + print "$readLine\n" if ($debug); + $pseudoStatus{$geneName} = 1 if (defined $geneName); + $pseudoFlag = 1; + print "\t\t\t\tWARNING: suspected pseudogene or of uncertain (\"putative\") status\n" if ($debug); + } + + if ($readLine =~ /codon recognized/ and defined $geneName and $geneName =~ /trna/i) + { + print "$readLine\n" if ($debug); + $pseudoStatus{$geneName} = 1; + $pseudoFlag = 1; + print "\t\t\t\tWARNING: tRNA for an alternative codon; ignored\n" if ($debug); + } + + # Get gene name + if ($readLine =~ /^\s*(\/gene)|(\/product)=\"(.+)/ and $nameFlag == 0) # Get start of gene name + { + $geneName = $1 if ($readLine =~ /=\"(.+)/); + print "$readLine\n" if ($debug); + + # Check whether name wraps onto a new line + if (substr($readLine, -1) ne "\"") + { + $nameFlag = 1; + next; + } + else + { + $nameFlag = 2; + $geneName =~ s/\"$// if ($geneName =~ /\"$/); + } + } + + if ($nameFlag == 1) # Get continuation / end of gene name + { + print "$readLine\n" if ($debug); + $geneName .= $readLine; + $nameFlag = 2 if ($readLine =~ /\"$/) # Gene name is complete + } + + # Read in sequence information and append + if ($readLine =~ /^\s*\d*1\s\w+/ and @geneList and $readSeqs) + { + my $seqFrag = $readLine; + $seqFrag =~ s/\s+//g; + $seqFrag =~ s/\d+//g; + $fullSeq .= uc($seqFrag); + } + + # End of entry; process + if ($readLine =~ /\/\//) + { + next if ($duplEntry); + next unless (@geneList and defined $fullSeq); + foreach my $gene (@geneList) + { + # Safeties; shouldn't come into play + next if ($geneFile and not defined $userGene{$gene}); + next if (defined $geneStatus{$gene} and $geneStatus{$gene} eq "rejected"); + next if (defined $pseudoStatus{$gene} and $pseudoStatus{$gene} == 1); + + print "\n\t\tProcessing gene $gene\n" if ($debug); + + # Strip gene out of full sequence and format as fasta + next unless (@ { $geneStart{$gene} } and @ { $geneStop{$gene} } ); + + my $geneSeq; + for (my $i = 0; $i < scalar(@ { 
$geneStart{$gene} }); $i++) + { + print "\t\t\tGetting sequence between positions $geneStart{$gene}[$i] and $geneStop{$gene}[$i]\n" if ($debug); + next if ($geneStart{$gene}[$i] > length($fullSeq) or $geneStop{$gene}[$i] > length($fullSeq)); + my $geneSegment = substr($fullSeq,$geneStart{$gene}[$i]-1,$geneStop{$gene}[$i]-$geneStart{$gene}[$i]+1); + print "\t\t\t\t$geneSegment\n" if ($debug); + $geneSeq .= $geneSegment; + } + + next unless ($geneSeq); + $geneSeq = complement($geneSeq) if ($compStatus{$gene}); + $maxLength{$gene} = length($geneSeq) if (not defined $maxLength{$gene} or length($geneSeq) > $maxLength{$gene}); + + # Check if sequence matches length threshold (if appropriate) + if (defined $minLength) + { + if ($gene =~ /^trna-\w\w\w/) + { + if (length($geneSeq) < $minLengthTRNA) + { + printf "\t\t\tRejected: length (%s bp) does not meet tRNA length threshold ($minLengthTRNA bp)\n", length($geneSeq) if ($debug); + next; + } + } + else + { + if (length($geneSeq) < $minLength) + { + printf "\t\t\tRejected: length (%s bp) does not meet global threshold ($minLength bp)\n", length($geneSeq) if ($debug); + next; + } + } + } + if ($gene =~ /trna-\w\w\w/ and length($geneSeq) > 100) + { + printf "\t\t\tRejected: length of tRNA too long (%s bp); indicates parsing failure\n", length($geneSeq) if ($debug); + next; + } + + my $breakPoint = 79; + until ($breakPoint > length($geneSeq)) + { + my $replaceString = "\n" . 
substr($geneSeq, $breakPoint, 1); + substr($geneSeq, $breakPoint, 1) = $replaceString; + $breakPoint += 80; + } + + # Append sequence to file + $geneStatus{$gene} = "stripped"; + $sequenceCount{$gene}++; + $sequenceCount{$species{$accNum}}++; + $sequenceCount{"all"}++; + my $fastaFile = $gene."_gbs.fasta.txt"; + + my $IOicon = ">"; + $IOicon .= ">" if ($sequenceCount{$gene} > 1); + + unless ($debug) + + + { + open (GENE, $IOicon, $fastaFile) or die "Cannot open file $fastaFile to write sequence to."; + print GENE ">$species{$accNum} $accNum\n"; + print GENE "$geneSeq\n"; + close GENE; + } + + # Update various statistical counters + unless ($globalGenePresent{$gene}) # Gene counter + { + push @globalGeneList, $gene; + $globalGenePresent{$gene} = 1; + } + + unless ($speciesPresent{$species{$accNum}}) # Species counter + { + push @speciesList, $species{$accNum}; + $speciesPresent{$species{$accNum}} = 1; + } + + $speciesGenePresent{$gene}{$species{$accNum}}++; # Species-gene counter + $speciesCount{$gene}++ if ($speciesGenePresent{$gene}{$species{$accNum}} == 1); + + # Print out summary stats for gene + if ($debug) + { + print "\t\t\tGene: $gene\n"; + printf "\t\t\t\tGene boundaries (%s bp)", length($geneSeq) - ($geneSeq =~ tr/\n//); + print " (complemented)" if ($compStatus{$gene}); + print ":\n"; + for (my $i = 0; $i < scalar(@ { $geneStart{$gene} }); $i++) + { + print "\t\t\t\t\t$geneStart{$gene}[$i]\t$geneStop{$gene}[$i]\n"; + } + print "\t\t\t\t\t$geneSeq\n"; + } + } + } + } +close DATA; + +# Print out summary stats + print "\n\tSummary stats:\n"; + printf "\t\tTotal number of accessions processed: %s\n", scalar(@allAccNum); + printf "\t\tTotal number of unique species: %s\n", scalar(@speciesList); + printf "\t\tTotal number of unique sequences: %s\n", $sequenceCount{"all"}; + printf "\t\tTotal number of unique genes: %s\n", scalar(@globalGeneList); + + my $stripTime = time - $stripZero; + print "\n\t\tTime taken to process file: $stripTime seconds\n"; + + if (not 
@globalGeneList) + { + print "\nNOTE: No genes stripped from file $gbFile; exiting\n"; + exit(0); + } + +# Reprocess results to 1) recheck whether stripped genes actually fall below user-set thresholds (geneCount procedure is a rough maximum), 2) pare down to required number of sequences per species, and 3) produce appropriate output files + print "\nPostprocessing results ...\n"; + my $postZero = time; + + @globalGeneList = sort(@globalGeneList); + $stripCount = scalar(@globalGeneList); + + foreach my $gene (@globalGeneList) + { + next if ($debug); + print "\tProcessing gene $gene ($sequenceCount{$gene} sequences for $speciesCount{$gene} species)\n" if ($verbose); + if ($sequenceCount{$gene} < $seqThreshold or $speciesCount{$gene} < $speciesThreshold) # Gene does not meet threshold requirements; reject + { + $geneStatus{$gene} = "rejected"; + push @rejectList, $gene; + $stripCount--; + print "\t\tRejected; does not meet threshold requirements\n" if ($verbose); + + # Remove associated file + my $rmString = $gene."_gbs.fasta.txt"; + $rmString =~ s/\s/\\ /g; + $rmString =~ s/\(/\\(/g; + $rmString =~ s/\)/\\)/g; + $rmString =~ s/\'/\\'/g; + system ("rm $rmString"); + } + else + { + my (%infLength, %lengthList, %criticalLength); + undef @speciesList; + $geneStatus{$gene} = "stripped"; + if ($verbose) + { + print "\t\tStripped"; + print "; meets threshold requirements" if ($seqThreshold or $speciesThreshold); + print "\n"; + } + + # Reload sequences from disk + my $inputFile = $gene."_gbs.fasta.txt"; + setLineBreak($inputFile); + open (FASTA, "<$inputFile") or die "Cannot open file $inputFile to read data for gene $gene\n"; + my (@accList, %speciesName, %fastaSeq, $species, $fastaAcc); + + undef %speciesPresent; + while () + { + chomp; + if ($_ =~ "^>") + { + ($species, $fastaAcc) = split; + $species =~ s/^>//; + $fastaAcc =~ s/^\(//; + $fastaAcc =~ s/\)$//; + + + push @accList, $fastaAcc; + $speciesName{$fastaAcc} = $species; + $speciesPresent{$species}++; + push 
@speciesList, $species if ($speciesPresent{$species} == 1); + } + else + { + $fastaSeq{$fastaAcc} .= $_; + } + } + close FASTA; + @accList = sort { $speciesName{$a} cmp $speciesName{$b} } keys %speciesName; + + # Pare down to desired number of sequences as needed + if ($keepGene) + { + undef %lengthList; + print "\t\tParing down to $keepGene best lengths for each of $speciesCount{$gene} species ...\n" if ($verbose); + # Get informative length for each sequence + foreach my $entry (@accList) + { + $infLength{$entry} = ($fastaSeq{$entry} =~ tr/ACGT//); + push @ { $lengthList{$speciesName{$entry}} }, $infLength{$entry}; + } + + # Determine critical length and correct sequence count for number of deleted species + foreach my $species (@speciesList) + { + print "\t\t\tSpecies: $species ($speciesPresent{$species} sequences)\n" if ($debug); + next if ($speciesPresent{$species} <= $keepGene); # Only process species for which gene has been sampled more than threshold + @ { $lengthList{$species} } = sort {$b <=> $a} @ { $lengthList{$species} }; # Sort in descending order and get critical length + $criticalLength{$species} = $lengthList{$species}[$keepGene-1]; + + # Correct sequence count + my $i = $keepGene; + $i++ until ($i >= $speciesPresent{$species} or $lengthList{$species}[$i] < $criticalLength{$species}); + $sequenceCount{$gene} -= (scalar(@ { $lengthList{$species} }) - $i); + + if ($debug) + { + print "\t\t\t\tCritical length: $criticalLength{$species}\n"; + printf "\t\t\t\tNumber of sequences deleted: %s\n", scalar(@ { $lengthList{$species} }) - $i; + print "\t\t\t\tTotal number of sequences for gene remaining: $sequenceCount{$gene}\n"; + } + } + + print "\t\t\tFinal count: $sequenceCount{$gene} sequences\n" if ($verbose); + + # Recheck if paring has dropped sequence below sequence threshold + if ($sequenceCount{$gene} < $seqThreshold) + { + print "\t\t\tNOTE: Gene ($sequenceCount{$gene} sequences) now falls below threshold of $seqThreshold sequences; no file 
created\n" if ($verbose); + $geneStatus{$gene} = "rejected"; + push @rejectList, $gene; + $stripCount--; + + # Remove associated file + my $rmString = $gene."_gbs.fasta.txt"; + $rmString =~ s/\s/\\ /g; + $rmString =~ s/\(/\\(/g; + $rmString =~ s/\)/\\)/g; + $rmString =~ s/\'/\\'/g; + system ("rm $rmString"); + + next; + } + } + + # Produce appropriate output files + print "\t\tSaving output ...\n" if ($verbose); + my %printCount; + + # Open files as needed + my $sealFile = $gene."_gbs.seal"; + open (SEAL, ">$sealFile") or die "Cannot open Se-Al formatted file $sealFile for writing\n" if ($sealPrint); + my $nexusFile = $gene."_gbs.nex"; + open (NEX, ">$nexusFile") or die "Cannot open nexus-formatted file $nexusFile for writing\n" if ($nexusPrint); +# my $phytabFile = $gene."_gbs.phytab"; +# open (PHYTAB, ">$phytabFile") or die "Cannot open nexus-formatted file $phytabFile for writing\n" if ($phytabPrint); +#Currently assumes that only 1 file will be written for ease of Galaxy implementation + my $phytabFile = "osiris_gbs.phytab"; + open (PHYTAB, ">$phytabFile") or die "Cannot open nexus-formatted file $phytabFile for writing\n" if ($phytabPrint); + my $fastaFile = $gene."_gbs.fasta.txt"; + $fastaFile = $gene."_gbs.new.fasta.txt" if ($debug); + open (FASTA, ">$fastaFile") or die "Cannot open fasta-formatted file $fastaFile for writing\n"; + + # Print headers + if ($sealPrint) + { + print SEAL "Database={\n"; + print SEAL "\tID='MLst';\n"; + print SEAL "\tOwner=null;\n"; + print SEAL "\tName=null;\n"; + print SEAL "\tDescription=null;\n"; + print SEAL "\tFlags=0;\n"; + print SEAL "\tCount=2;\n"; + print SEAL "\t{\n\t\t{\n"; + + print SEAL "\t\t\tID='PAli';\n"; + print SEAL "\t\t\tOwner=1;\n"; + printf SEAL "\t\t\tName=\"%sgbs\";\n", $gene; + print SEAL "\t\t\tDescription=null;\n"; + print SEAL "\t\t\tFlags=0;\n"; + print SEAL "\t\t\tNumSites=$maxLength{$gene};\n"; + print SEAL "\t\t\tType=\"Nucleotide\";\n"; + print SEAL "\t\t\tFeatures=null;\n"; + print SEAL 
"\t\t\tColourMode=1;\n"; + print SEAL "\t\t\tLabelMode=0;\n"; + print SEAL "\t\t\ttriplets=false;\n"; + print SEAL "\t\t\tinverse=true;\n"; + printf SEAL "\t\t\tCount=%s;\n", $sequenceCount{$gene}; + print SEAL "\t\t\t{\n"; + } + + if ($nexusPrint) + { + print NEX "#NEXUS\n\n"; + print NEX "Begin data;\n"; + printf NEX "\tDimensions ntax=%s nchar=%s;\n", $sequenceCount{$gene}, $maxLength{$gene}; + print NEX "\tFormat datatype=nucleotide gap=-;\n\n"; + print NEX "\tmatrix\n\n"; + } + + # Print sequence data + my $i = 0; + foreach my $entry (@accList) + { + next if (defined $criticalLength{$speciesName{$entry}} and $infLength{$entry} < $criticalLength{$speciesName{$entry}}); + + $printCount{$speciesName{$entry}}++; + $speciesName{$entry} .= "_$printCount{$speciesName{$entry}}" if ($printCount{$speciesName{$entry}} > 1); + + if ($sealPrint) + { + $i++; + print SEAL "\t\t\t\t{\n"; + print SEAL "\t\t\t\t\tID='PSeq';\n"; + print SEAL "\t\t\t\t\tOwner=2;\n"; + print SEAL "\t\t\t\t\tName=\"$speciesName{$entry}\";\n"; + print SEAL "\t\t\t\t\tDescription=null;\n"; + print SEAL "\t\t\t\t\tFlags=0;\n"; + print SEAL "\t\t\t\t\tAccession=\"$entry\";\n"; + print SEAL "\t\t\t\t\tType=\"DNA\";\n"; + printf SEAL "\t\t\t\t\tLength=%s;\n", length($fastaSeq{$entry}); + print SEAL "\t\t\t\t\tSequence=\"$fastaSeq{$entry}\";\n"; + print SEAL "\t\t\t\t\tGeneticCode=1;\n"; + print SEAL "\t\t\t\t\tCodeTable=null;\n"; + print SEAL "\t\t\t\t\tFrame=1;\n"; + print SEAL "\t\t\t\t\tFeatures=null;\n"; + print SEAL "\t\t\t\t\tParent=null;\n"; + print SEAL "\t\t\t\t\tComplemented=false;\n"; + print SEAL "\t\t\t\t\tReversed=false;\n"; + print SEAL "\t\t\t\t}"; + print SEAL "," unless ($i == $sequenceCount{$gene}); + print SEAL "\n"; + } + + if ($nexusPrint) + { + my $gaps = "-" x ($maxLength{$gene} - length($fastaSeq{$entry})); + print NEX "$speciesName{$entry}\t$fastaSeq{$entry}$gaps\n"; + } + if($phytabPrint) + { + print PHYTAB "$speciesName{$entry} \t$gene \t $entry \t $fastaSeq{$entry} \n" + } + 
+ my $breakPoint = 79; + until ($breakPoint > length($fastaSeq{$entry})) + { + my $replaceString = "\n" . substr($fastaSeq{$entry}, $breakPoint, 1); + substr($fastaSeq{$entry}, $breakPoint, 1) = $replaceString; + $breakPoint += 80; + } + if ($accessionFirst) #ADDED BY THO + { + print FASTA ">$entry", "\___", "$speciesName{$entry}\n"; + print FASTA "$fastaSeq{$entry}\n"; + } + else + { + print FASTA ">$speciesName{$entry} ($entry)\n"; + print FASTA "$fastaSeq{$entry}\n"; + } + } + # Print footers + if ($sealPrint) + { + print SEAL "\t\t\t};\n"; + print SEAL "\t\t},\n"; + print SEAL "\t\t{\n"; + print SEAL "\t\t\tID='MCoL';\n"; + print SEAL "\t\t\tOwner=1;\n"; + print SEAL "\t\t\tName=\"Genetic Codes\";\n"; + print SEAL "\t\t\tDescription=\"Custom Genetic Codes\";\n"; + print SEAL "\t\t\tFlags=0;\n"; + print SEAL "\t\t\tCount=0;\n"; + print SEAL "\t\t}\n"; + print SEAL "\t};\n"; + print SEAL "};\n"; + } + if ($nexusPrint) + { + print NEX "\t;\nend;\n"; + } + + # Close files as appropriate + close SEAL if ($sealPrint); + close NEX if ($nexusPrint); + close FASTA if ($keepGene); + } + } + + my $postTime = time - $postZero; + print "\n" if ($verbose); + print "\tTime taken: $postTime seconds\n"; + +# Print out final summary stats / files + print "\nFinal summary statistics\n"; + + # Print file of stripped genes + open (STRIP, ">$stripFile") or die "Cannot write summary of stripped genes to $stripFile.\n"; + print STRIP "The following $stripCount genes were stripped successfully from file $gbFile\n"; + print STRIP "\nGene\tNo. of sequences\tNo. 
of species\n\n"; + + foreach my $gene (@globalGeneList) + { + print STRIP "$gene\t$sequenceCount{$gene}\t$speciesCount{$gene}\n" unless ($geneStatus{$gene} eq "rejected"); + } + close STRIP; + + print "\tSummaries of $stripCount genes that were stripped from $gbFile have been written to $stripFile\n"; + + # Print file of rejected genes + if (@rejectList) + { + @rejectList = sort @rejectList; + open (REJECT, ">$rejectFile") or die "Cannot write summary of rejected genes to $rejectFile.\n"; + printf REJECT "The following %s genes do not meet user-defined threshold(s) of", scalar(@rejectList); + print REJECT " $seqThreshold sequences" if ($seqThreshold); + print REJECT " or" if ($seqThreshold and $speciesThreshold); + print REJECT " $speciesThreshold species" if ($speciesThreshold); + print REJECT "; an asterisk indicates that values given are rough maximums\n"; + print REJECT "\nGene\tNo. of sequences\tNo. of species\n\n"; + + foreach my $gene (@rejectList) + { + print REJECT "$gene\t"; + if (defined $sequenceCount{$gene}) # Need to do alternative versions depending on whether gene was properly counted or not + { + print REJECT "$sequenceCount{$gene}\t"; + } + else + { + print REJECT "$quickSequenceCount{$gene} \*\t"; + } + if (defined $speciesCount{$gene}) + { + print REJECT "$speciesCount{$gene}\n"; + } + else + { + print REJECT "$quickSpeciesCount{$gene} \*\n"; + } + } + close REJECT; + + printf "\tSummaries of %s genes that did not meet threshold(s) have been written to $rejectFile\n", scalar(@rejectList); + } + + if ($debug) + { + print "\n\tSummary stats for each processed gene:\n"; + foreach my $gene (@globalGeneList) + { + print "\t\tGene: $gene ($geneStatus{$gene})\n"; + print "\t\t\tTotal number of sequences: $sequenceCount{$gene}\n"; + print "\t\t\tTotal number of unique species: $speciesCount{$gene}\n"; + } + } + +exit(0); + +# Subroutines used in the script + +sub setLineBreak # Check line breaks of input files and set input record separator accordingly 
+    {
+    # Purpose: inspect the first record of the named file and set the global
+    #   input record separator ($/) to "\r\n" (DOS), "\r" (Mac) or "\n" (Unix).
+    # Parameter: path of the file whose line breaks are to be checked.
+    # Side effects: $/ is deliberately left modified on return so that reads
+    #   performed by the caller use the detected separator; dies if the file
+    #   cannot be opened. An empty file leaves $/ set to "\n".
+    my $gbFile = shift;
+    $/ = "\n";    # Probe with Unix conventions; a CR-only file then arrives as one record
+
+    # Three-argument open with a lexical handle, so the file name can never be
+    # interpreted as an open mode and no bareword handle is shared globally
+    open (my $probeHandle, "<", $gbFile) or die "Cannot open $gbFile to check form of line breaks.\n";
+
+    # Only the first record is needed; it must actually be read from the file,
+    # otherwise every input is reported as having Unix line breaks
+    my $firstRecord = <$probeHandle>;
+    if (defined $firstRecord)
+        {
+        if ($firstRecord =~ /\r\n/)    # Test CR+LF before lone CR
+            {
+            print "\tDOS line breaks detected ...\n" if ($debug);
+            $/ = "\r\n";
+            }
+        elsif ($firstRecord =~ /\r/)
+            {
+            print "\tMac line breaks detected ...\n" if ($debug);
+            $/ = "\r";
+            }
+        else
+            {
+            print "\tUnix line breaks detected ...\n" if ($debug);
+            $/ = "\n";
+            }
+        }
+    close $probeHandle;
+    }
+
+sub complement # Returns the reverse complement of the sequence provided
+    {
+    # Parameter: sequence string.
+    # Returns: the sequence complemented character by character through the
+    #   file-level %complement lookup hash and then reversed; any character
+    #   without a defined entry in %complement is written as "?".
+    my $tempSeq = shift;
+    return "" unless (defined $tempSeq and length($tempSeq));    # Nothing to complement
+
+    my $compSeq = join("", map { defined $complement{$_} ? $complement{$_} : "?" } split(//, $tempSeq));
+
+    # scalar() forces string reversal rather than list reversal
+    return scalar reverse($compSeq);
+    }
+
+sub geneSynonyms # Define gene synonyms; originally compiled by Robin Beck
+    {
+    #Opsin gene added by THO
+    $synonym{$_} = "opsin" foreach ("opsin", "anceropsin", "blop", "blue-sensitive_opsin", "blue-sensitive_opsin_precursor", "blue-sensitive_rhodopsin", "blue-sensitive_visual_pigment", "blue_opsin", "bluerh", "boceropsin", "buvops", "compound_eye_opsin_bcrh1", "compound_eye_opsin_bcrh2", "lateral_eye_opsin", "locus_opsin_1", "locust_opsin_2", "long-wavelenght_opsin", "long-wavelength_like_opsin", "long-wavelength_opsin", "long-wavelength_rhodopsin", "long-wavelength_sensitive_opsin_1", "long-wavelength_sensitive_opsin_2", "long_wave_opsin", "long_wavelength-sensitive_opsin", "long_wavelength-sensitive_rhodopsin", "long_wavelength-sensitive_visual_pigment", "long_wavelength_opsin", "long_wavelength_sensitive_opsin_1", "long_wavelength_sensitive_opsin_2", "lop2", "lw_opsin",
+"ocellar_opsin", "opsin", "opsin_2", "opsin_bcrh1", "opsin_bcrh2",
+"opsin_rh1", "opsin_rh3", "opsin_rh4", "piceropsin", "pteropsin",
+"rh1_opsin", "rh2_opsin", "rh3_opsin", "rh4_opsin", "rh6_rhodopsin",
+"rhodopsin", "rhodopsin_1", "rhodopsin_2_cg16740-pa",
"rhodopsin_3", +"rhodopsin_3_cg10888-pa", "rhodopsin_4", "rhodopsin_4_cg9668-pa", +"rhodopsin_5", "rhodopsin_5_cg5279-pa", "rhodopsin_6", +"rhodopsin_6_cg5192-pb", "rhodopsin_7_cg5638-pa", +"rhodopsin_long-wavelength", "short_wavelength-sensitive_opsin", +"ultraviolet-sensitive_opsin", "ultraviolet-sensitive_rhodopsin", +"ultraviolet-sensitive_visual_pigment", "ultraviolet_sensitive_opsin", +"uv-sensitive_opsin", "uv-wavelength_like_opsin", "uv_opsin", "uvop", +"uvrh", "uvrh1", "amblop", "amuvop", +"lwrh", "lwrh1", "lwrh2", "lw", "lw-rh", "lw_rh", "ninae", "ninae-pa", +"op", "ops", "ops1", "opsin_1", "opsin_3", "rh", "rh1", "rh2", "rh3", +"rh2-pa", "rh3-pa", "rh4", "rh4-pa", "rh5", "rh6", "rh7", "rho", +"visual_pigment", "long-wavelength_rodopsin", +"long_wavelength_rhodopsin", "short-wavelength_rhodopsin"); + + #Chloroplast genes added by THO*** + $synonym{$_} = "rbcl" foreach ("rbcl", "large_subunit_of_riblose-15-bisphosphate_carboxylase-oxygenase","larger_subunit_of_rubisco", + "rubisco_large_subunit", "ribulose_15-bisphosphate_carboxylase_large_subunit", "ribulosebiphosphate_carboxylase_large_subunit"); + $synonym{$_} = "matk" foreach ("matk", "maturase_k", "maturase"); + + # Mitochondrial genes + $synonym{$_} = "mtatp6" foreach ("mtatp6", "atp synthase 6", "atp synthase a chain", "atp6", "mt-atp6", + "atpase subunit 6", "atpase6", "atpase 6", "atpase6 protein", "atp6", + "atp synthase f0 subunit 6", "atp synthase a chain protein 6", + "atp synthase subunit 6", "atpase6", "atp6", "atp sythase subunit 6", + "atpase subunit 6, atpase6", "f0-atpase subunit 6", "f1atpase subunit 6", + "f1-atpase subunit 6", "atpase 6", "atpase 6 gene", "atpase subunit 6 (atp6)", + "atpase subunit 6 (atpase6)"); + $synonym{$_} = "mtatp8" foreach ("mtatp8", "atp synthase 8", "atp synthase protein 8", "atpase subunit 8", + "a6l", "atp8", "mt-atp8", "atpase subunit 8", "atpase8", "atpase 8", + "atpase8 protein", "atp8", "atp synthase f0 subunit 8", "atp synthase protein 8", + 
"atpase-8", "atp synthase subunit 8", "atpase8", "atp8", "atp sythase subunit 8", + "atpase subunit8", "f0-atpase subunit 8", "f1 atpase subunit 8", "f1-atpase subunit 8", + "protein a6l", "atpase 8", "atpase subunit 8 (atp8)", "atpase subunit 8 (atpase8)", + "atpase 8 gene", "atpase subunit 8 (atp8)", "atpase subunit 8 (atpase8)"); + $synonym{$_} = "mtco1" foreach ("mtco1", "cytochrome c oxidase i", "coi", "mt-co1", "co i", "ccoi", "cox 1", "cox i", + "coi", "cytochrome c oxidase subunit i", "cytochrome oxidase subunit i", "cox1", + "cytochrome c oxidase subunit 1", "cytochrome oxidase subunit 1", "co1", + "cytochrome c oxidase polypeptide i", "cytochrome oxidase i", "cox-1", + "cytochrome oxidase c subunit 1", "cox1", "co i", "co1", "cox 1", "coxi", + "cytochrome c oxidase polypeptide i", "cytochrome c-oxidase subunit 1", + "cytochrome oxidase c subunit i", "cytochrome-c oxidase i", "cytochrome c1 mrna", + "cytochrome c1", "cytochrome c oxidase subunit 1 (coxi)", "cytochrome c oxidase chain i", + "cytochrome c1 (aa 1-241)", "cytochrome c oxidase polypeptide 1", + "cytochrome c oxidase subunit 1 (coi)", "cytochrome c-1"); + $synonym{$_} = "mtco2" foreach ("mtco2", "cytochrome c oxidase ii", "cytochrome c oxidase polypeptide ii", "coii", + "mt-co2", "coii", "cytochrome c oxidase subunit ii", "cytochrome oxidase subunit ii", + "cytochrome oxidase subunit 2", "cytochrome c oxidase subunit 2", + "cytochrome c oxidase ii", "cox2", "co2", "cytochrome c oxidase polypeptide ii", + "cytochrome oxidase ii", "cox2", "cytochrome oxidase c subunit 2", + "cytochrome-c oxidase", "co ii", "co2", "cox 2", "cytochrome c oxidase polypeptide ii", + "cytochrome c-oxidase subunit 2", "cytochrome oxidase c subunit ii", + "cytochrome oxidase subunit2", "cytochrome-c oxidase ii", "cytochrome c oxidase subunit 2 (coxii)", + "cytochrome c oxidase subunit 2 (coii)", "cytochrome c oxidase chain ii", + "cytochrome c oxidase polypeptide 2", "cytochrome c oxidase ii subunit", + "cytochrome c 
oxidase ii mrna"); + $synonym{$_} = "mtco3" foreach ("mtco3", "cytochrome c oxidase iii", "coiii", "mt-co3", "co iii", "ccoiii", "cox3", + "cox iii", "cytochrome oxidase subunit iii", "coiii", "cox iii", + "cytochrome c oxidase subunit iii", "cytochrome oxidase subunit 3", "cox3", + "cytochrome c oxidase subunit 3", "co3", "cytochrome c oxidase polypeptide iii", + "cox3", "cytochrome oxidase c subunit 3", "cytochrome oxidase iii", + "cytochrome c oxidase polypeptide iii", "co iii", "coiii", "coiii protein", "coxiii", + "cytochrome c oxidase subunit iii, coiii", "cytochrome c-oxidase subunit 3", + "cytochrome c-oxidase subunit three", "cytochrome oxidase c subunit iii", + "cytochrome-c oxidase iii", "cytochrme c oxidase iii", "cytochrome c oxidase subunit 3 (coiii)", + "cytochrome oxidase subunit iii type 2"); + $synonym{$_} = "mtnd1" foreach ("mtnd1", "nd1", "nadh dehydrogenase 1", "nadh dehydrogenase, subunit 1 (complex i)", + "mt-nd1", "nadh-ubiquinone oxidoreductase chain 1", "nd1", "nadh1", "nd1", + "nadh-ubiquinone oxidoreductase chain 1", "nadh-ubiquinone oxidoreductase subunit 1", + "nadh dehydrogenase i", "nadh dehydrogenase 1", "nadh subunit 1", + "nadh dehydrogenase subnuit 1", "nadh1", "nadh1 protein", + "nadh-ubiquinone oxidoreductase subunit i", "nadh dehydrogenase subunit 1", + "nadh dehydrogenase subunit 1 (nd1)", "nadh dehydrogenase subunit 1 type 2", + "nadh dehydrogenase subunit 1 type 1", "nadh dehydrogenase subunit i", "nadh dehydrogenase i", + "nadh dehydrogenase subnit 1", "nadh dehydrogenase ubiquinone 1 alpha", + "nadh dehydrogenase subunit i"); + $synonym{$_} = "mtnd2" foreach ("ndh-u1", "mtnd2", "nadh dehydrogenase 2", "nadh dehydrogenase, subunit 2 (complex i)", + "nadh-ubiquinone oxidoreductase chain 2", "nd2", "mt-nd2", "urf2", + "nadh dehydrogenase subunit 2", "nd2", "nadh2", + "nadh-ubiquinone oxidoreductase chain 2", "nadh dehydrogenase 2", "nadh subunit 2", + "nadh-ubiquinone oxidoreductase subunit 2", "nadh dehydrogenase subnuit 
2", + "nadh deydrogenase subunit 2", "nadh2", "nadh-ubiquinone oxidoreductase subunit ii", + "nd2", "nadh2 protein", "nadh dehydrogenase subunit 2 (nd2)", "nadh dehydrogenase subunit ii", + "nadh dehydrogenase ii", "nadh dehydrogense 2", "nadh dehydrogenase subunit ii"); + $synonym{$_} = "mtnd3" foreach ("mtnd3", "ndh-u3", "nadh dehydrogenase 3", "nd3", "nadh3", + "nadh-ubiquinone oxidoreductase chain 3", + "nadh-ubiquinone/plastoquinone oxidoreductase chain 3", "mt-md3", "urf3", + "nadh dehydrogenase subunit 3", "nd3", "nadh3", + "nadh-ubiquinone oxidoreductase chain 3", "nadh subunit 3", "nadh3", "nadh3 protein", + "nadh-ubiquinone oxidoreductase subunit 3", "nadh dehydrogenase subnuit 3", + "nadh-ubiquinone oxidoreductase subunit iii", "nadh3 protein", + "nadh dehydrogenase subunit 3 (nd3)", "nadh dehydrogenase subunit iii", "nadh dehydrogenase iii", + "nadh dehydrogenase subunit iii"); + $synonym{$_} = "mtnd4" foreach ("mtnd4", "ndh-u4", "nadh dehydrogenase 4", "nadh-ubiquinone oxidoreductase chain 4", "mt-nd4", + "nd4", "urf4", "nadh:ubiquinone oxidoreductase subunit 4 (chain m)", + "nadh dehydrogenase subunit 4", "nd4", "nadh4", + "nadh-ubiquinone oxidoreductase chain 4", "nadh subunit 4", "nadh4", + "nadh-ubiquinone oxidoreductase subunit 4", "nadh dehydrogenase subunit4", + "nadh-ubiquinone oxidoreductase subunit iv", "nadh4 protein", + "nadh dehydrogenase subunit 4 (nd4)", "nadh dehydrogenase subunit iv", "nadh dehydrogenase iv", + "nadh dehydrogenase subunit iv"); + $synonym{$_} = "mtnd4l" foreach ("mtnd4l", "ndh-u4l", "nadh dehydrogenase 4l", "nadh dehydrogenase, subunit 4l (complex i)", + "nadh-ubiquinone oxidoreductase chain 4l", "nd4l", "mt-md4l", "urf4l", + "nadh dehydrogenase subunit 4l", "nd4l", "nadh4l", "nadh-ubiquinone oxidoreductase chain 4l", + "nadh subunit 4l", "nadh4l", "nadh-ubiquinone oxidoreductase subunit 4l", + "nadh dehydrogenase subunit 4 l", "nadh4l protein", "nadh dehydrogenase subunit 4l (nd4l)", + "nadh dehydrogenase subunit 
ivl", "nadh dehydrogenase ivl", "nadh dehydrogenase subunit ivl"); + $synonym{$_} = "mtnd5" foreach ("mtnd5", "ndh-u5", "nadh dehydrogenase 5", "nd5", "nadh-ubiquinone oxidoreductase chain 5", + "mt-nd5", "urf5", "nadh dehydrogenase subunit 5", "nadh5", "nd5", + "nadh dehydrogenase-5", "nadh-ubiquinone oxidoreductase chain 5", + "nadh dehydrogenase 5", "nadh 5", "nadh subunit 5", + "nadh-ubiquinone oxidoreductase subunit 5", "nadh5", "nadh-dehydrogenase subunit 5", + "nadh-dehydrogenase subunit 5, nd5", "nadh-ubiquinone oxidoreductase subunit v", + "nadh5 protein", "nadh dehydrogenase subunit 5 (nd5)", "nadh dehydrogenase subunit v", + "nadh dehydrogenase v", "nadh dehydrogenase subunit v"); + $synonym{$_} = "mtnd6" foreach ("mtnd6", "ndh-u6", "nadh dehydrogenase 6", "nd6", "nadh6", + "nadh-ubiquinone oxidoreductase chain 6", + "nadh-ubiquinone/plastoquinone oxidoreductase chain 6", "mt-md6", "urf6", + "nadh dehydrogenase subunit 6", "nd6", "nadh6", + "nadh-ubiquinone oxidoreductase chain 6", "nadh subunit 6", + "nadh-ubiquinone oxidoreductase subunit 6", "nadh dehydrogenase 6", "nadh6", + "nadh-ubiquinone oxidoreductase subunit vi", "nadh6 protein", + "nadh dehydrogenase subunit 6 (nd6)", "nadh dehydrogenase subunit vi", "nadh dehydrogenase vi", + "nadh dehydrogenase subunit vi"); + $synonym{$_} = "mtrnr1" foreach ("mtrnr1", "mt-rnr1", "12s rna", "12s rrna", "12s ribosomal rna", "12s rrna", + "small subunit ribosomal rna", "12 rrna", "12 s ribosomal rna", "12s rrna, mtssu rrna", + +"12s rrna gene", "12s small subunit ribosomal rna", +"mitochondrial ssu ribosomal rna", "rrns", + "ssu ribosomal rna", "12s","12s_ribosmal_rna", "s-rrna", "srrna", "ssu"); + $synonym{$_} = "mtrnr2" foreach ("16s", "16s_ribosomal_rna","l-rrna","lrrna","lsu", "mtrnr2", "mt-rnr2", "16s rrna", "16s rna", "16srrna", "16s ribosomal rna", "16s rrna", "rrna large subunit", + "large subunit ribosomal rna", "16 s ribosomal rna", "16s mitochondrial ribosomal rna", + +"mitochondrial 16s ribosomal 
rna", "16s large subunit ribosomal +rna", "16s_ribosmal_rna","rrnl"); + + # Nuclear genes + $synonym{$_} = "adora3" foreach ("adora3", "adenosine a3 receptor", "a3ar", "ara3", "gpcr2", "adora3", "adenosine a3 receptor", + "a3 adenosine receptor", "adenosine-3 receptor"); + $synonym{$_} = "adra2b" foreach ("adra2b", "adrenergic, alpha-2b-, receptor", "adra2l1", "adrarl1", "adra2rl1", + "alpha-2b adrenergic receptor", "alpha-2b adrenoceptor", "subtype c2", "[a]2b", "adra-2b", + "alpha2-c2", "alpha2b", "subtype alpha2-c2", "alpha-2-adrenergic receptor-like 1", + "alpha adrenergic receptor 2b", "adra2b", "alpha 2b adrenergic receptor", "adra2b", + "alpha adrenergic receptor subtype 2b", "alpha-2b-adrenergic receptor", + "alpha adrenergic receptor, subtype 2b", "alpha-2b adrenergic receptor", "alpha-2b adrenoceptor"); + $synonym{$_} = "adrb2" foreach ("adrb2", "adrenergic, beta-2-, receptor, surface", "adrb2r", "adrbr", "beta-2 adrenergic receptor", + "b2ar", "bar", "beta-2 adrenoceptor", "catecholamine receptor", "adrb-2", "badm", "beta 2-ar", + "gpcr7", "adrb2", "beta-2 adrenergic receptor", "beta-2-adrenergic receptor", + "adrenergic receptor beta 2", "beta 2 adrenergic receptor", "beta2-adrenergic receptor", + "beta2 adrenergic receptor"); + $synonym{$_} = "apob" foreach ("apob", "apolipoprotein b", "ag(x) antigen", "apolipoprotein b-100 precursor", "apo b-100", + "apolipoproteinb-48", "apo b-48", "fldb", "apob-100", "apolipoprotein b", "apolipoprotein b 100", + "apob", "apolipoprotein b-100", "apob"); + $synonym{$_} = "app" foreach ("app", "amyloid beta (a4) precursor protein", "protease nexin-ii, alzheimer disease", "ad1", + "human mrna for amyloid a4 precursor of alzheimer's disease", "appi", "beta-amyloid protein", + "beta-app", "a-beta", "a4", "cvap", "aaa", "abeta", "amyloid beta-peptide", + "amyloid beta (a4) precursor protein", "adap", "appican", "betaapp", "app", + "amyloid beta precursor protein", "amyloid precursor protein", "amyloid beta precursor b1", + 
"beta amyloid protein precursor", "beta-a4"); + $synonym{$_} = "atp7a" foreach ("atp7a", "atpase, cu++ transporting, alpha polypeptide (menkes syndrome)", "mnk", + "copper-transporting atpase 1", "copper pump 1", "menkes disease-associated protein", "mc1", + "copper binding p-type atpase 7a", "blo", "blotchy", "br", "brindled", "menkes protein", "mo", + "mottled", "mk", "ohs", "atp7a", "copper transporting atpase", + "atpase, cu++ transporting, alpha polypeptide", "menkes syndrome protein"); + $synonym{$_} = "bdnf" foreach ("bdnf", "brain-derived neurotrophic factor", "brain-derived neurotrophic factor precursor", "bdnf", + "brain-derived neurotrophic factor", "brain derived neurotrophic factor", + "brain-derived neurotrophic factor mature peptide", "bdnf"); + $synonym{$_} = "bmi1" foreach ("bmi1", "b lymphoma mo-mlv insertion region (mouse)", + "murine leukemia viral (bmi-1) oncogene homolog", "polycomb complex protein bmi-1", "rnf51", + "mgc12685", "oncogene bmi-1", "bmi1", "bmi-1", "bmi-1 protein", "oncoprotein bmi-1"); + $synonym{$_} = "brca1" foreach ("brca1", "breast cancer 1, early onset", "pscp", "papillary serous carcinoma of the peritoneum", + "breast cancer type 1 susceptibility protein", "breast and ovarian cancer susceptibility gene", + "rnf53", "breast-ovarian cancer, included", "brca1", "brca1", + "breast and ovarian cancer susceptibility protein", "brca1 protein", + "breast and ovarian cancer susceptibility"); + $synonym{$_} = "chrna1" foreach ("chrna1", "cholinergic receptor, nicotinic, alpha polypeptide 1 (muscle)", "chrna", + "acetylcholine receptor protein, alpha chain precursor", "achra", "achr-1", "acra", "chrna1", + "nicotinic cholinergic receptor alpha polypeptide", "chrna1", "achr", "achr prepeptide", "chrna", + "neuronal nicotinic acetylcholine receptor alpha", "nicotinic acetylcholine recepter alpha-subunit"); + $synonym{$_} = "cftr" foreach ("cftr", "cystic fibrosis transmembrane conductance regulator, atp-binding cassette (sub-family c, 
+ member 7)", "mrp7", "cf", "abcc7", "abc35", "cftr", "cystic fibrosis transmembrane conductance", + "cftr chloride channel"); + $synonym{$_} = "cnr1" foreach ("cnr1", "cannabinoid receptor 1 (brain)", "cnr", "cb1", "cb-r", "cann6", "cb>1<", "cb1k5", "cb1a", + "central cannabinoid receptor", "cnr1", "cannabinoid receptor 1", "cb1", "cb1 cannabinoid receptor", + "cb1 cannabinoid receptor", "cbr"); + $synonym{$_} = "crem" foreach ("crem", "camp responsive modulator", "crea", "camp-responsive element modulator, alpha isoform", + "crem", "camp responsive element moderator", "camp responsive element modulator", + "camp-responsive element moderator"); + $synonym{$_} = "edg1" foreach ("edg1", "endothelial differentiation, sphingolipid g-protein-coupled receptor, 1", "d1s3362", "edg-1", + "1pb1", "s1p1", "ecgf1", "chedg1", "sphingosine 1-phosphate receptor edg1", + "g protein-coupled sphingolipid receptor", "edg1"); + $synonym{$_} = "ghr" foreach ("ghr", "growth hormone receptor", "growth hormone receptor precursor", "gh receptor", + "serum binding protein", "growth hormone receptor", "ghr", "growth hormone receptor precursor", + "ghr", "bovine growth hormone receptor", "mature growth hormone receptor"); + $synonym{$_} = "pgk1" foreach ("pgk1", "phosphoglycerate kinase 1", "pgk-1", "primer recognition protein 2", "prp 2", "pgka", + "pgk-1", "pgk-1", "phosphoglycerate kinase 1"); + $synonym{$_} = "plcb4" foreach ("plcb4", "phospholipase c, beta 4", + "1-phosphatidylinositol-4,5-bisphosphate phosphodiesterase beta 4", "plc-beta-4", + "phospholipase c-beta-4", "plcb4", "phospholipase c beta 4"); + $synonym{$_} = "pnoc" foreach ("pnoc", "prepronociceptin", "propronociceptin", "nociceptin precursor", "orphanin fq", "ppnoc", "ofq", + "n/ofq", "n23k", "npnc1", "ofq/n", "proorphanin", "pnoc", "prepronociceptin", + "nociceptin/orphanin fq precursor"); + $synonym{$_} = "prkci" foreach ("prkci", "protein kinase c, iota", "pkci", "dxs1179e", "npkc-iota", "protein kinase c iota", + 
"prkci"); + $synonym{$_} = "prnp" foreach ("prnp", "(prion protein (p27-30) (creutzfeld-jakob disease, gerstmann-strausler-scheinker syndrome, fatal familial insomnia)", + "cjd", "major prion protein precursor", "prp", "prp27-30", "prp33-35c", "ascr", "prip", "gss", + "prn-i", "prn-p", "prpc", "prpsc", "sinc", "mgc26679", "cd230 antigen", "prion-related protein", + "prion protein", "prp", "prnp", "prion protein precursor", "prion protein", "prnp", "prp", + "prion protein prp", "prion protein precursor prp", "prp", "prp", "prp gene", "greater kudu prp", + "major prion protein", "prion protein variant 110p", "prion protein variant 143r", + "prion protein variant 240s", "prion protein variant 37v", "prion protein variant 37v/240s", + "prion protein, prp", "prnp", "prmp"); + $synonym{$_} = "rag1" foreach ("rag1", "recombination activating gene 1", "v(d)j recombination activating protein 1", "rag-1", + "recombination activating protein 1", "rag1", "rag-1", "recombination activating gene-1", + "recombination activating gene 1", "recombination-activating gene 1", "rag1", "rag 1", "rag1"); + $synonym{$_} = "rag2" foreach ("rag2", "recombination activator protein 2", "recombination activating protein 2", "rag-2", "rag 2", + "recombination activating protein", "recombination activating gene-2", "rag2", + "recombination activating protein 2", "rag2", "rag2", "rag2", "rag-2", "rag-2 protein", + "recombinase activating gene 2", "recombination activating protein 2"); + $synonym{$_} = "rbp3" foreach ("rbp3", "retinol-binding protein 3, interstitial", "interphotoreceptor retinoid-binding protein precursor", + "irbp", "interstitial retinol-binding protein", "rbp-3", "interphotoreceptor retinoid binding protein", + "irbp", "irbp", "interphotoreceptor retinoid-binding protein", "interphotorecepter retinoid binding protein", + "interphotoreceptor retinoid binding protein (irbp)", "irbp mrna"); + $synonym{$_} = "tnf" foreach ("tnf", "tumor necrosis factor (tnf superfamily, member 2)", 
"tnfa", "dif", "tnfsf2", + "tumor necrosis factor (cachectin)", "tumor necrosis factor, alpha (cachectin)", + "tumor necrosis factor", "tnf-alpha", "cachectin", "tnfsf1a", "apc1 protein", + "tnf, monocyte-derived", "tnf, macrophage-derived", "tnf superfamily, member 2", + "tumor necrosis factor-alpha", "tumor necrosis factor alpha", "tnfa", "tnf-alpha", + "tumor necrosis factor alpha precursor", "tnfa", "tnfa", "tnfalpha", "tumour necrosis factor alpha", + "tnf alpha", "tnf-a", "tumor necrosis factor alpha, tnf-alpha", "bovine tumor necrosis factor alpha", + "tumor necrosis factor alpha (cachetin)", "tumor necrosis factor-alpha precursor"); + $synonym{$_} = "tp53" foreach ("tp53", "tumor protein p53 (li-fraumeni syndrome)", "p53", "cellular tumor antigen p53", + "phosphoprotein p53", "trp53", "transformation related protein 53", "p53", "p53 protein", "p53", + "tp53", "tumor suppressor p53", "53 kda phosphoprotein", "insulin recptor substrate p53 short form", + "p53 gene product", "p53 tumor suppressor gene", "p53 tumor suppressor protein", + "tumor suppressor p53 phosphoprotein"); + $synonym{$_} = "ttr" foreach ("ttr", "transthyretin (prealbumin, amyloidosis type i)", "palb", "tthy", "tbpa", "attr", + "transthyretin", "transthyretin precursor", "transthyretin subunit", "ttr", "ttr"); + $synonym{$_} = "tyr" foreach ("tyr", "tyrosinase (oculocutaneous albinism ia)", "ocaia", "monophenol monooxygenase", + "tumor rejection antigen ab", "sk29-ab", "lb24-ab", "albino", "c", "skc35", "oca1a", "tyrosinase", + "tyr", "tyrosinase precursor", "truncated tyrosinase", "tyr", "tyr"); + $synonym{$_} = "vwf" foreach ("vwf", "von willebrand factor", "f8vwf", "coagulation factor viii", "von willebrand factor", "vwf", + "von willebrand factor", "vwf", "vwf", "vwf", "von willebrand factor, vwf", "von willebrand factor precursor", + "von willebrand factor precursor", "wf"); + $synonym{$_} = "zfx" foreach ("zfx", "zinc finger protein, x-linked", "zinc finger x-chromosomal protein", 
"zfx", "zinc finger protein zfx", + "zfx", "zinc finger protein zfx", "x-linked zinc finger protein zfx", "zfx", "x-linked zinc finger protein", + "zinc finger x-linked protein 1", "zinc finger x-linked protein 2", "zinc finger protein x linked", + "zfx1", "zfx-1", "zfx2", "zfx-2","zfx protein mrna", "zinc finger protein zfx isoform 4", + "zfx product, isoform 1", "zfx product, isoform 2", "zfx product, isoform 3", + "zfx product, isoform 4", "zfx protein"); + $synonym{$_} = "zfy" foreach ("zfy", "zinc finger protein, y-linked", "zinc finger y-chromosomal protein", "zfy", "zfy", + "zinc finger protein zfy", "zinc finger protein zfy", "y-linked zinc finger protein", + "y-linked zinc finger protein zfy", "zinc finger protein y linked", + "y-linked zinc finger protein 2", "y-linked zinc finger protein 1", "zfy1", "zfy-1", "zfy2", + "zfy-2"); + + # Additional genes + $synonym{$_} = "18s_rrna" foreach ("18s rrna", "18s_rrna", "18s ribosomal rna", + "18 rrna", "18 s ribosomal rna", "18 s ribosomal rna", "18s small subunit ribosomal rna", + "18s_ribsomal_rna", "18s_ribosomal_rna_a-type", "18s_rna_gene", "18s_rrna", "nuclear_18s_ribosomal_rna"); + + $synonym{$_} = "28s_rrna" foreach ("28s_large_subunit_ribosomal_rna", "28s rrna", "28s_rrna", "28s ribosomal rna", "28 rrna", "put. 
28S ribosomal RNA", "28 s ribosomal rna", "28 s ribosomal rna", + "rrna-28s_rrna-related", "28s ribosomal rna v region"); + $synonym{$_} = "5.8s" foreach ("5.8s_ribosomal_rna", "5.8s_rrna", "5s_ribosomal_rna","5s_rrna","5.8s_rna_gene"); + $synonym{$_} = "mtcyb" foreach ("cob", "mtcyb", "cyt b", "cytb", "cytochrome b", "cytochrome b", "cytb", "cyt b", "cytb", "cytb", + "'cytochrome b'", "cytochrome-b", "skeletal muscle cytochrome b", "cyt-b", "cyt.b", "cyto b", + "cytob", "cytb1", "cytb2", "cytb3", "cytb4", "cytochrome b light strand", "cyt-b", "cytob", + "cyt.b", "cytb", "cytochrome b", "mitochondrial cytochrome b", "cytochrome b protein", + "duodenal cytochrome b"); + $synonym{$_} = "alpha_lactalbumin" foreach ("alpha lactalbumin", "alpha_lactalbumin", "alpha-lactalbumin", "alpla lactalbumin", + "alpah lactalbumin", "a-lacta", "a-lacta", "lactalbumin, alpha-", "lactalbumin, alpha", + "mature alpha-lactalbumin"); + $synonym{$_} = "alpha-s1-casein" foreach ("alpha-s1-casein", "as1-casein", "alpha s1 casein", "alpha s1-casein b", + "alpha s1 casein", "alpha s1-casein", "alpha(s1) casein", "alpha-s1 casein", "alpha-s1 casein", + "alpha-s1-casein", "alpha-s1-casein mrna", "alphas1-casein", "as1-casein", "alfas1-casein", + "alpha-sl casein", "casein alpha s1"); + $synonym{$_} = "alpha-s2-casein" foreach ("alpha-s2-casein", "as2-casein", "alpha s2 casein", "alpha s2-casein b", + "alpha s2 casein", "alpha s2-casein", "alpha(s2) casein", "alpha-s2 casein", "alpha-s2 casein", + "alpha-s2-casein", "alpha-s2-casein mrna", "alphas2-casein", "as2-casein", + "alpha s2a casein (aa 1 to 165)", "alpha s2a casein (aa 1 to 167)", "alph as2-casein", + "alpha(s2)-casein"); + $synonym{$_} = "b-casein" foreach ("b-casein", "beta casein", "beta-casein", "beta-casein (aa 1 - 213)", "beta-casein a3", + "beta-casein precursor", "beta casein precursor", "beta-casein variant a2", + "beta-casein variant i", "mat. 
beta-casein", "casein beta", "casein b", "casein b (aa 1-183) pgpk48", + "mature beta-casein (aa 1 to 226)"); + $synonym{$_} = "k-casein" foreach ("k-casein", "kappa casein", "kappa-casein", "kappa-casein precursor", "kappa casein precursor", + "casein kappa", "kappa-cas", "kappa-casein long form", "kappa-casein mature peptide", + "kappa-casein short form"); + $synonym{$_} = "sry" foreach ("sry", "sex determining factor sry", "sex determining region y protein", "sex-determining factor sry", + "sex-determining region y", "sry gene", "sry protein", "sex region of y chromosome"); + $synonym{$_} = "c-mos" foreach ("c-mos", "oocyte maturation factor mos", "mos", "mos protein"); + $synonym{$_} = "cryaa" foreach ("cryaa", "crya1", "alpha a-crystallin","alpha a-crystallin", "alphaa-crystallin", + "alpha-a crystallin chain", "alpha-a-crystallin", "crystallin, alpha a", "alphaa-crystallin (crya1)", + "alpha a crystallin"); + $synonym{$_} = "cryab" foreach ("cryab", "crya2", "alpha b-crystallin","alpha b-crystallin", "alphab-crystallin", + "alpha-b crystallin chain", "alpha-b-crystallin", "crystallin, alpha b", "alphab-crystallin (crya2)", + "alpha b crystallin"); + $synonym{$_} = "t-cell_receptor_beta" foreach ("t-cell receptor beta", "t cell receptor beta", "t-cell receptor beta chain", + "t cell receptor beta chain", "t-cell receptor beta chain variable region", + "t cell receptor beta chain variable region", "t-cell receptor beta chain variable", + "t cell receptor beta chain variable", "t-cell receptor variable beta chain", + "t cell receptor variable beta chain", "t-cell receptor beta-chain", + "t cell receptor beta-chain", "t-cell receptor beta chain vj region", + "t cell receptor beta chain vj region", "t-cell receptor beta chain v-d-j region", + "t cell receptor beta chain v-d-j region", "t-cell receptor beta chain variable segment", + "t cell receptor beta chain variable segment", "t-cell receptor beta chain constant region", + "t cell receptor beta chain constant 
region", "t-cell receptor v beta gene", + "t cell receptor v beta gene", "t-cell receptor-beta chain", "t cell receptor-beta chain", + "t cell receptor bata chain", "t-cell receptor beta-chain", "t cell receptor beta-chain", + "t-cell receptor beta chain (v-d-j-c)", "t cell receptor beta chain (v-d-j-c)", + "t-cell receptor beta-chain v region", "t cell receptor beta-chain v region", + "t-cell receptor v region beta-chain", "t cell receptor v region beta-chain", + "t-cell receptor beta chain vdj region", "t cell receptor beta chain vdj region", + "t-cell receptor beta chain v-region", "t cell receptor beta chain v-region", + "t-cell receptor v-beta", "t cell receptor v-beta", "t-cell_receptor_beta"); + $synonym{$_} = "t-cell_receptor_alpha" foreach ("t-cell receptor alpha", "t cell receptor alpha", "t-cell receptor alpha chain", + "t cell receptor alpha chain", "t-cell receptor alpha chain variable region", + "t cell receptor alpha chain variable region", "t-cell receptor alpha chain variable", + "t cell receptor alpha chain variable", "t-cell receptor variable alpha chain", + "t cell receptor variable alpha chain", "t-cell receptor alpha-chain", + "t cell receptor alpha-chain", "t-cell receptor alpha chain vj region", + "t cell receptor alpha chain vj region", "t-cell receptor alpha chain v-d-j region", + "t cell receptor alpha chain v-d-j region", "t-cell receptor alpha chain variable segment", + "t cell receptor alpha chain variable segment", "t-cell receptor alpha chain constant region", + "t cell receptor alpha chain constant region", "t-cell receptor v alpha gene", + "t cell receptor v alpha gene", "t-cell receptor-alpha chain", "t cell receptor-alpha chain", + "t cell receptor bata chain", "t-cell receptor alpha-chain", "t cell receptor alpha-chain", + "t-cell receptor alpha chain (v-d-j-c)", "t cell receptor alpha chain (v-d-j-c)", + "t-cell receptor alpha-chain v region", "t cell receptor alpha-chain v region", + "t-cell receptor v region alpha-chain", "t 
cell receptor v region alpha-chain", + "t-cell receptor alpha chain vdj region", "t cell receptor alpha chain vdj region", + "t-cell receptor alpha chain v-region", "t cell receptor alpha chain v-region", + "t-cell receptor v-alpha", "t cell receptor v-alpha", "t-cell_receptor_alpha"); + + # tRNAs + my @aminoAcids = qw(gly ala val leu ile met phe trp pro ser thr cys tyr asn gln asp glu lys arg his); + my %AAcodes = ('glycine' => 'gly', 'alanine' => 'ala', 'valine' => 'val', 'leucine' => 'leu', 'isoleucine' => 'ile', + 'methionine' => 'met', 'phenylalanine' => 'phe', 'tryptophan' => 'trp', 'proline' => 'pro', + 'serine' => 'ser', 'threonine' => 'thr', 'cysteine' => 'cys', 'tyrosine' => 'tyr', 'asparagine' => 'asn', + 'glutamine' => 'gln', 'aspartic acid' => 'asp', 'glutamic acid' => 'glu', 'lysine' => 'lys', + 'arginine' => 'arg', 'histidine' => 'his'); + my %fullName = ('gly' => 'glycine', 'ala' => 'alanine', 'val' => 'valine', 'leu' => 'leucine', 'ile' => 'isoleucine', + 'met' => 'methionine', 'phe' => 'phenylalanine', 'trp' => 'tryptophan', 'pro' => 'proline', + 'ser' => 'serine', 'thr' => 'threonine', 'cys' => 'cysteine', 'tyr' => 'tyrosine', 'asn' => 'asparagine', + 'gln' => 'glutamine', 'asp' => 'aspartic acid', 'glu' => 'glutamic acid', 'lys' => 'lysine', + 'arg' => 'arginine', 'his' => 'histidine'); + + foreach my $aa (@aminoAcids) + { + # Abbreviations + $synonym{"trna $aa"} = "trna-$aa"; + $synonym{"transfer rna $aa"} = "trna-$aa"; + $synonym{"transfer rna-$aa"} = "trna-$aa"; + $synonym{"trna($aa)"} = "trna-$aa"; + $synonym{"trna $aa gene"} = "trna-$aa"; + $synonym{"trna-$aa gene"} = "trna-$aa"; + $synonym{"$aa trna"} = "trna-$aa"; + + my $dashedName = $aa."-trna"; + $synonym{"$dashedName"} = "trna-$aa"; + + # And again for full names + $synonym{"trna $fullName{$aa}"} = "trna-$aa"; + $synonym{"transfer rna $fullName{$aa}"} = "trna-$aa"; + $synonym{"transfer rna-$fullName{$aa}"} = "trna-$aa"; + $synonym{"trna-$fullName{$aa}"} = "trna-$aa"; + 
$synonym{"trna($fullName{$aa})"} = "trna-$aa";
+        $synonym{"trna $fullName{$aa} gene"} = "trna-$aa";
+        $synonym{"trna-$fullName{$aa} gene"} = "trna-$aa";
+        $synonym{"$fullName{$aa} trna"} = "trna-$aa";
+
+        $dashedName = $fullName{$aa}."-trna";
+        $synonym{"$dashedName"} = "trna-$aa";
+        }
+    }
+
+sub geneClean
+    {
+    # Turn a raw GenBank gene / product name into a lower-case identifier that is safe to
+    # use in file names, substituting the canonical name from %synonym when one exists.
+    #    $dirtyGene    raw name; may still carry a trailing double quote or, for names
+    #                  that wrapped across lines, runs of embedded whitespace
+    #    returns       cleaned gene name (empty string for undefined input)
+    my $dirtyGene = shift;
+    $dirtyGene = "" if (not defined $dirtyGene);
+
+    # Initial synonymizing
+    $dirtyGene = lc($dirtyGene);    # Only work with lower-case for synonymizing and file names
+    $dirtyGene = $synonym{$dirtyGene} if (defined $synonym{$dirtyGene});
+
+    # Clean end of gene name
+    $dirtyGene =~ s/\"$//;
+    $dirtyGene =~ s/\s+$//;
+    $dirtyGene =~ s/^\s+//;
+
+    # Retry the lookup without the closing quote and padding; the keys of %synonym separate
+    # words by single spaces, so whitespace runs are collapsed for the lookup only
+    (my $spacedGene = $dirtyGene) =~ s/\s+/ /g;
+    $dirtyGene = $synonym{$spacedGene} if (defined $synonym{$spacedGene});
+
+    # Remove punctuation that will confound file saving
+    $dirtyGene =~ s/^'//;
+    $dirtyGene =~ s/'$//;
+    $dirtyGene =~ s/:/ /g;
+    $dirtyGene =~ s/, / /g;
+    $dirtyGene =~ s/,//g;
+    $dirtyGene =~ s/[*\$#\/|]/-/g;
+    $dirtyGene =~ s/\&/and/g;
+    $dirtyGene =~ s/[\\;<>]//g;
+    $dirtyGene =~ s/-+/-/g;
+    $dirtyGene =~ s/\s+-/-/g;
+    $dirtyGene =~ s/-\s+/-/g;
+    $dirtyGene =~ s/^-+//;
+    $dirtyGene =~ s/`/'/;
+
+    # Collapse multiple whitespace
+    $dirtyGene =~ s/\s+/_/g;
+
+    # Clean up some tRNA variants (easier than specifying explicit synonyms for each tRNA)
+    $dirtyGene =~ s/_\d+$// if ($dirtyGene =~ /tRNA/i);
+
+    # Final synonymizing
+    $dirtyGene = $synonym{$dirtyGene} if (defined $synonym{$dirtyGene});    # Recheck for synonym
+    $dirtyGene = lc($dirtyGene);    # Ensure that gene names are in lower case for further synonymizing and file saving (final safety)
+
+    return $dirtyGene;
+    }
+
+sub geneCount
+    {
+    # Scan a GenBank flat file and count, for every cleaned gene name, the number of
+    # sequences (LOCUS entries) and of distinct organisms it occurs in.  The counts are
+    # written to $countFile; genes below $seqThreshold or $speciesThreshold are marked
+    # "rejected" in %geneStatus and appended to @rejectList.  Exits the program when no
+    # gene passes.
+    #    $gbFile    path of the GenBank file to scan
+    my $gbFile = shift;
+
+    my (%iGene, %quickSpeciesGenePresent);
+    my $wordBlock = 1;     # Skip names containing one of the words in $vagueName
+    my $modelBlock = 0;    # Skip entries whose organism is listed in %modelSpecies
+    my %modelSpecies;
+    my $stripCount = 0;
+    my $vagueName = qr/hypothetical|open reading frame|similar|homolog|putative|unknown|unnamed|\d+rik|possible|pseudo/i;
+    # First line of a /gene or /product qualifier; the group keeps the alternation from
+    # also accepting other qualifiers that merely begin with "/gene"
+    my $nameStart = qr/^\s*\/(?:gene|product)=\"(.*)/;
+
+    print "\nCounting gene occurrences in file $gbFile to establish genes that do not meet user-defined threshold(s)\n";
+
+    # Search input file
+    setLineBreak($gbFile);
+    open (my $geneFh, "<", $gbFile) or die "Cannot open GenBank output file, $gbFile.\n";
+    my $accCount = 0;
+    my $modelFlag = 0;
+    my $nameFlag = 0;    # 0 = no name pending, 1 = name continues on next line, 2 = name complete
+    my $countZero = time;
+    my ($organism, $gene);
+    while (my $gbLine = <$geneFh>)
+        {
+        next unless ($gbLine =~ /LOCUS/ or $gbLine =~ $nameStart or $gbLine =~ /^\s*\/organism=\"/ or $nameFlag == 1);
+        chomp($gbLine);
+
+        if ($gbLine =~ /LOCUS/)
+            {
+            undef %iGene;
+            undef $organism;
+            $modelFlag = $nameFlag = 0;
+            $accCount++;
+            print "\t$accCount sequences read in\n" if ($accCount % 10000 == 0 and $verbose);
+            next;
+            }
+
+        # Get organism name
+        if ($gbLine =~ /^\s*\/organism=\"(.+)\"/)
+            {
+            $organism = $1;
+            $organism =~ s/\s/_/g;
+            $modelFlag = 1 if (defined $modelSpecies{$organism});
+            }
+        next if ($modelFlag and $modelBlock);    # Entry pertains to model organism; skip parsing rest of entry
+
+        # Get gene name
+        if ($gbLine =~ $nameStart)    # Get start of gene name
+            {
+            $gene = $1;
+
+            # Check whether name wraps onto a new line
+            if (substr($gbLine, -1) ne "\"")
+                {
+                $nameFlag = 1;
+                next;
+                }
+            $nameFlag = 2;
+            $gene =~ s/\"$//;
+            }
+        elsif ($nameFlag == 1)    # Get continuation / end of gene name
+            {
+            $gene .= $gbLine;
+            $nameFlag = 2 if ($gbLine =~ /\"$/);    # Gene name is complete
+            }
+
+        if ($gene and $nameFlag == 2)    # Process complete gene
+            {
+            next if (not defined $organism);
+            unless ($wordBlock and $gene =~ $vagueName)
+                {
+                $gene = geneClean($gene);    # Clean up and synonymize gene name
+
+                # Increment counters
+                $iGene{$gene}++;    # Sequence counter
+                $quickSequenceCount{$gene}++ if ($iGene{$gene} == 1);
+
+                $quickSpeciesGenePresent{$gene}{$organism}++;    # Species-gene counter
+                $quickSpeciesCount{$gene}++ if ($quickSpeciesGenePresent{$gene}{$organism} == 1);
+                }
+            $nameFlag = 0;
+            undef $gene;
+            }
+        }
+    close $geneFh;
+    print "\n" if ($verbose);
+    my $countTime = time - $countZero;
+
+    # Print results and block genes as appropriate
+    open (my $outFh, ">", $countFile) or die "Cannot open results file for quick gene count, $countFile.\n";
+    my @geneList;
+    if ($speciesThreshold)
+        {
+        @geneList = sort {$quickSpeciesCount{$b} <=> $quickSpeciesCount{$a} } keys %quickSpeciesCount;
+        }
+    else
+        {
+        @geneList = sort {$quickSequenceCount{$b} <=> $quickSequenceCount{$a} } keys %quickSequenceCount;
+        }
+    my $geneTotal = scalar(@geneList);
+
+    # Plain print: a "%" in a file name must not be read as a printf directive
+    print $outFh "Searching $accCount sequences took $countTime seconds with $geneTotal distinct genes found\n";
+    print $outFh "\tModel organisms were excluded from search\n" if ($modelBlock);
+    print $outFh "\tCommon words were excluded from search\n" if ($wordBlock);
+    print $outFh "\nGene\tNo. of sequences\tNo. of species\n\n";
+
+    print "\tGene counts:\n" if ($debug);
+    foreach my $entry (@geneList)
+        {
+        print $outFh "$entry\t$quickSequenceCount{$entry}\t$quickSpeciesCount{$entry}\n";
+        print "\t$entry\t$quickSequenceCount{$entry}\t$quickSpeciesCount{$entry}\n" if ($debug);
+        if ($quickSequenceCount{$entry} < $seqThreshold or $quickSpeciesCount{$entry} < $speciesThreshold)
+            {
+            $geneStatus{$entry} = "rejected";
+            push @rejectList, $entry;
+            }
+        else
+            {
+            $stripCount++;
+            }
+        }
+    close $outFh or die "Cannot write results file for quick gene count, $countFile.\n";
+
+    print "\tSearching $accCount sequences took $countTime seconds with $geneTotal distinct genes found; results written to $countFile\n";
+    if (not $stripCount)
+        {
+        print "No genes were present more than the threshold value(s)\n";
+        exit(0);
+        }
+    }
+
+# Version history
+#
+# v2.0 (July 25, 2005)
+# - improved and more complete parsing of GenBank files (esp. of gene names or CDs covering multiple lines,
+# join statements and other multi-segment entries, detection of pseudogenes)
+# - added ability to mine input file for all gene content
+# - added ability to strip only those genes present for 1) a minimum number of sequences and/or species and
+# 2) species with valid names only
+# - added (incomplete) gene synonymy list (initially compiled by Robin Beck)
+# - minor bug fixes
+#
+# v1.0.1a (April 29, 2004)
+# - added automatic detection of line breaks
+# - minor bug fixes
+#
+# v1.0 (April 30, 2002)
+# - initial release
diff -r 000000000000 -r 5b9a38ec4a39 getdata/genbankstrip.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/getdata/genbankstrip.xml Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,65 @@
+
+ Extracts sequences from GenBank files by gene name
+
+ genbankstrip.pl "-f"$data_file "-g"$gene "-l"$length $sp -op
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Extracts sequences from GenBank flat file based on gene name.
+
+------
+
+**Inputs**
+
+There are two alternative ways to input information.
+1. 
A list of accession numbers in a text file that is uploaded to the Galaxy history. +2. Optionally, the user can paste in a list of accession numbers separated by spaces. + +------ + +**Outputs** + +The user selects which output format to use for data downloaded from GenBank. Options Include:: + 1. GenBank format + 2. FASTA format + 3. phytab format + +phytab format includes a gene name, which is not standardized in GenBank format, so the user must manually add the gene +family names for all entries. If adding only one gene name, that will be used for all entries. + +------- + +**Additional Information** + +phytab format is described here: +http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+
+Current Osiris Citation is here
+
+http://osiris-phylogenetics.blogspot.com/2012/10/citation.html
+
+
+
+
+
+
+
diff -r 000000000000 -r 5b9a38ec4a39 getdata/generate_from_phylota.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/getdata/generate_from_phylota.pl Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,104 @@
+#!/usr/bin/perl -w
+use strict;
+use LWP::Simple;
+use Bio::SeqIO;
+
+# Pull pre-calculated PhyLoTA trees, and the sequences of their clusters, for a list of species.
+#
+# Usage: generate_from_phylota.pl SPECIES_LIST TREE_DUMP FASTA_OUT TREES_OUT PHYTAB_OUT
+#   SPECIES_LIST - text file with one species name per line
+#   TREE_DUMP    - tree dump whose lines start with a "ti.._ci.." field followed by a tab
+#   FASTA_OUT    - receives the downloaded sequences, every header prefixed with "ti..ci.._"
+#   TREES_OUT    - receives each matching line of TREE_DUMP once
+#   PHYTAB_OUT   - receives one "species, cluster, gi, sequence" line (tab-separated) per sequence
+
+die "Usage: $0 SPECIES_LIST TREE_DUMP FASTA_OUT TREES_OUT PHYTAB_OUT\n" unless (@ARGV >= 5);
+my ($infile, $database, $outfile, $treefile, $phytabfile) = @ARGV;
+
+open(my $in, '<', $infile) or die "Cannot open species list $infile: $!\n";
+open(my $out, '>', $outfile) or die "Cannot open fasta output $outfile: $!\n";
+open(my $trees, '>', $treefile) or die "Cannot open tree output $treefile: $!\n";
+open(my $db, '<', $database) or die "Cannot open tree database $database: $!\n";
+my @data = <$db>;
+close($db);
+
+my @alltrees;
+while (my $line = <$in>) {
+    $line =~ s/^\s+|\s+$//g;
+    next if ($line eq '');    # an empty name would match every tree
+    $line =~ s/ /_/g;
+    print "Finding trees with $line ....";
+    # plain substring test: names are matched literally, so the "." in "sp." is not a wildcard
+    my @foundtrees = grep { index($_, $line) >= 0 } @data;
+    my $numtrees = scalar @foundtrees;
+    print " $numtrees Trees\n";
+    if ($numtrees == 0) {
+        my ($genus) = split(/_/, $line);
+        @foundtrees = grep { index($_, $genus) >= 0 } @data;
+        print "\tTrying genus $genus";
+        $numtrees = scalar @foundtrees;
+        print " $numtrees genus Trees\n";
+    }
+    push(@alltrees, @foundtrees);
+}
+close($in);
+
+@alltrees = uniq(@alltrees);
+print $trees @alltrees;
+close($trees) or die "Cannot write tree output $treefile: $!\n";
+
+#get fasta files for trees
+foreach my $tree (@alltrees) {
+    my ($tici) = split(/\t/, $tree);
+    my ($ti, $ci) = split(/_/, $tici);
+    my $addstring = $ti.$ci."_";
+    $ti =~ s/ti//;
+    $ci =~ s/ci//;
+    my $fastafile = getfastafromphylota($ci, $ti);
+    #Add TI_CI_ to each fastaheader
+    $fastafile =~ s/>/>$addstring/g;
+    print $out $fastafile;
+}
+close($out) or die "Cannot write fasta output $outfile: $!\n";
+
+#Now convert fasta file to phytab file and write
+open(my $phytab, '>', $phytabfile) or die "Cannot open phytab output $phytabfile: $!\n";
+my $in_obj = Bio::SeqIO->new(-file => $outfile, '-format' => 'fasta');
+while (my $seq = $in_obj->next_seq()) {
+    # "_"-separated header fields: cluster prefix, gi, ti, species
+    my ($cluster, $seqgi, undef, $seqsp) = split(/_/, $seq->id." ".$seq->desc);
+    $seqgi =~ s/gi//;
+    $seqsp = cleansp($seqsp);
+    print "Writing phytab for $seqsp\n";
+    print $phytab $seqsp."\t".$cluster."\t".$seqgi."\t".$seq->seq."\n";
+}
+close($phytab) or die "Cannot write phytab output $phytabfile: $!\n";
+
+#remove duplicate lines (trees), keeping the first occurrence
+sub uniq {
+    my %seen;
+    return grep { !$seen{$_}++ } @_;
+}
+
+#make a species name usable as an identifier: spaces become "_"; ".", "'" and "-" are dropped
+sub cleansp {
+    my $seqsp = shift;
+    $seqsp =~ s/ /_/g;
+    $seqsp =~ s/[.'-]//g;
+    return $seqsp;
+}
+
+#download the fasta listing of cluster $ci of taxon $ti from phylota.net and strip its html wrapper
+sub getfastafromphylota {
+    my ($ci, $ti) = @_;
+
+    print "Writing: CI:$ci TI:$ti\n";
+
+    my $url = 'http://phylota.net/cgi-bin/sql_getcluster_fasta.cgi?format=all&db=184&ti='.$ti.'&cl='.$ci.'&ntype=1';
+    my $content = get $url;
+    die "Couldn't get $url" unless defined $content;
+    $content =~ s/<html><pre>//;
+    $content =~ s/<\/html>//;
+    $content =~ s/<\/pre>//;
+    return $content;
+}
diff -r 000000000000 -r 5b9a38ec4a39 getdata/generate_from_phylota.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/getdata/generate_from_phylota.xml Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,65 @@
+
+ Pull phylogenies and genetic data from phylota with species list input
+
+ Bio
+ LWP
+
+
+ generate_from_phylota.pl $infile $database $outfile $treefile $phytabfile
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Generate_from_PhyLoTA pulls pre-calculated trees and corresponding raw data from the PhyLoTA browser. The PhyLoTA browser groups
+GenBank data into gene clusters by similarity, and uses those clusters for phylogenetic analysis.
+
+------
+
+**Inputs**
+
+1. A target list of species.
+2. The pre-calculated set of phylota trees available from the phylota web page. 
Available here: http://phylota.net/pb/Download/pb.dmp.maximalnr.trees.184.gz + +------ + +**Outputs** + +1. All raw sequence data in fasta format for the trees in #2. +2. All trees in newick format. Trees will contain more species than the species list, and all species are retained. +3. All raw sequence data in PHYTAB format. http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +------- + +**Additional Information** + +http://osiris-phylogenetics.blogspot.com/2012/09/generatefromphylota.html + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Sanderson, M. J., D. Boss, D. Chen, K. A. Cranston, and A. Wehe. 2008. The PhyLoTA Browser: processing GenBank for molecular phylogenetics research. Syst. Biol. 57:335-346. 
+
+
+
+
+
diff -r 000000000000 -r 5b9a38ec4a39 getdata/get_1_gb.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/getdata/get_1_gb.pl Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,34 @@
+#!/usr/bin/perl -w
+use strict;
+
+#use FindBin;
+#use lib "$FindBin::Bin/lib";
+use Bio::DB::GenBank;
+use Bio::SeqIO;
+
+# Fetch the GenBank record(s) matching one accession number and write them to a file.
+#
+# Usage: get_1_gb.pl ACCESSION DATABASE OUT_FORMAT OUT_FILE
+#   ACCESSION  - accession number; queried as "ACCESSION[accession]"
+#   DATABASE   - passed to Bio::DB::Query::GenBank as -db
+#   OUT_FORMAT - Bio::SeqIO format name used for writing
+#   OUT_FILE   - file to write (overwritten)
+
+die "Usage: $0 ACCESSION DATABASE OUT_FORMAT OUT_FILE\n" unless (@ARGV >= 4);
+my ($accession, $datatype, $outtype, $outfile) = @ARGV;
+
+my $fh = Bio::SeqIO->newFh(-format => $outtype, -file => ">$outfile");
+
+my $gb = Bio::DB::GenBank->new;
+my $query = Bio::DB::Query::GenBank->new(-query => $accession."[accession]",
+                                         -db    => $datatype);
+
+my $seqio = $gb->get_Stream_by_query($query);
+die "Did not find $accession\n" unless (defined $seqio);
+
+# Copy every sequence object returned by the query to the output stream
+while (defined(my $GBseq = $seqio->next_seq)) {
+    print $fh $GBseq;
+}
+
+exit;
diff -r 000000000000 -r 5b9a38ec4a39 getdata/get_1_gb.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/getdata/get_1_gb.xml Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,60 @@
+
+ Grab one sequence from GenBank using accession number query
+
+ bioperl
+
+
+ get_1_gb.pl $acc $database $outtype $outfile 2>&1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Get_1_GB grabs one sequence from GenBank using accession number query.
+
+------
+
+**Inputs**
+
+GenBank accession number
+
+------
+
+**Outputs**
+
+FASTA or GenBank
+
+-------
+
+**Additional Information**
+
+Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org
+
+-------
+
+**Citations**
+
+This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Link to Genbank: http://www.ncbi.nlm.nih.gov/genbank/ + + diff -r 000000000000 -r 5b9a38ec4a39 getdata/get_gb.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/get_gb.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,118 @@ +#!/usr/bin/perl -w +use strict; +# Download GenBank records for a list of accession numbers and append them to $outfile. +#use FindBin; +#use lib "$FindBin::Bin/lib"; +use Bio::DB::GenBank; +use Bio::SeqIO; + + +my $datafile = $ARGV[0]; # file of accession numbers, or 'None' to use $manual +my $datatype = $ARGV[1]; # passed to the query as -db +my $outtype = $ARGV[2]; # 'phytab' or a Bio::SeqIO format name +my $outfile = $ARGV[3]; # output file (opened for append) +my $manual = $ARGV[4]; # space-separated accession numbers +my $mannames = $ARGV[5]; # space-separated custom names, may be empty +my $genenames = $ARGV[6]; # space-separated gene names, may be empty + + +my $accessions; +my @accnums; +my @newnames; +my $manbin=0; +my @genenames; +my $genebin=0; + +unless($mannames eq ''){ + @newnames = split(/ /,$mannames); + $manbin=1; +} + +unless($genenames eq ''){ + @genenames = split(/ /,$genenames); + $genebin=1; +} + +if($datafile eq 'None'){ + @accnums = split(/ /,$manual); +# if(@accnums != @newnames && $manbin ==1 ){ +# die "Must have the same number of Custom Names as Accession Numbers\n"; +# } +}else{ + open (FILE, '<', $datafile) or die "Cannot open file containing accession numbers\n"; + + while (<FILE>) # the <FILE> read had been stripped from this copy of the script + { + chomp; + next unless ($_); + push(@accnums, $_); + } +} + my $countnames = 0; + foreach (@accnums){ + #Should check input for one word per line and throw error if not, which is not done + + $accessions = $_; + chomp; + if($accessions eq ""){ + die "Put spaces between accession numbers\n"; + } + my $qry_string = $accessions."[accession]"." "; + + my $GBseq; + my $gb = Bio::DB::GenBank->new; + my $query = Bio::DB::Query::GenBank->new + (-query =>$qry_string, + -db =>$datatype); + + # $countnames (declared before the loop) indexes the custom names and, for phytab, the gene names + my $species; + my $seqio; + if($outtype eq "phytab"){ #print phytab format, do not use bioperl as below. 
+ open(OUTFILE, '>>', $outfile) or die "Cannot open $outfile for appending\n"; + if( defined ($seqio = $gb->get_Stream_by_query($query)) ){ + # my $seqio = $gb->get_Stream_by_query($query); + while( defined ($GBseq = $seqio->next_seq )) { + my $sequence = $GBseq; # read a sequence object + if($manbin ==1){ #replace GenBank Names with Custom Names + $sequence->id($newnames[$countnames]); + $sequence->desc(''); + $species = $sequence->id; + }else{ + $species = $sequence->species->binomial; + $species =~ s/ /_/g ; + } + $countnames++; # advance for every record so the gene-name index below is right even without custom names + if(@genenames > 0){ + if(@genenames == 1){ + print OUTFILE $species."\t".$genenames[0]."\t".$sequence->accession."\t".$sequence->seq."\n"; + }else{ + print OUTFILE $species."\t".$genenames[$countnames-1]."\t".$sequence->accession."\t".$sequence->seq."\n"; + } + }else{ + print OUTFILE $species."\tNone\t".$sequence->accession."\t".$sequence->seq."\n"; + } + } + }else{ + print "Did not find $accessions\n"; + } + }else{ + my $fh = Bio::SeqIO->newFh(-format=>$outtype, -file=>">>$outfile"); + + if( defined ($seqio = $gb->get_Stream_by_query($query)) ){ + # my $seqio = $gb->get_Stream_by_query($query); + while( defined ($GBseq = $seqio->next_seq )) { + my $sequence = $GBseq; # read a sequence object + if($manbin ==1){ #replace GenBank Names with Custom Names + $sequence->id($newnames[$countnames]); + $sequence->desc(''); + $countnames++; + } + print $fh $sequence; # write a sequence object + } + }else{ + print "Did not find $accessions\n"; + } + } + } +exit; diff -r 000000000000 -r 5b9a38ec4a39 getdata/get_gb.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/get_gb.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,84 @@ + + Grab GenBank Data from a text list of accession numbers + + Bio + + + #if $outtypeconditional.outtype == "phytab": + get_gb.pl $data_file $database $outtypeconditional.outtype $outfile '$manual' '$mannames' '$outtypeconditional.genenames' + #else: + get_gb.pl $data_file $database $outtypeconditional.outtype $outfile '$manual' '$mannames' None + #end if + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Downloads data from GenBank based on Accession numbers. + +------ + +**Inputs** + +There are two alternative ways to input information. +1. A list of Accession numbers in a text file that is uploaded to the Galaxy history. +2. Optionally, the user can paste in a list of accession numbers separated by spaces. + +------ + +**Outputs** + +The user selects which output format to use for data downloaded from GenBank. Options Include:: + 1. GenBank format + 2. Fasta format + 3. phytab format + +phytab format includes a gene name, which is not standardized in GenBank format, so the user must add manually the gene +family names for all entries. If adding only one gene name, that will be used for all entries. + +------- + +**Additional Information** + +phytab format is described here: +http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + + diff -r 000000000000 -r 5b9a38ec4a39 getdata/get_gb_sp.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/get_gb_sp.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,68 @@ +#!/usr/bin/perl +use strict; +no warnings; #genbank produces annoying warning if no sequence is found + +#use FindBin; +#use lib "$FindBin::Bin/lib"; +use Bio::DB::GenBank; +use Bio::SeqIO; +use Bio::Root::Exception; +use Error qw(:try); + + +my $datafile = $ARGV[0]; # text file with one organism name per line +my $datatype = $ARGV[1]; # passed to the query as -db +my $outtype = $ARGV[2]; # Bio::SeqIO output format +my $outfile = $ARGV[3]; # all retrieved records are appended here +my $nodata = $ARGV[4]; # names for which the query failed are listed here + +my $accessions; +my @accnums; + + open (FILE, '<', $datafile) or die "Cannot open file containing accession numbers\n"; + open (OUT,">$outfile") or die "Cannot open outfile\n"; + close OUT; #This overwrites old file if it exists + open (ND,">$nodata") or die "Cannot open file\n"; + my $fh = Bio::SeqIO->newFh(-format=>$outtype, -file=>">>$outfile"); + + + while (<FILE>) # the <FILE> read had been stripped from this copy of the script + { + chomp; + next unless ($_); + push(@accnums, $_); + } + close FILE; + + # one GenBank query per non-empty input line + foreach (@accnums){ + #Should check input for one word per line and throw error if not, which is not done + + $accessions = $_; + chomp; + if($accessions eq ""){ + die "Put spaces between accession numbers. No Empty Lines allowed.\n"; + } + my $qry_string = $accessions."[organism]"." 
"; + +# my $GBseq; + my $gb = Bio::DB::GenBank->new; + my $query = Bio::DB::Query::GenBank->new + (-query =>$qry_string, + -db =>$datatype); + + my $seqio; + + if ($seqio = eval {$gb->get_Stream_by_query($query)}){ + # reuse the stream the eval just opened; the query used to be issued a second time here + while( my $GBseq = $seqio->next_seq ) { + my $sequence = $GBseq; # read a sequence object + print $fh $sequence; # write a sequence object + } + }else{ + print ND "$accessions\n"; + } + } +exit; + + diff -r 000000000000 -r 5b9a38ec4a39 getdata/get_gb_sp.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/get_gb_sp.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,64 @@ + + Grab All GenBank Data from a text list of species + + Bio + + + get_gb_sp.pl $data_file $database $outtype $outfile $nffile 2> error.log + + + + + + + + + + + + + + + + + + + +**What it does** + +Downloads ALL data from GenBank based on species names. + +------ + +**Inputs** + +A text list of species names (or genus names will retrieve all data for a genus.) + +------ + +**Outputs** + +The user selects which output format to use for data downloaded from GenBank. Options Include:: + 1. GenBank format + 2. FASTA format + +------- + +**Additional Information** + +GenBank format can be converted to phytab or other formats using GenBankstrip. + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + + diff -r 000000000000 -r 5b9a38ec4a39 getdata/get_seqs.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/get_seqs.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,380 @@ +#!/jgi/tools/bin/perl -w + +# +# This script creates a Fasta/Qual/Fastq file of selected sequences, with optional filters. +# +# 02/24/10 : created by Ed Kirton +# 12/07/10 : fixed Fastq bug +# + +use strict; +use warnings; +use Getopt::Long; +use IO::File; +#use PerlIO::gzip; +use FindBin; +use lib $FindBin::Bin; +use FastaDb; +use FastqDb; + +my $usage = <<'ENDHERE'; +NAME: + get_seqs.pl +PURPOSE: + To extract a subset of sequences by ID. +INPUT: + --db <*.fasta|fastq> : file containing sequences in Fasta or Fastq format + --table <*.tsv> : file containing sequence IDs (required) + --col <int> : column of table containing sequence IDs (optional; default=1=first column) +OUTPUT: + --selected <*.fasta|fastq> : file containing named sequences + --unselected <*.fasta|fastq> : file containing unselected sequences +OPTIONS: + --cosorted : uses faster algorithm if IDs appear in both files in the same order + --paired : filter complete read-pair when one read is selected (requires Illumina-style read IDs; i.e. */1, */2) + --ignorecase : ignore case differences between IDs + --gzip : compress all outfiles (currently unavailable: PerlIO::gzip is disabled) +OPTIONAL FILTERS: + Optional filters, each in the form of condition:column:value. + Where column is the column in the table (containing IDs) + Condition is one of the following: + String operators: + s_eq + s_ne + s_contains + s_notcontains + s_startswith + s_notstartswith + s_endswith + s_notendswith + Numerical operators: + n_eq + n_ne + n_gt + n_lt + Where value is a string or number as appropriate. 
+AUTHOR/SUPPORT: + Edward Kirton (ESKirton@LBL.gov) +ENDHERE + +# +# VALIDATE INPUT +# +my ($help, $dbfile, $tablefile, $id_col, $ignorecase, $cosorted, $selected, $unselected, $gzip, $paired); +GetOptions( + 'd|db=s' => \$dbfile, + 't|table=s' => \$tablefile, + 'c|col=i' => \$id_col, + 'ignorecase' => \$ignorecase, + 'cosorted' => \$cosorted, + 's|selected=s' => \$selected, + 'u|unselected=s' => \$unselected, + 'g|gzip' => \$gzip, + 'p|paired' => \$paired, + 'h|help' => \$help +); +if ($help) { print $usage; exit; } +die("DB required\n") unless $dbfile; +die("DB file not found: $dbfile\n") unless -f $dbfile; +die("Table required\n") unless $tablefile; +die("Table file not found: $tablefile\n") unless -f $tablefile; +$selected = '' if !defined($selected) or $selected eq 'None'; # 'None' means "no such output requested" +$unselected = '' if !defined($unselected) or $unselected eq 'None'; +$id_col=1 unless $id_col; +die("Invalid id column, $id_col\n") unless $id_col > 0; +die("--gzip is not available: the PerlIO::gzip output layer is disabled in this script\n") if $gzip; # the gzip opens below are commented out, so output used to be silently lost +my $filters = []; # each entry is [condition, column, value], the order match() unpacks +while (my $filter = shift @ARGV) { + next unless $filter; + my @a_filter = split(/:/, $filter); + die("Invalid number of filter options: @a_filter") unless @a_filter == 3; + push @$filters, \@a_filter; +} + +# +# MAIN +# +my ($n_selected,$n_unselected); +if ($cosorted) { + # SEARCH IS FAST AND EASY IF INPUTS SIMILARLY SORTED! 
+ ($n_selected,$n_unselected) = search_cosorted($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters); +} else { + # INPUT NOT CO-SORTED SO KEEP ALL IDS IN RAM + ($n_selected,$n_unselected) = search($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters); +} +print "Selected = $n_selected; Unselected = $n_unselected\n"; +exit; + +# +# RETURNS TRUE ONLY IF RECORD MATCHES (OPTIONAL) SEARCH CRITERIA +# +sub match +{ + my ($filters, $row) = @_; # $filters: ref to list of [condition, column, value]; $row: ref to the split table row + foreach my $filterA (@$filters) { + my ($condition, $col, $value) = @$filterA; + my $x = $row->[ $col - 1 ]; # columns are 1-based + if ($condition eq 's_eq') { return 0 unless $x eq $value } + elsif ($condition eq 's_ne') { return 0 unless $x ne $value } + elsif ($condition eq 's_contains') { return 0 unless $x =~ /\Q$value\E/ } # \Q..\E: the value is literal text, so IDs containing | . + are not read as regex syntax + elsif ($condition eq 's_notcontains') { return 0 unless $x !~ /\Q$value\E/ } + elsif ($condition eq 's_startswith') { return 0 unless $x =~ /^\Q$value\E/ } + elsif ($condition eq 's_notstartswith') { return 0 unless $x !~ /^\Q$value\E/ } + elsif ($condition eq 's_endswith') { return 0 unless $x =~ /\Q$value\E$/ } + elsif ($condition eq 's_notendswith') { return 0 unless $x !~ /\Q$value\E$/ } + elsif ($condition eq 'n_eq') { return 0 unless $x == $value } + elsif ($condition eq 'n_ne') { return 0 unless $x != $value } + elsif ($condition eq 'n_gt') { return 0 unless $x > $value } + elsif ($condition eq 'n_lt') { return 0 unless $x < $value } + } + return 1; +} + +# +# SIMULTANEOUSLY PARSE TWO STREAMS +# +sub search_cosorted +{ + my ($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters) = @_; + my $sfh = IO::File->new; + my $ufh = IO::File->new; + my $table = IO::File->new; + my $n_selected = 0; + my $n_unselected = 0; + + # OPEN FILES + if ($tablefile) { + open($table, "<$tablefile") or die("Unable to open file, $tablefile: $!\n"); + } else { + $table=*STDIN; + } + if ($selected) { + if ($gzip) { +# open($sfh, '>:gzip', $selected) or 
die("Unable to open file, $selected: $!\n"); + } else { + open($sfh, ">$selected") or die("Unable to open file, $selected: $!\n"); + } + } else { + open($sfh, ">/dev/null"); + } + if ($unselected) { + if ($gzip) { +# open($ufh, '>:gzip', $unselected) or die("Unable to open file, $unselected: $!\n"); + } else { + open($ufh, ">$unselected") or die("Unable to open file, $unselected: $!\n"); + } + } else { + open($ufh, ">/dev/null"); + } + + # GET FIRST MATCHING TARGET ID + my $prev_target_id = ''; + my $target_id = ''; + get_next_matching_target_id($table,$id_col,$ignorecase,$filters,\$target_id,\$prev_target_id,$paired); + unless ($target_id) { + # no records match search criteria + close $table; + close $sfh if $selected; + if ($unselected) { + open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); + while (<DB>) { # the <DB> read had been stripped from this copy; this counts lines, not records + print $ufh $_; + ++$n_unselected; + } + close DB; + } + close $ufh; + return (0, $n_unselected); # the caller unpacks two counts; a bare 0 left the second undefined + } + + # DETERMINE FILETYPE + open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); + my $format; + while (<DB>) { # first line that is neither blank nor a # comment decides the format + chomp; + if (/^#/ or ! $_) { next } + elsif (/^>/) { $format='fasta' } + elsif (/^@/) { $format='fastq' } + else { die "Invalid DB file format" } + last; + } + close DB; + + # PARSE + my $db = $format eq 'fasta' ? FastaDb->new($dbfile) : FastqDb->new($dbfile); + while (my $rec=$db->next_seq ) { + unless ($target_id) { + last unless $unselected; # done if no more seqs to get + # otherwise dump rest of seqs in unselected file + print $ufh $rec->output; + ++$n_unselected; + while ($rec=$db->next_seq ) { + print $ufh $rec->output; + ++$n_unselected; + } + last; + } + my $id=$ignorecase ? 
uc($rec->id):$rec->id; # NOTE(review): unlike search(), the /1 or /2 suffix is not stripped here when --paired is set - confirm intended + if ($id eq $prev_target_id or $id eq $target_id) { + # selected seq + print $sfh $rec->output; + ++$n_selected; + get_next_matching_target_id($table,$id_col,$ignorecase,$filters,\$target_id,\$prev_target_id,$paired); + } else { + # unselected seq + print $ufh $rec->output; + ++$n_unselected; + } + } + close $table; + close $sfh; + close $ufh; + + # If some target seqs not found, it's likely the files were not cosorted, so try unsorted search function. + if ($target_id) { + print "Files don't appear to be cosorted, trying unsorted search\n"; + return search($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters); # pass all nine arguments; $paired and $gzip used to be omitted, which put $filters in the $paired slot + } + return ($n_selected,$n_unselected); + + sub get_next_matching_target_id { + my ($table,$id_col,$ignorecase,$filters,$target_idR,$prev_target_idR,$paired)=@_; + $$prev_target_idR = $$target_idR; + $$target_idR = ''; + while (<$table>) { + chomp; + my @row = split(/\t/); + die("Bad input table") unless @row >= $id_col; + next unless match($filters, \@row); + my $new_target_id = $ignorecase ? uc($row[ $id_col - 1 ]) : $row[ $id_col - 1 ]; + $new_target_id=$1 if $new_target_id =~ /^(\S+)/; # use first word only + $new_target_id=$1 if $paired and $new_target_id =~ /^(\S+)\/[12]$/; + next if $new_target_id eq $$prev_target_idR; + $$target_idR=$new_target_id; + last; # return to parsing db file + } + } +} + +# +# LOAD IDS IN RAM THEN PARSE DB. 
+# +sub search +{ + my ($dbfile, $tablefile, $id_col, $ignorecase, $selected, $unselected, $paired, $gzip, $filters) = @_; + my $sfh = IO::File->new; # selected seqs + my $ufh = IO::File->new; # unselected seqs + my $table=IO::File->new; + my $n_selected=0; + my $n_unselected=0; + my %ids = (); + open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); + if ($tablefile) { + open($table, "<$tablefile") or die("Unable to open file, $tablefile: $!\n"); + } else { + $table=*STDIN; + } + if ($selected) { + if ($gzip) { +# open($sfh, '>:gzip', $selected) or die("Unable to open file, $selected: $!\n"); + } else { + open($sfh, ">$selected") or die("Unable to open file, $selected: $!\n"); + } + } else { + open($sfh, ">/dev/null"); + } + if ($unselected) { + if ($gzip) { +# open($ufh, '>:gzip', $unselected) or die("Unable to open file, $unselected: $!\n"); + } else { + open($ufh, ">$unselected") or die("Unable to open file, $unselected: $!\n"); + } + } else { + open($ufh, ">/dev/null"); + } + + # LOAD IDS OF MATCHING ROWS + my $num_targets=0; + while (<$table>) { + next if /^#/; + chomp; + my @row = split(/\t/); + my $id = $ignorecase ? 
uc($row[ $id_col - 1 ]) : $row[ $id_col - 1 ]; + $id=$1 if $id =~ /^(\S+)/; + $id=$1 if $paired and $id =~ /^(\S+)\/[12]$/; + if (match($filters, \@row)) { + # remember this ID + $ids{$id} = 0; # number of reads with this ID found (counter for paired option) + ++$num_targets; + } + } + unless ($num_targets) { + # no records match search criteria + close $table; + close $sfh if $selected; + if ($unselected) { + open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); + while (<DB>) { # the <DB> read had been stripped from this copy; this counts lines, not records + print $ufh $_; + ++$n_unselected; + } + close DB; + } + close $ufh; + return (0, $n_unselected); # the caller unpacks two counts; a bare 0 left the second undefined + } + + + # DETERMINE FILETYPE + open(DB, "<$dbfile") or die("Unable to open file, $dbfile: $!\n"); + my $format; + while (<DB>) { # first line that is neither blank nor a # comment decides the format + chomp; + if (/^#/ or /^$/) { next } + elsif (/^>/) { $format='fasta' } + elsif (/^@/) { $format='fastq' } + else { die "Invalid DB file format" } + last; + } + close DB; + + # GET SEQS + my $db = $format eq 'fasta' ? FastaDb->new($dbfile) : FastqDb->new($dbfile); + while (my $rec=$db->next_seq ) { + my $id = $ignorecase ? 
uc($rec->id) : $rec->id; + $id = $1 if $paired and $id =~ /^(\S+)\/[12]$/; + if (exists($ids{$id})) { + # selected + print $sfh $rec->output; + ++$n_selected; + if (!$paired) { + delete $ids{$id}; + } else { + $ids{$id} += 1; + delete $ids{$id} if $ids{$id} == 2; + } + } else { + # unselected + print $ufh $rec->output; + ++$n_unselected; + } + } + close $table; + close $sfh; + close $ufh; + + # MAKE SURE ALL TARGETS WERE FOUND + foreach my $id (keys %ids) { + if ($ids{$id}) { + delete($ids{$id}); # SOMETIMES INFILES CONTAIN ONLY ONE READ OF PAIR + }elsif($id eq 'EMPTY') { #ADDEDBY THO TO ALLOW EMPTY blast results + #in workflow using checkempty.pl + exit; + } else { + warn("Seq not found: $id\n"); + } + } + return ($n_selected,$n_unselected); +} + +__END__ diff -r 000000000000 -r 5b9a38ec4a39 getdata/get_seqs.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/get_seqs.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,92 @@ + +Create a file of selected sequences +get_seqs.pl +$cosorted +$paired +$ignorecase +-db $infile +-table $input_table +-col $id_column +-selected $selected +-unselected $unselected +#for $i in $filters +"${i.condition}:${i.column}:${i.value}" +#end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + outselect == "selected" or outselect == "both" + + + outselect == "unselected" or outselect == "both" + + + +**What it does** + +Produces a file of selected sequences. + +If the files are cosorted, a much faster algorithm is used. If it fails, it assumes the files were not cosorted and automatically tries again using the slower algorithm which does not require cosorted files. + +One or more filters may be included to evaluate text or numeric data. + +------ + +**Inputs** + +FASTA file of sequence database, text file of target IDs. + +------ + +**Outputs** + +FASTA file. User may select to either output the selected sequences, the other sequences, or both files. 
+ +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 getdata/getdata.tool_conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/getdata.tool_conf Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,11 @@ +
+ + + + + + + + + +
diff -r 000000000000 -r 5b9a38ec4a39 getdata/phylota_with_taxid.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/phylota_with_taxid.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,109 @@ +#!/usr/bin/perl -w +use strict; +use LWP::Simple; +use Bio::SeqIO; + +my $ti = $ARGV[0]; # taxon ID sent to the PhyLoTA browser +my $outfile = $ARGV[1]; # fasta output +my $phytabfile = $ARGV[2]; # phytab output derived from $outfile + +open(OUT, ">$outfile") or die "Cannot open $outfile for writing: $!\n"; # 'or exit' used to hide this failure + +my $content = getclustersfromphylota($ti); +my @weblines = split(/\<\/tr\>/, $content); +my @ci; + +#Parse html from phylota browser to retain just each ci +foreach(@weblines){ + if($_ =~ m/getcluster\.cgi/){ + chomp; + $_ =~ s/\&ntype\=1\&db\=184\".+// ; + $_ =~ s/(.*?)getcluster\.cgi.+cl\=// ; + $_ =~ s/\<\/font\>\<\/td\>// ; + chomp; + $_ =~ s/^\n// ; + push(@ci, $_); + } +} + +#get fasta files for trees +for(my $i=0;$i < @ci; $i++){ + my $ci = $ci[$i]; + my $addstring = 'ti'.$ti.'ci'.$ci.'_'; + my $fastafile = getfastafromphylota($ci,$ti); + #Add TI_CI_ to each fastaheader + $fastafile =~ s/\>/\>$addstring/g; + print OUT $fastafile; +} +close(OUT); + +#Now convert fasta file to phytab file and write +open(PHYTAB, ">$phytabfile") or die "Cannot open $phytabfile for writing: $!\n"; +# open infile fasta file +my $in_obj = Bio::SeqIO->new(-file => $outfile, '-format' =>'fasta'); +# every header written above is tiNciM_giX_tiY_species; split it back into phytab columns +# grab sequence object +while (my $seq = $in_obj->next_seq() ) { + # id and description are re-joined so the whole fasta header can be split on '_' + my $sequenceid = $seq->id; + my $species_name = $seq->desc; + my $fullheader = $sequenceid." 
".$species_name; + my $sequence = $seq->seq; + my @header = split(/_/, $fullheader); + my $cluster = $header[0]; + my $seqgi = $header[1]; + $seqgi =~ s/gi//; + my $seqti = $header[2]; + $seqti =~ s/ti//; + my $seqsp = $header[3]; + $seqsp = cleansp($seqsp); + print PHYTAB $seqsp."\t".$cluster."\t".$seqgi."\t".$sequence."\n"; +} +close(PHYTAB); + + + + + + +#************************************************************** +#sub routines + +sub cleansp +{ + my $seqsp = shift; + $seqsp =~ s/ /_/g; + $seqsp =~ s/\.//g; + $seqsp =~ s/\'//g; + $seqsp =~ s/\-//g; + return($seqsp); +} +sub getfastafromphylota +{ + my $ci=shift; + my $ti=shift; + + #print "Writing: CI:$ci TI:$ti\n"; + + my $url = 'http://phylota.net/cgi-bin/sql_getcluster_fasta.cgi?format=all&db=184&ti='.$ti.'&cl='.$ci.'&ntype=1'; + my $content = get $url; + die "Couldn't get $url" unless defined $content; + $content =~ s/\<html\>\s*\<pre\>//; # NOTE(review): reconstructed; this copy had only s/\\// here, apparently a stripped opening-tag pattern - confirm against the original script + $content =~ s/\<\/html\>//; + $content =~ s/\<\/pre\>//; + return($content); +} +sub getclustersfromphylota +{ + my $ti=shift; + + #print "Writing: CI:$ci TI:$ti\n"; + + my $url = 'http://phylota.net/cgi-bin/sql_getclusterset.cgi?ti='.$ti.'&ntype=1&piflag=1&dflag=0&db=184'; + my $content = get $url; + die "Couldn't get $url" unless defined $content; + $content =~ s/\<html\>\s*\<pre\>//; # NOTE(review): reconstructed as in getfastafromphylota - confirm against the original script + $content =~ s/\<\/html\>//; + $content =~ s/\<\/pre\>//; + return($content); +} diff -r 000000000000 -r 5b9a38ec4a39 getdata/phylota_with_taxid.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getdata/phylota_with_taxid.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,61 @@ + + Pull all genetic data from phylota using a GenBank Taxonomy ID + + Bio + LWP + + + phylota_with_taxid.pl $input $outfile $phytabfile + + + + + + + + + +**What it does** + +PhyLoTA_with_taxID pulls pre-calculated trees and corresponding raw data from the PhyLoTA browser. The PhyLoTA browser groups GenBank +data into gene clusters by similarity, and uses those clusters for phylogenetic analysis. + +------ + +**Inputs** + +1. 
A GenBank Taxon ID. +Search here for the ID of a taxon: +http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/ + +------ + +**Outputs** + +1. All raw sequence data in fasta format for the PhyLoTA clusters of the taxon. +2. All raw sequence data in PHYTAB format. http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Sanderson, M. J., D. Boss, D. Chen, K. A. Cranston, and A. Wehe. 2008. The PhyLoTA Browser: processing GenBank for molecular phylogenetics research. Syst. Biol. 57:335-346. 
+ + + + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/evolmap.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/evolmap.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,86 @@ +#!/usr/bin/perl +use strict; +my $evolmapPath = "\/home\/galaxy\/galaxy_dist\/tool-data\/shared\/jars\/evolmap"; + +my $file = "Options.txt"; # options file written below and handed to EvolMAP + +my $tree_Input = $ARGV[0]; +my $tag_Input = ".dat"; +my $protein_Input = $ARGV[1]; +my $database_name_Input = "dataout"; # fixed name; $ARGV[2] is not read +my $read_database_Input = $ARGV[3]; +my $Blastall_Input = $ARGV[4]; +my $read_blast_scores_Input = $ARGV[5]; +my $alignments_Input = $ARGV[6]; +my $bit_scores_Input = $ARGV[7]; +my $read_scores_Input = $ARGV[8]; +my $read_ancestors_Input = $ARGV[9]; +my $view_ancestors_Input = "false"; +my $sfa_Input = $ARGV[10]; +my $ortholog_threshold_Input = $ARGV[11]; +my $diverged_threshold_Input = $ARGV[12]; +my $diverged_std_Input = $ARGV[13]; +my $avg_of_paralogs_Input = $ARGV[14]; +#my $numDiffGenes = $ARGV[15]; + +my $temp = $tree_Input; +$temp =~ tr/(),/ /; +my @genes = split(' ', $temp); # one entry per leaf name in the tree +my $size = @genes; + +my @treeFiles; # the remaining arguments, one per leaf +my $argIndex = 15; +my $count; +for($count = 0; $count < $size; $count++) { + $treeFiles[$count] = $ARGV[$argIndex]; + $argIndex++; +} + +my $tree_copy = ""; # the tree with each leaf name replaced by the matching file argument +my $index = 0; +my $flag = 1; # true while the next non-punctuation character starts a new leaf name +for($count = 0; $count < length($tree_Input); $count++) { + if(substr($tree_Input, $count, 1) eq '(' || substr($tree_Input, $count, 1) eq ')' || substr($tree_Input, $count, 1) eq ',') { + $tree_copy = $tree_copy.substr($tree_Input, $count, 1); + $flag = 1; + } + else { + if($flag) { + $tree_copy = $tree_copy.$treeFiles[$index]; + $index++; + } + $flag = 0; + } +} + +$tree_copy =~ s/\Q.dat\E//g; + +open(CONFIG, '>', $file) or die "Cannot write $file: $!\n"; + +print CONFIG "processors = 10\n"; +#print CONFIG "tree = ".$tree_Input."\n"; +print CONFIG "tree = ".$tree_copy."\n"; +print CONFIG "tag = ".$tag_Input."\n"; +print CONFIG "protein = ".$protein_Input."\n"; +print CONFIG "database_name = ".$database_name_Input."\n"; +print CONFIG "read_database = 
".$read_database_Input."\n"; +print CONFIG "Blastall = ".$Blastall_Input."\n"; +print CONFIG "read_blast_scores = ".$read_blast_scores_Input."\n"; +print CONFIG "alignments = ".$alignments_Input."\n"; +print CONFIG "bit_scores = ".$bit_scores_Input."\n"; +print CONFIG "read_scores = ".$read_scores_Input."\n"; +print CONFIG "read_ancestors = ".$read_ancestors_Input."\n"; +print CONFIG "view_ancestors = ".$view_ancestors_Input."\n"; +print CONFIG "sfa = ".$sfa_Input."\n"; +print CONFIG "ortholog_threshold = ".$ortholog_threshold_Input."\n"; +print CONFIG "diverged_threshold = ".$diverged_threshold_Input."\n"; +print CONFIG "diverged_std = ".$diverged_std_Input."\n"; +print CONFIG "avg_of_paralogs = ".$avg_of_paralogs_Input."\n"; + +close(CONFIG) or die "Cannot finish writing $file: $!\n"; + +if($read_ancestors_Input eq "false" || $read_scores_Input eq "false") { + my $run = qx/java -jar -Xms8000m -Xmx8000m $evolmapPath\/EvolMAP.jar $file/; + print $run; +} + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/evolmap.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/evolmap.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,95 @@ + + Runs EvolMAP. + evolmap.pl '$tree' $protein $database_name $read_database $Blastall $read_blast_scores $alignments $bit_scores $read_scores $read_ancestors $sfa $ortholog_threshold $diverged_threshold $diverged_std $avg_of_paralogs #for $file in $fileList + ${file.tree_file} +#end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + EvolMAP is an algorithm and software for estimating the composition of ancestral genomes and the timing of gene duplication and loss events. + The input is a species-tree and genes from its modern species. + The output is the inferred ancestral genes of the speciation nodes of the tree and the inferred gene duplication and loss events specific to each branch. + + EvolMAP features include: + * Detection of orthologous groups from an ancestral gene perspective (i.e. 
descendants of an ancestral gene) + * Scalable and fast genome-level comparisons laying out timings of gene duplications and losses + * Generating gene expansion (GE) tree which is useful to track evolution of a specific domain on the species tree + * Generating average ortholog divergence (AOD) tree which is a measure of the molecular clock + * Categorizing divergence of gene duplications into in-paralogs, diverged in-paralogs and ambiguous gains + + Onur Sakarya, Kenneth S. Kosik and Todd H. Oakley. Reconstructing ancestral genome content based on symmetrical best alignments and Dollo parsimony. Bioinformatics 2008 24(5):606-612. + + http://kosik-web.mcdb.ucsb.edu/evolmap/index.htm + + Options overview + + tree: input newick format species tree + Example input: (((human,chimp),(mouse,rat)),dog) + + protein: amino-acid or nucleotide file + Check if true, else false + + database_name: analysis name + Example input: mammal_genomes + Output database name would be: mammal_genomes.gd.fa + + read_database: if true, reads already created fasta from disk + Check if true, else false + + blastall: runs blastall first -- if false, it generates all-to-all Needleman-Wunsch scores which is slow for large datasets. + Check if true, else false + + read_blast_scores: if true, reads already calculated blast scores + Check if true, else false + + alignments: Number of top alignments for each gene to be calculated by blastall (if used) + Example input: 300 + + bit_scores: if false, calculate Needleman-Wunsch alignment scores for the blast hits; if true, uses blast bit scores. 
+ Check if true, else false + + read_scores: if true, reads scores from already calculated score file + Check if true, else false + + read_ancestors: reads already calculated ancestor from file and re-runs Dollo parsimony + Check if true, else false + + ortholog_threshold: minimum similarity threshold for orthologs + Example input: 250 + + diverged_threshold: minimum similarity threshold for diverged paralogs + Example input: 250 + + diverged_std: diverged paralogs are allowed to be at most this many ortholog divergence standard deviations from the ancestor node's average sym-bet score. + Example input: 3 + + avg_of_paralogs: if true, while calculating similarity between two ancestral genes, avg. score of all its members to members of the other gene are considered if false only best score between the members is considered + Check if true, else false + + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/evolmap_long.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/evolmap_long.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,95 @@ + + Runs EvolMAP for longer (bigger) jobs. + evolmap.pl '$tree' $protein $database_name $read_database $Blastall $read_blast_scores $alignments $bit_scores $read_scores $read_ancestors $sfa $ortholog_threshold $diverged_threshold $diverged_std $avg_of_paralogs #for $file in $fileList + ${file.tree_file} +#end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + EvolMAP is an algorithm and software for estimating the composition of ancestral genomes and the timing of gene duplication and loss events. + The input is a species-tree and genes from its modern species. + The output is the inferred ancestral genes of the speciation nodes of the tree and the inferred gene duplication and loss events specific to each branch. + + EvolMAP features include: + * Detection of orthologous groups from an ancestral gene perspective (i.e. 
descendants of an ancestral gene) + * Scalable and fast genome-level comparisons laying out timings of gene duplications and losses + * Generating gene expansion (GE) tree which is useful to track evolution of a specific domain on the species tree + * Generating average ortholog divergence (AOD) tree which is a measure of the molecular clock + * Categorizing divergence of gene duplications into in-paralogs, diverged in-paralogs and ambiguous gains + + Onur Sakarya, Kenneth S. Kosik and Todd H. Oakley. Reconstructing ancestral genome content based on symmetrical best alignments and Dollo parsimony. Bioinformatics 2008 24(5):606-612. + + http://kosik-web.mcdb.ucsb.edu/evolmap/index.htm + + Options overview + + tree: input newick format species tree + Example input: (((human,chimp),(mouse,rat)),dog) + + protein: amino-acid or nucleotide file + Check if true, else false + + database_name: analysis name + Example input: mammal_genomes + Output database name would be: mammal_genomes.gd.fa + + read_database: if true, reads already created fasta from disk + Check if true, else false + + blastall: runs blastall first -- if false, it generates all-to-all Needleman-Wunsch scores which is slow for large datasets. + Check if true, else false + + read_blast_scores: if true, reads already calculated blast scores + Check if true, else false + + alignments: Number of top alignments for each gene to be calculated by blastall (if used) + Example input: 300 + + bit_scores: if false, calculate Needleman-Wunsch alignment scores for the blast hits, if true, uses blast bit scores. 
+ Check if true, else false + + read_scores: if true, reads scores from already calculated score file + Check if true, else false + + read_ancestors: reads already calculated ancestor from file and re-runs Dollo parsimony + Check if true, else false + + ortholog_threshold: minimum similarity threshold for orthologs + Example input: 250 + + diverged_threshold: minimum similarity threshold for diverged paralogs + Example input: 250 + + diverged_std: diverged paralogs are allowed to be at most this many ortholog divergence standard deviations from the ancestor node's average sym-bet score. + Example input: 3 + + avg_of_paralogs: if true, while calculating similarity between two ancestral genes, avg. score of all its members to members of the other gene are considered if false only best score between the members is considered + Check if true, else false + + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/evolmap_output.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/evolmap_output.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,40 @@ + + Modifies EvolMAP output + output.pl $file $numDiffGenes '$speciesList' +#for $file in $fileList + ${file.treeFile} +#end for + + + + + + + + + + + + + + + + + http://labs.eemb.ucsb.edu/oakley/todd/ + + Modifies EvolMAP output. Must provide the original EvolMAP output and files used. 
+ + Input preconditions: + Species fasta files - must be inputted in the same order as the original tree from the EvolMAP run + EvolMAP output file - must be from an EvolMAP run + Number of different species - the number of different species you want searched and pulled to the output file + Species tree - must be the same tree you used to get the EvolMAP output file in the original EvolMAP run + + Input file format: + FASTA (.fasta/.fa) + + Output format: + Text (txt) + + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/evolmap_output_exclude.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/evolmap_output_exclude.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,23 @@ +#!/usr/bin/perl + +my $input = $ARGV[0]; +my $excludeList = $ARGV[1]; + +my @species = split(/,/, $excludeList); + +open(INPUT, $input); + open(OUTPUT, '>'."output.txt"); + while(my $currLine = ) { + my @currentLine = split(/\t/, $currLine); + my $flag = 0; + for(my $i = 0; $i < @species; $i++) { + if($species[$i] eq $currentLine[0]) { + $flag = 1; + } + } + if($flag == 0) { + print OUTPUT $currLine; + } + } + close(OUTPUT); +close(INPUT); \ No newline at end of file diff -r 000000000000 -r 5b9a38ec4a39 orthologs/evolmap_output_exclude.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/evolmap_output_exclude.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,24 @@ + + Modifies EvolMAP Output to exclude specific species. + evolmap_output_exclude.pl $input $speciesList + + + + + + + + + + + http://labs.eemb.ucsb.edu/oakley/todd/ + + Takes the output file of EvolMAP Output tool as input and deletes the user defined species from this file and is ouputted to a new text file. 
+ + Input format: + Text (txt) + + Output format: + Text (txt) + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/hmmbuild.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/hmmbuild.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,106 @@ +import os +import optparse +import subprocess +from multiprocessing import Pool + +directory = "./data" +results = "results.data" +extension = ".afa" +model_extension = ".hmm" +inputFile = "" +index_of_name_in_hmm = 6 + + +def unescape(string): + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__' + } + + for key, value in mapped_chars.iteritems(): + string = string.replace(value, key) + + return string + + +def toData(text): + lis = text.split() + name = lis[index_of_name_in_hmm] + text = name + "\t" + text.replace("\n", "\\n") + return text + + +def hmmbuild(input): + file_name = directory + os.sep + input + # print file_name + # return subprocess.Popen(['hmmbuild', "--informat", "afa", file_name + ".hmm", file_name], stdout=subprocess.PIPE).communicate()[0] # ./muscle + pop = subprocess.Popen(['hmmbuild', "--informat", "afa", file_name + ".hmm", file_name]) + pop.wait() + + +class Sequence: + def __init__(self, string): + lis = string.split('\t') + # print lis + self.species = lis[0] + self.family = lis[1] + self.name = lis[2] + self.header = ' '.join(lis[:-1]) + self.sequence = lis[-1] + self.string = string + + def printFASTA(self): + return '> ' + self.header + '\n' + self.sequence + '\n' + + +def main(): + usage = """%prog [options] +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-i', '--in', + dest='input', + action='store', + type='string', + metavar="FILE", + help='Name of input data.') + + options, args = parser.parse_args() + + global 
inputFile, directory + inputFile = unescape(options.input) + + os.mkdir(directory) + + with open(inputFile) as f: + for line in f: + seq = Sequence(line) + with open(directory + os.sep + seq.family + extension, "a") as p: + p.write(seq.printFASTA()) + + pool = Pool() + list_of_files = [file for file in os.listdir(directory) if file.lower().endswith(extension)] + + pool.map(hmmbuild, list_of_files) + + result = [file for file in os.listdir(directory) if file.lower().endswith(model_extension)] + with open(directory + os.sep + results, "a") as f: + for file in result: + with open(directory + os.sep + file, "r") as r: + f.write(toData(r.read()) + "\n") + +if __name__ == '__main__': + main() diff -r 000000000000 -r 5b9a38ec4a39 orthologs/hmmbuild.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/hmmbuild.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,25 @@ + + hmmbuild :: profile HMM construction from multiple sequence alignments in phytab format. + + hmmbuild + + + hmmbuild.py -i $data > $hmmbuild_stdout 2>&1 + + + + + + + + + + + + **hmmbuild v3.0** + + Runs hmmbuild on MUSCLE alignment data. 
+ + See hmmbuild help: ftp://selab.janelia.org/pub/software/hmmer3/3.0/Userguide.pdf + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/hmmsearch.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/hmmsearch.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,116 @@ +import os +import optparse +import subprocess +from multiprocessing import Pool + +results_dir = "./data" +results = "results.data" +result_extension = ".out" +model_extension = ".hmm" +database = "" + + +def unescape(string): + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__' + } + + for key, value in mapped_chars.iteritems(): + string = string.replace(value, key) + + return string + + +def unpackData(models): + with open(models) as f: + for line in f: + hmm = HMM(line) + with open(results_dir + os.sep + hmm.name + model_extension, "a") as p: + # print(hmm.model) + p.write(hmm.model) + + +class HMM: + def __init__(self, string): + lis = string.split('\t') + # print lis + self.model = self.restoreNewLines(lis[1]) + self.name = lis[0] + + def restoreNewLines(self, string): + return string.replace('\\n', '\n') + + +def toData(text): + # lis = text.split() + # name = lis[index_of_name_in_hmm] + # text = name + "\t" + text.replace("\n", "\\n") + # text = text.replace("\n", "\\n") + return text + + +def hmmsearch(input): + file_name = results_dir + os.sep + input + # print file_name + # return subprocess.Popen(['hmmbuild', "--informat", "afa", file_name + ".hmm", file_name], stdout=subprocess.PIPE).communicate()[0] # ./muscle + pop = subprocess.Popen(['hmmsearch', "-o", file_name + result_extension, file_name, database]) + pop.wait() + + +def main(): + usage = """%prog [options] +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-i', 
'--hmm', + dest='hmm', + action='store', + type='string', + metavar="FILE", + help='Name of input hmm models.') + + parser.add_option( + '-d', '--database', + dest='database', + action='store', + type='string', + metavar="FILE", + help='Name of sequence database.') + + options, args = parser.parse_args() + + global database + models = unescape(options.hmm) + database = unescape(options.database) + + os.mkdir(results_dir) + + unpackData(models) + + list_of_files = [file for file in os.listdir(results_dir) if file.lower().endswith(model_extension)] + + pool = Pool() + pool.map(hmmsearch, list_of_files) + + result = [file for file in os.listdir(results_dir) if file.lower().endswith(result_extension)] + with open(results_dir + os.sep + results, "a") as f: + for file in result: + with open(results_dir + os.sep + file, "r") as r: + f.write(toData(r.read()) + "\n") + +if __name__ == '__main__': + main() diff -r 000000000000 -r 5b9a38ec4a39 orthologs/hmmsearch.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/hmmsearch.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,26 @@ + + hmmsearch :: search profile(s) against a sequence database. Produces a phytab output. + + hmmsearch + + + hmmsearch.py -i $hmm -d $database > $hmmsearch_stdout 2>&1 + + + + + + + + + + + + + **hmmsearch v3.0** + + Runs hmmsearch on HMM model data. + + See hmmsearch help: ftp://selab.janelia.org/pub/software/hmmer3/3.0/Userguide.pdf + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/README_UCSBhamster --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/README_UCSBhamster Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,78 @@ +Dependancies that must be installed and available in path: +blastall +formatdb + +Dependancies distributed with hamster and galaxy hamster +BioPerl +genewise +rungenewise.pm +wisecfg -- directory of configuration files for genewise + + + +QUICKSTART + +1. Copy galaxy_hamster files into the directory where you want to +install the program. 
+ +2. Execute the install.pl script to configure required directories. +Type: + +./install.pl + +3. Make sure blastall and formatdb are installed on your system, + and have been configured on your PATH. + +4. Add ucsb_hamster.xml to Galaxy's tool_conf.xml file. + +5. Restart Galaxy + + +Now, ucsb hamster can be used using HMMs constructed in a Galaxy +history using phytab_hmmbuild. + + + + + + +DETAILED INSTALLATION PROCEDURES: + +If for some reason install.pl does not work, first make sure that +you have deleted the PATH variable pointing to any existing galaxy_hamster +installation on your system in the .bashrc file. + +If you have deleted this variable and still cannot execute the install script, +you may follow the detailed installation procedure below. + +1. Copy galaxy_hamster files into the directory where you want to install +the program. + +2. Directory that contains the following scripts must be in the overall +path: +translate.pl +emap2fasta.pl +unbuild.py + +You may add the directory into your .bashrc file located in your HOME directory. + +For example: +export PATH="/home/galaxy/galaxy-dist/tools/galaxy_hamster:$PATH"; + +3. Type: source .bashrc to reload your environment variables. + +4. In the file run_genewise.pm located in the lib directory, you must set the full +path of the genewise configuration directory. The configuration directory is +called wisecfg and it is distributed with galaxy_hamster. + +5. From here, ensure that blastall and formatdb are installed on your system and +have been configured on your PATH. + +6. Add ucsb_hamster.xml to Galaxy's tool_conf.xml file. + +7. When installing on an Ubuntu system, the ucsb_hamster.sh script was not finding +translate.pl even though it was in the path. To fix this I (THO) added translate.pl and +the main hamster script to /usr/local/bin/ with a symbolic link, which seemed to fix +this problem. + +8. Restart Galaxy. 
diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/genewise Binary file orthologs/ucsb_hamster/genewise has changed diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/genewisedb Binary file orthologs/ucsb_hamster/genewisedb has changed diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/hamstrsearch_local-hmmer3.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/hamstrsearch_local-hmmer3.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,922 @@ +#!/usr/bin/perl -w +use strict; + +use FindBin; +use lib "$FindBin::Bin/lib"; +use Getopt::Long; +use Bio::SearchIO; +use Bio::Search::Hit::BlastHit; +use run_genewise; + +# PROGRAMNAME: hamstrsearch_local.pl + +# Copyright (C) 2009 INGO EBERSBERGER, ingo.ebersberger@univie.ac.at +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published +# by the Free Software Foundation; either version 3 of the License +# or any later version. + +# This program is distributed in the hope that it will be useful +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# You should have received a copy of the GNU General Public License +# along with this program; If not, see http://www.gnu.org/licenses + +# PROGRAM DESCRIPTION: This is the relevant version! + +# DATE: Wed Dec 19 10:41:09 CEST 2007 + + +######################## start main ############################# +my $version = "\nhamstrsearch_local-v1.0.pl\nGalaxy implementation by Todd H. 
Oakley and Roger Ngo, UCSB\n"; +print "$version\n"; +### EDIT THE FOLLOWING LINES TO CUSTOMIZE YOUR SCRIPT +## note: all the variables can also be set via the command line +my $pid = $$; +my $prog = 'hmmsearch'; #program for the hmm search +my $alignmentprog = 'clustalw'; + + +#PATH SETTINGS +my $hmmpath = '.'; +my $blastpath = '.'; #Uses Galaxy working directory +my $tmpdir = 'tmp_' . $pid; +my $eval = 1; # eval cutoff for the hmm search +my $logfile = "hamstrsearch_" . $pid . '.log'; +my $hmm_dir = 'hmm_dir'; +my $fa_dir = ''; +############################## +my $help; +my $seq2store_file=''; +my $cds2store_file=''; +my $hmm; +my @hmms; +my $fa; +my $fafile; +my @seqs2store; +my @cds2store; +my $ep2eg; +my $estfile; +my $aln; +my $idfile; +my $taxon_check = 0; +my $hmmset; +my $hmmsearch_dir; +my $dbfile = ''; # the file hmmsearch is run against +my $dbfile_short; +my $taxon_file; +my $refspec_string; +my @refspec = qw(); +my @primer_taxa; +my $refspec_name = ''; +my $taxon_global; +my $fileobj; +my $fa_dir_neu = ''; +my $gwrefprot; +my $seqtype; +my $align; +my $rep; +my $estflag; +my $proteinflag; +my $refseq; +my $refspec_final = ''; +my $concat; +my $seqs2store_file; + +#####determine the hostname####### +my $hostname = `hostname`; +chomp $hostname; +print "hostname is $hostname\n"; + +################################# +if (@ARGV==0) { + $help = 1; +} +## help message +my $helpmessage = " +This program is freely distributed under a GPL. See -version for more info +Copyright (c) GRL limited: portions of the code are from separate copyrights + +\nUSAGE: hamstrsearch_local.pl -sequence_file=<> -hmmset=<> -taxon=<> -refspec=<> [-est|-protein] [-hmm=<>] [-representative] [-h] + +OPTIONS: + +-sequence_file: + path and name of the file containing the sequences hmmer is run against + Per default, this file should be in the data directory. 
+-est: + set this flag if you are searching in ESTs +-protein: + set this flag if you are searching in protein sequences +-hmmset: + specifies the name of the core-ortholog set. + The program will look for the files in the default directory 'core-orthologs' unless you specify + a different one. +-taxon: + You need to specify a default taxon name from which your ESTs or protein sequences are derived. +-refspec: + sets the reference species. Note, it has to be a species that contributed sequences + to the hmms you are using. NO DEFAULT IS SET! For a list of possible reference + taxa you can have a look at the speclist.txt file in the default core-ortholog sets + that come with this distribution. Please use the 5 letter abreviations. If you choose + to use core-orthologs were not every taxon is represented in all core-orthologs, you + can provide a comma-separated list with the preferred refspec first. The lower-ranking + reference species will only be used if a certain gene is not present in the preferred + refspecies due to alternative paths in the transitive closure to define + the core-orthologs. + CURRENTLY NO CHECK IS IMPLEMENTED! + NOTE: A BLAST-DB FOR THE REFERENCE SPECIES IS REQUIRED! +-eval_limit=<> + This options allows to set the e-value cut-off for the HMM search. + DEFAULT: 1 +-hmm: + option to provide only a single hmm to be used for the search. + Note, this file has to end with .hmm + +### the following options should only be used when you chose to alter the default structure of the +### hamstrsearch_local directories. Currently, this has not been extensively tested. +-fasta_file: + path and name of the file containing the core-ortholog sequences + you don't have to use this option when you +-hmmpath: + sets the path to the hmm_dir. By default this is set to the current directory. +-blastpath: + sets the path where the blast-dbs are located. Per default this is ../blast_dir + Note, the program expects the structure blastpath/refspec/refspec_prot. 
+ To overrule this structure you will have to edit the script. + \n\n"; +GetOptions ("h" => \$help, + "hmm=s" => \$hmm, + "est" => \$estflag, + "protein"=> \$proteinflag, + "sequence_file=s" => \$dbfile, + "fasta_file=s" => \$fafile, + "hmmset=s" => \$hmmset, + "hmmpath=s" => \$hmmpath, + "taxon_file=s" => \$taxon_file, + "taxon=s" => \$taxon_global, + "eval_limit=s" => \$eval, + "refspec=s" => \$refspec_string, + "estfile=s" => \$estfile, + "representative" => \$rep, + "blastpath=s" => \$blastpath, + "galaxyout=s" => \$seqs2store_file, + "2galaxyout=s" => \$cds2store_file); + +if ($help) { + print $helpmessage; + exit; +} + +## 1) check if all information is available to run HaMStR +my ($check, @log) = &checkInput(); +if ($check == 0) { + print join "\n", @log; + print "$helpmessage"; + exit; +} +else { + open (OUT, ">$logfile") or die "could not open logfile $logfile\n"; + print OUT join "\n", @log; + close OUT; +} +### read in of the core-ortholog sequences +my $co_seqs = parseSeqfile("$fafile"); + +## 2) loop through the hmms +## process each hmm file separately +for (my $i = 0; $i < @hmms; $i++) { + $fileobj = undef; + my @seqs = qw(); + my @newseqs = qw();## var to contain the sequences to be added to the orthologous cluster + my @newcds = qw(); + my $hmm = $hmms[$i]; + my $hmmout = $hmm; + $hmmout =~ s/\.hmm/\.out/; + ## 3) run the hmm search + if (!(-e "$hmmsearch_dir/$hmmout")) { + print "now running $prog using $hmm\n"; + !`$prog $hmm_dir/$hmm $dbfile >$hmmsearch_dir/$hmmout` or die "Problem running hmmsearch\n"; + } + else { + print "an hmmresult $hmmout already exists. Using this one!\n"; + print "NOTE: in Galaxy the hmm results are stored in the directory of the dataset\n"; + } + + ## 4) process the hmm search result + my $hitcount = 0; + ## 4a) loop through the individual results + ## now the modified version for hmmer3 comes + my ($query_name, @results) = parseHmmer3($hmmout, $hmmsearch_dir); + if (! 
@results) { + print "no hit found for $query_name\n"; + next; + } + chomp $query_name; + print "Results for $query_name\n"; + for (my $k = 0; $k < @results; $k++) { + my $hitname = $results[$k]; + print "$hitname\n"; + my $keep = 0; + my $hitseq = ''; + $refseq = ''; + ## 4b) test for the reciprocity criterion fulfilled + ($keep, $hitname, $hitseq, $refspec_final, $refseq) = &check4reciprocity($query_name, $hitname, @refspec); + if ($keep == 1) { + ## blast search with the hmm hit identifies the core-ortholog sequence of the reference species + ## check for the taxon from the taxon_file.txt. + my $taxon = ''; + if ($taxon_check){ + if ($taxon_check == 1) { + $taxon = &getTaxon($hitname); + } + elsif ($taxon_check == 2) { + $taxon = $taxon_global; + } + } + ## put the info about the hits into an object for later post-processing + $fileobj->{$taxon}->{prot}->[$hitcount] = $hitseq; + $fileobj->{$taxon}->{ids}->[$hitcount] = $hitname; + $fileobj->{$taxon}->{refseq} = $refseq; + $hitcount++; + } + else { + print "match to different protein from $refspec_final\n"; + } + } + ## 5) do the rest only if at least one hit was obtained + if (defined $fileobj) { + ## 5a) if the hits are derived from ESTs, get the best ORF + if ($estflag) { + $fileobj = &predictORF(); + } + ## 5b) if the user has chosen to postprocess the results + if ($rep) { + &processHits($refseq, $concat); + } + ## 6) prepare the output + my @taxa = keys(%$fileobj); + for (my $i = 0; $i< @taxa; $i++) { + if ($rep) { +# push @newseqs, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}"; +#Rearrange Order for Galaxy - want species first for phylotable format convert pipes to tabs + push @newseqs, ">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}"; + push @newseqs, $fileobj->{$taxa[$i]}->{refprot}; + if ($estflag) { +# push @newcds, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}"; +#Rearrange Order for Galaxy - want species first for phylotable format + push @newcds, 
">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}"; + push @newcds, $fileobj->{$taxa[$i]}->{refcds}; + } + } + else { + my $idobj = $fileobj->{$taxa[$i]}->{ids}; + my $protobj = $fileobj->{$taxa[$i]}->{prot}; + my $cdsobj = $fileobj->{$taxa[$i]}->{cds}; + for (my $j = 0; $j < @$idobj; $j++) { +# push @newseqs, ">$query_name|$taxa[$i]|$idobj->[$j]"; +#Rearrange Order for Galaxy - want species first for phylotable format also tabs not pipe + push @newseqs, ">$taxa[$i]\t$query_name\t$idobj->[$j]"; + push @newseqs, $protobj->[$j]; + if ($estflag) { +# push @newcds, ">$query_name|$taxa[$i]|$idobj->[$j]"; +#Rearrange Order for Galaxy - want species first for phylotable format + push @newcds, ">$taxa[$i]\t$query_name\t$idobj->[$j]"; + push @newcds, $cdsobj->[$j]; + } + } + } + my $refs = $co_seqs->{$query_name}; + for (keys %$refs) { + my $line = ">$query_name|$_|" . $refs->{$_}->{seqid} . "\n" . $refs->{$_}->{seq}; + push @seqs, $line; + } + chomp @seqs; + print "\n"; + @seqs = (@seqs, @newseqs); + open (OUT, ">$fa_dir_neu/$query_name.fa"); + print OUT join "\n", @seqs; + print OUT "\n"; + close OUT; + for (my $i = 0; $i < @newseqs; $i+= 2) { +# my $line = $newseqs[$i] . "|" . $newseqs[$i+1]; +#Galaxy uses tabs not pipes + my $line = $newseqs[$i] . "\t" . $newseqs[$i+1]; + $line =~ s/>//; + push @seqs2store, $line; + if ($estflag) { +#Galaxy uses tabs not pipes +# my $cdsline = $newcds[$i] . "|" . $newcds[$i+1]; + my $cdsline = $newcds[$i] . "\t" . 
$newcds[$i+1]; + $cdsline =~ s/>//; + push @cds2store, $cdsline; + } + } + } + } +} +if (@seqs2store > 0) { + open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n"; + print OUT join "\n", @seqs2store; + print OUT "\n"; + close OUT; + if ($estflag) { + open (OUT, ">$cds2store_file") or die "failed to open output CDS file\n"; + print OUT join "\n", @cds2store; + print OUT "\n"; + close OUT; + } +} +else { + open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n"; + print OUT "no hits found\n"; +} +exit; +##################### start sub ############### +####### checkInput performs a number of checks whether sufficient information +### and all data are available to run HaMStR +sub checkInput { + my @log; + my $check = 1; + $dbfile_short = $dbfile; + $dbfile_short =~ s/\..*//; + ## 1) check for filetype + print "Checking for filetype:\t"; + if (!defined $estflag and !defined $proteinflag) { + push @log, "please determine the sequence type. Choose between the options -EST or -protein"; + print "failed\n"; + $check = 0; + } + else { + if ($estflag) { + $estfile = $dbfile; + $dbfile = "$dbfile.tc"; + push @log, "HaMStR will run on the ESTs in $estfile"; + push @log, "Translating ESTs"; + if (!(-e "$dbfile")) { + print "translating $estfile, this may take a while\n"; + `translate.pl -in=$estfile -out=$dbfile`; + open (LOG, "$logfile") or die "could not open logfile $logfile\n"; + my @info = ; + @log = (@log, @info); + close LOG; + } + else { + push @log, "Translated file already exists, using this one\n"; + } + if (! -e "$dbfile") { + push @log, "The translation of $estfile failed. Check the script translate.pl"; + print "failed\n"; + $check = 0; + } + } + else { + ## file type is protein + print "succeeded\n"; + } + } + ## 2) Check for presence of blastall + print "Checking for the blast program\t"; + if (!(`blastall`)) { + push @log, "could not execute blastall. 
Please check if this program is installed and executable"; + print "failed\n"; + $check = 0; + } + else { + push @log, "check for blastall succeeded"; + print "succeeded\n"; + } + ## 3) Check for presence of hmmsearch + print "Checking for hmmsearch\t"; + if (! `$prog -h`) { + push @log, "could not execute $prog. Please check if this program is installed and executable"; + print "failed\n"; + $check = 0; + } + else { + push @log, "check for $prog succeeded\n"; + print "succeeded\n"; + } + ## 4) Check for reference taxon + print "Checking for reference species and blast-dbs\t"; + if (!(defined $refspec_string)) { + push @log, "Please provide a reference species for the reblast!"; + print "failed\n"; + $check = 0; + } + else { + push @log, "Reference species for the re-blast: $refspec_string"; + @refspec = split /,/, $refspec_string; + $refspec_name = $refspec[0]; + print "succeeded\n"; + } + ## 5) Check for presence of the required blast dbs + print "checking for blast-dbs:\t"; + for (my $i = 0; $i < @refspec; $i++) { + my $blastpathtmp = "$blastpath/$refspec[$i]/$refspec[$i]" . "_prot"; + if (! (-e "$blastpathtmp.pin")) { + push @log, "please edit the blastpath. Could not find $blastpathtmp"; + print "$blastpathtmp failed\n"; + $check = 0; + } + else { + push @log, "check for $blastpathtmp succeeded"; + print "succeeded\n"; + } + } + ## 6) Check for presence of the directory structure + print "checking for presence of the hmm files:\t"; + if (!(defined $hmmset)) { + $hmmpath = '.'; + $hmmset = 'manual'; + } + else { + $hmmpath = "$hmmpath/$hmmset"; + $fafile = "$hmmpath/$hmmset" . '.fa'; + } + $hmm_dir = "$hmmpath/$hmm_dir"; + $hmmsearch_dir = $dbfile_short . '_' . $hmmset; + +#CHANGED FOR GALAXY DIRECTORY + # $fa_dir_neu = 'fa_dir_' . $dbfile_short . '_' . $hmmset . '_' . $refspec_name; + $fa_dir_neu = $dbfile_short . '_' . $hmmset . '_' . 
$refspec_name; + ## 7) check for the presence of the hmm-files and the fasta-file + if (!(-e "$hmm_dir")) { + push @log, "Could not find $hmm_dir"; + print "failed\n"; + $check = 0; + } + else { + if (defined $hmm) { + if (! -e "$hmm_dir/$hmm") { + push @log, "$hmm has been defined but could not be found in $hmm_dir/$hmm"; + $check = 0; + } + else { + push @log, "$hmm has been found"; + if ($hmm =~ /\.hmm$/) { + @hmms = ($hmm); + } + } + } + else { + push @log, "running HaMStR with all hmms in $hmm_dir"; + @hmms = `ls $hmm_dir`; + } + chomp @hmms; + print "succeeded\n"; + } + + ## 8) Test for presence of the fasta file containing the sequences of the core-ortholog cluster + print "checking for presence of the core-ortholog file:\t"; + if (defined $fafile) { + if (! -e "$fafile") { + push @log, "Could not find the file $fafile"; + print "failed\n"; + $check = 0; + } + else { + push @log, "check for $fafile succeeded"; + print "succeeded\n"; + } + } + else { + push @log, "Please provide path and name of fasta file containing the core-ortholog sequences"; + $check = 0; + print "failed\n"; + } + ## 9) Checks for the taxon_file + print "testing whether the taxon has been determined:\t"; + if (!(defined $taxon_file) or (!(-e "$taxon_file"))) { + if (defined $taxon_global) { + push @log, "using default taxon $taxon_global for all sequences"; + print "succeeded\n"; + $taxon_check = 2; + } + else { + push @log, "No taxon_file found. Please provide a global taxon name using the option -taxon"; + print "failed\n"; + $check = 0; + } + } + else { + push @log, "using the file $taxon_file as taxon_file"; + print "succeeded\n"; + $taxon_check = 1; + } + ## 10) Set the file where the matched seqs are found +#CHANGED BY THO FOR GALAXY TO ALLOW DETERMINATION OF OUTPUT FILE Made INPUT Option +# $seqs2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . '.out'; +# $cds2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . 
'_cds.out'; + + ## 11) apply the evalue-cut-off to the hmmsearch program + $prog = $prog . " -E $eval"; + push @log, "hmmsearch: $prog"; + + ## 12) setting up the directories where the output files will be put into. + if ($check == 1) { + if (!(-e "$hmmsearch_dir")) { + `mkdir $hmmsearch_dir`; + } + if (!(-e "$fa_dir_neu")) { + `mkdir $fa_dir_neu`; + } + if (!(-e "$tmpdir")) { + `mkdir $tmpdir`; + } + } + return ($check, @log); +} +################# +## check4reciprocity is the second major part of the program. It checks +## whether the protein sequence that has been identified by the hmmsearch +## identifies in turn the protein from the reference taxon that was used to +## build the hmm. +sub check4reciprocity { + my ($query_name, $hitname, @refspec) = @_; + my $searchdb; + ## get the sequence from the db_file + my $hitseq = `grep -m 1 -A 1 ">$hitname" $dbfile | tail -n 1`; + if (!defined $hitseq) { + print "could not retrieve a sequence for $hitname. Skipping...\n"; + return(0, '', '', ''); + } + else { + ## now get the sequence used for building the hmm + my @original; + my $refspec_final; + for (my $i = 0; $i < @refspec; $i++) { + @original = `grep "^>$query_name|$refspec[$i]" $fafile |sed -e "s/.*$refspec[$i]\|//"`; + chomp @original; + + if (@original > 0) { + $refspec_final = $refspec[$i]; + $searchdb = "$blastpath/$refspec_final/$refspec_final" . "_prot"; + last; + } + else { + print "original sequence not be found with grepping for ^>$query_name|$refspec[$i]. 
Proceeding with next refspec\n"; + } + } + if (@original == 0) { + print "original sequence not be found\n"; + return (0, '', '', $refspec_final); + } + print "REFSPEC is $refspec_final\n"; + ## continue with the blast + chomp $hitseq; +# $hitname =~ s/\|.*//; + ## now run the blast + open (OUT, ">$tmpdir/$$.fa") or die "could not open out for writing\n"; + print OUT ">$hitname\n$hitseq"; + close OUT; + !`blastall -p blastp -d $searchdb -v 10 -b 10 -i $tmpdir/$$.fa -o $tmpdir/$$.blast` or die "Problem running blast\n"; + ## now parse the best blast hit + my @hits = &getBestBlasthit("$tmpdir/$$.blast"); + if (@hits > 0) { + + print "hmm-seq: ", join "\t", @original , "\n"; + ## now loop through the best hits with the same evalue and check whether + ## among these I find the same seq as in $original + for (my $i = 0; $i <@hits; $i++) { + print "blast-hit: $hits[$i]"; + ## now loop through all the refspec-sequences in the hmm file + for (my $j = 0; $j < @original; $j++) { + if ($original[$j] eq $hits[$i]) { + print "\tHit\n"; + my ($refseq) = `grep -A 1 "$query_name|$refspec_final|$original[$j]" $fafile |tail -n 1`; + return (1, $hitname, $hitseq, $refspec_final, $refseq); + } + else { + print "\nnot hitting $original[$j]\n"; + } + } + } + ### if we end up here, we didn't find a hit that matches to the original sequence + ### in the top hits with the same eval + return (0, '', '', $refspec_final); + } + else { + print "no hit obtained\n"; + return(0, '', '', $refspec_final); + } + } +} +############# +sub getBestBlasthit { + my @hits; + my ($file) = @_; + my $searchio = Bio::SearchIO->new(-file => $file, + -format => 'blast', + -report_type => 'blastp') or die "parse failed"; + while( my $result = $searchio->next_result ){ + my $count = 0; + my $sig; + my $sig_old; + while( my $hit = $result->next_hit){ + ## now I enter all top hits having the same evalue into the result + $sig = $hit->score; + if (!defined $sig_old) { + $sig_old = $sig; + } + if ($sig == $sig_old) { 
+ push @hits, $hit->accession; + } + else { + last; + } + } + } + return(@hits); +} +################## +sub getTaxon { + my ($hitname) = @_; +# my $q = "select name from taxon t, est_project e, est_info i, annotation_neu a where a.id = $hitname and a.contig_id = i.contig_id and i.project_id = e.project_id and e.taxon_id = t.taxon_id"; + if ($hitname =~ /\D/) { + $hitname =~ s/_.*//; + } + my $taxon = `grep -m 1 "^$hitname," $taxon_file | sed -e 's/^.*,//'`; + chomp $taxon; + $taxon =~ s/^[0-9]+,//; + $taxon =~ s/\s*$//; + $taxon =~ s/\s/_/g; + if ($taxon) { + return ($taxon); + } + else { + return(); + } +} +############### +sub processHits { + my ($concat) = @_; + ## 1) align all hit sequences for a taxon against the reference species + my @taxa = keys(%$fileobj); + for (my $i = 0; $i < @taxa; $i++) { + &orfRanking($taxa[$i]); + } +} + +################ +sub predictORF { + my $fileobj_new; +# my ($refseq) = @_; + my @taxa = keys(%$fileobj); + for (my $i = 0; $i < @taxa; $i++) { + my $protobj = $fileobj->{$taxa[$i]}->{prot}; + my $idobj = $fileobj->{$taxa[$i]}->{ids}; + my $refseq = $fileobj->{$taxa[$i]}->{refseq}; + my @ids = @$idobj; + for (my $j = 0; $j < @ids; $j++) { + ## determine the reading frame + my ($rf) = $ids[$j] =~ /.*_RF([0-9]+)/; + print "rf is $rf\n"; + $ids[$j] =~ s/_RF.*//; +# my $est = `grep -A 1 "$ids[$j]" $estfile |tail -n 1`; +################new grep command from version 8 to fix bug + my $est = `grep -A 1 ">$ids[$j]\\b" $estfile |tail -n 1`; + if (! 
$est) { + die "error in retrieval of est sequence for $ids[$j] in subroutine processHits\n"; + } + ## the EST is translated in rev complement + if ($rf > 3) { + $est = revComp($est); + } + + my $gw = run_genewise->new($est, $refseq, "$tmpdir"); + my $translation = $gw->translation; + my $cds = $gw->codons; + $translation =~ s/[-!]//g; + $fileobj_new->{$taxa[$i]}->{ids}->[$j] = $ids[$j]; + $fileobj_new->{$taxa[$i]}->{prot}->[$j] = $translation; + $fileobj_new->{$taxa[$i]}->{cds}->[$j] = $cds; + $fileobj_new->{$taxa[$i]}->{refseq} = $refseq; + } + } + return($fileobj_new); +} +############################ +sub orfRanking { + my ($spec) = @_; + my $result; + my $refprot; + my $refcds; + my @toalign; + my $protobj = $fileobj->{$spec}->{prot}; + my $idobj = $fileobj->{$spec}->{ids}; + my $refcluster; ## variables to take the cluster and its id for later analysis + my $refid; + if (@$protobj == 1) { + ## nothing to chose from + $refprot = $protobj->[0]; + $refcds = $fileobj->{$spec}->{cds}->[0]; + my $length = length($refprot); + $refid = $idobj->[0] . "-" . 
$length; + } + else { + ## more than one cluster + push @toalign, ">$refspec_final"; + push @toalign, $fileobj->{$spec}->{refseq}; + ## now walk through all the contigs + for (my $i = 0; $i < @$protobj; $i++) { + my @testseq = (">$idobj->[$i]", $protobj->[$i]); + @testseq = (@testseq, @toalign); + open (OUT, ">$tmpdir/$pid.ref.fa") or die "could not open file for writing refseqs\n"; + print OUT join "\n", @testseq; + close OUT; + ## run clustalw + !(`$alignmentprog $tmpdir/$pid.ref.fa -output=fasta -outfile=$tmpdir/$pid.ref.aln 2>&1 >$tmpdir/$pid.ref.log`) or die "error running clustalw\n"; + ## get the alignment score + $result->[$i]->{score} = `grep "Alignment Score" $tmpdir/$pid.ref.log |sed -e 's/[^0-9]//g'`; + if (!$result->[$i]->{score}) { + die "error in determining alignment score\n"; + } + chomp $result->[$i]->{score}; + ## get the aligned sequence + open (ALN, "$tmpdir/$pid.ref.aln") or die "failed to open alignment file\n"; + my @aln = ; + close ALN; + my $aseq = extractSeq($idobj->[$i], @aln); + ## remove the terminal gaps + $aseq =~ s/-*$//; + $result->[$i]->{aend} = length $aseq; + my ($head) = $aseq =~ /^(-*).*/; + ($result->[$i]->{astart}) = length($head)+1; + } + ### the results for all seqs has been gathered, now order them + $result = sortRef($result); + ($refprot, $refcds, $refid) = &determineRef($result,$spec); + } + $fileobj->{$spec}->{refprot} = $refprot; + $fileobj->{$spec}->{refcds} = $refcds; + $fileobj->{$spec}->{refid} = $refid; + return(); +} +########################### +sub sortRef { + my $result = shift; + my @sort; + for (my $i = 0; $i < @$result; $i++) { + push @sort, "$i,$result->[$i]->{astart},$result->[$i]->{aend},$result->[$i]->{score}"; + } + open (OUT, ">$tmpdir/$pid.sort") or die "failed to write for sorting\n"; + print OUT join "\n", @sort; + close OUT; + `sort -n -t ',' -k 2 $tmpdir/$pid.sort >$tmpdir/$pid.sort.out`; + @sort = `less $tmpdir/$pid.sort`; + chomp @sort; + $result = undef; + for (my $i = 0; $i < @sort; $i++) { 
+ ($result->[$i]->{id}, $result->[$i]->{start}, $result->[$i]->{end}, $result->[$i]->{score}) = split ',', $sort[$i]; + } + return($result); +} +######################## +sub determineRef { + my ($result, $spec) = @_; + my $lastend = 0; + my $lastscore = 0; + my $final; + my $count = 0; + my $id = ''; + for (my $i = 0; $i < @$result; $i++) { + if ($result->[$i]->{start} < $lastend or $lastend == 0) { + if ($result->[$i]->{score} > $lastscore) { + $lastend = $result->[$i]->{end}; + $lastscore = $result->[$i]->{score}; + $id = $result->[$i]->{id}; + } + } + elsif ($result->[$i]->{start} > $lastend) { + ## a new part of the alignment is covered. Fix the results obtained so far + $final->[$count]->{id} = $id; + $lastend = $result->[$i]->{end}; + $id = $result->[$i]->{id}; + $count++; + } + } + $final->[$count]->{id} = $id; + ## now concatenate the results + my $refprot = ''; + my $refid = ''; + my $refcds = ''; + for (my $i = 0; $i < @$final; $i++) { + my $seq = $fileobj->{$spec}->{prot}->[$final->[$i]->{id}]; + my $cdsseq = $fileobj->{$spec}->{cds}->[$final->[$i]->{id}]; + my $length = length($seq); + $refid .= "$fileobj->{$spec}->{ids}->[$final->[$i]->{id}]-$length" . 
"PP"; + $refprot .= $seq; + if ($estflag) { + $refcds .= $cdsseq; + } + } + $refid =~ s/PP$//; + return($refprot, $refcds, $refid); +} +############################# +sub extractSeq { + my ($id, @aln) = @_; + my $seq = ''; + my $start = 0; + for (my $i = 0; $i < @aln; $i++) { + if ($aln[$i] =~ $id) { + $start = 1; + } + elsif ($aln[$i] =~ />/ and $start == 1) { + last; + } + elsif ($start == 1) { + $seq .= $aln[$i]; + } + } + $seq =~ s/\s//g; + return ($seq); +} +############################## +sub revComp { + my ($seq) = @_; + $seq =~ tr/AGCTYRKMWSagct/TCGARYMKWSTCGA/; + $seq = reverse($seq); + return($seq); +} +############################## +sub parseHmmer3 { + my ($file, $path) = @_; + if (!defined $path) { + $path = '.'; + } + open (IN, "$path/$file") or die "failed to open $file\n"; + my @data = ; + close IN; + ### extract the hits + my @hit; + my $start = 0; + my $stop = 0; + my $i = 0; + for (my $i = 0; $i < @data; $i++) { + if (!($data[$i] =~ /\S/)) { + next; + } + else { + if ($data[$i] =~ /Scores for complete sequence/) { + $start = 1; + $i += 4; + } + elsif (($data[$i] =~ /inclusion threshold/) or ($data[$i] =~ /Domain and alignment/)) { + last; + } + if ($start == 1 and $stop == 0) { + $data[$i] =~ s/^\s+//; + my @list = split /\s+/, $data[$i]; + push @hit, $list[8]; + $start = 0; #Added by THO + } + } + } + ### get the query_id + my ($query) = grep /^Query:/, @data; + $query =~ s/^Query:\s+//; + $query =~ s/\s.*//; + if (defined $hit[0]) { + chomp @hit; + return ($query, @hit); + } + else { + return ($query); + } +} +##################### +sub parseSeqfile { + my $seqref; + my $id; + my $spec; + my $seqid; + my $seq; + my $file = shift; + open (IN, "$file") or die "failed to open $file\n"; + my @seqs = ; + close IN; + chomp @seqs; + for (my $i = 0; $i < @seqs; $i++) { + if ($seqs[$i] =~ />/) { + $seqs[$i] =~ s/>//; + if (defined $id and defined $seq) { + $seqref->{$id}->{$spec}->{seqid} = $seqid; + $seqref->{$id}->{$spec}->{seq} = $seq; + $seq = 
undef; + } + ($id, $spec, $seqid) = split (/\|/, $seqs[$i]); + } + else { + $seq .= $seqs[$i]; + } + } + if (defined $id and defined $seq) { + $seqref->{$id}->{$spec}->{seqid} = $seqid; + $seqref->{$id}->{$spec}->{seq} = $seq; + $seq = undef; + } + return ($seqref); +} diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/install_ucsb_hamster.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/install_ucsb_hamster.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,51 @@ +#!/usr/bin/perl + +# UCSB Hamster Installation Script + +use strict; +use warnings; +use Cwd; + +my $galaxy_hamster = getcwd(); + +my @tokens = split('/', $galaxy_hamster); + +# Checking if all dependencies in environment variable or not. +# If conflicts are found, install will abort. Otherwise, the +# hamster dependencies will be added to the path + +chdir("\/$tokens[1]\/$tokens[2]\/"); + +open(BASHRCIN, "<".".bashrc"); + while(my $currLine = ) { + if($currLine eq "export PATH=\"$galaxy_hamster:\$PATH\";\n") { + die "Error: Please make sure there are no conflicting files in your environment variables of your .bashrc file.\n\nIf you are unsure what to do, refer to the detailed installation procedures in the README file.\nAborted installation."; + } + } +close(BASHRCIN); + +open(BASHRC, ">>".".bashrc"); + print BASHRC "export PATH=\"$galaxy_hamster:\$PATH\";\n"; +close(BASHRC); + +# edit wisecfg to include $galaxy_hamster/lib/wisecfg +chdir("$galaxy_hamster\/lib"); + +open(WISECFG, "<"."run_genewise.pm"); + my $new_run_genewise_pm = ""; + my $i = 0; + while(my $currLine = ) { + $i++; + if($i == 3) { + $currLine = "$ENV{'WISECONFIGDIR'} = "."\"$galaxy_hamster".'/lib/wisecfg"'.";\n"; + } + $new_run_genewise_pm = $new_run_genewise_pm.$currLine; + } +close(WISECFG); + +open(WISECFGOUT, ">"."run_genewise.pm"); + print WISECFGOUT $new_run_genewise_pm; +close(WISECFGOUT); + +# installation was a success +print "Installation was a SUCCESS!\n\nYou may re-run this script in the future 
to reconfigure UCSB Hamster.\n\n" diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/run_genewise.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/run_genewise.pm Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,250 @@ +package run_genewise; +use strict; +$ENV{'WISECONFIGDIR'} = "/home/osiris/galaxy-dist/tools/osiris/orthologs/ucsb_hamster/lib/wisecfg"; +# this module runs genewise on a DNA sequence and a protein sequence +# and then allows to parse this result. +# the constructor creates an object containing a reference to an array +# containing the file content +1; +sub new { + my $self_tmp = []; + my $self; + my ($class, $dna, $prot, $path) = @_; + if (!defined $path) { + $path = '/tmp'; + } +$dna =~ s/R/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence +$dna =~ s/S/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence +$dna =~ s/W/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence +$dna =~ s/D/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence +$dna =~ s/K/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence +$dna =~ s/Y/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence +$dna =~ s/B/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence +$dna =~ s/V/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence +$dna =~ s/M/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence + + # the file names + my $protname = 'protein'; + my $dnaname = 'dna'; + + ## print the two sequences to default path /tmp/ + open (DNA, ">$path/dna.fa") or die "could not open $path/dna.fa for writing\n"; + print DNA ">$dnaname\n$dna"; + close DNA; + open (PROTEIN, ">$path/prot.fa") or die "could not open $path/prot.fa for writing\n"; + print PROTEIN ">$protname\n$prot"; + close PROTEIN; + + ## run genewise on the two sequences + `echo \$WISECONFIGDIR`; + +# $self_tmp = [`.\/genewise -trans -cdna -pep -sum $path/prot.fa $path/dna.fa`]; +#THO--For Galaxy 
run Genewise in the path + $self_tmp = [`genewise -trans -cdna -pep -sum $path/prot.fa $path/dna.fa`]; + for (my $i = 0; $i < @$self_tmp; $i++) { + $self_tmp->[$i] =~ s/\s{1,}$//; + } + $self->{gw} = $self_tmp; + $self->{nt_seq} = $dna; + $self->{prot_seq} = $prot; + $self->{protname} = $protname; + $self->{dnaname} = $dnaname; + $self->{gw_count} = @$self_tmp; + $self->{get_indel} = 1; ## per default the indel-part is recovererd, rather than masked by 'N'. See code for details + $self->{indels} = _GetIndels($self_tmp); + bless ($self, $class); + return $self;} +################# +## sub score extract the score for the alignment +sub score { + my $self = shift; + my $score; + for (my $i = 0; $i < $self->{gw_count}; $i ++) { + if ($self->{gw}->[$i] =~ /^(\d{1,}\.{0,1}\d{0,}).*/) { + $score = $1; + last; + } + } + return ($score); +} +################## +sub protein { + my $self = shift; + my $gw = $self->{gw}; + my $prot = ''; + for (my $i = 0; $i < @$gw; $i++) { + if ($gw->[$i] =~ />.*\.pep/) { #the protein seq starts + my $count = 1; + while ($gw->[$i+$count] ne '//') { + my $protpart = $gw->[$i+$count]; + chomp $protpart; + $prot .= $protpart; + $count ++; + } + } + elsif (length $prot > 0) { + last; + } + } + return($prot); + } +################## +sub translation { + my $self = shift; + my $finish = 0; + my $translated_seq = ''; + my @transtmp; + + ## step 1: extract the relevant info from the genewise output + + for (my $i = 0; $i < $self->{gw_count}; $i++) { + if ($self->{gw}->[$i] =~ />.*.tr/) {# a translated bit starts + while ($self->{gw}->[$i] !~ '//') { + push @transtmp, $self->{gw}->[$i]; + $i++; + } + last; # end the for loop since nothing left to be done + } + } + + ## step two: get the sequences + my $count = -1; + my $trans; + for (my $i = 0; $i < @transtmp; $i++) { + if ($transtmp[$i] =~ />/) { + $count++; + $trans->[$count]->{seq} = ''; # initialize + if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) { + $trans->[$count]->{start} = $1; + 
$trans->[$count]->{end} = $2; + } + } + else { + $trans->[$count]->{seq} .= $transtmp[$i]; + } + } + + ## step 3: connect the fragments + if (@$trans == 1) { + $translated_seq = $trans->[0]->{seq}; + } + else { + for (my $i = 0; $i < @$trans; $i++) { + $translated_seq .= $trans->[$i]->{seq}; + if ($i < (@$trans - 1)) { + my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1; + $translated_seq .= 'X'; + } + } + } + return($translated_seq); + } + +################## +sub codons { + my $self = shift; + my $finish = 0; + my $codon_seq = ''; + my @transtmp; + + ## step 1: extract the relevant info from the genewise output + for (my $i = 0; $i < $self->{gw_count}; $i++) { + if ($self->{gw}->[$i] =~ />.*sp$/) {# the codons set starts + while ($self->{gw}->[$i] !~ '//') { + push @transtmp, $self->{gw}->[$i]; + $i++; + } + last; # end the for loop since nothing left to be done + } + } + + ## step two: get the sequences + my $count = -1; + my $trans; + for (my $i = 0; $i < @transtmp; $i++) { + if ($transtmp[$i] =~ />/) { + $count++; + $trans->[$count]->{seq} = ''; # initialize + if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) { + $trans->[$count]->{start} = $1; + $trans->[$count]->{end} = $2; + } + } + else { + $transtmp[$i] =~ tr/a-z/A-Z/; + $trans->[$count]->{seq} .= $transtmp[$i]; + } + } + + ## step 3: connect the fragments + if ( @$trans == 1) { + $codon_seq = $trans->[0]->{seq}; + } + else { + for (my $i = 0; $i < @$trans; $i++) { + $codon_seq .= $trans->[$i]->{seq}; + if ($i < (@$trans - 1)) { + my $indel = ''; + my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1; + + ## now decide whether the nts that did not got translated are masked by + ## 'N' or whether they will be represented as lower case letters + if ($self->{get_indel}) { + $indel = substr($self->{nt_seq}, $trans->[$i]->{end}, $missing); + $indel =~ tr/A-Z/a-z/; + } + else { + $indel = 'N' x $missing; + } + ## now append gap characters until the frame is recovered. 
Not that the gap + ## characters are added to the end of the indel-part. Thus, the codons are + ## not considered. + while (length($indel)%3 != 0) { + $indel .= '-'; + } + + $codon_seq .= $indel; + } + } + } + return ($codon_seq); + } +########################### +sub protein_borders { + my $self = shift; + my $gw = $self->{gw}; + for (my $i = 0; $i < @$gw; $i++) { + if ($gw->[$i] =~ /Bits.*introns$/) { + my ($start, $end) = $gw->[$i+1] =~ /.*$self->{protname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/; + return($start, $end); + } + else { + die "no protein-start and end could not be determnined. Check genewise command\n"; + } + } +} +########################## +sub cdna_borders { + my $self = shift; + my $gw = $self->{gw}; + for (my $i = 0; $i < @$gw; $i++) { + if ($gw->[$i] =~ /Bits.*introns$/) { + my ($start, $end) = $gw->[$i+1] =~ /.*$self->{dnaname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/; + return($start, $end); + } + else { + die "no cdna-start and end could not be determnined. Check genewise command\n"; + } + } +} +########################## +sub _GetIndels { + my $gw = shift; + my $indel; + for (my $i = 0; $i < @$gw; $i++) { + if ($gw->[$i] =~ /Bits/) { + $indel = $gw->[$i+1] =~ /.*([0-9]{1,})/; + return($indel); + } + } +} diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/BLOSUM45.bla --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/BLOSUM45.bla Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,28 @@ +# Matrix made by matblas from blosum45.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 45 +# Entropy = 0.3795, Expected = -0.2789 + A R N D C Q E G H I L K M F P S T W Y V * + 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -2 -2 0 -5 +-2 7 0 -1 -3 1 0 -2 0 -3 -2 3 -1 -2 -2 -1 -1 -2 -1 -2 -5 +-1 0 6 2 -2 0 0 0 1 -2 -3 0 -2 -2 -2 1 0 -4 -2 -3 -5 +-2 -1 2 7 -3 0 2 -1 0 -4 -3 0 -3 -4 -1 0 -1 -4 -2 -3 -5 +-1 
-3 -2 -3 12 -3 -3 -3 -3 -3 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -5 +-1 1 0 0 -3 6 2 -2 1 -2 -2 1 0 -4 -1 0 -1 -2 -1 -3 -5 +-1 0 0 2 -3 2 6 -2 0 -3 -2 1 -2 -3 0 0 -1 -3 -2 -3 -5 + 0 -2 0 -1 -3 -2 -2 7 -2 -4 -3 -2 -2 -3 -2 0 -2 -2 -3 -3 -5 +-2 0 1 0 -3 1 0 -2 10 -3 -2 -1 0 -2 -2 -1 -2 -3 2 -3 -5 +-1 -3 -2 -4 -3 -2 -3 -4 -3 5 2 -3 2 0 -2 -2 -1 -2 0 3 -5 +-1 -2 -3 -3 -2 -2 -2 -3 -2 2 5 -3 2 1 -3 -3 -1 -2 0 1 -5 +-1 3 0 0 -3 1 1 -2 -1 -3 -3 5 -1 -3 -1 -1 -1 -2 -1 -2 -5 +-1 -1 -2 -3 -2 0 -2 -2 0 2 2 -1 6 0 -2 -2 -1 -2 0 1 -5 +-2 -2 -2 -4 -2 -4 -3 -3 -2 0 1 -3 0 8 -3 -2 -1 1 3 0 -5 +-1 -2 -2 -1 -4 -1 0 -2 -2 -2 -3 -1 -2 -3 9 -1 -1 -3 -3 -3 -5 + 1 -1 1 0 -1 0 0 0 -1 -2 -3 -1 -2 -2 -1 4 2 -4 -2 -1 -5 + 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -1 -1 2 5 -3 -1 0 -5 +-2 -2 -4 -4 -5 -2 -3 -2 -3 -2 -2 -2 -2 1 -3 -4 -3 15 3 -3 -5 +-2 -1 -2 -2 -3 -1 -2 -3 2 0 0 -1 0 3 -3 -2 -1 3 8 -1 -5 + 0 -2 -3 -3 -1 -3 -3 -3 -3 3 1 -2 1 0 -3 -1 0 -3 -1 5 -5 +-5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 1 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/BLOSUM80.bla --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/BLOSUM80.bla Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,31 @@ +# Matrix made by matblas from blosum80_3.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 80 +# Entropy = 0.9868, Expected = -0.7442 + A R N D C Q E G H I L K M F P S T W Y V B Z X * + 7 -3 -3 -3 -1 -2 -2 0 -3 -3 -3 -1 -2 -4 -1 2 0 -5 -4 -1 -3 -2 -1 -8 +-3 9 -1 -3 -6 1 -1 -4 0 -5 -4 3 -3 -5 -3 -2 -2 -5 -4 -4 -2 0 -2 -8 +-3 -1 9 2 -5 0 -1 -1 1 -6 -6 0 -4 -6 -4 1 0 -7 -4 -5 5 -1 -2 -8 +-3 -3 2 10 -7 -1 2 -3 -2 -7 -7 -2 -6 -6 -3 -1 -2 -8 -6 -6 6 1 -3 -8 +-1 -6 -5 -7 13 -5 -7 -6 -7 -2 -3 -6 -3 -4 -6 -2 -2 -5 -5 -2 -6 -7 -4 -8 +-2 1 0 -1 -5 9 3 -4 1 -5 -4 2 -1 -5 -3 -1 -1 -4 -3 -4 -1 5 -2 -8 +-2 -1 -1 2 -7 3 8 -4 0 -6 -6 1 -4 -6 -2 -1 -2 -6 -5 -4 1 6 -2 -8 
+ 0 -4 -1 -3 -6 -4 -4 9 -4 -7 -7 -3 -5 -6 -5 -1 -3 -6 -6 -6 -2 -4 -3 -8 +-3 0 1 -2 -7 1 0 -4 12 -6 -5 -1 -4 -2 -4 -2 -3 -4 3 -5 -1 0 -2 -8 +-3 -5 -6 -7 -2 -5 -6 -7 -6 7 2 -5 2 -1 -5 -4 -2 -5 -3 4 -6 -6 -2 -8 +-3 -4 -6 -7 -3 -4 -6 -7 -5 2 6 -4 3 0 -5 -4 -3 -4 -2 1 -7 -5 -2 -8 +-1 3 0 -2 -6 2 1 -3 -1 -5 -4 8 -3 -5 -2 -1 -1 -6 -4 -4 -1 1 -2 -8 +-2 -3 -4 -6 -3 -1 -4 -5 -4 2 3 -3 9 0 -4 -3 -1 -3 -3 1 -5 -3 -2 -8 +-4 -5 -6 -6 -4 -5 -6 -6 -2 -1 0 -5 0 10 -6 -4 -4 0 4 -2 -6 -6 -3 -8 +-1 -3 -4 -3 -6 -3 -2 -5 -4 -5 -5 -2 -4 -6 12 -2 -3 -7 -6 -4 -4 -2 -3 -8 + 2 -2 1 -1 -2 -1 -1 -1 -2 -4 -4 -1 -3 -4 -2 7 2 -6 -3 -3 0 -1 -1 -8 + 0 -2 0 -2 -2 -1 -2 -3 -3 -2 -3 -1 -1 -4 -3 2 8 -5 -3 0 -1 -2 -1 -8 +-5 -5 -7 -8 -5 -4 -6 -6 -4 -5 -4 -6 -3 0 -7 -6 -5 16 3 -5 -8 -5 -5 -8 +-4 -4 -4 -6 -5 -3 -5 -6 3 -3 -2 -4 -3 4 -6 -3 -3 3 11 -3 -5 -4 -3 -8 +-1 -4 -5 -6 -2 -4 -4 -6 -5 4 1 -4 1 -2 -4 -3 0 -5 -3 7 -6 -4 -2 -8 +-3 -2 5 6 -6 -1 1 -2 -1 -6 -7 -1 -5 -6 -4 0 -1 -8 -5 -6 6 0 -3 -8 +-2 0 -1 1 -7 5 6 -4 0 -6 -5 1 -3 -6 -2 -1 -2 -5 -4 -4 0 6 -1 -8 +-1 -2 -2 -3 -4 -2 -2 -3 -2 -2 -2 -2 -2 -3 -3 -1 -1 -5 -3 -2 -3 -1 -2 -8 +-8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/aa.rnd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/aa.rnd Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,25 @@ +! +! AMINO_ALPHABET "ACDEFGHIKLMNPQRSTVWY" +! +! 
+A 0.08713 +C 0.03347 +D 0.04687 +E 0.04953 +F 0.03977 +G 0.08861 +H 0.03362 +I 0.03689 +K 0.08048 +L 0.08536 +M 0.01475 +N 0.04043 +P 0.05068 +Q 0.03826 +R 0.04090 +S 0.06958 +T 0.05854 +V 0.06472 +W 0.01049 +Y 0.02992 + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/blosum30.bla --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/blosum30.bla Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,31 @@ +# Matrix made by matblas from blosum30.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/5 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 30 +# Entropy = 0.1424, Expected = -0.1074 + A R N D C Q E G H I L K M F P S T W Y V B Z X * + 4 -1 0 0 -3 1 0 0 -2 0 -1 0 1 -2 -1 1 1 -5 -4 1 0 0 0 -7 +-1 8 -2 -1 -2 3 -1 -2 -1 -3 -2 1 0 -1 -1 -1 -3 0 0 -1 -2 0 -1 -7 + 0 -2 8 1 -1 -1 -1 0 -1 0 -2 0 0 -1 -3 0 1 -7 -4 -2 4 -1 0 -7 + 0 -1 1 9 -3 -1 1 -1 -2 -4 -1 0 -3 -5 -1 0 -1 -4 -1 -2 5 0 -1 -7 +-3 -2 -1 -3 17 -2 1 -4 -5 -2 0 -3 -2 -3 -3 -2 -2 -2 -6 -2 -2 0 -2 -7 + 1 3 -1 -1 -2 8 2 -2 0 -2 -2 0 -1 -3 0 -1 0 -1 -1 -3 -1 4 0 -7 + 0 -1 -1 1 1 2 6 -2 0 -3 -1 2 -1 -4 1 0 -2 -1 -2 -3 0 5 -1 -7 + 0 -2 0 -1 -4 -2 -2 8 -3 -1 -2 -1 -2 -3 -1 0 -2 1 -3 -3 0 -2 -1 -7 +-2 -1 -1 -2 -5 0 0 -3 14 -2 -1 -2 2 -3 1 -1 -2 -5 0 -3 -2 0 -1 -7 + 0 -3 0 -4 -2 -2 -3 -1 -2 6 2 -2 1 0 -3 -1 0 -3 -1 4 -2 -3 0 -7 +-1 -2 -2 -1 0 -2 -1 -2 -1 2 4 -2 2 2 -3 -2 0 -2 3 1 -1 -1 0 -7 + 0 1 0 0 -3 0 2 -1 -2 -2 -2 4 2 -1 1 0 -1 -2 -1 -2 0 1 0 -7 + 1 0 0 -3 -2 -1 -1 -2 2 1 2 2 6 -2 -4 -2 0 -3 -1 0 -2 -1 0 -7 +-2 -1 -1 -5 -3 -3 -4 -3 -3 0 2 -1 -2 10 -4 -1 -2 1 3 1 -3 -4 -1 -7 +-1 -1 -3 -1 -3 0 1 -1 1 -3 -3 1 -4 -4 11 -1 0 -3 -2 -4 -2 0 -1 -7 + 1 -1 0 0 -2 -1 0 0 -1 -1 -2 0 -2 -1 -1 4 2 -3 -2 -1 0 -1 0 -7 + 1 -3 1 -1 -2 0 -2 -2 -2 0 0 -1 0 -2 0 2 5 -5 -1 1 0 -1 0 -7 +-5 0 -7 -4 -2 -1 -1 1 -5 -3 -2 -2 -3 1 -3 -3 -5 20 5 -3 -5 -1 -2 -7 +-4 0 -4 -1 -6 -1 -2 -3 0 -1 3 -1 -1 3 -2 -2 -1 5 9 1 -3 -2 -1 -7 + 1 -1 -2 -2 -2 
-3 -3 -3 -3 4 1 -2 0 1 -4 -1 1 -3 1 5 -2 -3 0 -7 + 0 -2 4 5 -2 -1 0 0 -2 -2 -1 0 -2 -3 -2 0 0 -5 -3 -2 5 0 -1 -7 + 0 0 -1 0 0 4 5 -2 0 -3 -1 1 -1 -4 0 -1 -1 -1 -2 -3 0 4 0 -7 + 0 -1 0 -1 -2 0 -1 -1 -1 0 0 0 0 -1 -1 0 0 -2 -1 0 -1 0 -1 -7 +-7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 1 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/blosum62.bla --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/blosum62.bla Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,31 @@ +# Matrix made by matblas from blosum62.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 62 +# Entropy = 0.6979, Expected = -0.5209 + A R N D C Q E G H I L K M F P S T W Y V B Z X * + 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 +-1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 +-2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 +-2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 + 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 +-1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 +-1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 + 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 +-2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 +-1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 +-1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 +-1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 +-1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 +-2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 +-1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 + 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 + 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 +-3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 
-3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 +-2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 + 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 +-2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 +-1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 + 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 +-4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/cb.tmf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/cb.tmf Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,1237 @@ + + +tmfreq +paired internal +-:- 5836 +-:A 906 +-:B 0.00 +-:C 0.00 +-:D 107 +-:E 84 +-:F 1 +-:G 330 +-:H 4 +-:I 6 +-:J 0.00 +-:K 879 +-:L 178 +-:M 234 +-:N 288 +-:O 0.00 +-:P 629 +-:Q 67 +-:R 423 +-:S 692 +-:T 377 +-:U 0.00 +-:V 475 +-:W 50 +-:X 0.00 +-:Y 148 +-:Z 0.00 +A:A 8356 +A:B 0.00 +A:C 0.00 +A:D 11 +A:E 237 +A:F 0.00 +A:G 872 +A:H 5 +A:I 19 +A:J 0.00 +A:K 412 +A:L 16 +A:M 12 +A:N 112 +A:O 0.00 +A:P 698 +A:Q 71 +A:R 61 +A:S 1072 +A:T 875 +A:U 0.00 +A:V 902 +A:W 7 +A:X 0.00 +A:Y 0.00 +A:Z 0.00 +B:B 0.00 +B:C 0.00 +B:D 0.00 +B:E 0.00 +B:F 0.00 +B:G 0.00 +B:H 0.00 +B:I 0.00 +B:J 0.00 +B:K 0.00 +B:L 0.00 +B:M 0.00 +B:N 0.00 +B:O 0.00 +B:P 0.00 +B:Q 0.00 +B:R 0.00 +B:S 0.00 +B:T 0.00 +B:U 0.00 +B:V 0.00 +B:W 0.00 +B:X 0.00 +B:Y 0.00 +B:Z 0.00 +C:C 1 +C:D 0.00 +C:E 0.00 +C:F 96 +C:G 50 +C:H 0.00 +C:I 0.00 +C:J 0.00 +C:K 0.00 +C:L 2 +C:M 0.00 +C:N 0.00 +C:O 0.00 +C:P 0.00 +C:Q 0.00 +C:R 0.00 +C:S 0.00 +C:T 0.00 +C:U 0.00 +C:V 0.00 +C:W 0.00 +C:X 0.00 +C:Y 0.00 +C:Z 0.00 +D:D 9289 +D:E 957 +D:F 0.00 +D:G 180 +D:H 0.00 +D:I 3 +D:J 0.00 +D:K 0.00 +D:L 0.00 +D:M 0.00 +D:N 55 +D:O 0.00 +D:P 37 +D:Q 36 +D:R 99 +D:S 117 +D:T 125 +D:U 0.00 +D:V 45 +D:W 0.00 +D:X 0.00 +D:Y 50 +D:Z 0.00 +E:E 9911 +E:F 0.00 +E:G 667 +E:H 49 +E:I 34 +E:J 0.00 +E:K 147 +E:L 50 +E:M 0.00 +E:N 14 +E:O 0.00 +E:P 218 +E:Q 461 +E:R 1 +E:S 157 
+E:T 129 +E:U 0.00 +E:V 173 +E:W 0.00 +E:X 0.00 +E:Y 0.00 +E:Z 0.00 +F:F 5946 +F:G 0.00 +F:H 0.00 +F:I 0.00 +F:J 0.00 +F:K 0.00 +F:L 365 +F:M 0.00 +F:N 0.00 +F:O 0.00 +F:P 0.00 +F:Q 0.00 +F:R 0.00 +F:S 1 +F:T 0.00 +F:U 0.00 +F:V 2 +F:W 52 +F:X 0.00 +F:Y 141 +F:Z 0.00 +G:G 15636 +G:H 0.00 +G:I 14 +G:J 0.00 +G:K 35 +G:L 52 +G:M 4 +G:N 82 +G:O 0.00 +G:P 243 +G:Q 179 +G:R 9 +G:S 355 +G:T 150 +G:U 0.00 +G:V 456 +G:W 0.00 +G:X 0.00 +G:Y 0.00 +G:Z 0.00 +H:H 0.00 +H:I 0.00 +H:J 0.00 +H:K 2 +H:L 0.00 +H:M 0.00 +H:N 0.00 +H:O 0.00 +H:P 49 +H:Q 0.00 +H:R 0.00 +H:S 3 +H:T 38 +H:U 0.00 +H:V 0.00 +H:W 0.00 +H:X 0.00 +H:Y 0.00 +H:Z 0.00 +I:I 781 +I:J 0.00 +I:K 0.00 +I:L 1113 +I:M 0.00 +I:N 6 +I:O 0.00 +I:P 433 +I:Q 0.00 +I:R 0.00 +I:S 155 +I:T 91 +I:U 0.00 +I:V 112 +I:W 2 +I:X 0.00 +I:Y 0.00 +I:Z 0.00 +J:J 0.00 +J:K 0.00 +J:L 0.00 +J:M 0.00 +J:N 0.00 +J:O 0.00 +J:P 0.00 +J:Q 0.00 +J:R 0.00 +J:S 0.00 +J:T 0.00 +J:U 0.00 +J:V 0.00 +J:W 0.00 +J:X 0.00 +J:Y 0.00 +J:Z 0.00 +K:K 6931 +K:L 412 +K:M 49 +K:N 8 +K:O 0.00 +K:P 268 +K:Q 6 +K:R 870 +K:S 404 +K:T 324 +K:U 0.00 +K:V 22 +K:W 0.00 +K:X 0.00 +K:Y 0.00 +K:Z 0.00 +L:L 10639 +L:M 9 +L:N 26 +L:O 0.00 +L:P 242 +L:Q 0.00 +L:R 47 +L:S 7 +L:T 128 +L:U 0.00 +L:V 525 +L:W 0.00 +L:X 0.00 +L:Y 50 +L:Z 0.00 +M:M 1035 +M:N 1 +M:O 0.00 +M:P 1 +M:Q 0.00 +M:R 1 +M:S 6 +M:T 13 +M:U 0.00 +M:V 0.00 +M:W 0.00 +M:X 0.00 +M:Y 0.00 +M:Z 0.00 +N:N 1186 +N:O 0.00 +N:P 5 +N:Q 16 +N:R 8 +N:S 89 +N:T 228 +N:U 0.00 +N:V 42 +N:W 0.00 +N:X 0.00 +N:Y 98 +N:Z 0.00 +O:O 0.00 +O:P 0.00 +O:Q 0.00 +O:R 0.00 +O:S 0.00 +O:T 0.00 +O:U 0.00 +O:V 0.00 +O:W 0.00 +O:X 0.00 +O:Y 0.00 +O:Z 0.00 +P:P 12009 +P:Q 34 +P:R 52 +P:S 1012 +P:T 399 +P:U 0.00 +P:V 547 +P:W 15 +P:X 0.00 +P:Y 0.00 +P:Z 0.00 +Q:Q 61 +Q:R 1 +Q:S 152 +Q:T 23 +Q:U 0.00 +Q:V 32 +Q:W 0.00 +Q:X 0.00 +Q:Y 0.00 +Q:Z 0.00 +R:R 3652 +R:S 41 +R:T 33 +R:U 0.00 +R:V 0.00 +R:W 0.00 +R:X 0.00 +R:Y 0.00 +R:Z 0.00 +S:S 6355 +S:T 541 +S:U 0.00 +S:V 73 +S:W 63 +S:X 0.00 +S:Y 0.00 +S:Z 0.00 +T:T 4355 +T:U 0.00 +T:V 304 +T:W 
12 +T:X 0.00 +T:Y 50 +T:Z 0.00 +U:U 0.00 +U:V 0.00 +U:W 0.00 +U:X 0.00 +U:Y 0.00 +U:Z 0.00 +V:V 3145 +V:W 0.00 +V:X 0.00 +V:Y 0.00 +V:Z 0.00 +W:W 2401 +W:X 0.00 +W:Y 47 +W:Z 0.00 +X:X 0.00 +X:Y 0.00 +X:Z 0.00 +Y:Y 6033 +Y:Z 0.00 +Z:Z 0.00 +endpaired +unpaired internal +- 351 +A 460 +B 0.00 +C 3 +D 408 +E 464 +F 251 +G 699 +H 3 +I 71 +J 0.00 +K 354 +L 490 +M 48 +N 69 +O 0.00 +P 578 +Q 24 +R 179 +S 353 +T 251 +U 0.00 +V 200 +W 101 +X 0.00 +Y 253 +Z 0.00 +endunpaired +paired external +-:- 2260 +-:A 2 +-:B 0.00 +-:C 0.00 +-:D 0.00 +-:E 0.00 +-:F 0.00 +-:G 0.00 +-:H 0.00 +-:I 0.00 +-:J 0.00 +-:K 92 +-:L 0.00 +-:M 0.00 +-:N 45 +-:O 0.00 +-:P 0.00 +-:Q 46 +-:R 50 +-:S 1 +-:T 46 +-:U 0.00 +-:V 48 +-:W 0.00 +-:X 0.00 +-:Y 0.00 +-:Z 0.00 +A:A 7600 +A:B 0.00 +A:C 1 +A:D 86 +A:E 66 +A:F 161 +A:G 139 +A:H 0.00 +A:I 1 +A:J 0.00 +A:K 138 +A:L 123 +A:M 0.00 +A:N 644 +A:O 0.00 +A:P 186 +A:Q 35 +A:R 0.00 +A:S 1605 +A:T 141 +A:U 0.00 +A:V 327 +A:W 0.00 +A:X 0.00 +A:Y 145 +A:Z 0.00 +B:B 0.00 +B:C 0.00 +B:D 0.00 +B:E 0.00 +B:F 0.00 +B:G 0.00 +B:H 0.00 +B:I 0.00 +B:J 0.00 +B:K 0.00 +B:L 0.00 +B:M 0.00 +B:N 0.00 +B:O 0.00 +B:P 0.00 +B:Q 0.00 +B:R 0.00 +B:S 0.00 +B:T 0.00 +B:U 0.00 +B:V 0.00 +B:W 0.00 +B:X 0.00 +B:Y 0.00 +B:Z 0.00 +C:C 0.00 +C:D 0.00 +C:E 0.00 +C:F 0.00 +C:G 0.00 +C:H 0.00 +C:I 0.00 +C:J 0.00 +C:K 0.00 +C:L 0.00 +C:M 0.00 +C:N 0.00 +C:O 0.00 +C:P 1 +C:Q 0.00 +C:R 0.00 +C:S 0.00 +C:T 3 +C:U 0.00 +C:V 45 +C:W 0.00 +C:X 0.00 +C:Y 0.00 +C:Z 0.00 +D:D 3495 +D:E 320 +D:F 0.00 +D:G 100 +D:H 0.00 +D:I 0.00 +D:J 0.00 +D:K 44 +D:L 0.00 +D:M 0.00 +D:N 147 +D:O 0.00 +D:P 0.00 +D:Q 58 +D:R 0.00 +D:S 50 +D:T 6 +D:U 0.00 +D:V 2 +D:W 47 +D:X 0.00 +D:Y 0.00 +D:Z 0.00 +E:E 3347 +E:F 0.00 +E:G 2 +E:H 0.00 +E:I 0.00 +E:J 0.00 +E:K 44 +E:L 0.00 +E:M 0.00 +E:N 1 +E:O 0.00 +E:P 0.00 +E:Q 190 +E:R 0.00 +E:S 30 +E:T 2 +E:U 0.00 +E:V 0.00 +E:W 1 +E:X 0.00 +E:Y 0.00 +E:Z 0.00 +F:F 5036 +F:G 0.00 +F:H 0.00 +F:I 13 +F:J 0.00 +F:K 0.00 +F:L 105 +F:M 0.00 +F:N 17 +F:O 0.00 +F:P 0.00 +F:Q 0.00 +F:R 0.00 
+F:S 74 +F:T 4 +F:U 0.00 +F:V 4 +F:W 96 +F:X 0.00 +F:Y 454 +F:Z 0.00 +G:G 9902 +G:H 0.00 +G:I 0.00 +G:J 0.00 +G:K 193 +G:L 0.00 +G:M 0.00 +G:N 2 +G:O 0.00 +G:P 4 +G:Q 2 +G:R 49 +G:S 51 +G:T 5 +G:U 0.00 +G:V 98 +G:W 1 +G:X 0.00 +G:Y 0.00 +G:Z 0.00 +H:H 1275 +H:I 0.00 +H:J 0.00 +H:K 0.00 +H:L 0.00 +H:M 0.00 +H:N 0.00 +H:O 0.00 +H:P 0.00 +H:Q 0.00 +H:R 0.00 +H:S 0.00 +H:T 0.00 +H:U 0.00 +H:V 0.00 +H:W 0.00 +H:X 0.00 +H:Y 50 +H:Z 0.00 +I:I 1426 +I:J 0.00 +I:K 0.00 +I:L 808 +I:M 0.00 +I:N 1 +I:O 0.00 +I:P 0.00 +I:Q 1 +I:R 0.00 +I:S 1 +I:T 0.00 +I:U 0.00 +I:V 89 +I:W 0.00 +I:X 0.00 +I:Y 34 +I:Z 0.00 +J:J 0.00 +J:K 0.00 +J:L 0.00 +J:M 0.00 +J:N 0.00 +J:O 0.00 +J:P 0.00 +J:Q 0.00 +J:R 0.00 +J:S 0.00 +J:T 0.00 +J:U 0.00 +J:V 0.00 +J:W 0.00 +J:X 0.00 +J:Y 0.00 +J:Z 0.00 +K:K 3191 +K:L 0.00 +K:M 11 +K:N 361 +K:O 0.00 +K:P 0.00 +K:Q 133 +K:R 385 +K:S 90 +K:T 138 +K:U 0.00 +K:V 45 +K:W 22 +K:X 0.00 +K:Y 22 +K:Z 0.00 +L:L 5074 +L:M 0.00 +L:N 2 +L:O 0.00 +L:P 49 +L:Q 1 +L:R 0.00 +L:S 14 +L:T 1 +L:U 0.00 +L:V 248 +L:W 48 +L:X 0.00 +L:Y 3 +L:Z 0.00 +M:M 0.00 +M:N 0.00 +M:O 0.00 +M:P 0.00 +M:Q 0.00 +M:R 35 +M:S 0.00 +M:T 0.00 +M:U 0.00 +M:V 0.00 +M:W 2 +M:X 0.00 +M:Y 2 +M:Z 0.00 +N:N 7464 +N:O 0.00 +N:P 0.00 +N:Q 3 +N:R 0.00 +N:S 207 +N:T 86 +N:U 0.00 +N:V 168 +N:W 1 +N:X 0.00 +N:Y 37 +N:Z 0.00 +O:O 0.00 +O:P 0.00 +O:Q 0.00 +O:R 0.00 +O:S 0.00 +O:T 0.00 +O:U 0.00 +O:V 0.00 +O:W 0.00 +O:X 0.00 +O:Y 0.00 +O:Z 0.00 +P:P 3731 +P:Q 0.00 +P:R 50 +P:S 0.00 +P:T 103 +P:U 0.00 +P:V 45 +P:W 0.00 +P:X 0.00 +P:Y 0.00 +P:Z 0.00 +Q:Q 1279 +Q:R 0.00 +Q:S 17 +Q:T 6 +Q:U 0.00 +Q:V 50 +Q:W 0.00 +Q:X 0.00 +Q:Y 0.00 +Q:Z 0.00 +R:R 595 +R:S 1 +R:T 0.00 +R:U 0.00 +R:V 0.00 +R:W 70 +R:X 0.00 +R:Y 70 +R:Z 0.00 +S:S 2017 +S:T 6 +S:U 0.00 +S:V 76 +S:W 0.00 +S:X 0.00 +S:Y 43 +S:Z 0.00 +T:T 1230 +T:U 0.00 +T:V 190 +T:W 0.00 +T:X 0.00 +T:Y 3 +T:Z 0.00 +U:U 0.00 +U:V 0.00 +U:W 0.00 +U:X 0.00 +U:Y 0.00 +U:Z 0.00 +V:V 4431 +V:W 0.00 +V:X 0.00 +V:Y 3 +V:Z 0.00 +W:W 2404 +W:X 0.00 +W:Y 4 +W:Z 0.00 +X:X 0.00 +X:Y 
0.00 +X:Z 0.00 +Y:Y 1790 +Y:Z 0.00 +Z:Z 0.00 +endpaired +unpaired external +- 97 +A 380 +B 0.00 +C 1 +D 157 +E 147 +F 220 +G 409 +H 52 +I 76 +J 0.00 +K 162 +L 231 +M 1 +N 333 +O 0.00 +P 158 +Q 62 +R 38 +S 126 +T 64 +U 0.00 +V 206 +W 102 +X 0.00 +Y 89 +Z 0.00 +endunpaired +paired hydrophobic +-:- 0.00 +-:A 0.00 +-:B 0.00 +-:C 0.00 +-:D 0.00 +-:E 0.00 +-:F 0.00 +-:G 0.00 +-:H 0.00 +-:I 0.00 +-:J 0.00 +-:K 0.00 +-:L 0.00 +-:M 0.00 +-:N 0.00 +-:O 0.00 +-:P 0.00 +-:Q 0.00 +-:R 0.00 +-:S 0.00 +-:T 0.00 +-:U 0.00 +-:V 0.00 +-:W 0.00 +-:X 0.00 +-:Y 0.00 +-:Z 0.00 +A:A 9335 +A:B 0.00 +A:C 108 +A:D 0.00 +A:E 0.00 +A:F 230 +A:G 302 +A:H 0.00 +A:I 26 +A:J 0.00 +A:K 0.00 +A:L 315 +A:M 0.00 +A:N 11 +A:O 0.00 +A:P 0.00 +A:Q 2 +A:R 0.00 +A:S 97 +A:T 109 +A:U 0.00 +A:V 80 +A:W 0.00 +A:X 0.00 +A:Y 0.00 +A:Z 0.00 +B:B 0.00 +B:C 0.00 +B:D 0.00 +B:E 0.00 +B:F 0.00 +B:G 0.00 +B:H 0.00 +B:I 0.00 +B:J 0.00 +B:K 0.00 +B:L 0.00 +B:M 0.00 +B:N 0.00 +B:O 0.00 +B:P 0.00 +B:Q 0.00 +B:R 0.00 +B:S 0.00 +B:T 0.00 +B:U 0.00 +B:V 0.00 +B:W 0.00 +B:X 0.00 +B:Y 0.00 +B:Z 0.00 +C:C 1937 +C:D 0.00 +C:E 0.00 +C:F 60 +C:G 78 +C:H 0.00 +C:I 98 +C:J 0.00 +C:K 0.00 +C:L 0.00 +C:M 98 +C:N 0.00 +C:O 0.00 +C:P 0.00 +C:Q 0.00 +C:R 0.00 +C:S 554 +C:T 330 +C:U 0.00 +C:V 150 +C:W 50 +C:X 0.00 +C:Y 0.00 +C:Z 0.00 +D:D 0.00 +D:E 0.00 +D:F 0.00 +D:G 0.00 +D:H 0.00 +D:I 0.00 +D:J 0.00 +D:K 0.00 +D:L 0.00 +D:M 0.00 +D:N 0.00 +D:O 0.00 +D:P 0.00 +D:Q 0.00 +D:R 0.00 +D:S 0.00 +D:T 0.00 +D:U 0.00 +D:V 0.00 +D:W 0.00 +D:X 0.00 +D:Y 0.00 +D:Z 0.00 +E:E 2550 +E:F 0.00 +E:G 0.00 +E:H 0.00 +E:I 0.00 +E:J 0.00 +E:K 0.00 +E:L 0.00 +E:M 0.00 +E:N 0.00 +E:O 0.00 +E:P 49 +E:Q 1 +E:R 0.00 +E:S 0.00 +E:T 0.00 +E:U 0.00 +E:V 0.00 +E:W 0.00 +E:X 0.00 +E:Y 0.00 +E:Z 0.00 +F:F 6104 +F:G 0.00 +F:H 0.00 +F:I 0.00 +F:J 0.00 +F:K 0.00 +F:L 330 +F:M 0.00 +F:N 0.00 +F:O 0.00 +F:P 0.00 +F:Q 0.00 +F:R 0.00 +F:S 4 +F:T 210 +F:U 0.00 +F:V 10 +F:W 0.00 +F:X 0.00 +F:Y 98 +F:Z 0.00 +G:G 8731 +G:H 0.00 +G:I 0.00 +G:J 0.00 +G:K 0.00 +G:L 0.00 +G:M 0.00 
+G:N 0.00 +G:O 0.00 +G:P 0.00 +G:Q 0.00 +G:R 50 +G:S 57 +G:T 2 +G:U 0.00 +G:V 49 +G:W 0.00 +G:X 0.00 +G:Y 0.00 +G:Z 0.00 +H:H 2550 +H:I 0.00 +H:J 0.00 +H:K 0.00 +H:L 0.00 +H:M 0.00 +H:N 0.00 +H:O 0.00 +H:P 0.00 +H:Q 0.00 +H:R 50 +H:S 0.00 +H:T 0.00 +H:U 0.00 +H:V 0.00 +H:W 0.00 +H:X 0.00 +H:Y 0.00 +H:Z 0.00 +I:I 5218 +I:J 0.00 +I:K 0.00 +I:L 448 +I:M 48 +I:N 0.00 +I:O 0.00 +I:P 0.00 +I:Q 0.00 +I:R 0.00 +I:S 0.00 +I:T 95 +I:U 0.00 +I:V 2349 +I:W 0.00 +I:X 0.00 +I:Y 0.00 +I:Z 0.00 +J:J 0.00 +J:K 0.00 +J:L 0.00 +J:M 0.00 +J:N 0.00 +J:O 0.00 +J:P 0.00 +J:Q 0.00 +J:R 0.00 +J:S 0.00 +J:T 0.00 +J:U 0.00 +J:V 0.00 +J:W 0.00 +J:X 0.00 +J:Y 0.00 +J:Z 0.00 +K:K 1275 +K:L 0.00 +K:M 0.00 +K:N 50 +K:O 0.00 +K:P 0.00 +K:Q 50 +K:R 0.00 +K:S 0.00 +K:T 0.00 +K:U 0.00 +K:V 0.00 +K:W 0.00 +K:X 0.00 +K:Y 0.00 +K:Z 0.00 +L:L 9469 +L:M 96 +L:N 0.00 +L:O 0.00 +L:P 0.00 +L:Q 0.00 +L:R 0.00 +L:S 49 +L:T 3 +L:U 0.00 +L:V 433 +L:W 188 +L:X 0.00 +L:Y 0.00 +L:Z 0.00 +M:M 4854 +M:N 0.00 +M:O 0.00 +M:P 0.00 +M:Q 0.00 +M:R 0.00 +M:S 0.00 +M:T 0.00 +M:U 0.00 +M:V 0.00 +M:W 0.00 +M:X 0.00 +M:Y 0.00 +M:Z 0.00 +N:N 1280 +N:O 0.00 +N:P 0.00 +N:Q 22 +N:R 0.00 +N:S 457 +N:T 0.00 +N:U 0.00 +N:V 0.00 +N:W 0.00 +N:X 0.00 +N:Y 0.00 +N:Z 0.00 +O:O 0.00 +O:P 0.00 +O:Q 0.00 +O:R 0.00 +O:S 0.00 +O:T 0.00 +O:U 0.00 +O:V 0.00 +O:W 0.00 +O:X 0.00 +O:Y 0.00 +O:Z 0.00 +P:P 2451 +P:Q 49 +P:R 0.00 +P:S 0.00 +P:T 0.00 +P:U 0.00 +P:V 0.00 +P:W 0.00 +P:X 0.00 +P:Y 0.00 +P:Z 0.00 +Q:Q 3776 +Q:R 0.00 +Q:S 74 +Q:T 0.00 +Q:U 0.00 +Q:V 0.00 +Q:W 0.00 +Q:X 0.00 +Q:Y 0.00 +Q:Z 0.00 +R:R 3775 +R:S 0.00 +R:T 0.00 +R:U 0.00 +R:V 0.00 +R:W 0.00 +R:X 0.00 +R:Y 0.00 +R:Z 0.00 +S:S 3338 +S:T 72 +S:U 0.00 +S:V 10 +S:W 0.00 +S:X 0.00 +S:Y 0.00 +S:Z 0.00 +T:T 1287 +T:U 0.00 +T:V 105 +T:W 0.00 +T:X 0.00 +T:Y 0.00 +T:Z 0.00 +U:U 0.00 +U:V 0.00 +U:W 0.00 +U:X 0.00 +U:Y 0.00 +U:Z 0.00 +V:V 8357 +V:W 0.00 +V:X 0.00 +V:Y 0.00 +V:Z 0.00 +W:W 2306 +W:X 0.00 +W:Y 0.00 +W:Z 0.00 +X:X 0.00 +X:Y 0.00 +X:Z 0.00 +Y:Y 1176 +Y:Z 0.00 +Z:Z 0.00 +endpaired 
+unpaired hydrophobic +- 0.00 +A 399 +B 0.00 +C 108 +D 0.00 +E 103 +F 263 +G 360 +H 103 +I 270 +J 0.00 +K 53 +L 416 +M 199 +N 62 +O 0.00 +P 100 +Q 155 +R 153 +S 161 +T 70 +U 0.00 +V 398 +W 97 +X 0.00 +Y 49 +Z 0.00 +endunpaired +endtmfreq + + + + + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/codon.table --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/codon.table Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,66 @@ +! this is a codon table +! by ewan +TTT F +TTC F +TTA L +TTG L +TCT S +TCC S +TCA S +TCG S +TAT Y +TAC Y +TAA X +TAG X +TGT C +TGC C +TGA X +TGG W +CTT L +CTC L +CTA L +CTG L +CCT P +CCC P +CCA P +CCG P +CAT H +CAC H +CAA Q +CAG Q +CGT R +CGC R +CGA R +CGG R +ATT I +ATC I +ATA I +ATG M +ACT T +ACC T +ACA T +ACG T +AAT N +AAC N +AAA K +AAG K +AGT S +AGC S +AGA R +AGG R +GTT V +GTC V +GTA V +GTG V +GCT A +GCC A +GCA A +GCG A +GAT D +GAC D +GAA E +GAG E +GGT G +GGC G +GGA G +GGG G diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/gene.stat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/gene.stat Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,170 @@ +# +# new genestats file +# +splice3 10 +ENSE00000673410 TCCCACATAGATCA +ENSE00000673409 TATCCTGCAGTATG +ENSE00000673408 TTTGCTATAGATTA +ENSE00000673407 GGTTTTTCAGTTGC +ENSE00000401072 CTCCCATTAGGGTT +ENSE00000868868 TTATTTCTAGCTGA +ENSE00000401061 CTTTGGTTAGGCAG +ENSE00000673400 GTTCTCCCAGGCCT +ENSE00000673402 TGTTTTATAGGGAT +ENSE00000673403 CTTCTTTCAGAATG +ENSE00000868865 ATTATCTTAGTTTC +ENSE00000662824 ATTTTTTTAGTGGA +ENSE00000662823 TCATTTTTAGGAAC +ENSE00000662822 CAAATTTCAGCCAA +ENSE00000662821 TATTCTACAGGAAG +ENSE00000662820 TATTTGGCAGCCGT +ENSE00000662819 TTTCTTACAGATTA +ENSE00000814444 ATTTTTAAAGCATT +ENSE00000814445 TTTTTTCCAGGGAC +ENSE00000814446 CTCTGTCTAGTTTC +ENSE00000814447 CTGCTTTTAGTCTC +ENSE00000814448 TGGATTTCAGTTTC +ENSE00000789672 TGGCATTTAGAAAT +ENSE00000789671 TTGTCCTCAGATTT 
+ENSE00000814451 TTTCGGGTAGATCA +ENSE00000814452 TTGCTCCTAGGTTT +ENSE00000789668 TTCCCTACAGGCGA +ENSE00000789652 TTCTTTGCAGTTAT +ENSE00000789653 ATTCATTTAGGCAG +ENSE00000789654 TTTGGATCAGGATA +ENSE00000789655 TTTATTTTAGGAAT +ENSE00000789656 CCCTTCCCAGCAAG +ENSE00000789657 TACATGTAAGACCT +ENSE00000789658 TTCCCTGCAGGATA +ENSE00000789659 TTTTGATTAGGATG +ENSE00000789660 TATATTGCAGATAT +ENSE00000789661 CTTTTTCCAGATAA +ENSE00000789662 TTTATTTCAGCTGG +ENSE00000450960 TGTCCTTCAGAACA +ENSE00000789663 TGTTTCTTAGGTAG +ENSE00000789664 ACCTCTGTAGGCAG +ENSE00000789665 GTTCTTTTAGGACA +ENSE00000789666 TTTTTAATAGGGAA +ENSE00000450963 TCCCTGGCAGACTG +ENSE00000813771 AACTTTTCAGCTCT +ENSE00000813770 TATTACACAGGATT +// +splice5 5 +ENSE00000673410 ACTGGTGAGTCCTT +ENSE00000673409 TGAGGTAAGCCTGA +ENSE00000673408 TACGGTAAGTGGTA +ENSE00000673407 TGAAGTAAGGTGCC +ENSE00000401072 CCAAGTAAGTTTTT +ENSE00000868868 TAAGGTGCGTTCAT +ENSE00000401061 AAAAGTAAGTAAAT +ENSE00000673400 AAACGTAAGTTGGA +ENSE00000673402 TGAGGTATGTAAGA +ENSE00000673403 TCAGGTATGACATT +ENSE00000868865 CTATGTGAGTTATG +ENSE00000662824 GTAAGTTATAAGAA +ENSE00000662823 CCATGTAAGTGGTA +ENSE00000662822 TTAGGTAGGTACTA +ENSE00000662821 TCAGGTAGGTACAT +ENSE00000662820 TCAGGTACAGTGAA +ENSE00000662819 CGAGGTATGCAACT +ENSE00000814444 CAGAGTAAGTAACC +ENSE00000814445 AGAGGTAAGCCAGG +ENSE00000814446 GATGGTAAGATGAT +ENSE00000814447 CAGGGTGAGTTGGA +ENSE00000814448 TCAGGTGAGGGCAT +ENSE00000789672 TTAAGTAAGTTCAG +ENSE00000789671 AAAGGTGAATGCTT +ENSE00000814451 ACAGGTCAGAGGCC +ENSE00000814452 GAAGGTAAGAATTA +ENSE00000789668 TGAGGTGAGTACTT +ENSE00000789652 TTAAGTAAGTTTGT +ENSE00000789653 ACAGGTAAAATTTG +ENSE00000789654 TAAGGTAAGGCTTT +ENSE00000789655 ACAGGTAAGAAGAA +ENSE00000789656 TTAGGTAAGCTTCA +// +# A G C T N +intron_emission +0.25 0.25 0.25 0.25 0.25 +// +polyp_emission +0.2 0.2 0.3 0.3 0.25 +// +rnd_emission +0.25 0.25 0.25 0.25 0.25 +// +rndcodon +AAA 5290.000000 +AAC 4795.000000 +AAG 8178.000000 +AAT 3305.000000 +ACA 6240.000000 +ACC 
7728.000000 +ACG 3347.000000 +ACT 4930.000000 +AGA 8491.000000 +AGC 8639.000000 +AGG 8997.000000 +AGT 4417.000000 +ATA 1975.000000 +ATC 4973.000000 +ATG 6474.000000 +ATT 3083.000000 +CAA 7057.000000 +CAC 6815.000000 +CAG 11041.000000 +CAT 5779.000000 +CCA 10537.000000 +CCC 10307.000000 +CCG 5621.000000 +CCT 10134.000000 +CGA 3377.000000 +CGC 5146.000000 +CGG 5375.000000 +CGT 2765.000000 +CTA 3502.000000 +CTC 7465.000000 +CTG 13780.000000 +CTT 5453.000000 +GAA 7461.000000 +GAC 6937.000000 +GAG 9975.000000 +GAT 4949.000000 +GCA 7747.000000 +GCC 10890.000000 +GCG 4828.000000 +GCT 9371.000000 +GGA 10143.000000 +GGC 10400.000000 +GGG 8869.000000 +GGT 5567.000000 +GTA 2143.000000 +GTC 4593.000000 +GTG 8189.000000 +GTT 3021.000000 +TAA 1775.000000 +TAC 3687.000000 +TAG 1333.000000 +TAT 2477.000000 +TCA 6180.000000 +TCC 7668.000000 +TCG 2875.000000 +TCT 5767.000000 +TGA 7315.000000 +TGC 8625.000000 +TGG 11718.000000 +TGT 5197.000000 +TTA 1664.000000 +TTC 5462.000000 +TTG 4420.000000 +TTT 3453.000000 +// + + + + + + + + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/gon120.bla --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/gon120.bla Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,30 @@ +# benner-gonnet matrix 2/95 +# pam := 120 +# print (DayhoffM (NewLogPAM1, pam)); +# DayMatrix(Peptide, pam=120, Simil: max=16.556, min=-9.624, max offdiag=5.460, +# del=-22.183-1.396*(k-1)) + A R N D C Q E G H I L K M F P S T W Y V B Z X * + 5.9 -2.3 -2.0 -1.8 -0.1 -1.2 -0.9 -0.3 -2.6 -2.7 -3.0 -1.9 -1.7 -4.8 -0.6 1.6 0.2 -6.8 -4.8 0.0 0.0 0.0 0.0 -9.6 + -2.3 8.3 -1.0 -2.7 -4.5 1.6 -1.0 -3.0 -0.1 -5.3 -4.4 3.6 -3.8 -6.8 -3.0 -1.5 -1.6 -3.0 -3.7 -4.4 0.0 0.0 0.0 -9.6 + -2.0 -1.0 8.0 2.6 -4.1 0.1 -0.1 -0.7 1.4 -5.5 -6.1 0.5 -4.6 -5.8 -3.2 1.0 0.0 -6.8 -2.8 -5.0 0.0 0.0 0.0 -9.6 + -1.8 -2.7 2.6 8.2 -6.9 0.1 3.5 -1.4 -0.7 -8.0 -8.2 -0.8 -6.4 -8.7 -2.8 -0.5 -1.3 -9.6 -5.3 -6.5 0.0 0.0 0.0 -9.6 + -0.1 -4.5 -4.1 -6.9 14.4 -5.6 -6.8 -4.5 
-3.1 -3.6 -3.9 -6.4 -2.6 -2.6 -6.8 -0.7 -2.1 -2.9 -2.0 -1.1 0.0 0.0 0.0 -9.6 + -1.2 1.6 0.1 0.1 -5.6 7.4 2.5 -3.0 1.7 -4.3 -3.0 2.0 -1.4 -5.3 -1.4 -0.7 -0.9 -4.9 -3.8 -3.6 0.0 0.0 0.0 -9.6 + -0.9 -1.0 -0.1 3.5 -6.8 2.5 7.2 -3.1 -0.6 -5.5 -5.7 1.1 -4.0 -7.8 -2.2 -0.9 -1.5 -7.8 -5.6 -3.8 0.0 0.0 0.0 -9.6 + -0.3 -3.0 -0.7 -1.4 -4.5 -3.0 -3.1 9.0 -3.7 -8.7 -8.2 -3.4 -6.5 -9.2 -4.1 -0.6 -3.5 -6.6 -7.4 -6.5 0.0 0.0 0.0 -9.6 + -2.6 -0.1 1.4 -0.7 -3.1 1.7 -0.6 -3.7 11.0 -4.8 -4.2 -0.2 -2.8 -1.4 -3.1 -1.4 -1.4 -2.8 2.6 -4.7 0.0 0.0 0.0 -9.6 + -2.7 -5.3 -5.5 -8.0 -3.6 -4.3 -5.5 -8.7 -4.8 7.2 2.8 -4.4 2.9 -0.3 -5.6 -4.4 -1.7 -4.6 -3.1 4.2 0.0 0.0 0.0 -9.6 + -3.0 -4.4 -6.1 -8.2 -3.9 -3.0 -5.7 -8.2 -4.2 2.8 6.6 -4.4 3.4 1.5 -4.4 -4.7 -3.4 -2.9 -2.0 1.3 0.0 0.0 0.0 -9.6 + -1.9 3.6 0.5 -0.8 -6.4 2.0 1.1 -3.4 -0.2 -4.4 -4.4 6.9 -2.7 -6.7 -2.4 -1.0 -0.6 -6.8 -4.4 -3.9 0.0 0.0 0.0 -9.6 + -1.7 -3.8 -4.6 -6.4 -2.6 -1.4 -4.0 -6.5 -2.8 2.9 3.4 -2.7 9.5 1.1 -5.6 -2.9 -1.6 -3.1 -2.2 1.0 0.0 0.0 0.0 -9.6 + -4.8 -6.8 -5.8 -8.7 -2.6 -5.3 -7.8 -9.2 -1.4 -0.3 1.5 -6.7 1.1 10.2 -7.2 -5.8 -4.7 2.7 5.5 -1.7 0.0 0.0 0.0 -9.6 + -0.6 -3.0 -3.2 -2.8 -6.8 -1.4 -2.2 -4.1 -3.1 -5.6 -4.4 -2.4 -5.6 -7.2 10.5 -0.5 -1.0 -8.9 -5.9 -4.2 0.0 0.0 0.0 -9.6 + 1.6 -1.5 1.0 -0.5 -0.7 -0.7 -0.9 -0.6 -1.4 -4.4 -4.7 -1.0 -2.9 -5.8 -0.5 6.0 2.6 -5.7 -3.5 -2.8 0.0 0.0 0.0 -9.6 + 0.2 -1.6 0.0 -1.3 -2.1 -0.9 -1.5 -3.5 -1.4 -1.7 -3.4 -0.6 -1.6 -4.7 -1.0 2.6 6.4 -6.7 -4.2 -0.3 0.0 0.0 0.0 -9.6 + -6.8 -3.0 -6.8 -9.6 -2.9 -4.9 -7.8 -6.6 -2.8 -4.6 -2.9 -6.8 -3.1 2.7 -8.9 -5.7 -6.7 16.6 3.3 -5.9 0.0 0.0 0.0 -9.6 + -4.8 -3.7 -2.8 -5.3 -2.0 -3.8 -5.6 -7.4 2.6 -3.1 -2.0 -4.4 -2.2 5.5 -5.9 -3.5 -4.2 3.3 11.1 -3.3 0.0 0.0 0.0 -9.6 + 0.0 -4.4 -5.0 -6.5 -1.1 -3.6 -3.8 -6.5 -4.7 4.2 1.3 -3.9 1.0 -1.7 -4.2 -2.8 -0.3 -5.9 -3.3 6.5 0.0 0.0 0.0 -9.6 + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -9.6 + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 
-9.6 + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -9.6 + -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 -9.6 1 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/gon160.bla --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/gon160.bla Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,27 @@ +# benner-gonnet matrix +# pam := 160 +# print (DayhoffM (NewLogPAM1, pam)); +# DayMatrix(Peptide, pam=160, Simil: max=15.808, min=-7.776, max offdiag=5.560, +# del=-21.255-1.396*(k-1)) + A R N D C Q E G H I L K M F P S T W Y V * + 4.6 -1.6 -1.2 -1.1 0.3 -0.7 -0.4 0.2 -1.8 -1.8 -2.2 -1.2 -1.2 -3.8 -0.1 1.6 0.5 -5.5 -3.7 0.1 -7.8 + -1.6 7.1 -0.4 -1.6 -3.5 1.7 -0.3 -2.1 0.3 -4.1 -3.5 3.5 -2.9 -5.3 -2.1 -0.9 -0.9 -2.4 -2.9 -3.4 -7.8 + -1.2 -0.4 6.5 2.6 -3.1 0.5 0.5 -0.2 1.5 -4.4 -4.8 0.8 -3.6 -4.7 -2.2 1.1 0.3 -5.5 -2.2 -3.8 -7.8 + -1.1 -1.6 2.6 7.0 -5.3 0.6 3.4 -0.7 -0.1 -6.2 -6.5 -0.1 -5.0 -7.0 -1.9 0.0 -0.6 -7.8 -4.2 -4.9 -7.8 + 0.3 -3.5 -3.1 -5.3 13.5 -4.2 -5.2 -3.4 -2.3 -2.5 -2.9 -4.8 -1.9 -1.8 -5.2 -0.2 -1.4 -2.1 -1.3 -0.6 -7.8 + -0.7 1.7 0.5 0.6 -4.2 5.6 2.3 -2.1 1.7 -3.2 -2.4 2.0 -1.2 -4.1 -0.8 -0.2 -0.4 -4.0 -2.9 -2.7 -7.8 + -0.4 -0.3 0.5 3.4 -5.2 2.3 5.9 -2.1 -0.1 -4.3 -4.5 1.3 -3.1 -6.2 -1.4 -0.3 -0.8 -6.4 -4.4 -3.0 -7.8 + 0.2 -2.1 -0.2 -0.7 -3.4 -2.1 -2.1 8.2 -2.7 -7.0 -6.7 -2.4 -5.2 -7.6 -3.0 -0.1 -2.4 -5.5 -6.0 -5.2 -7.8 + -1.8 0.3 1.5 -0.1 -2.3 1.7 -0.1 -2.7 9.3 -3.7 -3.2 0.2 -2.1 -0.7 -2.2 -0.8 -0.8 -1.9 2.7 -3.5 -7.8 + -1.8 -4.1 -4.4 -6.2 -2.5 -3.2 -4.3 -7.0 -3.7 5.9 3.0 -3.5 2.9 0.3 -4.3 -3.3 -1.2 -3.4 -2.0 4.0 -7.8 + -2.2 -3.5 -4.8 -6.5 -2.9 -2.4 -4.5 -6.7 -3.2 3.0 5.7 -3.4 3.4 1.9 -3.5 -3.6 -2.4 -2.0 -1.1 1.7 -7.8 + -1.2 3.5 0.8 -0.1 -4.8 2.0 1.3 -2.4 0.2 -3.5 -3.4 5.5 -2.1 -5.3 -1.6 -0.4 -0.2 -5.4 -3.5 -3.0 -7.8 + -1.2 -2.9 -3.6 -5.0 -1.9 -1.2 -3.1 -5.2 -2.1 2.9 3.4 -2.1 7.6 1.4 -4.2 -2.3 -1.1 -2.2 -1.3 
1.4 -7.8 + -3.8 -5.3 -4.7 -7.0 -1.8 -4.1 -6.2 -7.6 -0.7 0.3 1.9 -5.3 1.4 9.1 -5.8 -4.5 -3.6 3.2 5.6 -0.8 -7.8 + -0.1 -2.1 -2.2 -1.9 -5.2 -0.8 -1.4 -3.0 -2.2 -4.3 -3.5 -1.6 -4.2 -5.8 9.6 0.0 -0.4 -7.4 -4.8 -3.2 -7.8 + 1.6 -0.9 1.1 0.0 -0.2 -0.2 -0.3 -0.1 -0.8 -3.3 -3.6 -0.4 -2.3 -4.5 0.0 4.4 2.3 -4.7 -2.8 -2.0 -7.8 + 0.5 -0.9 0.3 -0.6 -1.4 -0.4 -0.8 -2.4 -0.8 -1.2 -2.4 -0.2 -1.1 -3.6 -0.4 2.3 5.0 -5.4 -3.2 0.0 -7.8 + -5.5 -2.4 -5.5 -7.8 -2.1 -4.0 -6.4 -5.5 -1.9 -3.4 -2.0 -5.4 -2.2 3.2 -7.4 -4.7 -5.4 15.8 3.8 -4.5 -7.8 + -3.7 -2.9 -2.2 -4.2 -1.3 -2.9 -4.4 -6.0 2.7 -2.0 -1.1 -3.5 -1.3 5.6 -4.8 -2.8 -3.2 3.8 10.0 -2.4 -7.8 + 0.1 -3.4 -3.8 -4.9 -0.6 -2.7 -3.0 -5.2 -3.5 4.0 1.7 -3.0 1.4 -0.8 -3.2 -2.0 0.0 -4.5 -2.4 5.3 -7.8 + -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 -7.8 1 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/gon200.bla --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/gon200.bla Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,25 @@ + A R N D C Q E G H I L K M F P S T W Y V B Z X * + 3.5 -1.1 -0.7 -0.6 0.4 -0.4 -0.2 0.4 -1.3 -1.2 -1.6 -0.7 -0.9 -3.0 0.1 1.4 0.6 -4.5 -2.9 0.1 0.0 0.0 0.0 -6.4 + -1.1 6.0 0.0 -0.9 -2.8 1.7 0.2 -1.5 0.5 -3.2 -2.8 3.2 -2.2 -4.2 -1.5 -0.5 -0.5 -2.0 -2.3 -2.6 0.0 0.0 0.0 -6.4 + -0.7 0.0 5.2 2.4 -2.4 0.6 0.8 0.1 1.4 -3.6 -3.9 0.8 -2.8 -3.8 -1.5 1.0 0.5 -4.5 -1.8 -3.0 0.0 0.0 0.0 -6.4 + -0.6 -0.9 2.4 5.9 -4.2 0.8 3.1 -0.3 0.2 -5.0 -5.2 0.3 -3.9 -5.7 -1.2 0.3 -0.3 -6.4 -3.5 -3.9 0.0 0.0 0.0 -6.4 + 0.4 -2.8 -2.4 -4.2 12.6 -3.3 -4.0 -2.6 -1.8 -1.8 -2.1 -3.7 -1.4 -1.2 -4.1 0.0 -0.9 -1.5 -0.8 -0.3 0.0 0.0 0.0 -6.4 + -0.4 1.7 0.6 0.8 -3.3 4.1 2.1 -1.5 1.5 -2.5 -2.0 1.8 -1.1 -3.3 -0.5 0.0 -0.1 -3.3 -2.3 -2.1 0.0 0.0 0.0 -6.4 + -0.2 0.2 0.8 3.1 -4.0 2.1 4.8 -1.4 0.2 -3.4 -3.6 1.3 -2.5 -5.0 -0.9 0.0 -0.4 -5.3 -3.5 -2.4 0.0 0.0 0.0 -6.4 + 0.4 -1.5 0.1 -0.3 -2.6 -1.5 -1.4 7.5 -2.0 -5.7 -5.5 -1.7 -4.3 -6.3 -2.2 0.2 -1.7 -4.8 -5.0 -4.2 0.0 
0.0 0.0 -6.4 + -1.3 0.5 1.4 0.2 -1.8 1.5 0.2 -2.0 7.8 -2.9 -2.5 0.5 -1.7 -0.3 -1.6 -0.4 -0.5 -1.3 2.5 -2.7 0.0 0.0 0.0 -6.4 + -1.2 -3.2 -3.6 -5.0 -1.8 -2.5 -3.4 -5.7 -2.9 5.0 3.0 -2.8 2.8 0.7 -3.4 -2.5 -0.8 -2.6 -1.3 3.6 0.0 0.0 0.0 -6.4 + -1.6 -2.8 -3.9 -5.2 -2.1 -2.0 -3.6 -5.5 -2.5 3.0 4.9 -2.7 3.2 2.0 -2.9 -2.8 -1.8 -1.3 -0.5 1.8 0.0 0.0 0.0 -6.4 + -0.7 3.2 0.8 0.3 -3.7 1.8 1.3 -1.7 0.5 -2.8 -2.7 4.4 -1.8 -4.2 -1.0 -0.1 0.0 -4.4 -2.8 -2.3 0.0 0.0 0.0 -6.4 + -0.9 -2.2 -2.8 -3.9 -1.4 -1.1 -2.5 -4.3 -1.7 2.8 3.2 -1.8 5.9 1.5 -3.3 -1.8 -0.8 -1.6 -0.7 1.6 0.0 0.0 0.0 -6.4 + -3.0 -4.2 -3.8 -5.7 -1.2 -3.3 -5.0 -6.3 -0.3 0.7 2.0 -4.2 1.5 8.1 -4.8 -3.6 -2.9 3.5 5.4 -0.3 0.0 0.0 0.0 -6.4 + 0.1 -1.5 -1.5 -1.2 -4.1 -0.5 -0.9 -2.2 -1.6 -3.4 -2.9 -1.0 -3.3 -4.8 8.7 0.2 -0.1 -6.1 -3.9 -2.5 0.0 0.0 0.0 -6.4 + 1.4 -0.5 1.0 0.3 0.0 0.0 0.0 0.2 -0.4 -2.5 -2.8 -0.1 -1.8 -3.6 0.2 3.3 1.9 -4.0 -2.3 -1.5 0.0 0.0 0.0 -6.4 + 0.6 -0.5 0.5 -0.3 -0.9 -0.1 -0.4 -1.7 -0.5 -0.8 -1.8 0.0 -0.8 -2.9 -0.1 1.9 3.7 -4.4 -2.5 0.0 0.0 0.0 0.0 -6.4 + -4.5 -2.0 -4.5 -6.4 -1.5 -3.3 -5.3 -4.8 -1.3 -2.6 -1.3 -4.4 -1.6 3.5 -6.1 -4.0 -4.4 15.1 4.0 -3.5 0.0 0.0 0.0 -6.4 + -2.9 -2.3 -1.8 -3.5 -0.8 -2.3 -3.5 -5.0 2.5 -1.3 -0.5 -2.8 -0.7 5.4 -3.9 -2.3 -2.5 4.0 9.0 -1.7 0.0 0.0 0.0 -6.4 + 0.1 -2.6 -3.0 -3.9 -0.3 -2.1 -2.4 -4.2 -2.7 3.6 1.8 -2.3 1.6 -0.3 -2.5 -1.5 0.0 -3.5 -1.7 4.3 0.0 0.0 0.0 -6.4 + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -6.4 + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -6.4 + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -6.4 + -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 -6.4 1.0 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/gon250.bla --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/gon250.bla Tue Mar 11 12:19:13 2014 
-0700 @@ -0,0 +1,27 @@ +# benner-gonnet matrix +# pam := 250 +# print (DayhoffM (NewLogPAM1, pam)); +# DayMatrix(Peptide, pam=250, Simil: max=14.152, min=-5.161, max offdiag=5.080, +# del=-19.814-1.396*(k-1)) + A R N D C Q E G H I L K M F P S T W Y V * + 2.4 -0.6 -0.3 -0.3 0.5 -0.2 0.0 0.5 -0.8 -0.8 -1.2 -0.4 -0.7 -2.3 0.3 1.1 0.6 -3.6 -2.2 0.1 -5.2 + -0.6 4.7 0.3 -0.3 -2.2 1.5 0.4 -1.0 0.6 -2.4 -2.2 2.7 -1.7 -3.2 -0.9 -0.2 -0.2 -1.6 -1.8 -2.0 -5.2 + -0.3 0.3 3.8 2.2 -1.8 0.7 0.9 0.4 1.2 -2.8 -3.0 0.8 -2.2 -3.1 -0.9 0.9 0.5 -3.6 -1.4 -2.2 -5.2 + -0.3 -0.3 2.2 4.7 -3.2 0.9 2.7 0.1 0.4 -3.8 -4.0 0.5 -3.0 -4.5 -0.7 0.5 0.0 -5.2 -2.8 -2.9 -5.2 + 0.5 -2.2 -1.8 -3.2 11.5 -2.4 -3.0 -2.0 -1.3 -1.1 -1.5 -2.8 -0.9 -0.8 -3.1 0.1 -0.5 -1.0 -0.5 0.0 -5.2 + -0.2 1.5 0.7 0.9 -2.4 2.7 1.7 -1.0 1.2 -1.9 -1.6 1.5 -1.0 -2.6 -0.2 0.2 0.0 -2.7 -1.7 -1.5 -5.2 + 0.0 0.4 0.9 2.7 -3.0 1.7 3.6 -0.8 0.4 -2.7 -2.8 1.2 -2.0 -3.9 -0.5 0.2 -0.1 -4.3 -2.7 -1.9 -5.2 + 0.5 -1.0 0.4 0.1 -2.0 -1.0 -0.8 6.6 -1.4 -4.5 -4.4 -1.1 -3.5 -5.2 -1.6 0.4 -1.1 -4.0 -4.0 -3.3 -5.2 + -0.8 0.6 1.2 0.4 -1.3 1.2 0.4 -1.4 6.0 -2.2 -1.9 0.6 -1.3 -0.1 -1.1 -0.2 -0.3 -0.8 2.2 -2.0 -5.2 + -0.8 -2.4 -2.8 -3.8 -1.1 -1.9 -2.7 -4.5 -2.2 4.0 2.8 -2.1 2.5 1.0 -2.6 -1.8 -0.6 -1.8 -0.7 3.1 -5.2 + -1.2 -2.2 -3.0 -4.0 -1.5 -1.6 -2.8 -4.4 -1.9 2.8 4.0 -2.1 2.8 2.0 -2.3 -2.1 -1.3 -0.7 0.0 1.8 -5.2 + -0.4 2.7 0.8 0.5 -2.8 1.5 1.2 -1.1 0.6 -2.1 -2.1 3.2 -1.4 -3.3 -0.6 0.1 0.1 -3.5 -2.1 -1.7 -5.2 + -0.7 -1.7 -2.2 -3.0 -0.9 -1.0 -2.0 -3.5 -1.3 2.5 2.8 -1.4 4.3 1.6 -2.4 -1.4 -0.6 -1.0 -0.2 1.6 -5.2 + -2.3 -3.2 -3.1 -4.5 -0.8 -2.6 -3.9 -5.2 -0.1 1.0 2.0 -3.3 1.6 7.0 -3.8 -2.8 -2.2 3.6 5.1 0.1 -5.2 + 0.3 -0.9 -0.9 -0.7 -3.1 -0.2 -0.5 -1.6 -1.1 -2.6 -2.3 -0.6 -2.4 -3.8 7.6 0.4 0.1 -5.0 -3.1 -1.8 -5.2 + 1.1 -0.2 0.9 0.5 0.1 0.2 0.2 0.4 -0.2 -1.8 -2.1 0.1 -1.4 -2.8 0.4 2.2 1.5 -3.3 -1.9 -1.0 -5.2 + 0.6 -0.2 0.5 0.0 -0.5 0.0 -0.1 -1.1 -0.3 -0.6 -1.3 0.1 -0.6 -2.2 0.1 1.5 2.5 -3.5 -1.9 0.0 -5.2 + -3.6 -1.6 -3.6 -5.2 -1.0 -2.7 -4.3 -4.0 -0.8 
-1.8 -0.7 -3.5 -1.0 3.6 -5.0 -3.3 -3.5 14.2 4.1 -2.6 -5.2 + -2.2 -1.8 -1.4 -2.8 -0.5 -1.7 -2.7 -4.0 2.2 -0.7 0.0 -2.1 -0.2 5.1 -3.1 -1.9 -1.9 4.1 7.8 -1.1 -5.2 + 0.1 -2.0 -2.2 -2.9 0.0 -1.5 -1.9 -3.3 -2.0 3.1 1.8 -1.7 1.6 0.1 -1.8 -1.0 0.0 -2.6 -1.1 3.4 -5.2 + -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 -5.2 1 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/gon350.bla --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/gon350.bla Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,27 @@ +# benner-gonnet matrix +# ppam := 350 +# print (DayhoffM (NewLogPAM1, pam)); +# DayMatrix(Peptide, pam=350, Simil: max=12.362, min=-3.511, max offdiag=4.155, +# del=-18.727-1.396*(k-1)) + A R N D C Q E G H I L K M F P S T W Y V * + 1.0 -0.2 0.0 0.0 0.4 0.0 0.1 0.5 -0.4 -0.4 -0.7 -0.1 -0.4 -1.4 0.3 0.6 0.4 -2.4 -1.4 0.0 -3.5 + -0.2 2.8 0.4 0.2 -1.3 1.0 0.6 -0.4 0.6 -1.5 -1.4 1.8 -1.1 -2.0 -0.4 0.1 0.0 -1.2 -1.2 -1.2 -3.5 + 0.0 0.4 2.0 1.5 -1.1 0.6 0.9 0.5 0.8 -1.8 -1.9 0.7 -1.4 -2.0 -0.3 0.6 0.4 -2.4 -1.0 -1.3 -3.5 + 0.0 0.2 1.5 2.9 -1.9 0.8 1.9 0.4 0.5 -2.4 -2.5 0.6 -1.9 -3.0 -0.2 0.5 0.2 -3.5 -1.8 -1.8 -3.5 + 0.4 -1.3 -1.1 -1.9 9.3 -1.4 -1.7 -1.2 -0.8 -0.5 -0.7 -1.6 -0.4 -0.3 -1.8 0.1 -0.2 -0.4 -0.1 0.2 -3.5 + 0.0 1.0 0.6 0.8 -1.4 1.2 1.1 -0.4 0.7 -1.1 -1.1 1.0 -0.7 -1.6 0.0 0.2 0.1 -1.9 -1.1 -0.9 -3.5 + 0.1 0.6 0.9 1.9 -1.7 1.1 2.0 -0.2 0.4 -1.7 -1.8 0.9 -1.3 -2.5 0.0 0.3 0.1 -2.9 -1.7 -1.2 -3.5 + 0.5 -0.4 0.5 0.4 -1.2 -0.4 -0.2 5.1 -0.7 -2.9 -3.0 -0.4 -2.3 -3.5 -0.7 0.5 -0.5 -3.0 -2.7 -2.1 -3.5 + -0.4 0.6 0.8 0.5 -0.8 0.7 0.4 -0.7 3.3 -1.3 -1.1 0.5 -0.8 0.1 -0.5 0.0 -0.1 -0.3 1.5 -1.1 -3.5 + -0.4 -1.5 -1.8 -2.4 -0.5 -1.1 -1.7 -2.9 -1.3 2.7 2.2 -1.3 1.9 1.1 -1.6 -1.1 -0.3 -0.9 -0.1 2.2 -3.5 + -0.7 -1.4 -1.9 -2.5 -0.7 -1.1 -1.8 -3.0 -1.1 2.2 2.8 -1.4 2.1 1.8 -1.5 -1.3 -0.7 -0.1 0.5 1.6 -3.5 + -0.1 1.8 0.7 0.6 -1.6 1.0 0.9 -0.4 0.5 -1.3 -1.4 1.8 -0.9 -2.1 -0.2 0.2 0.2 -2.3 -1.4 -1.0 
-3.5 + -0.4 -1.1 -1.4 -1.9 -0.4 -0.7 -1.3 -2.3 -0.8 1.9 2.1 -0.9 2.3 1.4 -1.4 -0.9 -0.4 -0.3 0.2 1.4 -3.5 + -1.4 -2.0 -2.0 -3.0 -0.3 -1.6 -2.5 -3.5 0.1 1.1 1.8 -2.1 1.4 5.1 -2.5 -1.8 -1.3 3.5 4.2 0.5 -3.5 + 0.3 -0.4 -0.3 -0.2 -1.8 0.0 0.0 -0.7 -0.5 -1.6 -1.5 -0.2 -1.4 -2.5 5.6 0.4 0.2 -3.4 -2.0 -1.1 -3.5 + 0.6 0.1 0.6 0.5 0.1 0.2 0.3 0.5 0.0 -1.1 -1.3 0.2 -0.9 -1.8 0.4 1.0 0.8 -2.2 -1.2 -0.6 -3.5 + 0.4 0.0 0.4 0.2 -0.2 0.1 0.1 -0.5 -0.1 -0.3 -0.7 0.2 -0.4 -1.3 0.2 0.8 1.1 -2.3 -1.2 0.0 -3.5 + -2.4 -1.2 -2.4 -3.5 -0.4 -1.9 -2.9 -3.0 -0.3 -0.9 -0.1 -2.3 -0.3 3.5 -3.4 -2.2 -2.3 12.4 3.9 -1.4 -3.5 + -1.4 -1.2 -1.0 -1.8 -0.1 -1.1 -1.7 -2.7 1.5 -0.1 0.5 -1.4 0.2 4.2 -2.0 -1.2 -1.2 3.9 5.7 -0.4 -3.5 + 0.0 -1.2 -1.3 -1.8 0.2 -0.9 -1.2 -2.1 -1.1 2.2 1.6 -1.0 1.4 0.5 -1.1 -0.6 0.0 -1.4 -0.4 2.1 -3.5 + -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 -3.5 1 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/human.gf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/human.gf Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,541 @@ +# Splice sites and intron regions information for GeneWise-21 +# Wed Jan 1 13:42:23 IST 1997 +# Created by Mor Amitai (mor@compugen.co.il) +# dataset : hum1 and hum2 from embl48 +# Consensi are read from top down. The value, for a sequence, +# is the number in the line of the first consensus that +# matches the sequence. +# Note: the set of sequences that are represented by a consensus +# are all the sequences that match this consensus and none of the +# previous consensi +# +# the numbers in types 5SS, 3SS, CDS, and the emissions are the number of +# occurrences of each sequence in the database. +# In case of a consensus this is the number of occurrences of sequences +# that are represented by the consensus in the database divided by the +# number of sequences that are represented by the consensus. 
+# *stay_prob is the probability of the transition from the state to itself. +# No_Spacer_Prob is the probability of transition from Pyrimidine directly +# to 3SS (no spacer). +type 5SS +center 3 +phase all +begin consensus +C-GGTGAGTG 15.75 +--GGTGAGTG 8.83333 +CAGGTGAG-- 7.6 +AAGGTGAG-- 6.86667 +CAGGTAAG-- 4.3125 +GAGGTGAG-- 4.26667 +AAGGTAAG-- 4.1875 +--GGTGAGT- 1.71795 +---GTGAGTG 1.66667 +--GGTGAG-C 1.51282 +-A-GTAAGT- 1.41071 +-AGGTG-GT- 1.39583 +-TGGTAAG-- 1.39062 +CAGGTA-G-- 1.35417 +AAGGTA-G-- 1.04167 +-GGGTAAG-- 1 +--GGTGAG-- 0.91453 +-C-GTAAGT- 0.828125 +---GTGAGT- 0.729167 +-AGGTAA--- 0.578704 +-AGGTG-G-- 0.506944 +CAGGTA---- 0.381944 +--GGTA-GT- 0.339286 +-AGGTGA--- 0.265625 +--GGTCAG-- 0.207031 +---GTAAG-- 0.155556 +-AGGTA---- 0.134921 +-AGGT--G-- 0.111607 +--GGT--G-- 0.0369898 +-AGGT----- 0.0250947 +--GGTA---- 0.0234375 +--AGT--G-- 0.0132415 +---GT----- 0.00148067 +end consensus +type 5SS +center 3 +phase 0 +begin consensus +A-GGTGAGTG 8.25 +C-GGTGAGTG 7.75 +AAGGTGAG-- 3.86667 +CAGGTGAG-- 3.46667 +AAGGTAAG-- 2.5 +GAGGTGAG-- 2.4375 +--GGTGAGT- 1.19565 +-AGGTAAG-- 1.02083 +-A-GTGAGT- 0.916667 +-AGGT--GTG 0.785714 +AAGGT-GG-- 0.716667 +-C-GTGAGT- 0.666667 +---GTAAGTG 0.566667 +--GGTGAG-- 0.423077 +--GGTAAG-- 0.355556 +CAGGT--G-- 0.3 +---GTAAGT- 0.284722 +-AGGTAA--- 0.265625 +--GGTA-GT- 0.196429 +-AGGTAC--- 0.157895 +AAGGTG---- 0.13964 +-AGGTG---- 0.048048 +-AGGT----- 0.0181818 +--GGT--G-- 0.0129717 +---GTA---- 0.00291667 +---GT----- 0.000915751 +end consensus +type 5SS +center 3 +phase 1 +begin consensus +--GGTGAGTG 4.375 +CAGGTGAG-- 2.8 +AAGGTGAG-- 2.26667 +-AGGTAAG-- 1.01562 +-TGGTAAG-- 0.8125 +-TGGTGAG-- 0.7 +G-GGTGAG-- 0.755556 +CAGGT-GG-- 0.546875 +--GGTGAG-- 0.428571 +---GTGAGTG 0.625 +---GTAAGT- 0.223214 +AAGGT--G-- 0.200893 +-AGGTAA--- 0.1875 +--GGTA-G-G 0.175 +CAGGT----- 0.0733945 +---GTGAG-- 0.0527778 +--GGT--GT- 0.0512129 +A-GGTA---- 0.046398 +-AGGT----- 0.0137104 +---G-AAG-- 0.00972447 +---GT----- 0.00111909 +end consensus +type 5SS +center 
3 +phase 2 +begin consensus +-AGGTGAG-- 0.703125 +C--GTGAGT- 0.516667 +-AGGTAAG-- 0.5 +---GTAAGT- 0.241667 +--GGTGAG-- 0.227778 +CAGGTA---- 0.133333 +---GTGAG-- 0.0569444 +-AGGT--G-- 0.0483491 +---GTAAG-- 0.0444444 +--GGTA---- 0.00896991 +---GT----- 0.000797367 +end consensus +type 3SS +center 3 +phase all +begin consensus +CAGGTG 143 +CAGGGT 120 +CAGGGC 88 +CAGGGA 82 +CAGGCT 77 +CAGGAG 73 +CAGGGG 66 +CAGGTA 60 +CAGGCC 58 +CAGGAA 54 +CAGATC 50 +CAGCTG 50 +CAGG-C 43.5 +CAGG-T 38.5 +CAGA-C 35.6667 +CAGAT- 29.3333 +CAGA-A 27.3333 +CAG-TC 25.5 +TAGG-G 21.75 +CAG-CA 21.6667 +TAGGA- 21.3333 +TAGGG- 20.6667 +CAGA-G 19.3333 +CAGTG- 18.75 +CAGA-- 18 +TAGG-- 13.8333 +CAG-C- 11.8571 +CAG-T- 10.4 +CAG-A- 7.375 +TAGA-- 6.0625 +TAGC-- 3.3125 +AAGG-- 3.125 +-AGT-- 1.08333 +-AG--- 0.714286 +end consensus +type 3SS +center 3 +phase 0 +begin consensus +CAGGGT 88 +CAGGTG 64 +CAGGAG 43 +CAGGG- 40.3333 +CAGATC 32 +CAGG-C 25 +CAGG-A 19 +CAGG-T 17.3333 +CAGA-C 14.6667 +CAG-TG 13 +CAGAA- 13 +CAGA-T 11.3333 +CAGC-C 10.25 +TAGG-- 9.8125 +CAG--A 4.09091 +CAGT-- 4.45455 +TAG-T- 3.33333 +TAG--- 1.36111 +-AGG-- 1.06061 +-AG--- 0.342857 +end consensus +type 3SS +center 3 +phase 1 +begin consensus +CAGGTG 71 +CAGGCT 36 +CAGGG- 24.75 +CAG-CC 17.5 +CAG-TG 16.6667 +CAGAG- 15.75 +CAGG-- 14.5556 +CAGA-A 13.3333 +CAGTG- 10.75 +CAG-TC 10.3333 +TAGG-G 9.5 +CAGA-- 6.16667 +TAGG-- 5 +CAGC-- 3.69231 +TAG--- 1.66667 +-AG--- 0.328467 +end consensus +type 3SS +center 3 +phase 2 +begin consensus +CAGG-A 19.25 +CAGG-T 14.75 +CAG-G- 4.57143 +CAG--T 4.66667 +TAGG-- 3 +CAG--- 2.87879 +TAG--- 0.645833 +AAG--- 0.25 +end consensus +type CDS +phase all +begin consensus +AAA 5290.000000 +AAC 4795.000000 +AAG 8178.000000 +AAT 3305.000000 +ACA 6240.000000 +ACC 7728.000000 +ACG 3347.000000 +ACT 4930.000000 +AGA 8491.000000 +AGC 8639.000000 +AGG 8997.000000 +AGT 4417.000000 +ATA 1975.000000 +ATC 4973.000000 +ATG 6474.000000 +ATT 3083.000000 +CAA 7057.000000 +CAC 6815.000000 +CAG 11041.000000 +CAT 5779.000000 +CCA 
10537.000000 +CCC 10307.000000 +CCG 5621.000000 +CCT 10134.000000 +CGA 3377.000000 +CGC 5146.000000 +CGG 5375.000000 +CGT 2765.000000 +CTA 3502.000000 +CTC 7465.000000 +CTG 13780.000000 +CTT 5453.000000 +GAA 7461.000000 +GAC 6937.000000 +GAG 9975.000000 +GAT 4949.000000 +GCA 7747.000000 +GCC 10890.000000 +GCG 4828.000000 +GCT 9371.000000 +GGA 10143.000000 +GGC 10400.000000 +GGG 8869.000000 +GGT 5567.000000 +GTA 2143.000000 +GTC 4593.000000 +GTG 8189.000000 +GTT 3021.000000 +TAA 1775.000000 +TAC 3687.000000 +TAG 1333.000000 +TAT 2477.000000 +TCA 6180.000000 +TCC 7668.000000 +TCG 2875.000000 +TCT 5767.000000 +TGA 7315.000000 +TGC 8625.000000 +TGG 11718.000000 +TGT 5197.000000 +TTA 1664.000000 +TTC 5462.000000 +TTG 4420.000000 +TTT 3453.000000 +end consensus +type CDS +phase 0 +begin consensus +AAA 2167.000000 +AAC 2839.000000 +AAG 4830.000000 +AAT 1616.000000 +ACA 1543.000000 +ACC 3187.000000 +ACG 983.000000 +ACT 1484.000000 +AGA 995.000000 +AGC 2722.000000 +AGG 1326.000000 +AGT 1093.000000 +ATA 585.000000 +ATC 3281.000000 +ATG 2538.000000 +ATT 1584.000000 +CAA 1141.000000 +CAC 1981.000000 +CAG 4796.000000 +CAT 1016.000000 +CCA 2021.000000 +CCC 3183.000000 +CCG 1104.000000 +CCT 2289.000000 +CGA 770.000000 +CGC 2011.000000 +CGG 1762.000000 +CGT 687.000000 +CTA 745.000000 +CTC 2964.000000 +CTG 6980.000000 +CTT 1200.000000 +GAA 2729.000000 +GAC 3946.000000 +GAG 6121.000000 +GAT 2318.000000 +GCA 1767.000000 +GCC 4902.000000 +GCG 1288.000000 +GCT 2556.000000 +GGA 2322.000000 +GGC 4338.000000 +GGG 2688.000000 +GGT 1903.000000 +GTA 690.000000 +GTC 2172.000000 +GTG 4546.000000 +GTT 1020.000000 +TAA 0.000000 +TAC 2405.000000 +TAG 0.000000 +TAT 1323.000000 +TCA 990.000000 +TCC 2579.000000 +TCG 684.000000 +TCT 1522.000000 +TGA 0.000000 +TGC 1747.000000 +TGG 1766.000000 +TGT 931.000000 +TTA 397.000000 +TTC 3156.000000 +TTG 1313.000000 +TTT 1697.000000 +end consensus +type CDS +phase 1 +begin consensus +AAA 1534.000000 +AAC 1140.000000 +AAG 2597.000000 +AAT 767.000000 +ACA 
3632.000000 +ACC 3429.000000 +ACG 1849.000000 +ACT 2262.000000 +AGA 4427.000000 +AGC 4014.000000 +AGG 5377.000000 +AGT 1927.000000 +ATA 953.000000 +ATC 1055.000000 +ATG 3488.000000 +ATT 777.000000 +CAA 998.000000 +CAC 1332.000000 +CAG 3260.000000 +CAT 731.000000 +CCA 4701.000000 +CCC 3908.000000 +CCG 2252.000000 +CCT 2992.000000 +CGA 471.000000 +CGC 1361.000000 +CGG 1785.000000 +CGT 443.000000 +CTA 825.000000 +CTC 1766.000000 +CTG 4378.000000 +CTT 882.000000 +GAA 924.000000 +GAC 843.000000 +GAG 1897.000000 +GAT 424.000000 +GCA 3140.000000 +GCC 3275.000000 +GCG 1806.000000 +GCT 2595.000000 +GGA 1911.000000 +GGC 2034.000000 +GGG 2835.000000 +GGT 762.000000 +GTA 577.000000 +GTC 968.000000 +GTG 2506.000000 +GTT 563.000000 +TAA 622.000000 +TAC 561.000000 +TAG 912.000000 +TAT 322.000000 +TCA 3963.000000 +TCC 3535.000000 +TCG 1466.000000 +TCT 2607.000000 +TGA 3311.000000 +TGC 4099.000000 +TGG 6194.000000 +TGT 1772.000000 +TTA 773.000000 +TTC 1353.000000 +TTG 2662.000000 +TTT 713.000000 +end consensus +type CDS +phase 2 +begin consensus +AAA 1589.000000 +AAC 816.000000 +AAG 751.000000 +AAT 922.000000 +ACA 1065.000000 +ACC 1112.000000 +ACG 515.000000 +ACT 1184.000000 +AGA 3069.000000 +AGC 1903.000000 +AGG 2294.000000 +AGT 1397.000000 +ATA 437.000000 +ATC 637.000000 +ATG 448.000000 +ATT 722.000000 +CAA 4918.000000 +CAC 3502.000000 +CAG 2985.000000 +CAT 4032.000000 +CCA 3815.000000 +CCC 3216.000000 +CCG 2265.000000 +CCT 4853.000000 +CGA 2136.000000 +CGC 1774.000000 +CGG 1828.000000 +CGT 1635.000000 +CTA 1932.000000 +CTC 2735.000000 +CTG 2422.000000 +CTT 3371.000000 +GAA 3808.000000 +GAC 2148.000000 +GAG 1957.000000 +GAT 2207.000000 +GCA 2840.000000 +GCC 2713.000000 +GCG 1734.000000 +GCT 4220.000000 +GGA 5910.000000 +GGC 4028.000000 +GGG 3346.000000 +GGT 2902.000000 +GTA 876.000000 +GTC 1453.000000 +GTG 1137.000000 +GTT 1438.000000 +TAA 1153.000000 +TAC 721.000000 +TAG 421.000000 +TAT 832.000000 +TCA 1227.000000 +TCC 1554.000000 +TCG 725.000000 +TCT 1638.000000 +TGA 
4004.000000 +TGC 2779.000000 +TGG 3758.000000 +TGT 2494.000000 +TTA 494.000000 +TTC 953.000000 +TTG 445.000000 +TTT 1043.000000 +end consensus +type Intron_Corr_Term +phase all + 65.6094 +type Intron_Corr_Term +phase 0 +141.429 +type Intron_Corr_Term +phase 1 +172.738 +type Intron_Corr_Term +phase 2 +371.127 +type Intron_emission +begin consensus +A 399845.000000 +C 371259.000000 +G 393779.000000 +T 425926.000000 +end consensus +type Pyrimidine_emission +begin consensus +A 2299.000000 +C 18610.000000 +G 2345.000000 +T 17132.000000 +end consensus +type Spacer_emission +begin consensus +A 3020.000000 +C 3834.000000 +G 3644.000000 +T 4224.000000 +end consensus +type Central_Intron_Stay_Prob +0.99853 +type Pyrimidine_Stay_Prob +0.944485 +type No_Spacer_Prob +0.331508 +type Spacer_Stay_Prob +0.902704 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/human.gp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/human.gp Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,543 @@ +# Splice sites and intron regions information for GeneWise-21 +# Wed Jan 1 13:42:23 IST 1997 +# Created by Mor Amitai (mor@compugen.co.il) +# dataset : hum1 and hum2 from embl48 +# Consensi are read from top down. The value, for a sequence, +# is the number in the line of the first consensus that +# matches the sequence. +# Note: the set of sequences that are represented by a consensus +# are all the sequences that match this consensus and none of the +# previous consensi +# +# the numbers in types 5SS, 3SS, CDS, and the emissions are the number of +# occurrences of each sequence in the database. +# In case of a consensus this is the number of occurrences of sequences +# that are represented by the consensus in the database divided by the +# number of sequences that are represented by the consensus. +# *stay_prob is the probability of the transition from the state to itself. 
+# No_Spacer_Prob is the probability of transition from Pyrimidine directly +# to 3SS (no spacer). +type 5SS +center 3 +phase all +begin consensus +C-GGTGAGTG 15.75 +--GGTGAGTG 8.83333 +CAGGTGAG-- 7.6 +AAGGTGAG-- 6.86667 +CAGGTAAG-- 4.3125 +GAGGTGAG-- 4.26667 +AAGGTAAG-- 4.1875 +--GGTGAGT- 1.71795 +---GTGAGTG 1.66667 +--GGTGAG-C 1.51282 +-A-GTAAGT- 1.41071 +-AGGTG-GT- 1.39583 +-TGGTAAG-- 1.39062 +CAGGTA-G-- 1.35417 +AAGGTA-G-- 1.04167 +-GGGTAAG-- 1 +--GGTGAG-- 0.91453 +-C-GTAAGT- 0.828125 +---GTGAGT- 0.729167 +-AGGTAA--- 0.578704 +-AGGTG-G-- 0.506944 +CAGGTA---- 0.381944 +--GGTA-GT- 0.339286 +-AGGTGA--- 0.265625 +--GGTCAG-- 0.207031 +---GTAAG-- 0.155556 +-AGGTA---- 0.134921 +-AGGT--G-- 0.111607 +--GGT--G-- 0.0369898 +-AGGT----- 0.0250947 +--GGTA---- 0.0234375 +--AGT--G-- 0.0132415 +---GT----- 0.00148067 +end consensus +type 5SS +center 3 +phase 0 +begin consensus +A-GGTGAGTG 8.25 +C-GGTGAGTG 7.75 +AAGGTGAG-- 3.86667 +CAGGTGAG-- 3.46667 +AAGGTAAG-- 2.5 +GAGGTGAG-- 2.4375 +--GGTGAGT- 1.19565 +-AGGTAAG-- 1.02083 +-A-GTGAGT- 0.916667 +-AGGT--GTG 0.785714 +AAGGT-GG-- 0.716667 +-C-GTGAGT- 0.666667 +---GTAAGTG 0.566667 +--GGTGAG-- 0.423077 +--GGTAAG-- 0.355556 +CAGGT--G-- 0.3 +---GTAAGT- 0.284722 +-AGGTAA--- 0.265625 +--GGTA-GT- 0.196429 +-AGGTAC--- 0.157895 +AAGGTG---- 0.13964 +-AGGTG---- 0.048048 +-AGGT----- 0.0181818 +--GGT--G-- 0.0129717 +---GTA---- 0.00291667 +---GT----- 0.000915751 +end consensus +type 5SS +center 3 +phase 1 +begin consensus +--GGTGAGTG 4.375 +CAGGTGAG-- 2.8 +AAGGTGAG-- 2.26667 +-AGGTAAG-- 1.01562 +-TGGTAAG-- 0.8125 +-TGGTGAG-- 0.7 +G-GGTGAG-- 0.755556 +CAGGT-GG-- 0.546875 +--GGTGAG-- 0.428571 +---GTGAGTG 0.625 +---GTAAGT- 0.223214 +AAGGT--G-- 0.200893 +-AGGTAA--- 0.1875 +--GGTA-G-G 0.175 +CAGGT----- 0.0733945 +---GTGAG-- 0.0527778 +--GGT--GT- 0.0512129 +A-GGTA---- 0.046398 +-AGGT----- 0.0137104 +---G-AAG-- 0.00972447 +---GT----- 0.00111909 +end consensus +type 5SS +center 3 +phase 2 +begin consensus +-AGGTGAG-- 0.703125 +C--GTGAGT- 0.516667 
+-AGGTAAG-- 0.5 +---GTAAGT- 0.241667 +--GGTGAG-- 0.227778 +CAGGTA---- 0.133333 +---GTGAG-- 0.0569444 +-AGGT--G-- 0.0483491 +---GTAAG-- 0.0444444 +--GGTA---- 0.00896991 +---GT----- 0.000797367 +end consensus +type 3SS +center 3 +phase all +begin consensus +CAGGTG 143 +CAGGGT 120 +CAGGGC 88 +CAGGGA 82 +CAGGCT 77 +CAGGAG 73 +CAGGGG 66 +CAGGTA 60 +CAGGCC 58 +CAGGAA 54 +CAGATC 50 +CAGCTG 50 +CAGG-C 43.5 +CAGG-T 38.5 +CAGA-C 35.6667 +CAGAT- 29.3333 +CAGA-A 27.3333 +CAG-TC 25.5 +TAGG-G 21.75 +CAG-CA 21.6667 +TAGGA- 21.3333 +TAGGG- 20.6667 +CAGA-G 19.3333 +CAGTG- 18.75 +CAGA-- 18 +TAGG-- 13.8333 +CAG-C- 11.8571 +CAG-T- 10.4 +CAG-A- 7.375 +TAGA-- 6.0625 +TAGC-- 3.3125 +AAGG-- 3.125 +-AGT-- 1.08333 +-AG--- 0.714286 +end consensus +type 3SS +center 3 +phase 0 +begin consensus +CAGGGT 88 +CAGGTG 64 +CAGGAG 43 +CAGGG- 40.3333 +CAGATC 32 +CAGG-C 25 +CAGG-A 19 +CAGG-T 17.3333 +CAGA-C 14.6667 +CAG-TG 13 +CAGAA- 13 +CAGA-T 11.3333 +CAGC-C 10.25 +TAGG-- 9.8125 +CAG--A 4.09091 +CAGT-- 4.45455 +TAG-T- 3.33333 +TAG--- 1.36111 +-AGG-- 1.06061 +-AG--- 0.342857 +end consensus +type 3SS +center 3 +phase 1 +begin consensus +CAGGTG 71 +CAGGCT 36 +CAGGG- 24.75 +CAG-CC 17.5 +CAG-TG 16.6667 +CAGAG- 15.75 +CAGG-- 14.5556 +CAGA-A 13.3333 +CAGTG- 10.75 +CAG-TC 10.3333 +TAGG-G 9.5 +CAGA-- 6.16667 +TAGG-- 5 +CAGC-- 3.69231 +TAG--- 1.66667 +-AG--- 0.328467 +end consensus +type 3SS +center 3 +phase 2 +begin consensus +CAGG-A 19.25 +CAGG-T 14.75 +CAG-G- 4.57143 +CAG--T 4.66667 +TAGG-- 3 +CAG--- 2.87879 +TAG--- 0.645833 +AAG--- 0.25 +end consensus +type CDS +phase all +begin consensus +AAA 5290.000000 +AAC 4795.000000 +AAG 8178.000000 +AAT 3305.000000 +ACA 6240.000000 +ACC 7728.000000 +ACG 3347.000000 +ACT 4930.000000 +AGA 8491.000000 +AGC 8639.000000 +AGG 8997.000000 +AGT 4417.000000 +ATA 1975.000000 +ATC 4973.000000 +ATG 6474.000000 +ATT 3083.000000 +CAA 7057.000000 +CAC 6815.000000 +CAG 11041.000000 +CAT 5779.000000 +CCA 10537.000000 +CCC 10307.000000 +CCG 5621.000000 +CCT 10134.000000 +CGA 
3377.000000 +CGC 5146.000000 +CGG 5375.000000 +CGT 2765.000000 +CTA 3502.000000 +CTC 7465.000000 +CTG 13780.000000 +CTT 5453.000000 +GAA 7461.000000 +GAC 6937.000000 +GAG 9975.000000 +GAT 4949.000000 +GCA 7747.000000 +GCC 10890.000000 +GCG 4828.000000 +GCT 9371.000000 +GGA 10143.000000 +GGC 10400.000000 +GGG 8869.000000 +GGT 5567.000000 +GTA 2143.000000 +GTC 4593.000000 +GTG 8189.000000 +GTT 3021.000000 +TAA 1775.000000 +TAC 3687.000000 +TAG 1333.000000 +TAT 2477.000000 +TCA 6180.000000 +TCC 7668.000000 +TCG 2875.000000 +TCT 5767.000000 +TGA 7315.000000 +TGC 8625.000000 +TGG 11718.000000 +TGT 5197.000000 +TTA 1664.000000 +TTC 5462.000000 +TTG 4420.000000 +TTT 3453.000000 +end consensus +type CDS +phase 0 +begin consensus +AAA 2167.000000 +AAC 2839.000000 +AAG 4830.000000 +AAT 1616.000000 +ACA 1543.000000 +ACC 3187.000000 +ACG 983.000000 +ACT 1484.000000 +AGA 995.000000 +AGC 2722.000000 +AGG 1326.000000 +AGT 1093.000000 +ATA 585.000000 +ATC 3281.000000 +ATG 2538.000000 +ATT 1584.000000 +CAA 1141.000000 +CAC 1981.000000 +CAG 4796.000000 +CAT 1016.000000 +CCA 2021.000000 +CCC 3183.000000 +CCG 1104.000000 +CCT 2289.000000 +CGA 770.000000 +CGC 2011.000000 +CGG 1762.000000 +CGT 687.000000 +CTA 745.000000 +CTC 2964.000000 +CTG 6980.000000 +CTT 1200.000000 +GAA 2729.000000 +GAC 3946.000000 +GAG 6121.000000 +GAT 2318.000000 +GCA 1767.000000 +GCC 4902.000000 +GCG 1288.000000 +GCT 2556.000000 +GGA 2322.000000 +GGC 4338.000000 +GGG 2688.000000 +GGT 1903.000000 +GTA 690.000000 +GTC 2172.000000 +GTG 4546.000000 +GTT 1020.000000 +TAA 0.000000 +TAC 2405.000000 +TAG 0.000000 +TAT 1323.000000 +TCA 990.000000 +TCC 2579.000000 +TCG 684.000000 +TCT 1522.000000 +TGA 0.000000 +TGC 1747.000000 +TGG 1766.000000 +TGT 931.000000 +TTA 397.000000 +TTC 3156.000000 +TTG 1313.000000 +TTT 1697.000000 +end consensus +type CDS +phase 1 +begin consensus +AAA 1534.000000 +AAC 1140.000000 +AAG 2597.000000 +AAT 767.000000 +ACA 3632.000000 +ACC 3429.000000 +ACG 1849.000000 +ACT 2262.000000 +AGA 
4427.000000 +AGC 4014.000000 +AGG 5377.000000 +AGT 1927.000000 +ATA 953.000000 +ATC 1055.000000 +ATG 3488.000000 +ATT 777.000000 +CAA 998.000000 +CAC 1332.000000 +CAG 3260.000000 +CAT 731.000000 +CCA 4701.000000 +CCC 3908.000000 +CCG 2252.000000 +CCT 2992.000000 +CGA 471.000000 +CGC 1361.000000 +CGG 1785.000000 +CGT 443.000000 +CTA 825.000000 +CTC 1766.000000 +CTG 4378.000000 +CTT 882.000000 +GAA 924.000000 +GAC 843.000000 +GAG 1897.000000 +GAT 424.000000 +GCA 3140.000000 +GCC 3275.000000 +GCG 1806.000000 +GCT 2595.000000 +GGA 1911.000000 +GGC 2034.000000 +GGG 2835.000000 +GGT 762.000000 +GTA 577.000000 +GTC 968.000000 +GTG 2506.000000 +GTT 563.000000 +TAA 622.000000 +TAC 561.000000 +TAG 912.000000 +TAT 322.000000 +TCA 3963.000000 +TCC 3535.000000 +TCG 1466.000000 +TCT 2607.000000 +TGA 3311.000000 +TGC 4099.000000 +TGG 6194.000000 +TGT 1772.000000 +TTA 773.000000 +TTC 1353.000000 +TTG 2662.000000 +TTT 713.000000 +end consensus +type CDS +phase 2 +begin consensus +AAA 1589.000000 +AAC 816.000000 +AAG 751.000000 +AAT 922.000000 +ACA 1065.000000 +ACC 1112.000000 +ACG 515.000000 +ACT 1184.000000 +AGA 3069.000000 +AGC 1903.000000 +AGG 2294.000000 +AGT 1397.000000 +ATA 437.000000 +ATC 637.000000 +ATG 448.000000 +ATT 722.000000 +CAA 4918.000000 +CAC 3502.000000 +CAG 2985.000000 +CAT 4032.000000 +CCA 3815.000000 +CCC 3216.000000 +CCG 2265.000000 +CCT 4853.000000 +CGA 2136.000000 +CGC 1774.000000 +CGG 1828.000000 +CGT 1635.000000 +CTA 1932.000000 +CTC 2735.000000 +CTG 2422.000000 +CTT 3371.000000 +GAA 3808.000000 +GAC 2148.000000 +GAG 1957.000000 +GAT 2207.000000 +GCA 2840.000000 +GCC 2713.000000 +GCG 1734.000000 +GCT 4220.000000 +GGA 5910.000000 +GGC 4028.000000 +GGG 3346.000000 +GGT 2902.000000 +GTA 876.000000 +GTC 1453.000000 +GTG 1137.000000 +GTT 1438.000000 +TAA 1153.000000 +TAC 721.000000 +TAG 421.000000 +TAT 832.000000 +TCA 1227.000000 +TCC 1554.000000 +TCG 725.000000 +TCT 1638.000000 +TGA 4004.000000 +TGC 2779.000000 +TGG 3758.000000 +TGT 2494.000000 +TTA 494.000000 
+TTC 953.000000 +TTG 445.000000 +TTT 1043.000000 +end consensus +type Intron_Corr_Term +phase all + 65.6094 +type Intron_Corr_Term +phase 0 +141.429 +type Intron_Corr_Term +phase 1 +172.738 +type Intron_Corr_Term +phase 2 +371.127 +type Intron_emission +begin consensus +A 399845.000000 +C 371259.000000 +G 393779.000000 +T 425926.000000 +end consensus +type Pyrimidine_emission +begin consensus +A 2299.000000 +C 18610.000000 +G 2345.000000 +T 17132.000000 +end consensus +type Spacer_emission +begin consensus +A 3020.000000 +C 3834.000000 +G 3644.000000 +T 4224.000000 +end consensus +type Central_Intron_Stay_Prob +0.99853 +type Pyrimidine_Stay_Prob +0.944485 +type No_Spacer_Prob +0.331508 +type Spacer_Stay_Prob +0.902704 + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/human.stats --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/human.stats Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,92 @@ +# +# +# new genestats file +# +intron_emission +0.25 0.25 0.25 0.25 0.1 +// +rnd_emission +0.25 0.25 0.25 0.25 0.1 +// +splice3 10 +ENSE00000673410 TCCCACATAGATCA +ENSE00000673409 TATCCTGCAGTATG +ENSE00000673408 TTTGCTATAGATTA +ENSE00000673407 GGTTTTTCAGTTGC +ENSE00000401072 CTCCCATTAGGGTT +ENSE00000868868 TTATTTCTAGCTGA +ENSE00000401061 CTTTGGTTAGGCAG +ENSE00000673400 GTTCTCCCAGGCCT +ENSE00000673402 TGTTTTATAGGGAT +ENSE00000673403 CTTCTTTCAGAATG +ENSE00000868865 ATTATCTTAGTTTC +ENSE00000662824 ATTTTTTTAGTGGA +ENSE00000662823 TCATTTTTAGGAAC +ENSE00000662822 CAAATTTCAGCCAA +ENSE00000662821 TATTCTACAGGAAG +ENSE00000662820 TATTTGGCAGCCGT +ENSE00000662819 TTTCTTACAGATTA +ENSE00000814444 ATTTTTAAAGCATT +ENSE00000814445 TTTTTTCCAGGGAC +ENSE00000814446 CTCTGTCTAGTTTC +ENSE00000814447 CTGCTTTTAGTCTC +ENSE00000814448 TGGATTTCAGTTTC +ENSE00000789672 TGGCATTTAGAAAT +ENSE00000789671 TTGTCCTCAGATTT +ENSE00000814451 TTTCGGGTAGATCA +ENSE00000814452 TTGCTCCTAGGTTT +ENSE00000789668 TTCCCTACAGGCGA +ENSE00000789652 TTCTTTGCAGTTAT 
+ENSE00000789653 ATTCATTTAGGCAG +ENSE00000789654 TTTGGATCAGGATA +ENSE00000789655 TTTATTTTAGGAAT +ENSE00000789656 CCCTTCCCAGCAAG +ENSE00000789657 TACATGTAAGACCT +ENSE00000789658 TTCCCTGCAGGATA +ENSE00000789659 TTTTGATTAGGATG +ENSE00000789660 TATATTGCAGATAT +ENSE00000789661 CTTTTTCCAGATAA +ENSE00000789662 TTTATTTCAGCTGG +ENSE00000450960 TGTCCTTCAGAACA +ENSE00000789663 TGTTTCTTAGGTAG +ENSE00000789664 ACCTCTGTAGGCAG +ENSE00000789665 GTTCTTTTAGGACA +ENSE00000789666 TTTTTAATAGGGAA +ENSE00000450963 TCCCTGGCAGACTG +ENSE00000813771 AACTTTTCAGCTCT +ENSE00000813770 TATTACACAGGATT +// +splice5 5 +ENSE00000673410 ACTGGTGAGTCCTT +ENSE00000673409 TGAGGTAAGCCTGA +ENSE00000673408 TACGGTAAGTGGTA +ENSE00000673407 TGAAGTAAGGTGCC +ENSE00000401072 CCAAGTAAGTTTTT +ENSE00000868868 TAAGGTGCGTTCAT +ENSE00000401061 AAAAGTAAGTAAAT +ENSE00000673400 AAACGTAAGTTGGA +ENSE00000673402 TGAGGTATGTAAGA +ENSE00000673403 TCAGGTATGACATT +ENSE00000868865 CTATGTGAGTTATG +ENSE00000662824 GTAAGTTATAAGAA +ENSE00000662823 CCATGTAAGTGGTA +ENSE00000662822 TTAGGTAGGTACTA +ENSE00000662821 TCAGGTAGGTACAT +ENSE00000662820 TCAGGTACAGTGAA +ENSE00000662819 CGAGGTATGCAACT +ENSE00000814444 CAGAGTAAGTAACC +ENSE00000814445 AGAGGTAAGCCAGG +ENSE00000814446 GATGGTAAGATGAT +ENSE00000814447 CAGGGTGAGTTGGA +ENSE00000814448 TCAGGTGAGGGCAT +ENSE00000789672 TTAAGTAAGTTCAG +ENSE00000789671 AAAGGTGAATGCTT +ENSE00000814451 ACAGGTCAGAGGCC +ENSE00000814452 GAAGGTAAGAATTA +ENSE00000789668 TGAGGTGAGTACTT +ENSE00000789652 TTAAGTAAGTTTGT +ENSE00000789653 ACAGGTAAAATTTG +ENSE00000789654 TAAGGTAAGGCTTT +ENSE00000789655 ACAGGTAAGAAGAA +ENSE00000789656 TTAGGTAAGCTTCA +// \ No newline at end of file diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/idenity.bla --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/idenity.bla Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,28 @@ +# made by ewan's routine...blast like format + A B C D E F G H I J K L M N O P Q R S T U V W X Y Z * + 100 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/methods --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/methods Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,236 @@ + +type aa +real int +endtype + +type codon +real int +endtype + + +# +# Stuff for PROTEIN +# + +type PROTEIN +real ComplexSequence* +dbtype ProteinDB* +init init_ProteinDB +reload reload_ProteinDB +close close_ProteinDB +addentry dataentry_add_ProteinDB +name get_allocd_name_ComplexSequence +hardlink hard_link_ComplexSequence +free 
free_ComplexSequence +threadsafe +endtype + + +method AMINOACID +map CSEQ_PROTEIN_AMINOACID +arg PROTEIN +arg int +return aa +endmethod + + +# +# End of protein type and methods +# + + + +type COMPMAT +real CompMat* +endtype + +method AAMATCH +map CompMat_AAMATCH +arg COMPMAT +arg aa +arg aa +return int +endmethod + + +type DNACOMPMAT +real DnaMatrix* +endtype + +method DNABASEMATCH +map DnaMatrix_MATCH +arg DNACOMPMAT +arg base +arg base +return int +endmethod + +type CODONMATRIX +real CodonMatrixScore* +endtype + +method CODON_MATCH +map CodonMatrix_MATCH +arg CODONMATRIX +arg codon +arg codon +return int +endmethod + + +# +# Genomic types and methods +# + + +type GENOMIC +real ComplexSequence* +dbtype GenomicDB* +init init_GenomicDB +reload reload_GenomicDB +close close_GenomicDB +addentry dataentry_add_GenomicDB +name get_allocd_name_ComplexSequence +hardlink hard_link_ComplexSequence +free free_ComplexSequence +threadsafe +endtype + +method GENOMIC_CODON +map CSEQ_GENOMIC_CODON +arg GENOMIC +arg int +return codon +endmethod + +method GENOMIC_BASE +map CSEQ_GENOMIC_BASE +arg GENOMIC +arg int +return base +endmethod + +method GENOMIC_5SS +map CSEQ_GENOMIC_5SS +arg GENOMIC +arg int +return int +endmethod + +method GENOMIC_3SS +map CSEQ_GENOMIC_3SS +arg GENOMIC +arg int +return int +endmethod + + +method GENOMIC_REPEAT +map CSEQ_GENOMIC_REPEAT +arg GENOMIC +arg int +return int +endmethod + +method GENOMIC_CDS_POT +map CSEQ_GENOMIC_CDSPOT +arg GENOMIC +arg int +return int +endmethod + + + +# +# Cdna Methods +# + + +type CDNA +real ComplexSequence* +dbtype cDNADB* +init init_cDNADB +reload reload_cDNADB +close close_cDNADB +addentry dataentry_add_cDNADB +hardlink hard_link_ComplexSequence +free free_ComplexSequence +threadsafe +endtype + +method CDNA_CODON +map CSEQ_CDNA_CODON +arg CDNA +arg int +return codon +endmethod + +method CDNA_BASE +map CSEQ_CDNA_BASE +arg CDNA +arg int +return base +endmethod + +method CDNA_SEQ_POS +map CSEQ_SEQ +arg CDNA +arg int +return 
char* +endmethod + + +# +# Dna mehtods +# + +type DNA +real ComplexSequence* +endtype + +method DNA_BASE +map CSEQ_DNA_BASE +arg DNA +arg int +return base +endmethod + +type DNAMAT +real DnaMatrix* +endtype + +method DNAMATCH +map DnaMatrix_MATCH +arg DNAMAT +arg base +arg base +return int +endmethod + +# genewise models + +type CDNA_HMM +real GeneWiseScore* +dbtype GeneWiseDB* +init init_GeneWiseDB +reload reload_GeneWiseDB +close close_GeneWiseDB +addentry dataentry_add_GeneWiseDB +hardlink hard_link_GeneWiseScore +free free_GeneWiseScore +threadsafe +endtype + +type GENEWISEMODEL +real GeneWiseScore* +dbtype GeneWiseDB* +init init_GeneWiseDB +reload reload_GeneWiseDB +close close_GeneWiseDB +addentry dataentry_add_GeneWiseDB +hardlink hard_link_GeneWiseScore +free free_GeneWiseScore +threadsafe +endtype + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/pb.gf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/pb.gf Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,387 @@ +# Splice sites and intron regions information for GeneWise-21 May 1500 +# Created by Mor Amitai (mor@compugen.co.il) +# dataset : ~/ftp/pub/PomBase/temp/cds_cdna ~/ftp/pub/PomBase/temp/cds_ex +# Consensi are read from top down. The value, for a sequence, +# is the number in the line of the first consensus that +# matches the sequence. +# Note: the set of sequences that are represented by a consensus +# are all the sequences that match this consensus and none of the +# previous consensi +# +# the numbers in types 5SS, 3SS, CDS, and the emissions are the number of +# occurrences of each sequence in the database. +# In case of a consensus this is the number of occurrences of sequences +# that are represented by the consensus in the database divided by the +# number of sequences that are represented by the consensus. +# *stay_prob is the probability of the transition from the state to itself. 
+# No_Spacer_Prob is the probability of transition from Pyrimidine directly +# to 3SS (no spacer). +type 5SS +center 3 +phase all +begin consensus +---GTA-GT- 8.69141 +---GT----- 0.0821553 +end consensus +type 5SS +center 3 +phase 0 +begin consensus +---GTA-GT- 3.71094 +---GT----- 0.0356523 +end consensus +type 5SS +center 3 +phase 1 +begin consensus +---GTA-GT- 2.92969 +---GT----- 0.0294519 +end consensus +type 5SS +center 3 +phase 2 +begin consensus +---GT----- 0.0488281 +end consensus +type 3SS +center 3 +phase all +begin consensus +TAG--- 1.64062 +-AG--- 22.9167 +end consensus +type 3SS +center 3 +phase 0 +begin consensus +TAG--- 0.765625 +-AG--- 9.375 +end consensus +type 3SS +center 3 +phase 1 +begin consensus +TAG--- 0.515625 +-AG--- 8.85417 +end consensus +type 3SS +center 3 +phase 2 +begin consensus +-AG--- 12.5 +end consensus +type CDS +phase all +begin consensus +AAA 238.6335 +AAC 105.316 +AAG 156.4912 +AAT 168.6987 +ACA 98.7639 +ACC 64.5552 +ACG 58.8997 +ACT 105.1091 +AGA 126.4206 +AGC 83.1769 +AGG 72.7625 +AGT 86.5564 +ATA 99.3847 +ATC 104.0746 +ATG 139.4558 +ATT 185.1134 +CAA 149.8701 +CAC 57.5203 +CAG 70.8314 +CAT 95.1086 +CCA 79.5215 +CCC 49.1751 +CCG 45.5887 +CCT 83.2459 +CGA 70.6935 +CGC 36.8985 +CGG 35.4502 +CGT 74.1419 +CTA 83.6597 +CTC 86.1426 +CTG 91.798 +CTT 143.6629 +GAA 168.216 +GAC 64.1414 +GAG 82.3493 +GAT 125.593 +GCA 79.3146 +GCC 51.589 +GCG 36.4847 +GCT 96.488 +GGA 94.6258 +GGC 47.037 +GGG 35.2433 +GGT 84.6942 +GTA 73.7281 +GTC 68.8313 +GTG 72.4867 +GTT 127.5931 +TAA 113.0406 +TAC 99.9364 +TAG 59.1066 +TAT 138.5592 +TCA 115.9373 +TCC 92.1429 +TCG 76.4179 +TCT 120.5582 +TGA 148.7666 +TGC 96.7638 +TGG 118.0064 +TGT 96.6949 +TTA 153.8704 +TTC 145.5251 +TTG 156.6981 +TTT 212.8391 +end consensus +type CDS +phase 0 +begin consensus +AAA 248.8005 +AAC 103.9424 +AAG 172.7552 +AAT 217.5971 +ACA 87.4108 +ACC 69.0194 +ACG 38.6426 +ACT 163.0428 +AGA 70.2593 +AGC 51.4546 +AGG 27.6904 +AGT 88.0307 +ATA 74.1855 +ATC 72.1191 +ATG 126.4667 +ATT 
251.6935 +CAA 170.2754 +CAC 39.4692 +CAG 70.2593 +CAT 104.5623 +CCA 77.4918 +CCC 56.6207 +CCG 25.8306 +CCT 144.4448 +CGA 48.1483 +CGC 42.1556 +CGG 17.1515 +CGT 131.8395 +CTA 50.628 +CTC 47.735 +CTG 34.3031 +CTT 165.7292 +GAA 288.4763 +GAC 103.9424 +GAG 150.4375 +GAT 257.6862 +GCA 100.4294 +GCC 74.5988 +GCG 29.1369 +GCT 190.5266 +GGA 104.9756 +GGC 59.927 +GGG 26.4506 +GGT 158.29 +GTA 76.252 +GTC 75.0121 +GTG 49.5948 +GTT 200.6522 +TAA 0 +TAC 77.0786 +TAG 0 +TAT 147.7511 +TCA 102.4959 +TCC 77.2852 +TCG 41.5356 +TCT 182.6741 +TGA 0 +TGC 32.2366 +TGG 58.8938 +TGT 48.5616 +TTA 163.0428 +TTC 90.7171 +TTG 145.0647 +TTT 196.5193 +end consensus +type CDS +phase 1 +begin consensus +AAA 234.9249 +AAC 117.7724 +AAG 221.908 +AAT 133.0621 +ACA 116.7393 +ACC 50.4148 +ACG 81.6142 +ACT 75.6223 +AGA 108.6812 +AGC 97.937 +AGG 113.4334 +AGT 73.5561 +ATA 169.427 +ATC 136.5747 +ATG 253.9338 +ATT 167.5674 +CAA 120.0452 +CAC 66.7377 +CAG 102.8959 +CAT 78.1017 +CCA 99.7966 +CCC 45.6626 +CCG 72.7296 +CCT 59.5061 +CGA 43.5964 +CGC 22.728 +CGG 37.8111 +CGT 30.9927 +CTA 166.1211 +CTC 141.3269 +CTG 215.7094 +CTT 157.6497 +GAA 86.573 +GAC 38.431 +GAG 54.5472 +GAT 44.0097 +GCA 61.5722 +GCC 35.5383 +GCG 47.5222 +GCT 41.117 +GGA 40.7038 +GGC 24.5876 +GGG 33.4722 +GGT 31.406 +GTA 107.6481 +GTC 79.1348 +GTG 150.4181 +GTT 89.4657 +TAA 114.0533 +TAC 76.862 +TAG 96.4907 +TAT 76.6554 +TCA 89.6723 +TCC 59.5061 +TCG 76.0355 +TCT 60.3325 +TGA 103.7224 +TGC 66.5311 +TGG 120.8717 +TGT 64.0517 +TTA 199.3866 +TTC 168.6005 +TTG 265.2978 +TTT 181.2042 +end consensus +type CDS +phase 2 +begin consensus +AAA 231.6265 +AAC 94.221 +AAG 74.5916 +AAT 155.1753 +ACA 92.1547 +ACC 74.385 +ACG 56.6152 +ACT 76.6578 +AGA 200.2195 +AGC 100.2131 +AGG 77.2777 +AGT 98.1468 +ATA 54.549 +ATC 103.5191 +ATG 37.8124 +ATT 135.7526 +CAA 159.1012 +CAC 66.5332 +CAG 39.4654 +CAT 102.6926 +CCA 61.3676 +CCC 45.4575 +CCG 38.4322 +CCT 45.8707 +CGA 120.4623 +CGC 46.0774 +CGG 51.6562 +CGT 59.7146 +CTA 34.2997 +CTC 69.426 +CTG 25.4149 +CTT 
107.445 +GAA 129.3472 +GAC 50.2099 +GAG 42.1515 +GAT 75.0048 +GCA 76.038 +GCC 44.8376 +GCG 33.06 +GCT 57.855 +GGA 138.2321 +GGC 56.8219 +GGG 46.0774 +GGT 64.467 +GTA 37.3991 +GTC 52.4827 +GTG 17.5631 +GTT 92.568 +TAA 224.8079 +TAC 145.8772 +TAG 80.7903 +TAT 191.128 +TCA 155.5886 +TCC 139.6784 +TCG 111.7841 +TCT 118.6027 +TGA 342.1709 +TGC 191.5413 +TGG 174.1848 +TGT 177.4908 +TTA 98.9733 +TTC 177.0775 +TTG 59.508 +TTT 260.3474 +end consensus +type Intron_Corr_Term +phase all + 225.12 +type Intron_Corr_Term +phase 0 + 524.049 +type Intron_Corr_Term +phase 1 + 652.388 +type Intron_Corr_Term +phase 2 + 998.969 +type Intron_emission +begin consensus +A 123.3115 +C 59.5593 +G 67.0399 +T 150.0893 +end consensus +type Pyrimidine_emission +begin consensus +A 26.1549 +C 86.9936 +G 12.7932 +T 274.0583 +end consensus +type Spacer_emission +begin consensus +A 149.5231 +C 60.7484 +G 51.3573 +T 138.3712 +end consensus +type Intron_Stay_Prob + 0.988929 +type Central_Intron_Stay_Prob + 0.983466 +type Pyrimidine_Stay_Prob + 0.904578 +type No_Spacer_Prob + 0.0743243 +type Spacer_Stay_Prob + 0.952081 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/pombe.gf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/pombe.gf Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,1664 @@ +# Splice sites and intron regions information for GeneWise-21 +# Fri May 15 13:20:38 IDT 1998 +# Created by Mor Amitai (mor@compugen.co.il) +# dataset : ~/ftp/pub/PomBase/temp/cds_cdna ~/ftp/pub/PomBase/temp/cds_ex +# Consensi are read from top down. The value, for a sequence, +# is the number in the line of the first consensus that +# matches the sequence. +# Note: the set of sequences that are represented by a consensus +# are all the sequences that match this consensus and none of the +# previous consensi +# +# the numbers in types 5SS, 3SS, CDS, and the emissions are the number of +# occurrences of each sequence in the database. 
+# In case of a consensus this is the number of occurrences of sequences +# that are represented by the consensus in the database divided by the +# number of sequences that are represented by the consensus. +# *stay_prob is the probability of the transition from the state to itself. +# No_Spacer_Prob is the probability of transition from Pyrimidine directly +# to 3SS (no spacer). +type 5SS +center 3 +phase all +begin consensus +---GTA-GT- 0.0869141 +---GT----- 0.000821553 +end consensus +type 5SS +center 3 +phase 0 +begin consensus +---GTA-GT- 0.0371094 +---GT----- 0.000356523 +end consensus +type 5SS +center 3 +phase 1 +begin consensus +---GTA-GT- 0.0292969 +---GT----- 0.000294519 +end consensus +type 5SS +center 3 +phase 2 +begin consensus +---GT----- 0.000488281 +end consensus +type 5SS_codon +phase all +begin consensus +AAA 1.864078 +AAC 0.621359 +AAG 6.524272 +AAT 1.864078 +AAN 2.71845 +ACA 0.932039 +ACC 0.310680 +ACG 1.553398 +ACT 0.932039 +ACN 0.932039 +AGA 1.553398 +AGC 0.310680 +AGG 1.553398 +AGT 1.864078 +AGN 1.32039 +ATA 0.932039 +ATC 0.621359 +ATG 2.796117 +ATT 1.553398 +ATN 1.47573 +ANA 1.32039 +ANC 0.46602 +ANG 3.1068 +ANT 1.5534 +ANN 1.61165 +CAA 1.242718 +CAC 0.310680 +CAG 3.417476 +CAT 0.621359 +CAN 1.39806 +CCA 0.310680 +CCC 0.310680 +CCG 0.621359 +CCT 0.310680 +CCN 0.38835 +CGA 0.621359 +CGC 0.310680 +CGG 1.242718 +CGT 0.310680 +CGN 0.621359 +CTA 0.310680 +CTC 0.310680 +CTG 1.864078 +CTT 0.621359 +CTN 0.776699 +CNA 0.621359 +CNC 0.31068 +CNG 1.78641 +CNT 0.46602 +CNN 0.796117 +GAA 0.310680 +GAC 0.621359 +GAG 3.106796 +GAT 1.242718 +GAN 1.32039 +GCA 0.621359 +GCC 0.621359 +GCG 0.310680 +GCT 0.621359 +GCN 0.543689 +GGA 0.310680 +GGC 0.310680 +GGG 0.310680 +GGT 0.310680 +GGN 0.31068 +GTA 0.310680 +GTC 0.310680 +GTG 0.932039 +GTT 0.310680 +GTN 0.46602 +GNA 0.38835 +GNC 0.46602 +GNG 1.16505 +GNT 0.621359 +GNN 0.660194 +TAA 0.621359 +TAC 0.932039 +TAG 0.932039 +TAT 0.621359 +TAN 0.776699 +TCA 0.621359 +TCC 0.310680 +TCG 0.621359 +TCT 0.932039 +TCN 
0.621359 +TGA 0.621359 +TGC 0.621359 +TGG 1.242718 +TGT 0.621359 +TGN 0.776699 +TTA 0.621359 +TTC 0.621359 +TTG 3.106796 +TTT 1.864078 +TTN 1.5534 +TNA 0.621359 +TNC 0.621359 +TNG 1.47573 +TNT 1.00971 +TNN 0.932039 +NAA 1.00971 +NAC 0.621359 +NAG 3.49515 +NAT 1.08738 +NAN 1.5534 +NCA 0.621359 +NCC 0.38835 +NCG 0.776699 +NCT 0.699029 +NCN 0.621359 +NGA 0.776699 +NGC 0.38835 +NGG 1.08738 +NGT 0.776699 +NGN 0.757282 +NTA 0.543689 +NTC 0.46602 +NTG 2.17476 +NTT 1.08738 +NTN 1.06796 +NNA 0.737864 +NNC 0.46602 +NNG 1.8835 +NNT 0.912621 +NNN 1 +end consensus +type 5SS_codon +phase 0 +begin consensus +AAA 0.512000 +AAC 0.512000 +AAG 6.144000 +AAT 1.536000 +AAN 2.176 +ACA 1.024000 +ACC 0.512000 +ACG 0.512000 +ACT 0.512000 +ACN 0.64 +AGA 0.512000 +AGC 0.512000 +AGG 1.536000 +AGT 1.024000 +AGN 0.896 +ATA 0.512000 +ATC 0.512000 +ATG 2.048000 +ATT 1.536000 +ATN 1.152 +ANA 0.64 +ANC 0.512 +ANG 2.56 +ANT 1.152 +ANN 1.216 +CAA 1.024000 +CAC 0.512000 +CAG 4.608000 +CAT 1.024000 +CAN 1.792 +CCA 0.512000 +CCC 0.512000 +CCG 1.024000 +CCT 0.512000 +CCN 0.64 +CGA 0.512000 +CGC 0.512000 +CGG 1.024000 +CGT 0.512000 +CGN 0.64 +CTA 0.512000 +CTC 0.512000 +CTG 1.536000 +CTT 1.024000 +CTN 0.896 +CNA 0.64 +CNC 0.512 +CNG 2.048 +CNT 0.768 +CNN 0.992 +GAA 0.512000 +GAC 0.512000 +GAG 4.096000 +GAT 1.536000 +GAN 1.664 +GCA 1.024000 +GCC 1.024000 +GCG 0.512000 +GCT 1.024000 +GCN 0.896 +GGA 0.512000 +GGC 0.512000 +GGG 0.512000 +GGT 0.512000 +GGN 0.512 +GTA 0.512000 +GTC 0.512000 +GTG 1.536000 +GTT 0.512000 +GTN 0.768 +GNA 0.64 +GNC 0.64 +GNG 1.664 +GNT 0.896 +GNN 0.96 +TAA 0.512000 +TAC 1.536000 +TAG 0.512000 +TAT 0.512000 +TAN 0.768 +TCA 0.512000 +TCC 0.512000 +TCG 0.512000 +TCT 1.536000 +TCN 0.768 +TGA 0.512000 +TGC 0.512000 +TGG 0.512000 +TGT 0.512000 +TGN 0.512 +TTA 1.024000 +TTC 0.512000 +TTG 2.048000 +TTT 1.536000 +TTN 1.28 +TNA 0.64 +TNC 0.768 +TNG 0.896 +TNT 1.024 +TNN 0.832 +NAA 0.64 +NAC 0.768 +NAG 3.84 +NAT 1.152 +NAN 1.6 +NCA 0.768 +NCC 0.64 +NCG 0.64 +NCT 0.896 +NCN 0.736 +NGA 0.512 
+NGC 0.512 +NGG 0.896 +NGT 0.64 +NGN 0.64 +NTA 0.64 +NTC 0.512 +NTG 1.792 +NTT 1.152 +NTN 1.024 +NNA 0.64 +NNC 0.608 +NNG 1.792 +NNT 0.96 +NNN 1 +end consensus +type 5SS_codon +phase 1 +begin consensus +AAA 1.699115 +AAC 0.566372 +AAG 3.964602 +AAT 1.132743 +AAN 1.84071 +ACA 1.132743 +ACC 0.566372 +ACG 2.265487 +ACT 1.699115 +ACN 1.41593 +AGA 1.132743 +AGC 0.566372 +AGG 1.699115 +AGT 1.132743 +AGN 1.13274 +ATA 1.699115 +ATC 1.132743 +ATG 2.831858 +ATT 1.699115 +ATN 1.84071 +ANA 1.41593 +ANC 0.707965 +ANG 2.69027 +ANT 1.41593 +ANN 1.55752 +CAA 1.132743 +CAC 0.566372 +CAG 1.132743 +CAT 0.566372 +CAN 0.849557 +CCA 0.566372 +CCC 0.566372 +CCG 0.566372 +CCT 0.566372 +CCN 0.566372 +CGA 0.566372 +CGC 0.566372 +CGG 1.699115 +CGT 0.566372 +CGN 0.849558 +CTA 0.566372 +CTC 0.566372 +CTG 2.265487 +CTT 0.566372 +CTN 0.991151 +CNA 0.707965 +CNC 0.566372 +CNG 1.41593 +CNT 0.566372 +CNN 0.814159 +GAA 0.566372 +GAC 1.132743 +GAG 1.699115 +GAT 0.566372 +GAN 0.99115 +GCA 0.566372 +GCC 0.566372 +GCG 0.566372 +GCT 0.566372 +GCN 0.566372 +GGA 0.566372 +GGC 0.566372 +GGG 0.566372 +GGT 0.566372 +GGN 0.566372 +GTA 0.566372 +GTC 0.566372 +GTG 0.566372 +GTT 0.566372 +GTN 0.566372 +GNA 0.566372 +GNC 0.707965 +GNG 0.849558 +GNT 0.566372 +GNN 0.672567 +TAA 0.566372 +TAC 0.566372 +TAG 0.566372 +TAT 1.132743 +TAN 0.707965 +TCA 0.566372 +TCC 0.566372 +TCG 0.566372 +TCT 0.566372 +TCN 0.566372 +TGA 0.566372 +TGC 0.566372 +TGG 1.699115 +TGT 0.566372 +TGN 0.849558 +TTA 0.566372 +TTC 0.566372 +TTG 3.964602 +TTT 1.699115 +TTN 1.69912 +TNA 0.566372 +TNC 0.566372 +TNG 1.69912 +TNT 0.99115 +TNN 0.955752 +NAA 0.99115 +NAC 0.707965 +NAG 1.84071 +NAT 0.849557 +NAN 1.09735 +NCA 0.707965 +NCC 0.566372 +NCG 0.991151 +NCT 0.849558 +NCN 0.778761 +NGA 0.707965 +NGC 0.566372 +NGG 1.41593 +NGT 0.707965 +NGN 0.849558 +NTA 0.849558 +NTC 0.707965 +NTG 2.40708 +NTT 1.13274 +NTN 1.27434 +NNA 0.814159 +NNC 0.637168 +NNG 1.66372 +NNT 0.884956 +NNN 1 +end consensus +type 5SS_codon +phase 2 +begin consensus +AAA 2.666667 +AAC 
1.333333 +AAG 2.666667 +AAT 2.000000 +AAN 2.16667 +ACA 0.666667 +ACC 0.666667 +ACG 1.333333 +ACT 0.666667 +ACN 0.833334 +AGA 2.666667 +AGC 0.666667 +AGG 0.666667 +AGT 2.666667 +AGN 1.66667 +ATA 0.666667 +ATC 0.666667 +ATG 1.333333 +ATT 0.666667 +ATN 0.833334 +ANA 1.66667 +ANC 0.833333 +ANG 1.5 +ANT 1.5 +ANN 1.375 +CAA 1.333333 +CAC 0.666667 +CAG 1.333333 +CAT 0.666667 +CAN 1 +CCA 0.666667 +CCC 0.666667 +CCG 0.666667 +CCT 0.666667 +CCN 0.666667 +CGA 1.333333 +CGC 0.666667 +CGG 0.666667 +CGT 0.666667 +CGN 0.833333 +CTA 0.666667 +CTC 0.666667 +CTG 0.666667 +CTT 0.666667 +CTN 0.666667 +CNA 1 +CNC 0.666667 +CNG 0.833333 +CNT 0.666667 +CNN 0.791667 +GAA 0.666667 +GAC 0.666667 +GAG 0.666667 +GAT 1.333333 +GAN 0.833334 +GCA 0.666667 +GCC 0.666667 +GCG 0.666667 +GCT 0.666667 +GCN 0.666667 +GGA 0.666667 +GGC 0.666667 +GGG 0.666667 +GGT 0.666667 +GGN 0.666667 +GTA 0.666667 +GTC 0.666667 +GTG 0.666667 +GTT 0.666667 +GTN 0.666667 +GNA 0.666667 +GNC 0.666667 +GNG 0.666667 +GNT 0.833333 +GNN 0.708334 +TAA 1.333333 +TAC 0.666667 +TAG 2.000000 +TAT 0.666667 +TAN 1.16667 +TCA 1.333333 +TCC 0.666667 +TCG 1.333333 +TCT 0.666667 +TCN 1 +TGA 1.333333 +TGC 1.333333 +TGG 1.333333 +TGT 1.333333 +TGN 1.33333 +TTA 0.666667 +TTC 1.333333 +TTG 0.666667 +TTT 1.333333 +TTN 1 +TNA 1.16667 +TNC 1 +TNG 1.33333 +TNT 1 +TNN 1.125 +NAA 1.5 +NAC 0.833333 +NAG 1.66667 +NAT 1.16667 +NAN 1.29167 +NCA 0.833334 +NCC 0.666667 +NCG 1 +NCT 0.666667 +NCN 0.791667 +NGA 1.5 +NGC 0.833334 +NGG 0.833334 +NGT 1.33333 +NGN 1.125 +NTA 0.666667 +NTC 0.833334 +NTG 0.833333 +NTT 0.833334 +NTN 0.791667 +NNA 1.125 +NNC 0.791667 +NNG 1.08333 +NNT 1 +NNN 1 +end consensus +type 3SS +center 3 +phase all +begin consensus +TAG--- 1.64062 +-AG--- 0.229167 +end consensus +type 3SS +center 3 +phase 0 +begin consensus +TAG--- 0.765625 +-AG--- 0.09375 +end consensus +type 3SS +center 3 +phase 1 +begin consensus +TAG--- 0.515625 +-AG--- 0.0885417 +end consensus +type 3SS +center 3 +phase 2 +begin consensus +-AG--- 0.125 +end consensus 
+type 3SS_codon +phase all +begin consensus +AAA 1.802817 +AAC 1.502347 +AAG 0.300469 +AAT 1.802817 +AAN 1.35211 +ACA 0.300469 +ACC 1.802817 +ACG 0.300469 +ACT 0.901408 +ACN 0.826291 +AGA 0.600939 +AGC 0.300469 +AGG 0.600939 +AGT 1.502347 +AGN 0.751173 +ATA 1.502347 +ATC 2.103286 +ATG 1.502347 +ATT 2.103286 +ATN 1.80282 +ANA 1.05164 +ANC 1.42723 +ANG 0.676056 +ANT 1.57746 +ANN 1.1831 +CAA 2.103286 +CAC 0.300469 +CAG 0.300469 +CAT 1.201878 +CAN 0.976526 +CCA 0.600939 +CCC 0.600939 +CCG 0.901408 +CCT 0.600939 +CCN 0.676056 +CGA 0.300469 +CGC 0.300469 +CGG 0.300469 +CGT 0.901408 +CGN 0.450704 +CTA 0.901408 +CTC 0.600939 +CTG 1.201878 +CTT 1.201878 +CTN 0.976526 +CNA 0.976526 +CNC 0.450704 +CNG 0.676056 +CNT 0.976526 +CNN 0.769953 +GAA 1.802817 +GAC 0.300469 +GAG 1.201878 +GAT 2.403756 +GAN 1.42723 +GCA 0.901408 +GCC 0.300469 +GCG 1.201878 +GCT 1.502347 +GCN 0.976526 +GGA 0.901408 +GGC 0.600939 +GGG 0.600939 +GGT 1.201878 +GGN 0.826291 +GTA 1.201878 +GTC 1.502347 +GTG 0.600939 +GTT 1.502347 +GTN 1.20188 +GNA 1.20188 +GNC 0.676056 +GNG 0.901408 +GNT 1.65258 +GNN 1.10798 +TAA 0.600939 +TAC 0.300469 +TAG 0.300469 +TAT 1.502347 +TAN 0.676056 +TCA 1.201878 +TCC 0.901408 +TCG 0.901408 +TCT 1.201878 +TCN 1.05164 +TGA 0.300469 +TGC 0.300469 +TGG 0.901408 +TGT 1.502347 +TGN 0.751173 +TTA 0.600939 +TTC 0.600939 +TTG 1.201878 +TTT 2.704225 +TTN 1.277 +TNA 0.676056 +TNC 0.525821 +TNG 0.826291 +TNT 1.7277 +TNN 0.938967 +NAA 1.57746 +NAC 0.600939 +NAG 0.525821 +NAT 1.7277 +NAN 1.10798 +NCA 0.751173 +NCC 0.901408 +NCG 0.826291 +NCT 1.05164 +NCN 0.882629 +NGA 0.525821 +NGC 0.375587 +NGG 0.600939 +NGT 1.277 +NGN 0.694835 +NTA 1.05164 +NTC 1.20188 +NTG 1.12676 +NTT 1.87793 +NTN 1.31455 +NNA 0.976526 +NNC 0.769953 +NNG 0.769953 +NNT 1.48357 +NNN 1 +end consensus +type 3SS_codon +phase 0 +begin consensus +AAA 1.465649 +AAC 1.465649 +AAG 0.488550 +AAT 1.954198 +AAN 1.34351 +ACA 0.488550 +ACC 1.954198 +ACG 0.488550 +ACT 0.977099 +ACN 0.977099 +AGA 0.488550 +AGC 0.488550 +AGG 0.488550 +AGT 
1.954198 +AGN 0.854962 +ATA 1.465649 +ATC 0.488550 +ATG 0.977099 +ATT 1.465649 +ATN 1.09924 +ANA 0.9771 +ANC 1.09924 +ANG 0.610687 +ANT 1.58779 +ANN 1.0687 +CAA 2.931298 +CAC 0.488550 +CAG 0.488550 +CAT 0.977099 +CAN 1.22137 +CCA 0.977099 +CCC 0.488550 +CCG 0.488550 +CCT 0.977099 +CCN 0.732824 +CGA 0.488550 +CGC 0.488550 +CGG 0.488550 +CGT 0.488550 +CGN 0.48855 +CTA 1.465649 +CTC 0.977099 +CTG 0.488550 +CTT 0.977099 +CTN 0.977099 +CNA 1.46565 +CNC 0.610687 +CNG 0.48855 +CNT 0.854962 +CNN 0.854962 +GAA 1.954198 +GAC 0.488550 +GAG 0.977099 +GAT 2.931298 +GAN 1.58779 +GCA 0.977099 +GCC 0.488550 +GCG 0.977099 +GCT 0.488550 +GCN 0.732824 +GGA 0.488550 +GGC 0.977099 +GGG 0.977099 +GGT 1.465649 +GGN 0.977099 +GTA 1.465649 +GTC 0.977099 +GTG 0.488550 +GTT 1.465649 +GTN 1.09924 +GNA 1.22137 +GNC 0.732824 +GNG 0.854962 +GNT 1.58779 +GNN 1.09924 +TAA 0.488550 +TAC 0.488550 +TAG 0.488550 +TAT 0.977099 +TAN 0.610687 +TCA 1.465649 +TCC 1.465649 +TCG 0.977099 +TCT 1.954198 +TCN 1.46565 +TGA 0.488550 +TGC 0.488550 +TGG 0.977099 +TGT 0.488550 +TGN 0.610687 +TTA 0.977099 +TTC 0.488550 +TTG 0.488550 +TTT 2.931298 +TTN 1.22137 +TNA 0.854962 +TNC 0.732825 +TNG 0.732824 +TNT 1.58779 +TNN 0.977099 +NAA 1.70992 +NAC 0.732825 +NAG 0.610687 +NAT 1.70992 +NAN 1.19084 +NCA 0.977099 +NCC 1.09924 +NCG 0.732824 +NCT 1.09924 +NCN 0.977099 +NGA 0.48855 +NGC 0.610687 +NGG 0.732824 +NGT 1.09924 +NGN 0.732825 +NTA 1.34351 +NTC 0.732824 +NTG 0.610687 +NTT 1.70992 +NTN 1.09924 +NNA 1.12977 +NNC 0.793893 +NNG 0.671756 +NNT 1.40458 +NNN 1 +end consensus +type 3SS_codon +phase 1 +begin consensus +AAA 1.684211 +AAC 1.122807 +AAG 0.561404 +AAT 1.684211 +AAN 1.26316 +ACA 0.561404 +ACC 1.684211 +ACG 0.561404 +ACT 0.561404 +ACN 0.842106 +AGA 1.122807 +AGC 0.561404 +AGG 1.122807 +AGT 0.561404 +AGN 0.842106 +ATA 1.122807 +ATC 2.245614 +ATG 2.245614 +ATT 1.684211 +ATN 1.82456 +ANA 1.12281 +ANC 1.40351 +ANG 1.12281 +ANT 1.12281 +ANN 1.19298 +CAA 0.561404 +CAC 0.561404 +CAG 0.561404 +CAT 1.122807 +CAN 0.701755 +CCA 
0.561404 +CCC 0.561404 +CCG 1.122807 +CCT 0.561404 +CCN 0.701755 +CGA 0.561404 +CGC 0.561404 +CGG 0.561404 +CGT 1.684211 +CGN 0.842106 +CTA 0.561404 +CTC 0.561404 +CTG 2.245614 +CTT 1.684211 +CTN 1.26316 +CNA 0.561404 +CNC 0.561404 +CNG 1.12281 +CNT 1.26316 +CNN 0.877193 +GAA 1.684211 +GAC 0.561404 +GAG 1.684211 +GAT 1.122807 +GAN 1.26316 +GCA 1.122807 +GCC 0.561404 +GCG 0.561404 +GCT 1.122807 +GCN 0.842105 +GGA 1.122807 +GGC 0.561404 +GGG 0.561404 +GGT 1.122807 +GGN 0.842105 +GTA 1.122807 +GTC 1.684211 +GTG 1.122807 +GTT 1.122807 +GTN 1.26316 +GNA 1.26316 +GNC 0.842106 +GNG 0.982457 +GNT 1.12281 +GNN 1.05263 +TAA 0.561404 +TAC 0.561404 +TAG 0.561404 +TAT 0.561404 +TAN 0.561404 +TCA 0.561404 +TCC 0.561404 +TCG 0.561404 +TCT 0.561404 +TCN 0.561404 +TGA 0.561404 +TGC 0.561404 +TGG 1.122807 +TGT 1.122807 +TGN 0.842105 +TTA 0.561404 +TTC 1.122807 +TTG 2.245614 +TTT 2.245614 +TTN 1.54386 +TNA 0.561404 +TNC 0.701755 +TNG 1.12281 +TNT 1.12281 +TNN 0.877193 +NAA 1.12281 +NAC 0.701755 +NAG 0.842106 +NAT 1.12281 +NAN 0.947369 +NCA 0.701755 +NCC 0.842106 +NCG 0.701755 +NCT 0.701755 +NCN 0.736842 +NGA 0.842106 +NGC 0.561404 +NGG 0.842105 +NGT 1.12281 +NGN 0.842106 +NTA 0.842106 +NTC 1.40351 +NTG 1.96491 +NTT 1.68421 +NTN 1.47368 +NNA 0.877193 +NNC 0.877193 +NNG 1.08772 +NNT 1.15789 +NNN 1 +end consensus +type 3SS_codon +phase 2 +begin consensus +AAA 1.333333 +AAC 1.333333 +AAG 0.666667 +AAT 0.666667 +AAN 1 +ACA 0.666667 +ACC 0.666667 +ACG 0.666667 +ACT 1.333333 +ACN 0.833334 +AGA 0.666667 +AGC 0.666667 +AGG 0.666667 +AGT 1.333333 +AGN 0.833334 +ATA 1.333333 +ATC 2.666667 +ATG 0.666667 +ATT 2.000000 +ATN 1.66667 +ANA 1 +ANC 1.33333 +ANG 0.666667 +ANT 1.33333 +ANN 1.08333 +CAA 1.333333 +CAC 0.666667 +CAG 0.666667 +CAT 1.333333 +CAN 1 +CCA 0.666667 +CCC 1.333333 +CCG 1.333333 +CCT 0.666667 +CCN 1 +CGA 0.666667 +CGC 0.666667 +CGG 0.666667 +CGT 0.666667 +CGN 0.666667 +CTA 0.666667 +CTC 0.666667 +CTG 0.666667 +CTT 0.666667 +CTN 0.666667 +CNA 0.833333 +CNC 0.833333 +CNG 0.833333 +CNT 
0.833333 +CNN 0.833334 +GAA 0.666667 +GAC 0.666667 +GAG 0.666667 +GAT 1.333333 +GAN 0.833334 +GCA 0.666667 +GCC 0.666667 +GCG 2.000000 +GCT 2.666667 +GCN 1.5 +GGA 1.333333 +GGC 0.666667 +GGG 0.666667 +GGT 0.666667 +GGN 0.833333 +GTA 0.666667 +GTC 1.333333 +GTG 0.666667 +GTT 1.333333 +GTN 1 +GNA 0.833334 +GNC 0.833334 +GNG 1 +GNT 1.5 +GNN 1.04167 +TAA 1.333333 +TAC 0.666667 +TAG 0.666667 +TAT 2.666667 +TAN 1.33333 +TCA 1.333333 +TCC 0.666667 +TCG 1.333333 +TCT 0.666667 +TCN 1 +TGA 0.666667 +TGC 0.666667 +TGG 0.666667 +TGT 2.666667 +TGN 1.16667 +TTA 0.666667 +TTC 0.666667 +TTG 0.666667 +TTT 0.666667 +TTN 0.666667 +TNA 1 +TNC 0.666667 +TNG 0.833333 +TNT 1.66667 +TNN 1.04167 +NAA 1.16667 +NAC 0.833333 +NAG 0.666667 +NAT 1.5 +NAN 1.04167 +NCA 0.833334 +NCC 0.833333 +NCG 1.33333 +NCT 1.33333 +NCN 1.08333 +NGA 0.833334 +NGC 0.666667 +NGG 0.666667 +NGT 1.33333 +NGN 0.875 +NTA 0.833333 +NTC 1.33333 +NTG 0.666667 +NTT 1.16667 +NTN 1 +NNA 0.916667 +NNC 0.916667 +NNG 0.833334 +NNT 1.33333 +NNN 1 +end consensus +type CDS +phase all +begin consensus +AAA 2.386335 +AAC 1.053160 +AAG 1.564912 +AAT 1.686987 +AAN 1.67285 +ACA 0.987639 +ACC 0.645552 +ACG 0.588997 +ACT 1.051091 +ACN 0.81832 +AGA 1.264206 +AGC 0.831769 +AGG 0.727625 +AGT 0.865564 +AGN 0.922291 +ATA 0.993847 +ATC 1.040746 +ATG 1.394558 +ATT 1.851134 +ATN 1.32007 +ANA 1.40801 +ANC 0.892807 +ANG 1.06902 +ANT 1.36369 +ANN 1.18338 +CAA 1.498701 +CAC 0.575203 +CAG 0.708314 +CAT 0.951086 +CAN 0.933326 +CCA 0.795215 +CCC 0.491751 +CCG 0.455887 +CCT 0.832459 +CCN 0.643828 +CGA 0.706935 +CGC 0.368985 +CGG 0.354502 +CGT 0.741419 +CGN 0.54296 +CTA 0.836597 +CTC 0.861426 +CTG 0.917980 +CTT 1.436629 +CTN 1.01316 +CNA 0.959362 +CNC 0.574341 +CNG 0.609171 +CNT 0.990398 +CNN 0.783318 +GAA 1.682160 +GAC 0.641414 +GAG 0.823493 +GAT 1.255930 +GAN 1.10075 +GCA 0.793146 +GCC 0.515890 +GCG 0.364847 +GCT 0.964880 +GCN 0.659691 +GGA 0.946258 +GGC 0.470370 +GGG 0.352433 +GGT 0.846942 +GGN 0.654001 +GTA 0.737281 +GTC 0.688313 +GTG 0.724867 +GTT 
1.275931 +GTN 0.856598 +GNA 1.03971 +GNC 0.578997 +GNG 0.56641 +GNT 1.08592 +GNN 0.81776 +TAA 1.130406 +TAC 0.999364 +TAG 0.591066 +TAT 1.385592 +TAN 1.02661 +TCA 1.159373 +TCC 0.921429 +TCG 0.764179 +TCT 1.205582 +TCN 1.01264 +TGA 1.487666 +TGC 0.967638 +TGG 1.180064 +TGT 0.966949 +TGN 1.15058 +TTA 1.538704 +TTC 1.455251 +TTG 1.566981 +TTT 2.128391 +TTN 1.67233 +TNA 1.32904 +TNC 1.08592 +TNG 1.02557 +TNT 1.42163 +TNN 1.21554 +NAA 1.6744 +NAC 0.817285 +NAG 0.921946 +NAT 1.3199 +NAN 1.18338 +NCA 0.933843 +NCC 0.643655 +NCG 0.543477 +NCT 1.0135 +NCN 0.78362 +NGA 1.10127 +NGC 0.65969 +NGG 0.653656 +NGT 0.855218 +NGN 0.817458 +NTA 1.02661 +NTC 1.01143 +NTG 1.1511 +NTT 1.67302 +NTN 1.21554 +NNA 1.18403 +NNC 0.783016 +NNG 0.817544 +NNT 1.21541 +NNN 1 +end consensus +type CDS +phase 0 +begin consensus +AAA 2.488005 +AAC 1.039424 +AAG 1.727552 +AAT 2.175971 +AAN 1.85774 +ACA 0.874108 +ACC 0.690194 +ACG 0.386426 +ACT 1.630428 +ACN 0.895289 +AGA 0.702593 +AGC 0.514546 +AGG 0.276904 +AGT 0.880307 +AGN 0.593587 +ATA 0.741855 +ATC 0.721191 +ATG 1.264667 +ATT 2.516935 +ATN 1.31116 +ANA 1.20164 +ANC 0.741339 +ANG 0.913887 +ANT 1.80091 +ANN 1.16444 +CAA 1.702754 +CAC 0.394692 +CAG 0.702593 +CAT 1.045623 +CAN 0.961415 +CCA 0.774918 +CCC 0.566207 +CCG 0.258306 +CCT 1.444448 +CCN 0.76097 +CGA 0.481483 +CGC 0.421556 +CGG 0.171515 +CGT 1.318395 +CGN 0.598237 +CTA 0.506280 +CTC 0.477350 +CTG 0.343031 +CTT 1.657292 +CTN 0.745988 +CNA 0.866359 +CNC 0.464951 +CNG 0.368861 +CNT 1.36644 +CNN 0.766653 +GAA 2.884763 +GAC 1.039424 +GAG 1.504375 +GAT 2.576862 +GAN 2.00136 +GCA 1.004294 +GCC 0.745988 +GCG 0.291369 +GCT 1.905266 +GCN 0.986729 +GGA 1.049756 +GGC 0.599270 +GGG 0.264506 +GGT 1.582900 +GGN 0.874108 +GTA 0.762520 +GTC 0.750121 +GTG 0.495948 +GTT 2.006522 +GTN 1.00378 +GNA 1.42533 +GNC 0.783701 +GNG 0.639049 +GNT 2.01789 +GNN 1.21649 +TAA 0.000000 +TAC 0.770786 +TAG 0.000000 +TAT 1.477511 +TAN 0.562074 +TCA 1.024959 +TCC 0.772852 +TCG 0.415356 +TCT 1.826741 +TCN 1.00998 +TGA 0.000000 
+TGC 0.322366 +TGG 0.588938 +TGT 0.485616 +TGN 0.34923 +TTA 1.630428 +TTC 0.907171 +TTG 1.450647 +TTT 1.965193 +TTN 1.48836 +TNA 0.663847 +TNC 0.693294 +TNG 0.613735 +TNT 1.43877 +TNN 0.85241 +NAA 1.76888 +NAC 0.811082 +NAG 0.98363 +NAT 1.81899 +NAN 1.34565 +NCA 0.91957 +NCC 0.69381 +NCG 0.337864 +NCT 1.70172 +NCN 0.913241 +NGA 0.558458 +NGC 0.464434 +NGG 0.325466 +NGT 1.0668 +NGN 0.603791 +NTA 0.910271 +NTC 0.713958 +NTG 0.888573 +NTT 2.03649 +NTN 1.13732 +NNA 1.03929 +NNC 0.670821 +NNG 0.633883 +NNT 1.656 +NNN 1 +end consensus +type CDS +phase 1 +begin consensus +AAA 2.349249 +AAC 1.177724 +AAG 2.219080 +AAT 1.330621 +AAN 1.76917 +ACA 1.167393 +ACC 0.504148 +ACG 0.816142 +ACT 0.756223 +ACN 0.810976 +AGA 1.086812 +AGC 0.979370 +AGG 1.134334 +AGT 0.735561 +AGN 0.984019 +ATA 1.694270 +ATC 1.365747 +ATG 2.539338 +ATT 1.675674 +ATN 1.81876 +ANA 1.57443 +ANC 1.00675 +ANG 1.67722 +ANT 1.12452 +ANN 1.34573 +CAA 1.200452 +CAC 0.667377 +CAG 1.028959 +CAT 0.781017 +CAN 0.919451 +CCA 0.997966 +CCC 0.456626 +CCG 0.727296 +CCT 0.595061 +CCN 0.694237 +CGA 0.435964 +CGC 0.227280 +CGG 0.378111 +CGT 0.309927 +CGN 0.337821 +CTA 1.661211 +CTC 1.413269 +CTG 2.157094 +CTT 1.576497 +CTN 1.70202 +CNA 1.0739 +CNC 0.691138 +CNG 1.07286 +CNT 0.815626 +CNN 0.913382 +GAA 0.865730 +GAC 0.384310 +GAG 0.545472 +GAT 0.440097 +GAN 0.558902 +GCA 0.615722 +GCC 0.355383 +GCG 0.475222 +GCT 0.411170 +GCN 0.464374 +GGA 0.407038 +GGC 0.245876 +GGG 0.334722 +GGT 0.314060 +GGN 0.325424 +GTA 1.076481 +GTC 0.791348 +GTG 1.504181 +GTT 0.894657 +GTN 1.06667 +GNA 0.741243 +GNC 0.444229 +GNG 0.714899 +GNT 0.514996 +GNN 0.603842 +TAA 1.140533 +TAC 0.768620 +TAG 0.964907 +TAT 0.766554 +TAN 0.910154 +TCA 0.896723 +TCC 0.595061 +TCG 0.760355 +TCT 0.603325 +TCN 0.713866 +TGA 1.037224 +TGC 0.665311 +TGG 1.208717 +TGT 0.640517 +TGN 0.887942 +TTA 1.993866 +TTC 1.686005 +TTG 2.652978 +TTT 1.812042 +TTN 2.03622 +TNA 1.26709 +TNC 0.928749 +TNG 1.39674 +TNT 0.95561 +TNN 1.13705 +NAA 1.38899 +NAC 0.749508 +NAG 1.1896 +NAT 
0.829572 +NAN 1.03942 +NCA 0.919451 +NCC 0.477804 +NCG 0.694754 +NCT 0.591445 +NCN 0.670863 +NGA 0.741759 +NGC 0.529459 +NGG 0.763971 +NGT 0.500016 +NGN 0.633801 +NTA 1.60646 +NTC 1.31409 +NTG 2.2134 +NTT 1.48972 +NTN 1.65592 +NNA 1.16416 +NNC 0.767716 +NNG 1.21543 +NNT 0.852688 +NNN 1 +end consensus +type CDS +phase 2 +begin consensus +AAA 2.316265 +AAC 0.942210 +AAG 0.745916 +AAT 1.551753 +AAN 1.38904 +ACA 0.921547 +ACC 0.743850 +ACG 0.566152 +ACT 0.766578 +ACN 0.749532 +AGA 2.002195 +AGC 1.002131 +AGG 0.772777 +AGT 0.981468 +AGN 1.18964 +ATA 0.545490 +ATC 1.035191 +ATG 0.378124 +ATT 1.357526 +ATN 0.829083 +ANA 1.44637 +ANC 0.930845 +ANG 0.615742 +ANT 1.16433 +ANN 1.03932 +CAA 1.591012 +CAC 0.665332 +CAG 0.394654 +CAT 1.026926 +CAN 0.919481 +CCA 0.613676 +CCC 0.454575 +CCG 0.384322 +CCT 0.458707 +CCN 0.47782 +CGA 1.204623 +CGC 0.460774 +CGG 0.516562 +CGT 0.597146 +CGN 0.694776 +CTA 0.342997 +CTC 0.694260 +CTG 0.254149 +CTT 1.074450 +CTN 0.591464 +CNA 0.938077 +CNC 0.568735 +CNG 0.387422 +CNT 0.789307 +CNN 0.670885 +GAA 1.293472 +GAC 0.502099 +GAG 0.421515 +GAT 0.750048 +GAN 0.741783 +GCA 0.760380 +GCC 0.448376 +GCG 0.330600 +GCT 0.578550 +GCN 0.529477 +GGA 1.382321 +GGC 0.568219 +GGG 0.460774 +GGT 0.644670 +GGN 0.763996 +GTA 0.373991 +GTC 0.524827 +GTG 0.175631 +GTT 0.925680 +GTN 0.500032 +GNA 0.952541 +GNC 0.51088 +GNG 0.34713 +GNT 0.724737 +GNN 0.633822 +TAA 2.248079 +TAC 1.458772 +TAG 0.807903 +TAT 1.911280 +TAN 1.60651 +TCA 1.555886 +TCC 1.396784 +TCG 1.117841 +TCT 1.186027 +TCN 1.31413 +TGA 3.421709 +TGC 1.915413 +TGG 1.741848 +TGT 1.774908 +TGN 2.21347 +TTA 0.989733 +TTC 1.770775 +TTG 0.595080 +TTT 2.603474 +TTN 1.48977 +TNA 2.05385 +TNC 1.63544 +TNG 1.06567 +TNT 1.86892 +TNN 1.65597 +NAA 1.86221 +NAC 0.892103 +NAG 0.592497 +NAT 1.31 +NAN 1.1642 +NCA 0.962872 +NCC 0.760896 +NCG 0.599729 +NCT 0.747465 +NCN 0.767741 +NGA 2.00271 +NGC 0.986634 +NGG 0.87299 +NGT 0.999548 +NGN 1.21547 +NTA 0.563053 +NTC 1.00626 +NTG 0.350746 +NTT 1.49028 +NTN 0.852586 +NNA 
1.34771 +NNC 0.911474 +NNG 0.603991 +NNT 1.13682 +NNN 1 +end consensus +type Intron_Corr_Term +phase all + 225.12 +type Intron_Corr_Term +phase 0 + 524.049 +type Intron_Corr_Term +phase 1 + 652.388 +type Intron_Corr_Term +phase 2 + 998.969 +type Intron_emission +begin consensus +A 1.233115 +C 0.595593 +G 0.670399 +T 1.500893 +end consensus +type Pyrimidine_emission +begin consensus +A 0.261549 +C 0.869936 +G 0.127932 +T 2.740583 +end consensus +type Spacer_emission +begin consensus +A 1.495231 +C 0.607484 +G 0.513573 +T 1.383712 +end consensus +type Intron_Stay_Prob + 0.988929 +type Central_Intron_Stay_Prob + 0.983466 +type Pyrimidine_Stay_Prob + 0.904578 +type No_Spacer_Prob + 0.0743243 +type Spacer_Stay_Prob + 0.952081 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/tm.pri --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/tm.pri Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,63 @@ +# +# +# Ok - something hacked around for the Transmembrane model +# +# This is an example of a Brown-Haussler mixture Dirichlet prior. +# + +Mix # Strategy (mixture Dirichlet) +Amino # type of prior (Amino or Nucleic) + +# Transitions +2 # 2 component - first component an ungappy region. +0.3 # with probability = 1.0 +10. 0.015 0.015 # m->m, m->i, m->d alpha's +10. 1.170 0.060 # i->m, i->i, i->d alpha's +10. 0.060 0.300 # d->m, d->i, d->d alpha's +0.7 # with probability = 1.0 +2.970 0.015 0.015 # m->m, m->i, m->d alpha's +1.770 1.170 0.060 # i->m, i->i, i->d alpha's +2.640 0.060 0.300 # d->m, d->i, d->d alpha's + +# A C D E F G H I K L M N P Q R S T V W Y + +# ok +# +# 1st column meant to be a pretty heavy hyrdophobic prior thing. +# +# Match emissions +# A C D E F G H I K L M N P Q R S T V W Y +# +11 # 11 components +0.031 +# A C D E F G H I K L M N P Q R S T V W Y +20. 15. 0.2 0.2 20. 4. 4. 20. 0.2 20. 20. 20. 4. 1. 0.2 4. 4. 20. 15. 15. 
+0.001 +22.379 0.175 0.818 1.110 0.292 1.928 0.175 0.234 0.467 0.409 0.117 0.643 0.993 0.351 0.292 25.184 1.811 0.643 0.117 0.292 +0.025 +4.468 0.194 1.632 7.266 0.466 13.443 0.272 0.583 1.049 1.243 0.194 0.622 0.777 2.487 0.466 1.399 0.699 0.932 0.117 0.505 +0.091 +0.788 0.158 0.158 0.237 6.150 0.237 0.079 16.873 0.237 27.991 2.917 0.237 0.237 0.237 0.237 0.237 0.473 20.421 0.158 0.788 +0.024 +1.313 0.303 0.808 0.707 0.606 0.808 0.303 0.707 0.606 1.717 0.303 0.505 87.087 0.606 0.505 1.515 0.909 1.010 0.202 0.505 +0.701 +0.160 0.042 0.126 0.146 0.070 0.120 0.054 0.094 0.166 0.138 0.056 0.090 0.062 0.094 0.080 0.150 0.126 0.122 0.038 0.070 +0.015 +1.017 0.113 87.577 16.950 0.565 0.565 0.113 0.452 0.791 0.339 0.226 1.017 0.565 0.452 0.339 0.565 0.339 0.452 0.001 0.791 +0.041 +0.994 0.339 2.031 1.185 0.804 1.269 1.016 0.402 0.741 0.592 0.444 4.401 0.529 0.592 0.508 1.502 1.164 0.571 0.360 1.671 +0.024 +26.730 0.286 0.800 1.885 0.800 0.971 0.228 1.142 2.056 3.770 0.343 0.742 1.142 0.685 0.571 1.199 1.713 11.252 0.171 0.514 +0.017 +1.170 0.177 1.489 0.745 1.028 0.957 0.177 0.674 2.092 1.347 0.142 1.170 0.603 0.496 17.053 2.801 0.957 1.560 0.177 0.638 +0.030 +0.001 0.001 0.001 0.001 0.001 5793.873 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 + +## Insert emissions +1 # Single component +1.0 # with probability 1.0 +17430 6690 9370 9910 7950 17720 6720 7380 16100 17070 2950 8090 10140 7650 8180 13920 11710 12940 2100 5980 + + + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/wise.2 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/wise.2 Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,32 @@ +! Wise links files +! For use on molbiol +! modifed by ewan Nov 10th +! EMBL_FULL WiseDataDir:em.in +! EMBL_PRI WiseDataDir:empri.in +! EMBL_VER WiseDataDir:emver.in +! 
+swiss swiss.in protein +!tremblnr trembl.in protein +!rrm rrm.in protein +!rrmDNA rrmdna.in dna +embl em.in dna +EMBL_Est est.in dna +!EMBL_without_est EMmEst.in dna +EMBL_fun fun.in dna +EMBL_inv inv.in dna +EMBL_mam mam.in dna +EMBL_org org.in dna +EMBL_pat patent.in dna +EMBL_phg phg.in dna +EMBL_pln pln.in dna +EMBL_pri pri.in dna +EMBL_pro pro.in dna +EMBL_rod rod.in dna +EMBL_sts sts.in dna +EMBL_syn syn.in dna +EMBL_unc unc.in dna +EMBL_vrl vrl.in dna +EMBL_vrt vrt.in dna +!SH2_Est sh2est.in dna +!test test.in dna +!test2 test2.in dna diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/wise.per --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/wise.per Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,104 @@ +! this is the wise config file +! for you to fool around with Remember ! lines are comments +! and keys should go flush with the left hand margin +! +! All wise.cfg's at the moment should have interactive key +! +interactive +! +! This is the linkfile designation (for searchwise/swise) +! +linkfile wise.links +! +! The next lines show a UNIX style background system +! +queuecommand nice nohup csh +addamperstand +! +! +viewlength 20 +! +! view command for pairwise.... +viewer more +! +!viewbackground +! +! +!The next two keys give default codon and matrix files +!I'm using here the fact that pairwise/searchwise will open files +!from WISECONFIGDIR if it can't open the file as the straight filename +! +codondefault codon.table +matrixdefault blosum62.bla +! +! +! The next lines give default for all situations gap and ext +! and other parameters +! +gapdefault 1900 +extdefault 200 +framedefault 2500 +frameextdefault 2000 +stopdefault 500 +! +!The next lines gives default specific for profiles or protein sequences +!I've found these values by rule of thumb. These values will superceed +! the defaults above +! +gap_pro 2200 +ext_pro 200 +gap_seq 700 +ext_seq 70 +! 
+!This gives the number of cells to report in PairWise +! +reportrate 10000 +! +!This tells the screen length in PairWise/Opensearch help files +! +screenlength 20 +! +!These give help files +! +localhtmlhelp http://www.sanger.ac.uk/~birney/wise/help/ +! +! This gives the SRS link and library links +! +srslink http://www.sanger.ac.uk/srs/srsc? +srslibrary +-l+swissprot+-l+embl+-l+swissnew+-l+emnew +! +! This gives the sort of extensions to append to output file +! in searchwise +! +outputappend .out +htmlappend .html +alignhtmlappend _aln.html +alignappend .aln +! +pairmainhelp pairmain.hlp +pairalignhelp pairaln.hlp +confighelp pairconfig.hlp +! +protopmenuhelp buildprofile.hlp +promatrixmenuhelp pwmatrix.hlp +proweightmenuhelp pwweights.hlp +progapmenuhelp pwgap.hlp +! +searchmainhelp searchmain.hlp +searchparahelp searchaln.hlp +searchsubmithelp searchsubmit.hlp +! +! +! +stagger 500 +highscore 250 +align 50 +! +deletecom +! +! +! +hard_linkage + + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/lib/wisecfg/worm.gf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/lib/wisecfg/worm.gf Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,898 @@ +# Splice sites and intron regions information for GeneWise-21 +# Tue Jan 14 16:50:39 IST 1997 +# Created by Mor Amitai (mor@compugen.co.il) +# dataset : C.Elegans data from the Sanger Centre. given by Ewan Birney +# preliminary results. For Ewan's use only +# Consensi are read from top down. The value, for a sequence, +# is the number in the line of the first consensus that +# matches the sequence. +# Note: the set of sequences that are represented by a consensus +# are all the sequences that match this consensus and none of the +# previous consensi +# +# the numbers in types 5SS, 3SS, CDS, and the emissions are the number of +# occurrences of each sequence in the database. 
+# In case of a consensus this is the number of occurrences of sequences +# that are represented by the consensus in the database divided by the +# number of sequences that are represented by the consensus. +# *stay_prob is the probability of the transition from the state to itself. +# No_Spacer_Prob is the probability of transition from Pyrimidine directly +# to 3SS (no spacer). +type 5SS +center 3 +phase all +begin consensus +AAAGTGAGTT 61 +GATGTAAGTT 51 +A-TGTAAGTT 28.25 +-AAGTGAGTT 28 +-AAGTAAGTT 24.75 +A-AGTGAGTT 22.3333 +AA-GTGAGTT 22 +TTGGT-AGTT 20.25 +AAGGTTTGT- 19.75 +-AGGTAATTT 19.25 +T-TGTAAGTT 19 +A-AGTAAGTT 18.3333 +AT-GTGAGTT 17 +GA-GTGAGTT 17 +-AGGTAAATT 16.75 +A-GGTTAGTT 16.75 +AAGGTAA-AA 13.5 +AAGGTAA-AT 13.25 +T--GTAAGTT 10.4 +-A-GTAAGTT 11.1429 +-T-GTAAGTT 10.3 +--GGTGAGTT 10.1667 +--AGTGAGTT 9.22222 +-C-GTAAGTT 8.8 +-T-GTGAGTT 8.33333 +--GGTTAGTT 8 +AA-GTAAG-A 7.73333 +-AGGTTTGT- 7.41667 +-AGGTGAG-A 7.0625 +CAGGTAA-A- 6.5625 +AA-GTA-GTT 6.5 +-A-GTTAGTT 6.41667 +-G-GTAAGTT 6.4 +-AGGTATTT- 6 +AA-GTAAG-T 5.54545 +-AGGTAAA-A 5.5 +-AAGTAAGT- 5.45455 +-A-GTAAGAA 5.36364 +--GGTATGTT 5.26667 +-TGGTGAGT- 5.25 +-AAGTGAGT- 5.16667 +-ATGTAAGT- 5.09091 +AAGGT-AGT- 5.09091 +-AGGTGAGA- 4.75 +-TGGTAAGT- 4.58333 +-AGGTAATT- 4.5 +-AGGTATGT- 4.41667 +--GGTAAGAT 4.28571 +--GGTAAGAA 4.25 +---GTGAGTT 4.2 +--GGTTGGTT 3.5625 +-AGGTTTGA- 3.4375 +A--GTAAGT- 3.28571 +AAGGTAT-A- 3.125 +-AGGT-AGT- 2.61111 +--GGTAA-TT 2.575 +T--GTAAGT- 2.5 +--GGTT-GTT 2.42857 +-A-GTA-GTT 2.42424 +-AGGTAAG-- 2.23529 +---GTTAGTT 2.16667 +---GTAAGAT 2.04444 +A--GTGAGT- 1.97436 +A--GTA-GTT 1.9697 +-AGGTA--TT 1.84375 +-AGGTAA-A- 1.82143 +C--GTAAGT- 1.80556 +AA-GTGAG-- 1.7619 +AAGGTA---A 1.7234 +--GGTGAG-A 1.64286 +G--GTAAGT- 1.63889 +T--GTA-GTT 1.63636 +-TGGTAAG-- 1.625 +CAGGTA---A 1.54902 +--GGTGAG-T 1.54762 +AAGGTA---T 1.52632 +AAGGTT-G-- 1.46 +AA-GT-A-TT 1.45833 +CAGGTA---T 1.42857 +A--GTAAGA- 1.37838 +---GTGAGTG 1.35897 +-AGGTG-GT- 1.16667 +--GGTATG-A 1.07407 +-AGGTT-TT- 0.952381 +-A-GTAAG-- 
0.794118 +-AGGTAA--- 0.790698 +--GGTA-GT- 0.734375 +-A-GT-AGT- 0.731183 +--GGT-AGT- 0.696629 +-AGGTA-G-- 0.66 +---GTAAG-A 0.643478 +-AGGT-AG-- 0.59375 +--GGTA-G-T 0.58871 +--GGTAA--A 0.583333 +AA-GTA-G-- 0.555556 +-AGGTAT--- 0.532787 +-A-GT--GTT 0.525 +--GGTT-GT- 0.515873 +-A-GTGAG-- 0.481481 +--GGTAA-A- 0.466667 +-AGGT-A-T- 0.466667 +-A-GTAA-T- 0.407407 +---GTGAG-A 0.4 +-AGGTTT--- 0.39 +A-GGTA--T- 0.362319 +---GTATGT- 0.326797 +---GTAAG-- 0.221854 +-AAGTA--A- 0.219298 +---GT--GTT 0.209402 +--GGTA-G-- 0.1875 +--GGT-AG-- 0.155556 +--GGTA--T- 0.15493 +-AGGT---A- 0.1375 +A--GTAA--- 0.116838 +-A-GTA--T- 0.109053 +---GTT-GT- 0.0944444 +-AAGT-A--- 0.0761905 +---GT--GAT 0.0670886 +-AGGT----- 0.0541575 +--GGTA---- 0.0411523 +---GTA-G-- 0.037037 +---GTAA--- 0.0363757 +---GT-A-T- 0.0309423 +A--GT----T 0.0201761 +---GTG-G-- 0.0192593 +---GT--G-- 0.00953471 +---GT---T- 0.0090535 +---GT----- 0.00283019 +end consensus +type 5SS +center 3 +phase 0 +begin consensus +GATGTAAGTT 43 +AA-GTGAGTT 15.25 +A-TGTAAGTT 14.75 +AAGGTTTGT- 13.5 +GA-GTGAGTT 11.25 +AAGGTAA-TT 11 +CAGGTAA-TT 10.25 +A-AGTAAGTT 9.5 +-AGGTTAGTT 9 +-TGGTGAGTT 9 +AAGGTAA-AT 8.75 +GC-GTAAGTT 8.5 +T-TGTAAGTT 8.5 +-AGGTATTTT 7.75 +A--GTGAGTT 5.90909 +--GGTAAGTT 4.92308 +-A-GTAAGTT 5.25 +--GGTTAGTT 4.66667 +AAGGTAA--A 4.5 +-AGGTTTGT- 4.25 +--GGTGAGTT 4.125 +-A-GTAAGAA 3.73333 +AAGGTT-GT- 3.54545 +-AGGTGAGA- 3.5 +-ATGTAAGT- 3.41667 +--GGTATGTT 3.3125 +GAGGTAA-T- 3.26667 +CA-GT-AGTT 3.2 +-AGGTAAAA- 3.14286 +-TGGTGAGT- 2.75 +CAGGTAA--A 2.71429 +-AGGTGAG-A 2.66667 +-ATGTA-GTT 2.66667 +AAGGTAT-T- 2.5 +AAGGTT-GA- 2.4375 +CAGGTAT-T- 2.35714 +---GTAAGTT 2.29167 +--GGTAAGAT 2.26667 +---GTGAGTT 1.7 +-A-GT-AGTT 1.47619 +AA-GTAAG-- 1.40816 +-AGGTGAG-- 1.40625 +--TGTAAGT- 1.38889 +--GGTAAG-A 1.32075 +--GGTT-GTT 1.21429 +-A-GTGAGT- 1.16667 +--GGTAA-TT 1.12821 +AAGGTAT--- 1.125 +A--GTA-GTT 1.09756 +CAGGTA---T 1.05556 +AAGGTA---T 1 +CAGGT--GT- 1 +GAGGTA--T- 0.956522 +A-GGTA-GT- 0.871795 +AAGGT-A-T- 0.866667 +CAGGTA--A- 0.857143 
+GA-GTAAG-- 0.829787 +-AGGTTTG-- 0.818182 +A--GTAAGA- 0.809524 +--GGTGAG-A 0.795455 +-A-GTGAGA- 0.75 +-TGGTA-G-T 0.74 +GAGGTAT--- 0.729167 +--GGT-AGTG 0.695652 +--AGTAAG-A 0.607843 +-AGGTTTT-- 0.546875 +---GT-AGTT 0.488095 +---GTAAGT- 0.45 +---GTA-GTT 0.415929 +---GTGAGT- 0.413793 +AAGGTA---- 0.40566 +CAGGTA---- 0.375 +GAGGTA---- 0.367188 +-AGGTT-G-- 0.337838 +--GGT-AG-T 0.289855 +-A-GT-A-TT 0.28655 +---GTAAG-T 0.282051 +--GGTAA-T- 0.264957 +--GGTATG-- 0.264368 +AA-GT--GT- 0.259887 +--GGTAA-A- 0.244186 +A--GTAA--A 0.212644 +-ATGTA-G-- 0.171123 +-A-GTGA--A 0.15493 +--GGT--GT- 0.107914 +-AGGTT---- 0.0951684 +--GGTA-G-- 0.0941176 +-A-GT-AG-- 0.0711297 +--GGTA--T- 0.0663812 +-AAGTA---- 0.0590164 +---GTT-GT- 0.0509259 +--TGTAA--- 0.0499376 +-AGGTG---- 0.0425532 +--GGTA---- 0.0223793 +---GT-AG-- 0.0221502 +---GTA-G-- 0.0173088 +---GT-A-T- 0.0157715 +A--GT----T 0.0103437 +---GT--G-- 0.00594251 +---GT---T- 0.00467836 +---GT----- 0.00116399 +end consensus +type 5SS +center 3 +phase 1 +begin consensus +A-AGTGAGTT 9.25 +TT-GTAAGTT 9.25 +-TGGTGAGTT 8.25 +A-TGTAAGTT 8 +-T-GTAAGTT 5.09091 +--AGTGAGTT 4.33333 +A--GTAAGTT 3.66667 +--GGTTAGTT 3.125 +-AGGTAA-TT 2.66667 +TTGGTAAG-- 2.2 +--GGTTTGTT 2.1875 +CA-GTAAGT- 2.06667 +T--GTAAGTT 2.81818 +--GGTGAGTA 1.9375 +---GTGAGTT 1.70455 +-AGGTAA-A- 1.17188 +A--GTAAGT- 1.16667 +--AGT-AGTT 1 +-TGGTAAG-- 0.928571 +AA-GTAAG-- 0.909091 +--GGTA-GTT 0.884615 +---GTAAGTA 0.878049 +--GGTGAGA- 0.796875 +A--GTGAGT- 0.727273 +AAGGTAT--- 0.714286 +-TGGTT-GT- 0.696429 +--AGTA-GTT 0.6875 +---GTAAGAT 0.679245 +-AGGTT-GT- 0.660714 +---GTAAGT- 0.521739 +--GGTATG-A 0.516667 +-AGGTAA--- 0.403846 +--GGTGAG-- 0.342105 +--GGTA--TT 0.316384 +A--GT--GTT 0.283333 +---GTAAG-A 0.245283 +-AGGTA-G-- 0.228758 +-TGGTA-G-- 0.207317 +-A-GTGAG-- 0.19883 +-AGGTT-G-- 0.1875 +-TGGTAA--- 0.183333 +-AGGTG--T- 0.130802 +-AGGTA---- 0.109091 +---GT-AGT- 0.103792 +---GTAAG-- 0.103774 +---GT--GTT 0.0700637 +-A-GTA--T- 0.0614035 +-AGGTT---- 0.0442708 +---GTATG-- 0.0401554 +--GGTA---- 
0.0193694 +--GGT--G-- 0.0173797 +A--GT-A--- 0.0144654 +---GT-A--- 0.0048218 +---GT--G-- 0.00395005 +---GT----- 0.00155651 +end consensus +type 5SS +center 3 +phase 2 +begin consensus +-AAGTGAGTT 17.25 +-AAGTAAGTT 13.75 +-GAGTGAGTT 8.5 +TG-GTAAGTT 7.75 +A--GTAAGTT 4.26667 +T--GTAAGTT 4.09091 +-A-GTGAGTT 3.58333 +T--GTGAGTT 3.09091 +-AAGTAAGT- 3.08333 +-AAGTGAGT- 3 +A--GTGAGTT 2.90909 +-A-GTTAGTT 2.6875 +-AGGTAA-TT 2.5 +-A-GTAAGAA 2.4375 +-A-GTAAGAT 2.1875 +-A-GTATGTT 2.125 +A--GTAAGT- 1.24444 +--TGTAAGT- 1.13636 +--AGT-AGTT 1.13158 +-AGGTGAG-- 1.03333 +-AGGTAA--A 0.983051 +-AGGTT-GT- 0.966667 +--CGT-AGTT 0.857143 +-G-GTA-GTT 0.7 +--GGTAAGA- 0.660714 +--GGTT-GTT 0.645833 +-AGGTA-TT- 0.642857 +T--GT-AGTA 0.581818 +---GTAAGT- 0.5 +---GTA-GTT 0.429688 +---GTGAGT- 0.427711 +-A-GTAAG-- 0.381944 +-AGGTAA--- 0.298387 +-AGGTA-G-- 0.272222 +A--GTAAG-- 0.25 +-A-GT--GTT 0.228571 +-AAGTA-G-- 0.2 +---GTAA-TT 0.194444 +---GTGAG-A 0.194444 +---GTTAGT- 0.19209 +-AGGTTT--- 0.158333 +---GT-AGAT 0.153488 +-AGGTA---- 0.100379 +---GTAAG-- 0.097561 +--GGTAA--- 0.0759259 +-A-GT-A-T- 0.0700549 +--GGT--GT- 0.0628019 +AA-GTA---- 0.052381 +---GTATG-- 0.0413437 +--GGT--G-- 0.0202815 +-A-GT-A--- 0.0149254 +---GT---TT 0.0146813 +---GT---T- 0.00532449 +---GTA---- 0.00401891 +---GT----- 0.00131086 +end consensus +type 3SS +center 3 +phase all +begin consensus +CAGAAA 372 +CAGATT 316 +CAGATG 290 +CAGATC 263 +CAGGAA 253 +CAGAAT 246 +CAGAAC 221 +CAGGTT 218 +CAGATA 215 +CAGAGA 200 +CAGGAT 175 +CAGAAG 167 +CAGACA 160 +CAGCAA 158 +CAGACT 151 +CAGGT- 148.667 +CAGGAG 139 +CAGCTC 129 +CAGAGT 127 +CAGGCT 125 +CAGGCA 124 +CAGCTT 116 +CAG-TC 115 +CAGAGC 112 +CAGGGA 112 +CAGACG 109 +CAGCCA 106 +CAG-TT 100 +CAGGAC 100 +CAGCTG 96 +CAGCAT 95 +CAGACC 90 +CAGTGA 89 +CAGCTA 85 +CAGGGT 85 +CAGCAC 80 +TAGAAA 80 +CAGGCC 77 +CAG-TG 73 +CAGAGG 72 +CAG-CA 70 +TAGGAA 66 +CAGGCG 65 +CAG-AC 62 +CAGTTA 62 +CAG-AT 61 +CAGTG- 60.3333 +CAGCGT 55 +TAGGTT 53 +TAGAAT 51 +CAGCCG 50 +TAGAT- 49.25 +CAGTC- 43 +TAGGT- 40.6667 +CAGC-- 37 
+TAGG-T 32.3333 +TAGAC- 24.75 +TAG-AC 19.25 +TAG-CA 19 +TAG-GA 17.5 +TAGCT- 16 +-AGGGC 15 +AAGGT- 14.25 +TAG-A- 13 +AAGAT- 12.5 +TAG--T 10 +TAG--- 7.9375 +AAGA-- 5.25 +AAGG-- 4.63636 +AAG--- 1.84375 +-AG--- 1.60606 +end consensus +type 3SS +center 3 +phase 0 +begin consensus +CAGAAA 167 +CAGATT 160 +CAGGTT 160 +CAGATG 145 +CAGATC 143 +CAGGAA 139 +CAGAAT 113 +CAGGAT 111 +CAGAAC 100 +CAGGT- 86.3333 +CAGGCT 82 +CAGGCA 80 +CAGA-A 79.3333 +CAGCTT 79 +CAGGGA 79 +CAGGAG 77 +CAGCAA 75 +CAGCTC 72 +CAGACT 70 +CAGCCA 64 +CAGGGT 59 +CAGAAG 58 +CAG-TC 57 +CAGAGT 49 +CAG-AC 48.6667 +CAGGCC 48 +CAG-TT 45 +CAGACC 45 +TAGGAA 41 +CAGAGC 39 +CAGCAT 34 +CAGGCG 33 +CAGTTG 33 +TAGAAA 33 +TAGGAT 33 +TAGGT- 26.75 +CAG-TA 26 +TAGAT- 23.25 +CAGT-T 21.6667 +CAG-C- 19.7143 +CAG-GC 16.6667 +CAGC-- 15.2 +TAGAA- 14.6667 +TAGG-- 12.2 +TAGAC- 10 +AAGGT- 9.5 +TAG-T- 6.375 +TAG--A 5.85714 +CAG-GG 10 +TAG--T 4.57143 +AAGG-- 3.16667 +AAGA-- 2.875 +-AG--- 0.575221 +end consensus +type 3SS +center 3 +phase 1 +begin consensus +CAGATG 88 +CAGAAA 86 +CAGGAA 80 +CAGGTG 60 +CAGAA- 56.6667 +CAGAT- 56.3333 +CAGCTG 55 +CAGACG 53 +CAGGAG 52 +CAGCAA 43 +CAGGAT 43 +CAG-TC 37.3333 +CAGACT 33 +CAG-T- 26.8571 +CAGT-A 23.6667 +CAG-CA 24.6667 +CAG-AT 24 +CAGAG- 22 +CAG-CG 21.3333 +CAGTG- 19.6667 +CAG-AC 19 +CAG-CT 15 +TAG-AA 12.75 +TAGAT- 12.5 +CAG-CC 11.25 +CAGCG- 11.75 +TAGGT- 9.25 +CAGGG- 8 +TAGA-- 6 +TAG-T- 4.375 +TAGG-- 4.27273 +-AGCA- 2.75 +TAG--- 2.10526 +-AG-A- 1.44 +AAG--- 0.875 +-AG--- 0.145833 +end consensus +type 3SS +center 3 +phase 2 +begin consensus +CAGAAA 119 +CAGAGA 114 +CAGATT 100 +CAGAAT 79 +CAGAAC 76 +CAGAT- 66.3333 +CAGTGA 60 +CAGAG- 49 +CAGCAA 40 +CAGAAG 38 +CAGAC- 36.5 +CAG--A 23.1 +CAGTG- 24 +CAG--T 20 +TAGA-A 17.5 +CAG-TG 16.3333 +CAG--C 14.5455 +TAGA-T 12.5 +CAG--G 8.875 +TAGA-C 8.5 +TAGG-A 8.5 +TAG-T- 4.33333 +AAGA-- 2.5 +TAG--- 2.5 +AAG--- 0.6875 +-AG--- 0.15625 +end consensus +type CDS +phase all +begin consensus +AAA 84633.000000 +AAC 46592.000000 +AAG 60437.000000 +AAT 62516.000000 +ACA 
46370.000000 +ACC 24553.000000 +ACG 26490.000000 +ACT 35930.000000 +AGA 58310.000000 +AGC 33692.000000 +AGG 25680.000000 +AGT 32467.000000 +ATA 28413.000000 +ATC 49079.000000 +ATG 55085.000000 +ATT 57612.000000 +CAA 70444.000000 +CAC 32088.000000 +CAG 36967.000000 +CAT 39211.000000 +CCA 41062.000000 +CCC 12109.000000 +CCG 20902.000000 +CCT 16745.000000 +CGA 38937.000000 +CGC 16213.000000 +CGG 19184.000000 +CGT 23512.000000 +CTA 19576.000000 +CTC 36258.000000 +CTG 35452.000000 +CTT 37193.000000 +GAA 78165.000000 +GAC 30372.000000 +GAG 41478.000000 +GAT 54272.000000 +GCA 33687.000000 +GCC 21235.000000 +GCG 14088.000000 +GCT 33925.000000 +GGA 53236.000000 +GGC 16915.000000 +GGG 11380.000000 +GGT 18746.000000 +GTA 18214.000000 +GTC 27126.000000 +GTG 28827.000000 +GTT 37436.000000 +TAA 21016.000000 +TAC 24215.000000 +TAG 11205.000000 +TAT 34251.000000 +TCA 57624.000000 +TCC 32884.000000 +TCG 36367.000000 +TCT 41910.000000 +TGA 53826.000000 +TGC 36079.000000 +TGG 43985.000000 +TGT 36905.000000 +TTA 24513.000000 +TTC 56201.000000 +TTG 51452.000000 +TTT 46581.000000 +end consensus +type CDS +phase 0 +begin consensus +AAA 29214.000000 +AAC 14368.000000 +AAG 21430.000000 +AAT 23711.000000 +ACA 15287.000000 +ACC 7556.000000 +ACG 6113.000000 +ACT 15236.000000 +AGA 13083.000000 +AGC 6073.000000 +AGG 2390.000000 +AGT 9505.000000 +ATA 5289.000000 +ATC 14308.000000 +ATG 19502.000000 +ATT 24404.000000 +CAA 23759.000000 +CAC 6982.000000 +CAG 11123.000000 +CAT 11541.000000 +CCA 22932.000000 +CCC 2834.000000 +CCG 6380.000000 +CCT 7320.000000 +CGA 9565.000000 +CGC 4160.000000 +CGG 2879.000000 +CGT 10219.000000 +CTA 5048.000000 +CTC 11747.000000 +CTG 8079.000000 +CTT 18194.000000 +GAA 35701.000000 +GAC 13432.000000 +GAG 19257.000000 +GAT 31475.000000 +GCA 16802.000000 +GCC 10031.000000 +GCG 5651.000000 +GCT 20774.000000 +GGA 28462.000000 +GGC 4276.000000 +GGG 2708.000000 +GGT 9185.000000 +GTA 7376.000000 +GTC 10825.000000 +GTG 10041.000000 +GTT 20526.000000 +TAA 0.000000 +TAC 
10158.000000 +TAG 0.000000 +TAT 12776.000000 +TCA 16414.000000 +TCC 7520.000000 +TCG 9109.000000 +TCT 14215.000000 +TGA 0.000000 +TGC 6122.000000 +TGG 7942.000000 +TGT 8393.000000 +TTA 6535.000000 +TTC 18524.000000 +TTG 15014.000000 +TTT 14468.000000 +end consensus +type CDS +phase 1 +begin consensus +AAA 27124.000000 +AAC 17085.000000 +AAG 28251.000000 +AAT 16214.000000 +ACA 16360.000000 +ACC 5893.000000 +ACG 13413.000000 +ACT 9274.000000 +AGA 18727.000000 +AGC 12003.000000 +AGG 12559.000000 +AGT 8521.000000 +ATA 16240.000000 +ATC 19315.000000 +ATG 28352.000000 +ATT 15596.000000 +CAA 24103.000000 +CAC 13923.000000 +CAG 18346.000000 +CAT 15063.000000 +CCA 10417.000000 +CCC 4057.000000 +CCG 9508.000000 +CCT 3959.000000 +CGA 9972.000000 +CGC 4083.000000 +CGG 7638.000000 +CGT 5560.000000 +CTA 8897.000000 +CTC 15780.000000 +CTG 22899.000000 +CTT 9969.000000 +GAA 18216.000000 +GAC 8305.000000 +GAG 14912.000000 +GAT 9677.000000 +GCA 6876.000000 +GCC 4330.000000 +GCG 5051.000000 +GCT 4374.000000 +GGA 5662.000000 +GGC 3760.000000 +GGG 3721.000000 +GGT 2776.000000 +GTA 6705.000000 +GTC 9254.000000 +GTG 14905.000000 +GTT 6438.000000 +TAA 7340.000000 +TAC 4898.000000 +TAG 5780.000000 +TAT 6230.000000 +TCA 20217.000000 +TCC 9131.000000 +TCG 15798.000000 +TCT 10258.000000 +TGA 18951.000000 +TGC 9200.000000 +TGG 15809.000000 +TGT 8676.000000 +TTA 11663.000000 +TTC 21772.000000 +TTG 29546.000000 +TTT 14611.000000 +end consensus +type CDS +phase 2 +begin consensus +AAA 28295.000000 +AAC 15139.000000 +AAG 10756.000000 +AAT 22591.000000 +ACA 14723.000000 +ACC 11104.000000 +ACG 6964.000000 +ACT 11420.000000 +AGA 26500.000000 +AGC 15616.000000 +AGG 10731.000000 +AGT 14441.000000 +ATA 6884.000000 +ATC 15456.000000 +ATG 7231.000000 +ATT 17612.000000 +CAA 22582.000000 +CAC 11183.000000 +CAG 7498.000000 +CAT 12607.000000 +CCA 7713.000000 +CCC 5218.000000 +CCG 5014.000000 +CCT 5466.000000 +CGA 19400.000000 +CGC 7970.000000 +CGG 8667.000000 +CGT 7733.000000 +CTA 5631.000000 +CTC 8731.000000 
+CTG 4474.000000 +CTT 9030.000000 +GAA 24248.000000 +GAC 8635.000000 +GAG 7309.000000 +GAT 13120.000000 +GCA 10009.000000 +GCC 6874.000000 +GCG 3386.000000 +GCT 8777.000000 +GGA 19112.000000 +GGC 8879.000000 +GGG 4951.000000 +GGT 6785.000000 +GTA 4133.000000 +GTC 7047.000000 +GTG 3881.000000 +GTT 10472.000000 +TAA 13676.000000 +TAC 9159.000000 +TAG 5425.000000 +TAT 15245.000000 +TCA 20993.000000 +TCC 16233.000000 +TCG 11460.000000 +TCT 17437.000000 +TGA 34875.000000 +TGC 20757.000000 +TGG 20234.000000 +TGT 19836.000000 +TTA 6315.000000 +TTC 15905.000000 +TTG 6892.000000 +TTT 17502.000000 +end consensus +type Intron_Corr_Term +phase all + 93.3478 +type Intron_Corr_Term +phase 0 + 194.143 +type Intron_Corr_Term +phase 1 + 362.18 +type Intron_Corr_Term +phase 2 + 357.052 +type Intron_emission +begin consensus +A 572525.000000 +C 267017.000000 +G 250100.000000 +T 588809.000000 +end consensus +type Pyrimidine_emission +begin consensus +A 0.000000 +C 0.000000 +G 0.000000 +T 22799.000000 +end consensus +type Spacer_emission +begin consensus +A 117.000000 +C 183.000000 +G 73.000000 +T 172.000000 +end consensus +type Central_Intron_Stay_Prob + 0.994723 +type Pyrimidine_Stay_Prob + 0.719098 +type No_Spacer_Prob + 0.687177 +type Spacer_Stay_Prob + 0.163615 diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/translate.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/translate.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,193 @@ +#!/usr/bin/perl -w +use strict; +use FindBin; +use lib "$FindBin::Bin/lib"; +use Getopt::Long; +use Bio::Perl; +use Bio::SeqIO; +use File::Copy; + +# PROGRAMNAME: translate.pl + +# AUTHOR: INGO EBERSBERGER, ingo.ebersberger@univie.ac.at + +# PROGRAM DESCRIPTION: + +# DATE: Tue May 12 14:03:34 CEST 2009 + + +# DATE LAST MODIFIED: +######################## start main ############################# +my $help; +my @out; +my @estout; +my $infile; +my $trunc = 1; +my $outfile = "translate_tc.out"; +my $limit = 20; ## this 
sets the maximum length for the sequence identifier. If sequence identifier are +## too long, then one can run into troubles with the parsing of the hmmsearch results. +open (LOG, ">>hamstrsearch.log") or warn "could not open logfile for writing\n"; +print LOG "##### Translation of the ESTs########"; +######### +my $usage = "Name:\n\ttranslate.pl\n +Synopsis:\n\ttranslate_tc5.pl [-infile=FILE] [options] [-outfile=FILE]\n +Description:\n\tThis program takes a batch fasta-file with DNA +\tsequences as an input and translates the individual DNA sequences in +\tall six reading frames. +\t-infile: provide the relative or absolute path of the infile\n +\t-outfile: provide the realtive or absolute path of the +\toutfile. Default is: translate_tc.out\n +\ttrunc: set -trunc=0 to prevent truncation of the sequence header (see below). +\t-h: prints this help-message\n +NOTE: if the seq-id (everything up to the first [[:space:]]) contains a '|' everything between the '>' and the '|' will be taken as seq-id. Otherwise, the entire seq-id will be used. You can change this behavior by setting -trunc=0\n +NOTE: the script as an automated routine to check for unique sequence names in the input file. This may lead to cases where the $trunc value is overruled and additionally part of the sequence description may be included."; +########## + +GetOptions ( + "h" => \$help, + "infile=s" => \$infile, + "outfile=s" => \$outfile, + "trunc=s" => \$trunc); +if ($help) { + print "$usage"; + exit; +} +if (-e "$outfile") { + print LOG "an outfile $outfile already exists. 
Renaming to $outfile.old\n\n"; + my $newname = "$outfile.old"; + rename($outfile, $newname); +} +##BUG -- BIOPERL GUESSES PROTEIN FILE FORMAT WHEN AMBIGUITY CODES ARE PRESENT +##CAUSING AN ERROR IN THE TRANLATE_6 FRAMES, WHICH INTERRUPTS ALL TRANSLATION -- THO +#below replaces read_all_sequences function to declare that sequence is DNA +#original line next +#my @seq_object = read_all_sequences($infile, 'fasta'); + + my $tempseqio; + $tempseqio = Bio::SeqIO->new( '-file' => $infile, '-format' => 'fasta'); + my @seq_object; + + while( my $seq = $tempseqio->next_seq() ) { + $seq->alphabet('dna'); + push(@seq_object,$seq); + } +#End THO changes + + +## determine whether the seq-ids are unique given the chosen value for $trunc +my ($message, $cont, $check) = &checkIds(); +if ($cont == 1) { + ## the check for unique identifiers has failed and the programm is exiting + print LOG "$message\n"; + exit; +} +else { + print LOG "All sequence identifier are unique!\n"; + if ($check == 2) { + my $newname = "$infile.original"; + rename($infile, $newname); + print LOG "Sequence description was needed to make seq-id unique. 
The original version of the infile was stored in $infile.original\n"; + } + for (my $j = 0; $j < @seq_object; $j++) { + my $finalid = $seq_object[$j]->{finalid}; + my $estseq = $seq_object[$j]->seq; + my $inid = $seq_object[$j]->display_id; + my @all_trans = Bio::SeqUtils->translate_6frames($seq_object[$j]); + for (my $i = 0; $i < @all_trans; $i++) { + my $count = $i+1; + my $id = $all_trans[$i]->display_id; + my $seq = $all_trans[$i]->seq; + $id =~ s/$inid/$finalid/; + $id =~ s/-[0-9][RF]/_RF$count.0/; + push @out, ">$id\n$seq"; + } + push @estout, ">$finalid\n$estseq"; + if ($j%100 == 0) { + print "$j Sequences processed\n"; + open (OUT, ">>$outfile") or die "failed to open outfile\n"; + print OUT join "\n", @out; + print OUT "\n"; + @out = qw(); + close OUT; + if ($check == 2) { + ## part of the description was added to the seq-id + open (OUT, ">>$infile"); + print OUT join "\n", @estout; + print OUT "\n"; + @estout = qw(); + } + } + } + open (OUT, ">>$outfile") or die "failed to open outfile\n"; + print OUT join "\n", @out; + print OUT "\n"; + @out = qw(); + close OUT; + if ($check == 2) { + ## part of the description was added to the seq-id + open (OUT, ">>$infile"); + print OUT join "\n", @estout; + print OUT "\n"; + close OUT; + @estout = qw(); + } +} +close LOG; +exit; +########################## start sub ################ +sub checkIds { + my $message; + my $check = 1; + my $cont = 1; + my $counter; + ## Everything up to the first whitespace + ## in the fasta header will be taken as sequence id by bioperl. If this + ## id contains a '|' and $trunc is set to 1 (default), the ids may no longer + ## be unique. 
This will be checked and if necessary the id will not be truncated + ## for $check == 0, the truncated version of the id will be checked (only if $trunc == 1) + ## for $check == 1, the complete id will be checked + ## for $check == 2, the first 20 characters of the concatenated id and description + ## will be checked + if ($trunc == 1) { + $check = 0; + } + + while ($check < 3 and $cont == 1) { + $cont = 0; + for (my $i=0; $i < @seq_object; $i++) { + my $id = $seq_object[$i]->display_id; + $id =~ s/(.{0,$limit}).*/$1/; + if ($check == 0) { + $id =~ s/|.*//; + } + elsif ($check == 2) { +print "Check 2: ".$seq_object[$i]->display_id." \n"; + $id = $id . '_' . $seq_object[$i]->desc; + $id =~ s/(.{0,$limit}).*/$1/; + } + if (defined $counter->{$id}) { + if ($check == 0) { + $message = "trying next without truncating the id"; + } + elsif ($check == 1) { + $message = 'trying next to include sequence description'; + } + else { + $message = "Sequence identifier are not unique, using the first 20 characters. Aborting..."; + } + print LOG "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n"; + print "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n"; + $check ++; + $cont = 1; + $counter = undef; + last; + } + else { + $counter->{$id} = 1; + $seq_object[$i]->{finalid} = $id; + } + } + } + ## return the value of $cont. If this is 1, then the sequence id check has failed. + return($message, $cont, $check); +} diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/translate.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/translate.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,23 @@ + + Runs Translate + translate.pl -infile=$inFile 2>log.txt + + + + + + + + + + + Translate is a part of Hamster. + + Note: Translate will only read the first 20 characters of a header. If there are identical + headers within your file, the script will crash. 
+ + Input: FASTA file containing ESTs. + + + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/ucsb_galaxy_hamster.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/ucsb_galaxy_hamster.sh Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,87 @@ +#!/bin/sh +# set -x +# Uncomment 'set -x' for debug information + +############################################# +# UCSB HAMSTER - GALAXY HISTORY # +# # +# Executed when user chooses Galaxy history # +############################################# + +# Set your hamster script location here. The directory containing these scripts should be in +#your path +script="hamstrsearch_local-hmmer3.pl" +# Set your unbuild.py script location here +unbuild="unbuild.py" +# Set your emap2fasta.pl script location here +emap2fasta="emap2fasta.pl" + + +# 1 - Sequence input file +# 2 - Proteins results output file +# 3 - CDS results output file +# 4 - Screen log +# 5 - Species name +# 6 - whether to use EST flag D=DNA so use -est flag P=Protein so do not use -est flag in hmmstr call +# 7 - HMM Input from UCSB HMMBUILD +# 8 - MUSCLE data from UCSB MUSCLE +# 9 - Reference Species File +# 10 - Reference Species Name + +input=$1 +proteins=$2 +cdsfile=$3 +screenlog=$4 +speciesName=$5 +datatype=$6 +hmm_data=$7 +muscle_data=$8 +filepath=`pwd` +tail="_prot" +tail2="_temp" + +# set flag based on input +if [ $datatype = "P" ]; + then + estflag="-protein" + else + estflag="-est" +fi + +refspfile=${9} +refsphist=${10} + + + +echo "Protein or EST? 
: $estflag" >> $screenlog +echo "Reference genome file from galaxy history: $refspfile" >> $screenlog +echo "Reference species genome name: $refsphist" >> $screenlog + +# unbuild.py here on $hmm_data +mkdir core +mkdir core/hmm_dir +cp $hmm_data core/core.fa + +$unbuild core/hmm_dir core/core.fa +cp core/hmm_dir/hmmlist.txt core/hmmlist.txt + +# use formatdb to generate new blastdb from this input file +refsphistGALAXY=$refsphist +mkdir $refsphistGALAXY + +cp $muscle_data $refsphistGALAXY/$refsphist$tail2 +$emap2fasta $refsphistGALAXY/$refsphist$tail2 $refsphist +cp full.fasta core/core.fa + +cp $refspfile $refsphistGALAXY/$refsphist$tail +cd $refsphistGALAXY +formatdb -t $refsphist -i $refsphist$tail -n $refsphist$tail + +echo "*** Direcotry Structure of Ref. Genome ***" >> $screenlog +ls -l >> $screenlog +echo >> $screenlog + +cd $filepath + +# script execution +$script -sequence_file=$1 $estflag -taxon=$5 -hmmset=core -refspec=$refsphistGALAXY -galaxyout=$2 -2galaxyout=$cdsfile 2>log.txt >> $screenlog diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/ucsb_hamster.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/ucsb_hamster.sh Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,67 @@ +#!/bin/sh + +############################################# +# UCSB HAMSTER # +# # +# Executes Hamster with given XML parameter # +############################################# + +#the name of the script is here +#script="/home/osiris/galaxy-dist/tools/osiris/orthologs/ucsb_hamster/hamstrsearch_local-hmmer3.pl" +script="hamstrsearch_local-hmmer3.pl" + + + +#Variables input from xml +# 1 - Sequence input file +# 2 - Proteins Results Output file +# 3 - cds Results Output file +# 4 - Screen Log +# 5 - Species Name +# 6 - Whether to use -est flag (if D) or not (if P) +# 7 - Core ortholog name +# 8 - Base path for local core orthologs +# 9 - Base path for local reference blast database +#10 - Reference genome + +input=$1 +proteins=$2 +cdsfile=$3 +screenlog=$4 
+species=$5 +datatype=$6 +core=$7 +corepath=$8 +blastpath=$9 +genome=${10} + +echo "ucsb_hamster.sh script parameters" >> $screenlog +echo "Core ortholog name is $core " >> $screenlog +echo "Reference genome name is $genome " >> $screenlog +echo "Species name is $species " >> $screenlog +echo "Datatype $datatype " >> $screenlog + +#set flag based on input +if [ $datatype = "P" ]; + then + estflag="-protein" + else + estflag="-est" + fi + +# First copy hmm's to working directory +# Currently copies from Data directory +mkdir $core +mkdir $core/hmm_dir +cp -r $corepath/* ./$core/ +echo "cp $corepath/hmm_dir/* ./$core/hmm_dir/" >> $screenlog +cp -r $corepath/hmm_dir/* ./$core/hmm_dir/ + +# Currently copies from data directory +mkdir $genome +cp $blastpath/* ./$genome/ + +# Now call the actual Hamster Script +#$script -sequence_file=$input $estflag -taxon=$species -hmmset=$core -refspec=$genome -galaxyout=$proteins -2galaxyout=$cdsfile >> $screenlog 2>log.txt +$script -sequence_file=$input $estflag -taxon=$species -hmmset=$core -refspec=$genome -galaxyout=$proteins -2galaxyout=$cdsfile >> $screenlog + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/ucsb_hamster.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/ucsb_hamster.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,69 @@ + + UCSB Hamster is a Galaxy port for the Hamster script which finds orthologs based on HMM's. 
+ + #if $db_opts.db_opts_selector == "local": + ucsb_hamster.sh $infile $proteins $cds $log $species $estflag + ${db_opts.database.fields.value} + ${db_opts.database.fields.corepath} + ${db_opts.database.fields.blastpath} + ${db_opts.database.fields.refsp} + #elif $db_opts.db_opts_selector == "hist": + ucsb_galaxy_hamster.sh $infile $proteins $cds $log $species $estflag + ${db_opts.hmm_input} + ${db_opts.muscle_data} + ${db_opts.refspgalaxyhist} + ${db_opts.refspgalaxyname} + #else: + #end if# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +HaMster finds orthologs in a fasta file. Orthologs are found by searching an input fasta file using HMM models. The HMM models come from aligning "core orthologs" from whole genomes. The best hit is then checked against a reference genome using BLAST. + http://www.biomedcentral.com/1471-2148/9/157 + + + diff -r 000000000000 -r 5b9a38ec4a39 orthologs/ucsb_hamster/unbuild.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/unbuild.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,54 @@ +#!/usr/bin/env python +''' +Usage: unbuild.py outputDirectoryName hmmFileWithTabs +This script takes a tab sperated, newline escaped HMM models file then splits the models up and makes a file list. +The output files are put into the directory outputDirectoryName. +''' +from sys import argv, exit +import os + + +def parse(line): + line = line.partition('\t') + name = line[0] + ".hmm" + data = line[2].replace("\\n", "\n") + return name, data + + +def writeFile(location, data): + try: + with open(location, 'w') as f: + f.write(data) + except: + pass + + +def createDir(dir): + if not os.path.exists(dir): + os.makedirs(dir) + + +if __name__ == "__main__": + if len(argv) != 3: + print "Usage: unbuild.py outputDirectoryName hmmFileWithTabs" + exit() + dir = argv[1] + createDir(dir) + + name_list = [] # Stores new file names + + # Read data and seperate into files. 
+ with open(argv[2], 'r') as f: + for line in f: + name, data = parse(line) + if data.strip() == "": # no data? skip it. + continue + name_list.append(name) + name = dir + os.sep + name + writeFile(name, data) + + # Write new file names to a file. + name_list_location = dir + os.sep + "hmmlist.txt" + with open(name_list_location, 'w') as f: + for name in name_list: + f.write(name + "\n") diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/addstring2fashead.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/addstring2fashead.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,21 @@ +#!/usr/bin/perl -w + +use strict; + +use FindBin; +use lib "$FindBin::Bin/lib"; +#use Bio::DB::Fasta; +use Bio::SeqIO; +use Bio::Seq; + +# open infile fasta file + my $in_obj = Bio::SeqIO->new(-file => $ARGV[0], '-format' =>'fasta'); + + my $currentinput = $ARGV[1]; + +# grab sequence object + while (my $seq = $in_obj->next_seq() ) { + my $seq_obj = $in_obj; + print ">".$currentinput.$seq->id."\n"; + print $seq->seq."\n"; + } diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/addstring2fashead.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/addstring2fashead.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,51 @@ + + Converts FASTA file with sequences from same species and gene family to PHYTAB format + + addstring2fashead.pl $infile $species > $outfile + 2>&1 + + + + + + + + + +**What it does** + +This tool adds a text string to the beginning of all headers of an input file in FASTA format. The FASTA +input file should have sequences from the same species and gene family. The output is a PHYTAB format file. + +------ + +**Inputs** + +FASTA file of sequences from same species and gene family. FASTA format: +http://www.ncbi.nlm.nih.gov/blast/blastcgihelp.shtml + +------ + +**Outputs** + +PHYTAB file format. 
Description: +http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a +publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/beautify_fasta.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/beautify_fasta.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,50 @@ +#!/usr/bin/perl -w + +use strict; + +use FindBin; +use lib "$FindBin::Bin/lib"; +use Bio::DB::Fasta; +use Bio::SeqIO; +use Bio::Seq; + +#inputs +my $infile=shift(@ARGV); +my $outfile=shift(@ARGV); +my $delpipes=shift(@ARGV); +my $convgi=shift(@ARGV); +my $delslash=shift(@ARGV); +my $renumber=shift(@ARGV); +my $space=shift(@ARGV); + +my $seqid; +my $newnumbers=1; #for sequential renumbering of header +# open infile fasta file +my $in_obj = Bio::SeqIO->new(-file => $infile, '-format' =>'fasta'); +open FILE, ">$outfile" or die $!; + +while (my $seq = $in_obj->next_seq() ) { + my $sequence = $seq->seq; + $seqid = $seq->id; + + if($delslash eq 'yes'){ + $seqid =~ s/\\/_/g; + } + if($convgi eq 'yes'){ + $seqid =~ s/gi\|/gi_/g; + } + if($delpipes eq 'yes'){ + $seqid =~ s/\|/ /g; + } + $sequence =~ s/\n//g; + $sequence =~ tr/a-z/A-Z/; + print FILE ">"; + if($renumber eq 'yes'){ + print FILE $newnumbers; + if($space eq 'yes'){ + print FILE " "; + } + $newnumbers++; + } + print FILE $seqid." 
".$seq->desc."\n".$sequence."\n"; +} diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/beautify_fasta.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/beautify_fasta.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,55 @@ + + Converts interleaved FASTA to non-interleaved + + beautify_fasta.pl $infile $outfile $delpipes $convgi $delslash $renumber $space + + + + + + + + + + + + + **What it does** + +Input an interleaved FASTA file. Beautify_FASTA converts interleaved FASTA to +non-interleaved (full sequence on one line). In addition, you can choose to +convert symbols from the FASTA ID. The FASTA ID is the descriptor from the > +to the first space. The FASTA description (first line after any space) is not changed. + +------ + +**Inputs** + +Interleaved FASTA file. + +------ + +**Outputs** + +Non-interleaved FASTA file with optional modifications to the FASTA ID. + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the +osiris_phylogenetics site at bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please +consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + + diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/change_sp.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/change_sp.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,70 @@ +#!/usr/bin/perl -w +use strict; + +my $infile = $ARGV[0]; +my $changefile = $ARGV[1]; +my $outfile = $ARGV[2]; + +open(IN, "$infile") or exit; +open(CHANGE, "$changefile"); +open(OUT, ">$outfile") or exit; + + +my %speciesFor; #Hash to associate code with species name + +while () +{ + chomp; + my $currentinput = "$_"; + if($currentinput =~m /\t/){ #must have a tab otherwise wrong file format + if($currentinput =~m /\t\t/){ + print OUT "ERROR: file contains 2 tabs in a row. Check phytab format.\n"; + die("ERROR: file contains 2 tabs in a row. Check it is in phytab format"); + }else{ + my @changepair = split(/\t/, $currentinput); + my $codename=$changepair[0]; + my $sp_name = $changepair[1]; + if (exists $speciesFor{$codename}) { + print OUT "ERROR: Species name specification for $codename is duplicated\n"; + die("ERROR: Species name specifiation for for $codename is duplicated\n"); + }else{ + $speciesFor{$codename}=$sp_name; + } + } + }else{ + die "ERROR: Species conversion table must be genefamily\tmodel and contain no blank lines\n"; + } +} +while () { + chomp; + my $currentinput = "$_"; + if($currentinput =~m /\t/){ #must have a tab otherwise wrong file format + if($currentinput =~m /\t\t/){ + print OUT "ERROR: file contains 2 tabs in a row. 
Check phytab format.\n"; + die; + }else{ + my @changepair = split(/\t/, $currentinput); + my $sp_name=$changepair[0]; + + if (exists $speciesFor{$sp_name}) { + $currentinput =~s /$sp_name/$speciesFor{$sp_name}/ ; + print OUT $currentinput."\n"; + }else{ + print OUT $currentinput."\n"; + } + } + }else{ + die "ERROR: Input a PHYTAB file in Tabular format\n"; + } +} +close(IN); +close(OUT); +close(CHANGE); + +sub change +{ + + my $changetext = shift; + $changetext =~ s/ /_/g; + return $changetext; +} diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/change_sp.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/change_sp.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,47 @@ + + Change code name to full species name in fasta or phytab + + change_sp.pl $infile $changefile $outfile + + + + + + + + + +**What it does** + +Many times a dataset will have code names instead of full species names. This will search for a code name and replace with a species name based on a table of input values. + +------ + +**Inputs** + +PHYTAB with species codes + +------ + +**Outputs** + +PHYTAB with full species names replacing codes +------ + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the +osiris_phylogenetics site at bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use +of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/fasta2phylip-e.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/fasta2phylip-e.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,53 @@ + + Convert Aligned FASTA to PHYLIP extended + + seqConverterG.pl -d$input $format -O$output + + + + + + + + + + + + + +**What it does** + +FASTA2PHYLIP-E uses seqConverter.pl (Bininda-Emonds, 2010) to convert an aligned FASTA input file to PHYLIP extended file format. + +------ + +**Inputs** + +Aligned FASTA file. + +------ + +**Outputs** + +PHYLIP extended file. + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a +publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Bininda-Emonds, O.R.P. 2010. seqConverter.pl. Program distributed by the author. AG Systematik und +Evolutionsbiologie, IBU - Fakultät V, Carl von Ossietzky Universität Oldenburg. 
+ +http://www.molekularesystematik.uni-oldenburg.de/33997.html#Sequences + + + diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/fasta2phytab.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/fasta2phytab.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,40 @@ +#!/usr/bin/perl -w + +use strict; + +use FindBin; +use lib "$FindBin::Bin/lib"; +use Bio::DB::Fasta; +use Bio::SeqIO; +use Bio::Seq; + +#inputs +my $infile=shift(@ARGV); +my $species=shift(@ARGV); +my $partition=shift(@ARGV); +my $delpipes=shift(@ARGV); +my $fromfasta; +#for debugging xml input +#print "$infile $species $partition $delpipes\n"; +#exit; + +if($species eq "from fasta"){ + $fromfasta=1; +} +my $seqid; +# open infile fasta file +my $in_obj = Bio::SeqIO->new(-file => $infile, '-format' =>'fasta'); + +#no warnings 'uninitialized'; #Was getting error on one fasta for uninitialized sequences. Never could track down why and used this as a workaround +while (my $seq = $in_obj->next_seq() ) { + my $sequence = $seq->seq; + $seqid = $seq->id; + if($delpipes eq 'yes'){ + $seqid =~ s/\|/_/g; + } + if($fromfasta){ + $species = $seqid; + } + $sequence =~ s/\n//g; + print $species."\t".$partition."\t".$seqid."\t".$sequence."\n"; +} diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/fasta2phytab.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/fasta2phytab.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,66 @@ + + Converts FASTA file with sequences from same gene family to PHYTAB format + + #if $sp_opts.sp_opts_selector=="fasta" + fasta2phytab.pl $infile 'from fasta' $partition $convert > $outfile + #else + fasta2phytab.pl $infile $sp_opts.species $partition $convert > $outfile + #end if + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +FASTA2PHYTAB takes an input FASTA file with sequences from the same gene family, and adds species name - either from the fasta header or from a single text entry - and +partition (gene family) name that will apply for all 
sequences. The output is a PHYTAB tabular format file. + +------ + +**Inputs** + +A FASTA file with all sequences from the same gene family. + +------ + +**Outputs** + +PHYTAB file format: http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the +osiris_phylogenetics site at bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a +publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/gb2phytab.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/gb2phytab.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,41 @@ +#!/usr/bin/perl -w +use strict; + +use Bio::SeqIO; + + +my $datafile = $ARGV[0]; +my $outfile = $ARGV[1]; + +open FILE, ">$outfile" or die "Cannot Write File\n"; + +my $seqio_object = Bio::SeqIO->new(-file => $datafile,'-format' => 'genbank'); + +while(my $seq_object = $seqio_object->next_seq){ + my $organism = $seq_object->species->binomial(); + $organism =~ s/ /_/g; + my $accession = $seq_object->id; + for my $feat_object ($seq_object->get_SeqFeatures) { + if ($feat_object->primary_tag eq "CDS") { + my $sequence = $feat_object->spliced_seq->seq; + if ($feat_object->has_tag('gene')) { + for my $name ($feat_object->get_tag_values('product')){ + $name =~ s/ /_/g; + print FILE $organism."\t".$name."\t".$accession."\t".$sequence."\n"; + } + } + }elsif ($feat_object->primary_tag eq "misc_RNA") { + my $sequence = $feat_object->spliced_seq->seq; + if ($feat_object->has_tag('product')) { + for my $name ($feat_object->get_tag_values('product')){ + $name =~ s/ /_/g; + print FILE 
$organism."\t".$name."\t".$accession."\t".$sequence."\n"; + } + } + } + + } +} +close FILE; + + diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/gb2phytab.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/gb2phytab.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,51 @@ + + Convert GenBank format flat file to PHYTAB tabular file + + raxml + + + gb2phytab.pl $data_file $outfile 2>&1 + + + + + + + + +**What it does** + +This tool converts a file in the GenBank (NCBI) flat file format to the PHYTAB tabular format. + +------ + +**Inputs** + +GenBank flat file format. Description: http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord + +------ + +**Outputs** + +PHYTAB tabular file format. Description:: +http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +------ + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the +osiris_phylogenetics site at bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a +publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/length_outliers.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/length_outliers.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,59 @@ +#!/usr/bin/perl -w + +use strict; + +use Bio::DB::Fasta; +use Bio::SeqIO; +use Bio::Seq; + +#inputs +my $infile=shift(@ARGV); +my $outfile=shift(@ARGV); +my $deloutfile=shift(@ARGV); +my $percent=shift(@ARGV); + +my $seqid; +my $newnumbers=1; #for sequential renumbering of header + +open FILE, ">$outfile" or die $!; +open DELFILE, ">$deloutfile" or die $!; + + +# open infile fasta file to get average length +my $in_obj = Bio::SeqIO->new(-file => $infile, '-format' =>'fasta'); + +my $seqcount; +my $seqsum = 0; +my $avelen; +while (my $seq = $in_obj->next_seq() ) { + my $sequence = $seq->seq; + my $seqlen = length($sequence); + $seqcount++; + $seqsum = $seqsum + $seqlen; +} +$avelen = $seqsum/$seqcount; +print "AVE= $avelen \n"; + + +# open infile fasta file to get average length +$in_obj = Bio::SeqIO->new(-file => $infile, '-format' =>'fasta'); + +while (my $seq = $in_obj->next_seq() ) { + my $sequence = $seq->seq; + $seqid = $seq->id; + $sequence =~ s/\n//g; + $sequence =~ tr/a-z/A-Z/; + my $seqlen = length($sequence); + + if($seqlen > ($avelen * ($percent/100) ) ){ + print FILE ">"; + print FILE $seqid." ".$seq->desc."\n".$sequence."\n"; + }else{ +print "Writing sequence of $seqlen to DELFILE\n"; + print DELFILE ">"; + print DELFILE $seqid." 
".$seq->desc."\n".$sequence."\n"; + } +} + +close FILE; +close DELFILE; diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/length_outliers.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/length_outliers.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,49 @@ + + Identifies sequences shorter than average in FASTA file + + length_outliers.pl $infile $outfile1 $outfile2 $percent + + + + + + + + + + +**What it does** + +Deletes sequences from a fasta file that are shorter than a specified percentage of the average of all sequences. + +------ + +**Inputs** + +FASTA + +------ + +**Outputs** + +2 FASTA files: one with sequences that are long enough, the other with sequences that are too short. + +------ + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the +osiris_phylogenetics site at bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use +of this tool in a publication, please consider citing the following. 
+
+Current Osiris Citation is here
+
+http://osiris-phylogenetics.blogspot.com/2012/10/citation.html
+
+
diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/phylip2fasta.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloconversion/phylip2fasta.pl Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,34 @@
+#!/usr/bin/perl -w
+use strict;
+
+use Bio::AlignIO;
+use Bio::SimpleAlign;
+
+# phylip2fasta.pl -- convert a PHYLIP alignment to FASTA on STDOUT.
+#
+# Usage: phylip2fasta.pl infile interleaved idlength > outfile
+#   infile       PHYLIP file to read
+#   interleaved  handed to Bio::AlignIO as -interleaved
+#   idlength     handed to Bio::AlignIO as -idlength; you can set the name
+#                length to something other than the default 10 if you use a
+#                version of phylip (hacked) that accepts ids > 10
+my ($infile, $interlv, $idlen) = @ARGV;
+
+# Writer: FASTA goes to STDOUT, which the caller redirects to the output file
+my $outstream = Bio::AlignIO->new(
+    -format => 'fasta',
+    -fh     => \*STDOUT,
+);
+
+# Reader for the PHYLIP input
+my $phylipstream = Bio::AlignIO->new(
+    -interleaved => $interlv,
+    -format      => 'phylip',
+    -file        => '<'.$infile,
+    -idlength    => $idlen,
+);
+
+# convert data from one format to another, one alignment at a time
+while ( my $aln = $phylipstream->next_aln ) {
+    $outstream->write_aln($aln);
+}
diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/phylip2fasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloconversion/phylip2fasta.xml Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,48 @@
+
+ Converts phylip file to fasta format
+
+ phylip2fasta.pl $infile $interlv $idlen > $outfile
+
+
+
+
+
+
+
+
+
+ **What it does**
+
+phylip2fasta takes an input phylip file and converts it to a fasta file
+
+------
+
+**Inputs**
+
+A phylip file with sequences from the same species and gene family. The user must choose if the file
+is interleaved, and specify the number of characters in the ID's of the sequences (default phylip
+format is 10).
------ + +**Outputs** + +FASTA file + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or preferably, if you +can, enter them on the osiris_phylogenetics repository site at bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of +this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/phyloconversion.tool_conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/phyloconversion.tool_conf Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,20 @@ +
+
diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/prune_phytab_using_list.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloconversion/prune_phytab_using_list.py Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,41 @@
+#!/usr/bin/python -tt
+
+##usage: ./prune_phytab_using_list.py phytabfile listfile keep|discard > outfile
+#import modules
+import sys, os, numpy, re
+
+def read(filename, listname=None, mode=None):
+    """Print the lines of a PHYTAB file selected by a list of names.
+
+    filename -- PHYTAB file to filter
+    listname -- file with one name per line; defaults to sys.argv[2]
+    mode     -- 'keep' prints the lines that contain any listed name,
+                'discard' prints the lines that contain none of them;
+                defaults to sys.argv[3]. Any other value prints nothing.
+    """
+    if listname is None:
+        listname = sys.argv[2]
+    if mode is None:
+        mode = sys.argv[3]
+
+    with open(listname) as bad:
+        names = [name.rstrip('\r\n') for name in bad]
+    # Blank entries are dropped because an empty alternative matches every
+    # line; the rest are escaped so '.', '|' and similar match literally.
+    names = [re.escape(name) for name in names if name.strip()]
+    # '(?!)' can never match, which is the right answer for an empty list
+    pattern = re.compile('|'.join(names) or '(?!)')
+
+    with open(filename) as f:
+        for line in f:
+            match = pattern.search(line) is not None
+            if match and mode == 'keep':
+                sys.stdout.write(line)
+            elif not match and mode == 'discard':
+                sys.stdout.write(line)
+
+def main():
+    read(sys.argv[1])
+
+if __name__ == '__main__':
+    main()
diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/prune_phytab_using_list.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloconversion/prune_phytab_using_list.xml Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,76 @@
+
+ Filters PHYTAB dataset by comparison to a text list
+
+ prune_phytab_using_list.py $input1 $matchlist $tokeep > $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool filters a PHYTAB sequence file to remove undesired sequences based on a list provided.
+
+-----
+
+**Basic Example**
+
+The input data must be in phytab column format (fields are tab-delimited).
Column 1 is species name, C2 is genefamily, C3 unique sequence identifier, C4 is sequence:: + + species1 gene1 uniquenameA acgttagcgcgctatagc + species2 gene1 uniquenameB acgttag--cgctataaa + species3 gene1 uniquenameC acgttagcgcgctatagc + species4 gene1 uniquenameD acgttagcgcgctatagc + species1 gene2 uniquenameE --gttagtttgcta + species3 gene2 uniquenameF gtgttagtttgcta + +Sequences from selected taxa, genes, or specific sequences provided on the List input will be excluded or retained (depending on the popup option selected) in the resulting PHYTAB output. +The format of the list may consist of + +taxa only:: + + species1 + species4 + +genes only:: + + geneA + geneB + +specific genes from select taxa (tab-delimited):: + + species1 geneA + species4 geneB + +(This last sort of list is produced by the tool 'Long Branch Finder'.) + +------ + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the +osiris_phylogenetics site at bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+
+Current Osiris Citation is here
+
+http://osiris-phylogenetics.blogspot.com/2012/10/citation.html
+
+
+
+
diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/raxify_fasta.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloconversion/raxify_fasta.pl Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,39 @@
+#!/usr/bin/perl -w
+use strict;
+
+# raxify_fasta.pl -- rewrite FASTA headers for FASTA2PHYLIPe and RAxML.
+#
+# Usage: raxify_fasta.pl infile outfile
+#
+# Header lines are cut to '>' plus 50 characters, then every '.', '|' and
+# space in them becomes '_'. All other lines are copied unchanged.
+
+my $infile = $ARGV[0];
+my $outfile = $ARGV[1];
+
+die "Usage: raxify_fasta.pl infile outfile\n" unless defined $outfile;
+
+# die instead of a bare exit: exit ends with status 0, so an unreadable
+# input would look like a successful run that wrote nothing.
+open(my $in, '<', $infile) or die "Cannot open $infile: $!\n";
+open(my $out, '>', $outfile) or die "Cannot open $outfile: $!\n";
+
+# Copy the input line by line, rewriting headers on the way
+while (my $line = <$in>) {
+    chomp $line;
+    # Only a line that starts with '>' is a header
+    $line = raxify($line) if $line =~ m/^>/;
+    print $out $line."\n";
+}
+close($in);
+close($out) or die "Cannot close $outfile: $!\n";
+
+# Cut a header to 51 characters ('>' plus 50) and replace '.', '|' and ' '
+# with '_'. Returns the cleaned header.
+sub raxify
+{
+    my $raxline = shift;
+    $raxline = substr($raxline,0,51);
+    $raxline =~ tr/.| /_/;
+    return $raxline;
+}
diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/raxify_fasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloconversion/raxify_fasta.xml Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,50 @@
+
+ Alters FASTA headers for more consistent use in FASTA2PHYLIPe for
+RAxML
+
+ raxify_fasta.pl $infile $outfile
+
+
+
+
+
+
+
+
+**What it does**
+
+Truncates FASTA header to 50 characters then replaces spaces, pipes and periods with
+underscores.
+
+------
+
+**Inputs**
+
+FASTA
+
+------
+
+**Outputs**
+
+FASTA (headers altered for later conversion to PHYLIP extended)
+
+------
+
+**Additional Information**
+
+Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the
+osiris_phylogenetics site at bitbucket.org
+
+------
+
+**Citations**
+
+This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use
+of this tool in a publication, please consider citing the following.
+
+Current Osiris Citation is here
+
+http://osiris-phylogenetics.blogspot.com/2012/10/citation.html
+
+
diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/remove_gbs_dupes.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloconversion/remove_gbs_dupes.pl Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,45 @@
+#!/usr/bin/perl -w
+use strict;
+
+# remove_gbs_dupes.pl -- split genbankstrip output into kept and duplicate lines.
+#
+# Usage: remove_gbs_dupes.pl datafile keepfile delfile subsp var
+#
+# A line is written to delfile when it contains "_" followed by a digit,
+# when subsp is 1 and it contains "subsp", or when var is 1 and it contains
+# "_var_". Every other line is written to keepfile.
+
+my $datafile = $ARGV[0];
+my $keepfile = $ARGV[1];
+my $delfile = $ARGV[2];
+my $subsp = $ARGV[3];
+my $var = $ARGV[4];
+
+die "Usage: remove_gbs_dupes.pl datafile keepfile delfile subsp var\n"
+    unless defined $delfile;
+# A missing flag means that filter is off
+$subsp = 0 unless defined $subsp;
+$var = 0 unless defined $var;
+
+open(my $in, '<', $datafile) or die "Cannot open file $datafile: $!\n";
+open(my $keep_fh, '>', $keepfile) or die "Cannot open file $keepfile: $!\n";
+open(my $del_fh, '>', $delfile) or die "Cannot open file $delfile: $!\n";
+
+# NOTE(review): the patterns are tested against the whole line, so a digit
+# after "_" in any column marks the line as a duplicate -- confirm whether
+# only the species column should be tested.
+while (my $line = <$in>)
+{
+    my $dupe = $line =~ m/_\d/
+        || ($subsp == 1 && $line =~ m/subsp/)
+        || ($var == 1 && $line =~ m/_var_/);
+
+    if($dupe){
+        print $del_fh $line;
+    }else{
+        print $keep_fh $line;
+    }
+}
+close($in);
+close($keep_fh) or die "Cannot close $keepfile: $!\n";
+close($del_fh) or die "Cannot close $delfile: $!\n";
diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/remove_gbs_dupes.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloconversion/remove_gbs_dupes.xml Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,44 @@
+
+ Finds duplicates in genbankstrip output files
+
+ remove_gbs_dupes.pl $infile $okfile $dupefile $subsp $var
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Finds species names that end in _2 or _3 or ... _N which are duplicate genes in a species in genbankstrip, and it writes those lines to another file.
+
+------
+
+**Inputs**
+
+1. PHYTAB format input file.
+
+------
+
+**Outputs**
+
+Two PHYTAB format files. One contains only unique species. The other contains duplicates.
+
+-------
+
+**Citations**
+
+This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following.
+
+Current Osiris Citation is here
+
+http://osiris-phylogenetics.blogspot.com/2012/10/citation.html
+
+
+
diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/remove_phytab_dupes.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloconversion/remove_phytab_dupes.pl Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,75 @@
+#!/usr/bin/perl
+use strict;
+
+# remove_phytab_dupes.pl -- separate duplicated species/partition rows of a
+# PHYTAB file.
+#
+# Usage: remove_phytab_dupes.pl infile keeplongest ignoregaps uniout dupout
+#
+# PHYTAB rows are tab-delimited: species, partition, id, sequence. The first
+# row seen for a species/partition pair is kept for uniout and every later
+# row for that pair is written to dupout. When keeplongest is 1, a later row
+# with a longer sequence replaces the kept row, which moves to dupout. When
+# ignoregaps is 1, '-' characters do not count towards the length.
+
+my $infile=$ARGV[0];
+my $keeplongest=$ARGV[1];
+my $ignoregaps=$ARGV[2];
+my $uniout=$ARGV[3];
+my $dupout=$ARGV[4];
+
+die "Usage: remove_phytab_dupes.pl infile keeplongest ignoregaps uniout dupout\n"
+    unless defined $dupout;
+
+open(my $in, '<', $infile) or die "Cannot open $infile: $!\n";
+
+my %UniquesHash;    # {species}{partition} => "id\tsequence" of the kept row
+my @UniqueOrder;    # [species, partition] pairs in first-seen order
+my @DupeArray;      # complete rows, newline included, for dupout
+
+while(my $row = <$in>){
+    chomp($row);
+    next unless $row =~ /\S/;    # a blank line is not a PHYTAB row
+    my ($species, $partition, $id, $sequence) = split(/\t/, $row);
+    # Short rows leave trailing fields undefined; treat those as empty
+    $partition = '' unless defined $partition;
+    $id = '' unless defined $id;
+    $sequence = '' unless defined $sequence;
+
+    if(exists $UniquesHash{$species}{$partition}){
+        my $savseq = (split(/\t/, $UniquesHash{$species}{$partition}, 2))[1];
+        my $savlen = seq_length($savseq);
+        my $curlen = seq_length($sequence);
+        if($curlen > $savlen && $keeplongest==1) { #current is longer so keep that one
+            push(@DupeArray, "$species\t$partition\t$UniquesHash{$species}{$partition}\n");
+            $UniquesHash{$species}{$partition} = "$id\t$sequence";
+        }else{
+            push(@DupeArray, "$species\t$partition\t$id\t$sequence\n");
+        }
+    }else{
+        $UniquesHash{$species}{$partition} = "$id\t$sequence";
+        push(@UniqueOrder, [$species, $partition]);
+    }
+}
+close($in);
+
+open(my $out, '>', $uniout) or die "Cannot open $uniout: $!\n";
+open(my $dupes, '>', $dupout) or die "Cannot open $dupout: $!\n";
+
+print $dupes @DupeArray;
+# Walk the pairs in input order so the output is the same on every run;
+# iterating the hash keys would give an arbitrary order.
+for my $pair (@UniqueOrder) {
+    my ($spname, $partname) = @$pair;
+    print $out "$spname\t$partname\t$UniquesHash{$spname}{$partname}\n";
+}
+
+close($out) or die "Cannot close $uniout: $!\n";
+close($dupes) or die "Cannot close $dupout: $!\n";
+
+# Length of a sequence, leaving out '-' characters when ignoregaps is 1.
+sub seq_length {
+    my $seq = shift;
+    my $gaps = ($ignoregaps == 1) ? ($seq =~ tr/-//) : 0;
+    return length($seq) - $gaps;
+}
diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/remove_phytab_dupes.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloconversion/remove_phytab_dupes.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,48 @@ + + Finds duplicates in phytab format files + + remove_phytab_dupes.pl $infile $keeplongest $ignoregaps $okfile $dupefile + + + + + + + + + + + + + + +**What it does** + +Finds lines with identical species (column 1) and partition (column 2) and writes one of the duplicates to one file, unique lines to another file. + +------ + +**Inputs** + +1. PHYTAB format input file. +2. Choose whether to keep the longest or first sequence of a duplicated species in output file one. +3. Choose whether to ignore gap characters (-) when comparing length of sequences for input #2. + +------ + +**Outputs** + +Two PHYTAB format files. One contains only unique species. The other contains duplicates. + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + + diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/seqConverterG.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/seqConverterG.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,2062 @@ +#!/usr/bin/perl -w +# +# seqConverter.pl v1.2 +# Last modified August 12, 2010 21:09 +# Minor modifications for running in Galaxy by TH Oakley Jan 2012 +# (c) Olaf R.P. Bininda-Emonds +# +# Input: +# Sequence data in any of fasta, GenBank, nexus, (classic or extended) phylip, +# or Se-Al formats +# +# Output: +# Sequence data in any of fasta, nexus, (classic or extended) phylip, and/or +# Se-Al formats. 
+# +# Usage: seqConverter.pl -d -o [-a] [-c] [-g] +# [-G] [-H] [-i] [-j] [-l] [-n] [-r] [-s] [-t] +# [-u] [-v] [-h] +# [-O] specify outfile +# options: -a = print out accession numbers instead of taxon labels for nexus +# and phylip output +# -c = global genetic code (to be used for translation unless +# otherwise specified) (default = standard code; end +# with * to override local settings) +# -d = file containing raw sequence information; * = batch +# convert all specified file types in working +# directory (suffixes must be .fasta / .fas, .gb, +# .nexus / .nex, .phylip / .ph, or .seal as +# appropriate) +# -g = do not convert ~ gap characters (e.g., from BioEdit) to - +# -G = convert flanking gaps to Ns +# -H = convert sequence data to haplotype data +# -i = format of sequence file (fasta (f), GenBank (g), +# nexus (n), phylip (p), or Se-Al (s)); default = +# autodetect +# -j = produce jackknifed data sets each with a single sequence +# deleted +# -l = interleave nexus- and/or phylip-formatted output +# (default = sequential) with specified sequence length +# (between 10 and 100 inclusive; default = 80); +# NOTE: also sets default interleave length for fasta +# output +# -n = convert ambiguous nucleotides to Ns +# -o = output results in any or all of fasta (f), nexus +# (n), classic or extended phylip (pc or pe), +# and/or Se-Al (s) formats (NOTE: flag can be +# invoked multiple times) +# -r = order sequences in final output alphabetically by name +# (a; default) or in input order from file (i) +# -s = split data set into individual partitions according to nexus +# charset statements +# -t = translate sequences into amino acids +# -u = interactive user-input mode +# -h = print this message and quit +# -v = verbose output + +use strict; +use POSIX; +#THO added ability for Galaxy -G to specify outfile name +my $OutFile; + +# Set user-input defaults and associated parameters + # Data set variables + my $inputType = ""; # Options are "fasta", "GenBank", "nexus", 
"phylip", and "Se-Al" + my ($readFile, @seqFiles); + my $dataSource; + my $seqType = "nucleotide"; + my $globalGenCode = 0; + my $globalOverride = 0; + + my (@accNum, %nameLabel, %sequence, %geneticCode, %accPresent); + my (@charsetList, %charsetStart, %charsetEnd); + my (%deletedSeq, %finalSeq); + my $seqCount; + my $ntax; + my $nameClean = 1; + my $skipDuplicates = 1; + + # User input variables + my $seqOrder = "alphabetical"; # Options are "alphabetical" (default) and "input" + my $seqSplit = 0; + my $jackknife = 0; + my $gapChange = 1; + my $ambigChange = 0; + my $translateSeqs = 0; + my $haploTyping = 0; + my $haploFile; + my $flankGap = 0; + + # Translation variables and genetic codes + my %transTable = ('1' => 'standard', + '2' => 'vertebrate mitochondrial', + '3' => 'yeast mitochondrial', + '4' => 'mold, protozoan and colenterate mitochondrial and mycoplasam/spiroplasma', + '5' => 'invertebrate mitochondrial', + '6' => 'ciliate, dasycladacean and hexamita nuclear', + '9' => 'echinoderm mitochondrial', + '10' => 'euplotid nuclear', + '11' => 'bacterial and plant plastid', + '12' => 'alternative yeast nuclear', + '13' => 'ascidian mitochondrial', + '14' => 'alternative flatworm mitochondrial', + '15' => 'Blepharisma nuclear', + '16' => 'chlorophycean mitochondrial', + '21' => 'trematode mitochondrial', + '22' => 'Scenedesmus obliquus mitochondrial', + '23' => 'Thraustochytrium mitochondrial'); + my %gb2seal = ('1' => '0', + '2' => '1', + '3' => '2', + '4' => '3', + '4' => '4', + '5' => '5', + '6' => '6', + '9' => '7', + '10' => '8', + '11' => '9', + '12' => '10', + '13' => '11', + '14' => '12', + '15' => '13', + '16' => '1', + '21' => '1', + '22' => '1', + '23' => '1'); + my %seal2gb = ('0' => '1', + '1' => '2', + '2' => '3', + '3' => '4', + '4' => '4', + '5' => '5', + '6' => '6', + '7' => '9', + '8' => '10', + '9' => '11', + '10' => '12', + '11' => '13', + '12' => '14', + '13' => '15'); + my %genCodes; + foreach my $code (qw(0 1 2 3 4 5 6 9 10 11 12 13 14 15 
16 21 22 23)) + { + $genCodes{$code} = 1; + } + my %DNAtoAA; + my @ambigList = qw(A C G T M R W S Y K V H D B N); + my %ambigCode = ('A' => 'A', 'C' => 'C', 'G' => 'G', 'T' => 'T', + 'AC' => 'M', 'AG' => 'R', 'AT' => 'W', 'CG' => 'S', 'CT' => 'Y', 'GT' => 'K', + 'ACG' => 'V', 'ACT' => 'H', 'AGT' => 'D', 'CGT' => 'B', 'ACGT' => 'N'); + my (%constitNTlist); + while (my ($nt, $code) = each %ambigCode) # Where $nt = key and $code = value + { + push @{$constitNTlist{$code}}, $_ foreach (split("",$nt)); + } + + # Output variables + my $maxLength; + my $fastaPrint = 0; + my $fastaOut; + my $nexusPrint = 0; + my $nexusOut; + my ($phylipTradPrint, $phylipExtPrint) = (0, 0); + my $phylipOut; + my $sealPrint = 0; + my $sealOut; + my $outFormat = "sequential"; + my $interleaveLength = 80; + my $fastaLength = 80; + my $accPrint = 0; + + # Miscellaneous variables + my $verbose = 0; + my $debug = 0; + my $perlScript = "seqConverterG.pl"; + my $version = "1.2"; + +# Read in user input + if (not @ARGV or join(' ', @ARGV) =~ /\s-u/ or $ARGV[0] =~ /^-u/) # Enter interactive user-input mode + { + print "Entering interactive user-input mode. Type \"q\" at any prompt to exit program.\n"; + + # Get print format + my $defaultAcc = ($accPrint) ? 
"y" : "n"; + undef $accPrint; + until (defined $accPrint) + { + print "\tPrint out accession numbers only to nexus- and/or phylip output (y|n) [$defaultAcc]: "; + $accPrint = ; + chomp ($accPrint); + exit(0) if ($accPrint eq "q"); + if (substr($accPrint, 0, 1) =~ /^n/i or $accPrint eq "") + { + $accPrint = 0; + } + elsif (substr($accPrint, 0, 1) =~ /^y/i) + { + $accPrint = 1; + } + else + { + print "\t\tInvalid input ($accPrint)\n"; + undef $accPrint; + } + } + + # Get datafile + until (defined $readFile) + { + print "\tEnter name of data file (* = batch convert): "; + $readFile = ; + chomp ($readFile); + exit(0) if ($readFile eq "q"); + unless (-e $readFile or $readFile eq "*") + { + print "\t\tFile '$readFile' does not exist\n"; + undef $readFile; + } + } + push @seqFiles, $readFile; + + # Get format of datafile + my $defaultInput = "autodetect"; + undef $inputType; + until (defined $inputType) + { + print "\tEnter format of file $readFile (fasta|GenBank|nexus|phylip|Se-Al) [$defaultInput]: "; + $inputType = ; + chomp ($inputType); + exit(0) if ($inputType =~ /^q/i); + if (substr($inputType, 0, 1) =~ /^a/i or $inputType eq "") + { + $inputType = "autodetect"; + } + elsif (substr($inputType, 0, 1) =~ /^f/i) + { + $inputType = "fasta"; + } + elsif (substr($inputType, 0, 1) =~ /^g/i) + { + $inputType = "GenBank"; + } + elsif (substr($inputType, 0, 1) =~ /^n/i) + { + $inputType = "nexus"; + } + elsif (substr($inputType, 0, 1) =~ /^p/i) + { + $inputType = "phylip"; + } + elsif (substr($inputType, 0, 1) =~ /^s/i) + { + $inputType = "Se-Al"; + } + else + { + print "\t\tInvalid input ($inputType)\n"; + undef $inputType; + } + } + $inputType = "" if ($inputType eq "autodetect"); + + # Get whether or not to clean sequence labels + my $defaultClean = ($nameClean) ? "y" : "n"; + undef $nameClean; + until (defined $nameClean) + { + print "\tClean sequence labels by changing non-alphanumeric characters to underscores (y|n)? 
[$defaultClean]: "; + $nameClean = ; + chomp ($nameClean); + exit(0) if ($nameClean =~ /^q/i); + if (substr($nameClean, 0, 1) =~ /^y/i or $nameClean eq "") + { + $nameClean = 1; + } + elsif (substr($nameClean, 0, 1) =~ /^n/i) + { + $nameClean = 0; + } + else + { + print "\t\tInvalid input ($seqSplit)\n"; + undef $nameClean; + } + } + + # Get whether or not to split sequences + my $defaultSplit = ($seqSplit) ? "y" : "n"; + undef $seqSplit; + until (defined $seqSplit) + { + print "\tSplit into individual partitions following charset statements (y|n)? [$defaultSplit]: "; + $seqSplit = ; + chomp ($seqSplit); + exit(0) if ($seqSplit =~ /^q/i); + if (substr($seqSplit, 0, 1) =~ /^n/i or $seqSplit eq "") + { + $seqSplit = 0; + } + elsif (substr($seqSplit, 0, 1) =~ /^y/i) + { + $seqSplit = 1; + } + else + { + print "\t\tInvalid input ($seqSplit)\n"; + undef $seqSplit; + } + } + + # Get whether or not to jackknife sequences + my $defaultJack = ($jackknife) ? "y" : "n"; + undef $jackknife; + until (defined $jackknife) + { + print "\tProduce replicate, jackknifed data sets (y|n)? [$defaultJack]: "; + $jackknife = ; + chomp ($jackknife); + exit(0) if ($jackknife =~ /^q/i); + if (substr($jackknife, 0, 1) =~ /^n/i or $jackknife eq "") + { + $jackknife = 0; + } + elsif (substr($jackknife, 0, 1) =~ /^y/i) + { + $jackknife = 1; + } + else + { + print "\t\tInvalid input ($jackknife)\n"; + undef $jackknife; + } + } + + # Get whether or not to convert gaps + my $defaultGaps = ($gapChange) ? "y" : "n"; + undef $gapChange; + until (defined $gapChange) + { + print "\tConvert ~ gap characters to - (y|n)? 
[$defaultGaps]: "; + $gapChange = ; + chomp ($gapChange); + exit(0) if ($gapChange =~ /^q/i); + if (substr($gapChange, 0, 1) =~ /^n/i) + { + $gapChange = 0; + } + elsif (substr($gapChange, 0, 1) =~ /^y/i or $gapChange eq "") + { + $gapChange = 1; + } + else + { + print "\t\tInvalid input ($gapChange)\n"; + undef $gapChange; + } + } + + # Get whether or not to convert flanking gaps + my $defaultFlank = ($flankGap) ? "y" : "n"; + undef $flankGap; + until (defined $flankGap) + { + print "\tConvert flanking gap characters to Ns (y|n)? [$defaultFlank]: "; + $flankGap = ; + chomp ($flankGap); + exit(0) if ($flankGap =~ /^q/i); + if (substr($flankGap, 0, 1) =~ /^n/i) + { + $flankGap = 0; + } + elsif (substr($flankGap, 0, 1) =~ /^y/i or $flankGap eq "") + { + $flankGap = 1; + } + else + { + print "\t\tInvalid input ($flankGap)\n"; + undef $flankGap; + } + } + + # Get whether or not to convert ambiguous nucleotides + my $defaultAmbig = ($ambigChange) ? "y" : "n"; + undef $ambigChange; + until (defined $ambigChange) + { + print "\tConvert ambiguous nucleotides to Ns (y|n)? 
[$defaultAmbig]: "; + $ambigChange = ; + chomp ($ambigChange); + exit(0) if ($ambigChange =~ /^q/i); + if (substr($ambigChange, 0, 1) =~ /^n/i or $ambigChange eq "") + { + $ambigChange = 0; + } + elsif (substr($ambigChange, 0, 1) =~ /^y/i) + { + $ambigChange = 1; + } + else + { + print "\t\tInvalid input ($ambigChange)\n"; + undef $ambigChange; + } + } + + # Get output order of sequences + my $defaultOrder = $seqOrder; + undef $seqOrder; + until (defined $seqOrder) + { + print "\tEnter output order for sequences (alphabetical|clustal|input file) [$defaultOrder]: "; + $seqOrder = ; + chomp ($seqOrder); + exit(0) if ($seqOrder =~ /^q/i); + if (substr($seqOrder, 0, 1) =~ /^i/i) + { + $seqOrder = "input"; + } + elsif (substr($seqOrder, 0, 1) =~ /^a/i or $seqOrder eq "") + { + $seqOrder = "alphabetical"; + } + else + { + print "\t\tInvalid input ($seqOrder)\n"; + undef $seqOrder; + } + } + + # Get whether or not to convert to haplotypes + my $defaultHaplo = ($haploTyping) ? "y" : "n"; + undef $haploTyping; + until (defined $haploTyping) + { + print "\tConvert sequence data to haplotypes (y|n)? [$defaultHaplo]: "; + $haploTyping = ; + chomp ($haploTyping); + exit(0) if ($haploTyping =~ /^q/i); + if (substr($haploTyping, 0, 1) =~ /^n/i or $haploTyping eq "") + { + $haploTyping = 0; + } + elsif (substr($haploTyping, 0, 1) =~ /^y/i) + { + $haploTyping = 1; + } + else + { + print "\t\tInvalid input ($haploTyping)\n"; + undef $haploTyping; + } + } + + # Get whether or not to convert to amino acids + my $defaultTranslate = ($translateSeqs) ? "y" : "n"; + undef $translateSeqs; + until (defined $translateSeqs) + { + print "\tConvert sequence data to amino acids (y|n)? 
[$defaultTranslate]: "; + $translateSeqs = ; + chomp ($translateSeqs); + exit(0) if ($translateSeqs =~ /^q/i); + if (substr($translateSeqs, 0, 1) =~ /^n/i or $translateSeqs eq "") + { + $translateSeqs = 0; + } + elsif (substr($translateSeqs, 0, 1) =~ /^y/i) + { + $translateSeqs = 1; + } + else + { + print "\t\tInvalid input ($translateSeqs)\n"; + undef $translateSeqs; + } + } + + # Get genetic code for translation + my $defaultCode = $globalGenCode; + $globalGenCode = 99; + print "\tGenetic codes available for protein translation:\n"; + foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23)) + { + print "\t\t$code: $transTable{$code}"; + print "\n"; + } + until (defined $genCodes{$globalGenCode}) + { + $globalOverride = 0; + print "\tEnter global genetic code to be applied (0 = no translation; follow with an * to override local code) [$defaultCode]: "; + $globalGenCode = ; + chomp ($globalGenCode); + exit(0) if ($globalGenCode =~ /^q/i); + $globalGenCode = $defaultCode if ($globalGenCode eq ""); + if ($globalGenCode =~ /\*$/) + { + $globalOverride = 1; + $globalGenCode =~ s/\*$//; + } + print "\t\tInvalid input ($globalGenCode)\n" unless (defined $genCodes{$globalGenCode} or $globalGenCode == 0); + } + + # Get output formats + my $defaultFasta = ($fastaPrint) ? "y" : "n"; + undef $fastaPrint; + until (defined $fastaPrint) + { + print "\tOutput results in fasta format (y|n) [$defaultFasta]: "; + $fastaPrint = ; + chomp ($fastaPrint); + exit(0) if ($fastaPrint =~ /^q/i); + if (substr($fastaPrint, 0, 1) =~ /^y/i) + { + $fastaPrint = 1; + } + elsif (substr($fastaPrint, 0, 1) =~ /^n/i or $fastaPrint eq "") + { + $fastaPrint = 0; + } + else + { + print "\t\tInvalid input ($fastaPrint)\n"; + undef $fastaPrint; + } + } + + my $defaultNexus = ($nexusPrint) ? 
"y" : "n"; + undef $nexusPrint; + until (defined $nexusPrint) + { + print "\tOutput results in nexus format (y|n) [$defaultNexus]: "; + $nexusPrint = ; + chomp ($nexusPrint); + exit(0) if ($nexusPrint =~ /^q/i); + if (substr($nexusPrint, 0, 1) =~ /^y/i) + { + $nexusPrint = 1; + } + elsif (substr($nexusPrint, 0, 1) =~ /^n/i or $nexusPrint eq "") + { + $nexusPrint = 0; + } + else + { + print "\t\tInvalid input ($nexusPrint)\n"; + undef $nexusPrint; + } + } + + my $defaultPhylip = ($phylipTradPrint) ? "y" : "n"; + undef $phylipTradPrint; + until (defined $phylipTradPrint or $phylipExtPrint) + { + print "\tOutput results in traditional phylip format (y|n) [$defaultPhylip]: "; + $phylipTradPrint = ; + chomp ($phylipTradPrint); + exit(0) if ($phylipTradPrint =~ /^q/i); + if (substr($phylipTradPrint, 0, 1) =~ /^y/i) + { + $phylipTradPrint = 1; + } + elsif (substr($phylipTradPrint, 0, 1) =~ /^n/i or $phylipTradPrint eq "") + { + $phylipTradPrint = 0; + } + else + { + print "\t\tInvalid input ($phylipTradPrint)\n"; + undef $phylipTradPrint; + } + } + + if ($phylipTradPrint == 0) # Check for extended format + { + my $defaultPhylip = ($phylipExtPrint) ? "y" : "n"; + undef $phylipExtPrint; + until (defined $phylipExtPrint or $phylipExtPrint) + { + print "\tOutput results in extended phylip format (y|n) [$defaultPhylip]: "; + $phylipExtPrint = ; + chomp ($phylipExtPrint); + exit(0) if ($phylipExtPrint =~ /^q/i); + if (substr($phylipExtPrint, 0, 1) =~ /^y/i) + { + $phylipExtPrint = 1; + } + elsif (substr($phylipExtPrint, 0, 1) =~ /^n/i or $phylipExtPrint eq "") + { + $phylipExtPrint = 0; + } + else + { + print "\t\tInvalid input ($phylipExtPrint)\n"; + undef $phylipExtPrint; + } + } + } + + my $defaultSeal = ($sealPrint) ? 
"y" : "n"; + undef $sealPrint; + until (defined $sealPrint) + { + print "\tOutput results in Se-Al format (y|n) [$defaultSeal]: "; + $sealPrint = ; + chomp ($sealPrint); + exit(0) if ($sealPrint =~ /^q/i); + if (substr($sealPrint, 0, 1) =~ /^y/i) + { + $sealPrint = 1; + } + elsif (substr($sealPrint, 0, 1) =~ /^n/i or $sealPrint eq "") + { + $sealPrint = 0; + } + else + { + print "\t\tInvalid input ($sealPrint)\n"; + undef $sealPrint; + } + } + + if ($nexusPrint or $phylipTradPrint or $phylipExtPrint) + { + my $defaultInterleave = ($outFormat) ? "y" : "n"; + undef $outFormat; + until (defined $outFormat) + { + print "\tInterleave output for nexus- and/or phylip-formatted output with length (n|10-100) [$defaultInterleave]: "; + $outFormat = ; + chomp ($outFormat); + exit(0) if ($outFormat =~ /^q/i); + if ($outFormat =~ /(\d+)/) + { + if ($outFormat =~ /\D/ or $outFormat < 10 or $outFormat > 100) + { + print "\t\tInvalid input ($outFormat)\n"; + undef $outFormat; + } + else + { + $outFormat = $1; + } + } + elsif (substr($outFormat, 0, 1) =~ /^n/i or $outFormat eq "") + { + $outFormat = "sequential"; + } + else + { + print "\t\tInvalid input ($outFormat)\n"; + undef $outFormat; + } + } + if ($outFormat ne "sequential") + { + $interleaveLength = $outFormat; + $outFormat = "interleave"; + } + } + + # Get verbose output mode + my $defaultVerbose = ($verbose) ? 
"y" : "n"; + undef $verbose; + until (defined $verbose) + { + print "\tOutput verbose results to screen (y|n) [$defaultVerbose]: "; + $verbose = ; + chomp ($verbose); + exit(0) if ($verbose =~ /^q/i); + if (substr($verbose, 0, 1) =~ /^y/i) + { + $verbose = 1; + print "\n"; + } + elsif (substr($verbose, 0, 1) =~ /^n/i or $verbose eq "") + { + $verbose = 0; + } + elsif (substr($verbose, 0, 1) =~ /^x/i or $verbose eq "") + { + $verbose = $debug = 1; + } + else + { + print "\t\tInvalid input ($verbose)\n"; + undef $verbose; + } + } + } + elsif (join(' ', @ARGV) =~ /\s-h/ or $ARGV[0] =~ /^-h/) # Print help screen + { + print "Usage: seqConverter.pl -d -o [-a] [-c] [-g]\n"; + print " [-G] [-H] [-i] [-j] [-l] [-n] [-r] [-s] [-t]\n"; + print " [-u] [-v] [-h]\n"; + print "Version: $version\n"; + print "Options: -a = print out accession numbers instead of taxon labels for nexus\n"; + print " and phylip output\n"; + print " -c = global genetic code (to be used for translation unless\n"; + print " otherwise specified) (default = standard code; end\n"; + print " with * to override local settings)\n"; + foreach my $code (qw(1 2 3 4 5 6 9 10 11 12 13 14 15 16 21 22 23)) + { + print " $code: $transTable{$code}"; + print "\n"; + } + print " -d = file containing raw sequence information; * = batch\n"; + print " convert all specified file types in working\n"; + print " directory (suffixes must be .fasta / .fas, .gb\n"; + print " .nexus / .nex, .phylip / .ph, or .seal as\n"; + print " appropriate)\n"; + print " -g = do not convert ~ gap characters (e.g., from BioEdit) to -\n"; + print " -G = convert flanking gaps to Ns\n"; + print " -H = convert sequence data to haplotype data\n"; + print " -i = format of sequence file (fasta (f), GenBank (g),\n"; + print " nexus (n), phylip (p), or Se-Al (s)); default =\n"; + print " autodetect\n"; + print " -j = produce jackknifed data sets each with a single sequence\n"; + print " deleted\n"; + print " -l = interleave nexus- and/or phylip-formatted 
output\n"; + print " (default = sequential) with specified sequence length\n"; + print " (between 10 and 100 inclusive; default = 80);\n"; + print " NOTE: also sets default interleave length for fasta\n"; + print " output\n"; + print " -n = convert ambiguous nucleotides to Ns\n"; + print " -o = output results in any or all of fasta (f), nexus\n"; + print " (n), classic or extended phylip (pc or pe),\n"; + print " and/or Se-Al (s) formats (NOTE: flag can be\n"; + print " invoked multiple times)\n"; + print " -r = order sequences in final output alphabetically by name\n"; + print " (a; default) or in input order from file (i)\n"; + print " -s = split data set into individual partitions according to nexus\n"; + print " charset statements\n"; + print " -t = translate sequences into amino acids\n"; + print " -u = interactive user-input mode\n"; + print " -h = print this message and quit\n"; + print " -v = verbose output\n"; + exit(0); + } + else # Process switches + { + for (my $i = 0; $i <= $#ARGV; $i++) + { + if ($ARGV[$i] eq "-a") + { + $accPrint = 1; + } + elsif ($ARGV[$i] =~ /-c(\d+)/) + { + $globalGenCode = $1; + if ($globalGenCode != 0 and not defined $genCodes{$globalGenCode}) + { + print "Don't understand argument: $ARGV[$i]\n"; + print "Usage: seqConverter.pl -d -o [-a] [-c] [-g]\n"; + print " [-H] [-i] [-j] [-l] [-n] [-r] [-s] [-t] [-u]\n"; + print " [-v] [-h]\n"; + print "Version: $version\n"; + exit(1); + } + $globalOverride = 1 if ($ARGV[$i] =~ /\*$/); + } + elsif ($ARGV[$i] =~ /^-d(.*)/) + { + $readFile = $1; + unless ($readFile eq "*") + { + die "ERROR: Data file $readFile does not exist.\n" unless (-e $readFile); + } + push @seqFiles, $readFile; + } + elsif ($ARGV[$i] eq "-g") + { + $gapChange = 0; + } + elsif ($ARGV[$i] eq "-G") + { + $flankGap = 1; + } + elsif ($ARGV[$i] eq "-H") + { + $haploTyping = 1; + } + elsif ($ARGV[$i] eq "-if") + { + $inputType = "fasta"; + } + elsif ($ARGV[$i] eq "-ig") + { + $inputType = "GenBank"; + } + elsif ($ARGV[$i] eq 
"-in") + { + $inputType = "nexus"; + } + elsif ($ARGV[$i] eq "-ip") + { + $inputType = "phylip"; + } + elsif ($ARGV[$i] eq "-is") + { + $inputType = "Se-Al"; + } + elsif ($ARGV[$i] eq "-j") + { + $jackknife = 1; + } + elsif ($ARGV[$i] eq "-k") + { + $nameClean = 0; + } + elsif ($ARGV[$i] =~ /-l(\d+)?/) + { + $interleaveLength = $1 if (defined $1); + $outFormat = "interleave"; + } + elsif ($ARGV[$i] eq "-n") + { + $ambigChange = 1; + } + elsif ($ARGV[$i] eq "-of") + { + $fastaPrint = 1; + } + elsif ($ARGV[$i] eq "-on") + { + $nexusPrint = 1; + } + elsif ($ARGV[$i] eq "-opc") + { + $phylipTradPrint = 1; + } + elsif ($ARGV[$i] eq "-ope") + { + $phylipExtPrint = 1; + } + elsif ($ARGV[$i] eq "-os") + { + $sealPrint = 1; + } + elsif ($ARGV[$i] eq "-ra") + { + $seqOrder = "alphabetical"; + } + elsif ($ARGV[$i] eq "-ri") + { + $seqOrder = "input"; + } + elsif ($ARGV[$i] eq "-s") + { + $seqSplit = 1; + } + elsif ($ARGV[$i] eq "-t") + { + $translateSeqs = 1; + $globalGenCode = 1 unless ($globalGenCode); + } + elsif ($ARGV[$i] eq "-v") + { + $verbose = 1; + } + elsif ($ARGV[$i] eq "-x") + { + $debug = 1; + $verbose = 1; + } + elsif ($ARGV[$i] =~ /^-O(.*)/) + { + $OutFile = $1; + } + else + { + print "Don't understand argument: $ARGV[$i]\n"; + print "Usage: seqConverter.pl -d -o [-a] [-c] [-g]\n"; + print " [-G] [-H] [-i] [-j] [-l] [-n] [-r] [-s] [-t]\n"; + print " [-u] [-v] [-h]\n"; + print "Version: $version\n"; + exit(1); + } + } + } + $accPrint = 0 if ($accPrint eq "n"); + geneticCoder() if ($globalGenCode); + $fastaLength = $interleaveLength; + +# Check for I/O errors + die "ERROR: Must supply name of file containing sequence data.\n" if (not @seqFiles); + die "ERROR: Must supply at least one output format.\n" unless ($fastaPrint or $nexusPrint or $phylipTradPrint or $phylipExtPrint or $sealPrint); + die "ERROR: Sequence length for interleaved format must be between 10 and 100 inclusive.\n" if (($fastaPrint or (($nexusPrint or $phylipTradPrint or $phylipExtPrint) and 
$outFormat eq "interleave")) and ($interleaveLength < 10 or $interleaveLength > 100)); + +# Read in sequence data + if ($seqFiles[0] eq "*") # Batch convert all appropriate files in working directory + { + if ($inputType) + { + undef @seqFiles; + + system("ls -d * > convertList.txt"); + setLineBreak("convertList.txt"); + open (LIST, ") + { + chomp; + next unless ($_); + push @seqFiles, $_ if ($inputType eq "Se-Al" and $_ =~ /\.seal$/); + push @seqFiles, $_ if ($inputType eq "phylip" and ($_ =~ /\.phylip$/ or $_ =~ /\.ph$/)); + push @seqFiles, $_ if ($inputType eq "nexus" and ($_ =~ /\.nexus$/ or $_ =~ /\.nex$/)); + push @seqFiles, $_ if ($inputType eq "GenBank" and ($_ =~ /\.gb$/)); + push @seqFiles, $_ if ($inputType eq "fasta" and ($_ =~ /\.fasta$/ or $_ =~ /\.fas$/)); + } + close LIST; + unlink ("convertList.txt") unless ($debug); + + die "ERROR: No files of type $inputType found for batch conversion.\n" if (not @seqFiles); + } + else + { + die "ERROR: Must specify input file type for batch conversion\n"; + } + } + + foreach my $seqFile (@seqFiles) + { + print "\nConverting file $seqFile ...\n"; + + # Set output file names + $dataSource = $seqFile; + $dataSource =~ s/\.\w+$//; + + if ($fastaPrint) + { + $fastaOut = $dataSource . ".fasta"; + $fastaOut =~ s/\.fasta$/_new.fasta/ if ($fastaOut eq $seqFile); + $fastaOut =~ s/\.fasta$/_haplo.fasta/ if ($haploTyping); + } + if ($nexusPrint) + { +# $nexusOut = $dataSource . ".nex"; +# Fixed Output File name much easier for Galaxy + $nexusOut = $OutFile; + $nexusOut =~ s/\.nex$/_new.nex/ if ($nexusOut eq $seqFile); + $nexusOut =~ s/\.nex$/_haplo.nex/ if ($haploTyping); + } + if ($phylipTradPrint or $phylipExtPrint) + { +# $phylipOut = $dataSource . ".phylip"; +# Fixed Output File name much easier for Galaxy + $phylipOut = $OutFile; + $phylipOut =~ s/\.phylip$/_new.phylip/ if ($phylipOut eq $seqFile); + $phylipOut =~ s/\.phylip$/_haplo.phylip/ if ($haploTyping); + } + if ($sealPrint) + { + $sealOut = $dataSource . 
".seal"; + $sealOut =~ s/\.seal$/_new.seal/ if ($sealOut eq $seqFile); + $sealOut =~ s/\.seal$/_haplo.seal/ if ($haploTyping); + } + + $haploFile = $dataSource . "_haplotypeSeqs.txt"; + + $phylipExtPrint = 0 if ($phylipTradPrint); + + # Read in sequence data + # Clear variables + undef @accNum; + undef %nameLabel; + undef %sequence; + undef %geneticCode; + undef %accPresent; + undef %deletedSeq; + undef %finalSeq; + undef $seqCount; + undef $ntax; + undef @charsetList; + undef %charsetStart; + undef %charsetEnd; + my (@haploList, %haploID, %haploSeqs); + $maxLength = 0; + + seqRead($seqFile); + if (not @accNum) + { + print "\tERROR: Could not read in sequences from file $seqFile; skipping to next file\n"; + next; + } + + # Process for printing + my $stopCodonCount; + $seqType = "protein" if ($globalGenCode and $translateSeqs); + foreach my $seq (@accNum) + { + $sequence{$seq} =~ s/\~/-/g if ($gapChange); + $sequence{$seq} =~ s/R|Y|M|K|S|W|H|B|V|D/N/ig if ($ambigChange and $seqType eq "nucleotide"); + + $finalSeq{$seq} = $sequence{$seq}; + $geneticCode{$seq} = $globalGenCode if ($globalOverride); + if ($globalGenCode and $translateSeqs) + { + $finalSeq{$seq} = translate($sequence{$seq}, $geneticCode{$seq}); + $stopCodonCount++ if (($finalSeq{$seq} =~ tr/\*//) > 2); # Check for stop codons + } + $maxLength = length($finalSeq{$seq}) if (length($finalSeq{$seq}) > $maxLength); + $nameLabel{$seq} =~ s/\W+/_/g if ($nameClean); # Clean sequence labels of non-alphanumerics + } + printf "\n\tWARNING: $stopCodonCount of %s sequences had more than two stop codons; check for\n\t\t1) proper reading frame,\n\t\t2) proper genetic code, or\n\t\t3) that sequences are coding DNA\n", scalar(@accNum) if ($globalGenCode and $stopCodonCount); + + # Add gaps to end of any sequence less than maximum length + foreach my $seq (@accNum) + { + $finalSeq{$seq} .= "-" x ($maxLength - length($sequence{$seq})); + $sequence{$seq} = $finalSeq{$seq}; + } + + # Convert flanking gaps to Ns if desired + 
if ($flankGap) + { + foreach my $seq (@accNum) + { + if ($finalSeq{$seq} =~ /^(\-+)/) + { + my $startGap = $1; + my $startN = "N" x length($startGap); + $finalSeq{$seq} =~s/^$startGap/$startN/; + } + if ($finalSeq{$seq} =~ /(\-+)$/) + { + my $endGap = $1; + my $endN = "N" x length($endGap); + $finalSeq{$seq} =~s/$endGap$/$endN/; + } + } + } + + # Determine haplotypes as needed + if ($haploTyping) + { + foreach my $entry (@accNum) + { + $haploID{$entry} = 0; + + foreach my $haploType (@haploList) + { + if ($finalSeq{$entry} eq $finalSeq{$haploType}) # Matches existing haplotype; add to list + { + $haploID{$entry} = $haploType; + push @{$haploSeqs{$haploType}}, $entry; + last; + } + } + + if (not $haploID{$entry}) # No match to existing haplotype; define new haplotype + { + $haploID{$entry} = "haplo" . (scalar(@haploList) + 1); + push @haploList, $haploID{$entry}; + $nameLabel{$haploID{$entry}} = $haploID{$entry}; + $finalSeq{$haploID{$entry}} = $finalSeq{$entry}; + push @{$haploSeqs{$haploID{$entry}}}, $entry; + } + } + + undef @accNum; + @accNum = @haploList; + + open (HAPLO, ">$haploFile") or die "Cannot print to file $haploFile\n"; + foreach my $haploType (@haploList) + { + print HAPLO "$haploType:"; + print HAPLO "\t$nameLabel{$_}" foreach @{$haploSeqs{$haploType}}; + print HAPLO "\n"; + } + close HAPLO; + } + + # Print results! + print "\nPrinting results ...\n"; + @accNum = sort { $nameLabel{$a} cmp $nameLabel{$b} } keys %nameLabel if ($seqOrder eq "alphabetical" and not $haploTyping); + + # Print full data set + $ntax = scalar(@accNum); + seqPrint($seqFile); + + # Print jackknifed data sets + if ($jackknife) + { + my $delseqCount = 0; + foreach my $seq (@accNum) + { + $deletedSeq{$seq} = 1; + $delseqCount++; + + # Change output file names + $fastaOut = $dataSource . "_jack$delseqCount.fasta"; + $nexusOut = $dataSource . "_jack$delseqCount.nex"; + $phylipOut = $dataSource . "_jack$delseqCount.phylip"; + $sealOut = $dataSource . 
"_jack$delseqCount.seal";

					$ntax = scalar(@accNum) - 1;
					seqPrint($seqFile);

					$deletedSeq{$seq} = 0;	# Reset deleted sequence
				}
			}

			# Print data set partitions
			if (@charsetList and $seqSplit)
			{
				my $delCount = 0;
				foreach my $partition (@charsetList)
				{
					$delCount = 0;
					# Change output file names
					$fastaOut = $dataSource . "_$partition.fasta";
					$nexusOut = $dataSource . "_$partition.nex";
					$phylipOut = $dataSource . "_$partition.phylip";
					$sealOut = $dataSource . "_$partition.seal";

					# Restrict sequence to partition limits
					foreach my $seq (@accNum)
					{
						$deletedSeq{$seq} = 0;	# Reset all deleted sequences
						$finalSeq{$seq} = substr($sequence{$seq}, $charsetStart{$partition} - 1, $charsetEnd{$partition} - $charsetStart{$partition} + 1);
						# Check that sequence remains informative
						unless ($finalSeq{$seq} =~ /a/i or $finalSeq{$seq} =~ /c/i or $finalSeq{$seq} =~ /g/i or $finalSeq{$seq} =~ /t/i)
						{
							$delCount++;
							$deletedSeq{$seq} = 1;
						}
					}
					$ntax = scalar(@accNum) - $delCount;
					$maxLength = $charsetEnd{$partition} - $charsetStart{$partition} + 1;
					seqPrint($seqFile);
				}
			}
	}

exit(0);

### Subroutines used in the program

sub setLineBreak	# Check line breaks of input files and set input record separator accordingly
{
	# Purpose:	Detect the line-ending convention of a file from its first
	#		record and set the global input record separator ($/) to
	#		match, so that later reads of the same file split lines
	#		correctly.
	# Argument:	name of the file to inspect.
	# Effect:	$/ is left as "\r\n" (DOS), "\r" (Mac) or "\n" (Unix, also
	#		the result for an empty file).  The assignment is deliberately
	#		global (not local): callers read the file after this returns.
	# Dies:		if the file cannot be opened.
	my $inFile = shift;

	# Start from a known separator so the sample below is read the same way
	# regardless of what an earlier call left in $/.
	$/ = "\n";

	open (my $in, "<", $inFile) or die "Cannot open $inFile to check form of line breaks.\n";
	my $firstRecord = <$in>;	# Only the first record is ever needed to decide
	close $in;

	# Empty file: nothing to classify; keep the Unix default silently
	return unless (defined $firstRecord);

	if ($firstRecord =~ /\r\n/)
	{
		print "\tDOS line breaks detected ...\n" if ($verbose);
		$/ = "\r\n";
	}
	elsif ($firstRecord =~ /\r/)	# A file with bare \r has no \n, so the whole file arrives as one record
	{
		print "\tMac line breaks detected ...\n" if ($verbose);
		$/ = "\r";
	}
	else
	{
		print "\tUnix line breaks detected ...\n" if ($verbose);
		$/ = "\n";
	}
	return;
}

sub seqRead
{
		my $seqFile = shift;
		undef %sequence;

		print "\nReading in sequence data from file $seqFile (type is $inputType) ...\n" if ($inputType);
		setLineBreak($seqFile);
		open (SEQ, "<$seqFile") or die "Cannot open file
containing sequences, $seqFile\n"; + my ($header, $tempAcc, $tempName, $tempSeq); + my $fastaAcc; + my (%nexusSpecies, %nexusAcc, $nexusRead, $commentFlag); + my ($phylipLineCount, $phylipTaxa, $phylipChars, %phylipSeq); + my $sealCode; + my ($sealDelFlag, $owner) = (0, 0); + my ($gbAcc, $gbRead); + my $macBlock = 0; + my %accCount; + + while () + { + chomp; + my $lineRead = $_; + next unless ($lineRead); + + # Autodetect sequence format + if (not $inputType) + { + $inputType = "fasta" if ($lineRead =~ /^>/); + $inputType = "nexus" if ($lineRead =~ /\#nexus/i); + $inputType = "phylip" if ($lineRead =~ /^\s*\d+\s+\d+/); + $inputType = "Se-Al" if ($lineRead =~ /^\s*Database=\{/i); + $inputType = "GenBank" if ($lineRead =~ /^\s*LOCUS/); + print "\nReading in sequence data from file $seqFile (type determined to be $inputType) ...\n" if ($inputType); + } + + if ($inputType eq "nexus") + { + # Check if charset statement present + if ($lineRead =~ /charset/i) + { + $lineRead =~ s/\s+//g; + $lineRead =~ s/;$//; + + my ($charsetName, $charsetBounds) = split('=', $lineRead); + $charsetName =~ s/charset//i; + push @charsetList, $charsetName; + if ($charsetBounds =~ /(\d+)-*(\d*)/) + { + $charsetStart{$charsetName} = $1; + if ($2) + { + $charsetEnd{$charsetName} = $2; + } + else + { + $charsetEnd{$charsetName} = $1; + } + } + } + + # Otherwise block out MacClade / PAUP blocks + $macBlock = 1 if ($lineRead =~ /begin macclade;/i or $lineRead =~ /begin paup;/i); + $macBlock = 0 if ($macBlock and $lineRead =~ /end;/i); + next if ($macBlock); + + # Otherwise only read in data lines or charset statements + if ($lineRead =~ /^\s*matrix/i) + { + $nexusRead = 1; + next; + } + $nexusRead = 0 if ($lineRead =~ /;\s*$/); + next unless ($nexusRead); + + # Remove MacClade sequence lengths + $lineRead =~ s/\[\d+\]$//; + + $commentFlag = 1 if ($lineRead =~ /\[/); + if ($lineRead =~ /\]/) + { + $commentFlag = 0; + next; + } + next if ($commentFlag); + + next unless ($lineRead =~ /a/i or 
$lineRead =~ /c/i or $lineRead =~ /g/i or $lineRead =~ /t/i or $lineRead =~ /n/i or $lineRead =~ /\?/ or $lineRead =~ /\-/); + + # Clean up input line + $lineRead =~ s/^\s+//; + $lineRead =~ s/\'//g; + + my @nexusLine = split(/\s+/, $lineRead); + my $species = shift(@nexusLine); + $species =~ s/\s+/_/g; + $species =~ s/\_+/_/g; + my $seq = join('', @nexusLine); + $seq =~ s/\s+//g; + $seqType = "protein" if ($seq =~ /E/i or $seq =~ /Q/i or $seq =~ /I/i or $seq =~ /L/i or $seq =~ /F/i or $seq =~ /P/i); + if (not defined $nexusSpecies{$species}) + { + $nexusSpecies{$species} = 1; + $seqCount++; + $nexusAcc{$species} = "tAlign_".$seqCount; + push @accNum, $nexusAcc{$species}; + $nameLabel{$nexusAcc{$species}} = $species; + $sequence{$nexusAcc{$species}} = uc($seq); + $geneticCode{$nexusAcc{$species}} = $globalGenCode; + } + else # Sequences are in interleaved format; append sequence + { + $sequence{$nexusAcc{$species}} .= uc($seq); + } + } + + if ($inputType eq "fasta") + { + if ($lineRead =~/^\s*>/) + { + my $species; + $seqCount++; + (my $tempSpecies = $lineRead) =~ s/^\s*>//; + + if ($tempSpecies =~ /^Mit\.\s+/) # Entry comes from European RNA project + { + $tempSpecies =~ s/^Mit\.\s+//i; # To fix entries from European RNA project + my @speciesInfo = split(/\s+/, $tempSpecies); + $species = join('_', $speciesInfo[0], $speciesInfo[1]); + if (defined $speciesInfo[2]) + { + $fastaAcc = $speciesInfo[2]; + $accCount{$fastaAcc}++; + if ($accCount{$fastaAcc} > 1) + { + print "\nWARNING: Accession number $fastaAcc used more than once"; + print "; skipping this and subsequent entries" if ($skipDuplicates); + print "\n"; + } + } + else + { + $fastaAcc = "tAlign_".$seqCount; + $accCount{$fastaAcc}++; + } + } + else + { + my @speciesLine = split(/\s+/, $tempSpecies); + if ($speciesLine[$#speciesLine] =~ /^\(?\w+\d+\)?$/ and scalar(@speciesLine) > 1) # Check whether last entry is an accession number + { + $fastaAcc = pop (@speciesLine); + $fastaAcc =~ s/^\(//g; + $fastaAcc =~ 
s/\)$//g; + $accCount{$fastaAcc}++; + if ($accCount{$fastaAcc} > 1) + { + print "\nWARNING: Accession number $fastaAcc used more than once"; + if ($skipDuplicates) + { + print "; skipping this and subsequent entries\n"; + } + else + { + print "; assigning temporary accession number\n"; + $fastaAcc = "tAlign_" . $seqCount; + } + } + } + else + { + $fastaAcc = "tAlign_".$seqCount; + $accCount{$fastaAcc}++; + } + $species = join('_', @speciesLine); + $species = "Sequence_".$seqCount if ($species eq ""); + } + push @accNum, $fastaAcc unless ($accCount{$fastaAcc} > 1 and $skipDuplicates); + $geneticCode{$fastaAcc} = $globalGenCode; + ($nameLabel{$fastaAcc} = $species) =~ s/\s+/_/; + $nameLabel{$fastaAcc} =~ s/\_+/_/; + } + else + { + next if ($accCount{$fastaAcc} > 1 and $skipDuplicates); + $lineRead =~ s/\s+//g; + $seqType = "protein" if ($lineRead =~ /E/i or $lineRead =~ /Q/i or $lineRead =~ /I/i or $lineRead =~ /L/i or $lineRead =~ /F/i or $lineRead =~ /P/i); + $sequence{$fastaAcc} .= uc($lineRead); + } + } + + if ($inputType eq "Se-Al") + { + my $header; + $sealDelFlag = 1 if ($lineRead =~/MCoL/); # Se-Al sometimes places deleted species at end of file; do not read in remainder of file + next if ($sealDelFlag == 1); + next unless ($lineRead =~/NumSites/i or $lineRead =~/Owner/i or $lineRead =~/Name/i or $lineRead =~/Accession/i or $lineRead =~/Sequence/i or $lineRead =~/GeneticCode/i or $lineRead =~/Frame/i); + if ($lineRead =~/Owner\s*\=\s*(\d+)/i) + { + $owner = $1; + } + if ($lineRead =~/Accession/i and $owner == 2) + { + $seqCount++; + if ($lineRead =~ /null/ or $lineRead =~ /\"\"/) + { + $tempAcc = "tAlign_" . 
$seqCount; + $accCount{$tempAcc}++; + } + else + { + ($header, $tempAcc) = split (/=/, $lineRead); + $tempAcc =~ s/\"//g; + $tempAcc =~ s/;//g; + $accCount{$tempAcc}++; + if ($accCount{$tempAcc} > 1) + { + print "\nWARNING: Accession number $fastaAcc used more than once"; + if ($skipDuplicates) + { + print "; skipping this and subsequent entries\n"; + } + else + { + print "; assigning temporary accession number\n"; + $tempAcc = "tAlign_" . $seqCount; + } + print "\n"; + } + } + push @accNum, $tempAcc unless ($accCount{$tempAcc} > 1 and $skipDuplicates); + } + if ($lineRead =~/Name/i and $owner == 2) + { + ($header, $tempName) = split (/=/, $lineRead); + $tempName =~ s/\"//g; + $tempName =~ s/\s*;//g; + $tempName =~ s/\s+/_/g; + $tempName =~ s/\_+/_/g; + } + if ($lineRead =~/GeneticCode=(\d)/i and $owner == 2) + { + $sealCode = $1; + $geneticCode{$tempAcc} = $seal2gb{$sealCode}; + } + if ($lineRead =~/Sequence/i and $owner == 2) + { + next if ($accCount{$tempAcc} > 1 and $skipDuplicates); + ($header, $tempSeq) = split (/=/, $lineRead); + $tempSeq =~ s/\"//g; + $tempSeq =~ s/;//g; + $tempSeq =~ s/\s+//g; + $nameLabel{$tempAcc} = $tempName; + $sequence{$tempAcc} = uc($tempSeq); + $seqType = "protein" if ($tempSeq =~ /E/i or $tempSeq =~ /Q/i or $tempSeq =~ /I/i or $tempSeq =~ /L/i or $tempSeq =~ /F/i or $tempSeq =~ /P/i); + } + if ($lineRead =~/Frame=(\d)/i) # Correct for reading frame + { + my $readingFrame = $1; + $sequence{$tempAcc} = "--" . $sequence{$tempAcc} if ($readingFrame == 2); + $sequence{$tempAcc} = "-" . 
$sequence{$tempAcc} if ($readingFrame == 3); + } + } + + if ($inputType eq "phylip") + { + if ($lineRead =~ /^\s*(\d+)\s+(\d+)/) + { + $phylipTaxa = $1; + $phylipChars = $2; + $phylipLineCount = 0; + } + else + { + $phylipLineCount++; + + $lineRead =~ s/\s//g; + + $phylipSeq{$phylipLineCount} .= $lineRead; + + $phylipLineCount = 0 if ($phylipLineCount == $phylipTaxa); + } + } + + if ($inputType eq "GenBank") + { + # Get species name and accession number + # Pure GenBank format + $gbAcc = $1 if ($lineRead =~ /^\s*ACCESSION\s+(\w+\d+)/); + if ($lineRead =~ /^\s*ORGANISM\s+/) + { + $seqCount++; + $gbAcc = "tAlign_" . $seqCount if (not defined $gbAcc); + $lineRead =~ s/^\s+//; + my @orgLine = split(/\s+/, $lineRead); + my $header = shift (@orgLine); + $nameLabel{$gbAcc} = join('_', @orgLine); + $accCount{$gbAcc}++; + } + # BioEdit format + if ($lineRead =~ /^\s*TITLE/ and not defined ($gbAcc)) + { + $seqCount++; + $gbAcc = "tAlign_" . $seqCount; + my @accLine = split (/\s+/, $lineRead); + $gbAcc = $1 if ($accLine[2] =~ /^\((\w+\d+)\)/); + $nameLabel{$gbAcc} = $accLine[1]; + $accCount{$gbAcc}++; + } + + if ($lineRead =~ /^\s*ORIGIN/) + { + if ($accCount{$gbAcc} > 1) + { + print "\nWARNING: Accession number $gbAcc used more than once"; + if ($skipDuplicates) + { + print "; skipping this and subsequent entries\n"; + next; + } + else + { + print "; assigning temporary accession number\n"; + $gbAcc = "tAlign_" . 
$seqCount; + $gbRead = 1; + } + } + else + { + $gbRead = 1; + $seqCount++; + } + next; + } + + if ($lineRead =~ /^\s*\/\//) # End of accession; process + { + $gbRead = 0; + next unless ($gbAcc); + + push @accNum, $gbAcc unless ($accCount{$gbAcc} > 1); + $geneticCode{$gbAcc} = $globalGenCode; + $nameLabel{$gbAcc} =~ s/\s+/_/g; + $nameLabel{$gbAcc} =~ s/\_+/_/g; + $sequence{$gbAcc} =~ s/\d//g; + $sequence{$gbAcc} =~ s/\s+//g; + $sequence{$gbAcc} =~ s/\~/-/g if ($gapChange); + undef $gbAcc; + } + + next unless ($gbRead); + + if ($gbRead and $lineRead =~ /^\s+\d+/) + { + $sequence{$gbAcc} .= uc($lineRead); + } + } + } + close SEQ; + + if ($inputType eq "phylip") # Postprocess input to derive taxon names and sequence; accounts for both sequential and extended formatting + { + for (my $i = 1; $i <= $phylipTaxa; $i++) + { + my $phylipAcc = "tAlign_" . $i; + + push @accNum, $phylipAcc; + $geneticCode{$phylipAcc} = $globalGenCode; + + # Derive taxon name and sequence + $sequence{$phylipAcc} = uc(substr($phylipSeq{$i}, 0 - $phylipChars)); + $seqType = "protein" if ($sequence{$phylipAcc} =~ /E/i or $sequence{$phylipAcc} =~ /Q/i or $sequence{$phylipAcc} =~ /I/i or $sequence{$phylipAcc} =~ /L/i or $sequence{$phylipAcc} =~ /F/i or $sequence{$phylipAcc} =~ /P/i); + + $nameLabel{$phylipAcc} = substr($phylipSeq{$i}, 0, length($phylipSeq{$i}) - $phylipChars); + $nameLabel{$phylipAcc} =~ s/\s+/_/g; + $nameLabel{$phylipAcc} =~ s/\_+/_/g; + } + } + } + +sub seqPrint + { + my $seqFile = shift; + + $interleaveLength = $maxLength if ($outFormat eq "sequential"); + + # Print fasta-formatted file + if ($fastaPrint) + { + print "\tWriting to fasta-formatted file $fastaOut ...\n"; + open (FASTA, ">$fastaOut") or die "Cannot open fasta file for aligned DNA sequences, $fastaOut"; + foreach my $entry (@accNum) + { +# next if ($deletedSeq{$entry} or not defined $sequence{$entry}); + print FASTA ">$nameLabel{$entry}"; + print FASTA "\t($entry)" unless ($entry =~ /^tAlign/); + print FASTA "\n"; + + 
# Print sequence + my $fastaSeq = $finalSeq{$entry}; + for (my $breakpoint = 0; $breakpoint <= length($fastaSeq); $breakpoint += $fastaLength) + { + print FASTA substr($fastaSeq, $breakpoint, $fastaLength) . "\n"; + } +# my $fastaSeq = $finalSeq{$entry}; +# my $breakPoint = 80; +# my $breakCount = 0; +# until ($breakPoint > length($fastaSeq)) +# { +# $breakCount++; +# my $replaceString = "\n" . substr($fastaSeq, $breakPoint, 1); +# substr($fastaSeq, $breakPoint, 1) = $replaceString; +# $breakPoint += 81; # Latter accounts for addition of \n to string +# } +# print FASTA "\n$fastaSeq\n"; + } + close FASTA; + } + + # Print nexus-formatted file + if ($nexusPrint) + { + print "\tWriting to nexus file $nexusOut"; + print " in interleaved format" if ($outFormat eq "interleave"); + print " ...\n"; + open (NEX, ">$nexusOut") or die "Cannot open nexus file for aligned DNA sequences, $nexusOut"; + print NEX "#NEXUS\n\n"; + print NEX "[File created from $seqFile using $perlScript v$version on ".localtime()."]\n\n"; + print NEX "begin data;\n"; + print NEX "\tdimensions ntax = $ntax nchar = $maxLength;\n"; + print NEX "\tformat datatype = $seqType gap = - missing = ?"; + print NEX " interleave" if ($outFormat eq "interleave"); + print NEX ";\n\n"; + print NEX "\tmatrix\n\n"; + for (my $interleaveCount = 1; $interleaveCount <= ceil($maxLength / $interleaveLength); $interleaveCount++) + { + my $seqStart = (($interleaveCount - 1) * $interleaveLength) + 1; + my $seqEnd = $interleaveCount * $interleaveLength; + $seqEnd = $maxLength if ($maxLength <= $seqEnd); + my $printLength = $seqEnd - $seqStart + 1; + + foreach my $entry (@accNum) + { + next if ($deletedSeq{$entry}); + my $nexusName = $nameLabel{$entry}; + $nexusName = $entry if ($accPrint); + if ($nexusName =~ /\W/) + { + print NEX "'$nexusName'"; + } + else + { + print NEX "$nexusName"; + } + + my $printSeq = substr($finalSeq{$entry}, $seqStart - 1, $printLength); + if ($outFormat eq "interleave") # Add gaps every 10 elements 
in sequence + { + for (my $gapAdd = 100; $gapAdd >= 10; $gapAdd-=10) + { + next if $gapAdd > $printLength; + my $bp = substr($printSeq, $gapAdd - 1, 1) . " "; + substr($printSeq, $gapAdd - 1, 1, $bp); + } + } + print NEX "\t$printSeq\n"; + } + print NEX "\n" if ($interleaveCount * $interleaveLength <= $maxLength); + } +# foreach my $entry (@accNum) +# { +# next if ($deletedSeq{$entry}); +# if ($nameLabel{$entry} =~ /\W/) +# { +# print NEX "'$nameLabel{$entry}'"; +# } +# else +# { +# print NEX "$nameLabel{$entry}"; +# } +# print NEX "\t$finalSeq{$entry}\n"; +# } + print NEX "\t;\nend;\n"; + close NEX; + + if (-e "/Developer/Tools/SetFile") + { + system ("/Developer/Tools/SetFile -t 'TEXT' $nexusOut"); + system ("/Developer/Tools/SetFile -c 'PAUP' $nexusOut"); + } + } + + # Print phylip-formatted file (on demand) + if ($phylipTradPrint or $phylipExtPrint) + { + my $maxTaxLength = 50; + $maxTaxLength = 10 if ($phylipTradPrint); + my $blankName = " " x $maxTaxLength; + my %shortNameCount; + + print "\tWriting to phylip file $phylipOut ...\n"; + open (PHYLIP, ">$phylipOut") or die "Cannot open phylip file for aligned DNA sequences, $phylipOut"; + print PHYLIP " $ntax $maxLength\n"; + for (my $interleaveCount = 1; $interleaveCount <= ceil($maxLength / $interleaveLength); $interleaveCount++) + { + my $seqStart = (($interleaveCount - 1) * $interleaveLength) + 1; + my $seqEnd = $interleaveCount * $interleaveLength; + $seqEnd = $maxLength if ($maxLength <= $seqEnd); + my $printLength = $seqEnd - $seqStart + 1; + +# foreach my $entry (@accNum) +# { +# my $trimmedName = substr($nameLabel{$entry}, 0, $maxTaxLength); +# $shortNameCount{trimmedName} = 0; +# } + + foreach my $entry (@accNum) + { + next if ($deletedSeq{$entry}); + + my $phylipName = $nameLabel{$entry}; + $phylipName = $entry if ($accPrint); + + # Print name label as appropriate; also check label and adjust to proper length if needed + if ($interleaveCount == 1) + { + if (length($phylipName) < $maxTaxLength) + { + 
$shortNameCount{$phylipName}++; + $phylipName .= " " x ($maxTaxLength - length($phylipName)) if ($phylipTradPrint); # Pad end of name with spaces as needed + } + else + { + my $trimmedName = substr($phylipName, 0, $maxTaxLength); + $shortNameCount{$trimmedName}++; + if ($shortNameCount{$trimmedName} > 1) # Check for duplicates among shortened names and make unique by adding numbers + { + $phylipName = substr($phylipName, 0, $maxTaxLength - length($shortNameCount{$trimmedName})); + $phylipName .= $shortNameCount{$trimmedName}; + $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed + } + else + { + $phylipName = $trimmedName; + } + } + print PHYLIP "$phylipName"; + print PHYLIP " " if ($phylipExtPrint); + } + else + { + print PHYLIP "$blankName"; + } + + # Print sequence + my $printSeq = substr($finalSeq{$entry}, $seqStart - 1, $printLength); + if ($outFormat eq "interleave") # Add gaps every 10 elements in sequence + { + for (my $gapAdd = 100; $gapAdd >= 10; $gapAdd-=10) + { + next if $gapAdd > $printLength; + my $bp = substr($printSeq, $gapAdd - 1, 1) . 
" "; + substr($printSeq, $gapAdd - 1, 1, $bp); + } + } + print PHYLIP "$printSeq\n"; + } + print PHYLIP "\n" if ($interleaveCount * $interleaveLength <= $maxLength); + +# foreach my $entry (@accNum) +# { +# next if ($deletedSeq{$entry}); +# +# my $phylipName = $nameLabel{$entry}; +# +# # Check name label and adjust to proper length if needed +# if (length($phylipName) < $maxTaxLength) +# { +# $shortNameCount{$phylipName}++; +# $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed +# } +# else +# { +# my $trimmedName = substr($phylipName, 0 , $maxTaxLength); +# $shortNameCount{$trimmedName}++; +# if ($shortNameCount{$trimmedName} > 1) # Check for duplicates among shortened names and make unique by adding numbers +# { +# $phylipName = substr($phylipName, 0, $maxTaxLength - length($shortNameCount{$trimmedName})); +# $phylipName .= $shortNameCount{$trimmedName}; +# $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed +# } +# else +# { +# $phylipName = $trimmedName; +# } +# } +# +# print PHYLIP "$phylipName"; +# print PHYLIP " " if ($phylipExtPrint); +# print PHYLIP "$finalSeq{$entry}\n"; + } + close PHYLIP; + } + + # Print Se-Al-formatted file (on demand) + if ($sealPrint) + { + print "\tWriting to Se_Al file $sealOut ...\n"; + open (SEAL, ">$sealOut") or die "Cannot open Se-Al file for aligned DNA sequences, $sealOut\n"; + print SEAL "Database={\n"; + print SEAL "\tID='MLst';\n"; + print SEAL "\tOwner=null;\n"; + print SEAL "\tName=null;\n"; + print SEAL "\tDescription=null;\n"; + print SEAL "\tFlags=0;\n"; + print SEAL "\tCount=2;\n"; + print SEAL "\t{\n\t\t{\n"; + + print SEAL "\t\t\tID='PAli';\n"; + print SEAL "\t\t\tOwner=1;\n"; + print SEAL "\t\t\tName=\"$seqFile\";\n"; + print SEAL "\t\t\tDescription=null;\n"; + print SEAL "\t\t\tFlags=0;\n"; + print SEAL "\t\t\tNumSites=$maxLength;\n"; + print SEAL "\t\t\tType="; + if ($seqType eq "nucleotide") + { + print SEAL 
"\"Nucleotide\""; + } + else + { + print SEAL "\"Amino Acid\""; + } + print SEAL ";\n"; + print SEAL "\t\t\tFeatures=null;\n"; + print SEAL "\t\t\tColourMode="; + if ($seqType eq "nucleotide") + { + print SEAL "1"; + } + else + { + print SEAL "2"; + } + print SEAL ";\n"; + print SEAL "\t\t\tLabelMode=0;\n"; + print SEAL "\t\t\ttriplets=false;\n"; + print SEAL "\t\t\tinverse=true;\n"; + print SEAL "\t\t\tCount=$ntax;\n"; + print SEAL "\t\t\t{\n"; + + my $i = 0; + foreach my $sequence (@accNum) + { + next if ($deletedSeq{$sequence}); + $i++; + print SEAL "\t\t\t\t{\n"; + print SEAL "\t\t\t\t\tID='PSeq';\n"; + print SEAL "\t\t\t\t\tOwner=2;\n"; + print SEAL "\t\t\t\t\tName=\"$nameLabel{$sequence}\";\n"; + print SEAL "\t\t\t\t\tDescription=null;\n"; + print SEAL "\t\t\t\t\tFlags=0;\n"; + print SEAL "\t\t\t\t\tAccession="; + if ($sequence =~/^tAlign_/) + { + print SEAL "null;\n"; + } + else + { + print SEAL "$sequence;\n"; + } + if ($seqType eq "nucleotide") + { + print SEAL "\t\t\t\t\tType=\"DNA\";\n"; + } + else + { + print SEAL "\t\t\t\t\tType=\"AA\";\n"; + } + print SEAL "\t\t\t\t\tLength=".length($finalSeq{$sequence}).";\n"; + print SEAL "\t\t\t\t\tSequence=\"$finalSeq{$sequence}\";\n"; + if (defined $geneticCode{$sequence} and $geneticCode{$sequence} != 0) + { + print SEAL "\t\t\t\t\tGeneticCode=$gb2seal{$geneticCode{$sequence}};\n"; + } + else + { + print SEAL "\t\t\t\t\tGeneticCode=-1;\n"; # Default for Se-Al is non-coding + } + print SEAL "\t\t\t\t\tCodeTable=null;\n"; + print SEAL "\t\t\t\t\tFrame=1;\n"; + print SEAL "\t\t\t\t\tFeatures=null;\n"; + print SEAL "\t\t\t\t\tParent=null;\n"; + print SEAL "\t\t\t\t\tComplemented=false;\n"; + print SEAL "\t\t\t\t\tReversed=false;\n"; + print SEAL "\t\t\t\t}"; + print SEAL "," unless ($i == $ntax); + print SEAL "\n"; + } + + print SEAL "\t\t\t};\n"; + print SEAL "\t\t},\n"; + print SEAL "\t\t{\n"; + print SEAL "\t\t\tID='MCoL';\n"; + print SEAL "\t\t\tOwner=1;\n"; + print SEAL "\t\t\tName=\"Genetic Codes\";\n"; + 
print SEAL "\t\t\tDescription=\"Custom Genetic Codes\";\n";
			print SEAL "\t\t\tFlags=0;\n";
			print SEAL "\t\t\tCount=0;\n";
			print SEAL "\t\t}\n";
			print SEAL "\t};\n";
			print SEAL "};\n";
			close SEAL;

			if (-e "/Developer/Tools/SetFile")
			{
				system ("/Developer/Tools/SetFile -t 'TEXT' $sealOut");
				system ("/Developer/Tools/SetFile -c 'SEAL' $sealOut");
			}
		}
	}

sub geneticCoder	# Create translation tables for all genetic codes
{
	# Purpose:	Fill the file-level %DNAtoAA table so that
	#		$DNAtoAA{code}{codon} gives the one-letter amino acid (or "*"
	#		for stop, "?" for ambiguous) for every codon under every
	#		supported genetic code.
	# Arguments:	none.  Returns nothing.
	# Reads:	@ambigList and %constitNTlist, both defined outside this
	#		subroutine (presumably the ambiguity symbols and, for each, the
	#		plain nucleotides it stands for -- confirm at their definition).

	# One 64-character string per genetic code; character N is the amino acid
	# for the Nth codon when codons are enumerated with bases in the order
	# T, C, A, G at each position (see the nested loops below).
	# NOTE: this lexical hash is local to the subroutine and is distinct from
	# the per-sequence %geneticCode used elsewhere in the file.
	my %geneticCode = ('1' => 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
			   '2' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG',
			   '3' => 'FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
			   '4' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
			   '5' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG',
			   '6' => 'FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
			   '9' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
			   '10' => 'FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
			   '11' => 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
			   '12' => 'FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
			   '13' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG',
			   '14' => 'FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
			   '15' => 'FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
			   '16' => 'FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
			   '21' => 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
			   '22' => 'FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
			   '23' => 'FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG');

	my @bases = qw (T C A G);	# Enumeration order assumed by the strings above

	# Iterate over the table's own keys (numeric order) so the list of codes
	# cannot drift out of step with the table.
	foreach my $code (sort { $a <=> $b } keys %geneticCode)
	{
		# Establish basic translation table for each genetic code
#		print "\nEstablishing \"$transTable{$code}\" genetic code ...\n" if ($debug);
		my $position = 0;
		foreach my $base1 (@bases)
		{
			foreach my $base2 (@bases)
			{
				foreach my $base3 (@bases)
				{
					my $codon = $base1.$base2.$base3;
					$DNAtoAA{$code}{$codon} = substr($geneticCode{$code}, $position, 1);
#					print "\t$codon = $DNAtoAA{$code}{$codon}\n" if ($debug);
					$position++;
				}
			}
		}

		# Extend translation table to account for ambiguity codes (note: does not account for gaps)
#		print "\nExtending translation table to account for ambiguity codes ...\n" if ($debug);
		foreach my $firstPos (@ambigList)
		{
			foreach my $secondPos (@ambigList)
			{
				foreach my $thirdPos (@ambigList)
				{
					my $codon = $firstPos.$secondPos.$thirdPos;
					next if (defined $DNAtoAA{$code}{$codon});	# Already known (plain codon or handled earlier)

					# Expand the ambiguous codon into every concrete codon it could
					# represent.  If they all encode the same amino acid the
					# ambiguous codon gets that amino acid; at the first
					# disagreement it is marked "?" and the search stops.
					my $refAA = "";
					foreach my $firstNT (@ {$constitNTlist{$firstPos} })
					{
						last if (defined $DNAtoAA{$code}{$codon});	# Disagreement already recorded
						foreach my $secondNT (@ {$constitNTlist{$secondPos} })
						{
							last if (defined $DNAtoAA{$code}{$codon});
							foreach my $thirdNT (@ {$constitNTlist{$thirdPos} })
							{
								my $testCodon = $firstNT.$secondNT.$thirdNT;
								if (not $refAA)
								{
									$refAA = $DNAtoAA{$code}{$testCodon};
								}
								else
								{
									if ($DNAtoAA{$code}{$testCodon} ne $refAA)
									{
										$DNAtoAA{$code}{$codon} = "?";
										last;
									}
								}
							}
						}
					}
					$DNAtoAA{$code}{$codon} = $refAA if (not defined $DNAtoAA{$code}{$codon});
#					print "\t$codon = $DNAtoAA{$code}{$codon}\n" if ($debug);
				}
			}
		}
	}
	return;
}

sub translate	# Translate a DNA sequence to an AA sequence (note: does not account for gaps)
{
	# Arguments:	1) DNA sequence, read in frame from its first character
	#		2) key of the genetic code to use, as stored in %DNAtoAA
	# Returns:	the protein string, exactly one character per (possibly
	#		partial) codon; "" for an empty or undefined sequence.
	# A codon translates as "?" when it is incomplete, contains a gap ("-" or
	# "."), or has no entry in the selected table (unknown symbol, or unknown /
	# undefined genetic code).  Emitting "?" rather than nothing keeps residue
	# N of the output aligned with codon N of the input.
	my $DNAseq = shift;
	my $userCode = shift;

	my $protSeq = "";
	return $protSeq unless (defined $DNAseq);

	# Look the table up once; fall back to an empty table instead of letting a
	# bad code autovivify an entry in %DNAtoAA.
	my $codeTable = (defined $userCode and exists $DNAtoAA{$userCode}) ? $DNAtoAA{$userCode} : {};

	for (my $codonStart = 0; $codonStart < length($DNAseq); $codonStart += 3)
	{
		my $codon = substr($DNAseq, $codonStart, 3);	# Shorter than 3 only at the very end
		if (length($codon) == 3 and $codon !~ /[-.]/ and defined $codeTable->{$codon})
		{
			$protSeq .= $codeTable->{$codon};
		}
		else	# Incomplete, gapped or untranslatable codon
		{
			$protSeq .= "?";
		}
	}

	return $protSeq;
}

# Version history
#
#	v1.2 (August 12, 2010)
#		- added switch to actually allow fasta output to be specified (or not)
#		- sets TYPE and CREATOR codes for nexus files on Mac systems when
#		  SetFile is present
#		- can now parse GenBank formatted output (both pure GenBank and BioEdit
#		  versions)
#		- error checking: warns if same accession number used more than once
#		  and skips subsequent entries
#		- added ability to:
#			- detect sequence type of input (nucleotide vs protein) and set as
#			  appropriate in output files
#			- convert nucleotide input to proteins
#			- convert sequence input to haplotypes
#			- interleave nexus- and phylip-formatted output (request by Michael
#			  Craige) and to use inputted value to specify line lengths in fasta
#			  output
#			- output individual data partitions specified according to nexus-
#			  formatted charset statements
#			- output jackknifed data sets, each missing a single taxon
#			- clean sequence labels of non-alphanumeric characters (on by
#			  default)
#			- convert all ~ gap characters (e.g., from BioEdit) to -
#			- convert all ambiguous nucleotides to Ns
#			- change flanking gaps to Ns (indirect request by Simon Creer)
#		- fixed classic phylip output such that it now conforms 100% to the
#		  phylip guidelines
#		- improved parsing of MacClade-generated files, including blocking out
#		  MacClade and PAUP blocks when reading in nexus-formatted files
#		- fixed translation between GenBank and Se-Al genetic codes
#		- changed all instances of #nexus to #NEXUS in output files for
#		  compatibility with TNT (thanks to Douglas Chester for spotting this)
#		- minor bug fixes
#
#	v1.1 (March 2, 2006)
#		- added ability to batch convert all specified file types in working
#		  directory (use -d*)
#		- updated to seqRead module 1.1.1 (includes autodetection of sequence
#		  format)
#		- checks that necessary input file(s) exist before proceeding
#		- added GNU GPL statement
#		- sets
TYPE and CREATOR codes for Se-Al files on Mac systems when +# SetFile is present +# - minor bug fixes +# +# v1.0 (May 30, 2005) +# - initial release +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# A copy of the GNU General Public License is available at +# http://www.gnu.org/copyleft/gpl.html or by writing to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Fifth Floor, Boston, MA, 02110-1301, USA. + diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/tnt2phytab.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/tnt2phytab.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,58 @@ +#!/usr/bin/perl + +use strict; + +my $file = $ARGV[0]; + +# read file with genes +open FILE, $file or die "ERROR: Cannot open file $file\n"; +my $firstline=0; +my $datatype; +my $taxa; +while () { + my $currentinput = "$_"; + if($firstline==0){ + if($currentinput =~ m/nstates/){ + my @splitlines=split(' ',$currentinput); + $splitlines[2] =~ s/\;//; + if($splitlines[2] == 2){ + $datatype = "binary"; + }elsif($splitlines[2] > 2){ + $datatype = "multi"; + } + }else{ + die "ERROR: file does not begin with nstates line. Must be TNT file exported from MorphoBank.org"; + } + } + if($firstline==1){ + if($currentinput =~ m/xread/){ + }else{ + die "ERROR: file does not contain xread line. Must be TNT file exported from MorphoBank.org"; + } + } + if($firstline==2){ + if($currentinput =~ m/Morpho/){ + }else{ + die "ERROR: file does not contain Morphobank Comment line. 
Must be TNT file exported from MorphoBank.org"; + } + } + if($firstline==3){ + if($currentinput =~ m/\d/){ + my @splitlines=split(' ',$currentinput); + $taxa = $splitlines[1]."\n"; + }else{ + die "ERROR: file does not contain number of taxa. Must be TNT file exported from MorphoBank.org"; + } + } + if($firstline==4){ + if($currentinput =~ m/\d/){ + die "ERROR: file does not contain empty line after taxa numbers . Must be TNT file exported from MorphoBank.org"; + }else{ + } + } + if(($firstline>4)&&($firstline<(3+2+$taxa))){ + my @splitlines=split(' ',$currentinput); + print $splitlines[0]."\t".$datatype."\t".$splitlines[0]."_".$datatype."\t".$splitlines[1]."\n"; + } + $firstline++; +} diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/tnt2phytab.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/tnt2phytab.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,53 @@ + + Converts TNT text output from MorphoBank into PHYTAB format. + tnt2phytab.pl $input > out + + + + + + + + +**What it does** + +TNT2PHYTAB takes an input TNT text file generated in Morphobank and converts it to PHYTAB file format. + +------ + +**Inputs** + +TNT text file generated in MorphoBank. + +Link to MorphoBank: http://www.morphobank.org/ + +------ + +**Outputs** + +PHYTAB file format. Description: http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +------ + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the +osiris_phylogenetics site at bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a +publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Information about MorphoBank is here. 
+ +http://www.morphobank.org/index.php/About/Index + + + diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/uniprotfasta2phytab.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/uniprotfasta2phytab.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,38 @@ +#!/usr/bin/perl -w + +use strict; + +use FindBin; +use lib "$FindBin::Bin/lib"; +use Bio::DB::Fasta; +use Bio::SeqIO; +use Bio::Seq; + +#inputs +my $infile=shift(@ARGV); +my $partition=shift(@ARGV); +#my $delpipes=shift(@ARGV); +my $species; + +my $seqid; +# open infile fasta file +my $in_obj = Bio::SeqIO->new(-file => $infile, '-format' =>'fasta'); + +while (my $seq = $in_obj->next_seq() ) { + my $sequence = $seq->seq; + my @rawid = split(/\|/, $seq->id); + $seqid = $rawid[1]; +# $seqid = $seq->id; + + $sequence =~ s/\n//g; + $species = $seq->desc; + #species Name is after OS= + $species =~ s/.+OS\=//; + $species =~ s/.+OS\=//; + #species Name is before GN= sometimes PE= + $species =~ s/ GN\=.+//; + $species =~ s/ PE\=.+//; + $species =~ s/ /_/g; + + print $species."\t".$partition."\t".$seqid."\t".$sequence."\n"; +} diff -r 000000000000 -r 5b9a38ec4a39 phyloconversion/uniprotfasta2phytab.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phyloconversion/uniprotfasta2phytab.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,51 @@ + + Converts FASTA file downloaded from uniprot with sequences from same species and gene family to PHYTAB +format + + uniprotfasta2phytab.pl $infile $gene > $outfile + + + + + + + + + +**What it does** + +FASTA2PHYTAB takes an input FASTA file with sequences from the gene family, and extracts species name and +partition (gene family) name that will apply for all sequences. The output is a PHYTAB tabular format file. + +------ + +**Inputs** + +A FASTA file downloaded from uniprot. 
This should have a header something like + >tr|B9UM22|B9UM22_MOUSE G protein-coupled receptor 81 OS=Mus musculus GN=Gpr81 PE=2 SV=1 + +------ + +**Outputs** + +PHYTAB file format: http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +------ + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the +osiris_phylogenetics site at bitbucket.org + +------ + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a +publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/NJst.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/NJst.sh Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,4 @@ +#First call perl script which reads trees and writes +/home/galaxy/galaxy-dist/tools/Rtools/makeNJst.pl $1 $2 > Rnjst.R 2>log.txt + +R --vanilla < Rnjst.R 2>log.txt diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/NJst.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/NJst.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,40 @@ + + Estimate species tree with NJst from table of tree names and newick trees + NJst.sh $input $output + + + + + + + + + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +**What it does** +From the abstract: NJst is a "distance method for inferring unrooted species trees from a collection of unrooted gene trees. The species tree is estimated by the neighbor joining (NJ) tree built +from a distance matrix in which the distance between two species is defined as the average number of internodes between two species across gene trees, that is, average gene-tree internode +distance. The distance method is named NJst to distinguish it from the original NJ method. 
Under the coalescent model, we show that if gene trees are known or estimated correctly, the NJst +method is statistically consistent in estimating unrooted species trees. The simulation results suggest that NJst and STAR (another coalescence-based method for inferring species trees) perform +almost equally well in estimating topologies of species trees, whereas the Bayesian coalescence-based method, BEST, outperforms both NJst and STAR. Unlike BEST and STAR, the NJst method can take +unrooted gene trees to infer species trees without using an outgroup. In addition, the NJst method can handle missing data and is thus useful in phylogenomic studies in which data sets often +contain missing loci for some individuals." + +------- + +**Citations** +Liang Liu and Lili Yu. Estimating Species Trees from Unrooted Gene Trees. Syst Biol (2011) 60(5): 661-667 first published online March 28, 2011 doi:10.1093/sysbio/syr027 + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html +Please direct questions or comments regarding tool functionality to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + + + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/beast.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/beast.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,35 @@ +#!/usr/bin/python +""" +This program makes a new copy of a BEAST XML config file and changes the name of the log and tree files. +The names have to be changed since Galaxy does not support dynamic output file names. +The new XML file name must not be the same as the original!!! +Script prints unique filename. 
+ + +Usage: python beast.py XMLFILE SAVE_DIRECTORY +""" + +import sys +import time +from xml.etree.ElementTree import ElementTree + +beastDOM = ElementTree() +beastDOM.parse(sys.argv[1]) +logs = beastDOM.findall('mcmc/log') +for log in logs: + if log.get('id', False) == "fileLog": + log.set('fileName', 'data.log') + +logs = beastDOM.findall('mcmc/logTree') +for log in logs: + if log.get('id', False) == "treeFileLog": + log.set('fileName', 'data.trees') + +if len(sys.argv) > 2: + directory = sys.argv[2] +else: + directory = "" + +filename = directory + "/" + str(time.time()) + '.xml' +beastDOM.write(filename) +print filename diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/beast.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/beast.sh Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,8 @@ +#!/bin/bash + +# Helper script to change output file names then run BEAST. + +# Usage: ./beast.sh STDOUT_LOG XMLCONFIG BEAGLESSE + +newxml=$(python26 /home/galaxy/galaxy-dist/tools/osiris/phylogenies/beast.py ${2} $(pwd)) +java -jar -Xms4096m -Xmx8192m /home/galaxy/pkgs/BEAST172/lib/beast.jar -overwrite ${3} -threads 8 ${newxml} > ${1} 2>&1 diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/beast.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/beast.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,36 @@ + + Bayesian evolutionary analysis by sampling trees. + + beast + + + beast.sh $beast_stdout $xmlconfig $beagle_sse + + + + + + + + + + + + + + **BEAST v1.7.2** + + This tool takes an xml file configured as a BEAST datafile, and executes BEAST for phylogenetic analysis. + + **If the job fails** + + Make sure your XML file is formatted correctly by running BEAST on you own system first. + + **If your output is empty** + + This tool requires that your output files be named data.log and data.trees. The tool will try and rename them for you, but it may fail. + To rename them manually look in your XML for the fileLog and treeLog nodes. 
Change the fileName attribute of each node to data.log and data.trees, respectively. + + See BEAST wiki: http://beast.bio.ed.ac.uk/Main_Page + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/beast_treeannotator.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/beast_treeannotator.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,23 @@ +#!/usr/bin/perl + +my $treeannotator_path = '/home/galaxy/pkgs/BEAST172/bin/treeannotator'; + +my $input = $ARGV[0]; +my $burnin = $ARGV[1]; +my $Node_heights = $ARGV[2]; + +my $node_opt; + +if($Node_heights eq "0") { + $node_opt = "keep"; +} +elsif($Node_heights eq "1") { + $node_opt = "median"; +} +elsif($Node_heights eq "2") { + $node_opt = "mean"; +} + +my $run = qx/$treeannotator_path -heights $node_opt -burnin $burnin $input out.tre 2>log.txt/; + +print $run; diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/beast_treeannotator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/beast_treeannotator.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,51 @@ + + from BEAST 1.7.2 + +beast_treeannotator.pl $input $burnin $node_heights + + + + + + + + + + + + + + + + TreeAnnotator is a part of BEAST 1.7.2. + + http://beast.bio.ed.ac.uk/Main_Page + + This program assists in summarizing the information from a sample of trees produced by BEAST onto a single target tree. + The summary information includes the posterior probabilities of the nodes in the target tree, the posterior estimates and HPD limits of the node heights and (in the case of a relaxed molecular clock model) the rates. + + Burnin: This option allows you to select the amount of burn-in, i.e., the number of samples that will be discarded at the start of the run, so that you are only analysing the part of the trace that is in equilibrium. + + Node heights: This option allows you select how the node heights are summarised on the target tree. 
You can choose to keep the heights that the target tree has, or rescale it to reflect the posterior mean/median node heights for the clades contained in the target tree. + + http://beast.bio.ed.ac.uk/TreeAnnotator + + Citations: + + http://mbe.oxfordjournals.org/content/early/2012/02/25/molbev.mss075.abstract + Drummond AJ, Suchard MA, Xie D and Rambaut A "Bayesian phylogenetics with BEAUti and the BEAST 1.7" "Molecular Biology And Evolution" "in press" + + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/genetree_read_placement.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/genetree_read_placement.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,70 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; + +##For debugging command line pass, uncomment next +#for (my $i=0; $i < @ARGV; $i++){ +# print "Parameter #$i ".$ARGV[$i]."\n\n"; +#} +#exit; + +my $newgenes=shift(@ARGV); #0 new genes to align +my $align = shift(@ARGV); #1 alignment program to use +my $path = shift(@ARGV); #2 path to tree and gene data +my $name = shift(@ARGV); #3 name of gene family + +#If $newgenes has not hits, do not do read placement, just write tree with no hits +my $buffer; +my $lines = 0; +open(FILE, $newgenes) or die "Can't open `$newgenes': $!"; +while (sysread FILE, $buffer, 4096) { + $lines += ($buffer =~ tr/\n//); +} +close FILE; + +if($lines < 1){ + print "No hits found. 
Skipping read placement\n Tree copied to output.\n"; + system "cp $path.tre RAxML_labelledTree.EPA_TEST"; + system "cp $path.tre RAxML_originalLabelledTree.EPA_TEST"; +}else{ + + #First concatenate fasta files and align + system "cat $newgenes $path.fas > toalign.fas"; + + if($align eq "MUSCLE"){ + system "muscle -in toalign.fas -out aligned.fas"; + } + elsif($align eq "MAFFT") { + system "mafft --auto toalign.fas > aligned.fas"; + } + elsif($align eq "PRANK") { + system "prank -d=toalign.fas -o=aligned -f=fasta -F"; + system "mv aligned.2.fas aligned.fas"; + } + + #convert to phylip format, uses seqConverter.pl + system "/home/galaxy/galaxy-dist/tools/oakley_dev/seqConverterG.pl -daligned.fas -ope -Oaligned.phy"; + + system "raxmlHPC-PTHREADS-SSE3 -f v -s aligned.phy -m PROTGAMMAWAG -t $path.tre -n EPA_TEST -T 4"; +} + +#Now make tab delimited file to use in tab2trees +#open treefile to read tree line +open(TREE, "<","RAxML_labelledTree.EPA_TEST") or die "Can't open RESULT File!"; +my $finaltree; +while (){ + if($_ =~ /\;/m){ + $finaltree = $_; + chomp($finaltree); + } +} +close TREE; + +$name =~ s/ /_/g; +chomp($name); +#remove clade labels +$finaltree =~ s/\[I\d+\]//g; +open(TAB, '>treeout.tab') or die "Can't open File!"; +print TAB $name."\t".$finaltree."\n"; +close TAB; diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/genetree_read_placement.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/genetree_read_placement.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,109 @@ + + Places reads on a gene tree chosen from a menu. + + raxml + muscle + mafft + prank + + + genetree_read_placement.pl $alignment $alignprog ${tree.fields.path} "${tree.fields.name}" > $stdout 2>&1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool places unknown genes into a pre-calculated gene phylogeny. This can be used for annotating unknown genes. + +------ + +**Inputs** + +1. Input file is a file of sequences. +2. 
The user selects a program to perform multiple sequence alignment of the input genes plus a database. +3. Second input is selected from a list of gene trees, that are specified in a .loc file (see additional information below). + +------ + +**Outputs** + +RAxML writes the resulting tree file in newick text format, which can be viewed in Osiris with TreeVector (of the mothur package). In addition, if bootstrapping was selected, the individual bootstrap trees and the ML tree with support are written as separate newick files. + +------- + +**Installation Information** + +1. The command this tool runs is: +raxmlHPC-PTHREADS-SSE3 -f v -s $alignment -m PROTGAMMAWAG -t $tree -n EPA_TEST -T 8 + +Which specifies 8 concurrent threads with -T 8. Change the xml if you want to call different numbers of threads. If using pbs or other job runner, make sure universe.ini file is set to match the number of threads requested. + +2. Adding the trees that pop up on the menu and associated data used to build those trees requires +adding a genetrees.loc file in the tool-data directory of Galaxy. Each line of the loc file +specifies a data set, using three columns tab separated: + + unique_id TAB caption for menu TAB /base_name_path/ + +So, for example, if your gene family is named opsin and the path to data files is /home/galaxy/data/genetrees/. The base name is used to specify two files basename.fas and basename.tre. In this case the path directory would contain opsin.fas and opsin.tre + +opsin.tre is a newick phylogeny file and opsin.fas is a fasta file with the sequences (with the same names) used to make opsin.tre + + +Example of .loc file line + + opsin Porter Opsin Tree /home/galaxy/data/genetrees/opsin + +raxml Home Page: +http://www.exelixis-lab.org/software.html + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +S.A. Berger, D. Krompass. Stamatakis: "Performance, Accuracy and Web-Server for Evolutionary Placement of Short Sequence Reads under maximum-likelihood". In Systematic Biology 60(3):291-302, 2011. + +Stamatakis, A. (2006). RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models. Bioinformatics. +http://bioinformatics.oxfordjournals.org/content/22/21/2688.short + +See also references for MAFFT, PRANK, and MUSCLE. + + + + + + + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/long_branch_finder.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/long_branch_finder.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,36 @@ +#!/usr/bin/python -tt + +##usage: ./long_branch_finder.py > outfile +#import modules +import sys, os, numpy, re + +def read(filename): + f = open(filename) + lines = f.readlines() + for eachline in lines: + line = eachline.split('\t') + gene = line[0] + d1 = {} + d1[gene] = line[1] #matches genename with its tree + treetips = re.findall('[a-zA-Z0-9]+(?:_[a-zA-Z0-9]+)?\w*:\d+\.\d+', line[1]) #should be more flexible in recognizing speciesnames in trees + # treetips = re.findall('[a-zA-Z0-9]+(?:_[a-zA-Z0-9]+)?:\d+\.\d+', line[1]) # makes a list of items like 'spname:bl' + #treetips = re.findall('[A-Z][a-z]+_[a-z]+:\d+\.\d+', line[1]) # makes a list of items like 'spname:bl' + d2 = {} + for i in treetips: + spbl = i.split(':') + d2.update({spbl[1] : spbl[0]}) #creates link betwn taxon and its BL + tipbl = re.findall('\d+\.\d+', str(treetips)) + std = numpy.std([float(i) for i in tipbl]) +# numstd = 3*std + numstd = int(sys.argv[2])*std + for i in tipbl: + if float(i) > float(numstd): +# print d2[str(i)] + '\t' + gene + '\t' + i + print d2[str(i)] + '\t' + gene + f.close() + +def main(): + read(sys.argv[1]) + +if __name__ == '__main__': + main() diff -r 
000000000000 -r 5b9a38ec4a39 phylogenies/long_branch_finder.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/long_branch_finder.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,58 @@ + + Identifies long tips in newick trees + long_branch_finder.py $tabulartreelist $cutoffvalue > $output + + + + + + + + + +**What it does** + +Use this tool to create a list of terminal branches from one or more trees that exceed a length cut-off. + +This tool calculates the statistics for the branch-length distribution in each newick tree. A terminal branch whose length falls past a specified number of standard deviations is reported in +the output. +The list output from this tool may be used to filter/prune the original PHYTAB sequence file using the tool "Prune phytab using list". + +----- + +**Example** + +Input File (2 tab-delimited columns):: + + GeneA (((Felis_catus:0.234,Canis_familiaris:0.345):0.567,Equus_monoclonius:3.98):0.564,Mus_musculus:0.456):0.0; + GeneB (((Felis_catus:4.123,Canis_familiaris:0.035):0.234,Equus_monoclonius:1.12):0.345,Mus_musculus:0.234):0.0; + +Currently, taxon names must only include only alphanumeric characters in two fields delimted by one underscore, for example: + + Genus_species + GENUSspecies_sample2010 + +If 3 standard deviations are selected as the cut-off, then the output from the example above will yield:: + + Equus_monoclonius geneA + Felis_catus geneB + +Only terminal branches will be identified. (To remove entire clades subtended by long internal branches, see the tool "phytab_LB_pruner".) + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/makeNJst.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/makeNJst.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,87 @@ +#!/usr/bin/perl + +#This script generates an R script to call NJst +#input is a table with treenamenewick tree +use strict; +use Bio::TreeIO; + +my $filename = $ARGV[0]; +my $outfile = $ARGV[1]; +open FILE, $filename or die $!; + + +my @splitline; + +print "require(phybase);\n"; +print "genetrees<-c("; +my $counter=0; +my $tree; +while () { + chomp; + #get a line from the data file + my $currentinput = "$_"; + @splitline = split(/\t/); + my $treename= $splitline[0]; + $tree = $splitline[1]; + unless($counter==0){ + print ", "; + } + $counter++; + print "'$tree'"; +} +print ")\n"; #close genetree vector +print "taxaname<-c("; +my $spnum = tree2spList($tree); +print ")\nspname<-taxaname\n"; +print "species.structure<-matrix(0,$spnum,$spnum)\n"; +print "diag(species.structure)<-1\n"; +print "\n"; +print "result<-NJst(genetrees,taxaname,spname,species.structure)\n"; +print "write(result, file='$outfile')\n"; +close FILE; + + + + + +#This script requires phybase R package +#NJst is a function used as follows +# genetrees<-c("(A:0.004,(B:0.003,(C:0.002,(D:0.001,E:0.001) +# :0.001):0.001):0.001);","(A:0.004,(B:0.003,(E:0.002,(D:0.001,C:0.001):0.001):0.001):0.001);","(A:0.004,(B:0.003,(C:0.002,(D:0.001,E:0.001):0.001):0.001):0.001);") +# taxaname<-c("A","B","C","D","E") +# spname<-taxaname +# species.structure<-matrix(0, 5, 5) +# diag(species.structure)<-1 +# +# NJst(genetrees,taxaname, spname, species.structure) + + + +sub tree2spList { + my $treefile=shift; + + my ($charactername, $characterstate); + my ($call, $sp_id, $char_id); + + #Open treefile and get taxon names from tree + my $stringfh; + open($stringfh, "<", \$treefile); + + my $input = Bio::TreeIO->new(-format => 'newick', -fh 
=> $stringfh); + my $tree = $input->next_tree; + + my @taxa = $tree->get_leaf_nodes; + my @names = map { $_->id } @taxa; + + my $count=0; + foreach(@names){ + my $treespecies = $_; + $treespecies =~ s/^\s+|\s+$//g ; #Trim leading and trailing whitespace + unless($count==0){ + print ","; + } + print "'$treespecies'"; + $count++ + } + return $count; +} #end of tree2spList subroutine diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phylobayes33_wrapper.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/phylobayes33_wrapper.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,23 @@ +#!/usr/bin/perl + +my $phylobayes_path = '/home/galaxy/pkgs/phylobayes3.3b/exe_lin64/pb'; +my $readPB_path = '/home/galaxy/pkgs/phylobayes3.3b/exe_lin64/readpb'; + +my $fileName = $ARGV[0]; +my $nchainInput = $ARGV[1]; +my $cycle_bp_trace_comp = $ARGV[2]; +my $discrepancies_threshold = $ARGV[3]; +my $effective_size_floor = $ARGV[4]; +my $jobName = "dataset"; + +my $burnin = $ARGV[5]; +my $sampleInterval = $ARGV[6]; + +my $run1 = qx/$phylobayes_path -d $fileName -nchain $nchainInput $cycle_bp_trace_comp $discrepancies_threshold $effective_size_floor $jobName 2>errorlog/; +print $run1; + +my $list = qx/ls -l/; +print $list; + +my $run2 = qx/$readPB_path -x $burnin $sampleInterval $jobName 2>errorlog/; +print $run2; diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phylobayes33_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/phylobayes33_wrapper.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,53 @@ + + version 3.3b + + Phylobayes 3.3b + + +phylobayes33_wrapper.pl $filename $nchain $cycles $discrepancies $effectivesize $burnin $sampleInterval + + + + + + + + + + + + + + + + + +**How it works** +PhyloBayes is a Bayesian Monte Carlo Markov Chain (MCMC) sampler for phylogenetic reconstruction using protein alignments. +Compared to other phylogenetic MCMC samplers (e.g. 
MrBayes ), the main distinguishing feature of PhyloBayes is the underlying probabilistic model, CAT. +It is particularly well suited for large multigene alignments, such as those used in phylogenomics. + +The version 2.3 of phylobayes allows for divergence time estimation, posterior predictive analyses, including compositional homogeneity and saturation tests, +data recoding (analogous to R/Y coding, but for amino-acids), and cross-validation. It also implements a more efficient tree searching MCMC algorithm. + +http://www.atgc-montpellier.fr/phylobayes/ + +**Citations** +Phylobayes - Bayesian phylogenetic software based on mixture models. + +Lartillot N., Philippe H. "A Bayesian Mixture Model for Across-Site Heterogeneities in the Amino-Acid Replacement Process." Molecular Biology and Evolution. 2004 21(6):1095-1109. +http://www.atgc-montpellier.fr/download/papers/cat_2004.pdf + +Lartillot N., Philippe H. "Computing Bayes factors using thermodynamic integration." Systematic Biology. 2006 55:195-207. +http://www.atgc-montpellier.fr/download/papers/phylobayes_2006.pdf + +Lartillot N., Brinkmann H., Philippe H. "Suppression of long-branch attraction artefacts in the animal phylogeny using a site-heterogeneous model." BMC Evolutionary Biology. 2007 Feb 8;7 +Suppl 1:S4. + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
# ---------------------------------------------------------------------------
# Patch context kept from the changeset (tail of the preceding tool's help):
#
#   Current Osiris Citation is here
#
#   http://osiris-phylogenetics.blogspot.com/2012/10/citation.html
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phytab_clearcut.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/phytab_clearcut.py Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,143 @@
# ---------------------------------------------------------------------------
"""Run clearcut on an aligned FASTA file or on each gene family of a PHYTAB file.

The input is split into one FASTA file per gene family inside
``<directory>/data``, clearcut is run on each of them through a process pool,
and the resulting trees are collected, one per line, in ``results.data``.
"""
import os
import optparse
import subprocess
from multiprocessing import Pool

directory = ""
results = "results.data"
extension = ".fs"
aligned_extension = ".tre"
datatype = ""
# Data-type switch handed to clearcut; main() overwrites it.  Defined here so
# clearcut() never hits an undefined global.
indata = ""


def unescape(string):
    """Replace every ``__xx__`` token in *string* with the character it stands for.

    NOTE(review): the tokens look like the escapes Galaxy substitutes for
    special characters in tool parameters -- confirm against the Galaxy docs.
    """
    mapped_chars = {
        '>': '__gt__',
        '<': '__lt__',
        "'": '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
        '@': '__at__',
        '\n': '__cn__',
        '\r': '__cr__',
        '\t': '__tc__',
        '#': '__pd__'
    }

    # .items() instead of the Python-2-only .iteritems() so the script also
    # runs under Python 3.
    for key, value in mapped_chars.items():
        string = string.replace(value, key)

    return string


def isTabular(file):
    """Return False as soon as a line starts with '>' (FASTA), else True."""
    with open(file) as f:
        for line in f:
            if line.startswith('>'):
                return False
    return True


def toData(text, name):
    """Format one result row: gene name, a tab, then the tree without spaces.

    *name* is the tree file name; "fasta" and the ".fs.tre" suffix are removed
    from it, so a single-FASTA run yields an empty gene name.
    """
    name = name.replace("fasta", "")  # file name has fasta when fasta file called
    return name.replace(".fs.tre", "") + "\t" + text.replace(" ", "")

#
#def toData(text):
#    text = text.split('\n')
#    result = ''
#    for line in text:
#        if '>' in line:
#            line = '\n' + line.replace('>', "") + '\t'
#        line = line.replace(" ", "\t")
#        result += line
#    return result[1:] # Index past the first newline char


def clearcut(input):
    """Run clearcut on ``<directory>/<input>`` and wait for it to finish.

    The tree is written next to the input with ".tre" appended.  Reads the
    module globals ``directory`` and ``indata`` set by main().
    """
    file_name = directory + os.sep + input
    subprocess.call(['clearcut', "--in=" + file_name,
                     "--out=" + file_name + aligned_extension,
                     "--alignment", "-k", indata])


class Sequence:
    """One whitespace-separated PHYTAB row."""

    def __init__(self, string):
        lis = string.split()
        self.species = lis[0]
        self.family = lis[1]
        self.name = lis[2]
        # Everything except the last field becomes the FASTA header.
        self.header = ' '.join(lis[:-1])
        self.sequence = lis[-1]
        self.string = string

    def printFASTA(self):
        """Return the row as a two-line FASTA record."""
        return '>' + self.header + '\n' + self.sequence + '\n'


def saveMulti(tabFile):
    """Append every PHYTAB row of *tabFile* to the FASTA file of its family."""
    with open(tabFile) as f:
        for line in f:
            if not line.strip():
                # Blank rows used to crash Sequence() with an IndexError.
                continue
            seq = Sequence(line)
            with open(directory + os.sep + seq.family + extension, "a") as p:
                p.write(seq.printFASTA())


def saveSingle(fastaFile):
    """Append *fastaFile* unchanged to ``<directory>/fasta.fs``."""
    # The output is opened once instead of once per input line.
    with open(fastaFile) as f:
        with open(directory + os.sep + "fasta" + extension, "a") as p:
            for line in f:
                p.write(line)


def main():
    """Parse the command line, split the input, run clearcut, collect trees."""
    usage = """%prog [options]
options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-d', '--directory',
        metavar="PATH",
        dest='path',
        default='.',
        help='Path to working directory.')

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    parser.add_option(
        '-t', '--type',
        dest='datatype',
        action='store',
        type='string',
        help='-P for protein. -D for DNA.')

    options, args = parser.parse_args()

    # Fail with a usage message instead of an AttributeError in unescape().
    if options.input is None:
        parser.error("an input file is required (-i)")
    if options.datatype is None:
        parser.error("a data type is required (-t)")

    global directory
    global indata
    inputFile = unescape(options.input)
    directory = unescape(options.path) + os.sep + "data"
    indata = "-" + unescape(options.datatype)

    os.mkdir(directory)

    if isTabular(inputFile):
        saveMulti(inputFile)
    else:
        saveSingle(inputFile)

    list_of_files = sorted(name for name in os.listdir(directory)
                           if name.lower().endswith(extension))
    pool = Pool()
    try:
        pool.map(clearcut, list_of_files)
    finally:
        # Release the worker processes even when a job raised.
        pool.close()
        pool.join()

    # Sorted so the row order does not depend on os.listdir().
    result = sorted(name for name in os.listdir(directory)
                    if name.lower().endswith(aligned_extension))
    with open(directory + os.sep + results, "a") as f:
        for name in result:
            with open(directory + os.sep + name, "r") as r:
                f.write(toData(r.read(), name))


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# Patch context kept from the changeset (start of the next file):
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phytab_clearcut.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/phytab_clearcut.xml Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,70 @@
#
#   clearcut: Generate Neighbor Joining phylogeny. Input can be aligned fasta or phytab format.
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Patch context kept from the changeset (rest of phylogenies/phytab_clearcut.xml):
#
#   clearcut
#
#   phytab_clearcut.py -i $data -t $datatype > $clearcut_stdout 2>&1
#
#   **What it does**
#
#   This tool uses the the program Clearcut to infer a Neighbor-Joining phylogeny using the fast RNJ algorithm for a data set using uncorrected distances (p-distances).
#
#   ------
#
#   **Input**
#   The program will automatically detect possible inputs of two types
#   1. ALIGNED FASTA file for a single gene family
#
#   2. ALIGNED PHYTAB for multiple genes. See
#   http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html for description of phytab format.
#
#   ------
#
#   **Output**
#
#   A phylogenetic tree in newick format. In the Osiris package this can be visualized with a tool like TreeVector.
#
#   ------
#
#   **Additional Information**
#
#   http://bioinformatics.hungry.com/clearcut/
#
#   ------
#
#   **Citations**
#
#   This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following.
#
#   Current Osiris Citation is here
#
#   http://osiris-phylogenetics.blogspot.com/2012/10/citation.html
#
#   Additional Citations for this tool
#
#   Evans, J., L. Sheneman, and J.A. Foster (2006) Relaxed Neighbor-Joining: A Fast Distance-Based Phylogenetic Tree Construction Method, J. Mol. Evol., 62, 785-792
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phytab_raxml.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/phytab_raxml.py Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,206 @@
# ---------------------------------------------------------------------------
#!/usr/bin/env python

##
## This tool runs RAxML on each partition/gene in a PHYTAB file.
## The RAxML runs are started one after another; each run uses the number of
## threads given with -T.  Make sure that the number of processors ('ppn') in
## the job runner matches the 'numthreads' value set in the command line argument.
##
## Usage:   ./phytab_raxml.py -i <phytab file> -e <model> -f <model file> -T 4
## example: ./phytab_raxml.py -i myphytab.txt -e PROTGAMMAWAG -f None -T 4
##      or: ./phytab_raxml.py -i myphytab.txt -e None -f modelsforeachpartition.txt -T 4

import optparse
import os
import subprocess
import sys
import multiprocessing

RESULTS_DIR = 'results'
RESULTS_FILE = 'results.phy'
RAXML_PREFIX = 'RAxML_result.'


def unescape(string):
    """Replace every ``__xx__`` token in *string* with the character it stands for.

    NOTE(review): the tokens look like the escapes Galaxy substitutes for
    special characters in tool parameters -- confirm against the Galaxy docs.
    """
    mapped_chars = {
        '>': '__gt__',
        '<': '__lt__',
        "'": '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
        '@': '__at__',
        '\n': '__cn__',
        '\r': '__cr__',
        '\t': '__tc__',
        '#': '__pd__'
    }

    # .items() instead of the Python-2-only .iteritems().
    for key, value in mapped_chars.items():
        string = string.replace(value, key)

    return string


class Species:
    """One PHYTAB row: species, gene (partition), sequence name, sequence."""

    def __init__(self, string):
        # Strip the line ending first so the sequence never carries '\n' or
        # '\r'; the old code relied on a trailing newline being present.
        lis = string.rstrip('\r\n').split('\t')
        if len(lis) < 4:
            raise ValueError("PHYTAB row needs 4 tab-separated fields: %r" % string)
        self.species = lis[0]
        self.gene = lis[1]
        self.name = lis[2]
        self.sequence = lis[3]

    def toString(self):
        """Return the row as a '<species><TAB><sequence>' line."""
        return self.species + '\t' + self.sequence + '\n'


class Gene:
    """All rows of one partition, written out as a ``<gene>.phy`` file."""

    def __init__(self, name):
        self.name = name
        self.count = 0
        self.length = 0
        self.species = []

    def output(self):
        """Write ``results/<gene>.phy`` (count/length header, then the rows)
        and return the bare file name."""
        file_name = self.name + ".phy"
        location = RESULTS_DIR + os.sep + file_name
        with open(location, 'w') as f:
            f.write(str(self.count) + '\t' + str(self.length) + '\n')
            for s in self.species:
                f.write(s.toString())
        return file_name

    def add(self, species):
        """Add a row; rows with an empty sequence name are ignored."""
        if species.name == "":
            return
        self.species.append(species)
        self.count += 1
        if self.length == 0:
            # The first row defines the alignment length.  The sequence no
            # longer includes the line ending, so no "- 1" correction.
            self.length = len(species.sequence)


def output_species(species):
    """Append one row to ``results/<gene>.phy`` and return the file name."""
    file_name = species.gene + ".phy"
    location = RESULTS_DIR + os.sep + file_name
    with open(location, 'a') as f:
        f.write(species.toString())
    return file_name


def process_phytab(input):
    """Split the PHYTAB file *input* into one ``.phy`` file per gene.

    Returns the set of file names written inside RESULTS_DIR.
    """
    genes = dict()
    with open(input) as f:
        for line in f:
            if len(line) < 4:
                continue
            species = Species(line)
            if species.gene not in genes:
                genes[species.gene] = Gene(species.gene)
            genes[species.gene].add(species)
    return set(gene.output() for gene in genes.values())


def runRaxml(list_of_files, evo, evoDict, NUMTHREADS):
    """Run RAxML on every gene file, one after another.

    *evoDict* maps gene names to models; genes not listed use *evo*.
    *NUMTHREADS* is passed to RAxML's -T option as a string.
    """
## RAxML notes: ##
# to run parsimony trees:
#    ['raxmlHPC-PTHREADS', '-T', cpu_count, '-f', 'd', '-s', file_name, '-y', '-m', newEvo, '-n', gene_file[:-4]+'.tre', '-p', '34']
# to run likelihood trees:
#    ['raxmlHPC-PTHREADS', '-T', cpu_count, '-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34']
# to run likelihood trees using starting tree:
#    ['raxmlHPC-PTHREADS', '-T', cpu_count, '-f', 'e', '-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-t', ptre]
    for gene_file in sorted(list_of_files):
        gene_name = gene_file[:-4]  # strip ".phy"
        # Look up the full gene name first; the old key (text before the
        # first dot) is kept as a fallback for existing model files.
        newEvo = evoDict.get(gene_name, evoDict.get(gene_file.split(".")[0], evo))
        file_name = RESULTS_DIR + os.sep + gene_file
        raxml_cmd = ['raxmlHPC-PTHREADS', '-T', NUMTHREADS, '-s', file_name,
                     '-m', newEvo, '-n', gene_name, '-p', '34']
        # The old code waited on every process in both branches of its
        # if/else, so the runs were always sequential; one call says so.
        status = subprocess.call(raxml_cmd)
        if status != 0:
            sys.stderr.write("RAxML exited with status %s for %s\n" % (status, gene_file))


def readEfile(efile):
    """Read 'gene_name<TAB>model' lines into a dict; other lines are skipped."""
    evoDict = {}
    with open(efile, "r") as f:
        for line in f:
            pair = line.split("\t")
            if len(pair) < 2:
                continue  # blank or malformed line
            evoDict[pair[0].strip()] = pair[1].strip()
    return evoDict


def main():
    """Parse options, split the PHYTAB file, run RAxML, collect the trees."""
    usage = """%prog [options]
options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    parser.add_option(
        '-e', '--evo',
        dest='evo',
        action='store',
        type='string',
        metavar="EVO",
        help='Evolution model.')

    parser.add_option(
        '-f', '--evo-file',
        dest='efile',
        action='store',
        type='string',
        metavar="EVO_FILE",
        help='Evolution model file. Format is gene_name [tab] evolution_model.')

    parser.add_option(
        '-T', '--numthreads',
        dest='numthreads',
        action='store',
        type='int',
        metavar="NUMT",
        help='Specify number of threads.')

    options, args = parser.parse_args()

    # Report missing options instead of crashing inside unescape() or RAxML.
    if options.input is None:
        parser.error("an input PHYTAB file is required (-i)")
    if options.evo is None and options.efile is None:
        parser.error("give a model (-e) or a model file (-f)")
    if options.numthreads is None:
        parser.error("the number of threads is required (-T)")

    evo = unescape(options.evo if options.evo is not None else 'None')

    os.mkdir(RESULTS_DIR)

    list_of_species_files = process_phytab(unescape(options.input))

    try:
        if options.efile is None:
            raise IOError("no evolution model file given")
        evoDict = readEfile(unescape(options.efile))
    except IOError:
        print("No sequence model file provide...using %s as the model" % evo)
        evoDict = {}

    runRaxml(list_of_species_files, evo, evoDict, str(options.numthreads))

    result = sorted(name for name in os.listdir('./') if name.startswith(RAXML_PREFIX))
    with open(RESULTS_DIR + os.sep + RESULTS_FILE, "w") as f:
        for name in result:
            with open(name, "r") as r:
                f.write(name[len(RAXML_PREFIX):] + '\t' + r.read())


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# Patch context kept from the changeset (start of the next file):
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phytab_raxml.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/phytab_raxml.xml Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,52 @@
#
#   for phytab input
#
#   raxml
#
#   phytab_raxml.py -i $sequence -e $evo -f $efile -T 4 > $stdout 2>&1
#
#   **What it does**
#   Accepts Phytab input file (one or multiple gene partitions). Runs RAxML using specified model or model file for each partition.
#
#   For phytab format description, see: http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html
#
#   See RAxML for additional information: https://github.com/stamatak/standard-RAxML
#
#   -------
#   **Performance**
#   The extent to which runs are parallelized may be altered through the Galaxy jobrunner (command-line). The number of threads is set in this tool's XML command section.
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Patch context kept from the changeset (rest of phylogenies/phytab_raxml.xml):
#
#   -------
#   Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org
#
#   -------
#
#   **Citations**
#
#   This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following.
#
#   Current Osiris Citation is here
#
#   http://osiris-phylogenetics.blogspot.com/2012/10/citation.html
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phytab_raxml_pars.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/phytab_raxml_pars.py Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,190 @@
# ---------------------------------------------------------------------------
#!/usr/bin/env python

## This tool runs RAxML's parsimony inference on a phytab input.
## The RAxML runs are started one after another; each run uses the number of
## threads given with -T.  Make sure that the number of processors ('ppn') in
## the job runner matches the 'numthreads' commandline argument -T.
##
## Usage:   ./phytab_raxml_pars.py -i <phytab file> -e <model> -f <model file> -T 4
## example: ./phytab_raxml_pars.py -i myphytab.txt -e PROTGAMMAWAG -f None -T 4
##      or: ./phytab_raxml_pars.py -i myphytab.txt -e None -f modelsforeachpartition.txt -T 4
##
## outputs a tab-delimited file with gene-partition and newick parsimony tree on each line.

import optparse
import os
import subprocess
import sys
import multiprocessing

RESULTS_DIR = 'results'
RESULTS_FILE = 'parsimony_results.txt'
RAXML_PREFIX = 'RAxML_parsimonyTree.'


def unescape(string):
    """Replace every ``__xx__`` token in *string* with the character it stands for.

    NOTE(review): the tokens look like the escapes Galaxy substitutes for
    special characters in tool parameters -- confirm against the Galaxy docs.
    """
    mapped_chars = {
        '>': '__gt__',
        '<': '__lt__',
        "'": '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
        '@': '__at__',
        '\n': '__cn__',
        '\r': '__cr__',
        '\t': '__tc__',
        '#': '__pd__'
    }

    # .items() instead of the Python-2-only .iteritems().
    for key, value in mapped_chars.items():
        string = string.replace(value, key)

    return string


class Species:
    """One PHYTAB row: species, gene (partition), sequence name, sequence."""

    def __init__(self, string):
        # Strip the line ending first so the sequence never carries '\n' or
        # '\r'; the old code relied on a trailing newline being present.
        lis = string.rstrip('\r\n').split('\t')
        if len(lis) < 4:
            raise ValueError("PHYTAB row needs 4 tab-separated fields: %r" % string)
        self.species = lis[0]
        self.gene = lis[1]
        self.name = lis[2]
        self.sequence = lis[3]

    def toString(self):
        """Return the row as a '<species><TAB><sequence>' line."""
        return self.species + '\t' + self.sequence + '\n'


class Gene:
    """All rows of one partition, written out as a ``<gene>.phy`` file."""

    def __init__(self, name):
        self.name = name
        self.count = 0
        self.length = 0
        self.species = []

    def output(self):
        """Write ``results/<gene>.phy`` (count/length header, then the rows)
        and return the bare file name."""
        file_name = self.name + ".phy"
        location = RESULTS_DIR + os.sep + file_name
        with open(location, 'w') as f:
            f.write(str(self.count) + '\t' + str(self.length) + '\n')
            for s in self.species:
                f.write(s.toString())
        return file_name

    def add(self, species):
        """Add a row; rows with an empty sequence name are ignored."""
        if species.name == "":
            return
        self.species.append(species)
        self.count += 1
        if self.length == 0:
            # The first row defines the alignment length.  The sequence no
            # longer includes the line ending, so no "- 1" correction.
            self.length = len(species.sequence)


def output_species(species):
    """Append one row to ``results/<gene>.phy`` and return the file name."""
    file_name = species.gene + ".phy"
    location = RESULTS_DIR + os.sep + file_name
    with open(location, 'a') as f:
        f.write(species.toString())
    return file_name


def process_phytab(input):
    """Split the PHYTAB file *input* into one ``.phy`` file per gene.

    Returns the set of file names written inside RESULTS_DIR.
    """
    genes = dict()
    with open(input) as f:
        for line in f:
            if len(line) < 4:
                continue
            species = Species(line)
            if species.gene not in genes:
                genes[species.gene] = Gene(species.gene)
            genes[species.gene].add(species)
    return set(gene.output() for gene in genes.values())


def runRaxml(list_of_files, evo, evoDict, NUMTHREADS):
    """Build a parsimony starting tree (RAxML ``-y``) for every gene file.

    *evoDict* maps gene names to models; genes not listed use *evo*.
    *NUMTHREADS* is passed to RAxML's -T option as a string.
    """
    for gene_file in sorted(list_of_files):
        gene_name = gene_file[:-4]  # strip ".phy"
        # Look up the full gene name first; the old key (text before the
        # first dot) is kept as a fallback for existing model files.
        newEvo = evoDict.get(gene_name, evoDict.get(gene_file.split(".")[0], evo))
        file_name = RESULTS_DIR + os.sep + gene_file
# to run parsimony trees:
        # Bug fix: the command used `cpu_count`, whose assignment had been
        # commented out, so every call raised NameError.  The thread count
        # given by the caller is used instead.
        raxml_cmd = ['raxmlHPC-PTHREADS', '-T', NUMTHREADS, '-f', 'd', '-s', file_name,
                     '-y', '-m', newEvo, '-n', gene_name + '.tre', '-p', '34']
# to run likelihood trees:
#        ['raxmlHPC-PTHREADS', "-T", NUMTHREADS, "-s", file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34']
        status = subprocess.call(raxml_cmd)
        if status != 0:
            sys.stderr.write("RAxML exited with status %s for %s\n" % (status, gene_file))


def toData(text, name):
    """Return '<name><TAB><text>' with newlines in *text* shown as a literal backslash-n."""
    text = name + "\t" + text.replace("\n", "\\n")
    return text


def readEfile(efile):
    """Read 'gene_name<TAB>model' lines into a dict; other lines are skipped."""
    evoDict = {}
    with open(efile, "r") as f:
        for line in f:
            pair = line.split("\t")
            if len(pair) < 2:
                continue  # blank or malformed line
            evoDict[pair[0].strip()] = pair[1].strip()
    return evoDict


def main():
    """Parse options, split the PHYTAB file, run RAxML, collect the trees."""
    usage = """%prog [options]
options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    parser.add_option(
        '-e', '--evo',
        dest='evo',
        action='store',
        type='string',
        metavar="EVO",
        help='Evolution model.')

    parser.add_option(
        '-f', '--evo-file',
        dest='efile',
        action='store',
        type='string',
        metavar="EVO_FILE",
        help='Evolution model file. Format is gene_name [tab] evolution_model.')

    parser.add_option('-T', '--numthread', dest='numthreads', action='store', type='int',
                      metavar="NUMT", help='Provide number of threads for RAxML')
    options, args = parser.parse_args()

    # Report missing options instead of crashing inside unescape() or RAxML.
    if options.input is None:
        parser.error("an input PHYTAB file is required (-i)")
    if options.evo is None and options.efile is None:
        parser.error("give a model (-e) or a model file (-f)")
    if options.numthreads is None:
        parser.error("the number of threads is required (-T)")

    evo = unescape(options.evo if options.evo is not None else 'None')

    os.mkdir(RESULTS_DIR)

    list_of_species_files = process_phytab(unescape(options.input))

    try:
        if options.efile is None:
            raise IOError("no evolution model file given")
        evoDict = readEfile(unescape(options.efile))
    except IOError:
        print("Could not find evolution model file, using: %s" % evo)
        evoDict = {}

    runRaxml(list_of_species_files, evo, evoDict, str(options.numthreads))

    # Sorted so the row order does not depend on os.listdir().
    result = sorted(name for name in os.listdir('./') if name.startswith(RAXML_PREFIX))
    with open(RESULTS_DIR + os.sep + RESULTS_FILE, "w") as f:
        for name in result:
            with open(name, "r") as r:
                # [:-4] drops the ".tre" suffix added in runRaxml().
                f.write(name[len(RAXML_PREFIX):-4] + '\t' + r.read())


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# Patch context kept from the changeset (start of the next file):
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phytab_raxml_pars.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/phytab_raxml_pars.xml Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,49 @@
#
#   Phytab RAxML - Parsimony for phytab format
#
#   raxml
#
#   phytab_raxml_pars.py -i $sequence -e $evo -f $efile -T 4> $stdout 2>&1
#
#   **What it does**
#   This tool computes a randomized parsimony starting tree with RAxML (-y argument). Single or multi-gene phytab files accepted.
#
#   Also, see RAxML site for additional information: https://github.com/stamatak/standard-RAxML
#
#   **Output**
#   A tab-delimited file containing partition-name and newick parsimony tree on each line. This output is accepted by the tool 'RAxML using Starting Trees' to optimize branch lengths for each
#   partition.
#
#   Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org
#
#   -------
#
#   **Citations**
#
#   This tool is part of the Osiris Phylogenetics Tool Package for Galaxy.
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Patch context kept from the changeset (rest of phytab_raxml_pars.xml and
# all of phytab_raxml_small.xml):
#
#   If you make extensive use of this tool in a publication, please consider citing the following.
#
#   Current Osiris Citation is here
#
#   http://osiris-phylogenetics.blogspot.com/2012/10/citation.html
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phytab_raxml_small.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/phytab_raxml_small.xml Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,29 @@
#
#   Phytab RAxML SMALL - RAxML for phytab format. Calls only 1 node for 3 hours
#
#   raxml
#
#   phytab_raxml.py -i $sequence -e $evo -f $efile -T 4> $stdout 2>&1
#
#   **Parallel RaXML**
#   See phytab description here: http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phytab_raxml_using_ptree.parallel.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/phytab_raxml_using_ptree.parallel.py Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,220 @@
# ---------------------------------------------------------------------------
#!/usr/bin/env python

##
## This tool runs RAxML to optimize branch lengths on a tree. (Multiple trees if multi-gene phytab provided).
## The RAxML runs are started one after another; each run uses the number of
## threads given with -T.  Make sure that the number of processors ('ppn') in
## the job runner matches the 'numthreads' commandline argument.
##
## Usage:   ./phytab_raxml_using_ptree.parallel.py -i <phytab file> -e <model> -f <model file> -t <tree list> -T 4
## example: ./phytab_raxml_using_ptree.parallel.py -i myphytab.txt -e PROTGAMMAWAG -f None -T 4
##      or: ./phytab_raxml_using_ptree.parallel.py -i myphytab.txt -e None -f modelsforeachpartition.txt -T 4
##
import optparse
import os
import subprocess
import sys
import multiprocessing

RESULTS_DIR = 'results'
RESULTS_FILE = 'results.phy'
RAXML_PREFIX = 'RAxML_result.'
#NUMTHREADS = '4'


def unescape(string):
    """Replace every ``__xx__`` token in *string* with the character it stands for.

    NOTE(review): the tokens look like the escapes Galaxy substitutes for
    special characters in tool parameters -- confirm against the Galaxy docs.
    """
    mapped_chars = {
        '>': '__gt__',
        '<': '__lt__',
        "'": '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
        '@': '__at__',
        '\n': '__cn__',
        '\r': '__cr__',
        '\t': '__tc__',
        '#': '__pd__'
    }

    # .items() instead of the Python-2-only .iteritems().
    for key, value in mapped_chars.items():
        string = string.replace(value, key)

    return string


class Species:
    """One PHYTAB row: species, gene (partition), sequence name, sequence."""

    def __init__(self, string):
        # Strip the line ending first so the sequence never carries '\n' or
        # '\r'; the old code relied on a trailing newline being present.
        lis = string.rstrip('\r\n').split('\t')
        if len(lis) < 4:
            raise ValueError("PHYTAB row needs 4 tab-separated fields: %r" % string)
        self.species = lis[0]
        self.gene = lis[1]
        self.name = lis[2]
        self.sequence = lis[3]

    def toString(self):
        """Return the row as a '<species><TAB><sequence>' line."""
        return self.species + '\t' + self.sequence + '\n'


class Gene:
    """All rows of one partition, written out as a ``<gene>.phy`` file."""

    def __init__(self, name):
        self.name = name
        self.count = 0
        self.length = 0
        self.species = []

    def output(self):
        """Write ``results/<gene>.phy`` (count/length header, then the rows)
        and return the bare file name."""
        file_name = self.name + ".phy"
        location = RESULTS_DIR + os.sep + file_name
        with open(location, 'w') as f:
            f.write(str(self.count) + '\t' + str(self.length) + '\n')
            for s in self.species:
                f.write(s.toString())
        return file_name

    def add(self, species):
        """Add a row; rows with an empty sequence name are ignored."""
        if species.name == "":
            return
        self.species.append(species)
        self.count += 1
        if self.length == 0:
            # The first row defines the alignment length.  The sequence no
            # longer includes the line ending, so no "- 1" correction.
            self.length = len(species.sequence)


def output_species(species):
    """Append one row to ``results/<gene>.phy`` and return the file name."""
    file_name = species.gene + ".phy"
    location = RESULTS_DIR + os.sep + file_name
    with open(location, 'a') as f:
        f.write(species.toString())
    return file_name


def process_phytab(input):
    """Split the PHYTAB file *input* into one ``.phy`` file per gene.

    Returns the set of file names written inside RESULTS_DIR.
    """
    genes = dict()
    with open(input) as f:
        for line in f:
            if len(line) < 4:
                continue
            species = Species(line)
            if species.gene not in genes:
                genes[species.gene] = Gene(species.gene)
            genes[species.gene].add(species)
    return set(gene.output() for gene in genes.values())


def runRaxml(list_of_files, evo, evoDict, list_of_ptrees, NUMTHREADS):
    """Run RAxML (``-f e``) on each starting tree and its matching gene file.

    *list_of_ptrees* holds ``<gene>.ptre`` file names; each is paired with
    ``<gene>.phy`` from *list_of_files*.  Trees without a gene file are
    reported on stderr and skipped.
    """
    for ptre in sorted(list_of_ptrees):
        gene_name = ptre[:-5]  # strip ".ptre"
        gene_file = gene_name + ".phy"
        # Exact match.  The old prefix test also matched e.g. "abcd.phy" for
        # tree "abc.ptre" and glued all matching names into one bogus path.
        if gene_file not in list_of_files:
            sys.stderr.write("No gene file for starting tree %s; skipped\n" % ptre)
            continue

        # Look up the full gene name first; the old key (text before the
        # first dot) is kept as a fallback for existing model files.
        newEvo = evoDict.get(gene_name, evoDict.get(gene_file.split(".")[0], evo))
        file_name = RESULTS_DIR + os.sep + gene_file
# to run parsimony trees:
#        ['raxmlHPC-PTHREADS', '-T', cpu_count, '-f', 'd', '-s', file_name, '-y', '-m', newEvo, '-n', gene_file[:-4]+'.tre', '-p', '34']
# to run likelihood trees:
#        ['raxmlHPC-PTHREADS', '-T', cpu_count, '-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34']
        # '-f', 'e' were written without the separating comma, which fused
        # them into the single argument '-fe'.
        raxml_cmd = ['raxmlHPC-PTHREADS', '-T', NUMTHREADS, '-f', 'e', '-s', file_name,
                     '-m', newEvo, '-n', gene_name, '-t', ptre]
        # The old code waited on every process in both branches of its
        # if/else, so the runs were always sequential; one call says so.
        status = subprocess.call(raxml_cmd)
        if status != 0:
            sys.stderr.write("RAxML exited with status %s for %s\n" % (status, gene_file))


def readEfile(efile):
    """Read 'gene_name<TAB>model' lines into a dict; other lines are skipped."""
    evoDict = {}
    with open(efile, "r") as f:
        for line in f:
            pair = line.split("\t")
            if len(pair) < 2:
                continue  # blank or malformed line
            evoDict[pair[0].strip()] = pair[1].strip()
    return evoDict


def main():
    """Parse options, split inputs into per-gene files, run RAxML, collect trees."""
    usage = """%prog [options]
options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    parser.add_option(
        '-e', '--evo',
        dest='evo',
        action='store',
        type='string',
        metavar="EVO",
        help='Evolution model.')

    parser.add_option(
        '-f', '--evo-file',
        dest='efile',
        action='store',
        type='string',
        metavar="EVO_FILE",
        help='Evolution model file. Format is gene_name [tab] evolution_model.')

    parser.add_option(
        '-t', '--starting-tree',
        dest='ptre',
        action='store',
        type='string',
        metavar="PTRE",
        help='File of starting trees.')

    parser.add_option('-T', '--numthread', dest='numthreads', action='store', type='int',
                      metavar="NUMT", help='Provide number of threads for RAxML')
    options, args = parser.parse_args()

    # Report missing options instead of crashing inside unescape() or open().
    if options.input is None:
        parser.error("an input PHYTAB file is required (-i)")
    if options.ptre is None:
        parser.error("a starting tree list is required (-t)")
    if options.evo is None and options.efile is None:
        parser.error("give a model (-e) or a model file (-f)")
    if options.numthreads is None:
        parser.error("the number of threads is required (-T)")

    evo = unescape(options.evo if options.evo is not None else 'None')

    os.mkdir(RESULTS_DIR)

    list_of_species_files = process_phytab(unescape(options.input))

    try:
        if options.efile is None:
            raise IOError("no evolution model file given")
        evoDict = readEfile(unescape(options.efile))
    except IOError:
        print("No sequence model file provide...using %s as the model" % evo)
        evoDict = {}

    #read in starting treelist
    with open(options.ptre, 'r') as MPtrees:
        for each in MPtrees:
            if len(each) > 1:
                line = each.split('\t')
                if len(line) < 2:
                    continue  # no tab: not a 'gene<TAB>tree' row
                # Text mode: the tree is a str, which 'wb' rejects on Python 3.
                with open(line[0] + '.ptre', 'w') as tmp:
                    tmp.write(line[1])
    list_of_ptrees = [name for name in os.listdir('./') if name.endswith('.ptre')]

    runRaxml(list_of_species_files, evo, evoDict, list_of_ptrees, str(options.numthreads))

    result = sorted(name for name in os.listdir('./') if name.startswith(RAXML_PREFIX))
    with open(RESULTS_DIR + os.sep + RESULTS_FILE, "w") as f:
        for name in result:
            with open(name, "r") as r:
                f.write(name[len(RAXML_PREFIX):] + '\t' + r.read())


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# Patch context kept from the changeset (start of the next file):
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phytab_raxml_using_ptree.parallel.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/phytab_raxml_using_ptree.parallel.xml Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,65 @@
#
#   optimizes branch lengths on provided tree(s)
#
#   raxml
#
#   phytab_raxml_using_ptree.parallel.py -i $phytabinput -e $evo -f $efile -t $startingPtreelist -T 4> $stdout 2>&1
#
#   **What it does**
#
#   This tool estimates likelihood trees and branch lengths for a PHYTAB
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Patch context kept from the changeset (rest of
# phylogenies/phytab_raxml_using_ptree.parallel.xml):
#
#   file using starting trees (e.g., parsimony trees) to accelerate tree estimation.
#
#   -------
#   **Input requirements**
#
#   1. PHYTAB sequence file
#   2. Starting tree list: two tab-delimited columns with the partition(gene) name and its newick starting tree (generated by PHYTAB RAxML-Parsimony)::
#
#       geneA (((Species_a,Species_b),Species_c),Species_d);
#       geneB ((Species_a,Species_b),(Species_c,Species_d));
#
#   See phytab description here: http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html
#   -------
#
#   Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org
#
#   -------
#
#   **Citations**
#
#   This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following.
#
#   Current Osiris Citation is here
#
#   http://osiris-phylogenetics.blogspot.com/2012/10/citation.html
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phytab_raxml_using_ptree.serial.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/phytab_raxml_using_ptree.serial.py Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,203 @@
# ---------------------------------------------------------------------------
#!/usr/bin/env python
"""Optimize branch lengths with RAxML (``-f e``) for each gene of a PHYTAB
file, using one starting tree per gene.  Runs are sequential and always use
NUMTHREADS threads."""

import optparse
import os
import subprocess
import sys
import multiprocessing

RESULTS_DIR = 'results'
RESULTS_FILE = 'results.phy'
RAXML_PREFIX = 'RAxML_result.'
NUMTHREADS = '4'


def unescape(string):
    """Replace every ``__xx__`` token in *string* with the character it stands for.

    NOTE(review): the tokens look like the escapes Galaxy substitutes for
    special characters in tool parameters -- confirm against the Galaxy docs.
    """
    mapped_chars = {
        '>': '__gt__',
        '<': '__lt__',
        "'": '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
        '@': '__at__',
        '\n': '__cn__',
        '\r': '__cr__',
        '\t': '__tc__',
        '#': '__pd__'
    }

    # .items() instead of the Python-2-only .iteritems().
    for key, value in mapped_chars.items():
        string = string.replace(value, key)

    return string


class Species:
    """One PHYTAB row: species, gene (partition), sequence name, sequence."""

    def __init__(self, string):
        # Strip the line ending first so the sequence never carries '\n' or
        # '\r'; the old code relied on a trailing newline being present.
        lis = string.rstrip('\r\n').split('\t')
        if len(lis) < 4:
            raise ValueError("PHYTAB row needs 4 tab-separated fields: %r" % string)
        self.species = lis[0]
        self.gene = lis[1]
        self.name = lis[2]
        self.sequence = lis[3]

    def toString(self):
        """Return the row as a '<species><TAB><sequence>' line."""
        return self.species + '\t' + self.sequence + '\n'


class Gene:
    """All rows of one partition, written out as a ``<gene>.phy`` file."""

    def __init__(self, name):
        self.name = name
        self.count = 0
        self.length = 0
        self.species = []

    def output(self):
        """Write ``results/<gene>.phy`` (count/length header, then the rows)
        and return the bare file name."""
        file_name = self.name + ".phy"
        location = RESULTS_DIR + os.sep + file_name
        with open(location, 'w') as f:
            f.write(str(self.count) + '\t' + str(self.length) + '\n')
            for s in self.species:
                f.write(s.toString())
        return file_name

    def add(self, species):
        """Add a row; rows with an empty sequence name are ignored."""
        if species.name == "":
            return
        self.species.append(species)
        self.count += 1
        if self.length == 0:
            # The first row defines the alignment length.  The sequence no
            # longer includes the line ending, so no "- 1" correction.
            self.length = len(species.sequence)


def output_species(species):
    """Append one row to ``results/<gene>.phy`` and return the file name."""
    file_name = species.gene + ".phy"
    location = RESULTS_DIR + os.sep + file_name
    with open(location, 'a') as f:
        f.write(species.toString())
    return file_name


def process_phytab(input):
    """Split the PHYTAB file *input* into one ``.phy`` file per gene.

    Returns the set of file names written inside RESULTS_DIR.
    """
    genes = dict()
    with open(input) as f:
        for line in f:
            if len(line) < 4:
                continue
            species = Species(line)
            if species.gene not in genes:
                genes[species.gene] = Gene(species.gene)
            genes[species.gene].add(species)
    return set(gene.output() for gene in genes.values())


def runRaxml(list_of_files, evo, evoDict, list_of_ptrees):
    """Run RAxML (``-f e``) on each starting tree and its matching gene file.

    *list_of_ptrees* holds ``<gene>.ptre`` file names; each is paired with
    ``<gene>.phy`` from *list_of_files*.  Trees without a gene file are
    reported on stderr and skipped.
    """
    for ptre in sorted(list_of_ptrees):
        gene_name = ptre[:-5]  # strip ".ptre"
        gene_file = gene_name + ".phy"
        # Exact match.  The old prefix test also matched e.g. "abcd.phy" for
        # tree "abc.ptre" and glued all matching names into one bogus path.
        if gene_file not in list_of_files:
            sys.stderr.write("No gene file for starting tree %s; skipped\n" % ptre)
            continue

        # Look up the full gene name first; the old key (text before the
        # first dot) is kept as a fallback for existing model files.
        newEvo = evoDict.get(gene_name, evoDict.get(gene_file.split(".")[0], evo))
        file_name = RESULTS_DIR + os.sep + gene_file
# to run parsimony trees:
#        ['raxmlHPC-PTHREADS', '-T', cpu_count, '-f', 'd', '-s', file_name, '-y', '-m', newEvo, '-n', gene_file[:-4]+'.tre', '-p', '34']
# to run likelihood trees:
#        ['raxmlHPC-PTHREADS', '-T', cpu_count, '-s', file_name, '-m', newEvo, '-n', gene_file[:-4], '-p', '34']
# to run likelihood trees using starting tree:
        raxml_cmd = ['raxmlHPC-PTHREADS', '-T', NUMTHREADS, '-f', 'e', '-s', file_name,
                     '-m', newEvo, '-n', gene_name, '-t', ptre]
        status = subprocess.call(raxml_cmd)
        if status != 0:
            sys.stderr.write("RAxML exited with status %s for %s\n" % (status, gene_file))


def readEfile(efile):
    """Read 'gene_name<TAB>model' lines into a dict; other lines are skipped."""
    evoDict = {}
    with open(efile, "r") as f:
        for line in f:
            pair = line.split("\t")
            if len(pair) < 2:
                continue  # blank or malformed line
            evoDict[pair[0].strip()] = pair[1].strip()
    return evoDict


def main():
    """Parse options, split inputs into per-gene files, run RAxML, collect trees."""
    usage = """%prog [options]
options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    parser.add_option(
        '-e', '--evo',
        dest='evo',
        action='store',
        type='string',
        metavar="EVO",
        help='Evolution model.')

    parser.add_option(
        '-f', '--evo-file',
        dest='efile',
        action='store',
        type='string',
        metavar="EVO_FILE",
        help='Evolution model file. Format is gene_name [tab] evolution_model.')

    parser.add_option(
        '-t', '--starting-tree',
        dest='ptre',
        action='store',
        type='string',
        metavar="PTRE",
        help='File of starting trees.')

    options, args = parser.parse_args()

    # Report missing options instead of crashing inside unescape() or open().
    if options.input is None:
        parser.error("an input PHYTAB file is required (-i)")
    if options.ptre is None:
        parser.error("a starting tree list is required (-t)")
    if options.evo is None and options.efile is None:
        parser.error("give a model (-e) or a model file (-f)")

    evo = unescape(options.evo if options.evo is not None else 'None')

    os.mkdir(RESULTS_DIR)

    list_of_species_files = process_phytab(unescape(options.input))

    try:
        if options.efile is None:
            raise IOError("no evolution model file given")
        evoDict = readEfile(unescape(options.efile))
    except IOError:
        print("Could not find evolution model file, using: %s" % evo)
        evoDict = {}

    #read in starting treelist
    with open(options.ptre, 'r') as MPtrees:
        for each in MPtrees:
            if len(each) > 1:
                line = each.split('\t')
                if len(line) < 2:
                    continue  # no tab: not a 'gene<TAB>tree' row
                # Text mode: the tree is a str, which 'wb' rejects on Python 3.
                with open(line[0] + '.ptre', 'w') as tmp:
                    tmp.write(line[1])
    list_of_ptrees = [name for name in os.listdir('./') if name.endswith('.ptre')]

    runRaxml(list_of_species_files, evo, evoDict, list_of_ptrees)

    # Sorted, like the parallel variant of this script, so the row order does
    # not depend on os.listdir().
    result = sorted(name for name in os.listdir('./') if name.startswith(RAXML_PREFIX))
    with open(RESULTS_DIR + os.sep + RESULTS_FILE, "w") as f:
        for name in result:
            with open(name, "r") as r:
                f.write(name[len(RAXML_PREFIX):] + '\t' + r.read())


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# Patch context kept from the changeset (start of the next file):
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/phytab_raxml_using_ptree.serial.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/phytab_raxml_using_ptree.serial.xml Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,48 @@
#
#   Phytab RAxML - optimize branch lengths on trees
#
#   raxml
#
#   phytab_raxml_using_ptree.serial.py -i $phytabinput -e $evo -f $efile -t $startingPtreelist> $stdout 2>&1
#
#   **What it does**
#
#   This tool estimates likelihood trees and branch lengths for a PHYTAB file using starting trees (e.g., parsimony trees) to accelerate tree estimation.
#
#   **Input requirements**
#
#   1. PHYTAB sequence file
#   2.
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Patch context kept from the changeset (rest of
# phylogenies/phytab_raxml_using_ptree.serial.xml):
#
#   Starting tree list: two tab-delimited columns with the partition(gene) name and its newick starting tree (generated by PHYTAB RAxML-Parsimony)::
#
#       geneA (((Species_a,Species_b),Species_c),Species_d);
#       geneB ((Species_a,Species_b),(Species_c,Species_d));
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/place_fossil_ml.sh
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/place_fossil_ml.sh Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,9 @@
# ---------------------------------------------------------------------------
#!/bin/bash
# Two RAxML runs on the same data file and tree: the first (-f u, run name
# "galaxy") produces RAxML_weights.galaxy, which the second (-f v, run name
# "fossil_weights") reads through -a.
# NOTE(review): the meaning of the -f u / -f v modes is taken from the tool
# description below, not from this script -- confirm against the RAxML manual.
#
# usage: place_fossil_ml.sh <binary datafile> <tree>

# Stop after a failed step: the second run needs the weights file written by
# the first one.
set -e

if [ "$#" -lt 2 ]; then
    echo "usage: $0 <binary datafile> <tree>" >&2
    exit 1
fi

datafile=$1 #binary datafile
model=BINGAMMA
morphmodel=MK
tree=$2
reps=100

# Variables are quoted so paths containing spaces survive word splitting.
raxmlHPC-PTHREADS-SSE3 -T 8 -f u -s "$datafile" -K "$morphmodel" -m "$model" -n galaxy -t "$tree" -N "$reps"
raxmlHPC-PTHREADS-SSE3 -T 8 -f v -s "$datafile" -K "$morphmodel" -m "$model" -a RAxML_weights.galaxy -n fossil_weights -t "$tree" -N "$reps"

# ---------------------------------------------------------------------------
# Patch context kept from the changeset (phylogenies/place_fossil_ml.xml):
#
# diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/place_fossil_ml.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/phylogenies/place_fossil_ml.xml Tue Mar 11 12:19:13 2014 -0700
# @@ -0,0 +1,67 @@
#
#   Maximum Likelihood Fossil Placement on Molecular Phylogeny
#
#   raxml
#
#   place_fossil_ml.sh $data_file $tree > $log 2>&1
#
#   **What it does**
#
#   This tool implements a Maximum Likelihood (ML) fossil placement algorithm developed by Berger and Stamatakis 2009.
#
#   ------
#
#   **Inputs**
#
#   1. Input file of morphological data in raxml (phylipE) format. In Osiris, this can be generated by exporting a MorphoBank (www.morphobank.org) dataset into TNT format, then using TNT2PHYTAB, followed by phylocatenator to change the data to raxml format.
#   2. A molecular phylogeny in nexus format, for example the result of a RAxML analysis.
#
#   ------
#
#   **Outputs**
#
#   RAxML writes the resulting tree file in newick text format, which can be viewed in Osiris with TreeVector (of the mothur package). In addition, if bootstrapping was selected, the individual bootstrap trees and the ML tree with support are written as separate newick files.
#
#   -------
#
#   **Additional Information**
#
#   This method assumes a molecular phylogeny for a set of extant taxa, and then generates weights for each morphological character based on congruence with the molecular phylogeny. Next, the method attaches the fossils to every possible branch of the molecular tree, and in each case calculates the likelihood of observing the weighted morphological data. The placement of each fossil in the molecular tree is the placement with the maximum likelihood estimate.
#
#   raxml Home Page:
#   http://www.exelixis-lab.org/software.html
#
#   -------
#
#   **Citations**
#
#   This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following.
#
#   Current Osiris Citation is here
#
#   http://osiris-phylogenetics.blogspot.com/2012/10/citation.html
#
#   Additional Citations for this tool
#
#   Berger and Stamatakis (2010). Accuracy of morphology-based phylogenetic fossil placement under
#   maximum likelihood. http://sco.h-its.org/exelixis/pubs/Exelixis-RRDR-2009-1.pdf
#
#   Stamatakis, A. (2006). RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models. Bioinformatics.
#   http://bioinformatics.oxfordjournals.org/content/22/21/2688.short
#
#   Todd H. Oakley, Joanna M. Wolfe, Annie R. Lindgren, and Alexander K. Zaharoff (2012).
# ---------------------------------------------------------------------------
Phylotranscriptomics to Bring the Understudied into the Fold: Monophyletic Ostracoda, Fossil Placement, and Pancrustacean Phylogeny Mol Biol Evol doi:10.1093/molbev/mss216 + + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/prune_phytab_using_list.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/prune_phytab_using_list.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,38 @@ +#!/usr/bin/python -tt + +##usage: ./prune_phytab_using_list.py > outfile + +#import modules +import sys, os, numpy, re + +def read(filename): + f = open(filename) + lines = f.readlines() + # for case where list is an empty file (here, under 20 bytes) + if os.lstat(sys.argv[2]).st_size < 20: + for line in lines: + print line, + else: + bad = open(sys.argv[2]) + badlines = bad.readlines() + badstripped = [line[:-1] for line in badlines] + str1 = '|'.join(badstripped) + str2 = '('+str1[:-1]+')' + pattern = re.compile(str2) + count=0 + for line in lines: + match = pattern.findall(line) + if match and sys.argv[3] == 'keep': + print line, + if not match and sys.argv[3] == 'discard': + print line, + bad.close() + f.close() + +def main(): + read(sys.argv[1]) + +if __name__ == '__main__': + main() + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/prune_phytab_using_list.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/prune_phytab_using_list.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,67 @@ + + Filters PHYTAB dataset by comparison to a text list + + prune_phytab_using_list.py $input1 $matchlist $tokeep > $output + + + + + + + + + + + + + + + +**What it does** + +This tool filters a PHYTAB sequence file to remove undesired sequences based on a list provided. + +----- + +**Basic Example** + +The input data must be in phytab column format (fields are tab-delimited). 
Column 1 is species name, C2 is genefamily, C3 unique sequence identifier, C4 is sequence:: + + species1 gene1 uniquenameA acgttagcgcgctatagc + species2 gene1 uniquenameB acgttag--cgctataaa + species3 gene1 uniquenameC acgttagcgcgctatagc + species4 gene1 uniquenameD acgttagcgcgctatagc + species1 gene2 uniquenameE --gttagtttgcta + species3 gene2 uniquenameF gtgttagtttgcta + +Sequences from selected taxa, genes, or specific sequences provided on the List input will be excluded or retained (depending on the popup option selected) in the resulting PHYTAB output. +The format of the list may consist of + +taxa only:: + + species1 + species4 + +genes only:: + + geneA + geneB + +specific genes from select taxa (tab-delimited):: + + species1 geneA + species4 geneB + +(This last sort of list is produced by the tool 'Long Branch Finder'.) + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the osiris_phylogenetics site at bitbucket.org + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/raxml.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/raxml.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,98 @@ +#! /usr/bin/perl -w + +use strict; +use warnings; +#raxml.pl Galaxy wrapper calls raxml from raxml.xml +#xml file contains: +#raxml.pl [GTR|CAT] [PROT|DNA] [protmodel] [morphmodel] [phylip file] [constraint] [partition] [best_tree?] [invar?] [#bootreps] [outgroup] + +##For debugging command line pass, uncomment next +#for (my $i=0; $i < @ARGV; $i++){ +# print "Parameter #$i ".$ARGV[$i]."\n\n"; +#} +#exit; + +my $rate_het=shift(@ARGV); #0 rate heterogeneity? value will = GAMMA or CAT +my $datatype = shift(@ARGV); #1 datatype? 
True=Protein False=DNA +my $protmodel = shift(@ARGV); #2 which protein model +my $morphmodel = shift(@ARGV); #3 which morphology multistate model +my $data_file= shift(@ARGV); #4 input a phylip file +my $part_file = shift(@ARGV); #5 optional partition file +my $constraint_tree = shift(@ARGV); #6 optional constraint tree +my $find_best = shift(@ARGV); #7 if ML find ML tree as well as bootstrapping +my $invar = shift(@ARGV); #8 if INVAR include invariant site parameter in model +my $nboots = shift(@ARGV); #9 Number of bootstrap reps +my $seed = shift(@ARGV); #10 Random number seed for bootstrapping (passed to raxml with -x) +my $long = shift(@ARGV); #11 'Long' selects the PTHREADS call, anything else selects the MPI call +my $outgroup = shift(@ARGV); #12 Specify the outgroup +my $model; + + + +# From shell pipeline +# raxmlHPC-PTHREADS7.2.6 -T $processors -f a -s $data_name.data -q $data_name.part -m $model -n $data_name -N 100 -x 1234567890 -o Limulus_polyphemus +# cp RAxML_bestTree.$data_name $data_nameBootBest.tre +# cp RAxML_bipartitions.$data_name $data_nameBoot.tre + +#ADD OPTIONS TO BUILD FULL RAXML COMMANDLINE ARGUMENT + +my $build_command; +#First CALL RAXML THROUGH PATH: 8 threads for 'Long', otherwise 10 MPI processes +if($long eq 'Long'){ #Currently both raxml and raxml_long call with 'Long' + $build_command = "raxmlHPC-PTHREADS-SSE3 -T 8"; +}else{ + $build_command = "mpirun -np 10 raxmlHPC-MPI-SSE3 "; +} +#Check if find best tree is desired + if($find_best eq "ML"){ + $build_command = $build_command." -f a "; + } +#Next add call to input phylip file + $build_command = $build_command." -s ".$data_file; +#Add call to partition file name + unless($part_file eq 'None'){ + $build_command = $build_command." 
-q ".$part_file; + } +#Build substitution model + if($datatype eq "PROT"){ + $model = "PROT"; + }elsif($datatype eq "DNA"){ + $model = "GTR"; + } + if($rate_het eq "GTR"){ + $model = $model."GAMMA"; + }elsif($rate_het eq "CAT"){ + $model = $model."CAT"; + } + if($invar eq "INVAR"){ + $model = $model."I"; + } + if($datatype eq "PROT"){ + $model = $model.$protmodel; + } + $build_command = $build_command." -m ".$model; +#Add multistate morphology model + $build_command = $build_command." -K ".$morphmodel; +#check constraint tree + unless($constraint_tree eq 'None'){ + $build_command = $build_command." -g ".$constraint_tree; + } +#N Bootstraps + $build_command = $build_command." -N ".$nboots; +#Bootstrap seed + $build_command = $build_command." -x ".$seed; +#Parsimony seed + $build_command = $build_command." -p "."1234567"; + + +#name output files galaxy + $build_command = $build_command." -n galaxy"; +#Outgroup + if(defined $outgroup){ + $build_command = $build_command." -o ".$outgroup; + } + +print "Galaxy COMMAND BUILD WAS: $build_command\n"; + +#Uncomment to actually call raxml +system $build_command; diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/raxml.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/raxml.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,100 @@ + + Maximum Likelihood Phylogenetic Analysis + + raxml + + + raxml.pl $GAMMA $PROT $protmodel $morphmodel $data_file $part_file $constraint + $ML $INVAR $Boot $seed Long $Out > $raxml_log + 2>&1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +RAxML estimates phylogenetic trees using Maximum Likelihood. + +------ + +**Inputs** + +Input file is a RAxML compatible text file. This can be generated by Osiris tools from an aligned fasta file with fasta2phylipE or from a PHYTAB format file with phylocatenator. 
+ +------ + +**Outputs** + +RAxML writes the resulting tree file in newick text format, which can be viewed in Osiris with TreeVector (of the mothur package). In addition, if bootstrapping was selected, the individual bootstrap trees and the ML tree with support are written as separate newick files. + +------- + +**Additional Information** + +raxml Home Page. +http://www.exelixis-lab.org/software.html + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Stamatakis, A. (2006). RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models. Bioinformatics. +http://bioinformatics.oxfordjournals.org/content/22/21/2688.short + + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/raxml_boot.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/raxml_boot.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,103 @@ + + Bootstrapped Maximum Likelihood Phylogenetic Analysis + + raxml + + + raxml.pl $GAMMA $PROT $protmodel $morphmodel $data_file $part_file $constraint + $ML $INVAR $Boot $seed Boot $Out > $raxml_log + 2>&1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +RAxML_boot is a copy of RAxML for longer bootstrap-only analysis. RAxML estimates phylogenetic trees using Maximum Likelihood. + +------ + +**Inputs** + +Input file is a RAxML compatible text file. This can be generated by Osiris tools from an aligned fasta file with fasta2phylipE or from a PHYTAB format file with phylocatenator. + +------ + +**Outputs** + +RAxML writes the resulting tree file in newick text format, which can be viewed in Osiris with TreeVector (of the mothur package). 
In addition, if bootstrapping was selected, the individual bootstrap trees and the ML tree with support are written as separate newick files. + +------- + +**Additional Information** + +The reason to have a copy is to allow Galaxy to request more resources from the cluster job runner. Both raxml and raxml_boot call raxml.pl, but raxml_boot calls the perl script with the 'Boot' flag, which then calls the MPI version of RAxML to spread bootstraps across many processors. The number of processors specified in the call in the raxml.pl file must match the resources requested in Galaxy's universe.ini file for raxml_boot. + +raxml Home Page. +http://www.exelixis-lab.org/software.html + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Stamatakis, A. (2006). RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models. Bioinformatics. +http://bioinformatics.oxfordjournals.org/content/22/21/2688.short + + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/raxml_long.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/raxml_long.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,105 @@ + + Maximum Likelihood Phylogenetic Analyses + + raxml + + + raxml.pl $GAMMA $PROT $protmodel $morphmodel $data_file $part_file $constraint + $ML $INVAR $Boot $seed Long $Out > $raxml_log + 2>&1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +The RAxML_long tool is a copy of the RAxML tool for running longer jobs. +RAxML estimates phylogenetic trees using Maximum Likelihood. + +------ + +**Inputs** + +Input file is a RAxML compatible text file. 
This can be generated by Osiris tools from an aligned fasta file with fasta2phylipE or from a PHYTAB format file with phylocatenator. + +------ + +**Outputs** + +RAxML writes the resulting tree file in newick text format, which can be viewed in Osiris with TreeVector (of the mothur package). In addition, if bootstrapping was selected, the individual bootstrap trees and the ML tree with support are written as separate newick files. + +------- + +**Additional Information** + + +The reason to have a copy is to allow Galaxy to request more resources from the cluster job runner. Both raxml and raxml_long call raxml.pl, but raxml_long calls the perl script with the 'Long' flag, which then calls raxml with more resources. The call in the raxml.pl file must match the resources requested in Galaxy's universe.ini file. + +raxml Home Page. +http://www.exelixis-lab.org/software.html + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Stamatakis, A. (2006). RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models. Bioinformatics. +http://bioinformatics.oxfordjournals.org/content/22/21/2688.short + + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/raxml_pars.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/raxml_pars.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,48 @@ +#! 
/usr/bin/perl -w + +use strict; +use warnings; +#raxml_pars.pl Galaxy wrapper calls raxml from raxml_pars.xml + +#For debugging command line pass, uncomment next 4 lines +#for (my $i=0; $i < @ARGV; $i++){ +# print "Parameter #$i ".$ARGV[$i]."\n\n"; +#} +#exit; + +my $datatype = shift(@ARGV); #0 datatype +my $data_file= shift(@ARGV); #1 input a phylip file +my $part_file = shift(@ARGV); #2 optional partition file +my $seed = shift(@ARGV); #3 Random number seed for the parsimony search (passed to raxml with -p) +my $outgroup = shift(@ARGV); #4 Specify the outgroup +my $model; + +#ADD OPTIONS TO BUILD FULL RAXML COMMANDLINE ARGUMENT + +my $build_command; +#First CALL RAXML THROUGH PATH (thread count is added in the next step) + $build_command = "raxmlHPC-PTHREADS-SSE3 "; +#Add Parsimony Option and Thread number (4) for PThreads + $build_command = $build_command." -y -T 4"; +#Next add call to input phylip file + $build_command = $build_command." -s ".$data_file; +#Add call to partition file name + unless($part_file eq 'None'){ + $build_command = $build_command." -q ".$part_file; + } +#model is passed directly with xml + $model = $datatype; + $build_command = $build_command." -m ".$model; +#Parsimony seed + $build_command = $build_command." -p ".$seed; +#name output files parsimony + $build_command = $build_command." -n parsimony"; +#Outgroup + if(defined $outgroup){ + $build_command = $build_command." -o ".$outgroup; + } + +print "Galaxy COMMAND BUILD WAS: $build_command\n"; + +#Actually call raxml (comment out the next line for a dry run) +system $build_command; diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/raxml_pars.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/raxml_pars.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,73 @@ + + Use RAxML to calculate a phylogeny with Parsimony + + raxml + + + raxml_pars.pl $datatype $data_file $part_file + $seed $Out > $raxml_log + 2>&1 + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +RAxML_pars estimates phylogenetic trees, in this case calling a Parsimony algorithm. 
+ +------ + +**Inputs** + +Input file is a RAxML compatible text file. This can be generated by Osiris tools from an aligned fasta file with fasta2phylipE or from a PHYTAB format file with phylocatenator. + +------ + +**Outputs** + +RAxML writes the resulting tree file in newick text format, which can be viewed in Osiris with TreeVector (of the mothur package). + +------- + +**Additional Information** + +raxml Home Page. +http://www.exelixis-lab.org/software.html + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Stamatakis, A. (2006). RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models. Bioinformatics. +http://bioinformatics.oxfordjournals.org/content/22/21/2688.short + + + + + diff -r 000000000000 -r 5b9a38ec4a39 phylogenies/raxml_read_placement.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylogenies/raxml_read_placement.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,73 @@ + + RAxML read placement - Produces a labeled tree from RAxML. + + raxml + + + raxmlHPC-PTHREADS-SSE3 -f v -s $alignment -m PROTGAMMAWAG -t $tree -n EPA_TEST -T 8 > $stdout 2>&1 + + + + + + + + + + + + + + + + +**What it does** + +This tool creates a labeled tree from an alignment file (in phylipE format) and a tree in nexus format. Any sequences found in the alignment that are not on the tree will be placed on the tree using ML, according to the EPA algorithm of Berger et al. + +------ + +**Inputs** + +1. Input file is a RAxML compatible text file. This can be generated by Osiris tools from an aligned fasta file with fasta2phylipE or from a PHYTAB format file with phylocatenator. +2. Second input file is a phylogenetic tree file in nexus format. 
This can be generated by RAxML or other programs. + +------ + +**Outputs** + +RAxML writes the resulting tree file in newick text format, which can be viewed in Osiris with TreeVector (of the mothur package). In addition, if bootstrapping was selected, the individual bootstrap trees and the ML tree with support are written as separate newick files. + +------- + +**Additional Information** + + The command this tool runs is: + raxmlHPC-PTHREADS-SSE3 -f v -s $alignment -m PROTGAMMAWAG -t $tree -n EPA_TEST -T 8 + + This specifies 8 concurrent threads with -T 8. Change the xml if you want to call a different number of threads. If using PBS or another job runner, make sure the universe.ini file is set to match the number of threads requested. + + raxml Home Page. + http://www.exelixis-lab.org/software.html + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +S.A. Berger, D. Krompass, A. Stamatakis: "Performance, Accuracy and Web-Server for Evolutionary Placement of Short Sequence Reads under maximum-likelihood". In Systematic Biology 60(3):291-302, 2011. + +Stamatakis, A. (2006). RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models. Bioinformatics. 
+http://bioinformatics.oxfordjournals.org/content/22/21/2688.short + + + + + diff -r 000000000000 -r 5b9a38ec4a39 phylographics/TreeVector.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylographics/TreeVector.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +""" +Runs TreeVector on a newick file; +TODO: more documentation +""" + +import optparse, os, shutil, subprocess, sys, tempfile, re, string + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-i', '--input', dest='input', help='The sequence input file' ) + parser.add_option( '-s', '--shape', dest='shape', help='Branch shape' ) + parser.add_option( '-l', '--length', dest='length', help='Branch length' ) + parser.add_option( '-g', '--svg', dest='svg', help='Graph in SVG format' ) + parser.add_option( '-p', '--jarBin', dest='jarBin', default='', help='The path to where jars are stored' ) + parser.add_option( '-j', '--jarFile', dest='jarFile', help='The file name of the jar file to use') + parser.add_option( '-x', '--jvmArgs', dest='jvmArgs', help='Java JVM arguments, e.g -Xmx250m') + (options, args) = parser.parse_args() + if options.jarBin == None: + stop_err("Misssing option --jarBin") + elif options.jarFile == None: + stop_err("Misssing option --jarFile") + elif options.input == None: + stop_err("Misssing option --input") + params = [] + props = [] + if options.jvmArgs != None: + props.append(options.jvmArgs) + if options.shape != None and options.shape != 'None': + params.append('-%s' % options.shape) + if options.length != None and options.length != 'None': + params.append('-%s' % options.length) + if options.svg != None and options.svg != 'None': + params.append('-out %s' % options.svg) + # make temp directory + buffsize = 1048576 + tmp_dir = tempfile.mkdtemp() + # print("tmp_dir %s" % tmp_dir) + # generate commandline + cmd = 'java %s -jar %s %s %s' % (' 
'.join(props), os.path.join( options.jarBin, options.jarFile ), options.input, ' '.join(params)) + # print >> sys.stderr, cmd + # need to nest try-except in try-finally to handle 2.4 + try: + try: + proc = subprocess.Popen( args=cmd, shell=True, stderr=subprocess.PIPE ) + returncode = proc.wait() + stderr = proc.stderr.read() + if returncode != 0: + raise Exception, stderr + except Exception, e: + raise Exception, 'Error executing TeeVector. ' + str( e ) + except Exception, e: + stop_err( 'TreeVector failed.\n' + str( e ) ) + +if __name__=="__main__": __main__() diff -r 000000000000 -r 5b9a38ec4a39 phylographics/TreeVector.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylographics/TreeVector.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,63 @@ + + Convert Newick tree topologies to SVG trees + TreeVector.py + --jarBin='~/bin/' + --jarFile='TreeVector.jar' + --jvmArgs='-Xmx250m' + --input=$tree + --svg=$output + --shape=$shape + --length=$length + + + + + + + + + + + + + + + + + + + +**What it does** + +TreeVector (Pethica et al., 2010) converts Newick format tree topologies to SVG trees. + +------ + +**Inputs** + +Newick tree topology. + +------ + +**Outputs** + +Scalable Vector Graphic (SVG) tree. + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider +citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +TreeVector Citation + +Pethica, R., Barker, G., Kovacs, T. and Gough, J. 2010. TreeVector: Scalable, Interactive, +Phylogenetic Trees for the Web. PLoS ONE 5(1), e8934. 
doi: 10.1371/journal.pone.0008934 + + + diff -r 000000000000 -r 5b9a38ec4a39 phylographics/makeRtrees.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylographics/makeRtrees.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,121 @@ +#!/usr/bin/perl + +#This script generates an R script to print trees to a pdf file +#input is a table with treenamenewick tree +use strict; + +my $filename = $ARGV[0]; +my $outfile = $ARGV[1]; +open FILE, $filename or die $!; +my $treetype = $ARGV[2]; +my $extiplabels = $ARGV[3]; +my $options; +my $labeltaxfile = $ARGV[4]; +my $query = $ARGV[5]; +my $midroot = $ARGV[6]; +my %labelhash; +my $genecount=0; +my @genes; +my $markquery; +my $midpoint; + +if($query eq 'yes'){ + $markquery = 1; +}else{ + $markquery = 0; +} +unless($labeltaxfile eq 'None'){ + open LABELFILE, $labeltaxfile or die $!; + while () { + chomp; + #get a line from the data file + my $currentinput = "$_"; + if($currentinput =~ /\t/){ + my @splitline = split(/\t/); + my $speciesname= $splitline[0]; + $speciesname = "'".$speciesname."'"; + my $treename = $splitline[1]; + if(exists $labelhash{$treename}){ + push @{ $labelhash{$treename} }, $speciesname; + }else{ + push @{ $labelhash{$treename} }, $speciesname; + #$labelhash{$treename} = $speciesname; + $genecount ++; + push @genes, $treename; + } + } + } + +}#end unless + + +if($extiplabels eq 'yes'){ + $options = ", show.tip.label=FALSE"; +}else{ + $options = ", show.tip.label=TRUE"; +} + +print "require(ape);\n"; +print "require(phangorn);\n"; +print "pdf(file='$outfile');\n"; + +while () { + chomp; + #get a line from the data file + my $currentinput = "$_"; + my @splitline = split(/\t/); + my $treename= $splitline[0]; + my $tree = $splitline[1]; + my $labelsvector; + + #print the R commands to make tree graphics + print "raw_tree <- read.tree(text = '$tree');\n"; + print "raw_tree\$edge.length[ is.na(raw_tree\$edge.length) ] <- 0 \n"; + if($midroot eq 'yes'){ + print "raw_tree <- midpoint(raw_tree)\n"; + } + #Check if 
large tree, then make text size smaller + print "thetips <- raw_tree\$tip.label \n"; + print "numtips <- length(thetips) \n"; + +#Make text smaller for trees with many tips + print "if(numtips>250){plot(raw_tree, cex=0.15, edge.width = 0.1, type='$treetype' $options)}else if(numtips>75){plot(raw_tree, cex=0.3, type='$treetype' $options)}else{plot(raw_tree, cex=0.6, type='$treetype' $options)};\n"; + print "title('Tree File: $treename');\n"; + +#Add taxon labels, if optional file present and if labels exist for tree + if(exists $labelhash{$treename}){ + $labelsvector = join ",", @{ $labelhash{$treename} }; + $labelsvector = "tolabel <- c(".$labelsvector.")"; + print "thetips <- raw_tree\$tip.label \n"; + print $labelsvector."\n"; + print "labels <- match(tolabel,thetips) \n"; + print "tiplabels(tip=labels, pch=21, cex=1) \n"; + } + + +#Add taxon labels if gene name contains QUERY - for readplacement + if($markquery == 1){ + print "thetips <- raw_tree\$tip.label \n"; + print "qlabels <- grep(\'QUERY\',thetips) \n"; + print "tiplabels(tip=qlabels, pch=21, cex=1.1) \n"; + print "l1labels <- grep(\'LANDMARK1\',thetips) \n"; + print "tiplabels(tip=l1labels, pch=15, cex=.8, col='red') \n"; + } +} + +print "dev.off();\n"; +close FILE; + +#Testing hash arrays +#my %nums; +#my $test='odd'; +#for my $n (4,5,6,10) { +# if ($n % 2) { +# push @{ $nums{$test} }, $n; +# } else { +# push @{ $nums{even} }, $n; +# } +#} +# +#print join ', ', @{ $nums{even} }; +#print "\n\n"; diff -r 000000000000 -r 5b9a38ec4a39 phylographics/phylographics.toolconf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylographics/phylographics.toolconf Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,5 @@ +
+ + +
+ diff -r 000000000000 -r 5b9a38ec4a39 phylographics/tab2trees.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylographics/tab2trees.sh Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,17 @@ +#First call perl script which reads trees and writes +#makeRtrees.pl must be in path! +command -v /var/www/galaxy_dev/galaxy-dist/tools/osiris/phylographics/makeRtrees.pl >/dev/null 2>&1 || { echo > $6 "ERROR: makeRtrees.pl must be available in Galaxy's path. R must be installed with ape and phangorn modules installed."; exit 1; } + + +#$1 infile +#$2 outfile +#$3 tree type (ie phylogram) +#$4 yes|no exclude tips +#$5 yes|no label taxa +#$6 name of Rfile "Rfile" by default in xml +#$7 yes|no to label OTUs with QUERY in title +#$8 yes|no to conduct midpoint rooting + +/var/www/galaxy_dev/galaxy-dist/tools/osiris/phylographics/makeRtrees.pl $1 $2 $3 $4 $5 $7 $8 > $6 2>log.txt + +R --vanilla < $6 2>log.txt diff -r 000000000000 -r 5b9a38ec4a39 phylographics/tab2trees.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylographics/tab2trees.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,57 @@ + + Create PDF file of phylogeny graphics from table of tree names and Newick trees + tab2trees.sh $input $output $treetype $extips $labeltax $Rfile $query $midpoint + + + + + + + + + + + + + + + + + + + + +**What it does** + +Creates a "book" of multiple phylogenetic tree graphics, one per page, in pdf format. + +------ + +**Inputs** + +tab delimited file. Column 1 is tree name column 2 is newick tree string. + +------ + +**Outputs** + +PDF file of phylogeny graphics. + +------- + +**Additional Information** + +Please direct questions or comments to ucsb_phylogenetics@lifesci.ucsb.edu or, if you can, enter them on the +osiris_phylogenetics site at bitbucket.org + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/PD.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/PD.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,68 @@ +#!/usr/bin/perl -w + +use strict; + +#use FindBin; +#use lib "$FindBin::Bin/lib"; +use Bio::TreeIO; +use Bio::Tree::Tree; + +###this script will find the phylogenetic distance between two species +#input is a tree, output filename, and table with pairwise distances +#usage: +#PD.pl +# parse in newick/new hampshire format +my @species1; +my @species2; + + +my $half=$ARGV[3]; +my $divtimebool; +if($half eq 'yes'){ + $divtimebool=1; +}elsif($half eq 'no'){ + $divtimebool=0; +}else{ + die "Argument must contain yes or no for divergence times\n"; +} +my $outfile = $ARGV[2]; +open(OUT, ">$outfile") or die("Couldn't open output file $ARGV[2]\n"); + + +my $pairsfile = $ARGV[0]; +open(PAIRS, "$pairsfile") or die("Couldn't open input file $ARGV[0]\n"); +while () { + chomp; + my $sp1; + my $sp2; + ($sp1, $sp2) = split("\t"); + push(@species1, $sp1); + push(@species2, $sp2); +} + +my $treefile = $ARGV[1]; + +for(my $i=0; $i < @species1; $i++){ + print OUT $species1[$i]."\t".$species2[$i]; + open(TREE, "$treefile") or die("Couldn't open output file $ARGV[1]\n"); + + my $treeio = new Bio::TreeIO('-format' => 'newick', + '-file' => $treefile); + + while(my $tree = $treeio->next_tree){; + my $node1 = $tree->find_node(-id => $species1[$i]); + my $node2 = $tree->find_node(-id => $species2[$i]); + my $distances = $tree->distance(-nodes => [$node1,$node2]); + + #ADD OPTION FOR DIVIDING BY 2 FOR DIVERGENCE TIMES + if($divtimebool==1){ + $distances = $distances/2 ; + } + print OUT "\t".$distances; + } +print OUT "\n"; +close(TREE); +} + +close(PAIRS); +close(OUT); diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/PDpairs.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/PDpairs.xml 
Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,59 @@ + + Pairwise distance between taxa in a phylogenetic tree + + Bio + + + PD.pl $intable $intree $outtable $half + + + + + + + + + + +**What it does** + +PDpairs generates Phylogenetic Distances (PD) for pairs of species in a phylogenetic tree. + +------ + +**Input formats** + +Input a table of species pairs:: + + species1 species2 + species2 species4 + +And a Newick format phylogeny with branch lengths + +------ + +**Outputs** + +Output is a table of the species pairs followed by a column of pairwise distance for each tree in the file:: + + species1 species2 1.104 + species2 species4 2.119 + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Stajich, J., Block, D., Boulez, K., Brenner, S., Chervitz, S., Dagdigian, C., Fuellen, G., Gilbert, J., Korf, I., Lapp, H. et al. (2002). The Bioperl toolkit: perl modules for the life sciences. Genome Res. 12,1611 -1618. + +Faith DP (1992) Conservation evaluation and phylogenetic diversity. Biological Conservation 61:1-10. + + + diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/SHtest.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/SHtest.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,46 @@ +#! 
/usr/bin/perl -w + +use strict; +use warnings; + +##For debugging command line pass, uncomment next +#for (my $i=0; $i < @ARGV; $i++){ +# print "Parameter #$i ".$ARGV[$i]."\n\n"; +#} +#exit; + +my $datatype = shift(@ARGV); #0 datatype +my $data_file= shift(@ARGV); #1 input a phylip file +my $part_file = shift(@ARGV); #2 optional partition file +my $best_tree = shift(@ARGV); #3 best tree for SH comparison +my $alt_trees = shift(@ARGV); #4 Alternative tree(s) for SH comparison +my $model; + +#ADD OPTIONS TO BUILD FULL RAXML COMMANDLINE ARGUMENT + +my $build_command; +#First CALL RAXML THROUGH PATH with 8 threads + $build_command = "raxmlHPC-PTHREADS-SSE3 "; +#Add SH Test Option and Thread number for PThreads + $build_command = $build_command."-f h -T 4"; +#Next add call to input phylip file + $build_command = $build_command." -s ".$data_file; +#model is passed directly with xml + $model = $datatype; + $build_command = $build_command." -m ".$model; +#Add call to partition file name + unless($part_file eq 'None'){ + $build_command = $build_command." -q ".$part_file; + } +#Next add call to input best tree file + $build_command = $build_command." -t ".$best_tree; +#Next add call to input best tree file + $build_command = $build_command." -z ".$alt_trees; +#name output files galaxy + $build_command = $build_command." -n SH"; + +print "Galaxy COMMAND BUILD WAS: $build_command\n"; + +#Uncomment to actually call raxml +system $build_command; + diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/SHtest.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/SHtest.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,74 @@ + + Use RAxML to calculate SHtest to compare phylogenetic tree topologies + + + raxml + + + SHtest.pl $datatype $data_file $part_file $best_tree $alt_trees + > $raxml_log + 2>&1 + + + + + + + + + + + + + + + + + + +**What it does** + +SHtest performs the Shimodaira-Hasegawa statistical test to compare one tree topology to alternative tree(s). 
The test is implemented with RAxML. + +------ + +**Inputs** + +#. A file in RAxML (phylipE) format. This can be made with Osiris tools phylocatenator or fasta2phylipE. +#. Model of evolution assumed for likelihood calculations. +#. An optional RAxML partition file delineates data set partitions. This can be made with Osiris tool phylocatenator. +#. Target phylogeny in newick format. +#. Alternative hypothesis phylogenetic tree(s) in newick format. + +------ + +**Outputs** + +The output is text output from RAxML, which details the statistical value of the SH test(s). + +------- + +**Additional Information** + +raxml Home Page: +http://www.exelixis-lab.org/software.html + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Shimodaira H., M. Hasegawa, 1999 Multiple comparisons of log-likelihoods with applications to phylogenetic inference Mol. Biol. Evol 16:1114-1116 + +Stamatakis, A. (2006). RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models. Bioinformatics. 
+http://bioinformatics.oxfordjournals.org/content/22/21/2688.short + + + diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/phylomatic.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/phylomatic.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,8 @@ +#!/usr/bin/perl + +my $file1 = $ARGV[0]; +my $file2 = $ARGV[1]; + +my $run = qx/phylomatic -f $file1 -t $file2 > output.txt 2> errors.txt /; + +print $run; diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/phylomatic.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/phylomatic.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,67 @@ + + Run Phylomatic + + Phylocom Phylomatic + + ./phylomatic.pl $input1 $input2 + + + + + + + + + + +**What it does** + +Phylomatic (Webb & Donoghue, 2005) is part of the Phylocom software package (Webb et al., 2008). +Phylomatic takes your list of taxa, and first tries to match them by genus name to the megatree. Failing that, they are attached by family name. +If all the genera appear in the megatree, then that family appears resolved. If even one genus is missing from the megatree, the returned phylogeny portrays a polytomy of genera. +Currently, species are not included in the megatree, and species within a genus are always returned as polytomies. + +------ + +**Inputs** + +Input 1: Phylogeny in Newick format. +Input 2: List of taxa in delimited text file. + +------ + +**Outputs** + +Phylomatic matches input taxa to the most resolved possible position in any of a set of master +trees in the database (the 'megatrees') and returns the phylogeny in one of a variety of formats: +graphical, Newick, NEXUS, or tabular. + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use +of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Phylomatic (Part of Phylocom software package) + +http://phylodiversity.net/phylomatic/html/pm2_form.html + +If you use results derived from Phylocom analyses in your publications, please cite: + +Webb, C. O., Ackerly, D. D. & Kembel, S. W. 2008. Phylocom: software for the analyses of phylogenetic community structure and trait evolution. Bioinformatics, 24: 2089-2100. +doi: 10.1093/bioinformatics/btn358 + +Original Phylomatic citation: + +Webb, C. O. & Donoghue, M. J. 2005. Phylomatic: tree assembly for applied phylogenetics. Molecular +Ecology Resources, 5: 181-183. doi: 10.1111/j.1471-8286.2004.00829.x + + + diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/phylostatistics.tool_conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/phylostatistics.tool_conf Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,14 @@ +
+
diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/phytab_LB_pruner.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/phytab_LB_pruner.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,142 @@ +import os +import optparse +import subprocess +from multiprocessing import Pool + +directory = "" +results = "results.data" +extension = "" +aligned_extension = ".tab" +datatype = "" + +perlpath = "/home/galaxy-dist/tools/osiris/tree-manipulation/" + +def unescape(string): + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__' + } + + for key, value in mapped_chars.iteritems(): + string = string.replace(value, key) + + return string + + +def isTabular(file): + with open(file) as f: + for line in f: + if line[0] == '>': + return False + return True + +#def toData(text, name): +# name = name.replace("fasta", "") #file name has fasta when fasta file called +# text = name.replace(".fs.tre", "") + "\t" + text.replace(" " , "") +# return text + + +def toData(text, name): + text = text.split('\n') + result = '' + for line in text: + if '\t' in line: + line = line.replace("./data/","") + "\n" + result += line + return result # Index past the first newline char + +def LB_pruner(input): + file_name = directory + os.sep + input + popen = subprocess.Popen(['perl', perlpath+'LB_prunerG.pl', file_name, indata, file_name + aligned_extension]) + popen.wait() + +class Sequence: + def __init__(self, string): + lis = string.split() + self.name = lis[0] + self.tree = lis[1] + self.string = string + + def printFASTA(self): + return self.tree + '\n' + +def saveMulti(tabFile): + with open(tabFile) as f: + for line in f: + seq = Sequence(line) + with open(directory + os.sep + seq.name + extension, "a") as p: + p.write(seq.printFASTA()) + +def saveSingle(fastaFile): + with open(fastaFile) as f: + for 
line in f: + with open(directory + os.sep + "fasta" + extension, "a") as p: + p.write(line) + +def main(): + usage = """%prog [options] +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-d', '--directory', + metavar="PATH", + dest='path', + default='.', + help='Path to working directory.') + + parser.add_option( + '-i', '--in', + dest='input', + action='store', + type='string', + metavar="FILE", + help='Name of input data.') + + parser.add_option( + '-m', '--mult', + dest='datatype', + action='store', + type='string', + help='Multiplier') + + options, args = parser.parse_args() + + global directory + global indata + inputFile = unescape(options.input) + directory = unescape(options.path) + os.sep + "data" + indata = unescape(options.datatype) + + os.mkdir(directory) + + if isTabular(inputFile): + saveMulti(inputFile) + else: + saveSingle(inputFile) + + pool = Pool() + list_of_files = [file for file in os.listdir(directory) if file.lower().endswith(extension)] + pool.map(LB_pruner, list_of_files) + + result = [file for file in os.listdir(directory) if file.lower().endswith(aligned_extension)] + with open(directory + os.sep + results, "a") as f: + for file in result: + with open(directory + os.sep + file, "r") as r: + f.write(toData(r.read(),file)) + +if __name__ == '__main__': + main() + diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/phytab_LB_pruner.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/phytab_LB_pruner.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,59 @@ + + LB_pruner: Identify genes on very long branches. + + LB_prunerG.pl + + + phytab_LB_pruner.py -i $data -m $multiplier + + + + + + + + + + + + +**What it does** + +LB pruner will find the average of all branch lengths in each Newick tree. 
Given the user-input multiplier value (M), if any branch is longer than Mx +the average branch length, that gene will be written to the tab-delimited output file. See outputs section for more information. + +------ + +**Inputs** + +Sequence data file in phytab or FASTA. + +Phytab description: http://osiris-phylogenetics.blogspot.com/2012/09/introduction-to-phytab-format.html + +Input a table as follows::: + name newick_tree; + name2 newick_tree; + name3 newick_tree; + +Enter a value for M, the multiplier. + +------ + +**Outputs** + +Tab delimited file listing any genes longer than Mx the average branch length for each Newick tree. If an internal branch is longer than M times the +average then all members of that clade are written to the out file, with 999 as the length of each branch. The third column is branch length, fourth +column is average branch length for each tree. + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/ses-mpd-galaxy.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/ses-mpd-galaxy.r Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,28 @@ +#!/sw/math/R-2.15.3-shlib/bin/Rscript + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# NOTE: since picante is licensed under the GPL, and this program relies on +# picante, the program is licensed under the GPL regardless +# +# See: http://cran.r-project.org/web/packages/picante/picante.pdf, +# http://www.gnu.org/licenses/old-licenses/gpl-2.0-faq.html#IfLibraryIsGPL +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +library('picante') + +args <- commandArgs(trailingOnly = TRUE) + +sample <- read.table(file = args[1]) +tree <- read.tree(file = args[2]) + +# get community data matrix of sample +comm <- sample2matrix(sample) +# get phylogenetic distance matrix of tree +phydist <- cophenetic(tree) + +# finally, run the processed info through ses.mpd to get the result we want +result <- ses.mpd(comm, phydist) + +# capture result and output to file +out <- capture.output(result) +cat(out, file = args[3], sep = "\n") diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/ses-mpd.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/ses-mpd.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,17 @@ + + + Standardized effect size of mean pairwise distances in communities. + ses-mpd-galaxy.r $samples $tree $output >/dev/null 2>&1 + + + + + + + + +Inputs: Phylocom sample formatted data text file, phylogenetic tree file. 
+ +Output: Output of ses.mpd in text file + + diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/tree_species.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/tree_species.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,22 @@ +#!/usr/bin/perl +use strict; + +my $infile=$ARGV[0]; +my $treefile=$ARGV[1]; + +open IN, $infile or die "Cannot open $infile\n"; +open TREE, $treefile or die "Cannot open $treefile\n"; + +my $tree = ; +close(TREE); + +while(){ + my $curspecies = $_; + chomp($curspecies); + $curspecies =~ s/ /\_/g ; + if($tree =~ m/$curspecies/){ + #match + }else{ + print $curspecies."\n"; + } +} diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/tree_species.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/tree_species.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,49 @@ + + Writes species not in a phylogeny to a text list. + + tree_species.pl $infile $treefile > $outfile + + + + + + + + + +**What it does** + +This tool can be used to determine which species from a list are not present in a tree. +Searches, one at a time, for species (OTUs) that are present in a tree file. If not found, the species is written to the output file. + +------ + +**Inputs** + +1. A text list of species to use to search. +2. A phylogeny file. + +------ + +**Outputs** + +A text list of species (OTU) names. + +------- + +**Additional Information** + +This is a very simple perl script that just searches a text file for the presence of each line in the other file. + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. 
+ +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + + + diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/tree_support_phyutility.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/tree_support_phyutility.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,59 @@ + + Calculates support for nodes of a single tree (bootstrap) using a file of multiple trees + + phyutility + + + java -jar ${GALAXY_DATA_INDEX_DIR}/shared/jars/phyutility.jar -ts -in $treesfile -tree $besttree -out $outtree + + + + + + + + + +**What it does** + +Tree support RAxML calculates support values for a given tree using a set of input trees. For example, it can calculate bootstrap support for the ML tree based on a set of trees from a bootstrap analysis. +This is valuable when separating bootstrap analysis from ML analysis, and when separating bootstrap replicates in different runs, which can be concatenated and used as input for this tool. + +------ + +**Inputs** + +1. Tree file with target tree. + +2. Set of phylogenies, branch lengths are ignored. + +------ + +**Outputs** + +The focal tree is output with support values in Newick tree format. + +------- + +**Additional Information** + +1. This tool Calls phyutility + +2. A similar tool is tree_support_RAxML, which works best with RAxML output + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Smith, S. A. and Dunn, C. W. (2008) Phyutility: a phyloinformatics tool for trees, alignments, and molecular data. Bioinformatics. 
24: 715-716 + + + diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/tree_support_raxml.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/tree_support_raxml.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,67 @@ + + Calculates support for nodes of a single tree (bootstrap) using a file of multiple trees + + raxml + + + raxmlHPC-PTHREADS-SSE3 -T 2 -f b -t $besttree -z $treesfile -m GTRCAT -n galaxy -o $Out + + + + + + + + + + + + +**What it does** + +Tree support RAxML calculates support values for a given tree using a set of input trees. For example, it can calculate bootstrap support for the ML tree based on a set of trees from a bootstrap analysis. +This is valuable when separating bootstrap analysis from ML analysis, and when separating bootstrap replicates in different runs, which can be concatenated and used as input for this tool. + +------ + +**Inputs** + +1. Newick format target tree, branch lengths are optional. + +2. Set of Newick format phylogenies, branch lengths are ignored. + +3. Optional outgroup specification. There can be no spaces in this entry. + +------ + +**Outputs** + +The focal tree is output with support values in Newick tree format. + +------- + +**Additional Information** + +1. This tool Calls RAxML with -f b option with -t and -z options + +2. A similar tool is tree_support_phyutility, which works best with nexus format trees. + +------- + +**Citations** + +This tool is part of the Osiris Phylogenetics Tool Package for Galaxy. If you make extensive use of this tool in a publication, please consider citing the following. + +Current Osiris Citation is here + +http://osiris-phylogenetics.blogspot.com/2012/10/citation.html + +Additional Citations for this tool + +Stamatakis, A. (2006). RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models. Bioinformatics. 
+http://bioinformatics.oxfordjournals.org/content/22/21/2688.short + + + + + diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/treeannotator.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/treeannotator.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,23 @@ +#!/usr/bin/perl + +my $treeannotator_path = '/home/galaxy/pkgs/BEAST172/bin/treeannotator'; + +my $input = $ARGV[0]; +my $burnin = $ARGV[1]; +my $Node_heights = $ARGV[2]; + +my $node_opt; + +if($Node_heights eq "0") { + $node_opt = "keep"; +} +elsif($Node_heights eq "1") { + $node_opt = "median"; +} +elsif($Node_heights eq "2") { + $node_opt = "mean"; +} + +my $run = qx/$treeannotator_path -heights $node_opt -burnin $burnin $input out.tre 2>log.txt/; + +print $run; diff -r 000000000000 -r 5b9a38ec4a39 phylostatistics/treeannotator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylostatistics/treeannotator.xml Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,39 @@ + + Runs BEAST 1.7.2 Tree Annotator + treeannotator.pl $input $burnin $node_heights + + + + + + + + + + + + + + + TreeAnnotator is a part of BEAST 1.7.2. + + http://beast.bio.ed.ac.uk/Main_Page + + This program assists in summarizing the information from a sample of trees produced by BEAST onto a single target tree. + The summary information includes the posterior probabilities of the nodes in the target tree, the posterior estimates and HPD limits of the node heights and (in the case of a relaxed molecular clock model) the rates. + + Burnin: This option allows you to select the amount of burn-in, i.e., the number of samples that will be discarded at the start of the run, so that you are only analysing the part of the trace that is in equilibrium. + + Node heights: This option allows you select how the node heights are summarised on the target tree. You can choose to keep the heights that the target tree has, or rescale it to reflect the posterior mean/median node heights for the clades contained in the target tree. 
+ + http://beast.bio.ed.ac.uk/TreeAnnotator + + Citations: + + http://mbe.oxfordjournals.org/content/early/2012/02/25/molbev.mss075.abstract + Drummond AJ, Suchard MA, Xie D and Rambaut A "Bayesian phylogenetics with BEAUti and the BEAST 1.7" "Molecular Biology And Evolution" "in press" + + + + +
mview HTML
' + f + '