Mercurial > repos > jjv_bioinformaticians > phylogeny_maximum_parsimony_informative_sites
changeset 3:1cd1d4171142 draft
Uploaded
author | jjv_bioinformaticians |
---|---|
date | Wed, 05 Apr 2017 14:32:18 -0400 (2017-04-05) |
parents | 6ab618072d5c |
children | 549887e3f045 |
files | phylo-protpars.pl |
diffstat | 1 files changed, 160 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phylo-protpars.pl Wed Apr 05 14:32:18 2017 -0400 @@ -0,0 +1,160 @@ +#!/usr/bin/perl -w +use Bio::AlignIO; +use Bio::Root::IO; +use Bio::Seq; +use Bio::SeqIO; +use Bio::SimpleAlign; +use Bio::TreeIO; +use strict; +use warnings; + +use Bio::Tools::Run::Phylo::Phylip::ProtPars; +use Bio::Tools::Run::Alignment::Clustalw; +use Bio::Tools::Run::Phylo::Phylip::DrawTree; + +## Links +# https://metacpan.org/pod/Bio::AlignIO +# https://metacpan.org/pod/Bio::SimpleAlign +# https://metacpan.org/pod/Bio::Seq +## + +$ENV{PHYLIPDIR} = '/usr/lib/phylip/bin/'; +$ENV{CLUSTALWDIR} = '/usr/local/bin/'; +my $inputfilename = ''; +my $outputfilename = 'sequences'; +my $outHandler; +my $phylogeny = 0; +my $nValues = 0; +my $nFrequency = 0; + + +sub preprocess { + my ($file, $nValues, $nFrequency) = @_; + my $in = Bio::AlignIO->new(-file => $file , -format => 'phylip'); + my $aln = $in->next_aln(); # Get the first alignment + # Getting header + my $numSeq = $aln->num_sequences; # 5 seqs + my $length = $aln->length(); # 376 bases aligning with gaps + # Header end + my @sequences; + while ( $aln->num_sequences ) { # Number of sequences in this aligment. + # Extract sequences and check values for the alignment column $pos + #my @secuencias=(); + foreach my $seq ($aln->each_seq) { + my $sequence = $seq->seq();# + my @arraySequence = $sequence =~ /./sg; # String to array + push(@sequences,\@arraySequence); # Matrix of sequences + } + + # Creates hash that contains the repetitions of each aa in seq + my @toDelete = (); + for (my $j=0; $j<$length;$j++) { + my %amino = (); # AA hash + for (my $i=0; $i<$numSeq;$i++) { + my $char = $sequences[$i][$j]; + if ($char ne "-") { + if (exists $amino{$sequences[$i][$j]}) { + $amino{$sequences[$i][$j]} += 1; + } else { + $amino{$sequences[$i][$j]} = 1; + } + } + + } # End for i ($numSeq) + my $count = 0; + foreach my $key (keys %amino) { # $key = AA name + my $aminoCount = $amino{$key}; # Number of repeats of current amino acid. + if ($aminoCount >= $nFrequency) { + $count++; + } + } + if (!($count >= $nValues)) { # Delete column + push(@toDelete, $j); + } + } # End for j ($length) + + my $new_aln = $aln; + # Remove the columns included in "toDelete" hash + for (my $i = ((scalar(@toDelete)-1)); $i >= 0 ; $i--) { + $new_aln = $new_aln->remove_columns([$toDelete[$i],$toDelete[$i]]); + } + + $outHandler->write_aln($new_aln); + $aln = $in->next_aln(); # Take the next aligment, if it was. + } # End while +} + +## +# MAIN +## + +## Check input parameters +my $num_args = $#ARGV + 1; +if (!($num_args == 1 || $num_args == 3)) { + print "\nUsage: phylo-protpars.pl file.fasta [Num. Values] [Num. Repeats]\n"; + exit; +} +# Read arg2 and arg3 +if ($num_args == 3) { + $outHandler = Bio::AlignIO->new(-file => ">$outputfilename.aln.phy", + -format => 'phylip'); + $phylogeny = 1; +} + +# Taking values of parameters from arguments +$inputfilename = $ARGV[0]; +if (length($inputfilename)>200){ + print "Error: File name length msut be <200 characters\n"; + exit; +} +if ($phylogeny) { # Parse values + $nValues=$ARGV[1]; + if ($nValues < 2) { + print "Error: Number of values less than 2\n"; + exit; + } + $nFrequency=$ARGV[2]; + if ($nFrequency < 2) { + print "Error: Number of repeats less than 2\n"; + exit; + } +} +## End checking + +# Start MSA +my @params = ('ktuple' => 2, 'matrix' => 'BLOSUM','output'=>'phylip', 'outfile'=>$outputfilename.".aln"); +my $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); +my $aln = $factory->run($inputfilename); + + +# Need to parse the MSA output file? +if ($phylogeny) { + preprocess($outputfilename.".aln", $nValues, $nFrequency); +} + + +# Process phylogeny +my @params2 = (idlength=>30,threshold=>10,jumble=>"17,10",outgroup=>2); +my $tree_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars-> new(@params2); +my $output = ""; +if ($phylogeny) { + $output = $outputfilename.".aln.phy"; +} else { + $output = $outputfilename.".aln"; +} + +# Create the tree +my $tree = $tree_factory->run($output); +my @params3=('fontfile'=>'fontfile', 'tempdir'=>'images/'); +my $treedraw = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(@params3); + +#If file 'plotfile' exists, it has to be deleted. +if (-e 'images/plotfile'){ + unlink 'images/plotfile' or warn "Could not unlink images/plotfile: $!"; +} + +open FILE, ">treePars.txt" or die $!; +print FILE $tree->as_text('newick'); + +# Create draw with treePars +# my $treeimagefile = $treedraw->run("treePars.txt"); \ No newline at end of file