view phylo-protpars.pl @ 4:549887e3f045 draft default tip

Uploading readme.txt
author jjv_bioinformaticians
date Wed, 12 Apr 2017 11:39:43 -0400
parents 1cd1d4171142
children
line wrap: on
line source

#!/usr/bin/perl -w
use Bio::AlignIO;
use Bio::Root::IO;
use Bio::Seq;
use Bio::SeqIO;
use Bio::SimpleAlign;
use Bio::TreeIO;
use strict;
use warnings;

use Bio::Tools::Run::Phylo::Phylip::ProtPars;
use Bio::Tools::Run::Alignment::Clustalw;
use Bio::Tools::Run::Phylo::Phylip::DrawTree;

## Links
# https://metacpan.org/pod/Bio::AlignIO
# https://metacpan.org/pod/Bio::SimpleAlign
# https://metacpan.org/pod/Bio::Seq
##

$ENV{PHYLIPDIR} = '/usr/lib/phylip/bin/';
$ENV{CLUSTALWDIR} = '/usr/local/bin/';
my $inputfilename = '';
my $outputfilename = 'sequences';
my $outHandler;
my $phylogeny = 0;
my $nValues = 0;
my $nFrequency = 0;


sub preprocess {
    my ($file, $nValues, $nFrequency) = @_;
    my $in  = Bio::AlignIO->new(-file => $file , -format => 'phylip');
    my $aln = $in->next_aln(); # Get the first alignment
    # Getting header
    my $numSeq = $aln->num_sequences; # 5 seqs
    my $length = $aln->length(); # 376 bases aligning with gaps
    # Header end
    my @sequences;
    while ( $aln->num_sequences ) { # Number of sequences in this aligment.
      # Extract sequences and check values for the alignment column $pos
      #my @secuencias=();
      foreach my $seq ($aln->each_seq) {
          my $sequence = $seq->seq();#
          my @arraySequence = $sequence =~ /./sg; # String to array
          push(@sequences,\@arraySequence);  # Matrix of sequences
      }
    
    # Creates hash that contains the repetitions of each aa in seq
      my @toDelete = ();
      for (my $j=0; $j<$length;$j++) {
        my %amino = (); # AA hash
        for (my $i=0; $i<$numSeq;$i++) {
          my $char = $sequences[$i][$j];
          if ($char ne "-") {
            if (exists $amino{$sequences[$i][$j]}) {
              $amino{$sequences[$i][$j]} += 1;
            } else {
              $amino{$sequences[$i][$j]} = 1;
            }
          }

        } # End for i ($numSeq)
        my $count = 0;
        foreach my $key (keys %amino) { # $key = AA name
          my $aminoCount = $amino{$key}; # Number of repeats of current amino acid.
          if ($aminoCount >= $nFrequency) {
            $count++;
          }
        }
        if (!($count >= $nValues)) { # Delete column
          push(@toDelete, $j);
        }
      } # End for j ($length)
      
      my $new_aln = $aln;
      # Remove the columns included in "toDelete" hash
      for (my $i = ((scalar(@toDelete)-1));  $i >= 0 ; $i--) {
        $new_aln = $new_aln->remove_columns([$toDelete[$i],$toDelete[$i]]);
      }
      
      $outHandler->write_aln($new_aln);
      $aln = $in->next_aln(); # Take the next aligment, if it was.
  } # End while
}

##
#   MAIN
##

## Check input parameters
my $num_args = $#ARGV + 1;
if (!($num_args == 1 || $num_args == 3)) {
    print "\nUsage: phylo-protpars.pl file.fasta [Num. Values] [Num. Repeats]\n";
    exit;
}
# Read arg2 and arg3
if ($num_args == 3) {
    $outHandler = Bio::AlignIO->new(-file   => ">$outputfilename.aln.phy",
                         -format => 'phylip');
    $phylogeny = 1;
}

# Taking values of parameters from arguments
$inputfilename = $ARGV[0];
if (length($inputfilename)>200){
    print "Error: File name length msut be <200 characters\n";
    exit;
}
if ($phylogeny) { # Parse values
  $nValues=$ARGV[1];
  if ($nValues < 2) {
      print "Error: Number of values less than 2\n";
      exit;
  }
  $nFrequency=$ARGV[2];
  if ($nFrequency < 2) {
      print "Error: Number of repeats less than 2\n";
      exit;
  }
}
## End checking

# Start MSA
my  @params = ('ktuple' => 2, 'matrix' => 'BLOSUM','output'=>'phylip', 'outfile'=>$outputfilename.".aln");
my  $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
my  $aln = $factory->run($inputfilename);


# Need to parse the MSA output file?
if ($phylogeny) {
  preprocess($outputfilename.".aln", $nValues, $nFrequency);
}


# Process phylogeny
my @params2 = (idlength=>30,threshold=>10,jumble=>"17,10",outgroup=>2);
my $tree_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars-> new(@params2);
my $output = "";
if ($phylogeny) {
    $output = $outputfilename.".aln.phy";
} else {
    $output = $outputfilename.".aln";
}

# Create the tree
my $tree = $tree_factory->run($output);
my @params3=('fontfile'=>'fontfile', 'tempdir'=>'images/');
my $treedraw = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(@params3);

#If file 'plotfile' exists, it has to be deleted.
if (-e 'images/plotfile'){
    unlink 'images/plotfile' or warn "Could not unlink images/plotfile: $!";
}

open FILE, ">treePars.txt" or die $!;
print FILE $tree->as_text('newick');

# Create draw with treePars
# my $treeimagefile = $treedraw->run("treePars.txt");