changeset 3:1cd1d4171142 draft

Uploaded
author jjv_bioinformaticians
date Wed, 05 Apr 2017 14:32:18 -0400 (2017-04-05)
parents 6ab618072d5c
children 549887e3f045
files phylo-protpars.pl
diffstat 1 files changed, 160 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phylo-protpars.pl	Wed Apr 05 14:32:18 2017 -0400
@@ -0,0 +1,160 @@
+#!/usr/bin/perl -w
+use Bio::AlignIO;
+use Bio::Root::IO;
+use Bio::Seq;
+use Bio::SeqIO;
+use Bio::SimpleAlign;
+use Bio::TreeIO;
+use strict;
+use warnings;
+
+use Bio::Tools::Run::Phylo::Phylip::ProtPars;
+use Bio::Tools::Run::Alignment::Clustalw;
+use Bio::Tools::Run::Phylo::Phylip::DrawTree;
+
+## Links
+# https://metacpan.org/pod/Bio::AlignIO
+# https://metacpan.org/pod/Bio::SimpleAlign
+# https://metacpan.org/pod/Bio::Seq
+##
+
+$ENV{PHYLIPDIR} = '/usr/lib/phylip/bin/';
+$ENV{CLUSTALWDIR} = '/usr/local/bin/';
+my $inputfilename = '';
+my $outputfilename = 'sequences';
+my $outHandler;
+my $phylogeny = 0;
+my $nValues = 0;
+my $nFrequency = 0;
+
+
+sub preprocess {
+    my ($file, $nValues, $nFrequency) = @_;
+    my $in  = Bio::AlignIO->new(-file => $file , -format => 'phylip');
+    my $aln = $in->next_aln(); # Get the first alignment
+    # Getting header
+    my $numSeq = $aln->num_sequences; # 5 seqs
+    my $length = $aln->length(); # 376 bases aligning with gaps
+    # Header end
+    my @sequences;
+    while ( $aln->num_sequences ) { # Number of sequences in this aligment.
+      # Extract sequences and check values for the alignment column $pos
+      #my @secuencias=();
+      foreach my $seq ($aln->each_seq) {
+          my $sequence = $seq->seq();#
+          my @arraySequence = $sequence =~ /./sg; # String to array
+          push(@sequences,\@arraySequence);  # Matrix of sequences
+      }
+    
+    # Creates hash that contains the repetitions of each aa in seq
+      my @toDelete = ();
+      for (my $j=0; $j<$length;$j++) {
+        my %amino = (); # AA hash
+        for (my $i=0; $i<$numSeq;$i++) {
+          my $char = $sequences[$i][$j];
+          if ($char ne "-") {
+            if (exists $amino{$sequences[$i][$j]}) {
+              $amino{$sequences[$i][$j]} += 1;
+            } else {
+              $amino{$sequences[$i][$j]} = 1;
+            }
+          }
+
+        } # End for i ($numSeq)
+        my $count = 0;
+        foreach my $key (keys %amino) { # $key = AA name
+          my $aminoCount = $amino{$key}; # Number of repeats of current amino acid.
+          if ($aminoCount >= $nFrequency) {
+            $count++;
+          }
+        }
+        if (!($count >= $nValues)) { # Delete column
+          push(@toDelete, $j);
+        }
+      } # End for j ($length)
+      
+      my $new_aln = $aln;
+      # Remove the columns included in "toDelete" hash
+      for (my $i = ((scalar(@toDelete)-1));  $i >= 0 ; $i--) {
+        $new_aln = $new_aln->remove_columns([$toDelete[$i],$toDelete[$i]]);
+      }
+      
+      $outHandler->write_aln($new_aln);
+      $aln = $in->next_aln(); # Take the next aligment, if it was.
+  } # End while
+}
+
+##
+#   MAIN
+##
+
+## Check input parameters
+my $num_args = $#ARGV + 1;
+if (!($num_args == 1 || $num_args == 3)) {
+    print "\nUsage: phylo-protpars.pl file.fasta [Num. Values] [Num. Repeats]\n";
+    exit;
+}
+# Read arg2 and arg3
+if ($num_args == 3) {
+    $outHandler = Bio::AlignIO->new(-file   => ">$outputfilename.aln.phy",
+                         -format => 'phylip');
+    $phylogeny = 1;
+}
+
+# Taking values of parameters from arguments
+$inputfilename = $ARGV[0];
+if (length($inputfilename)>200){
+    print "Error: File name length msut be <200 characters\n";
+    exit;
+}
+if ($phylogeny) { # Parse values
+  $nValues=$ARGV[1];
+  if ($nValues < 2) {
+      print "Error: Number of values less than 2\n";
+      exit;
+  }
+  $nFrequency=$ARGV[2];
+  if ($nFrequency < 2) {
+      print "Error: Number of repeats less than 2\n";
+      exit;
+  }
+}
+## End checking
+
+# Start MSA
+my  @params = ('ktuple' => 2, 'matrix' => 'BLOSUM','output'=>'phylip', 'outfile'=>$outputfilename.".aln");
+my  $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
+my  $aln = $factory->run($inputfilename);
+
+
+# Need to parse the MSA output file?
+if ($phylogeny) {
+  preprocess($outputfilename.".aln", $nValues, $nFrequency);
+}
+
+
+# Process phylogeny
+my @params2 = (idlength=>30,threshold=>10,jumble=>"17,10",outgroup=>2);
+my $tree_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars-> new(@params2);
+my $output = "";
+if ($phylogeny) {
+    $output = $outputfilename.".aln.phy";
+} else {
+    $output = $outputfilename.".aln";
+}
+
+# Create the tree
+my $tree = $tree_factory->run($output);
+my @params3=('fontfile'=>'fontfile', 'tempdir'=>'images/');
+my $treedraw = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(@params3);
+
+#If file 'plotfile' exists, it has to be deleted.
+if (-e 'images/plotfile'){
+    unlink 'images/plotfile' or warn "Could not unlink images/plotfile: $!";
+}
+
+open FILE, ">treePars.txt" or die $!;
+print FILE $tree->as_text('newick');
+
+# Create draw with treePars
+# my $treeimagefile = $treedraw->run("treePars.txt");
\ No newline at end of file