3
|
1 #!/usr/bin/perl -w
|
|
2 use Bio::AlignIO;
|
|
3 use Bio::Root::IO;
|
|
4 use Bio::Seq;
|
|
5 use Bio::SeqIO;
|
|
6 use Bio::SimpleAlign;
|
|
7 use Bio::TreeIO;
|
|
8 use strict;
|
|
9 use warnings;
|
|
10
|
|
11 use Bio::Tools::Run::Phylo::Phylip::ProtPars;
|
|
12 use Bio::Tools::Run::Alignment::Clustalw;
|
|
13 use Bio::Tools::Run::Phylo::Phylip::DrawTree;
|
|
14
|
|
15 ## Links
|
|
16 # https://metacpan.org/pod/Bio::AlignIO
|
|
17 # https://metacpan.org/pod/Bio::SimpleAlign
|
|
18 # https://metacpan.org/pod/Bio::Seq
|
|
19 ##
|
|
20
|
|
21 $ENV{PHYLIPDIR} = '/usr/lib/phylip/bin/';
|
|
22 $ENV{CLUSTALWDIR} = '/usr/local/bin/';
|
|
23 my $inputfilename = '';
|
|
24 my $outputfilename = 'sequences';
|
|
25 my $outHandler;
|
|
26 my $phylogeny = 0;
|
|
27 my $nValues = 0;
|
|
28 my $nFrequency = 0;
|
|
29
|
|
30
|
|
31 sub preprocess {
|
|
32 my ($file, $nValues, $nFrequency) = @_;
|
|
33 my $in = Bio::AlignIO->new(-file => $file , -format => 'phylip');
|
|
34 my $aln = $in->next_aln(); # Get the first alignment
|
|
35 # Getting header
|
|
36 my $numSeq = $aln->num_sequences; # 5 seqs
|
|
37 my $length = $aln->length(); # 376 bases aligning with gaps
|
|
38 # Header end
|
|
39 my @sequences;
|
|
40 while ( $aln->num_sequences ) { # Number of sequences in this aligment.
|
|
41 # Extract sequences and check values for the alignment column $pos
|
|
42 #my @secuencias=();
|
|
43 foreach my $seq ($aln->each_seq) {
|
|
44 my $sequence = $seq->seq();#
|
|
45 my @arraySequence = $sequence =~ /./sg; # String to array
|
|
46 push(@sequences,\@arraySequence); # Matrix of sequences
|
|
47 }
|
|
48
|
|
49 # Creates hash that contains the repetitions of each aa in seq
|
|
50 my @toDelete = ();
|
|
51 for (my $j=0; $j<$length;$j++) {
|
|
52 my %amino = (); # AA hash
|
|
53 for (my $i=0; $i<$numSeq;$i++) {
|
|
54 my $char = $sequences[$i][$j];
|
|
55 if ($char ne "-") {
|
|
56 if (exists $amino{$sequences[$i][$j]}) {
|
|
57 $amino{$sequences[$i][$j]} += 1;
|
|
58 } else {
|
|
59 $amino{$sequences[$i][$j]} = 1;
|
|
60 }
|
|
61 }
|
|
62
|
|
63 } # End for i ($numSeq)
|
|
64 my $count = 0;
|
|
65 foreach my $key (keys %amino) { # $key = AA name
|
|
66 my $aminoCount = $amino{$key}; # Number of repeats of current amino acid.
|
|
67 if ($aminoCount >= $nFrequency) {
|
|
68 $count++;
|
|
69 }
|
|
70 }
|
|
71 if (!($count >= $nValues)) { # Delete column
|
|
72 push(@toDelete, $j);
|
|
73 }
|
|
74 } # End for j ($length)
|
|
75
|
|
76 my $new_aln = $aln;
|
|
77 # Remove the columns included in "toDelete" hash
|
|
78 for (my $i = ((scalar(@toDelete)-1)); $i >= 0 ; $i--) {
|
|
79 $new_aln = $new_aln->remove_columns([$toDelete[$i],$toDelete[$i]]);
|
|
80 }
|
|
81
|
|
82 $outHandler->write_aln($new_aln);
|
|
83 $aln = $in->next_aln(); # Take the next aligment, if it was.
|
|
84 } # End while
|
|
85 }
|
|
86
|
|
87 ##
|
|
88 # MAIN
|
|
89 ##
|
|
90
|
|
91 ## Check input parameters
|
|
92 my $num_args = $#ARGV + 1;
|
|
93 if (!($num_args == 1 || $num_args == 3)) {
|
|
94 print "\nUsage: phylo-protpars.pl file.fasta [Num. Values] [Num. Repeats]\n";
|
|
95 exit;
|
|
96 }
|
|
97 # Read arg2 and arg3
|
|
98 if ($num_args == 3) {
|
|
99 $outHandler = Bio::AlignIO->new(-file => ">$outputfilename.aln.phy",
|
|
100 -format => 'phylip');
|
|
101 $phylogeny = 1;
|
|
102 }
|
|
103
|
|
104 # Taking values of parameters from arguments
|
|
105 $inputfilename = $ARGV[0];
|
|
106 if (length($inputfilename)>200){
|
|
107 print "Error: File name length msut be <200 characters\n";
|
|
108 exit;
|
|
109 }
|
|
110 if ($phylogeny) { # Parse values
|
|
111 $nValues=$ARGV[1];
|
|
112 if ($nValues < 2) {
|
|
113 print "Error: Number of values less than 2\n";
|
|
114 exit;
|
|
115 }
|
|
116 $nFrequency=$ARGV[2];
|
|
117 if ($nFrequency < 2) {
|
|
118 print "Error: Number of repeats less than 2\n";
|
|
119 exit;
|
|
120 }
|
|
121 }
|
|
122 ## End checking
|
|
123
|
|
124 # Start MSA
|
|
125 my @params = ('ktuple' => 2, 'matrix' => 'BLOSUM','output'=>'phylip', 'outfile'=>$outputfilename.".aln");
|
|
126 my $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
|
|
127 my $aln = $factory->run($inputfilename);
|
|
128
|
|
129
|
|
130 # Need to parse the MSA output file?
|
|
131 if ($phylogeny) {
|
|
132 preprocess($outputfilename.".aln", $nValues, $nFrequency);
|
|
133 }
|
|
134
|
|
135
|
|
136 # Process phylogeny
|
|
137 my @params2 = (idlength=>30,threshold=>10,jumble=>"17,10",outgroup=>2);
|
|
138 my $tree_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars-> new(@params2);
|
|
139 my $output = "";
|
|
140 if ($phylogeny) {
|
|
141 $output = $outputfilename.".aln.phy";
|
|
142 } else {
|
|
143 $output = $outputfilename.".aln";
|
|
144 }
|
|
145
|
|
146 # Create the tree
|
|
147 my $tree = $tree_factory->run($output);
|
|
148 my @params3=('fontfile'=>'fontfile', 'tempdir'=>'images/');
|
|
149 my $treedraw = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(@params3);
|
|
150
|
|
151 #If file 'plotfile' exists, it has to be deleted.
|
|
152 if (-e 'images/plotfile'){
|
|
153 unlink 'images/plotfile' or warn "Could not unlink images/plotfile: $!";
|
|
154 }
|
|
155
|
|
156 open FILE, ">treePars.txt" or die $!;
|
|
157 print FILE $tree->as_text('newick');
|
|
158
|
|
159 # Create draw with treePars
|
|
160 # my $treeimagefile = $treedraw->run("treePars.txt"); |