annotate phylo-protpars.pl @ 4:549887e3f045 draft default tip

Uploading readme.txt
author jjv_bioinformaticians
date Wed, 12 Apr 2017 11:39:43 -0400
parents 1cd1d4171142
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
1 #!/usr/bin/perl -w
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
2 use Bio::AlignIO;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
3 use Bio::Root::IO;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
4 use Bio::Seq;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
5 use Bio::SeqIO;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
6 use Bio::SimpleAlign;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
7 use Bio::TreeIO;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
8 use strict;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
9 use warnings;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
10
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
11 use Bio::Tools::Run::Phylo::Phylip::ProtPars;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
12 use Bio::Tools::Run::Alignment::Clustalw;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
13 use Bio::Tools::Run::Phylo::Phylip::DrawTree;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
14
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
15 ## Links
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
16 # https://metacpan.org/pod/Bio::AlignIO
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
17 # https://metacpan.org/pod/Bio::SimpleAlign
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
18 # https://metacpan.org/pod/Bio::Seq
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
19 ##
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
20
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
21 $ENV{PHYLIPDIR} = '/usr/lib/phylip/bin/';
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
22 $ENV{CLUSTALWDIR} = '/usr/local/bin/';
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
23 my $inputfilename = '';
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
24 my $outputfilename = 'sequences';
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
25 my $outHandler;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
26 my $phylogeny = 0;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
27 my $nValues = 0;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
28 my $nFrequency = 0;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
29
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
30
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
31 sub preprocess {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
32 my ($file, $nValues, $nFrequency) = @_;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
33 my $in = Bio::AlignIO->new(-file => $file , -format => 'phylip');
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
34 my $aln = $in->next_aln(); # Get the first alignment
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
35 # Getting header
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
36 my $numSeq = $aln->num_sequences; # 5 seqs
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
37 my $length = $aln->length(); # 376 bases aligning with gaps
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
38 # Header end
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
39 my @sequences;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
40 while ( $aln->num_sequences ) { # Number of sequences in this aligment.
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
41 # Extract sequences and check values for the alignment column $pos
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
42 #my @secuencias=();
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
43 foreach my $seq ($aln->each_seq) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
44 my $sequence = $seq->seq();#
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
45 my @arraySequence = $sequence =~ /./sg; # String to array
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
46 push(@sequences,\@arraySequence); # Matrix of sequences
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
47 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
48
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
49 # Creates hash that contains the repetitions of each aa in seq
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
50 my @toDelete = ();
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
51 for (my $j=0; $j<$length;$j++) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
52 my %amino = (); # AA hash
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
53 for (my $i=0; $i<$numSeq;$i++) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
54 my $char = $sequences[$i][$j];
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
55 if ($char ne "-") {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
56 if (exists $amino{$sequences[$i][$j]}) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
57 $amino{$sequences[$i][$j]} += 1;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
58 } else {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
59 $amino{$sequences[$i][$j]} = 1;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
60 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
61 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
62
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
63 } # End for i ($numSeq)
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
64 my $count = 0;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
65 foreach my $key (keys %amino) { # $key = AA name
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
66 my $aminoCount = $amino{$key}; # Number of repeats of current amino acid.
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
67 if ($aminoCount >= $nFrequency) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
68 $count++;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
69 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
70 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
71 if (!($count >= $nValues)) { # Delete column
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
72 push(@toDelete, $j);
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
73 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
74 } # End for j ($length)
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
75
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
76 my $new_aln = $aln;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
77 # Remove the columns included in "toDelete" hash
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
78 for (my $i = ((scalar(@toDelete)-1)); $i >= 0 ; $i--) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
79 $new_aln = $new_aln->remove_columns([$toDelete[$i],$toDelete[$i]]);
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
80 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
81
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
82 $outHandler->write_aln($new_aln);
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
83 $aln = $in->next_aln(); # Take the next aligment, if it was.
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
84 } # End while
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
85 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
86
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
87 ##
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
88 # MAIN
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
89 ##
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
90
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
91 ## Check input parameters
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
92 my $num_args = $#ARGV + 1;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
93 if (!($num_args == 1 || $num_args == 3)) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
94 print "\nUsage: phylo-protpars.pl file.fasta [Num. Values] [Num. Repeats]\n";
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
95 exit;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
96 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
97 # Read arg2 and arg3
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
98 if ($num_args == 3) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
99 $outHandler = Bio::AlignIO->new(-file => ">$outputfilename.aln.phy",
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
100 -format => 'phylip');
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
101 $phylogeny = 1;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
102 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
103
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
104 # Taking values of parameters from arguments
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
105 $inputfilename = $ARGV[0];
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
106 if (length($inputfilename)>200){
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
107 print "Error: File name length msut be <200 characters\n";
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
108 exit;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
109 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
110 if ($phylogeny) { # Parse values
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
111 $nValues=$ARGV[1];
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
112 if ($nValues < 2) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
113 print "Error: Number of values less than 2\n";
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
114 exit;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
115 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
116 $nFrequency=$ARGV[2];
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
117 if ($nFrequency < 2) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
118 print "Error: Number of repeats less than 2\n";
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
119 exit;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
120 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
121 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
122 ## End checking
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
123
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
124 # Start MSA
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
125 my @params = ('ktuple' => 2, 'matrix' => 'BLOSUM','output'=>'phylip', 'outfile'=>$outputfilename.".aln");
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
126 my $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
127 my $aln = $factory->run($inputfilename);
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
128
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
129
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
130 # Need to parse the MSA output file?
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
131 if ($phylogeny) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
132 preprocess($outputfilename.".aln", $nValues, $nFrequency);
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
133 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
134
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
135
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
136 # Process phylogeny
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
137 my @params2 = (idlength=>30,threshold=>10,jumble=>"17,10",outgroup=>2);
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
138 my $tree_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars-> new(@params2);
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
139 my $output = "";
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
140 if ($phylogeny) {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
141 $output = $outputfilename.".aln.phy";
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
142 } else {
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
143 $output = $outputfilename.".aln";
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
144 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
145
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
146 # Create the tree
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
147 my $tree = $tree_factory->run($output);
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
148 my @params3=('fontfile'=>'fontfile', 'tempdir'=>'images/');
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
149 my $treedraw = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(@params3);
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
150
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
151 #If file 'plotfile' exists, it has to be deleted.
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
152 if (-e 'images/plotfile'){
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
153 unlink 'images/plotfile' or warn "Could not unlink images/plotfile: $!";
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
154 }
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
155
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
156 open FILE, ">treePars.txt" or die $!;
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
157 print FILE $tree->as_text('newick');
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
158
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
159 # Create draw with treePars
1cd1d4171142 Uploaded
jjv_bioinformaticians
parents:
diff changeset
160 # my $treeimagefile = $treedraw->run("treePars.txt");