Mercurial > repos > jjv_bioinformaticians > phylogeny_maximum_parsimony_informative_sites
comparison phylo-protpars.pl @ 3:1cd1d4171142 draft
Uploaded
author | jjv_bioinformaticians |
---|---|
date | Wed, 05 Apr 2017 14:32:18 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:6ab618072d5c | 3:1cd1d4171142 |
---|---|
1 #!/usr/bin/perl -w | |
2 use Bio::AlignIO; | |
3 use Bio::Root::IO; | |
4 use Bio::Seq; | |
5 use Bio::SeqIO; | |
6 use Bio::SimpleAlign; | |
7 use Bio::TreeIO; | |
8 use strict; | |
9 use warnings; | |
10 | |
11 use Bio::Tools::Run::Phylo::Phylip::ProtPars; | |
12 use Bio::Tools::Run::Alignment::Clustalw; | |
13 use Bio::Tools::Run::Phylo::Phylip::DrawTree; | |
14 | |
15 ## Links | |
16 # https://metacpan.org/pod/Bio::AlignIO | |
17 # https://metacpan.org/pod/Bio::SimpleAlign | |
18 # https://metacpan.org/pod/Bio::Seq | |
19 ## | |
20 | |
21 $ENV{PHYLIPDIR} = '/usr/lib/phylip/bin/'; | |
22 $ENV{CLUSTALWDIR} = '/usr/local/bin/'; | |
23 my $inputfilename = ''; | |
24 my $outputfilename = 'sequences'; | |
25 my $outHandler; | |
26 my $phylogeny = 0; | |
27 my $nValues = 0; | |
28 my $nFrequency = 0; | |
29 | |
30 | |
31 sub preprocess { | |
32 my ($file, $nValues, $nFrequency) = @_; | |
33 my $in = Bio::AlignIO->new(-file => $file , -format => 'phylip'); | |
34 my $aln = $in->next_aln(); # Get the first alignment | |
35 # Getting header | |
36 my $numSeq = $aln->num_sequences; # 5 seqs | |
37 my $length = $aln->length(); # 376 bases aligning with gaps | |
38 # Header end | |
39 my @sequences; | |
40 while ( $aln->num_sequences ) { # Number of sequences in this aligment. | |
41 # Extract sequences and check values for the alignment column $pos | |
42 #my @secuencias=(); | |
43 foreach my $seq ($aln->each_seq) { | |
44 my $sequence = $seq->seq();# | |
45 my @arraySequence = $sequence =~ /./sg; # String to array | |
46 push(@sequences,\@arraySequence); # Matrix of sequences | |
47 } | |
48 | |
49 # Creates hash that contains the repetitions of each aa in seq | |
50 my @toDelete = (); | |
51 for (my $j=0; $j<$length;$j++) { | |
52 my %amino = (); # AA hash | |
53 for (my $i=0; $i<$numSeq;$i++) { | |
54 my $char = $sequences[$i][$j]; | |
55 if ($char ne "-") { | |
56 if (exists $amino{$sequences[$i][$j]}) { | |
57 $amino{$sequences[$i][$j]} += 1; | |
58 } else { | |
59 $amino{$sequences[$i][$j]} = 1; | |
60 } | |
61 } | |
62 | |
63 } # End for i ($numSeq) | |
64 my $count = 0; | |
65 foreach my $key (keys %amino) { # $key = AA name | |
66 my $aminoCount = $amino{$key}; # Number of repeats of current amino acid. | |
67 if ($aminoCount >= $nFrequency) { | |
68 $count++; | |
69 } | |
70 } | |
71 if (!($count >= $nValues)) { # Delete column | |
72 push(@toDelete, $j); | |
73 } | |
74 } # End for j ($length) | |
75 | |
76 my $new_aln = $aln; | |
77 # Remove the columns included in "toDelete" hash | |
78 for (my $i = ((scalar(@toDelete)-1)); $i >= 0 ; $i--) { | |
79 $new_aln = $new_aln->remove_columns([$toDelete[$i],$toDelete[$i]]); | |
80 } | |
81 | |
82 $outHandler->write_aln($new_aln); | |
83 $aln = $in->next_aln(); # Take the next aligment, if it was. | |
84 } # End while | |
85 } | |
86 | |
87 ## | |
88 # MAIN | |
89 ## | |
90 | |
91 ## Check input parameters | |
92 my $num_args = $#ARGV + 1; | |
93 if (!($num_args == 1 || $num_args == 3)) { | |
94 print "\nUsage: phylo-protpars.pl file.fasta [Num. Values] [Num. Repeats]\n"; | |
95 exit; | |
96 } | |
97 # Read arg2 and arg3 | |
98 if ($num_args == 3) { | |
99 $outHandler = Bio::AlignIO->new(-file => ">$outputfilename.aln.phy", | |
100 -format => 'phylip'); | |
101 $phylogeny = 1; | |
102 } | |
103 | |
104 # Taking values of parameters from arguments | |
105 $inputfilename = $ARGV[0]; | |
106 if (length($inputfilename)>200){ | |
107 print "Error: File name length msut be <200 characters\n"; | |
108 exit; | |
109 } | |
110 if ($phylogeny) { # Parse values | |
111 $nValues=$ARGV[1]; | |
112 if ($nValues < 2) { | |
113 print "Error: Number of values less than 2\n"; | |
114 exit; | |
115 } | |
116 $nFrequency=$ARGV[2]; | |
117 if ($nFrequency < 2) { | |
118 print "Error: Number of repeats less than 2\n"; | |
119 exit; | |
120 } | |
121 } | |
122 ## End checking | |
123 | |
124 # Start MSA | |
125 my @params = ('ktuple' => 2, 'matrix' => 'BLOSUM','output'=>'phylip', 'outfile'=>$outputfilename.".aln"); | |
126 my $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params); | |
127 my $aln = $factory->run($inputfilename); | |
128 | |
129 | |
130 # Need to parse the MSA output file? | |
131 if ($phylogeny) { | |
132 preprocess($outputfilename.".aln", $nValues, $nFrequency); | |
133 } | |
134 | |
135 | |
136 # Process phylogeny | |
137 my @params2 = (idlength=>30,threshold=>10,jumble=>"17,10",outgroup=>2); | |
138 my $tree_factory = Bio::Tools::Run::Phylo::Phylip::ProtPars-> new(@params2); | |
139 my $output = ""; | |
140 if ($phylogeny) { | |
141 $output = $outputfilename.".aln.phy"; | |
142 } else { | |
143 $output = $outputfilename.".aln"; | |
144 } | |
145 | |
146 # Create the tree | |
147 my $tree = $tree_factory->run($output); | |
148 my @params3=('fontfile'=>'fontfile', 'tempdir'=>'images/'); | |
149 my $treedraw = Bio::Tools::Run::Phylo::Phylip::DrawTree->new(@params3); | |
150 | |
151 #If file 'plotfile' exists, it has to be deleted. | |
152 if (-e 'images/plotfile'){ | |
153 unlink 'images/plotfile' or warn "Could not unlink images/plotfile: $!"; | |
154 } | |
155 | |
156 open FILE, ">treePars.txt" or die $!; | |
157 print FILE $tree->as_text('newick'); | |
158 | |
159 # Create draw with treePars | |
160 # my $treeimagefile = $treedraw->run("treePars.txt"); |