comparison alignment/phylocatenator.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #! /usr/bin/perl -w
2
3 use strict;
4 use warnings;
5 #phylocatenator.pl
6 #Written by Todd H. Oakley UCSB
7 #Version 1.0 May, 2012
8 #
9 #phyloconcatenator.pl [infile] [also requires arguments 1-8 detailed below]
10 #
11 #Version 1.0.1 Sept 2012 -- fixed a bug that led sometimes to species with no data being retained.
12 #This bug would not affect the data that was retained, so if raxml ran all was fine. However,
13 #raxml will not run with species that have completely missing data. Added extra section to remove
14 #those species.
15
16 #For debugging command line pass, uncomment next
17 #for(my $i=0; $i < @ARGV; $i++){
18 # print "Arg $i: $ARGV[$i] \n\n";
19 #}
20 #exit;
21
22 die "Check arguments" unless @ARGV == 9;
23
24 #Obtain Arguments
25 my $infile1=shift(@ARGV); #0 input file
26 my $mingf_persp=shift(@ARGV); #1 minimum number of genefamiles to keep species
27 my $minsp_pergf=shift(@ARGV); #2 minimum number of species to keep genefamily
28 my $min_gf_len =shift(@ARGV); #3 minimum gene family length
29 my $speciesfile=shift(@ARGV); #4 optional species file 'None' if false
30 my $modelsfile=shift(@ARGV); #5 models file
31 my $outfile=shift(@ARGV); #6 data outfile
32 my $partfile=shift(@ARGV); #7 partition file
33 my $htmlfile=shift(@ARGV); #8 html file
34
35 #Open 2 output files
36 open (OUT, ">$outfile") or die "Cannot create $outfile \n";
37 open (PART, ">$partfile") or die "Cannot create $partfile\n";
38 open (TABLE, ">$htmlfile") or die "Cannot create $htmlfile\n";
39
40 my %HoAdatatable;
41 my %HoGF;
42 my %lengthof;
43 my %ModelFor;
44 my $line = "";
45
46 my @specieslist;
47 my @genelist;
48 my @nsp;
49 my $nsl=1;
50 my @uniquemodels;
51 my @nullmodels;
52 my @retainedspecies;
53
54 # read file with species
55 unless($speciesfile eq 'None'){
56 open SPFILE, $speciesfile or die "ERROR: Cannot open file $speciesfile\n\n";
57 $nsl=0;
58 while (<SPFILE>) {
59 chomp;
60 my $currentinput = "$_";
61 if($currentinput =~m /\w/){ #must have a word otherwise empty
62 push(@nsp, $currentinput);
63 }else{
64 die "ERROR: Fewer than 4 species meet your specified criteria.\n";
65 }
66 # if(@nsp < 4){
67 # die "ERROR: Species file must have more than 4 species.\n";
68 # }
69 }
70 #print "\n\n";
71 } #end of unless
72
73 #Determine models used for each partition, make hash
74 unless($modelsfile eq 'None'){
75 open MODFILE, $modelsfile or die "ERROR: Cannot open file $modelsfile\n\n";
76 while (<MODFILE>) {
77 chomp;
78 my $currentinput = "$_";
79 if($currentinput =~m /\t/){ #must have a tab otherwise wrong file format
80 if($currentinput =~m /\t\t/){
81 print OUT "ERROR: file contains 2 tabs in a row. Check phytab format.\n";
82 die;
83 }else{
84 my @genemodel = split(/\t/, $currentinput);
85 my $genefamily=$genemodel[0];
86 my $curmodel = $genemodel[1];
87 if (exists $ModelFor{$genefamily}) {
88 print OUT "ERROR: Model specification for $genefamily is duplicated\n";
89 die;
90 }else{
91 $ModelFor{$genefamily}=$curmodel;
92 push(@uniquemodels,$curmodel);
93 }
94 }
95 }else{
96 die "ERROR: Model LUT must be genefamily\tmodel and contain no blank lines\n";
97 }
98 }
99 }
100 @uniquemodels = uniq(@uniquemodels); #remove redundant models - uniq is subroutine at end of script
101
102 #Now check that models are valid raxml models NOT DONE YET
103 checkraxmlmodel();
104
105 #INPUT ALL DATA
106 open(INFILE1, $infile1);
107 foreach $line(<INFILE1>) {
108 chop($line);
109 my $getline = $line;
110 my @column = split(/\t/, $getline);
111 my $species = $column[0];
112 my $genefamily = $column[1];
113 my $genename = $column[2];
114 my $sequence = $column[3];
115
116 if (exists $HoAdatatable{$species}{$genefamily}) {
117 print OUT "ERROR: $species $genefamily is duplicated\n";
118 die;
119 }else{
120 $HoAdatatable{$species}{$genefamily} = $sequence;
121 $HoGF{$genefamily}{$species}=$sequence;
122 }
123 }
124
125 #If no models file selected, set every gene family to GTR model
126 if($modelsfile eq 'None'){
127 foreach my $gfkey (sort keys %HoGF){
128 $ModelFor{$gfkey} = 'GTR';
129 }
130 push(@uniquemodels, 'GTR');
131 }
132
133 #First, keep all species with enough total partitions present
134 foreach my $specieskey (sort keys %HoAdatatable)
135 {
136 #Count species with minimum gfs
137 my $ngf_persp=0;
138 foreach my $genefamilykey (sort keys %{$HoAdatatable{$specieskey}})
139 {
140 $ngf_persp++;
141 }
142 unless($ngf_persp < $mingf_persp){ #too few genes for this species, delete the sp
143 if($nsl){ #No species list supplied, push all species into list
144 push(@specieslist,$specieskey);
145 }else{
146 #See if current specieskey is in inputted species list
147 my $nsp;
148 foreach $nsp(@nsp){
149 if (index($nsp,$specieskey) ge 0){
150 push(@specieslist,$specieskey);
151 }
152 }
153 }
154 }
155 }
156 print OUT "\n";
157 unless (@specieslist){
158 print OUT "ERROR: No species with more than $mingf_persp genes\n";
159 die;
160 }
161 if(@specieslist < 4){
162 print OUT "ERROR: Less than 4 species with more than $mingf_persp genes\n";
163 die;
164 }
165
166
167 my $oldgenelen = 0;
168 my $currentseqlen = 0;
169 foreach my $gfkey (sort keys %HoGF)
170 {
171 $oldgenelen=0;
172 #Count gfs with minimum species
173 my $nsp_pergf=0;
174 for(my $j=0; $j<@specieslist; $j++){
175 if(exists $HoGF{$gfkey}{$specieslist[$j]}){
176 $nsp_pergf++ ; ###if exists $HoGF{$gfkey}{$specieslist[$j]};
177
178
179 ##get length of gene and check it is consistent
180 if(exists $lengthof{$gfkey}){
181 $currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]});
182 # $lengthof{$gfkey} = $currentseqlen;
183 if($currentseqlen == $oldgenelen){
184 #$oldgenelen = $lengthof{$gfkey};
185 #okay
186 }else{
187 print OUT "ERROR: $specieslist[$j] $gfkey sequences ".
188 "different lengths than previous. Sequences must be aligned. If ".
189 "sequences are aligned, check that the line ".
190 "does not have an extra data column before the ".
191 "sequence.\n\n";
192 die "ERROR: $specieslist[$j] $gfkey sequences ".
193 "different lengths than previous. Sequences must be aligned. If ".
194 "sequences are aligned, check that the line ".
195 "does not have an extra data column before the ".
196 "sequence.\n\n";
197 }
198 }else{
199 $currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]});
200 if($currentseqlen == 0){
201 die "ERROR: Zero length sequence in file.\n"
202 }
203 $lengthof{$gfkey} = $currentseqlen;
204 $oldgenelen = $lengthof{$gfkey};
205 }
206 }
207 }
208 if($nsp_pergf < $minsp_pergf){
209 #too few species for this gene family
210 }else{
211 if(exists $lengthof{$gfkey}){
212 if($lengthof{$gfkey}>$min_gf_len){
213 push(@genelist,$gfkey);
214 }
215 }
216 }
217 }
218 if (@genelist==0){
219 print OUT "ERROR: No gene families/partitions meet the specified criteria\n";
220 die "ERROR: No gene families/partitions meet the specified criteria\n";
221 }
222
223 #Now must delete species that lack any *retained* partitions, ie some species may be all missing
224 foreach(@specieslist)
225 {
226 my $curspecies=$_;
227 #Count species with minimum gfs
228 my $ngf_persp=0;
229 my $nonmissing=0;
230 foreach (@genelist)
231 {
232 if (exists $HoAdatatable{$curspecies}{$_}){
233 my $cursequence = $HoAdatatable{$curspecies}{$_};
234 $cursequence =~ s/\?//g;
235 $cursequence =~ s/\-//g;
236 #Will not remove N's assumes those are not missing data. Revisit?
237 my $curlen = length($cursequence);
238 $nonmissing = $nonmissing + $curlen;
239 }
240 }
241 if($nonmissing > 0) #must remove species as it contains none of the genes retained
242 {
243 print "$curspecies has $nonmissing Non-Missing characters\n";
244 push(@retainedspecies, $curspecies);
245 }
246 }
247 @specieslist=@retainedspecies;
248
249 #Remove blankspecies from
250 #print phylip file
251 #calculate n characters
252 my $nchar=0; #total characters
253 my $ncharPs =0; #start count for characters in current partition
254 my $ncharPe =0; #end count for characters in current partition
255
256 #First count total characters for first line
257 for(my $k=0; $k<@genelist; $k++){
258 if (exists $lengthof{$genelist[$k]}){ #check that length was calculated
259 $nchar = $nchar + $lengthof{$genelist[$k]};
260 }else{
261 die "ERROR: $genelist[$k] LENGTH MISSING\n";
262 }
263 }
264 print OUT @specieslist." ".$nchar."\n";
265 htmlheader();
266
267 #Need to determine gene list order, which will change due to partitioning
268 #then write header line of gene names, hopefully in correct order
269 #print TABLE "<td bgcolor=white></td>"; #Blank line in species column
270 print TABLE "<td style'width:2pc'><font size='-3'>Partition:</td>";
271 for(my $part=0; $part < @uniquemodels; $part++){
272 for(my $k=0; $k<@genelist; $k++){
273 #First check if current gf matches current partition
274 if(exists $ModelFor{$genelist[$k]}){
275 if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){
276 if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){
277 print TABLE "<td style'width:2pc'><font size='-3'>$genelist[$k]</td>";
278 }
279 }
280 }
281 }
282 }
283 print TABLE "</tr>";
284
285 #print TABLE "<td bgcolor=white></td>"; #Blank line in species column
286 print TABLE "<td style'width:2pc'><font size='-3'>Model:</td>";
287 for(my $part=0; $part < @uniquemodels; $part++){
288 for(my $k=0; $k<@genelist; $k++){
289 #First check if current gf matches current partition
290 if(exists $ModelFor{$genelist[$k]}){
291 if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){
292 if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){
293 print TABLE "<td style'width:2pc'><font size='-3'>$uniquemodels[$part]</td>";
294 }
295 }
296 }
297 }
298 }
299 #End of htmlheader printing
300
301 for(my $j=0; $j<@specieslist; $j++){
302 print OUT "$specieslist[$j]\t";
303 print TABLE "
304 <tr>
305 <td>$specieslist[$j]</td>";
306 for(my $part=0; $part < @uniquemodels; $part++){
307 for(my $k=0; $k<@genelist; $k++){
308 #First check if current gf matches current partition
309 if(exists $ModelFor{$genelist[$k]}){
310 if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){
311 if (exists $HoAdatatable{$specieslist[$j]}{$genelist[$k]}){
312 print OUT $HoAdatatable{$specieslist[$j]}{$genelist[$k]};
313 print TABLE "<td bgcolor=black></td>";
314 $ncharPe = $ncharPe + $lengthof{$genelist[$k]};
315 }else{
316 if (exists $lengthof{$genelist[$k]}){
317 for(my $gap=0; $gap<$lengthof{$genelist[$k]}; $gap++){
318 print OUT "?";
319 }
320 $ncharPe = $ncharPe + $lengthof{$genelist[$k]};
321 print TABLE "<td bgcolor=white></td>";
322 #print TABLE "<td style'width:2pc'><font size='-2'>$genelist[$k]</td>";
323 }else{
324 die "ERROR: BUG!! $genelist[$k] LENGTH MISSING\n";
325 }
326 }
327 }
328 }else{
329 die "ERROR: $genelist[$k] is not assigned a model. Check model LUT input.\n";
330 }
331
332 }
333 if($j==0){ #print partitions first time through gene lists
334 if(($ncharPe==0) || ($ncharPs > $ncharPe)){
335 push(@nullmodels, $uniquemodels[$part]);
336 #print "NOTE: no partitions under the model $uniquemodels[$part] made final dataset\n";
337 }else{
338 print PART "$uniquemodels[$part], $uniquemodels[$part] = $ncharPs - $ncharPe \n";
339 }
340 $ncharPs=$ncharPe + 1;
341 }
342 }
343 print TABLE "</tr>";
344 print OUT "\n";
345 }
346 if($ncharPe/@genelist != $nchar){
347 # Have to account for multiple times through gene lists -- ncharPe is summed multiple times but printed once
348 #die "ERROR: BUG!! Last partition number doesn't match total n characters\n";
349 }
350
351 #print Statistics to screen can be redirected for Log File
352 print "\nSPECIES:\n";
353 unless($speciesfile eq 'None'){
354 print "Used species file to select species. \n";
355 }
356 print "Number of species with $mingf_persp or more genefamilies/partitions: ".@specieslist."\n";
357 print "Species list: @specieslist\n\n\n";
358 print "\nPARTITIONS/GENE FAMILIES:\n";
359 print "Number genefamilies/partitions longer than $min_gf_len characters and present in at least $minsp_pergf genefamilies/partitions: ".@genelist."\n";
360 for(my $i=0; $i < @genelist; $i++){
361 print "$genelist[$i] Length:$lengthof{$genelist[$i]}\n";
362 }
363
364 #Printing Model stats
365 print "\nMODELS:\n";
366 if($modelsfile eq 'None'){
367 print "All partitions set to GTR (no model LUT supplied)\n\n";
368 }else{
369 print "A LUT of models was supplied.\n";
370 print "The following models were present in the model LUT file:\n";
371 print join(" ", @uniquemodels), "\n";
372 print "\n";
373 for(my $i=0; $i < @nullmodels; $i++){
374 print "NOTE: no partitions under the model $nullmodels[$i] made final dataset\n";
375 }
376 }
377 close PART;
378 close OUT;
379 close SPFILE;
380 close TABLE;
381
382 sub uniq {
383 return keys %{{ map { $_ => 1 } @_ }};
384 }
385
386 sub checkraxmlmodel {
387 my @raxmlmodels = ("DNA","BIN","MULTI","DAYHOFF", "DCMUT", "JTT", "MTREV", "WAG", "RTREV", "CPREV", "VT", "BLOSUM62", "MTMAM", "LG", "GTR", "MTART", "MTZOA", "FLU","PMB", "HIVB","HIVW","JTTDCMUT");
388 return(1); #Not yet implemented
389 }
390 sub htmlheader {
391 print TABLE '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
392 "http://www.w3.org/TR/html4/loose.dtd">
393 <html>
394 <head>
395 <title>Dataset Presences and Absences</title>
396 <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
397 </head>
398 <hr><table border="1">
399 <tr>
400 <th align="center">Species<br>';
401 for(my $i=1; $i < @genelist+1; $i++){
402 #Genes are printing in wrong order
403 # print TABLE "<td style'width:2pc'><font size='-2'>$genelist[$i]</td>";
404 print TABLE "<td style'width:2pc'><font size='-2'>$i</td>";
405 }
406 print TABLE "</th>
407 </tr>";
408 }
409 sub htmltail {
410 print TABLE '
411 </body>
412 </html>';
413 }