diff alignment/phylocatenator.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alignment/phylocatenator.pl	Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,413 @@
+#! /usr/bin/perl -w
+
+use strict;
+use warnings;
+#phylocatenator.pl
+#Written by Todd H. Oakley UCSB
+#Version 1.0 May, 2012
+#
+#phyloconcatenator.pl [infile] [also requires arguments 1-8 detailed below]
+#
+#Version 1.0.1 Sept 2012 -- fixed a bug that led sometimes to species with no data being retained.
+#This bug would not affect the data that was retained, so if raxml ran all was fine. However, 
+#raxml will not run with species that have completely missing data. Added extra section to remove 
+#those species.
+
+#For debugging command line pass, uncomment next
+#for(my $i=0; $i < @ARGV; $i++){
+#	print "Arg $i:  $ARGV[$i] \n\n";
+#}
+#exit;
+
+die "Check arguments" unless @ARGV == 9;
+
+#Obtain Arguments
+my $infile1=shift(@ARGV);		#0 input file
+my $mingf_persp=shift(@ARGV);		#1 minimum number of genefamiles to keep species
+my $minsp_pergf=shift(@ARGV);		#2 minimum number of species to keep genefamily
+my $min_gf_len =shift(@ARGV);		#3 minimum gene family length
+my $speciesfile=shift(@ARGV);		#4 optional species file 'None' if false
+my $modelsfile=shift(@ARGV);		#5 models file
+my $outfile=shift(@ARGV);		#6 data outfile
+my $partfile=shift(@ARGV);		#7 partition file
+my $htmlfile=shift(@ARGV);		#8 html file
+
+#Open 2 output files
+open (OUT, ">$outfile") or die "Cannot create $outfile \n";
+open (PART, ">$partfile") or die "Cannot create $partfile\n";
+open (TABLE, ">$htmlfile") or die "Cannot create $htmlfile\n";
+
+my %HoAdatatable;
+my %HoGF;
+my %lengthof;
+my %ModelFor;
+my $line = "";
+
+my @specieslist;
+my @genelist;
+my @nsp;
+my $nsl=1;
+my @uniquemodels;
+my @nullmodels;
+my @retainedspecies;
+
+# read file with species
+unless($speciesfile eq 'None'){
+	open SPFILE, $speciesfile or die "ERROR: Cannot open file $speciesfile\n\n";
+	$nsl=0;
+while (<SPFILE>) {
+        chomp;
+	my $currentinput = "$_";
+        if($currentinput =~m /\w/){     #must have a word otherwise empty
+                push(@nsp, $currentinput);
+        }else{
+                die "ERROR: Fewer than 4 species meet your specified criteria.\n";
+        }
+#	if(@nsp < 4){
+#		die "ERROR: Species file must have more than 4 species.\n";
+#	}
+}
+#print "\n\n";
+} #end of unless
+
+#Determine models used for each partition, make hash
+unless($modelsfile eq 'None'){
+	open MODFILE, $modelsfile or die "ERROR: Cannot open file $modelsfile\n\n";
+	while (<MODFILE>) {
+	        chomp;
+		my $currentinput = "$_";
+	        if($currentinput =~m /\t/){     #must have a tab otherwise wrong file format
+			if($currentinput =~m /\t\t/){
+				print OUT "ERROR: file contains 2 tabs in a row.  Check phytab format.\n";
+				die;
+			}else{
+				my @genemodel = split(/\t/, $currentinput);
+				my $genefamily=$genemodel[0];
+				my $curmodel = $genemodel[1];
+				if (exists $ModelFor{$genefamily}) {
+					print OUT "ERROR: Model specification for $genefamily is duplicated\n";
+					die;
+				}else{
+					$ModelFor{$genefamily}=$curmodel;
+					push(@uniquemodels,$curmodel);
+				}
+			}
+		}else{
+			die "ERROR: Model LUT must be genefamily\tmodel and contain no blank lines\n";
+		}
+	}
+}
+@uniquemodels = uniq(@uniquemodels); 	#remove redundant models - uniq is subroutine at end of script
+
+#Now check that models are valid raxml models NOT DONE YET
+checkraxmlmodel();
+
+#INPUT ALL DATA
+open(INFILE1, $infile1);
+foreach $line(<INFILE1>) {
+	chop($line);
+	my $getline = $line;
+	my @column = split(/\t/, $getline);
+	my $species = $column[0];
+	my $genefamily = $column[1];
+	my $genename = $column[2];
+	my $sequence = $column[3];
+
+	if (exists $HoAdatatable{$species}{$genefamily}) {
+		print OUT "ERROR: $species $genefamily is duplicated\n";
+		die;
+	}else{
+		$HoAdatatable{$species}{$genefamily} = $sequence;
+		$HoGF{$genefamily}{$species}=$sequence;
+	}
+}
+
+#If no models file selected, set every gene family to GTR model
+if($modelsfile eq 'None'){
+	foreach my $gfkey (sort keys %HoGF){
+		$ModelFor{$gfkey} = 'GTR';
+	}
+	push(@uniquemodels, 'GTR');
+}
+
+#First, keep all species with enough total partitions present
+foreach my $specieskey (sort keys %HoAdatatable)
+{
+	#Count species with minimum gfs
+	my $ngf_persp=0;
+	foreach my $genefamilykey (sort keys %{$HoAdatatable{$specieskey}})
+	{
+		$ngf_persp++;
+	}
+	unless($ngf_persp < $mingf_persp){	#too few genes for this species, delete the sp
+		if($nsl){	#No species list supplied, push all species into list
+			push(@specieslist,$specieskey);
+		}else{
+			#See if current specieskey is in inputted species list
+			my $nsp;
+			foreach $nsp(@nsp){
+        			if (index($nsp,$specieskey) ge 0){
+					push(@specieslist,$specieskey);
+				}
+        		}
+		}
+	} 
+}
+print OUT "\n";
+unless (@specieslist){
+	print OUT "ERROR: No species with more than $mingf_persp genes\n";
+	die;
+}
+if(@specieslist < 4){
+	print OUT "ERROR: Less than 4 species with more than $mingf_persp genes\n";
+	die;
+}
+
+
+my $oldgenelen = 0;
+my $currentseqlen = 0;
+foreach my $gfkey (sort keys %HoGF)
+{
+	$oldgenelen=0;
+	#Count gfs  with minimum species
+	my $nsp_pergf=0;
+	for(my $j=0; $j<@specieslist; $j++){
+		if(exists $HoGF{$gfkey}{$specieslist[$j]}){
+			$nsp_pergf++ ; ###if exists $HoGF{$gfkey}{$specieslist[$j]};
+
+
+	##get length of gene and check it is consistent
+			if(exists $lengthof{$gfkey}){
+				$currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]});
+			#	$lengthof{$gfkey} = $currentseqlen;
+				if($currentseqlen == $oldgenelen){
+					#$oldgenelen = $lengthof{$gfkey};
+					#okay
+				}else{
+					print OUT "ERROR: $specieslist[$j] $gfkey sequences ". 
+						"different lengths than previous. Sequences must be aligned. If ".
+						"sequences are aligned, check that the line ". 
+						"does not have an extra data column before the ". 
+						"sequence.\n\n";
+					die "ERROR: $specieslist[$j] $gfkey sequences ".
+						"different lengths than previous. Sequences must be aligned. If ".
+						"sequences are aligned, check that the line ". 
+						"does not have an extra data column before the ". 
+						"sequence.\n\n";
+				}
+			}else{
+				$currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]});
+				if($currentseqlen == 0){
+					die "ERROR: Zero length sequence in file.\n"
+				}
+				$lengthof{$gfkey} = $currentseqlen;
+				$oldgenelen = $lengthof{$gfkey};
+			}
+		}
+	}
+	if($nsp_pergf < $minsp_pergf){
+		#too few species for this gene family
+	}else{
+		if(exists $lengthof{$gfkey}){
+			if($lengthof{$gfkey}>$min_gf_len){
+				push(@genelist,$gfkey);
+			}
+		}
+	}
+}
+if (@genelist==0){
+	print OUT "ERROR: No gene families/partitions meet the specified criteria\n";
+	die "ERROR: No gene families/partitions meet the specified criteria\n";
+}
+
+#Now must delete species that lack any *retained* partitions, ie some species may be all missing
+foreach(@specieslist)
+{
+	my $curspecies=$_;
+	#Count species with minimum gfs
+	my $ngf_persp=0;
+	my $nonmissing=0;
+	foreach (@genelist)
+	{
+		if (exists $HoAdatatable{$curspecies}{$_}){
+			my $cursequence = $HoAdatatable{$curspecies}{$_};
+			$cursequence =~ s/\?//g;
+			$cursequence =~ s/\-//g;
+			#Will not remove N's assumes those are not missing data. Revisit?
+			my $curlen = length($cursequence);
+			$nonmissing = $nonmissing + $curlen;
+		}
+	}
+	if($nonmissing > 0)	#must remove species as it contains none of the genes retained
+	{
+		print "$curspecies has $nonmissing Non-Missing characters\n";
+		push(@retainedspecies, $curspecies);
+	}
+}
+@specieslist=@retainedspecies;
+
+#Remove blankspecies from
+#print phylip file
+#calculate n characters
+my $nchar=0;	#total characters
+my $ncharPs =0;	#start count for characters in current partition
+my $ncharPe =0;	#end count for characters in current partition
+
+#First count total characters for first line
+for(my $k=0; $k<@genelist; $k++){
+	if (exists $lengthof{$genelist[$k]}){			#check that length was calculated
+		$nchar = $nchar + $lengthof{$genelist[$k]};
+	}else{
+		die "ERROR: $genelist[$k] LENGTH MISSING\n";
+	}
+}
+print OUT @specieslist." ".$nchar."\n";
+htmlheader();
+
+#Need to determine gene list order, which will change due to partitioning
+#then write header line of gene names, hopefully in correct order
+#print TABLE "<td bgcolor=white></td>"; #Blank line in species column
+print TABLE "<td style'width:2pc'><font size='-3'>Partition:</td>";
+for(my $part=0; $part < @uniquemodels; $part++){
+	for(my $k=0; $k<@genelist; $k++){
+		#First check if current gf matches current partition
+		if(exists $ModelFor{$genelist[$k]}){
+			if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){
+				if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){
+					print TABLE "<td style'width:2pc'><font size='-3'>$genelist[$k]</td>";
+				}
+			}
+		}
+	}
+}
+print TABLE "</tr>";
+
+#print TABLE "<td bgcolor=white></td>"; #Blank line in species column
+print TABLE "<td style'width:2pc'><font size='-3'>Model:</td>";
+for(my $part=0; $part < @uniquemodels; $part++){
+	for(my $k=0; $k<@genelist; $k++){
+		#First check if current gf matches current partition
+		if(exists $ModelFor{$genelist[$k]}){
+			if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){
+				if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){
+					print TABLE "<td style'width:2pc'><font size='-3'>$uniquemodels[$part]</td>";
+				}
+			}
+		}
+	}
+}
+#End of htmlheader printing
+
+for(my $j=0; $j<@specieslist; $j++){
+	print OUT "$specieslist[$j]\t";
+	print TABLE "
+		<tr>
+		<td>$specieslist[$j]</td>";
+	for(my $part=0; $part < @uniquemodels; $part++){
+		for(my $k=0; $k<@genelist; $k++){
+			#First check if current gf matches current partition
+			if(exists $ModelFor{$genelist[$k]}){
+				if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){
+					if (exists $HoAdatatable{$specieslist[$j]}{$genelist[$k]}){
+						print OUT $HoAdatatable{$specieslist[$j]}{$genelist[$k]};
+						print TABLE "<td bgcolor=black></td>";
+						$ncharPe = $ncharPe + $lengthof{$genelist[$k]};
+					}else{
+						if (exists $lengthof{$genelist[$k]}){
+							for(my $gap=0; $gap<$lengthof{$genelist[$k]}; $gap++){
+								print OUT "?";
+							}
+							$ncharPe = $ncharPe + $lengthof{$genelist[$k]};
+							print TABLE "<td bgcolor=white></td>";
+							#print TABLE "<td style'width:2pc'><font size='-2'>$genelist[$k]</td>";
+						}else{
+							die "ERROR: BUG!! $genelist[$k] LENGTH MISSING\n";
+						}
+					}
+				}
+			}else{
+				die "ERROR: $genelist[$k] is not assigned a model.  Check model LUT input.\n";
+			}
+
+		}
+		if($j==0){		#print partitions first time through gene lists
+			if(($ncharPe==0) || ($ncharPs > $ncharPe)){
+				push(@nullmodels, $uniquemodels[$part]);
+				#print "NOTE: no partitions under the model $uniquemodels[$part] made final dataset\n";
+			}else{
+				print PART "$uniquemodels[$part], $uniquemodels[$part] = $ncharPs - $ncharPe \n";
+			}
+			$ncharPs=$ncharPe + 1;
+		}
+	}
+	print TABLE "</tr>";
+	print OUT "\n";
+}
+if($ncharPe/@genelist != $nchar){
+	# Have to account for multiple times through gene lists -- ncharPe is summed multiple times but printed once
+	#die "ERROR:  BUG!! Last partition number doesn't match total n characters\n";
+}
+
+#print Statistics to screen can be redirected for Log File
+print "\nSPECIES:\n";
+unless($speciesfile eq 'None'){
+	print "Used species file to select species. \n";
+}
+print "Number of species with $mingf_persp or more genefamilies/partitions: ".@specieslist."\n";
+print "Species list: @specieslist\n\n\n";
+print "\nPARTITIONS/GENE FAMILIES:\n";
+print "Number genefamilies/partitions longer than $min_gf_len characters and present in at least $minsp_pergf genefamilies/partitions: ".@genelist."\n";
+for(my $i=0; $i < @genelist; $i++){
+	print "$genelist[$i] Length:$lengthof{$genelist[$i]}\n";
+}
+
+	#Printing Model stats
+print "\nMODELS:\n";
+if($modelsfile eq 'None'){
+	print "All partitions set to GTR (no model LUT supplied)\n\n";
+}else{
+	print "A LUT of models was supplied.\n";
+	print "The following models were present in the model LUT file:\n";
+	print join(" ", @uniquemodels), "\n";
+	print "\n";
+	for(my $i=0; $i < @nullmodels; $i++){
+		print "NOTE: no partitions under the model $nullmodels[$i] made final dataset\n";
+	}
+}
+close PART;
+close OUT;
+close SPFILE;
+close TABLE;
+
+sub uniq {
+    return keys %{{ map { $_ => 1 } @_ }};
+}
+
+sub checkraxmlmodel {
+	my @raxmlmodels = ("DNA","BIN","MULTI","DAYHOFF", "DCMUT", "JTT", "MTREV", "WAG", "RTREV", "CPREV", "VT", "BLOSUM62", "MTMAM", "LG", "GTR", "MTART", "MTZOA", "FLU","PMB", "HIVB","HIVW","JTTDCMUT");
+	return(1);	#Not yet implemented
+}
+sub htmlheader {
+	print TABLE '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 
+		"http://www.w3.org/TR/html4/loose.dtd">
+		<html>
+		<head>
+		<title>Dataset Presences and Absences</title>
+		<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+		</head>
+		<hr><table border="1">
+		<tr>
+			<th align="center">Species<br>';
+	for(my $i=1; $i < @genelist+1; $i++){
+#Genes are printing in wrong order
+#		print TABLE "<td style'width:2pc'><font size='-2'>$genelist[$i]</td>";
+		print TABLE "<td style'width:2pc'><font size='-2'>$i</td>";
+	}
+	print TABLE "</th>
+		     </tr>";
+}
+sub htmltail {
+print TABLE '
+</body>
+</html>';
+}