Mercurial > repos > dereeper > pgap
changeset 0:83e62a1aeeeb draft
Uploaded
| author | dereeper | 
|---|---|
| date | Thu, 24 Jun 2021 13:51:52 +0000 | 
| parents | |
| children | 6d3580552457 | 
| files | PGAP-1.2.1/BLOSUM45 PGAP-1.2.1/Blast_Filter.pl PGAP-1.2.1/Converter_NCBINewFormatData.pl PGAP-1.2.1/Converter_draft.pl PGAP-1.2.1/Converter_finished.pl PGAP-1.2.1/PGAP.pl PGAP-1.2.1/README PGAP-1.2.1/Statistics/Distributions.pm PGAP-1.2.1/Statistics/LineFit.pm PGAP-1.2.1/blast_parser.pl PGAP-1.2.1/inparanoid.pl PGAP-1.2.1/manual.pdf PGAP-1.2.1/multiparanoid.pl PGAP-1.2.1/readme.txt PGAP-1.2.1/seqstat.jar PGAP.xml PGAP_wrapper2.pl | 
| diffstat | 17 files changed, 8176 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/BLOSUM45 Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,31 @@ +# Matrix made by matblas from blosum45.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units +# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 45 +# Entropy = 0.3795, Expected = -0.2789 + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -2 -2 0 -1 -1 0 -5 +R -2 7 0 -1 -3 1 0 -2 0 -3 -2 3 -1 -2 -2 -1 -1 -2 -1 -2 -1 0 -1 -5 +N -1 0 6 2 -2 0 0 0 1 -2 -3 0 -2 -2 -2 1 0 -4 -2 -3 4 0 -1 -5 +D -2 -1 2 7 -3 0 2 -1 0 -4 -3 0 -3 -4 -1 0 -1 -4 -2 -3 5 1 -1 -5 +C -1 -3 -2 -3 12 -3 -3 -3 -3 -3 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -2 -3 -2 -5 +Q -1 1 0 0 -3 6 2 -2 1 -2 -2 1 0 -4 -1 0 -1 -2 -1 -3 0 4 -1 -5 +E -1 0 0 2 -3 2 6 -2 0 -3 -2 1 -2 -3 0 0 -1 -3 -2 -3 1 4 -1 -5 +G 0 -2 0 -1 -3 -2 -2 7 -2 -4 -3 -2 -2 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -5 +H -2 0 1 0 -3 1 0 -2 10 -3 -2 -1 0 -2 -2 -1 -2 -3 2 -3 0 0 -1 -5 +I -1 -3 -2 -4 -3 -2 -3 -4 -3 5 2 -3 2 0 -2 -2 -1 -2 0 3 -3 -3 -1 -5 +L -1 -2 -3 -3 -2 -2 -2 -3 -2 2 5 -3 2 1 -3 -3 -1 -2 0 1 -3 -2 -1 -5 +K -1 3 0 0 -3 1 1 -2 -1 -3 -3 5 -1 -3 -1 -1 -1 -2 -1 -2 0 1 -1 -5 +M -1 -1 -2 -3 -2 0 -2 -2 0 2 2 -1 6 0 -2 -2 -1 -2 0 1 -2 -1 -1 -5 +F -2 -2 -2 -4 -2 -4 -3 -3 -2 0 1 -3 0 8 -3 -2 -1 1 3 0 -3 -3 -1 -5 +P -1 -2 -2 -1 -4 -1 0 -2 -2 -2 -3 -1 -2 -3 9 -1 -1 -3 -3 -3 -2 -1 -1 -5 +S 1 -1 1 0 -1 0 0 0 -1 -2 -3 -1 -2 -2 -1 4 2 -4 -2 -1 0 0 0 -5 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -1 -1 2 5 -3 -1 0 0 -1 0 -5 +W -2 -2 -4 -4 -5 -2 -3 -2 -3 -2 -2 -2 -2 1 -3 -4 -3 15 3 -3 -4 -2 -2 -5 +Y -2 -1 -2 -2 -3 -1 -2 -3 2 0 0 -1 0 3 -3 -2 -1 3 8 -1 -2 -2 -1 -5 +V 0 -2 -3 -3 -1 -3 -3 -3 -3 3 1 -2 1 0 -3 -1 0 -3 -1 5 -3 -3 -1 -5 +B -1 -1 4 5 -2 0 1 -1 0 -3 -3 0 -2 -3 -2 0 0 -4 -2 -3 4 2 -1 -5 +Z -1 0 0 1 -3 4 4 -2 0 -3 -2 1 -1 -3 -1 0 -1 -2 -2 -3 2 4 -1 -5 +X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 -2 -1 -1 -1 -1 -1 -5 +* -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/Blast_Filter.pl Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,51 @@ +#!/usr/bin/perl +use strict; +use warnings; + + +my $BLASTResult=$ARGV[0]; +my $SEQFile=$ARGV[1]; +my $coverage=$ARGV[2]; +my $identity=$ARGV[3]*100;#percents +my $score=$ARGV[4]; +my %hash; +my @row; +my $line; + +#my $coverage=0.5; +#my $identity=50;#percents +#my $score=50; + +&ReadSeqLength("$SEQFile",\%hash); + +open(F,$BLASTResult); +while ($line=<F>) + { + if ($line!~/\#/) + { + @row=split(/\t/,$line); + if ($row[2]>=$identity and $row[11]>=$score) + { + if ((($row[7]-$row[6]+1)/$hash{$row[0]})>=$coverage) + { + if ((($row[9]-$row[8]+1)/$hash{$row[1]})>=$coverage) + { + print "$row[0]\t$row[1]\t$row[11]"; + } + } + } + } + } +close(F); + +sub ReadSeqLength() + { + use Bio::SeqIO; + (my $file,my $hash)=@_; + my $seq; + my $in=Bio::SeqIO->new(-file=>"$file",-format=>"fasta"); + while ($seq=$in->next_seq()) + { + $$hash{$seq->id}=length($seq->seq()); + } + } \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/Converter_NCBINewFormatData.pl Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,300 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Getopt::Long; + + + +my %optionHash=qw(); +my $inprefix=""; +my $outprefix=""; +my $indir=""; +my $outdir=""; +my @inList; +my @outList; + +GetOptions(\%optionHash,"inprefix:s","outprefix:s","indir:s","outdir:s","help|h!"); + + +if(scalar(keys %optionHash)==0){ + &print_usage(""); +} + +if(exists($optionHash{"h"}) or exists($optionHash{"help"}) ){ + &print_usage(""); +} + +if( exists($optionHash{"inprefix"} ) ){ + $inprefix = $optionHash{"inprefix"}; + @inList=split(/\+/,$inprefix); +}else{ + &print_usage("--inprefix should be provided!"); +} + +if( exists($optionHash{"outprefix"} ) ){ + $outprefix = $optionHash{"outprefix"}; + @outList=split(/\+/,$outprefix); +} + +if( exists($optionHash{"indir"} ) ){ + $indir = $optionHash{"indir"}; +}else{ + &print_usage("--indir should be provided!"); +} + +if( exists($optionHash{"outdir"} ) ){ + $outdir = $optionHash{"outdir"}; +}else{ + &print_usage("--outdir should be provided!"); +} + + +if( $inprefix eq "" or $indir eq "" or $outdir eq "" ){ + &print_usage(""); +} + +if( scalar(@outList) >0 and scalar(@outList) != scalar(@inList) ){ + &print_usage("If outprefix was provided, the name number should be identical with inprefix"); +} + + +system("mkdir -p $outdir"); + +foreach my $idx (0 .. (scalar(@inList) -1) ){ + my $inname = $inList[$idx]; + my $feature_table = $indir."/".$inname."_feature_table.txt"; + my $faa = $indir."/".$inname."_protein.faa"; + my $fna = $indir."/".$inname."_genomic.fna"; + + my %genePositionHash; + my %genomeSequenceHash; + my %geneAnnotationHash; + my %gene2genomeHash; + my %genefaaHash; + my $count=0; + + my $outname; + + if($outprefix eq ""){ + my @tmp=split(/\./,$inname); + $outname = $tmp[0]; + }else{ + $outname = $outList[$idx]; + } + + # check file + if( ! -e $feature_table){ + print "$feature_table was not found!\n"; + print "$inname was skipped!\n"; + next; + } + + if( ! -e $faa){ + print "$faa was not found!\n"; + print "$inname was skipped!\n"; + next; + } + + if( ! -e $fna){ + print "$fna was not found!\n"; + print "$inname was skipped!\n"; + next; + } + + # read feature + &read_feature($feature_table,\%geneAnnotationHash,\%genePositionHash,\%gene2genomeHash); + + # get genome sequence + &read_genomeSequence($fna,\%genomeSequenceHash); + + # get faa + &read_faa($faa,\%genefaaHash); + + # extract nuc and output + + open(PEP,">$outdir/$outname.pep"); + open(NUC,">$outdir/$outname.nuc"); + open(FUN,">$outdir/$outname.function"); + + foreach my $mygene (keys %genePositionHash){ + if( (!exists( $geneAnnotationHash{$mygene})) or (!exists($gene2genomeHash{$mygene})) or (!exists($genefaaHash{$mygene})) or (!exists($genomeSequenceHash{$gene2genomeHash{$mygene}})) ){ + print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n"; + next; + } + + my $nuc = &getSeq($genePositionHash{$mygene}, $genomeSequenceHash{$gene2genomeHash{$mygene}} ); + + if( $nuc eq ""){ + print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n"; + next; + } + + print PEP ">$mygene\n$genefaaHash{$mygene}\n"; + print NUC ">$mygene\n$nuc\n"; + print FUN "$mygene\t-\t$geneAnnotationHash{$mygene}\n"; + $count++; + } + close(PEP); + close(NUC); + close(FUN); + + print $inname . " -> $outname: $count genes were extracted in total.\n"; +} + + +sub getSeq(){ + (my $pos,my $genomeSeq)=@_; + my @tmp=split(/\|/,$pos); + if( $tmp[1]> length($genomeSeq) ){ + return ""; + } + my $seq = substr($genomeSeq,$tmp[0]-1,$tmp[1]-$tmp[0]+1); + if($tmp[2] eq "-"){ + $seq = &rcseq($seq); + } + return $seq; +} + + + +sub rcseq(){ + (my $seq)=@_; + $seq=uc($seq); + + $_=$seq; + tr/ACGT/TGCA/; + $seq = $_; + return reverse($_); +} + + + +sub read_faa(){ + (my $infile, my $seqHash)=@_; + my $seq=""; + my $name=""; + my @content; + open(F,$infile); + @content=<F>; + close(F); + chomp(@content); + + for (my $line = 0; $line < @content; $line++) { + if($content[$line] =~/^>/ ){ + if($name ne ""){ + $$seqHash{$name}=$seq; + } + my @tmp=split(/\s+/,$content[$line]); + @tmp=split(/\./,$tmp[0]); + + $name=substr($tmp[0], 1, length($tmp[0])-1); + $seq=""; + }else{ + $seq = $seq . $content[$line]; + } + } + $$seqHash{$name}=$seq; +} + + + + + + +sub read_genomeSequence(){ + (my $infile,my $seqHash)=@_; + my $seq=""; + my $name=""; + my @content; + open(F,$infile); + @content=<F>; + close(F); + chomp(@content); + + for (my $line = 0; $line < @content; $line++) { + if($content[$line] =~/^>/ ){ + if($name ne ""){ + $$seqHash{$name}=$seq; + } + my @tmp=split(/\s+/,$content[$line]); + $name=substr($tmp[0], 1, length($tmp[0])-1); + $seq=""; + }else{ + $seq = $seq . $content[$line]; + } + } + $$seqHash{$name}=$seq; +} + + + +sub read_feature(){ + (my $infile, my $annHash,my $posHash,my $gene2genomeHash)=@_; + my @content; + open(F,$infile); + @content=<F>; + close(F); + chomp(@content); + + for (my $line = 1; $line < @content - 1; $line=$line+1) { + my @row=split(/\t/,$content[$line]); + my @nextrow=split(/\t/,$content[$line+1]); + + if($row[1] ne "protein_coding"){ + next; + } + + # in case + if( $row[7]."|".$row[8]."|".$row[9] ne $nextrow[7]."|".$nextrow[8]."|".$nextrow[9]){ + next; + } + + # print $nextrow[10]."\n"; + # print $row[7]."|".$row[8]."|".$row[9]."\n"; + + my @tmp=split(/\./,$nextrow[10]); + my $geneName = $tmp[0]; + + $$annHash{$geneName}=$nextrow[13]; + $$posHash{$geneName}=$row[7]."|".$row[8]."|".$row[9]; + $$gene2genomeHash{$geneName}=$row[6]; + } +} + + +sub print_usage(){ + my $message=shift; +my @usage=qq( +Version: 2016042201 +Usage: perl Converter_NCBINewFormatData.pl [options] + + +Options: + + --inprefix String The prefix of the input files, such as GCF_000007545.1_ASM754v1 + If two or more strains were provided, please join their prefixs with "+" + Such as GCF_000007545.1_ASM754v1+GCF_000008105.1_ASM810v1+GCF_000711315.1_ASM71131v1 + + --indir String The directory of those input files + + --outprefix String The prefix for the output files. + If a value "ty2" was provided, the output files would be: ty2.nuc, ty2.pep, and ty2.function + If two or more strains were provided, please join their prefixs with "+" + If the prefix was not provided, the assembly value would be used as the prefix, such as GCF_000007545 + + --outdir String The directory for the output files + +Note: + 1. Before converting data with this script, please prepare *feature_table.txt, *genomic.fna and *protein.faa files for each strain. + + 2. This script was designed for NCBI new format data only. If part of your data is in the old format, please use the Converter_finished.pl or Converter_draft.pl script to convert the data. + + +); + + print join("\n",@usage)."\n"; + print $message."\n"; + exit; +} + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/Converter_draft.pl Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,245 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Getopt::Std; + +my %opt; +getopts('N:I:O:',\%opt); + +my @usage=qq( +Version: 2016042201 +Usage: perl Converter_draft.pl [options] + +Options: + + -N String Input the strain nickname + -I String Input file directory + -O String Output file directory +); + +if (!scalar(keys %opt)) + { + print join("\n",@usage)."\n"; + exit; + } + +my $prefix; +if (exists($opt{"N"})) + { + $prefix=$opt{"N"} + }else + { + print "-N could not be empty!"; + print join("\n",@usage)."\n"; + exit; + } + +my $output; +if (exists($opt{"O"})) + { + $output=$opt{"O"}; + }else + { + print "-O could not be empty!"; + print join("\n",@usage)."\n"; + exit; + } + +my $input; +if (exists($opt{"I"})) + { + $input=$opt{"I"}; + }else + { + print "-I could not be empty!"; + print join("\n",@usage)."\n"; + exit; + } + +my $sp; +my $line; +my @row; +my @tmp; +my %hash; +my $flag; +my $file; +my $list; +my @list; +my $pttlost; +my $gi; + +if ((-e $output) and ((-d $output))) + { + }else + { + mkdir($output); + } + +if ($input!~/\/$/) + { + $input=$input."/"; + } + +if ($output!~/\/$/) + { + $output=$output."/"; + } + +opendir(DIR,"$input") || die "The input directory ( $input ) is not exists!\n"; +@list=grep(/faa$/,readdir(DIR)); +closedir(DIR); +$_=join("\t",@list); +s/.faa//g; +@list=split(/\t/,$_); + +open(PEP,">$output$prefix.pep"); +open(NUC,">$output$prefix.nuc"); +open(FUN,">$output$prefix.function"); + +foreach $list (@list) + { + %hash=(); + if (!(-e $input.$list.".faa")) + { + print $input.$list.".faa is not exists!\n$list.faa, $list.ffn and $list.ptt are skipped!\n"; + next; + } + + if (!(-e $input.$list.".ffn")) + { + print $input.$list.".ffn is not exists!\n$list.faa, $list.ffn and $list.ptt are skipped!\n"; + next; + } + + if (!(-e $input.$list.".ptt")) + { + open(F,$input.$list.".faa"); + @_=<F>; + close(F); + @_=grep(/^>/,@_); + if (scalar(@_)>1) + { + print $input.$list.".ptt is not exists!\n"; + print "There are more than 1 sequence in $list.faa and $list.ffn, So $list.faa, $list.ffn and $list.ptt are skipped!\n"; + next; + } + $pttlost=1; + }else + { + $pttlost=0; + } + + $file=$input.$list.".faa"; + open(F,$file) or die "could not open $file"; + while ($line=<F>) + { + if ($line=~/^>/) + { + @row=split(/\|/,$line); + print PEP ">$row[1]\n"; + if ($pttlost ==1) + { + $gi=$row[1]; + } + }else + { + print PEP $line; + } + } + close(F); + + if ($pttlost ==1) + { + print FUN "$gi\t-\thypothetical protein\n"; + }else + { + $file=$input.$list.".ptt"; + open(F,"$file") or die "could not open $file"; + $_=<F>; + $_=<F>; + $_=<F>; + while ($line=<F>) + { + chomp($line); + @row=split(/\t/,$line); + print FUN $row[3]."\t".$row[7]."\t".$row[8]."\n"; + @tmp=split(/\.\./,$row[0]); + if ($row[1] eq "+") + { + $hash{$tmp[0]."-".$tmp[@tmp-1]}=$row[3]; + }else + { + $hash{"c".$tmp[@tmp-1]."-".$tmp[0]}=$row[3]; + } + } + close(F); + } + + + + $file=$input.$list.".ffn"; + open(F,"$file") or die "could not open $file";; + while ($line=<F>) + { + if ($line=~/^>/) + { + if ($pttlost==1) + { + print NUC ">$gi\n"; + $flag=1; + }else + { + my $key=&getKey($line); + if (exists($hash{$key})) + { + $flag=1; + print NUC ">$hash{$key}\n"; + }else + { + $flag=0; + } + } + }else + { + if ($flag) + { + print NUC $line; + } + } + } + close(F); + } + +close(PEP); +close(NUC); +close(FUN); + +sub getKey() +{ + (my $line)=@_; + my @tmp; + my $strand; + chomp($line); + @tmp=split(/ /,$line); + @tmp=split(/\:/,$tmp[0]); + + if($tmp[@tmp-1]=~/c/) + { + $strand="-"; + }else + { + $strand="+"; + } + $_=$tmp[@tmp-1]; + s/c//g; + s/ //g; + @tmp=split(/\,|-/,$_); + @tmp=sort{$a<=>$b} @tmp; + if($strand eq "-") + { + return "c".$tmp[@tmp-1]."-".$tmp[0]; + }else + { + return $tmp[0]."-".$tmp[@tmp-1]; + } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/Converter_finished.pl Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,188 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Getopt::Std; + +my %opt; +getopts('S:I:O:',\%opt); + +my @usage=qq( +Version: 2016042201 +Usage: perl Converter_finished.pl [options] + +Options: + + -S String Input the strains nickname, + If 2 or more, join them with '+', + For example: CT18+NC_011834+SPA + -I String Input file directory + -O String Output file directory +); + +if (!scalar(keys %opt)) + { + print join("\n",@usage)."\n"; + exit; + } + +my @sp; +if (exists($opt{"S"})) + { + @sp=split(/\+/,$opt{"S"}); + }else + { + print "-S could not be empty!"; + print join("\n",@usage)."\n"; + exit; + } + +my $output; +if (exists($opt{"O"})) + { + $output=$opt{"O"}; + }else + { + print "-O could not be empty!"; + print join("\n",@usage)."\n"; + exit; + } + +my $input; +if (exists($opt{"I"})) + { + $input=$opt{"I"}; + }else + { + print "-I could not be empty!"; + print join("\n",@usage)."\n"; + exit; + } + +my $sp; +my $line; +my @row; +my @tmp; +my %hash; +my $flag; +my $file; + + +if ((-e $output) and ((-d $output))) + { + }else + { + mkdir($output); + } + +if ($input!~/\/$/) + { + $input=$input."/"; + } + + +if ($output!~/\/$/) + { + $output=$output."/"; + } + + +foreach $sp (@sp) + { + %hash=(); + $file=$input.$sp.".faa"; + open(F,$file) or die "could not open $file"; + open(R,">$output$sp.pep"); + while ($line=<F>) + { + if ($line=~/^>/) + { + @row=split(/\|/,$line); + print R ">$row[1]\n"; + }else + { + print R $line; + } + } + close(F); + close(R); + + $file=$input.$sp.".ptt"; + open(F,"$file") or die "could not open $file"; + open(R,">$output$sp.function"); + $_=<F>; + $_=<F>; + $_=<F>; + while ($line=<F>) + { + chomp($line); + @row=split(/\t/,$line); + print R $row[3]."\t".$row[7]."\t".$row[8]."\n"; + @tmp=split(/\.\./,$row[0]); + if ($row[1] eq "+") + { + $hash{$tmp[0]."-".$tmp[@tmp-1]}=$row[3]; + }else + { + $hash{"c".$tmp[@tmp-1]."-".$tmp[0]}=$row[3]; + } + } + close(R); + close(F); + + $file=$input.$sp.".ffn"; + open(F,"$file") or die "could not open $file";; + open(R,">$output/$sp.nuc"); + while ($line=<F>) + { + if ($line=~/^>/) + { + my $key=&getKey($line); + if (exists($hash{$key})) + { + $flag=1; + print R ">$hash{$key}\n"; + }else + { + $flag=0; + } + }else + { + if ($flag) + { + print R $line; + } + } + } + close(R); + close(F); + } + +sub getKey() +{ + (my $line)=@_; + my @tmp; + my $strand; + chomp($line); + @tmp=split(/ /,$line); + @tmp=split(/\:/,$tmp[0]); + + if($tmp[@tmp-1]=~/c/) + { + $strand="-"; + }else + { + $strand="+"; + } + $_=$tmp[@tmp-1]; + s/c//g; + s/ //g; + @tmp=split(/\,|-/,$_); + @tmp=sort{$a<=>$b} @tmp; + if($strand eq "-") + { + return "c".$tmp[@tmp-1]."-".$tmp[0]; + }else + { + return $tmp[0]."-".$tmp[@tmp-1]; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/PGAP.pl Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,2825 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Getopt::Long; + +### programs from BLAST +my $formatdb="formatdb"; +my $blastall="blastall"; + +### programs from mcl +my $mcl="mcl"; + +### programs from mafft +my $mafft="mafft"; + +### programs from PHYLIP +my $seqboot="seqboot"; +my $neighbor="neighbor"; +my $consense="consense"; +my $dnaml="dnaml"; +my $dnadist="dnadist"; +my $dnapars="dnapars"; + +my $count_tree=0; + +my $sampleSize=8000; # when calculate the pan-genome size, we will sample $sampleSize combinations + # if the total combination number is larger than $sampleSize for specific genomes + # Surely, the number of $sampleSize is, the larger, the better. + # However, the larger the $sampleSize is, the more time would be consumed. + # we suggest the range: 5000 ~ 20,000 + +##################################################################### +# DOn't modify the following code, unless you know their functions +##################################################################### + +my %opt=qw(); +GetOptions(\%opt,"strains:s","input:s","output:s","cluster!","pangenome!","variation!","evolution!","function!","method:s","thread:i","score:f","evalue:f","coverage:f","local:f","global:f","identity:f","bootstrap:i","help|h!"); + +my @usage=qq( +====== Pan-Genome Analysis Pipeline (PGAP) ====== + Version 1.2.1 + +Usage: perl PGAP.pl [Options] + +Options: + --strains String Input strains nicknames, and join them with '+', for example: A+B+C + --input String Input data directory + --output String Result output directory + + --cluster Run homologous gene clustering + --pangenome Run pan-genome analysis + --variation Run homologous clusters variation analysis + --evolution Run evolution analysis + --function Run Function analysis + + --method String GF for GeneFamily method, and MP for MultiParanoid method + for GF: fast, but not very accurate + evalue, score, indentity, coverage are employed + for MP: slow, but more accurate + score, coverage, local, global are employed + --thread Int Number of processors to use in blastall. [default:1] + --score Int Minimum score in blastall. [default:40] + --evalue Decimal Maximal E-value in blastall. [default:1e-10] + --coverage Decimal Minimum alignment coverage for two homologous proteins. [default:0.5] + --local Decimal Minimum local alignment overlap in MP method. [default:0.25] + --global Decimal Minimum global alignment overlap in MP method. [default:0.5] + --identity Decimal Minimum alignment indentity for two homologous proteins. [default:0.5] + --bootstrap Int Bootstrap times for phylogenetics tree. [default:1] + + --h or help Display this message +); + + +############# specified variable ############# +my $inputDIR; +my $outputDIR; +my $run_cluster; +my $run_pangenome; +my $run_variation; +my $run_evolution; +my $run_function; +my $method=""; +my $thread; +my $score; +my $identity; +my $evalue; +my $coverage; +my $global; +my $local; +my $bootstrap; + + +my %pep; +my %nuc; +my $spnum; +my @clusters; +my $Cluster; +my @SpecieCombination; +my @spID; +my %genenum; +my %aaAln; +my %ntAln; +my %cog; +my %description; +#my %aa4tree; ### AA sequence for Phylogenetic Tree +my %nt4tree; ### nucleotide sequence for Phylogenetic Tree +my @SNPPosition; ### SNP position +my $dieMessage="You did not run PGAP.pl in the program directory\n"; +my $section; + +######### common temporary variable ############# +my $i; +my $j; +my $line; +my %tmpHash; +my @tmp; +my $tmp; +my $key; +my @row; +my $inparacount; +my $ClusterID; +my $orth; +my @content; +my $clusterName; +my @xdata; +my @ydata; +my @fit; + +my $fit_A; +my $fit_A_interval; +my $fit_B; +my $fit_C; +my $fit_C_interval; +my $fit_Rsquare; + + + +#### check option + +my $opt_error=0; + +if ((scalar(keys %opt) ==0) or (exists($opt{"help"}))) + { + print join("\n",@usage)."\n"; + exit; + } + + +###################### public info +### strains name +my @species; +if (exists($opt{"strains"})) + { + @species=split(/\+/,$opt{"strains"}); + $spnum=scalar(@species); + }else + { + print "Please assign strains nick name!\n"; + exit; + } + +### input data directory + +if (exists($opt{"input"})) + { + $inputDIR=$opt{"input"}; + if ($inputDIR!~/\/$/) + { + $inputDIR=$inputDIR."/"; + } + }else + { + print "Please assign input data directory!\n\n"; + exit; + } +### output data directory + +if (exists($opt{"output"})) + { + $outputDIR=$opt{"output"}; + if ($outputDIR!~/\/$/) + { + $outputDIR=$outputDIR."/"; + } + }else + { + print "Please assign result output directory!\n\n"; + exit; + } + +###################### section info + +if (exists($opt{"cluster"})) + { + $run_cluster=1; + }else + { + $run_cluster=0; + } + +if (exists($opt{"pangenome"})) + { + $run_pangenome=1; + }else + { + $run_pangenome=0; + } + +if (exists($opt{"variation"})) + { + $run_variation=1; + }else + { + $run_variation=0; + } + +if (exists($opt{"evolution"})) + { + $run_evolution=1; + }else + { + $run_evolution=0; + } + +if (exists($opt{"function"})) + { + $run_function=1; + }else + { + $run_function=0; + } + +if ($run_cluster) + { + ### method + if (exists($opt{"method"})) + { + $method=uc($opt{"method"}); + if ($method!~/^GF$/ and $method!~/^MP$/) + { + print "Unknown method: ".$opt{"method"}."\n"; + exit; + } + }else + { + print "Please assign the cluster method!\n\n"; + exit; + } + + ##thread + if (exists($opt{"thread"})) + { + $thread=$opt{"thread"}; + if ($thread==0) + { + print "please assign an applicable thread value.\n"; + exit; + } + }else + { + $thread=1; + } + + ##score + if (exists($opt{"score"})) + { + $score=$opt{"score"}; + if ($score<=0) + { + print "please assign an applicable score value.\n"; + exit; + } + }else + { + $score=40; + } + + if ($method eq "GF") + { + ###identity + if (exists($opt{"identity"})) + { + $identity=$opt{"identity"}; + if ($identity>1 or $identity<=0) + { + print "identity should be 0 ~ 1 \n"; + exit; + } + }else + { + $identity=0.5; + } + + ###evalue + if (exists($opt{"evalue"})) + { + $evalue=$opt{"evalue"}; + }else + { + $evalue=1e-10; + } + + ###coverage + if (exists($opt{"coverage"})) + { + $coverage=$opt{"coverage"}; + if ($coverage>1 or $coverage<=0) + { + print "coverage should be 0 ~ 1 \n"; + exit; + } + }else + { + $coverage=0.5; + } + } + + + if ($method eq "MP") + { + ###global + if (exists($opt{"global"})) + { + $global=$opt{"global"}; + if ($global>1 or $global<=0) + { + print "global coverage should be 0 ~ 1 \n"; + exit; + } + }else + { + $global=0.5; + } + ###local + if (exists($opt{"local"})) + { + $local=$opt{"local"}; + if ($local<=0) + { + print "local coverage should be 0 ~ [global coverage value] \n"; + exit; + } + if ($local>$global) + { + print "local coverage should be less than global coverage!\n"; + exit; + } + }else + { + $local=0.25; + } + } + } + +if ($run_evolution) + { + if (exists($opt{"bootstrap"})) + { + $bootstrap=$opt{"bootstrap"}; + if ($bootstrap<=0) + { + print "please assign an applicable bootstrap value.\n"; + } + }else + { + $bootstrap=1; + } + } + +print "Program begin at ".localtime()."\n"; +print "The following are the parameters for current process:\n"; + print "Strains: ".join(",",@species)."\n"; + print "Input directory: $inputDIR\n"; + print "Output directory: $outputDIR\n"; +if ($run_cluster) + { + print "Cluster analysis: yes\n"; + print " Method: $method\n"; + print " Thread: $thread\n"; + if ($method eq "GF") + { + print " E-value: $evalue\n"; + print " Identity: $identity\n"; + print " Coverage: $coverage\n"; + print " Score: $score\n"; + } + if ($method eq "MP") + { + print " Local: $local\n"; + print " Global: $global\n"; + print " Score: $score\n"; + } + }else + { + print "Cluster analysis: no\n"; + } +if ($run_pangenome) + { + print "Pan-genome analysis: yes\n"; + }else + { + print "Pan-genome analysis: no\n"; + } +if ($run_variation) + { + print "Variation analysis: yes\n"; + }else + { + print "Variation analysis: no\n"; + } +if ($run_evolution) + { + print "Evolution analysis: yes\n"; + print " Bootstrap: $bootstrap\n"; + }else + { + print "Evolution analysis: no\n"; + } +if ($run_function) + { + print "Function analysis: yes\n"; + }else + { + print "Function analysis: no\n"; + } + +$section=$run_cluster.$run_pangenome.$run_variation.$run_evolution.$run_function; + +############################################### +# section 0) check input file and program +############################################### +if (!(-e $outputDIR)) + { + system("mkdir $outputDIR"); + } +system("chmod +rw $outputDIR"); + +if (!(-w $outputDIR)) + { + print "There is no WRITE permission in $outputDIR\n"; + exit; + } +@tmp=qw(); +&CheckInputFile(\@species,$inputDIR,$section,$method,\@tmp); +&CheckExtraProgram($section,$method,\@tmp); + +if (scalar(@tmp)>0) + { + open(R,">".$outputDIR."0.error.message"); + print R join("",@tmp)."\n"; + close(R); + print "error!\nlog are saved in ${outputDIR}0.error.message\n"; + exit; + } + + +############################################ +# section 1) cluster analysis +############################################ + +if ($run_cluster) + { + print "\n\n############################################\n"; + print "# section 1) cluster analysis\n"; + print "############################################\n\n\n"; + +#### cluster gene and return result to the array @clusters + + if ($method eq "MP") + { + print "Begin cluster gene with MP method ...\n"; + &MP(); + }else + { + print "Begin cluster gene with GF method ...\n"; + &GF(); + } + +#### output normal cluster format + + &FormatClusterOutPut(\@species,"${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + +#### Retrieve cluster + + &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + +##### gene distribution in each strains + %tmpHash=(); + &GeneDistribution(\@clusters,\%tmpHash); + + open(R,">${outputDIR}1.Gene_Distribution_By_Conservation.txt"); + print R "SharedBy_Strains\t".join("\t",@species)."\n"; + + for ($i=$spnum;$i>0;$i--) + { + print R $i; + for ($j=0;$j<$spnum;$j++) + { + if (exists($tmpHash{$i."|".$j})) + { + print R "\t".$tmpHash{$i."|".$j}; + }else + { + print R "\t0"; + } + } + print R "\n"; + } + + close(R); + + + }else + { + print "Homologous gene clustering is skipped!\n"; + } + + +if ($run_pangenome) + { + print "\n\n############################################\n"; + print "# section 2) Pan-genome analysis\n"; + print "############################################\n\n\n"; + +#### Retrieve cluster + &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + chomp(@clusters); + +#### convert file into 0-1 matrix + for ($line=0;$line<@clusters;$line++) + { + @row=split(/\t/,$clusters[$line]); + splice(@row,0,1); + for ($i=0;$i<@row;$i++) + { + if ($row[$i] eq "-") + { + $row[$i]=0; + }else + { + $row[$i]=1; + } + } + $clusters[$line]=join("\t",@row); + } + +#### fetch gene number of each strains +for ($i=0;$i<$spnum;$i++) + { + open(F,"$inputDIR$species[$i].pep"); + @tmp=<F>; + close(F); + @tmp=grep(/^>/,@tmp); + $genenum{$species[$i]}=scalar(@tmp); + } + +#### pan genome size and core genome size +print "Deducing pan genome size and core genome size for each composition...\n\n"; + +open(PAN,">${outputDIR}2.PanGenome.Data.txt"); +print PAN "ClusterConservation\tTotalGeneNumber\tPanGenome\tCoreGenome\n"; + +for ($i=1;$i<=scalar(@species);$i++) + { + #@SpecieCombination=&Combination(\@species,$i); + #@SpecieCombination=&Combination($spnum,$i); + if (&ChkCombinationValue($spnum,$i) !=0) ### transfer the array reference to the subroutine + { + &Combination($spnum,$i,\@SpecieCombination); ## if the combination number is less than sampleSize, then fecth all, else sample + }else + { + &SampleCombination($spnum,$i,\@SpecieCombination); + } + + foreach $key (@SpecieCombination) + { + ##### count total gene number in current combination + $tmp=0; + @spID=split(/\t/,$key); #### speices id in current combination + foreach (@spID) + { + $tmp=$tmp+$genenum{$species[$_]}; + } + ##### scan pangenome and coregenome + @tmp=split(/\t/,&PanGenomeNumber(\@spID)); + print PAN "$i\t$tmp\t".join("\t",@tmp)."\n"; + + } + } + +close(PAN); + +#### data fit + +#### for model A + +if ($spnum<3) + { + print "There are $spnum strains. For pan-genome function fitting, at least 3 strains data are required.\n"; + }else + { + open(R,">${outputDIR}2.PanGenome.Profile.txt"); + ##### genome number & pan-genome size + @xdata=qw(); + @ydata=qw(); + &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,2); + &SumData(\@xdata,\@ydata,"mean"); + ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_A(\@xdata,\@ydata); + print R "The relation bewteen genome number and pan-genome size\n\n"; + print R "Function model: y=A*x**B +C \n"; + print R "\ty denotes pan-genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n"; + print R "Fitting result:\n"; + print R "\ty = $fit_A *x**$fit_B + $fit_C\n"; + print R "\tR-square = $fit_Rsquare\n"; + print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n"; + print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n"; + + ##### total gene number & pan-genome size + #@xdata=qw(); + #@ydata=qw(); + #&ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,1,\@ydata,2); + #&SumDataByMedian(\@xdata,\@ydata); + #($fit_Rsquare, $fit_A, $fit_B, $fit_C)=&fit_model_A(\@xdata,\@ydata); + #print R "The relation bewteen total gene number and pan-genome size\n\n"; + #print R "$fit_Rsquare, $fit_A, $fit_B, $fit_C\n"; + #print R "\ty = $fit_A *x**$fit_B + $fit_C R-square = $fit_Rsquare\n"; + #print R "\tx: total gene number\n"; + #print R "\ty: pan-genome size\n\n\n\n\n"; + + ##### genome number & core genome + @xdata=qw(); + @ydata=qw(); + &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,3); + &SumData(\@xdata,\@ydata,"mean"); + ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_B(\@xdata,\@ydata); + print R "The relation bewteen genome number and core genome size\n\n"; + print R "Function model: y=A*exp(B*x) +C \n"; + print R "\ty denotes pan-genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n"; + print R "Fitting result:\n"; + print R "\ty = $fit_A *exp($fit_B * x) + $fit_C R-square = $fit_Rsquare\n"; + print R "\tR-square = $fit_Rsquare\n"; + print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n"; + print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n"; + close(R); + } + + } + + + ############################################ + # section 3) CDS variation analysis + ############################################ + +if ($run_variation) + { + print "\n\n############################################\n"; + print "# section 3) CDS variation analysis\n"; + print "############################################\n\n\n"; + + #### Retrieve cluster + &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + chomp(@clusters); + + ## protein + system("rm -rf *.pep"); + &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file + system("cat *.pep > All.faa && rm -rf *.pep && mv All.faa All.pep"); + &ReadSequenceInToHash("All.pep",\%pep); + ## nucleic + system("rm -rf *.nuc"); + &PrepareFasta(\@species,$inputDIR,".nuc"); ###prepare nuc file + system("cat *.nuc > All.ffn && rm -rf *.nuc && mv All.ffn All.nuc"); + &ReadSequenceInToHash("All.nuc",\%nuc); + + ## scanning SNP + %nt4tree=(); + for ($i=0;$i<$spnum;$i++) + { + $nt4tree{"S".$i}=""; + } + + open(VAR,">${outputDIR}3.CDS.variation.txt"); + print VAR "ClusterID\tStrains_Number\tGene_Number\tPosition\taaType\tntType\tntProfile\tVariation type\n"; + + open(VA,">${outputDIR}3.CDS.variation.analysis.txt"); + print VA "ClusterID\tInDel Base\tNonsynonymous mutation\tSynonymous mutation\n"; + + for ($line=0;$line<@clusters;$line++) + { + @row=split(/\t|\,/,$clusters[$line]); + $ClusterID=$row[0]; + splice(@row,0,1); + @row=grep(/^S/,@row); + if (scalar(@row) >=2) + { + open(PEP,">$ClusterID.pep"); + open(NUC,">$ClusterID.nuc"); + foreach $key (@row) + { + print PEP ">$key\n$pep{$key}\n"; + print NUC ">$key\n$nuc{$key}\n"; + } + close(PEP); + close(NUC); + system("$mafft --quiet $ClusterID.pep > $ClusterID.pal"); + #system("perl ./pal2nal.pl $ClusterID.pal $ClusterID.nuc -output fasta > $ClusterID.nal"); + $tmp=&pal2nal("$ClusterID.pal","$ClusterID.nuc","$ClusterID.nal"); + if ($tmp == 0) + { + system("rm -rf $ClusterID.*"); + next; + } + + @tmp=&DetectSNP(); + if (scalar(@tmp)>0) + { + print VA $ClusterID."\t".&VarAnalysis(\@tmp)."\n"; + print VAR join("",@tmp); + ### core orthologs + @row=split(/\t/,$clusters[$line]); + splice(@row,0,1); + if ((&CountGeneInCluster(join("\t",@row)) ==$spnum) and (&CountSpeicesInCluster(join("\t",@row)) == $spnum) ) + { + $count_tree++; + %tmpHash=(); + foreach (@row) + { + $tmpHash{$_}=""; + } + &RemoveHeadGap("$ClusterID.nal",\%tmpHash); + &ExtractSNP4tree(\%tmpHash,\%nt4tree); + } + } + system("rm -rf $ClusterID.*"); + } + } + close(VAR); + close(VA); + + open(R,">${outputDIR}3.CDS.variation.for.evolution.txt"); + foreach $key (keys %nt4tree) + { + $_=$key; + s/s//gi; + print R ">$species[$_]\n$nt4tree{$key}\n"; + } + close(R); + print $count_tree."\n\n"; + +### + system("rm All.nuc All.pep"); + }else + { + print "CDS variation is skipped.\n"; + } + + ############################################ + # section 4) CDS variation analysis + ############################################ + +if ($run_evolution) + { +#### Retrieve cluster + &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + chomp(@clusters); + + + +################## +## +## Distance based +## +################## + +### caculate the distance between each two strains by cluster + &ClusterProfil4Specie(\%tmpHash); # caculate Clusters profile for each specie + &DistanceMatrix($spnum,\%tmpHash); # caculate distance matrix acoording to Clusters profile + ###output distance + open(DIST,">${outputDIR}4.Species_Distance_Clusters_Based.txt"); + ###header + printf DIST "%5d", $spnum; + print DIST "\n"; + foreach $i (0..($spnum-1)) + { + $key="sp".$i."sp"; + printf DIST "%-10s",$key; + foreach $j (0..($spnum-1)) + { + printf DIST " %8f",$tmpHash{$i."-".$j}; + } + print DIST "\n"; + } + close(DIST); + + %tmpHash=(); + + ### based on pan genome (distance) + print "\nDraw pangenome based phylogenetic tree ...\n\n"; + + &PanBasedTree("${outputDIR}4.Species_Distance_Clusters_Based.txt","${outputDIR}4.PanBased"); + +################## +## +## SNP based +## +################## + %tmpHash=(); + if (!(-e "${outputDIR}3.CDS.variation.for.evolution.txt")) + { + print "Variation in core orthologs cluster is not found from ${outputDIR}3.CDS.variation.for.evolution.txt.\n"; + print "Maybe you have skipped CDS variation analysis.\n"; + }else + { + &ReadSequenceInToHash("${outputDIR}3.CDS.variation.for.evolution.txt",\%tmpHash); + open(R,">mlst.aln"); + for ($i=0;$i<@species;$i++) + { + print R ">sp${i}sp\n".$tmpHash{$species[$i]}."\n"; + } + close(R); + &fasta2phylip("mlst.aln","mlst.phylip"); + system("rm -rf mlst.aln"); + + print "\nDraw SNP based phylogenetic tree ...\n\n"; + &SNPBasedTree("mlst.phylip","${outputDIR}4.SNPBased"); + system("rm -rf mlst.phylip") + } + +######## +# replace speices name +######## + + opendir(DIR,"${outputDIR}"); + @tmp=readdir(DIR); + closedir(DIR); + @tmp=grep(/^4/,@tmp); + foreach $tmp (@tmp) + { + &ReplaceName(\@species,"${outputDIR}$tmp"); + } + }else + { + print "Evolution analysis is skipped.\n"; + } + + + ############################################ + # section 4) Function analysis + ############################################ + +if ($run_function) + { +#### Retrieve cluster + &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + chomp(@clusters); + +#### prepare annotation file + &PrepareTable(\@species,$inputDIR,".function"); ###prepare location file + &ReadAnnotation(\@species,\%cog,\%description); + + +#### assign function + open(R,">${outputDIR}5.Orthologs_Cluster_Function.txt"); + print R "ClusterID\tConservation_Level\tCOG\tDescription\n"; + for ($i=0;$i<@clusters;$i++) + { + @row=split(/\t/,$clusters[$i]); + $ClusterID=$row[0]; + splice(@row,0,1); + print R $ClusterID."\t".&CountSpeicesInCluster(join("\t",@row))."\t".&getCOG(\@row,\%cog)."\t".&getDescription(\@row,\%description)."\n"; + } + close(R); + +#### COG distribution + +###Whole Clusters COG Distribution +&outputCOGStatistic("${outputDIR}5.Orthologs_Whole_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",$spnum,1)); + +###Core Clusters COG Distribution +&outputCOGStatistic("${outputDIR}5.Orthologs_Core_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",$spnum,$spnum)); + +###Dispensable Clusters COG Distribution +&outputCOGStatistic("${outputDIR}5.Orthologs_Dispensable_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",($spnum-1),2)); + +###strains specifc Clusters COG Distribution +&outputCOGStatistic("${outputDIR}5.Orthologs_specifc_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",1,1)); + +system("rm -rf *.function"); + + }else + { + print "Function analysis is skipped.\n"; + } + +sub outputCOGStatistic() + { + (my $file,my $subcogcount)=@_; + my @cogcat=("J A K L B","D Y V T M N Z W U O","C G E F H I P Q","R S -"); + my @cogdesc=("INFORMATION STORAGE AND PROCESSING","CELLULAR PROCESSES AND SIGNALING","METABOLISM","POORLY CHARACTERIZED"); + my @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S -); + my @subcogdesc=("[J] Translation, ribosomal structure and biogenesis","[A] RNA processing and modification","[K] Transcription","[L] Replication, recombination and repair","[B] Chromatin structure and dynamics","[D] Cell cycle control, cell division, chromosome partitioning","[Y] Nuclear structure","[V] Defense mechanisms","[T] Signal transduction mechanisms","[M] Cell wall/membrane/envelope biogenesis","[N] Cell motility","[Z] Cytoskeleton","[W] Extracellular structures","[U] Intracellular trafficking, secretion, and vesicular transport","[O] Posttranslational modification, protein turnover, chaperones","[C] Energy production and conversion","[G] Carbohydrate transport and metabolism","[E] Amino acid transport and metabolism","[F] Nucleotide transport and metabolism","[H] Coenzyme transport and metabolism","[I] Lipid transport and metabolism","[P] Inorganic ion transport and metabolism","[Q] Secondary metabolites biosynthesis, transport and catabolism","[R] General function prediction only","[S] Function unknown","[-] Unclassified"); + my %subcogdesc; + my $key; + my @cog; + my $i; + my $cognum; + + for ($i=0;$i<@subcogcat;$i++) + { + $subcogdesc{$subcogcat[$i]}=$subcogdesc[$i]; + } + + open(R,">$file"); + for ($i=0;$i<@cogcat;$i++) + { + $cognum=0; + foreach $key (split(" ",$cogcat[$i])) + { + $cognum=$cognum+$$subcogcount{$key}; + } + print R $cogdesc[$i]." ( ".$cognum." )\n"; + foreach $key (split(" ",$cogcat[$i])) + { + printf R "%-6d %s\n",$$subcogcount{$key},$subcogdesc{$key}; + } + print R "\n"; + + } + close(R); + + } + +sub scanCOG() + { + (my $file,my $max_orth,my $min_orth)=@_; + my @row; + my @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S -); + my %subcogcount; + my $cog; + my $key; + + foreach $key (@subcogcat) + { + $subcogcount{$key}=0; + } + + @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S); + + open(F,"$file"); + $_=<F>; + while (<F>) + { + @row=split(/\t/,$_); + if ($row[1]>=$min_orth and $row[1]<=$max_orth) + { + if ($row[2] eq "-") + { + $subcogcount{"-"}++; + }else + { + $_=uc($row[2]); + s/COG//gi; + $cog=$_; + foreach $key (@subcogcat) + { + if ($cog=~/$key/) + { + $subcogcount{$key}++; + } + } + } + } + } + close(F); + + return \%subcogcount; + } + + + + +sub getCOG() + { + (my $data,my $coghash)=@_; + my $cog=""; + my @cog; + my $key; + my %hash; + my @gene=split(/\t|\,/,join("\t",@$data)); + @gene=grep(/^S/,@gene); + + foreach $key (@gene) + { + if (($$coghash{$key} ne "-") and ($$coghash{$key} ne "")) + { + $cog=$cog.",".$$coghash{$key}; + } + } + @cog=split(/,/,$cog); + foreach $cog (@cog) + { + if ($cog ne "") + { + $hash{$cog}=1; + } + } + + $cog=join(",",(keys %hash)); + if ($cog eq "") + { + $cog="-"; + } + return $cog; + } + +sub getDescription() + { + (my $data,my $deschash)=@_; + my $desc=""; + my $key; + my @gene=split(/\t|\,/,join("\t",@$data)); + @gene=grep(/^S/,@gene); + + foreach $key (@gene) + { + if ( ($$deschash{$key} ne "") and ($$deschash{$key} ne "-") and ($$deschash{$key}!~/hypothetical/)) + { + $desc=$$deschash{$key}; + } + } + + if ($desc eq "") + { + $desc="hypothetical protein"; + } + + return $desc; + } + + + +sub ReadAnnotation() + { + (my $species,my $cog,my $description)=@_; + my $i; + my @row; + + for ($i=0;$i<@$species;$i++) + { + open(F,"$$species[$i].function"); + while (<F>) + { + chomp($_); + @row=split(/\t/,$_); + if (scalar(@row)>=2) + { + $$cog{$row[0]}=$row[1]; + }else + { + $$cog{$row[0]}="-"; + } + + if (scalar(@row)>=3) + { + $$description{$row[0]}=$row[2]; + }else + { + $$description{$row[0]}="hypothetical protein"; + } + } + close(F); + } + } + + +sub SNPBasedTree() + { + (my $infile,my $outfileprefix)=@_; + my $tmpin=$infile; + my $tmpout; + + +#### boootstrap +print "\n#### seqboot ...\n\n"; + open(R,">seqboot.cmd"); + #print R "$tmpin\n"; + print R "R\n"; + print R "$bootstrap\n"; + print R "Y\n"; + print R "1\n"; + close(R); + + system("cp $tmpin infile"); + system("$seqboot < seqboot.cmd"); + system("mv outfile 100dnaseq"); + system("rm -rf infile"); + system("rm seqboot.cmd"); # 100dnasseq + +#### dnaml +print "\n#### dnaml ...\n\n"; + open(R,">dnaml.cmd"); + #print R "100dnaseq\n"; + print R "T\n"; + print R "25\n"; + if ($bootstrap>1) + { + print R "M\n"; + print R "D\n"; + #print R "100\n"; + print R "$bootstrap\n"; + print R "1\n"; # Random number seed (must be odd)? + print R "5\n"; # Number of times to jumble? + } + print R "Y\n"; + close(R); + + system("cp 100dnaseq infile"); + system("$dnaml < dnaml.cmd"); + system("rm -rf outfile"); + system("rm -rf infile"); + system("mv outtree 100dnaseqtree"); # 100dnaseq, 100dnaseqtree + +#### consense +print "\n#### dnaml consense ...\n\n"; + + open(R,">consense.cmd"); + #print R "100dnaseqtree\n"; + print R "Y\n"; + close(R); + + system("cp 100dnaseqtree intree"); + system("$consense < consense.cmd"); + system("mv outfile ${outfileprefix}.ML.outfile"); + system("mv outtree ${outfileprefix}.ML.tree"); + system("rm -rf infile"); + system("rm -rf 100dnaseqtree"); # 100dnaseq + +#### dnadist +print "\n#### dnadist ...\n\n"; + open(R,">dnadist.cmd"); + #print R "100dnaseq\n"; + print R "T\n"; + print R "25\n"; + if ($bootstrap>1) + { + print R "M\n"; + print R "D\n"; + #print R "100\n"; + print R "$bootstrap\n"; + } + print R "Y\n"; + close(R); + + system("cp 100dnaseq infile"); + system("$dnadist < dnadist.cmd"); + system("rm -rf 100dnaseq"); + system("rm -rf infile"); + system("mv outfile 100dnadist"); # 100dnadist + +#### Neighbor-joining tree +print "\n#### Neighbor-joining ...\n\n"; + open(R,">NJ.cmd"); + if ($bootstrap>1) + { + #print R "100dnadist\n"; + print R "M\n"; + #print R "100\n"; + print R "$bootstrap\n"; + print R "1\n"; + } + print R "Y\n"; + close(R); + + system("cp 100dnadist infile"); + system("$neighbor < NJ.cmd"); + system("mv outtree 100dnadistNJtree"); + system("rm outfile"); + system("rm -rf infile"); + system("rm -rf NJ.cmd"); # 100dnadist,100dnadistNJtree + +#### NJ-consense +print "\n#### NJ-consense ...\n\n"; + open(R,">NJ-consense.cmd"); + #print R "100dnadistNJtree\n"; + print R "Y\n"; + close(R); + + system("cp 100dnadistNJtree intree"); + system("$consense < NJ-consense.cmd"); + system("mv outfile ${outfileprefix}.Neighbor-joining.outfile"); + system("mv outtree ${outfileprefix}.Neighbor-joining.tree"); + system("rm -rf NJ-consense.cmd"); + system("rm -rf intree"); + system("rm -rf 100dnadistNJtree"); + + +#### UPGMA tree +print "\n#### UPGMA ...\n\n"; + open(R,">UPGMA.cmd"); + #print R "100dnadist\n"; + print R "N\n"; + if ($bootstrap>1) + { + print R "M\n"; + #print R "100\n"; + print R "$bootstrap\n"; + print R "1\n"; + } + print R "Y\n"; + close(R); + + system("cp 100dnadist infile"); + system("$neighbor < UPGMA.cmd"); + system("mv outtree 100dnadistUPGMAtree"); + system("rm -rf outfile"); + system("rm -rf infile"); + system("rm -rf UPGMA.cmd"); + +#### UPGMA-consense +print "\n#### UPGMA-consense ...\n\n"; + open(R,">UPGMA-consense.cmd"); + #print R "100dnadistUPGMAtree\n"; + print R "Y\n"; + close(R); + + system("cp 100dnadistUPGMAtree intree"); + system("$consense < UPGMA-consense.cmd"); + system("mv outfile ${outfileprefix}.UPGMA.outfile"); + system("mv outtree ${outfileprefix}.UPGMA.tree"); + system("rm -rf UPGMA-consense.cmd"); + system("rm -rf 100dnadistUPGMAtree"); + system("rm -rf intree"); + +###CLEAN TMP FILE + + system("rm -rf *.cmd"); + system("rm -rf 100dnadist"); + } + +sub PanBasedTree() + { + (my $infile,my $outfileprefix)=@_; + my $tmpin; + my $tmpout; + + $tmpin=$infile; +#### Neighbor-joining tree + + open(R,">NJ.cmd"); + #print R "$tmpin\n"; + print R "Y\n"; + close(R); + + system("cp $tmpin infile"); + system("$neighbor < NJ.cmd"); + system("mv outfile ${outfileprefix}.Neighbor-joining.outfile"); + system("mv outtree ${outfileprefix}.Neighbor-joining.tree"); + system("rm -rf NJ.cmd"); + system("rm -rf infile"); + +#### UPGMA tree + + open(R,">UPGMA.cmd"); + #print R "$tmpin\n"; + print R "N\n"; + print R "Y\n"; + close(R); + + system("cp $tmpin infile"); + system("$neighbor < UPGMA.cmd"); + system("mv outfile ${outfileprefix}.UPGMA.outfile"); + system("mv outtree ${outfileprefix}.UPGMA.tree"); + system("rm -rf UPGMA.cmd"); + system("rm -rf infile"); + + +###CLEAN TMP FILE + system("rm -rf *.cmd"); + } + +sub DistanceMatrix() + { + (my $spnum,my $hash)=@_; + my $i; + my $j; + my $k; + my $dist; + my $ref; + my $query; + foreach $i (0..($spnum-1)) + { + foreach $j ($i..($spnum-1)) + { + $ref=$$hash{$i}; + $query=$$hash{$j}; + $dist=0; + for ($k=0;$k<length($ref);$k++) + { + if (substr($ref,$k,1) ne substr($query,$k,1)) + { + $dist++; + } + } + $$hash{$i."-".$j}=$dist; + $$hash{$j."-".$i}=$dist; + } + } + } + + +sub ClusterProfil4Specie + { + (my $hash)=@_; + my @row; + my $i; + + foreach (0..($spnum-1)) #initialization Hash + { + $$hash{$_}=""; + } + + foreach (@clusters) + { + @row=split(/\t/,$_); + splice(@row,0,1); + if (&CountSpeicesInCluster(join("\t",@row))>1) + { + for ($i=0;$i<@row;$i++) + { + if ($row[$i] eq "-") + { + $$hash{$i}=$$hash{$i}."0"; + }else + { + $$hash{$i}=$$hash{$i}."1"; + } + } + } + } + } + +# &ExtractSNP4tree(\%tmpHash,\%nt4tree); + +sub ExtractSNP4tree() + { + (my $hash,my $nt4treeRef)=@_; + my $key; + my @row; + my $i; + my $len; + my @tribases; + foreach $key (keys %$hash) + { + $$hash{substr($key,0,index($key,"G"))}=$$hash{$key}; + delete($$hash{$key}); + } + + for ($i=0;$i<$spnum;$i++) + { + $nt4tree{"S".$i}=$nt4tree{"S".$i}.$$hash{"S".$i}; + } + } + + +=pod +sub ExtractSNP4tree() + { + (my $hash,my $nt4treeRef)=@_; + my $key; + my @row; + my $i; + my $len; + my @tribases; + foreach $key (keys %$hash) + { + $$hash{substr($key,0,index($key,"G"))}=$$hash{$key}; + delete($$hash{$key}); + } + @_=(keys %$hash); + $len=length($_[0]); + for ($j=0;3*$j<$len;$j++) + { +##### scanning each codon + for ($i=0;$i<$spnum;$i++) + { + $tribases[$i]=substr($$hash{"S".$i},3*$j,3); + } +##### checking each codon + if (&IsTheSame(@tribases) ==0) + { + for ($i=0;$i<@tribases;$i++) + { + $nt4tree{"S".$i}=$nt4tree{"S".$i}.$tribases[$i]; + } + } + } + } +=cut + + +sub pal2nal() + { + (my $pal,my $nuc, my $nal)=@_; + my %aaAln=(); + my %ffn=(); + my %ntAln=(); + my %nt; + my $dna; + my $nt; + my $key; + my $flag=1; + my $i=0; + my $j; + +### read protein aligment result + &ReadAlignmentToHash("$pal",\%aaAln); +### read nt sequences + &ReadSequenceInToHash("$nuc",\%ffn); + foreach $key (keys %ffn) + { + $dna=$ffn{$key}; + #if (int(length($nt{$key})/3)*3 ne length($nt{$key})) + if (int(length($dna)/3)*3 ne length($dna)) + { + $flag=0; + print "The length of nucleotide sequence is not 3 integer times.\n"; + last; + }else + { + for ($i=0;$i<(length($dna)/3);$i++) + { + $nt{$key."|".$i}=substr($dna,$i*3,3); + } + } + } + + if ($flag==0) + { + return 0; + }else + { + foreach $key (keys %aaAln) ### replace aa with corresponding nt + { + $nt=""; + $i=0; + for ($j=0;$j<length($aaAln{$key});$j++) + { + if (substr($aaAln{$key},$j,1) eq "-") + { + $nt=$nt."---"; + }else + { + $nt=$nt.$nt{$key."|".$i}; + $i++; + } + } + $ntAln{$key}=$nt; + } + + ### output + open(R,">$nal"); + foreach (keys %ntAln) + { + print R ">$_\n".$ntAln{$_}."\n"; + } + close(R); + + return 1; + } + } + + + + +sub DetectSNP() + { + my %faa; + my %ffn; + my @row; + my $count_gene; + my $count_sp; + my @genelist; + my $i; + my $j; + my $pepalnlen; + my @cdsvar=qw(); + my $cdi=0; + my @tribases; + my @bases; + my @aa; + + +### fetch gene list + open(F,"$ClusterID.pep"); + @genelist=<F>; + close(F); + @genelist=grep(/^>/,@genelist); + chomp(@genelist); + $_=join("\t",@genelist); + s/>//g; + @genelist=split(/\t/,$_); + +### count gene number and species number + @row=split(/\t/,$clusters[$ClusterID-1]); + splice(@row,0,1); + $count_sp=&CountSpeicesInCluster(join("\t",@row)); + $count_gene=&CountGeneInCluster(join("\t",@row)); + +### read alignment sequences + &ReadAlignmentToHash("$ClusterID.pal",\%faa); + &ReadAlignmentToHash("$ClusterID.nal",\%ffn); + +@_=(keys %faa); +$pepalnlen=length($faa{$_[0]}); +### scan SNP + for ($i=1;$i<=$pepalnlen;$i++) + { + @tmp=qw(); + @tribases=qw(); + for ($j=0;$j<@genelist;$j++) ### fetch triplet codon + { + $tribases[$j]=substr($ffn{$genelist[$j]},3*($i-1),3); + } + if (&IsTheSame(@tribases) ==0) ### if triplet codon is not consistent + { + @aa=qw(); + for ($j=0;$j<@genelist;$j++) + { + $aa[$j]=substr($faa{$genelist[$j]},($i-1),1); + } + if (&IsTheSame(@aa) ==0) ### aa is not consistent + { + if (join("",@aa) =~/-/) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".$i."\t".&CharType(\@aa)."\t-\t-\tInDel\n"; + }else + { + #### base 1 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1),1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.1)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n"; + } + #### base 2 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+1,1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.2)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n"; + } + #### base 3 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+2,1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.3)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n"; + } + } + }else + { + #### base 1 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1),1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.1)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n"; + } + #### base 2 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+1,1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.2)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n"; + } + #### base 3 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+2,1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.3)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n"; + } + } + } + } + return @cdsvar; + } + + +sub VarAnalysis() + { + (my $data)=@_; + my @data=@$data; + my $indel=0; + my $syn=0; + my $nonsyn=0; + my @tmp; + $indel=scalar(grep(/InDel$/,@data)); + $nonsyn=scalar(grep(/Nonsynonymous mutation$/,@data));; + $syn=scalar(grep(/Synonymous mutation$/,@data)); + return "$indel\t$nonsyn\t$syn"; + } + + + +sub CharType() + { + (my $str)=@_; + my %hash; + my @data=@$str; + foreach (@data) + { + $hash{$_}=1; + } + return join(",",(keys %hash)); + } + +sub IsTheSame() + { + (my @data)=@_; + my %hash; + foreach (@data) + { + $hash{$_}=1; + } + if (scalar(keys %hash) ==1) + { + return 1; + }else + { + return 0; + } + } + + + +sub FormatClusterOutPut() + { + (my $speices,my $file,my $cluster)=@_; + my @row; + my $gid=1; + my $key; + my %hash; + my $gene; + my @tmp; + my $i; + my $j; + open(R,">$file"); + print R "ClutserID\t".join("\t",@$speices)."\n"; + foreach $key (@$cluster) + { + @row=split(/\t/,$key); + for ($i=0;$i<@row;$i++) + { + if ($row[$i] ne "-") + { + @tmp=split(/,/,$row[$i]); + for ($j=0;$j<@tmp;$j++) + { + $_=$tmp[$j]; + s/^S[0-9]+G//; + $tmp[$j]=$_; + } + $row[$i]=join(",",@tmp); + } + } + print R $gid."\t".join("\t",@row)."\n"; + $gid++; + } + close(R); + } +sub RetrieveClusterFromFile() + { + (my $file,my $clusters)=@_; + my @content; + my @row; + my $spid; + my $line=0; + my $i=0; + my $j; + my @tmp; + open(F,$file) or die "Could open $file\n"; + @content=<F>; + close(F); + splice(@content,0,1); + chomp(@content); + foreach (@content) + { + @row=split(/\t/,$_); + $$clusters[$line]=$row[0]; + splice(@row,0,1); + for ($i=0;$i<@row;$i++) + { + if ($row[$i] ne "-") + { + @tmp=split(/,/,$row[$i]); + for ($j=0;$j<@tmp;$j++) + { + $tmp[$j]="S${i}G".$tmp[$j]; + } + $row[$i]=join(",",@tmp); + } + } + $$clusters[$line]=$$clusters[$line]."\t".join("\t",@row)."\n"; + $line++; + } + } + + + + +sub GeneDistribution() + { + (my $clusters,my $hash)=@_; + my @row; + my $spid; + my $orth; + my $key; + foreach (@$clusters) + { + @row=split(/\t/,$_); + splice(@row,0,1); + $orth=&CountSpeicesInCluster(join("\t",@row)); + @row=split(/\t|\,/,join("\t",@row)); + foreach $key (@row) + { + if ($key ne "-") + { + $spid=substr($key,1,(index($key,'G')-1)); ###extract strains id + if (exists($$hash{$orth."|".$spid})) + { + $$hash{$orth."|".$spid}++; + }else + { + $$hash{$orth."|".$spid}=1; + } + } + } + } + } + +sub CountSpeicesInCluster() + { + (my $str)=@_; + chomp($str); + my @list=split(/\t/,$str); + my $key; + my $count=0; + + foreach $key (@list) + { + if ($key ne "-") + { + $count++; + } + } + return $count; + } + +sub CountGeneInCluster() + { + (my $str)=@_; + chomp(); + my @list=split(/\t|\,/,$str); + my $key; + my $count=0; + foreach $key (@list) + { + if ($key ne "-") + { + $count++; + } + } + return $count; + } + + + + +sub GF() + { + &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file + system("cat ".join(".pep ",@species).".pep > All.pep"); + system("grep '>' All.pep > genelist"); + system("$formatdb -p T -i All.pep"); + system("$blastall -p blastp -i All.pep -d All.pep -M BLOSUM45 -m9 -e $evalue -o All.blastp -a $thread"); + system("perl ./Blast_Filter.pl All.blastp All.pep $coverage $identity $score | $mcl - --abc -I 2.0 -o All.cluster"); + &FormatCluster("All.cluster","genelist",$spnum,\@clusters); + #system("rm -rf *.pep* All.blastp All.cluster genelist"); + } + +sub MP() + { +# (my $species,my $inputDIR,my $thread,my $evalue,my $score,my $coverage,my $identity)=@_; + my $i; + my $j; + &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file + system("cat ".join(".pep ",@species).".pep > All.pep"); + system("grep '>' All.pep > genelist"); + system("rm -rf All.pep"); + for ($i=0;$i<$spnum;$i++) + { + for ($j=$i+1;$j<$spnum;$j++) + { + system("perl ./inparanoid.pl $blastall $thread $formatdb $score $global $local $species[$i].pep $species[$j].pep"); + } + } + system("perl ./multiparanoid.pl -species ".join(".pep+",@species).".pep -unique 1"); +###convert the MP result to table list based on gene + &MP_Result_to_Table("MP.Cluster","All.cluster"); + &FormatCluster("All.cluster","genelist",$spnum,\@clusters); + system("rm -rf sqltable.* *.pep* MP.Cluster genelist"); + } + +sub fasta2phylip() + { + (my $input,my $output)=@_; + use Bio::AlignIO; + my $inputfilename = "10.aln"; + my $in= Bio::AlignIO->new(-file => $input , + -format => 'fasta'); + my $out = Bio::AlignIO->new(-file => ">$output" , + -format => 'phylip'); + while ( my $aln = $in->next_aln() ) + { + $out->write_aln($aln); + } + } + + +sub RemoveHeadGap() + { + (my $nal,my $hash)=@_; + my %aln; + my $key; + my $gaplength=0; + my $len1; + my $len2; + &ReadSequenceInToHash("$nal",\%aln); + foreach $key (keys %aln) + { + $len1=length($aln{$key}); + $_=$aln{$key}; + s/^-+//; + $len2=length($_); + if (($len1-$len2)>$gaplength) + { + $gaplength=$len1-$len2; + } + } + foreach $key (keys %aln) + { + $$hash{$key}=$$hash{$key}.substr($aln{$key},$gaplength,(length($aln{$key})-$gaplength)); + } + } + +sub PrepareFasta() + { + (my $species,my $inputDIR,my $extention)=@_; + my $sp; + my $file; + my $i; + my %hash; + my $key; + for ($i=0;$i<@$species;$i++) + { + $file=$inputDIR.$$species[$i].$extention; + %hash=(); + &ReadSequenceInToHash($file,\%hash); + open(R,">$$species[$i]${extention}") or die "Could write into $file\n"; + foreach $key (keys %hash) + { + print R ">S${i}G$key\n"; + print R $hash{$key}."\n"; + } + close(R); + } + } + +sub PrepareTable() + { + (my $species,my $inputDIR,my $extention)=@_; + my @content; + my $i; + my @row; + my $file; + for ($i=0;$i<@$species;$i++) + { + $file=$inputDIR.$$species[$i].$extention; + open(F,$file) or die "Could open $file\n"; + @content=<F>; + close(F); + chomp(@content); + open(R,">$$species[$i]${extention}") or die "Could write into $file\n"; + foreach (@content) + { + @row=split(/\t/,$_); + $row[0]="S${i}G$row[0]"; + if ($extention eq ".location") + { + $row[0]=$row[0]."\t".$row[0]; + } + print R join("\t",@row)."\n"; + } + close(R); + } + } + +sub CheckExtraProgram + { + #(my $section, my $method, my $tmparray)=@_; + my @error; + my $ei=0; + +#####cluster gene + if (substr($section,0,1) eq "1") + { +###MP: blastall formatdb +###GF: blastall formatdb mcl + + if (!(-e $formatdb)) + { + $error[$ei++]="formatdb is not found at $formatdb\n"; + } + + if (!(-X $formatdb)) + { + $error[$ei++]="there is not premission to execute $formatdb\n"; + } + + if (!(-e $blastall)) + { + $error[$ei++]="blastall is not found at $blastall\n"; + } + + if (!(-X $blastall)) + { + $error[$ei++]="there is not premission to execute $blastall\n"; + } + + if ($method eq "GF") + { + if (!(-e $mcl)) + { + $error[$ei++]="mcl is not found at $mcl\n"; + } + if (!(-X $mcl)) + { + $error[$ei++]="there is not premission to execute $mcl\n"; + } + } + } + +#####CDS variation + if (substr($section,2,1) eq "1") + { + if (!(-e $mafft)) + { + $error[$ei++]="mafft is not found at $mafft\n"; + } + if (!(-X $mafft)) + { + $error[$ei++]="there is not premission to execute $mafft\n"; + } + } + +#####CDS variation + if (substr($section,3,1) eq "1") + { + if (!(-e $mafft)) + { + $error[$ei++]="mafft is not found at $mafft\n"; + } + if (!(-X $mafft)) + { + $error[$ei++]="there is not premission to execute $mafft\n"; + } + } +#####Evolution analysis + if (substr($section,3,1) eq "1") + { + if (-e $seqboot) + { + $error[$ei++]="there is not premission to execute $seqboot\n" if(!(-X $seqboot)); + }else + { + $error[$ei++]="seqboot is not found at $seqboot\n"; + } + if (-e $dnaml) + { + $error[$ei++]="there is not premission to execute $dnaml\n" if(!(-X $dnaml)); + }else + { + $error[$ei++]="dnaml is not found at $dnaml\n"; + } + if (-e $dnadist) + { + $error[$ei++]="there is not premission to execute $dnadist\n" if(!(-X $dnadist)); + }else + { + $error[$ei++]="dnadist is not found at $dnadist\n"; + } + if (-e $neighbor) + { + $error[$ei++]="there is not premission to execute $neighbor\n" if(!(-X $neighbor)); + }else + { + $error[$ei++]="neighbor is not found at $neighbor\n"; + } + if (-e $consense) + { + $error[$ei++]="there is not premission to execute $consense\n" if(!(-X $consense)); + }else + { + $error[$ei++]="consense is not found at $consense\n"; + } + if (-e $dnapars) + { + $error[$ei++]="there is not premission to execute $dnapars\n" if(!(-X $dnapars)); + }else + { + $error[$ei++]="dnapars is not found at $dnapars\n"; + } + } + #@$tmparray=(@$tmparray,@error); + @tmp=(@tmp,@error); + } + + +sub CheckInputFile() + { + (my $species,my $inputDIR,my $section,my $method,my $tmparray)=@_; +####cluster + if (substr($section,0,1) eq "1") + { + if ($method eq "MM") + { + @$tmparray=(@$tmparray,&chk2SEQ($species,$inputDIR)); ### check pep and nuc + @$tmparray=(@$tmparray,&chktab($species,$inputDIR,".location"));### chk pep nuc location + }else + { + @$tmparray=(@$tmparray,&chk1SEQ($species,$inputDIR)); + } + } +###CDS variation + if (substr($section,2,1) eq "1") + { + @$tmparray=(@$tmparray,&chk2SEQ($species,$inputDIR)); + } +###function analysis + if (substr($section,4,1) eq "1") + { + @$tmparray=(@$tmparray,&chktab($species,$inputDIR,".function")); + } + } + + +sub chk1SEQ() + { + (my $species,my $inputDIR)=@_; + my @error; + my $ei=0; + my $sp; + my $pepfile; + my %pep; + foreach $sp (@$species) + { + %pep=(); + $pepfile=$inputDIR.$sp.".pep"; + &ReadSequenceInToHash($pepfile,\%pep); + if (scalar(keys %pep)<2) + { + $error[$ei++]="format error in $pepfile\n"; + } + } + return @error; + } + +sub chk2SEQ() + { + (my $species,my $inputDIR)=@_; + my $sp; + my %pep; + my %nuc; + my $pepfile; + my $nucfile; + my $key; + my @error; + my $ei=0; + foreach $sp (@$species) + { + $pepfile=$inputDIR.$sp.".pep"; + $nucfile=$inputDIR.$sp.".nuc"; + %pep=(); + %nuc=(); + &ReadSequenceInToHash("$pepfile",\%pep); + &ReadSequenceInToHash("$nucfile",\%nuc); + if (scalar(keys %pep) ne scalar(keys %nuc)) + { + $error[$ei++]="Sequences number is not consistent in the following two file:\n\t$pepfile\n\t$nucfile\n"; + }else + { + foreach $key (keys %pep) + { + if (exists($nuc{$key})) + { + if (length($nuc{$key}) ne ((length($pep{$key})+1)*3)) + { + $error[$ei++]="the length of $key in $nucfile is not consistent with its corresponding protein length\n"; + } + }else + { + $error[$ei++]="$key lost in $nucfile\n"; + } + } + + foreach $key (keys %nuc) + { + if (!exists($pep{$key})) + { + $error[$ei++]="1048 $key lost in $pepfile\n"; + } + } + } + } + return @error; + } + + +sub chktab() + { + (my $species,my $inputDIR,my $extention)=@_; + my %pep; + my @row; + my $key; + my %tab; + my @error; + my $ei=0; + my $sp; + my $tabfile; + my $pepfile; + foreach $sp (@$species) + { + %tab=(); + %pep=(); + $tabfile=$inputDIR.$sp.$extention; + open(F,"$tabfile"); + while (<F>) + { + chomp(); + @row=split(/\t/,$_); + if (scalar(@row)<3) + { + $error[$ei++]="format error in $tabfile\n"; + }else + { + $tab{$row[0]}=$row[1]; + } + } + close(F); + $pepfile=$inputDIR.$sp.".pep"; + &ReadSequenceInToHash($pepfile,\%pep); + foreach $key (keys %pep) + { + if (!exists($tab{$key})) + { + $error[$ei++]="sequence $key lost infomation in $tabfile\n"; + } + } + } + return @error; + } + + + +sub ReadSequenceInToHash() + { + use Bio::SeqIO; + (my $file,my $hash)=@_; + my $seq; + my $in=Bio::SeqIO->new(-file=>"$file",-format=>"fasta"); + while ($seq=$in->next_seq()) + { + #$$hash{$id."|".$seq->id}=$seq->seq(); + $$hash{$seq->id}=$seq->seq(); + } + } + +sub ReadAlignmentToHash() + { + (my $file,my $hash)=@_; + my $name=""; + my $seq=""; + my @content; + my $line; + open(F,"$file"); + @content=<F>; + close(F); + chomp(@content); + for ($line=0;$line<@content;$line++) + { + if ($content[$line]=~/^>/) + { + if ($line>0) + { + $$hash{$name}=$seq; + $name=""; + } + + $_=$content[$line]; + s/^>//; + $name=$_; + $seq=""; + }else + { + if ($name ne "") + { + $seq=$seq.$content[$line]; + } + } + } + $$hash{$name}=$seq; + } + + + +sub Combination() + { + (my $m,my $n,my $comRef)=@_; + my $str=""; + my %hash; + my $fpos; + my $num0; + my $rest; + my $tmp; + my $i; + my $j; + my $key; + #my $m=scalar(@$array); + my @combination; + + for ($i=1;$i<=$n;$i++) + { + $str="1".$str; + } + + for ($i=1;$i<=($m-$n);$i++) + { + $str=$str."0"; + } + + $hash{$str}=1; + while ($str=~/10/) + { + $fpos=index($str,"10"); + $_=$str; + s/10/01/; + $str=$_; + $tmp=substr($str,0,$fpos); + $_=$tmp; + s/0//g; + $rest=$_; + $num0=$fpos-length($_); + for ($i=1;$i<=$num0;$i++) + { + $rest="$rest"."0"; + } + $str="$rest".substr($str,$fpos,$m-$fpos); + $hash{$str}=1; + } + $j=0; + foreach $key (keys %hash) + { + $combination[$j]=""; + for ($i=0;$i<$m;$i++) + { + if (substr($key,$i,1) eq "1") + { + if ($combination[$j] ne "") + { + #$combination[$j]=$combination[$j]."\t".$$array[$i]; + $combination[$j]=$combination[$j]."\t".$i; + }else + { + #$combination[$j]=$$array[$i]; ### For return species ID + $combination[$j]=$i; + } + } + } + $j++; + } + @$comRef=@combination; ### update the data through the physic address + } + +sub ChkCombinationValue() +{ + (my $m,my $n)=@_; + my %hash; + my %vhash; + my $value=0; + my $key; + my @row; + my @sdA; + my @sdB; + + ### initialization + $hash{$m."-".$n}=1; + + ### split combination + while (scalar(keys %hash)>0 and $value<=$sampleSize) + { + foreach $key (keys %hash) + { + if ($value > $sampleSize) ### threshold + { + last; + } + if (!exists($hash{$key})) + { + next; + } + @row=split(/-/,$key); + #print $row[0]."|".$row[1]."\n"; + if ($row[0] eq $row[1]) + { + $value=$value+$hash{$key}; + }else + { + ##split + $sdA[0]=$row[0]-1; + $sdA[1]=$row[1]; + $sdB[0]=$row[0]-1; + $sdB[1]=$row[1]-1; + ##storing A + if (($sdA[0] eq $sdA[1]) or $sdA[1] ==0) + { + $value=$value+$hash{$key}; + }else + { + if (exists($hash{$sdA[0]."-".$sdA[1]})) + { + $hash{$sdA[0]."-".$sdA[1]}=$hash{$sdA[0]."-".$sdA[1]}+$hash{$key}; + }else + { + $hash{$sdA[0]."-".$sdA[1]}=$hash{$key}; + } + } + + ##storing B + if (($sdB[0] eq $sdB[1]) or $sdB[1]==0) + { + $value=$value+$hash{$key}; + }else + { + if (exists($hash{$sdB[0]."-".$sdB[1]})) + { + $hash{$sdB[0]."-".$sdB[1]}=$hash{$sdB[0]."-".$sdB[1]}+$hash{$key}; + }else + { + $hash{$sdB[0]."-".$sdB[1]}=$hash{$key}; + } + } + } + #delete original combination + delete($hash{$key}); + } + } + + if ($value>$sampleSize) + { + return 0; + }else + { + return $value; + } +} + + +sub SampleCombination() +{ + (my $m,my $n,my $comRef)=@_; + my %hash; + my $sampleTimes=0; + my @randNum; + my @sortID; + my $i; + my $j; + my $tmp; + while ( scalar(keys %hash)<$sampleSize and $sampleTimes<($sampleSize*2)) + { + for ($i=0;$i<$m;$i++) # generate random data + { + $randNum[$i]=int(100000 * rand(100)); + $sortID[$i]=$i; + } + + for ($i=0;$i<$m;$i++) # sorting random data + { + for ($j=0;$j<$m;$j++) + { + if ($randNum[$sortID[$i]]<$randNum[$sortID[$j]]) + { + $tmp=$sortID[$i]; + $sortID[$i]=$sortID[$j]; + $sortID[$j]=$tmp; + } + } + } + + #storing data + $tmp=join("\t",sort {$a<=>$b} (splice(@sortID,0,$n))); + $hash{$tmp}=1; + $sampleTimes++; + } + @$comRef=keys %hash; +} + + +sub PanGenomeNumber() + { + (my $spID)=@_; + my $pan=0; + my $core=0; + my $count; #### counter; + my @row; + + foreach (@clusters) + { + $count=0; + @row=split(/\t/,$_); + + foreach (@$spID) + { + $count=$count+$row[$_]; + } + + if ($count>0) + { + $pan++; + if ($count == scalar(@$spID)) + { + $core++; + } + } + } + return $pan."\t".$core; + } + +sub fit_model_A() + { +### model y = A * x**B + C + (my $xdata,my $ydata)=@_; + my $i; + my $b; + my $max_B=0; + my $max_R=0; + my $max_A=0; + my $max_A_interval; + my $max_C=0; + my $max_C_interval; + my $R=1e-100; + my $start; + my $end; + my $step; + my @xValues; + my @yValues; + + $start=1; + $step=0.001; + $b=$start; + $max_R=0; + $R=1e-100; + + use Statistics::LineFit; + use Statistics::Distributions; + + while ($max_R<=$R) + { + if (($b < 0.02) and ($b >-0.02)) + { + $b=-0.02; + } + + for ($i=0;$i<@$xdata;$i++) + { + $xValues[$i]=$$xdata[$i]**$b; + } + @yValues=@$ydata; + my $lineFit = Statistics::LineFit->new(); + $lineFit->setData (\@xValues, \@yValues) or die "Invalid data"; + (my $intercept, my $slope) = $lineFit->coefficients(); + my $rSquared = $lineFit->rSquared(); + my $meanSquaredError = $lineFit->meanSqError(); + my $durbinWatson = $lineFit->durbinWatson(); + my $sigma = $lineFit->sigma(); + (my $tStatIntercept, my $tStatSlope) = $lineFit->tStatistics(); + (my $varianceIntercept,my $varianceSlope) = $lineFit->varianceOfEstimates(); + + $max_R=$R; + $R=$rSquared; + if ($max_R<=$R) + { + $max_R=$R; + ($max_C,$max_A)=$lineFit->coefficients(); + $max_A_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceSlope); + $max_C_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceIntercept); + } + $b=$b-$step; + } + $max_B=$b; + return ($max_R,$max_A,$max_A_interval,$max_B,$max_C,$max_C_interval); + + } + +sub fit_model_B() + { +### model y = A * exp(x*B) + C + (my $xdata,my $ydata)=@_; + my $i; + my $b; + my $max_B=0; + my $max_R=0; + my $max_A=0; + my $max_A_interval; + my $max_C=0; + my $max_C_interval; + my $R=1e-100; + my $start; + my $end; + my $step; + my @xValues; + my @yValues; + + $start=0; + $step=0.001; + $b=$start; + $max_R=0; + $R=1e-100; + + use Statistics::LineFit; + use Statistics::Distributions; + + while ($max_R<=$R) + { + if (($b < 0.02) and ($b >-0.02)) + { + $b=-0.02; + } + + for ($i=0;$i<@$xdata;$i++) + { + $xValues[$i]=exp($$xdata[$i]*$b); + } + @yValues=@$ydata; + my $lineFit = Statistics::LineFit->new(); + $lineFit->setData (\@xValues, \@yValues) or die "Invalid data"; + (my $intercept, my $slope) = $lineFit->coefficients(); + my $rSquared = $lineFit->rSquared(); + my $meanSquaredError = $lineFit->meanSqError(); + my $durbinWatson = $lineFit->durbinWatson(); + my $sigma = $lineFit->sigma(); + (my $tStatIntercept, my $tStatSlope) = $lineFit->tStatistics(); + (my $varianceIntercept,my $varianceSlope) = $lineFit->varianceOfEstimates(); + + $max_R=$R; + $R=$rSquared; + if ($max_R<=$R) + { + $max_R=$R; + ($max_C,$max_A)=$lineFit->coefficients(); + $max_A_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceSlope); + $max_C_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceIntercept); + } + $b=$b-$step; + } + $max_B=$b; + return ($max_R,$max_A,$max_A_interval,$max_B,$max_C,$max_C_interval); + } + + +sub ReadData2Array() + { + (my $file, my $array1,my $col1,my $array2,my $col2)=@_; + my $i=0; + open(F,$file); + $_=<F>; + while (<F>) + { + chomp(); + @_=split(/\t/,$_); + $$array1[$i]=$_[$col1]; + $$array2[$i]=$_[$col2]; + $i++; + } + close(F); + } + +sub SumData() + { + (my $xdata,my $ydata,my $SumMethod)=@_; + my %hash; + my $i; + my $key; + my $max=0; + for ($i=0;$i<@$xdata;$i++) + { + if (exists($hash{$$xdata[$i]})) + { + $hash{$$xdata[$i]}=$hash{$$xdata[$i]}." ".$$ydata[$i]; + }else + { + $hash{$$xdata[$i]}=$$ydata[$i]; + if ($$xdata[$i]>$max) + { + $max=$$xdata[$i]; + } + } + } + @$xdata=qw(); + @$ydata=qw(); + $i=0; + foreach $i (1..$max) + { + $$xdata[$i-1]=$i; + if ($SumMethod eq "median") + { + $$ydata[$i-1]=&median($hash{$i}); + }else + { + $$ydata[$i-1]=&mean($hash{$i}); + } + } + #print join(",",@$xdata)."\n"; + #print join(",",@$ydata)."\n"; + } + +sub median() + { + (my $data)=@_; + my @data=split(/ /,$data); + my $arraylen=scalar(@data); + @data=sort{$a<=>$b} @data; + if (int($arraylen/2)*2 == $arraylen) + { + return ($data[$arraylen/2]+$data[$arraylen/2-1])/2; + }else + { + return $data[int($arraylen/2)]; + } + } + +sub mean() + { + (my $data)=@_; + my @data=split(/ /,$data); + my $sum=0; + foreach (@data) + { + $sum=$sum+$_; + } + return int(($sum/scalar(@data))*1000)/1000; + } + +sub ReplaceName() + { + (my $sp,my $file)=@_; + my @content; + my $line; + my $i; + my $target; + open(F,$file); + @content=<F>; + close(F); + for ($line=0;$line<@content;$line++) + { + for ($i=0;$i<@$sp;$i++) + { + $_=$content[$line]; + $target="sp".$i."sp"; + s/$target/$$sp[$i]/; + $content[$line]=$_; + } + } + open(R,">$file"); + print R @content; + close(R); + } + +sub MP_Result_to_Table() + { + (my $MPresult, my $outputfile)=@_; + my %hash; + my $maxid=0; + my $i; + my @row; + + open(F,"$MPresult"); + $_=<F>; + while (<F>) + { + @row=split(/\t/,$_); + if (exists($hash{$row[0]})) + { + $hash{$row[0]}=$hash{$row[0]}."\t".$row[2]; + }else + { + $hash{$row[0]}=$row[2]; + if ($row[0]>$maxid) + { + $maxid=$row[0]; + } + } + } + close(F); + + open(R,">$outputfile"); + foreach $i (1..$maxid) + { + print R $hash{$i}."\n"; + } + close(R); + } + + + +sub FormatCluster() + { + (my $infile,my $genelist,my $spnum,my $cluster)=@_; + my %hash; + my %gene; + my $key; + my @row; + my $sp; + my $line; + my $i=0; + my $j=0; + my @content; + +### record gene in clusters + open(F,"$infile"); + @content=<F>; + close(F); + chomp(@content); + for ($line=0;$line<@content;$line++) + { + @row=split(/\t/,$content[$line]); + foreach $key (@row) + { + $gene{$key}=1; + } + } +###retrieves gene which is not in clutsers + + open(F,"$genelist"); + while ($key=<F>) + { + if ($key=~/^>/) + { + chomp($key); + $_=$key; + s/^>//; + $key=$_; + if (!exists($gene{$key})) + { + $content[$line]=$key; + $line++; + } + } + } + close(F); + +#### initialization @cluster + @$cluster=qw(); + $j=0; + + foreach $line (@content) + { + if ($line ne "") + { + %hash=(); + @row=split(/\t/,$line); + foreach $key (@row) + { + $sp=substr($key,0,index($key,"G")); + $gene{$key}=1; + if (exists($hash{$sp})) + { + $hash{$sp}=$hash{$sp}.",".$key; + }else + { + $hash{$sp}=$key; + } + } + + $i=0; + @row=qw(); + + foreach $i (0..($spnum-1)) + { + if (exists($hash{"S$i"})) + { + $row[$i]=$hash{"S$i"}; + }else + { + $row[$i]="-"; + } + } + $$cluster[$j++]=join("\t",@row); + } + } + } + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/README Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,24 @@ +-=- PGAP 1.2.1 README -=- +PGAP is a Pan-Genomes Analysis Pipeline. + +For the installation and usage of PGAP, please refer to manual.pdf. + +There is also a brief tutorial in manual.pdf. + +If there are any problems or suggestions in using PGAP, please mail to PGAP@big.ac.cn + +-=- Change Log 1.2.1 -=- +1.Fix the bug in Converter_NCBINewFormatData.pl. + +-=- Change Log 1.2.0 -=- +1.A new script (Converter_NCBINewFormatData.pl) was developed for converting new format bacterial genomic data from NCBI at ftp://ftp.ncbi.nlm.nih.gov/genomes/all + +-=- Change Log 1.12 -=- +1.Fix the bug in the Perl converter Converter_draft.pl. + +-=- Change Log 1.11 -=- +1.Fix the bug in the two Perl converters (Converter_finished.pl and Converter_draft.pl). + +-=- Change Log 1.1 -=- +1.Add sampling strategy into the pan-genome analysis module. This strategy will efficiently reduce time cost and memory cost. +2.Revise the input data for SNP-based phylogenetic analysis
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/Statistics/Distributions.pm Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,529 @@ +package Statistics::Distributions; + +use strict; +use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); +use constant PI => 3.1415926536; +use constant SIGNIFICANT => 5; # number of significant digits to be returned + +require Exporter; + +@ISA = qw(Exporter AutoLoader); +# Items to export into callers namespace by default. Note: do not export +# names by default without a very good reason. Use EXPORT_OK instead. +# Do not simply export all your public functions/methods/constants. +@EXPORT_OK = qw(chisqrdistr tdistr fdistr udistr uprob chisqrprob tprob fprob); +$VERSION = '1.02'; + +# Preloaded methods go here. + +sub chisqrdistr { # Percentage points X^2(x^2,n) + my ($n, $p) = @_; + if ($n <= 0 || abs($n) - abs(int($n)) != 0) { + die "Invalid n: $n\n"; # degree of freedom + } + if ($p <= 0 || $p > 1) { + die "Invalid p: $p\n"; + } + return precision_string(_subchisqr($n, $p)); +} + +sub udistr { # Percentage points N(0,1^2) + my ($p) = (@_); + if ($p > 1 || $p <= 0) { + die "Invalid p: $p\n"; + } + return precision_string(_subu($p)); +} + +sub tdistr { # Percentage points t(x,n) + my ($n, $p) = @_; + if ($n <= 0 || abs($n) - abs(int($n)) != 0) { + die "Invalid n: $n\n"; + } + if ($p <= 0 || $p >= 1) { + die "Invalid p: $p\n"; + } + return precision_string(_subt($n, $p)); +} + +sub fdistr { # Percentage points F(x,n1,n2) + my ($n, $m, $p) = @_; + if (($n<=0) || ((abs($n)-(abs(int($n))))!=0)) { + die "Invalid n: $n\n"; # first degree of freedom + } + if (($m<=0) || ((abs($m)-(abs(int($m))))!=0)) { + die "Invalid m: $m\n"; # second degree of freedom + } + if (($p<=0) || ($p>1)) { + die "Invalid p: $p\n"; + } + return precision_string(_subf($n, $m, $p)); +} + +sub uprob { # Upper probability N(0,1^2) + my ($x) = @_; + return precision_string(_subuprob($x)); +} + +sub chisqrprob { # Upper probability X^2(x^2,n) + my ($n,$x) = @_; + if (($n <= 0) || ((abs($n) - (abs(int($n)))) != 0)) { + die "Invalid n: $n\n"; # degree of freedom + } + return precision_string(_subchisqrprob($n, $x)); +} + +sub tprob { # Upper probability t(x,n) + my ($n, $x) = @_; + if (($n <= 0) || ((abs($n) - abs(int($n))) !=0)) { + die "Invalid n: $n\n"; # degree of freedom + } + return precision_string(_subtprob($n, $x)); +} + +sub fprob { # Upper probability F(x,n1,n2) + my ($n, $m, $x) = @_; + if (($n<=0) || ((abs($n)-(abs(int($n))))!=0)) { + die "Invalid n: $n\n"; # first degree of freedom + } + if (($m<=0) || ((abs($m)-(abs(int($m))))!=0)) { + die "Invalid m: $m\n"; # second degree of freedom + } + return precision_string(_subfprob($n, $m, $x)); +} + + +sub _subfprob { + my ($n, $m, $x) = @_; + my $p; + + if ($x<=0) { + $p=1; + } elsif ($m % 2 == 0) { + my $z = $m / ($m + $n * $x); + my $a = 1; + for (my $i = $m - 2; $i >= 2; $i -= 2) { + $a = 1 + ($n + $i - 2) / $i * $z * $a; + } + $p = 1 - ((1 - $z) ** ($n / 2) * $a); + } elsif ($n % 2 == 0) { + my $z = $n * $x / ($m + $n * $x); + my $a = 1; + for (my $i = $n - 2; $i >= 2; $i -= 2) { + $a = 1 + ($m + $i - 2) / $i * $z * $a; + } + $p = (1 - $z) ** ($m / 2) * $a; + } else { + my $y = atan2(sqrt($n * $x / $m), 1); + my $z = sin($y) ** 2; + my $a = ($n == 1) ? 0 : 1; + for (my $i = $n - 2; $i >= 3; $i -= 2) { + $a = 1 + ($m + $i - 2) / $i * $z * $a; + } + my $b = PI; + for (my $i = 2; $i <= $m - 1; $i += 2) { + $b *= ($i - 1) / $i; + } + my $p1 = 2 / $b * sin($y) * cos($y) ** $m * $a; + + $z = cos($y) ** 2; + $a = ($m == 1) ? 0 : 1; + for (my $i = $m-2; $i >= 3; $i -= 2) { + $a = 1 + ($i - 1) / $i * $z * $a; + } + $p = max(0, $p1 + 1 - 2 * $y / PI + - 2 / PI * sin($y) * cos($y) * $a); + } + return $p; +} + + +sub _subchisqrprob { + my ($n,$x) = @_; + my $p; + + if ($x <= 0) { + $p = 1; + } elsif ($n > 100) { + $p = _subuprob((($x / $n) ** (1/3) + - (1 - 2/9/$n)) / sqrt(2/9/$n)); + } elsif ($x > 400) { + $p = 0; + } else { + my ($a, $i, $i1); + if (($n % 2) != 0) { + $p = 2 * _subuprob(sqrt($x)); + $a = sqrt(2/PI) * exp(-$x/2) / sqrt($x); + $i1 = 1; + } else { + $p = $a = exp(-$x/2); + $i1 = 2; + } + + for ($i = $i1; $i <= ($n-2); $i += 2) { + $a *= $x / $i; + $p += $a; + } + } + return $p; +} + +sub _subu { + my ($p) = @_; + my $y = -log(4 * $p * (1 - $p)); + my $x = sqrt( + $y * (1.570796288 + + $y * (.03706987906 + + $y * (-.8364353589E-3 + + $y *(-.2250947176E-3 + + $y * (.6841218299E-5 + + $y * (0.5824238515E-5 + + $y * (-.104527497E-5 + + $y * (.8360937017E-7 + + $y * (-.3231081277E-8 + + $y * (.3657763036E-10 + + $y *.6936233982E-12))))))))))); + $x = -$x if ($p>.5); + return $x; +} + +sub _subuprob { + my ($x) = @_; + my $p = 0; # if ($absx > 100) + my $absx = abs($x); + + if ($absx < 1.9) { + $p = (1 + + $absx * (.049867347 + + $absx * (.0211410061 + + $absx * (.0032776263 + + $absx * (.0000380036 + + $absx * (.0000488906 + + $absx * .000005383)))))) ** -16/2; + } elsif ($absx <= 100) { + for (my $i = 18; $i >= 1; $i--) { + $p = $i / ($absx + $p); + } + $p = exp(-.5 * $absx * $absx) + / sqrt(2 * PI) / ($absx + $p); + } + + $p = 1 - $p if ($x<0); + return $p; +} + + +sub _subt { + my ($n, $p) = @_; + + if ($p >= 1 || $p <= 0) { + die "Invalid p: $p\n"; + } + + if ($p == 0.5) { + return 0; + } elsif ($p < 0.5) { + return - _subt($n, 1 - $p); + } + + my $u = _subu($p); + my $u2 = $u ** 2; + + my $a = ($u2 + 1) / 4; + my $b = ((5 * $u2 + 16) * $u2 + 3) / 96; + my $c = (((3 * $u2 + 19) * $u2 + 17) * $u2 - 15) / 384; + my $d = ((((79 * $u2 + 776) * $u2 + 1482) * $u2 - 1920) * $u2 - 945) + / 92160; + my $e = (((((27 * $u2 + 339) * $u2 + 930) * $u2 - 1782) * $u2 - 765) * $u2 + + 17955) / 368640; + + my $x = $u * (1 + ($a + ($b + ($c + ($d + $e / $n) / $n) / $n) / $n) / $n); + + if ($n <= log10($p) ** 2 + 3) { + my $round; + do { + my $p1 = _subtprob($n, $x); + my $n1 = $n + 1; + my $delta = ($p1 - $p) + / exp(($n1 * log($n1 / ($n + $x * $x)) + + log($n/$n1/2/PI) - 1 + + (1/$n1 - 1/$n) / 6) / 2); + $x += $delta; + $round = sprintf("%.".abs(int(log10(abs $x)-4))."f",$delta); + } while (($x) && ($round != 0)); + } + return $x; +} + +sub _subtprob { + my ($n, $x) = @_; + + my ($a,$b); + my $w = atan2($x / sqrt($n), 1); + my $z = cos($w) ** 2; + my $y = 1; + + for (my $i = $n-2; $i >= 2; $i -= 2) { + $y = 1 + ($i-1) / $i * $z * $y; + } + + if ($n % 2 == 0) { + $a = sin($w)/2; + $b = .5; + } else { + $a = ($n == 1) ? 0 : sin($w)*cos($w)/PI; + $b= .5 + $w/PI; + } + return max(0, 1 - $b - $a * $y); +} + +sub _subf { + my ($n, $m, $p) = @_; + my $x; + + if ($p >= 1 || $p <= 0) { + die "Invalid p: $p\n"; + } + + if ($p == 1) { + $x = 0; + } elsif ($m == 1) { + $x = 1 / (_subt($n, 0.5 - $p / 2) ** 2); + } elsif ($n == 1) { + $x = _subt($m, $p/2) ** 2; + } elsif ($m == 2) { + my $u = _subchisqr($m, 1 - $p); + my $a = $m - 2; + $x = 1 / ($u / $m * (1 + + (($u - $a) / 2 + + (((4 * $u - 11 * $a) * $u + $a * (7 * $m - 10)) / 24 + + (((2 * $u - 10 * $a) * $u + $a * (17 * $m - 26)) * $u + - $a * $a * (9 * $m - 6) + )/48/$n + )/$n + )/$n)); + } elsif ($n > $m) { + $x = 1 / _subf2($m, $n, 1 - $p) + } else { + $x = _subf2($n, $m, $p) + } + return $x; +} + +sub _subf2 { + my ($n, $m, $p) = @_; + my $u = _subchisqr($n, $p); + my $n2 = $n - 2; + my $x = $u / $n * + (1 + + (($u - $n2) / 2 + + (((4 * $u - 11 * $n2) * $u + $n2 * (7 * $n - 10)) / 24 + + (((2 * $u - 10 * $n2) * $u + $n2 * (17 * $n - 26)) * $u + - $n2 * $n2 * (9 * $n - 6)) / 48 / $m) / $m) / $m); + my $delta; + do { + my $z = exp( + (($n+$m) * log(($n+$m) / ($n * $x + $m)) + + ($n - 2) * log($x) + + log($n * $m / ($n+$m)) + - log(4 * PI) + - (1/$n + 1/$m - 1/($n+$m))/6 + )/2); + $delta = (_subfprob($n, $m, $x) - $p) / $z; + $x += $delta; + } while (abs($delta)>3e-4); + return $x; +} + +sub _subchisqr { + my ($n, $p) = @_; + my $x; + + if (($p > 1) || ($p <= 0)) { + die "Invalid p: $p\n"; + } elsif ($p == 1){ + $x = 0; + } elsif ($n == 1) { + $x = _subu($p / 2) ** 2; + } elsif ($n == 2) { + $x = -2 * log($p); + } else { + my $u = _subu($p); + my $u2 = $u * $u; + + $x = max(0, $n + sqrt(2 * $n) * $u + + 2/3 * ($u2 - 1) + + $u * ($u2 - 7) / 9 / sqrt(2 * $n) + - 2/405 / $n * ($u2 * (3 *$u2 + 7) - 16)); + + if ($n <= 100) { + my ($x0, $p1, $z); + do { + $x0 = $x; + if ($x < 0) { + $p1 = 1; + } elsif ($n>100) { + $p1 = _subuprob((($x / $n)**(1/3) - (1 - 2/9/$n)) + / sqrt(2/9/$n)); + } elsif ($x>400) { + $p1 = 0; + } else { + my ($i0, $a); + if (($n % 2) != 0) { + $p1 = 2 * _subuprob(sqrt($x)); + $a = sqrt(2/PI) * exp(-$x/2) / sqrt($x); + $i0 = 1; + } else { + $p1 = $a = exp(-$x/2); + $i0 = 2; + } + + for (my $i = $i0; $i <= $n-2; $i += 2) { + $a *= $x / $i; + $p1 += $a; + } + } + $z = exp((($n-1) * log($x/$n) - log(4*PI*$x) + + $n - $x - 1/$n/6) / 2); + $x += ($p1 - $p) / $z; + $x = sprintf("%.5f", $x); + } while (($n < 31) && (abs($x0 - $x) > 1e-4)); + } + } + return $x; +} + +sub log10 { + my $n = shift; + return log($n) / log(10); +} + +sub max { + my $max = shift; + my $next; + while (@_) { + $next = shift; + $max = $next if ($next > $max); + } + return $max; +} + +sub min { + my $min = shift; + my $next; + while (@_) { + $next = shift; + $min = $next if ($next < $min); + } + return $min; +} + +sub precision { + my ($x) = @_; + return abs int(log10(abs $x) - SIGNIFICANT); +} + +sub precision_string { + my ($x) = @_; + if ($x) { + return sprintf "%." . precision($x) . "f", $x; + } else { + return "0"; + } +} + + +# Autoload methods go after =cut, and are processed by the autosplit program. + +1; +__END__ +# Below is the stub of documentation for your module. You better edit it! + +=head1 NAME + +Statistics::Distributions - Perl module for calculating critical values and upper probabilities of common statistical distributions + +=head1 SYNOPSIS + + use Statistics::Distributions; + + $chis=Statistics::Distributions::chisqrdistr (2,.05); + print "Chi-squared-crit (2 degrees of freedom, 95th percentile " + ."= 0.05 level) = $chis\n"; + + $u=Statistics::Distributions::udistr (.05); + print "u-crit (95th percentile = 0.05 level) = $u\n"; + + $t=Statistics::Distributions::tdistr (1,.005); + print "t-crit (1 degree of freedom, 99.5th percentile = 0.005 level) " + ."= $t\n"; + + $f=Statistics::Distributions::fdistr (1,3,.01); + print "F-crit (1 degree of freedom in numerator, 3 degrees of freedom " + ."in denominator, 99th percentile = 0.01 level) = $f\n"; + + $uprob=Statistics::Distributions::uprob (-0.85); + print "upper probability of the u distribution (u = -0.85): Q(u) " + ."= 1-G(u) = $uprob\n"; + + $chisprob=Statistics::Distributions::chisqrprob (3,6.25); + print "upper probability of the chi-square distribution (3 degrees " + ."of freedom, chi-squared = 6.25): Q = 1-G = $chisprob\n"; + + $tprob=Statistics::Distributions::tprob (3,6.251); + print "upper probability of the t distribution (3 degrees of " + ."freedom, t = 6.251): Q = 1-G = $tprob\n"; + + $fprob=Statistics::Distributions::fprob (3,5,.625); + print "upper probability of the F distribution (3 degrees of freedom " + ."in numerator, 5 degrees of freedom in denominator, F = 6.25): " + ."Q = 1-G = $fprob\n"; + +=head1 DESCRIPTION + +This Perl module calculates percentage points (5 significant digits) of the u (standard normal) distribution, the student's t distribution, the chi-square distribution and the F distribution. It can also calculate the upper probability (5 significant digits) of the u (standard normal), the chi-square, the t and the F distribution. +These critical values are needed to perform statistical tests, like the u test, the t test, the F test and the chi-squared test, and to calculate confidence intervals. + +If you are interested in more precise algorithms you could look at: + StatLib: http://lib.stat.cmu.edu/apstat/ ; + Applied Statistics Algorithms by Griffiths, P. and Hill, I.D., Ellis Horwood: Chichester (1985) + +=head1 BUGS + +This final version 1.02 has been released after more than one year without a bug report on the previous version 0.07. +Nevertheless, if you find any bugs or oddities, please do inform the author. + +=head1 INSTALLATION + +See perlmodinstall for information and options on installing Perl modules. + +=head1 AVAILABILITY + +The latest version of this module is available from the Distribution Perl Archive Network (CPAN). Please visit http://www.cpan.org/ to find a CPAN site near you or see http://www.cpan.org/authors/id/M/MI/MIKEK/ . + +=head1 AUTHOR + +Michael Kospach <mike.perl@gmx.at> + +Nice formating, simplification and bug repair by Matthias Trautner Kromann <mtk@id.cbs.dk> + +=head1 COPYRIGHT + +Copyright 2003 Michael Kospach. All rights reserved. + +This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. + +=head1 SEE ALSO + +Statistics::ChiSquare, Statistics::Table::t, Statistics::Table::F, perl(1). + +=cut + + + + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/Statistics/LineFit.pm Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,745 @@ +package Statistics::LineFit; +use strict; +use Carp qw(carp); +BEGIN { + use Exporter (); + use vars qw ($AUTHOR $VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); + $AUTHOR = 'Richard Anderson <cpan(AT)richardanderson(DOT)org>'; + @EXPORT = @EXPORT_OK = qw(); + %EXPORT_TAGS = (); + @ISA = qw(Exporter); + $VERSION = 0.06; +} + +sub new { +# +# Purpose: Create a new Statistics::LineFit object +# + my ($caller, $validate, $hush) = @_; + my $self = { doneRegress => 0, + gotData => 0, + hush => defined $hush ? $hush : 0, + validate => defined $validate ? $validate : 0, + }; + bless $self, ref($caller) || $caller; + return $self; +} + +sub coefficients { +# +# Purpose: Return the slope and intercept from least squares line fit +# + my $self = shift; + unless (defined $self->{intercept} and defined $self->{slope}) { + $self->regress() or return; + } + return ($self->{intercept}, $self->{slope}); +} + +sub computeSums { +# +# Purpose: Compute sum of x, y, x**2, y**2 and x*y (private method) +# + my $self = shift; + my ($sumX, $sumY, $sumXX, $sumYY, $sumXY) = (0, 0, 0, 0, 0); + if (defined $self->{weight}) { + for (my $i = 0; $i < $self->{numXY}; ++$i) { + $sumX += $self->{weight}[$i] * $self->{x}[$i]; + $sumY += $self->{weight}[$i] * $self->{y}[$i]; + $sumXX += $self->{weight}[$i] * $self->{x}[$i] ** 2; + $sumYY += $self->{weight}[$i] * $self->{y}[$i] ** 2; + $sumXY += $self->{weight}[$i] * $self->{x}[$i] + * $self->{y}[$i]; + } + } else { + for (my $i = 0; $i < $self->{numXY}; ++$i) { + $sumX += $self->{x}[$i]; + $sumY += $self->{y}[$i]; + $sumXX += $self->{x}[$i] ** 2; + $sumYY += $self->{y}[$i] ** 2; + $sumXY += $self->{x}[$i] * $self->{y}[$i]; + } + } + return ($sumX, $sumY, $sumXX, $sumYY, $sumXY); +} + +sub durbinWatson { +# +# Purpose: Return the Durbin-Watson statistic +# + my $self = shift; + unless (defined $self->{durbinWatson}) { + $self->regress() or return; + my $sumErrDiff = 0; + my $errorTMinus1 = $self->{y}[0] - ($self->{intercept} + $self->{slope} + * $self->{x}[0]); + for (my $i = 1; $i < $self->{numXY}; ++$i) { + my $error = $self->{y}[$i] - ($self->{intercept} + $self->{slope} + * $self->{x}[$i]); + $sumErrDiff += ($error - $errorTMinus1) ** 2; + $errorTMinus1 = $error; + } + $self->{durbinWatson} = $self->sumSqErrors() > 0 ? + $sumErrDiff / $self->sumSqErrors() : 0; + } + return $self->{durbinWatson}; +} + +sub meanSqError { +# +# Purpose: Return the mean squared error +# + my $self = shift; + unless (defined $self->{meanSqError}) { + $self->regress() or return; + $self->{meanSqError} = $self->sumSqErrors() / $self->{numXY}; + } + return $self->{meanSqError}; +} + +sub predictedYs { +# +# Purpose: Return the predicted y values +# + my $self = shift; + unless (defined $self->{predictedYs}) { + $self->regress() or return; + $self->{predictedYs} = []; + for (my $i = 0; $i < $self->{numXY}; ++$i) { + $self->{predictedYs}[$i] = $self->{intercept} + + $self->{slope} * $self->{x}[$i]; + } + } + return @{$self->{predictedYs}}; +} + +sub regress { +# +# Purpose: Do weighted or unweighted least squares 2-D line fit (if needed) +# +# Description: +# The equations below apply to both the weighted and unweighted fit: the +# weights are normalized in setWeights(), so the sum of the weights is +# equal to numXY. +# + my $self = shift; + return $self->{regressOK} if $self->{doneRegress}; + unless ($self->{gotData}) { + carp "No valid data input - can't do regression" unless $self->{hush}; + return 0; + } + my ($sumX, $sumY, $sumYY, $sumXY); + ($sumX, $sumY, $self->{sumXX}, $sumYY, $sumXY) = $self->computeSums(); + $self->{sumSqDevX} = $self->{sumXX} - $sumX ** 2 / $self->{numXY}; + if ($self->{sumSqDevX} != 0) { + $self->{sumSqDevY} = $sumYY - $sumY ** 2 / $self->{numXY}; + $self->{sumSqDevXY} = $sumXY - $sumX * $sumY / $self->{numXY}; + $self->{slope} = $self->{sumSqDevXY} / $self->{sumSqDevX}; + $self->{intercept} = ($sumY - $self->{slope} * $sumX) / $self->{numXY}; + $self->{regressOK} = 1; + } else { + carp "Can't fit line when x values are all equal" unless $self->{hush}; + $self->{sumXX} = $self->{sumSqDevX} = undef; + $self->{regressOK} = 0; + } + $self->{doneRegress} = 1; + return $self->{regressOK}; +} + +sub residuals { +# +# Purpose: Return the predicted Y values minus the observed Y values +# + my $self = shift; + unless (defined $self->{residuals}) { + $self->regress() or return; + $self->{residuals} = []; + for (my $i = 0; $i < $self->{numXY}; ++$i) { + $self->{residuals}[$i] = $self->{y}[$i] - ($self->{intercept} + + $self->{slope} * $self->{x}[$i]); + } + } + return @{$self->{residuals}}; +} + +sub rSquared { +# +# Purpose: Return the correlation coefficient +# + my $self = shift; + unless (defined $self->{rSquared}) { + $self->regress() or return; + my $denom = $self->{sumSqDevX} * $self->{sumSqDevY}; + $self->{rSquared} = $denom != 0 ? $self->{sumSqDevXY} ** 2 / $denom : 1; + } + return $self->{rSquared}; +} + +sub setData { +# +# Purpose: Initialize (x,y) values and optional weights +# + my ($self, $x, $y, $weights) = @_; + $self->{doneRegress} = 0; + $self->{x} = $self->{y} = $self->{numXY} = $self->{weight} + = $self->{intercept} = $self->{slope} = $self->{rSquared} + = $self->{sigma} = $self->{durbinWatson} = $self->{meanSqError} + = $self->{sumSqErrors} = $self->{tStatInt} = $self->{tStatSlope} + = $self->{predictedYs} = $self->{residuals} = $self->{sumXX} + = $self->{sumSqDevX} = $self->{sumSqDevY} = $self->{sumSqDevXY} + = undef; + if (@$x < 2) { + carp "Must input more than one data point!" unless $self->{hush}; + return 0; + } + $self->{numXY} = @$x; + if (ref $x->[0]) { + $self->setWeights($y) or return 0; + $self->{x} = [ ]; + $self->{y} = [ ]; + foreach my $xy (@$x) { + push @{$self->{x}}, $xy->[0]; + push @{$self->{y}}, $xy->[1]; + } + } else { + if (@$x != @$y) { + carp "Length of x and y arrays must be equal!" unless $self->{hush}; + return 0; + } + $self->setWeights($weights) or return 0; + $self->{x} = [ @$x ]; + $self->{y} = [ @$y ]; + } + if ($self->{validate}) { + unless ($self->validData()) { + $self->{x} = $self->{y} = $self->{weights} = $self->{numXY} = undef; + return 0; + } + } + $self->{gotData} = 1; + return 1; +} + +sub setWeights { +# +# Purpose: Normalize and initialize line fit weighting factors (private method) +# + my ($self, $weights) = @_; + return 1 unless defined $weights; + if (@$weights != $self->{numXY}) { + carp "Length of weight array must equal length of data array!" + unless $self->{hush}; + return 0; + } + if ($self->{validate}) { $self->validWeights($weights) or return 0 } + my $sumW = my $numNonZero = 0; + foreach my $weight (@$weights) { + if ($weight < 0) { + carp "Weights must be non-negative numbers!" unless $self->{hush}; + return 0; + } + $sumW += $weight; + if ($weight != 0) { ++$numNonZero } + } + if ($numNonZero < 2) { + carp "At least two weights must be nonzero!" unless $self->{hush}; + return 0; + } + my $factor = @$weights / $sumW; + foreach my $weight (@$weights) { $weight *= $factor } + $self->{weight} = [ @$weights ]; + return 1; +} + +sub sigma { +# +# Purpose: Return the estimated homoscedastic standard deviation of the +# error term +# + my $self = shift; + unless (defined $self->{sigma}) { + $self->regress() or return; + $self->{sigma} = $self->{numXY} > 2 ? + sqrt($self->sumSqErrors() / ($self->{numXY} - 2)) : 0; + } + return $self->{sigma}; +} + +sub sumSqErrors { +# +# Purpose: Return the sum of the squared errors (private method) +# + my $self = shift; + unless (defined $self->{sumSqErrors}) { + $self->regress() or return; + $self->{sumSqErrors} = $self->{sumSqDevY} - $self->{sumSqDevX} + * $self->{slope} ** 2; + if ($self->{sumSqErrors} < 0) { $self->{sumSqErrors} = 0 } + } + return $self->{sumSqErrors}; +} + +sub tStatistics { +# +# Purpose: Return the T statistics +# + my $self = shift; + unless (defined $self->{tStatInt} and defined $self->{tStatSlope}) { + $self->regress() or return; + my $biasEstimateInt = $self->sigma() * sqrt($self->{sumXX} + / ($self->{sumSqDevX} * $self->{numXY})); + $self->{tStatInt} = $biasEstimateInt != 0 ? + $self->{intercept} / $biasEstimateInt : 0; + my $biasEstimateSlope = $self->sigma() / sqrt($self->{sumSqDevX}); + $self->{tStatSlope} = $biasEstimateSlope != 0 ? + $self->{slope} / $biasEstimateSlope : 0; + } + return ($self->{tStatInt}, $self->{tStatSlope}); +} + +sub validData { +# +# Purpose: Verify that the input x-y data are numeric (private method) +# + my $self = shift; + for (my $i = 0; $i < $self->{numXY}; ++$i) { + if (not defined $self->{x}[$i]) { + carp "Input x[$i] is not defined" unless $self->{hush}; + return 0; + } + if ($self->{x}[$i] !~ + /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) + { + carp "Input x[$i] is not a number: $self->{x}[$i]" + unless $self->{hush}; + return 0; + } + if (not defined $self->{y}[$i]) { + carp "Input y[$i] is not defined" unless $self->{hush}; + return 0; + } + if ($self->{y}[$i] !~ + /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) + { + carp "Input y[$i] is not a number: $self->{y}[$i]" + unless $self->{hush}; + return 0; + } + } + return 1; +} + +sub validWeights { +# +# Purpose: Verify that the input weights are numeric (private method) +# + my ($self, $weights) = @_; + for (my $i = 0; $i < @$weights; ++$i) { + if (not defined $weights->[$i]) { + carp "Input weights[$i] is not defined" unless $self->{hush}; + return 0; + } + if ($weights->[$i] + !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) + { + carp "Input weights[$i] is not a number: $weights->[$i]" + unless $self->{hush}; + return 0; + } + } + return 1; +} + +sub varianceOfEstimates { +# +# Purpose: Return the variances in the estimates of the intercept and slope +# + my $self = shift; + unless (defined $self->{intercept} and defined $self->{slope}) { + $self->regress() or return; + } + my @predictedYs = $self->predictedYs(); + my ($s, $sx, $sxx) = (0, 0, 0); + if (defined $self->{weight}) { + for (my $i = 0; $i < $self->{numXY}; ++$i) { + my $variance = ($predictedYs[$i] - $self->{y}[$i]) ** 2; + next if 0 == $variance; + $s += 1.0 / $variance; + $sx += $self->{weight}[$i] * $self->{x}[$i] / $variance; + $sxx += $self->{weight}[$i] * $self->{x}[$i] ** 2 / $variance; + } + } else { + for (my $i = 0; $i < $self->{numXY}; ++$i) { + my $variance = ($predictedYs[$i] - $self->{y}[$i]) ** 2; + next if 0 == $variance; + $s += 1.0 / $variance; + $sx += $self->{x}[$i] / $variance; + $sxx += $self->{x}[$i] ** 2 / $variance; + } + } + my $denominator = ($s * $sxx - $sx ** 2); + if (0 == $denominator) { + return; + } else { + return ($sxx / $denominator, $s / $denominator); + } +} + +1; + +__END__ + +=head1 NAME + +Statistics::LineFit - Least squares line fit, weighted or unweighted + +=head1 SYNOPSIS + + use Statistics::LineFit; + $lineFit = Statistics::LineFit->new(); + $lineFit->setData (\@xValues, \@yValues) or die "Invalid data"; + ($intercept, $slope) = $lineFit->coefficients(); + defined $intercept or die "Can't fit line if x values are all equal"; + $rSquared = $lineFit->rSquared(); + $meanSquaredError = $lineFit->meanSqError(); + $durbinWatson = $lineFit->durbinWatson(); + $sigma = $lineFit->sigma(); + ($tStatIntercept, $tStatSlope) = $lineFit->tStatistics(); + @predictedYs = $lineFit->predictedYs(); + @residuals = $lineFit->residuals(); + (varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates(); + +=head1 DESCRIPTION + +The Statistics::LineFit module does weighted or unweighted least-squares +line fitting to two-dimensional data (y = a + b * x). (This is also called +linear regression.) In addition to the slope and y-intercept, the module +can return the square of the correlation coefficient (R squared), the +Durbin-Watson statistic, the mean squared error, sigma, the t statistics, +the variance of the estimates of the slope and y-intercept, +the predicted y values and the residuals of the y values. (See the METHODS +section for a description of these statistics.) + +The module accepts input data in separate x and y arrays or a single +2-D array (an array of arrayrefs). The optional weights are input in a +separate array. The module can optionally verify that the input data and +weights are valid numbers. If weights are input, the line fit minimizes +the weighted sum of the squared errors and the following statistics are +weighted: the correlation coefficient, the Durbin-Watson statistic, the +mean squared error, sigma and the t statistics. + +The module is state-oriented and caches its results. Once you call the +setData() method, you can call the other methods in any order or call a +method several times without invoking redundant calculations. After calling +setData(), you can modify the input data or weights without affecting the +module's results. + +The decision to use or not use weighting could be made using your a +priori knowledge of the data or using supplemental data. If the data is +sparse or contains non-random noise, weighting can degrade the solution. +Weighting is a good option if some points are suspect or less relevant (e.g., +older terms in a time series, points that are known to have more noise). + +=head1 ALGORITHM + +The least-square line is the line that minimizes the sum of the squares +of the y residuals: + + Minimize SUM((y[i] - (a + b * x[i])) ** 2) + +Setting the parial derivatives of a and b to zero yields a solution that +can be expressed in terms of the means, variances and covariances of x and y: + + b = SUM((x[i] - meanX) * (y[i] - meanY)) / SUM((x[i] - meanX) ** 2) + + a = meanY - b * meanX + +Note that a and b are undefined if all the x values are the same. + +If you use weights, each term in the above sums is multiplied by the +value of the weight for that index. The program normalizes the weights +(after copying the input values) so that the sum of the weights equals +the number of points. This minimizes the differences between the weighted +and unweighted equations. + +Statistics::LineFit uses equations that are mathematically equivalent to +the above equations and computationally more efficient. The module runs +in O(N) (linear time). + +=head1 LIMITATIONS + +The regression fails if the input x values are all equal or the only unequal +x values have zero weights. This is an inherent limit to fitting a line of +the form y = a + b * x. In this case, the module issues an error message +and methods that return statistical values will return undefined values. +You can also use the return value of the regress() method to check the +status of the regression. + +As the sum of the squared deviations of the x values approaches zero, +the module's results becomes sensitive to the precision of floating point +operations on the host system. + +If the x values are not all the same and the apparent "best fit" line is +vertical, the module will fit a horizontal line. For example, an input of +(1, 1), (1, 7), (2, 3), (2, 5) returns a slope of zero, an intercept of 4 +and an R squared of zero. This is correct behavior because this line is the +best least-squares fit to the data for the given parameterization +(y = a + b * x). + +On a 32-bit system the results are accurate to about 11 significant digits, +depending on the input data. Many of the installation tests will fail +on a system with word lengths of 16 bits or fewer. (You might want to +upgrade your old 80286 IBM PC.) + +=head1 EXAMPLES + +=head2 Alternate calling sequence: + + use Statistics::LineFit; + $lineFit = Statistics::LineFit->new(); + $lineFit->setData(\@x, \@y) or die "Invalid regression data\n"; + if (defined $lineFit->rSquared() + and $lineFit->rSquared() > $threshold) + { + ($intercept, $slope) = $lineFit->coefficients(); + print "Slope: $slope Y-intercept: $intercept\n"; + } + +=head2 Multiple calls with same object, validate input, suppress error messages: + + use Statistics::LineFit; + $lineFit = Statistics::LineFit->new(1, 1); + while (1) { + @xy = read2Dxy(); # User-supplied subroutine + $lineFit->setData(\@xy); + ($intercept, $slope) = $lineFit->coefficients(); + if (defined $intercept) { + print "Slope: $slope Y-intercept: $intercept\n"; + } + } + +=head1 METHODS + +The module is state-oriented and caches its results. Once you call the +setData() method, you can call the other methods in any order or call +a method several times without invoking redundant calculations. + +The regression fails if the x values are all the same. In this case, +the module issues an error message and methods that return statistical +values will return undefined values. You can also use the return value +of the regress() method to check the status of the regression. + +=head2 new() - create a new Statistics::LineFit object + + $lineFit = Statistics::LineFit->new(); + $lineFit = Statistics::LineFit->new($validate); + $lineFit = Statistics::LineFit->new($validate, $hush); + + $validate = 1 -> Verify input data is numeric (slower execution) + 0 -> Don't verify input data (default, faster execution) + $hush = 1 -> Suppress error messages + = 0 -> Enable error messages (default) + +=head2 coefficients() - Return the slope and y intercept + + ($intercept, $slope) = $lineFit->coefficients(); + +The returned list is undefined if the regression fails. + +=head2 durbinWatson() - Return the Durbin-Watson statistic + + $durbinWatson = $lineFit->durbinWatson(); + +The Durbin-Watson test is a test for first-order autocorrelation in +the residuals of a time series regression. The Durbin-Watson statistic +has a range of 0 to 4; a value of 2 indicates there is no +autocorrelation. + +The return value is undefined if the regression fails. If weights are +input, the return value is the weighted Durbin-Watson statistic. + +=head2 meanSqError() - Return the mean squared error + + $meanSquaredError = $lineFit->meanSqError(); + +The return value is undefined if the regression fails. If weights are +input, the return value is the weighted mean squared error. + +=head2 predictedYs() - Return the predicted y values + + @predictedYs = $lineFit->predictedYs(); + +The returned list is undefined if the regression fails. + +=head2 regress() - Do the least squares line fit (if not already done) + + $lineFit->regress() or die "Regression failed" + +You don't need to call this method because it is invoked by the other +methods as needed. After you call setData(), you can call regress() +at any time to get the status of the regression for the current data. + +=head2 residuals() - Return predicted y values minus input y values + + @residuals = $lineFit->residuals(); + +The returned list is undefined if the regression fails. + +=head2 rSquared() - Return the square of the correlation coefficient + + $rSquared = $lineFit->rSquared(); + +R squared, also called the square of the Pearson product-moment correlation +coefficient, is a measure of goodness-of-fit. It is the fraction of the +variation in Y that can be attributed to the variation in X. A perfect fit +will have an R squared of 1; fitting a line to the vertices of a +regular polygon will yield an R squared of zero. Graphical displays of data +with an R squared of less than about 0.1 do not show a visible linear trend. + +The return value is undefined if the regression fails. If weights are +input, the return value is the weighted correlation coefficient. + +=head2 setData() - Initialize (x,y) values and optional weights + + $lineFit->setData(\@x, \@y) or die "Invalid regression data"; + $lineFit->setData(\@x, \@y, \@weights) or die "Invalid regression data"; + $lineFit->setData(\@xy) or die "Invalid regression data"; + $lineFit->setData(\@xy, \@weights) or die "Invalid regression data"; + +@xy is an array of arrayrefs; x values are $xy[$i][0], y values are +$xy[$i][1]. (The module does not access any indices greater than $xy[$i][1], +so the arrayrefs can point to arrays that are longer than two elements.) +The method identifies the difference between the first and fourth calling +signatures by examining the first argument. + +The optional weights array must be the same length as the data array(s). +The weights must be non-negative numbers; at least two of the weights +must be nonzero. Only the relative size of the weights is significant: +the program normalizes the weights (after copying the input values) so +that the sum of the weights equals the number of points. If you want to +do multiple line fits using the same weights, the weights must be passed +to each call to setData(). + +The method will return zero if the array lengths don't match, there are +less than two data points, any weights are negative or less than two of +the weights are nonzero. If the new() method was called with validate = 1, +the method will also verify that the data and weights are valid numbers. +Once you successfully call setData(), the next call to any method other than +new() or setData() invokes the regression. You can modify the input data +or weights after calling setData() without affecting the module's results. + +=head2 sigma() - Return the standard error of the estimate + +$sigma = $lineFit->sigma(); + +Sigma is an estimate of the homoscedastic standard deviation of the +error. Sigma is also known as the standard error of the estimate. + +The return value is undefined if the regression fails. If weights are +input, the return value is the weighted standard error. + +=head2 tStatistics() - Return the t statistics + + (tStatIntercept, $tStatSlope) = $lineFit->tStatistics(); + +The t statistic, also called the t ratio or Wald statistic, is used to +accept or reject a hypothesis using a table of cutoff values computed from +the t distribution. The t-statistic suggests that the estimated value is +(reasonable, too small, too large) when the t-statistic is (close to zero, +large and positive, large and negative). + +The returned list is undefined if the regression fails. If weights +are input, the returned values are the weighted t statistics. + +=head2 varianceOfEstimates() - Return variances of estimates of intercept, slope + + (varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates(); + +Assuming the data are noisy or inaccurate, the intercept and slope returned +by the coefficients() method are only estimates of the true intercept and +slope. The varianceofEstimate() method returns the variances of the +estimates of the intercept and slope, respectively. See Numerical Recipes +in C, section 15.2 (Fitting Data to a Straight Line), equation 15.2.9. + +The returned list is undefined if the regression fails. If weights +are input, the returned values are the weighted variances. + +=head1 SEE ALSO + + Mendenhall, W., and Sincich, T.L., 2003, A Second Course in Statistics: + Regression Analysis, 6th ed., Prentice Hall. + Press, W. H., Flannery, B. P., Teukolsky, S. A., Vetterling, W. T., 1992, + Numerical Recipes in C : The Art of Scientific Computing, 2nd ed., + Cambridge University Press. + The man page for perl(1). + The CPAN modules Statistics::OLS, Statistics::GaussHelmert and + Statistics::Regression. + +Statistics::LineFit is simpler to use than Statistics::GaussHelmert or +Statistics::Regression. Statistics::LineFit was inspired by and borrows some +ideas from the venerable Statistics::OLS module. + +The significant differences +between Statistics::LineFit and Statistics::OLS (version 0.07) are: + +=over 4 + +=item B<Statistics::LineFit is more robust.> + +Statistics::OLS returns incorrect results for certain input datasets. +Statistics::OLS does not deep copy its input arrays, which can lead +to subtle bugs. The Statistics::OLS installation test has only one +test and does not verify that the regression returns correct results. +In contrast, Statistics::LineFit has over 200 installation tests that use +various datasets/calling sequences to verify the accuracy of the +regression to within 1.0e-10. + +=item B<Statistics::LineFit is faster.> + +For a sequence of calls to new(), setData(\@x, \@y) and regress(), +Statistics::LineFit is faster than Statistics::OLS by factors of 2.0, 1.6 +and 2.4 for array lengths of 5, 100 and 10000, respectively. + +=item B<Statistics::LineFit can do weighted or unweighted regression.> + +Statistics::OLS lacks this option. + +=item B<Statistics::LineFit has a better interface.> + +Once you call the Statistics::LineFit::setData() method, you can call the +other methods in any order and call methods multiple times without invoking +redundant calculations. Statistics::LineFit lets you enable or disable +data verification or error messages. + +=item B<Statistics::LineFit has better code and documentation.> + +The code in Statistics::LineFit is more readable, more object oriented and +more compliant with Perl coding standards than the code in Statistics::OLS. +The documentation for Statistics::LineFit is more detailed and complete. + +=back + +=head1 AUTHOR + +Richard Anderson, cpan(AT)richardanderson(DOT)org, +http://www.richardanderson.org + +=head1 LICENSE + +This program is free software; you can redistribute it and/or modify it under +the same terms as Perl itself. + +The full text of the license can be found in the LICENSE file included in +the distribution and available in the CPAN listing for Statistics::LineFit +(see www.cpan.org or search.cpan.org). + +=head1 DISCLAIMER + +To the maximum extent permitted by applicable law, the author of this +module disclaims all warranties, either express or implied, including +but not limited to implied warranties of merchantability and fitness for +a particular purpose, with regard to the software and the accompanying +documentation. + +=cut +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/blast_parser.pl Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,517 @@ +#!/usr/bin/perl +use strict; +use warnings; +#use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64'; +#use lib '/afs/pdc.kth.se/home/k/krifo/vol03/domainAligner/Inparanoid_new/lib64/lib64/perl5/site_perl/5.8.8/x86_64-linux-thread-multi'; +use XML::Parser; + +############################################################################################## +# +# This parser parses output files from blastall (blastp) and outputs one-line descriptions +# for each hit with a score above or equal to a cut-off score that is specified as a parameter +# to the program. The one-line description contains the following ifomation separated by tab +# * Query id +# * Hit id +# * Bit score +# * Query length +# * Hit length +# * Length of longest segment on query. This is defined as the length of the segment from the +# first position od the first hsp to the last position of the last hsp. I.e. if the +# hsps are 1-10 and 90-100, the length of the longest segment will be 100. +# * Length of longest segment on hit, see explanation above. +# * Total match length on query. This is defined as the total length of all hsps. If the hsps +# are 1-10 and 90-100, the length will be 20. +# * Total match length on hit. see explanation above +# * Positions for all segments on the query and on the hit. Positions on the query is written +# as p:1-100 and positions on the hit is writteh as h:1-100. Positions for all hsps are +# specified separated nt tab. Positions for query and hut for a segment is separated by +# one whitespace. +# + +# MODIFICATION: also add %identity as next output field +# HOWEVER note this is specifically protein sequence identity of the aligned regions, not all of the proteins + +# If alignment mode is used, the parser prints a > before all data on the one-line description. +# The next two lines will contain the aligned sequences, including gaps. Alignment mode is off +# by default but can be used if the second parameter sent to the parser is -a. If alignment +# mode is not used only the score cutoff and the Blast XML file should be sent to the parser. +# +# NB: This parser was created for blast 2.2.16. If you run earlier versions of blast or +# blast with -VT to emulate the old behaviour the code in this script must be changed a bit. +# The neccesary changes are few and the first change is to un-comment all lines +# marked by ##. Lines marked with ## occurs here at the beginning of the script and on a couple +# of places in the My::TagHandler package (In the Init subroutine). The other change neccessary +# is to comment/remove all lines marked with #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST. +# +# Written by Isabella Pekkari 2007-11-19 +# Modified by Kristoffer Forslund 2008-05-07 to add biased composition handling +# +############################################################################################## + +# First argument is score cutt-of in bits +# Second argument is beta threshold for composition filtering. Set to 0.0 to disable. +# Third argument is blast result file (.xml, i.i run blast with option -m7) that shall be parsed +my $score_cutoff = shift; + + +# If alignment mode shall be used, the flag -a is specified as the first parameter +# Alignment mode is off by defalut so the flag -a must be specified if this mode shall be used. +my $alignment_mode = 0; +if (defined $ARGV[0] && $ARGV[0] eq '-a') { + + $alignment_mode = 1; + shift; +} + +# If segments must be ordered in a linear fashion on both sequences. +# If 1, the hsp order must be the same on the query and the hit. +# For example, if the n-terminal of the hit matches the c-terminal of +# the query another segment where the c-terminal of the hit matches +# the n-terminal of the hit si not allowed. +# If this parameter is set to 0, hsps can be ordered differently on the +# query and the hit. In both cases, an overlap of maximum 5% of the shortest segment +# is allowed. +my $linear_segment_mode = 1; + +# Only for older versions of blast +##my $done = 0; + + + +# Create an xml parser +my $p = XML::Parser->new( Style => 'My::TagHandler', ); + +# Create an Expat::NB parser +my $nb_p = $p->parse_start(); + +# Parse the xml file +parse_document(); + +sub parse_document { + + while(my $l = <>){ + ##if ($done) { + + ##$nb_p->parse_done; + ##$nb_p = $p->parse_start(); + ##$done = 0; + + ##} else { + # Remove whitespace at the end of the line + chomp($l); +##/DEBUG +# print STDERR "###" .$l . "###\n"; + # Parse the line + $nb_p->parse_more($l); + ##} + } + + # Shut the parser down + $nb_p->parse_done; + +} + +# Only used for older versions of blast +##sub next_doc { + +## $done = 1; + +##} + + + +############################################ +# Package for parsing xml files obtained +# from blast. +############################################ +package My::TagHandler; + +use strict; +use warnings; + +#Subroutines +my $create_query; +my $store_query_length; +my $new_hit; +my $save_hit_length; +my $new_hsp; +my $save_hsp_start_query; +my $save_hsp_end_query; +my $save_hsp_start_hit; +my $save_hsp_end_hit; +my $save_hsp_qseq; +my $save_hsp_hseq; +my $end_hit; +my $print_query; +my $check_if_overlap_non_linear_allowed; +my $check_if_overlap_linear; + +my @current_query; # The current query +my $current_hit; # The current hit +my $current_hsp; # The current hsp + +my $currentText = ''; # Text from the current tag + +# Names of tags that shall be parsed as key and references to +# the anonymous subroutine that shall be run when parsing the tag as value. +my %tags_to_parse; + +# Reference to the anonymous subroutine that shall be run +# to check whether there is an overlap between two hsps. +my $overlap_sub; + +sub Init { + + my($expat) = @_; + + #Subroutine for creating new query + $create_query = sub { + + # The the query id. Additional information for the query is ignored. + my @tmp = split(/\s+/, $currentText); + @current_query = ($tmp[0], 0, []) + }; + + #Subroutine for storing query length + $store_query_length = sub {$current_query[1] = $currentText}; + + #Subroutine for creating new hit + $new_hit = sub { + + # The the hit id. Additional information for the hit is ignored. + my @tmp = split(/\s+/, $currentText); + $current_hit = [$tmp[0], 0, 0, 0, 0, [], 0]; + }; + + #Subroutine for saving hit length + $save_hit_length = sub {$current_hit->[1] = $currentText}; + + #Subroutine for creating new hsp + $new_hsp = sub {$current_hsp = [$currentText]}; + + # Subroutine for saving hsp start on query + $save_hsp_start_query = sub {$current_hsp->[1] = $currentText}; + + # Subroutine for saving hsp end on query + $save_hsp_end_query = sub {$current_hsp->[2] = $currentText}; + + # Subroutine for saving hsp start on hit + $save_hsp_start_hit = sub {$current_hsp->[3] = $currentText}; + + # Subroutine for saving hsp end on hit + $save_hsp_end_hit = sub {$current_hsp->[4] = $currentText;}; + + # Subroutine for saving hsp query sequence (as in alignment) + $save_hsp_qseq = sub {$current_hsp->[5] = $currentText;}; + + # Subroutine for saving hsp hit sequence (as in alignment) + $save_hsp_hseq = sub {$current_hsp->[6] = $currentText; + + # Check if this hsp overlaps with any of the + # existing hsps + my $overlap_found = 0; + foreach my $existing_hsp (@{$current_hit->[5]}) { + + if ($overlap_sub->($current_hsp, $existing_hsp)) { + $overlap_found = 1; + last; + } + + } + + # If this hsp does not overlap with any hsp it is added + + + unless ($overlap_found) { + + # Add the hsp to the hit + push( @{$current_hit->[5]}, $current_hsp); + + # Increase number of hsps for the hit with one + $current_hit->[6]++; + + # Add the hsp score to the total score + $current_hit->[2] += $current_hsp->[0]; + + # Add the hsp length on the query to the total hsp length on the query + $current_hit->[3] += ($current_hsp->[2] - $current_hsp->[1] + 1); + + # Add the hsp length on the hit to the total hsp length on the hit + $current_hit->[4] += ($current_hsp->[4] - $current_hsp->[3] + 1); + } + + }; + + # Subroutine for saving hit + $end_hit = sub { + + #Add the hit to the qurrent query + unless ($current_hit->[2] < $score_cutoff ) { + push( @{$current_query[2]}, $current_hit ); + } + }; + + # Subroutine for printing all hits for a query + $print_query = sub { + + # Sort the hits after score with hit with highest score first + my @sorted_hits = sort {$b->[2] <=> $a->[2]} @{$current_query[2]}; + + # For all hits + foreach my $hit (@sorted_hits) { + + if ($alignment_mode) { + + # When alignment mode is used, self hits are not printed + # Therefore, the hit is ignored if it has the same id as the query + next if ($current_query[0] eq $hit->[0]); + + # When alignment mode is used, a > is printed at the start of + # lines containing data, i.e. lines that do not contain sequences + print ">"; + } + + print "$current_query[0]\t"; # Print query id + print "$hit->[0]\t"; # Print hit id + printf ("%.1f\t", $hit->[2]); # Print bit score + print "$current_query[1]\t"; # Print query length + print "$hit->[1]\t"; # Print hit length + + if ($hit->[6] > 1) { # if more than one segment + + # Sort hsps on query + my @hsps_sorted_on_query = sort {$a->[1] <=> $b->[1]} @{$hit->[5]}; + + # Sort hsps on hit + my @hsps_sorted_on_hit = sort {$a->[3] <=> $b->[3]} @{$hit->[5]}; + + # Get total segment length on query + my $segm_length_query = $hsps_sorted_on_query[@hsps_sorted_on_query - 1]->[2] - $hsps_sorted_on_query[0]->[1] + 1; + + # Get total segment length on hit + my $segm_length_hit = $hsps_sorted_on_hit[@hsps_sorted_on_hit - 1]->[4] - $hsps_sorted_on_hit[0]->[3] + 1; + + print "$segm_length_query\t"; # Print segment length on query (i.e lengt of the segment started by the first hsp and ended by the last hsp) + print "$segm_length_hit\t"; # Print segment length on query + print "$hit->[3]\t$hit->[4]\t"; # Print total length of all hsps on the query and on the match, i.e length of actually matching region + + # In alignment mode, the aligned sequences shall be printed on the lines following the data line + my $hsp_qseq = ''; + my $hsp_hseq = ''; + + # Print query and hit segment positions + foreach my $segment (@hsps_sorted_on_query) { + + print "q:$segment->[1]-$segment->[2] "; + print "h:$segment->[3]-$segment->[4]\t"; + + if ($alignment_mode) { + + # Add the hsp sequences to the total sequence + $hsp_qseq .= $segment->[5]; + $hsp_hseq .= $segment->[6]; + } + + } + + print "\n"; + + if ($alignment_mode) { + + # Print sequences + print "$hsp_qseq\n"; + print "$hsp_hseq\n"; + } + + } else { + # Get the only hsp that was found for this hit + my $segment = $hit->[5]->[0]; + + # Get total segment length on query + my $segm_length_query = $segment->[2] - $segment->[1] + 1; + + # Get total segment length on hit + my $segm_length_hit = $segment->[4] - $segment->[3] + 1; + + print "$segm_length_query\t"; # Print segment length on query, i.e. length of this hsp on query + print "$segm_length_hit\t"; # Print segment length on hit, i.e. length of this hsp on hit + + # Print total length of all hsps on the query and on the match, i.e length of actually matching region + # Sice there is only one segment, these lengths will be the same as the segment lengths printed above. + print "$hit->[3]\t$hit->[4]\t"; + + # Print segment positions + + print "q:$segment->[1]-$segment->[2] "; + print "h:$segment->[3]-$segment->[4]\t"; + print "\n"; + + if ($alignment_mode) { + + # Print sequences + print "$segment->[5]\n"; + print "$segment->[6]\n"; + } + + } + } + ##main::next_doc(); #NB! Un-comment for older blast versions + + }; + + # Subroutine for checking if two hsps overlap. + # When this subroutine is used, non-linear arrangements of the hsps are allowed. + $check_if_overlap_non_linear_allowed = sub { + + my $hsp1 = shift; # One hsp + my $hsp2 = shift; # Another hsp + + # Check if there is an overlap. + return (_check_overlap_non_linear_allowed($hsp1->[1], $hsp1->[2], $hsp2->[1], $hsp2->[2]) + || _check_overlap_non_linear_allowed($hsp1->[3], $hsp1->[4], $hsp2->[3], $hsp2->[4])); + + }; + + # Subroutine for checking if two hsps overlap. + # When this subroutine is used, non-linear arrangements of the hsps are not allowed. + $check_if_overlap_linear = sub { + + my $hsp1 = shift; # One hsp + my $hsp2 = shift; # Another hsp + + # Get start point for hsp1 on query + my $start1_hsp1 = $hsp1->[1]; + + # Get start point for hsp2 on query + my $start1_hsp2 = $hsp2->[1]; + + # The hsp that comes first oon the query (N-terminal hsp) + my $first_hsp; + + # The hsp that comes last on the query + my $last_hsp; + + # Check which hsp is N-teminal. + if ($start1_hsp1 eq $start1_hsp2) { # If the fragments start at the same position, there is an overlap (100% of shorter sequence) + return 1; + } elsif ($start1_hsp1 < $start1_hsp2) { + + $first_hsp = $hsp1; + $last_hsp = $hsp2; + + } else { + + $first_hsp = $hsp2; + $last_hsp = $hsp1; + + } + + # Return whether there is an overlap or not. + return (_check_overlap_linear($first_hsp->[1], $first_hsp->[2], $last_hsp->[1], $last_hsp->[2]) + || _check_overlap_linear($first_hsp->[3], $first_hsp->[4], $last_hsp->[3], $last_hsp->[4])); + }; + + %tags_to_parse = ( + + ##'BlastOutput_query-def' => $create_query, + ##'BlastOutput_query-def' => $create_query, +## 'BlastOutput_query-def' => $create_query, +## 'BlastOutput_query-len' => $store_query_length, + 'Iteration_query-def' => $create_query, # Query id #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST + 'Iteration_query-len' => $store_query_length, # Query length #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST + 'Hit_def' => $new_hit, # Hit id + 'Hit_len' => $save_hit_length, # Hit length + 'Hsp_bit-score' => $new_hsp, # Hsp bit score + 'Hsp_query-from' => $save_hsp_start_query, # Start point for hsp on query + 'Hsp_query-to' => $save_hsp_end_query, # End point for hsp on query + 'Hsp_hit-from' => $save_hsp_start_hit, # Start position for hsp on hit + 'Hsp_hit-to' => $save_hsp_end_hit, # End position for hsp on hit + 'Hsp_qseq' => $save_hsp_qseq, # Hsp query sequence (gapped as in the alignment) + 'Hsp_hseq' => $save_hsp_hseq, # Hsp hit sequence (gapped as in the alignment) + 'Hit_hsps' => $end_hit, # End of hit + ##'BlastOutput' => $print_query + 'Iteration' => $print_query # End of query #REMOVE THIS LINE FOR OLDER VERSIONS OF BLAST + ); + + # Set overlap subroutine to use + if ($linear_segment_mode eq 1) { + $overlap_sub = $check_if_overlap_linear; + } else { + $overlap_sub = $check_if_overlap_non_linear_allowed; + } +} + +# Nothing is done when encountering a start tag +#sub Start +#{ + +#} + +sub End { + + my($expat, $tag) = @_; + + # If the name of the tag is in the table with names of tags to parse, + # run the corresponding subroutine + $tags_to_parse{$tag} && do { $tags_to_parse{$tag}->()}; + +} + +sub Char +{ + my($expat, $text) = @_; + # Save the tag text + $currentText = $text; +} + +sub _check_overlap_linear { + + my ($start1, $end1, $start2, $end2) = @_; + + # Length of segment 1 + my $length1 = $end1 - $start1 + 1; + + # Length of segment 2 + my $length2 = $end2 - $start2 + 1; + + # Get the length of the sortest of these segments + my $shortest_length = ($length1 < $length2)?$length1:$length2; + + # Maxumin of 5% overlap (witg regard to the shorter segment) is allowed + return (($start2 - $end1 - 1) / $shortest_length < - 0.05); +} + + + +sub _check_overlap_non_linear_allowed { + + my ($start1, $end1, $start2, $end2) = @_; + + # Length of segment 1 + my $length1 = $end1 - $start1 + 1; + + # Length of segment 2 + my $length2 = $end2 - $start2 + 1; + + # Get the length of the sortest of these segments + my $shortest_length = ($length1 < $length2)?$length1:$length2; + + if ($start1 eq $start2) { # If the fragment start at the same position, there is an overlap (100% of shorter sequence) + return 1; + } elsif ($start1 < $start2) { + + if (($start2 - $end1 + 1) / $shortest_length < - 0.05) { + return 1; + } + + } else { + + if (($start1 - $end2 + 1) / $shortest_length < - 0.05) { + return 1; + } + + } + + # If the method has not returned yet, there is no overlap + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/inparanoid.pl Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,1868 @@ +#! /usr/bin/perl +############################################################################### +# InParanoid version 4.1 +# Copyright (C) Erik Sonnhammer, Kristoffer Forslund, Isabella Pekkari, +# Ann-Charlotte Berglund, Maido Remm, 2007 +# +# This program is provided under the terms of a personal license to the recipient and may only +# be used for the recipient's own research at an academic insititution. +# +# Distribution of the results of this program must be discussed with the authors. +# For using this program in a company or for commercial purposes, a commercial license is required. +# Contact Erik.Sonnhammer@sbc.su.se in both cases +# +# Make sure that Perl XML libraries are installed! +# +# NOTE: This script requires blastall (NCBI BLAST) version 2.2.16 or higher, that supports +# compositional score matrix adjustment (-C2 flag). + +my $usage =" Usage: inparanoid.pl <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> [FASTAFILE with sequences of species C] + +"; + +############################################################################### +# The program calculates orthologs between 2 datasets of proteins +# called A and B. Both datasets should be in multi-fasta file +# - Additionally, it tries to assign paralogous sequences (in-paralogs) to each +# thus forming paralogous clusters. +# - Confidence of in-paralogs is calculated in relative scale between 0-100%. +# This confidence value is dependent on how far is given sequence from the +# seed ortholog of given group +# - Confidence of groups can be calculated with bootstrapping. This is related +# to score difference between best hit and second best hit. +# - Optionally it can use a species C as outgroup. + +############################################################################### +# You may need to run the following command manually to increase your +# default datasize limit: 'limit datasize 500000 kb' + +############################################################################### +# Set following variables: # +############################################################################### + +# What do you want the program to do? # +$run_blast = 1; # Set to 1 if you don't have the 4 BLAST output files # + # Requires 'blastall', 'formatdb' (NCBI BLAST2) # + # and parser 'blast_parser.pl' # +$blast_two_passes = 1; # Set to 1 to run 2-pass strategy # + # (strongly recommended, but slower) # +$run_inparanoid = 1; +$use_bootstrap = 1;# Use bootstrapping to estimate the confidence of orthologs# + # Needs additional programs 'seqstat.jar' and 'blast2faa.pl' +$use_outgroup = 0; # Use proteins from the third genome as an outgroup # + # Reject best-best hit if outgroup sequence is MORE # + # similar to one of the sequences # + # (by more than $outgroup_cutoff bits) # + +# Define location of files and programs: +#$blastall = "blastall -VT"; #Remove -VT for blast version 2.2.12 or earlier +$blastall = "$ARGV[0] -a $ARGV[1]"; #Add -aN to use N processors +$formatdb = "$ARGV[2]"; +$seqstat = "seqstat.jar"; +$blastParser = "blast_parser.pl"; + +#$matrix = "BLOSUM62"; # Reasonable default for comparison of eukaryotes. +$matrix = "BLOSUM45"; #(for prokaryotes), +#$matrix = "BLOSUM80"; #(orthologs within metazoa), +#$matrix = "PAM70"; +#$matrix = "PAM30"; + +# Output options: # +$output = 0; # table_stats-format output # +$table = 0; # Print tab-delimited table of orthologs to file "table.txt" # + # Each orthologous group with all inparalogs is on one line # +$mysql_table = 1; # Print out sql tables for the web server # + # Each inparalog is on separate line # +$html = 0; # HTML-format output # + +# Algorithm parameters: +# Default values should work without problems. +# MAKE SURE, however, that the score cutoff here matches what you used for BLAST! +$score_cutoff = $ARGV[3]; # In bits. Any match below this is ignored # +$outgroup_cutoff = 50; # In bits. Outgroup sequence hit must be this many bits# + # stronger to reject best-best hit between A and B # +$conf_cutoff = 0.05; # Include in-paralogs with this confidence or better # +$group_overlap_cutoff = 0.5; # Merge groups if ortholog in one group has more # + # than this confidence in other group # +$grey_zone = 0; # This many bits signifies the difference between 2 scores # +$show_times = 0; # Show times spent for execution of each part of the program # + # (This does not work properly) # +$debug = 0; # Print debugging messages or not. Levels 0,1,2 and 4 exist # + +my $seq_overlap_cutoff = $ARGV[4]; # Match area should cover at least this much of longer sequence. Match area is defined as area from start of + # first segment to end of last segment, i.e segments 1-10 and 90-100 gives a match length of 100. +my $segment_coverage_cutoff = $ARGV[5]; # Actually matching segments must cover this much of longer sequence. + # For example, segments 1-10 and 90-100 gives a total length of 20. + +splice(@ARGV,0,6); + +############################################################################### +# No changes should be required below this line # +############################################################################### +$ENV{CLASSPATH} = "./$seqstat" if ($use_bootstrap); + +if (!@ARGV){ + print STDERR $usage; + exit 1; +} + +if ((@ARGV < 2) and ($run_inparanoid)){ + print STDERR "\n When \$run_inparanoid=1, at least two distinct FASTA files have to be specified.\n"; + + print STDERR $usage; + exit 1; +} + +if ((!$run_blast) and (!$run_inparanoid)){ + print STDERR "run_blast or run_inparanoid has to be set!\n"; + exit 1; +} + +# Input files: +$fasta_seq_fileA = "$ARGV[0]"; +$fasta_seq_fileB = "$ARGV[1]"; +$fasta_seq_fileC = "$ARGV[2]" if ($use_outgroup); # This is outgroup file + +my $blast_outputAB = $fasta_seq_fileA . "-" . $fasta_seq_fileB; +my $blast_outputBA = $fasta_seq_fileB . "-" . $fasta_seq_fileA; +my $blast_outputAA = $fasta_seq_fileA . "-" . $fasta_seq_fileA; +my $blast_outputBB = $fasta_seq_fileB . "-" . $fasta_seq_fileB; + +if ($use_outgroup){ + $blast_outputAC = $fasta_seq_fileA . "-" . $fasta_seq_fileC; + $blast_outputBC = $fasta_seq_fileB . "-" . $fasta_seq_fileC; +} +my %idA; # Name -> ID combinations for species 1 +my %idB; # Name -> ID combinations for species 2 +my @nameA; # ID -> Name combinations for species 1 +my @nameB; # ID -> Name combinations for species 2 +my @nameC; +my %scoreAB; # Hashes with pairwise BLAST scores (in bits) +my %scoreBA; +my %scoreAA; +my %scoreBB; +my @hitnAB; # 1-D arrays that keep the number of pairwise hits +my @hitnBA; +my @hitnAA; +my @hitnBB; +my @hitAB; # 2-D arrays that keep the actual matching IDs +my @hitBA; +my @hitAA; +my @hitBB; +my @besthitAB; # IDs of best hits in other species (may contain more than one ID) +my @besthitBA; # IDs of best hits in other species (may contain more than one ID) +my @bestscoreAB; # best match A -> B +my @bestscoreBA; # best match B -> A +my @ortoA; # IDs of ortholog candidates from species A +my @ortoB; # IDs of ortholog candidates from species B +my @ortoS; # Scores between ortoA and ortoB pairs +my @paralogsA; # List of paralog IDs in given cluster +my @paralogsB; # List of paralog IDs in given cluster +my @confPA; # Confidence values for A paralogs +my @confPB; # Confidence values for B paralogs +my @confA; # Confidence values for orthologous groups +my @confB; # Confidence values for orthologous groups +my $prev_time = 0; + +$outputfile = "Output." . $ARGV[0] . "-" . $ARGV[1]; +if ($output){ + open OUTPUT, ">$outputfile" or warn "Could not write to OUTPUT file $filename\n"; +} + +################################################# +# Assign ID numbers for species A +################################################# +open A, "$fasta_seq_fileA" or die "File A with sequences in FASTA format is missing +Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n"; +$id = 0; +while (<A>){ + if(/^\>/){ + ++$id; + chomp; + s/\>//; + @tmp = split(/\s+/); + #$name = substr($tmp[0],0,25); + $name = $tmp[0]; + $idA{$name} = int($id); + $nameA[$id] = $name; + } +} +close A; +$A = $id; +print "$A sequences in file $fasta_seq_fileA\n"; + +if ($output){ + print OUTPUT "$A sequences in file $fasta_seq_fileA\n"; +} + +if (@ARGV >= 2) { +################################################# +# Assign ID numbers for species B +################################################# + open B, "$fasta_seq_fileB" or die "File B with sequences in FASTA format is missing +Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n"; + $id = 0; + while (<B>){ + if(/^\>/){ + ++$id; + chomp; + s/\>//; + @tmp = split(/\s+/); + #$name = substr($tmp[0],0,25); + $name = $tmp[0]; + $idB{$name} = int($id); + $nameB[$id] = $name; + } + } + $B = $id; + print "$B sequences in file $fasta_seq_fileB\n"; + close B; + + if ($output){ + print OUTPUT "$B sequences in file $fasta_seq_fileB\n"; + } +} +################################################# +# Assign ID numbers for species C (outgroup) +################################################# +if ($use_outgroup){ + open C, "$fasta_seq_fileC" or die "File C with sequences in FASTA format is missing + Usage $0 <FASTAFILE with sequences of species A> <FASTAFILE with sequences of species B> <FASTAFILE with sequences of species C>\n"; + $id = 0; + while (<C>){ + if(/^\>/){ + ++$id; + chomp; + s/\>//; + @tmp = split(/\s+/); + #$name = substr($tmp[0],0,25); + $name = $tmp[0]; + $idC{$name} = int($id); + $nameC[$id] = $name; + } + } + $C = $id; + print "$C sequences in file $fasta_seq_fileC\n"; + close C; + if ($output){ + print OUTPUT "$C sequences in file $fasta_seq_fileC\n"; + } +} +if ($show_times){ + ($user_time,,,) = times; + printf ("Indexing sequences took %.2f seconds\n", ($user_time - $prev_time)); + $prev_time = $user_time; +} + +################################################# +# Run BLAST if not done already +################################################# +if ($run_blast){ + print "Trying to run BLAST now - this may take several hours ... or days in worst case!\n"; + print STDERR "Formatting BLAST databases\n"; + system ("$formatdb -i $fasta_seq_fileA"); + system ("$formatdb -i $fasta_seq_fileB") if (@ARGV >= 2); + system ("$formatdb -i $fasta_seq_fileC") if ($use_outgroup); + print STDERR "Done formatting\nStarting BLAST searches...\n"; + +# Run blast only if the files do not already exist is not default. +# NOTE: you should have done this beforehand, because you probably +# want two-pass blasting anyway which is not implemented here +# this is also not adapted to use specific compositional adjustment settings +# and might not use the proper blast parser... + + do_blast ($fasta_seq_fileA, $fasta_seq_fileA, $A, $A, $blast_outputAA); + + if (@ARGV >= 2) { + do_blast ($fasta_seq_fileA, $fasta_seq_fileB, $B, $B, $blast_outputAB); + do_blast ($fasta_seq_fileB, $fasta_seq_fileA, $A, $A, $blast_outputBA); + do_blast ($fasta_seq_fileB, $fasta_seq_fileB, $B, $B, $blast_outputBB); + } + + if ($use_outgroup){ + + do_blast ($fasta_seq_fileA, $fasta_seq_fileC, $A, $C, $blast_outputAC); + do_blast ($fasta_seq_fileB, $fasta_seq_fileC, $B, $C, $blast_outputBC); + } + + if ($show_times){ + ($user_time,,,) = times; + printf ("BLAST searches took %.2f seconds\n", ($user_time - $prev_time)); + $prev_time = $user_time; + } + print STDERR "Done BLAST searches. "; + +} else { + print STDERR "Skipping blast! \n"; +} + +if ($run_inparanoid){ + print STDERR "Starting ortholog detection...\n"; +################################################# +# Read in best hits from blast output file AB +################################################# + $count = 0; + open AB, "$blast_outputAB" or die "Blast output file A->B is missing\n"; + $old_idQ = 0; + while (<AB>){ + chomp; + @Fld = split(/\s+/); # Get query, match and score + + if( scalar @Fld < 9 ){ + if($Fld[0]=~/done/){ + print STDERR "AB ok\n"; + } + next; + } + + $q = $Fld[0]; + $m = $Fld[1]; + $idQ = $idA{$q}; # ID of query sequence + $idM = $idB{$m}; # ID of match sequence + $score = $Fld[2]; + + next if (!overlap_test(@Fld)); + + # Score must be equal to or above cut-off + next if ($score < $score_cutoff); + + if(!$count || $q ne $oldq){ + print "Match $m, score $score, ID for $q is missing\n" if ($debug == 2 and !(exists($idA{$q}))); + $hitnAB[$idA{$oldq}] = $hit if($count); # Record number of hits for previous query + $hit = 0; + ++$count; + $oldq = $q; + } + ++$hit; + $hitAB[$idQ][$hit] = int($idM); +# printf ("hitAB[%d][%d] = %d\n",$idQ,$hit,$idM); + $scoreAB{"$idQ:$idM"} = $score; + $scoreBA{"$idM:$idQ"} = $score_cutoff; # Initialize mutual hit score - sometimes this is below score_cutoff + $old_idQ = $idQ; +# } + } + $hitnAB[$idQ] = $hit; # For the last query +#printf ("hitnAB[1] = %d\n",$hitnAB[1]); +#printf ("hitnAB[%d] = %d\n",$idQ,$hit); + close AB; + if ($output){ + print OUTPUT "$count sequences $fasta_seq_fileA have homologs in dataset $fasta_seq_fileB\n"; + } +################################################# +# Read in best hits from blast output file BA +################################################# + $count = 0; + open BA, "$blast_outputBA" or die "Blast output file B->A is missing\n"; + $old_idQ = 0; + while (<BA>){ + chomp; + @Fld = split(/\s+/); # Get query, match and score + + if( scalar @Fld < 9 ){ + if($Fld[0]=~/done/){ + print STDERR "BA ok\n"; + } + next; + } + + $q = $Fld[0]; + $m = $Fld[1]; + $idQ = $idB{$q}; + $idM = $idA{$m}; + $score = $Fld[2]; + + next if (!overlap_test(@Fld)); + + next if ($score < $score_cutoff); + + if(!$count || $q ne $oldq){ + print "ID for $q is missing\n" if ($debug == 2 and (!exists($idB{$q}))); + $hitnBA[$idB{$oldq}] = $hit if($count); # Record number of hits for previous query + $hit = 0; + ++$count; + $oldq = $q; + } + ++$hit; + $hitBA[$idQ][$hit] = int($idM); +# printf ("hitBA[%d][%d] = %d\n",$idQ,$hit,$idM); + $scoreBA{"$idQ:$idM"} = $score; + $scoreAB{"$idM:$idQ"} = $score_cutoff if ($scoreAB{"$idM:$idQ"} < $score_cutoff); # Initialize missing scores + $old_idQ = $idQ; +# } + } + $hitnBA[$idQ] = $hit; # For the last query +#printf ("hitnBA[%d] = %d\n",$idQ,$hit); + close BA; + if ($output){ + print OUTPUT "$count sequences $fasta_seq_fileB have homologs in dataset $fasta_seq_fileA\n"; + } +##################### Equalize AB scores and BA scores ########################## + +###################################################################################################################################### Modification by Isabella 1 + + # I removed the time consuming all vs all search and equalize scores for all pairs where there was a hit + + foreach my $key (keys %scoreAB) { + + my ($a, $b) = split(':', $key); + my $key2 = $b . ':' . $a; + + # If debugg mod is 5 and the scores A-B and B-A are unequal + # the names of the two sequences and their scores are printed + if ($scoreAB{$key} != $scoreBA{$key2}){ + printf ("%-20s\t%-20s\t%d\t%d\n",$nameA[$a], $nameB[$b], $scoreAB{$key}, $scoreBA{$key2}) if ($debug == 5); + } + + # Set score AB and score BA to the mean of scores AB and BA. + # The final score is saved as an integer so .5 needs to be added to avoid rounding errors + $scoreAB{$key} = $scoreBA{$key2} = int(($scoreAB{$key} + $scoreBA{$key2})/2.0 +.5); + } + + # For all ids for sequences from organism A + #for $a(1..$A){ + #For all ids for sequences from organism B + #for $b(1..$B){ + + # No need to equalize score if there was no match between sequence with id $a from species A + # and sequence with id $b from species B + # next if (!$scoreAB{"$a:$b"}); + + # If debugg mod is 5 and the scores A-B and B-A are unequal + # the names of the two sequences and their scores are printed + # if ($scoreAB{"$a:$b"} != $scoreBA{"$b:$a"}){ + # printf ("%-20s\t%-20s\t%d\t%d\n",$nameA[$a], $nameB[$b], $scoreAB{"$a:$b"}, $scoreBA{"$b:$a"}) if ($debug == 5); + # } + + # Set score AB and score BA to the mean of scores AB and BA. + # The final score is saved as an integer so .5 needs to be added to avoid rounding errors + # $scoreAB{"$a:$b"} = $scoreBA{"$b:$a"} = int(($scoreAB{"$a:$b"} + $scoreBA{"$b:$a"})/2.0 +.5); + +# printf ("scoreAB{%d: %d} = %d\n", $a, $b, $scoreAB{"$a:$b"}); +# printf ("scoreBA{%d: %d} = %d\n", $b, $a, $scoreBA{"$a:$b"}); + #} +# } + +####################################################################################################################################### End modification by Isabella 1 + +##################### Re-sort hits, besthits and bestscore ####################### + for $idA(1..$A){ +# print "Loop index = $idA\n"; +# printf ("hitnAB[%d] = %d\n",$idA, $hitnAB[$idA]); + next if (!($hitnAB[$idA])); + for $hit (1..($hitnAB[$idA]-1)){ # Sort hits by score + while($scoreAB{"$idA:$hitAB[$idA][$hit]"} < $scoreAB{"$idA:$hitAB[$idA][$hit+1]"}){ + $tmp = $hitAB[$idA][$hit]; + $hitAB[$idA][$hit] = $hitAB[$idA][$hit+1]; + $hitAB[$idA][$hit+1] = $tmp; + --$hit if ($hit > 1); + } + } + $bestscore = $bestscoreAB[$idA] = $scoreAB{"$idA:$hitAB[$idA][1]"}; + $besthitAB[$idA] = $hitAB[$idA][1]; + for $hit (2..$hitnAB[$idA]){ + if ($bestscore - $scoreAB{"$idA:$hitAB[$idA][$hit]"} <= $grey_zone){ + $besthitAB[$idA] .= " $hitAB[$idA][$hit]"; + } + else { + last; + } + } + undef $is_besthitAB[$idA]; # Create index that we can check later + grep (vec($is_besthitAB[$idA],$_,1) = 1, split(/ /,$besthitAB[$idA])); +# printf ("besthitAB[%d] = hitAB[%d][%d] = %d\n",$idA,$idA,$hit,$besthitAB[$idA]); + + } + + for $idB(1..$B){ +# print "Loop index = $idB\n"; + next if (!($hitnBA[$idB])); + for $hit (1..($hitnBA[$idB]-1)){ +# Sort hits by score + while($scoreBA{"$idB:$hitBA[$idB][$hit]"} < $scoreBA{"$idB:$hitBA[$idB][$hit+1]"}){ + $tmp = $hitBA[$idB][$hit]; + $hitBA[$idB][$hit] = $hitBA[$idB][$hit+1]; + $hitBA[$idB][$hit+1] = $tmp; + --$hit if ($hit > 1); + } + } + $bestscore = $bestscoreBA[$idB] = $scoreBA{"$idB:$hitBA[$idB][1]"}; + $besthitBA[$idB] = $hitBA[$idB][1]; + for $hit (2..$hitnBA[$idB]){ + if ($bestscore - $scoreBA{"$idB:$hitBA[$idB][$hit]"} <= $grey_zone){ + $besthitBA[$idB] .= " $hitBA[$idB][$hit]"; + } + else {last;} + } + undef $is_besthitBA[$idB]; # Create index that we can check later + grep (vec($is_besthitBA[$idB],$_,1) = 1, split(/ /,$besthitBA[$idB])); +# printf ("besthitBA[%d] = %d\n",$idA,$besthitAB[$idA]); + } + + if ($show_times){ + ($user_time,,,) = times; + printf ("Reading and sorting homologs took %.2f seconds\n", ($user_time - $prev_time)); + $prev_time = $user_time; + } + +###################################################### +# Now find orthologs: +###################################################### + $o = 0; + + for $i(1..$A){ # For each ID in file A + if (defined $besthitAB[$i]){ + @besthits = split(/ /,$besthitAB[$i]); + for $hit (@besthits){ + if (vec($is_besthitBA[$hit],$i,1)){ + ++$o; + $ortoA[$o] = $i; + $ortoB[$o] = $hit; + $ortoS[$o] = $scoreAB{"$i:$hit"}; # Should be equal both ways +# --$o if ($ortoS[$o] == $score_cutoff); # Ignore orthologs that are exactly at score_cutoff + print "Accept! " if ($debug == 2); + } + else {print " " if ($debug == 2);} + printf ("%-20s\t%d\t%-20s\t", $nameA[$i], $bestscoreAB[$i], $nameB[$hit]) if ($debug == 2); + print "$bestscoreBA[$hit]\t$besthitBA[$hit]\n" if ($debug == 2); + } + } + } + print "$o ortholog candidates detected\n" if ($debug); +##################################################### +# Sort orthologs by ID and then by score: +##################################################### + +####################################################################################################### Modification by Isabella 2 + + # Removed time consuiming bubble sort. Created an index array and sort that according to id and score. + # The put all clusters on the right place. + + # Create an array used to store the position each element shall have in the final array + # The elements are initialized with the position numbers + my @position_index_array = (1..$o); + + # Sort the position list according to id + my @id_sorted_position_list = sort { ($ortoA[$a]+$ortoB[$a]) <=> ($ortoA[$b] + $ortoB[$b]) } @position_index_array; + + # Sort the list according to score + my @score_id_sorted_position_list = sort { $ortoS[$b] <=> $ortoS[$a] } @id_sorted_position_list; + + # Create new arrays for the sorted information + my @new_ortoA; + my @new_ortoB; + my @new_orthoS; + + # Add the information to the new arrays in the orer specifeid by the index array + for (my $index_in_list = 0; $index_in_list < scalar @score_id_sorted_position_list; $index_in_list++) { + + + my $old_index = $score_id_sorted_position_list[$index_in_list]; + $new_ortoA[$index_in_list + 1] = $ortoA[$old_index]; + $new_ortoB[$index_in_list + 1] = $ortoB[$old_index]; + $new_ortoS[$index_in_list + 1] = $ortoS[$old_index]; + } + + @ortoA = @new_ortoA; + @ortoB = @new_ortoB; + @ortoS = @new_ortoS; + + # Use bubblesort to sort ortholog pairs by id +# for $i(1..($o-1)){ +# while(($ortoA[$i]+$ortoB[$i]) > ($ortoA[$i+1] + $ortoB[$i+1])){ +# $tempA = $ortoA[$i]; +# $tempB = $ortoB[$i]; +# $tempS = $ortoS[$i]; +# +# $ortoA[$i] = $ortoA[$i+1]; +# $ortoB[$i] = $ortoB[$i+1]; +# $ortoS[$i] = $ortoS[$i+1]; +# +# $ortoA[$i+1] = $tempA; +# $ortoB[$i+1] = $tempB; +# $ortoS[$i+1] = $tempS; +# +# --$i if ($i > 1); +# } +# } +# +# # Use bubblesort to sort ortholog pairs by score +# for $i(1..($o-1)){ +# while($ortoS[$i] < $ortoS[$i+1]){ +# # Swap places: +# $tempA = $ortoA[$i]; +# $tempB = $ortoB[$i]; +# $tempS = $ortoS[$i]; +# +# $ortoA[$i] = $ortoA[$i+1]; +# $ortoB[$i] = $ortoB[$i+1]; +# $ortoS[$i] = $ortoS[$i+1]; +# +# $ortoA[$i+1] = $tempA; +# $ortoB[$i+1] = $tempB; +# $ortoS[$i+1] = $tempS; +# +# --$i if ($i > 1); +# } +# } + +###################################################################################################### End modification bt Isabella 2 + + @all_ortologsA = (); + @all_ortologsB = (); + for $i(1..$o){ + push(@all_ortologsA,$ortoA[$i]); # List of all orthologs + push(@all_ortologsB,$ortoB[$i]); + } + undef $is_ortologA; # Create index that we can check later + undef $is_ortologB; + grep (vec($is_ortologA,$_,1) = 1, @all_ortologsA); + grep (vec($is_ortologB,$_,1) = 1, @all_ortologsB); + + if ($show_times){ + ($user_time,,,) = times; + printf ("Finding and sorting orthologs took %.2f seconds\n", ($user_time - $prev_time)); + $prev_time = $user_time; + } +################################################# +# Read in best hits from blast output file AC +################################################# + if ($use_outgroup){ + $count = 0; + open AC, "$blast_outputAC" or die "Blast output file A->C is missing\n"; + while (<AC>){ + chomp; + @Fld = split(/\s+/); # Get query, match and score + + if( scalar @Fld < 9 ){ + if($Fld[0]=~/done/){ + print STDERR "AC ok\n"; + } + next; + } + + $q = $Fld[0]; + $m = $Fld[1]; + $idQ = $idA{$q}; + $idM = $idC{$m}; + $score = $Fld[2]; + next unless (vec($is_ortologA,$idQ,1)); + + next if (!overlap_test(@Fld)); + + next if ($score < $score_cutoff); + + next if ($count and ($q eq $oldq)); + # Only comes here if this is the best hit: + $besthitAC[$idQ] = $idM; + $bestscoreAC[$idQ] = $score; + $oldq = $q; + ++$count; + } + close AC; +################################################# +# Read in best hits from blast output file BC +################################################# + $count = 0; + open BC, "$blast_outputBC" or die "Blast output file B->C is missing\n"; + while (<BC>){ + chomp; + @Fld = split(/\s+/); # Get query, match and score + + if( scalar @Fld < 9 ){ + if($Fld[0]=~/done/){ + print STDERR "BC ok\n"; + } + next; + } + + $q = $Fld[0]; + $m = $Fld[1]; + $idQ = $idB{$q}; + $idM = $idC{$m}; + $score = $Fld[2]; + next unless (vec($is_ortologB,$idQ,1)); + + next if (!overlap_test(@Fld)); + + next if ($score < $score_cutoff); + + next if ($count and ($q eq $oldq)); + # Only comes here if this is the best hit: + $besthitBC[$idQ] = $idM; + $bestscoreBC[$idQ] = $score; + $oldq = $q; + ++$count; + } + close BC; +################################ +# Detect rooting problems +################################ + $rejected = 0; + @del = (); + $file = "rejected_sequences." . $fasta_seq_fileC; + open OUTGR, ">$file"; + for $i (1..$o){ + $diff1 = $diff2 = 0; + $idA = $ortoA[$i]; + $idB = $ortoB[$i]; + $diff1 = $bestscoreAC[$idA] - $ortoS[$i]; + $diff2 = $bestscoreBC[$idB] - $ortoS[$i]; + if ($diff1 > $outgroup_cutoff){ + print OUTGR "Ortholog pair $i ($nameA[$idA]-$nameB[$idB]). + $nameA[$idA] from $fasta_seq_fileA is closer to $nameC[$besthitAC[$idA]] than to $nameB[$idB]\n"; + print OUTGR " $ortoS[$i] < $bestscoreAC[$idA] by $diff1\n"; + } + if ($diff2 > $outgroup_cutoff){ + print OUTGR "Ortholog pair $i ($nameA[$idA]-$nameB[$idB]). + $nameB[$idB] from $fasta_seq_fileB is closer to $nameC[$besthitBC[$idB]] than to $nameA[$idA]\n"; + print OUTGR " $ortoS[$i] < $bestscoreBC[$idB] by $diff2\n"; + } + if (($diff1 > $outgroup_cutoff) or ($diff2 > $outgroup_cutoff)){ + ++$rejected; + $del[$i] = 1; + } + } + print "Number of rejected groups: $rejected (outgroup sequence was closer by more than $outgroup_cutoff bits)\n"; + close OUTGR; + } # End of $use_outgroup +################################ +# Read inside scores from AA +################################ + $count = 0; + $max_hit = 0; + open AA, "$blast_outputAA" or die "Blast output file A->A is missing\n"; + while (<AA>) { + chomp; # strip newline + + @Fld = split(/\s+/); # Get query and match names + + if( scalar @Fld < 9 ){ + if($Fld[0]=~/done/){ + print STDERR "AA ok\n"; + } + next; + } + + $q = $Fld[0]; + $m = $Fld[1]; + $score = $Fld[2]; + next unless (vec($is_ortologA,$idA{$q},1)); + + next if (!overlap_test(@Fld)); + + next if ($score < $score_cutoff); + + if(!$count || $q ne $oldq){ # New query + $max_hit = $hit if ($hit > $max_hit); + $hit = 0; + $oldq = $q; + } + ++$hit; + ++$count; + $scoreAA{"$idA{$q}:$idA{$m}"} = int($score + 0.5); + $hitAA[$idA{$q}][$hit] = int($idA{$m}); + $hitnAA[$idA{$q}] = $hit; + } + close AA; + if ($output){ + print OUTPUT "$count $fasta_seq_fileA-$fasta_seq_fileA matches\n"; + } +################################ +# Read inside scores from BB +################################ + $count = 0; + open BB, "$blast_outputBB" or die "Blast output file B->B is missing\n"; + while (<BB>) { + chomp; # strip newline + + @Fld = split(/\s+/); # Get query and match names + + if( scalar @Fld < 9 ){ + if($Fld[0]=~/done/){ + print STDERR "BB ok\n"; + } + next; + } + + $q = $Fld[0]; + $m = $Fld[1]; + $score = $Fld[2]; + next unless (vec($is_ortologB,$idB{$q},1)); + + next if (!overlap_test(@Fld)); + + next if ($score < $score_cutoff); + + if(!$count || $q ne $oldq){ # New query + $max_hit = $hit if ($hit > $max_hit); + $oldq = $q; + $hit = 0; + } + ++$count; + ++$hit; + $scoreBB{"$idB{$q}:$idB{$m}"} = int($score + 0.5); + $hitBB[$idB{$q}][$hit] = int($idB{$m}); + $hitnBB[$idB{$q}] = $hit; + } + close BB; + if ($output){ + print OUTPUT "$count $fasta_seq_fileB-$fasta_seq_fileB matches\n"; + } + if ($show_times){ + ($user_time,,,) = times; + printf ("Reading paralogous hits took %.2f seconds\n", ($user_time - $prev_time)); + $prev_time = $user_time; + } + print "Maximum number of hits per sequence was $max_hit\n" if ($debug); +##################################################### +# Find paralogs: +##################################################### + for $i(1..$o){ + $merge[$i] = 0; + next if($del[$i]); # If outgroup species was closer to one of the seed orthologs + $idA = $ortoA[$i]; + $idB = $ortoB[$i]; + local @membersA = (); + local @membersB = (); + + undef $is_paralogA[$i]; + undef $is_paralogB[$i]; + + print "$i: Ortholog pair $nameA[$idA] and $nameB[$idB]. $hitnAA[$idA] hits for A and $hitnBB[$idB] hits for B\n" if ($debug); + # Check if current ortholog is already clustered: + for $j(1..($i-1)){ + # Overlap type 1: Both orthologs already clustered here -> merge + if ((vec($is_paralogA[$j],$idA,1)) and (vec($is_paralogB[$j],$idB,1))){ + $merge[$i] = $j; + print "Merge CASE 1: group $i ($nameB[$idB]-$nameA[$idA]) and $j ($nameB[$ortoB[$j]]-$nameA[$ortoA[$j]])\n" if ($debug); + last; + } + # Overlap type 2: 2 competing ortholog pairs -> merge + elsif (($ortoS[$j] - $ortoS[$i] <= $grey_zone) + and (($ortoA[$j] == $ortoA[$i]) or ($ortoB[$j] == $ortoB[$i])) +# and ($paralogsA[$j]) + ){ # The last condition is false if the previous cluster has been already deleted + $merge[$i] = $j; + print "Merge CASE 2: group $i ($nameA[$ortoA[$i]]-$nameB[$ortoB[$i]]) and $j ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug); + last; + } + # Overlap type 3: DELETE One of the orthologs belongs to some much stronger cluster -> delete + elsif (((vec($is_paralogA[$j],$idA,1)) or (vec($is_paralogB[$j],$idB,1))) and + ($ortoS[$j] - $ortoS[$i] > $score_cutoff)){ + print "Delete CASE 3: Cluster $i -> $j, score $ortoS[$i] -> $ortoS[$j], ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug); + $merge[$i]= -1; # Means - do not add sequences to this cluster + $paralogsA[$i] = ""; + $paralogsB[$i] = ""; + last; + } + # Overlap type 4: One of the orthologs is close to the center of other cluster + elsif (((vec($is_paralogA[$j],$idA,1)) and ($confPA[$idA] > $group_overlap_cutoff)) or + ((vec($is_paralogB[$j],$idB,1)) and ($confPB[$idB] > $group_overlap_cutoff))){ + print "Merge CASE 4: Cluster $i -> $j, score $ortoS[$i] -> $ortoS[$j], ($nameA[$ortoA[$j]]-$nameB[$ortoB[$j]])\n" if ($debug); + $merge[$i] = $j; + last; + } + # Overlap type 5: + # All clusters that were overlapping, but not catched by previous "if" statements will be DIVIDED! + } + next if ($merge[$i] < 0); # This cluster should be deleted +##### Check for paralogs in A + $N = $hitnAA[$idA]; + for $j(1..$N){ + $hitID = $hitAA[$idA][$j]; # hit of idA +# print "Working with $nameA[$hitID]\n" if ($debug == 2); + # Decide whether this hit is inside the paralog circle: + if ( ($idA == $hitID) or ($scoreAA{"$idA:$hitID"} >= $bestscoreAB[$idA]) and + ($scoreAA{"$idA:$hitID"} >= $bestscoreAB[$hitID])){ + if ($debug == 2){ + print " Paralog candidates: "; + printf ("%-20s: %-20s", $nameA[$idA], $nameA[$hitID]); + print "\t$scoreAA{\"$idA:$hitID\"} : $bestscoreAB[$idA] : $bestscoreAB[$hitID]\n"; + } + $paralogs = 1; + if ($scoreAA{"$idA:$idA"} == $ortoS[$i]){ + if ($scoreAA{"$idA:$hitID"} == $scoreAA{"$idA:$idA"}){ + $conf_here = 1.0; # In the center + } + else{ + $conf_here = 0.0; # On the border + } + } + else { + $conf_here = ($scoreAA{"$idA:$hitID"} - $ortoS[$i]) / + ($scoreAA{"$idA:$idA"} - $ortoS[$i]); + } + # Check if this paralog candidate is already clustered in other clusters + for $k(1..($i-1)){ + if (vec($is_paralogA[$k],$hitID,1)){ # Yes, found in cluster $k + if($debug == 2){ + print " $nameA[$hitID] is already in cluster $k, together with:"; + print " $nameA[$ortoA[$k]] and $nameB[$ortoB[$k]] "; + print "($scoreAA{\"$ortoA[$k]:$hitID\"})"; + } + if (($confPA[$hitID] >= $conf_here) and + ($j != 1)){ # The seed ortholog CAN NOT remain there + print " and remains there.\n" if ($debug == 2); + $paralogs = 0; # No action + } + else { # Ortholog of THIS cluster is closer than ortholog of competing cluster $k + print " and should be moved here!\n" if ($debug == 2); # Remove from other cluster, add to this cluster + @membersAK = split(/ /, $paralogsA[$k]); # This array contains IDs + $paralogsA[$k] = "";# Remove all paralogs from cluster $k + @tmp = (); + for $m(@membersAK){ + push(@tmp,$m) if ($m != $hitID); # Put other members back + } + $paralogsA[$k] = join(' ',@tmp); + undef $is_paralogA[$k]; # Create index that we can check later + grep (vec($is_paralogA[$k],$_,1) = 1, @tmp); + } + last; + } + } + next if (! $paralogs); # Skip that paralog - it is already in cluster $k + push (@membersA,$hitID); # Add this hit to paralogs of A + } + } + # Calculate confidence values now: + @tmp = (); + for $idP (@membersA){ # For each paralog calculate conf value + if($scoreAA{"$idA:$idA"} == $ortoS[$i]){ + if ($scoreAA{"$idA:$idP"} == $scoreAA{"$idA:$idA"}){ + $confPA[$idP] = 1.00; + } + else{ + $confPA[$idP] = 0.00; + } + } + else{ + $confPA[$idP] = ($scoreAA{"$idA:$idP"} - $ortoS[$i]) / + ($scoreAA{"$idA:$idA"} - $ortoS[$i]); + } + push (@tmp, $idP) if ($confPA[$idP] >= $conf_cutoff); # If one wishes to use only significant paralogs + } + @membersA = @tmp; + ########### Merge if necessary: + if ($merge[$i] > 0){ # Merge existing cluster with overlapping cluster + @tmp = split(/ /,$paralogsA[$merge[$i]]); + for $m (@membersA){ + push (@tmp, $m) unless (vec($is_paralogA[$merge[$i]],$m,1)); + } + $paralogsA[$merge[$i]] = join(' ',@tmp); + undef $is_paralogA[$merge[$i]]; + grep (vec($is_paralogA[$merge[$i]],$_,1) = 1, @tmp); # Refresh index of paralog array + } + ######### Typical new cluster: + else{ # Create a new cluster + $paralogsA[$i] = join(' ',@membersA); + undef $is_paralogA; # Create index that we can check later + grep (vec($is_paralogA[$i],$_,1) = 1, @membersA); + } +##### The same procedure for species B: + $N = $hitnBB[$idB]; + for $j(1..$N){ + $hitID = $hitBB[$idB][$j]; +# print "Working with $nameB[$hitID]\n" if ($debug == 2); + if ( ($idB == $hitID) or ($scoreBB{"$idB:$hitID"} >= $bestscoreBA[$idB]) and + ($scoreBB{"$idB:$hitID"} >= $bestscoreBA[$hitID])){ + if ($debug == 2){ + print " Paralog candidates: "; + printf ("%-20s: %-20s", $nameB[$idB], $nameB[$hitID]); + print "\t$scoreBB{\"$idB:$hitID\"} : "; + print "$bestscoreBA[$idB] : $bestscoreBA[$hitID]\n"; + } + $paralogs = 1; + if ($scoreBB{"$idB:$idB"} == $ortoS[$i]){ + if ($scoreBB{"$idB:$hitID"} == $scoreBB{"$idB:$idB"}){ + $conf_here = 1.0; + } + else{ + $conf_here = 0.0; + } + } + else{ + $conf_here = ($scoreBB{"$idB:$hitID"} - $ortoS[$i]) / + ($scoreBB{"$idB:$idB"} - $ortoS[$i]); + } + + # Check if this paralog candidate is already clustered in other clusters + for $k(1..($i-1)){ + if (vec($is_paralogB[$k],$hitID,1)){ # Yes, found in cluster $k + if($debug == 2){ + print " $nameB[$hitID] is already in cluster $k, together with:"; + print " $nameB[$ortoB[$k]] and $nameA[$ortoA[$k]] "; + print "($scoreBB{\"$ortoB[$k]:$hitID\"})"; + } + if (($confPB[$hitID] >= $conf_here) and + ($j != 1)){ # The seed ortholog CAN NOT remain there + print " and remains there.\n" if ($debug == 2); + $paralogs = 0; # No action + } + else { # Ortholog of THIS cluster is closer than ortholog of competing cluster $k + print " and should be moved here!\n" if ($debug == 2); # Remove from other cluster, add to this cluster + @membersBK = split(/ /, $paralogsB[$k]); # This array contains names, not IDs + $paralogsB[$k] = ""; + @tmp = (); + for $m(@membersBK){ + push(@tmp,$m) if ($m != $hitID); # Put other members back + } + $paralogsB[$k] = join(' ',@tmp); + undef $is_paralogB[$k]; # Create index that we can check later + grep (vec($is_paralogB[$k],$_,1) = 1, @tmp); + } + last; # Don't search in other clusters + } + } + next if (! $paralogs); # Skip that paralog - it is already in cluster $k + push (@membersB,$hitID); + } + } + # Calculate confidence values now: + @tmp = (); + for $idP (@membersB){ # For each paralog calculate conf value + if($scoreBB{"$idB:$idB"} == $ortoS[$i]){ + if ($scoreBB{"$idB:$idP"} == $scoreBB{"$idB:$idB"}){ + $confPB[$idP] = 1.0; + } + else{ + $confPB[$idP] = 0.0; + } + } + else{ + $confPB[$idP] = ($scoreBB{"$idB:$idP"} - $ortoS[$i]) / + ($scoreBB{"$idB:$idB"} - $ortoS[$i]); + } + push (@tmp, $idP) if ($confPB[$idP] >= $conf_cutoff); # If one wishes to use only significant paralogs + } + @membersB = @tmp; + ########### Merge if necessary: + if ($merge[$i] > 0){ # Merge existing cluster with overlapping cluster + @tmp = split(/ /,$paralogsB[$merge[$i]]); + for $m (@membersB){ + push (@tmp, $m) unless (vec($is_paralogB[$merge[$i]],$m,1)); + } + $paralogsB[$merge[$i]] = join(' ',@tmp); + undef $is_paralogB[$merge[$i]]; + grep (vec($is_paralogB[$merge[$i]],$_,1) = 1, @tmp); # Refresh index of paralog array + } + ######### Typical new cluster: + else{ # Create a new cluster + $paralogsB[$i] = join(' ',@membersB); + undef $is_paralogB; # Create index that we can check later + grep (vec($is_paralogB[$i],$_,1) = 1, @membersB); + } + } + if ($show_times){ + ($user_time,,,) = times; + printf ("Finding in-paralogs took %.2f seconds\n", ($user_time - $prev_time)); + $prev_time = $user_time; + } +##################################################### + &clean_up(1); +#################################################### +# Find group for orphans. If cluster contains only one member, find where it should go: + for $i (1..$o){ + @membersA = split(/ /, $paralogsA[$i]); + @membersB = split(/ /, $paralogsB[$i]); + $na = @membersA; + $nb = @membersB; + if (($na == 0) and $nb){ + print "Warning: empty A cluster $i\n"; + for $m (@membersB){ + $bestscore = 0; + $bestgroup = 0; + $bestmatch = 0; + for $j (1..$o) { + next if ($i == $j); # Really need to check against all 100% members of the group. + @membersBJ = split(/ /, $paralogsB[$j]); + for $k (@membersBJ){ + next if ($confPB[$k] != 1); # For all 100% in-paralogs + $score = $scoreBB{"$m:$k"}; + if ($score > $bestscore){ + $bestscore = $score; + $bestgroup = $j; + $bestmatch = $k; + } + } + } + print "Orphan $nameB[$m] goes to group $bestgroup with $nameB[$bestmatch]\n" ; + @members = split(/ /, $paralogsB[$bestgroup]); + push (@members, $m); + $paralogsB[$bestgroup] = join(' ',@members); + $paralogsB[$i] = ""; + undef $is_paralogB[$bestgroup]; + undef $is_paralogB[$i]; + grep (vec($is_paralogB[$bestgroup],$_,1) = 1, @members); # Refresh index of paralog array +# grep (vec($is_paralogB[$i],$_,1) = 1, ()); + } + } + if ($na and ($nb == 0)){ + print "Warning: empty B cluster $i\n"; + for $m (@membersA){ + $bestscore = 0; + $bestgroup = 0; + $bestmatch = 0; + for $j (1..$o) { + next if ($i == $j); + @membersAJ = split(/ /, $paralogsA[$j]); + for $k (@membersAJ){ + next if ($confPA[$k] != 1); # For all 100% in-paralogs + $score = $scoreAA{"$m:$k"}; + if ($score > $bestscore){ + $bestscore = $score; + $bestgroup = $j; + $bestmatch = $k; + } + } + } + print "Orphan $nameA[$m] goes to group $bestgroup with $nameA[$bestmatch]\n"; + @members = split(/ /, $paralogsA[$bestgroup]); + push (@members, $m); + $paralogsA[$bestgroup] = join(' ',@members); + $paralogsA[$i] = ""; + undef $is_paralogA[$bestgroup]; + undef $is_paralogA[$i]; + grep (vec($is_paralogA[$bestgroup],$_,1) = 1, @members); # Refresh index of paralog array +# grep (vec($is_paralogA[$i],$_,1) = 1, ()); + } + } + } + + &clean_up(1); +################### + $htmlfile = "orthologs." . $ARGV[0] . "-" . $ARGV[1] . ".html"; + if ($html){ + open HTML, ">$htmlfile" or warn "Could not write to HTML file $filename\n"; + } + + + if ($output){ + print OUTPUT "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"; + print OUTPUT "$o groups of orthologs\n"; + print OUTPUT "$totalA in-paralogs from $fasta_seq_fileA\n"; + print OUTPUT "$totalB in-paralogs from $fasta_seq_fileB\n"; + print OUTPUT "Grey zone $grey_zone bits\n"; + print OUTPUT "Score cutoff $score_cutoff bits\n"; + print OUTPUT "In-paralogs with confidence less than $conf_cutoff not shown\n"; + print OUTPUT "Sequence overlap cutoff $seq_overlap_cutoff\n"; + print OUTPUT "Group merging cutoff $group_overlap_cutoff\n"; + print OUTPUT "Scoring matrix $matrix\n"; + print OUTPUT "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"; + } + if ($html){ + print HTML "<pre>\n"; + print HTML "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"; + print HTML "$o groups of orthologs\n"; + print HTML "$totalA in-paralogs from $fasta_seq_fileA\n"; + print HTML "$totalB in-paralogs from $fasta_seq_fileB\n"; + print HTML "Grey zone $grey_zone bits\n"; + print HTML "Score cutoff $score_cutoff bits\n"; + print HTML "In-paralogs with confidence less than $conf_cutoff not shown\n"; + print HTML "Sequence overlap cutoff $seq_overlap_cutoff\n"; + print HTML "Group merging cutoff $group_overlap_cutoff\n"; + print HTML "Scoring matrix $matrix\n"; + print HTML "\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"; + } +# ############################################################################## +# Check for alternative orthologs, sort paralogs by confidence and print results +# ############################################################################## + if ($use_bootstrap and $debug){ + open FF, ">BS_vs_bits" or warn "Could not write to file BS_vs_bits\n"; + } + for $i(1..$o){ + @membersA = split(/ /, $paralogsA[$i]); + @membersB = split(/ /, $paralogsB[$i]); + $message = ""; + $htmlmessage = ""; + + $idB = $ortoB[$i]; + $nB = $hitnBA[$idB]; + for $idA(@membersA){ + next if ($confPA[$idA] != 1.0); + $nA = $hitnAB[$idA]; + $confA[$i] = $ortoS[$i]; # default + $bsA[$idA] = 1.0; + ############## + for $j(1..$nB){ + $idH = $hitBA[$idB][$j]; + ################ Some checks for alternative orthologs: + # 1. Don't consider sequences that are already in this cluster + next if (vec($is_paralogA[$i],$idH,1)); + next if ($confPA[$idH] > 0); # If $conf_cutoff > 0 idH might be incide circle, but not paralog + + # 2. Check if candidate for alternative ortholog is already clustered in stronger clusters + $in_other_cluster = 0; + for $k(1..($i-1)){ # Check if current ortholog is already clustered + if (vec($is_paralogA[$k],$idH,1)){ + $in_other_cluster = $k; + last; + } + } +# next if ($in_other_cluster); # This hit is clustered in cluster $k. It cannot be alternative ortholog + + # 3. The best hit of candidate ortholog should be ortoA or at least to belong into this cluster + @besthits = split (/ /,$besthitAB[$idH]); + $this_family = 0; + for $bh (@besthits){ + $this_family = 1 if ($idB == $bh); + } +# next unless ($this_family); # There was an alternative BA match but it's best match did not belong here + ################# Done with checks - if sequence passed, then it could be an alternative ortholog + $confA[$i] = $ortoS[$i] - $scoreBA{"$idB:$idH"}; + if ($use_bootstrap){ + if ($confA[$i] < $ortoS[$i]){ # Danger zone - check for bootstrap + $bsA[$idA] = &bootstrap($fasta_seq_fileB,$idB,$idA,$idH); + } + else { + $bsA[$idA] = 1.0; + } + } + last; + } + $message .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.", $nameA[$idA], 100*$bsA[$idA]); + $message .= sprintf(" Alternative seed ortholog is %s (%d bits away from this cluster)", $nameA[$idH], $confA[$i]) if ($bsA[$idA] < 0.75); + $message .= sprintf("\n"); + if ($html){ + if ($bsA[$idA] < 0.75){ + $htmlmessage .= sprintf("<font color=\"red\">"); + } + elsif ($bsA[$idA] < 0.95){ + $htmlmessage .= sprintf("<font color=\"\#FFCC00\">"); + } + else { + $htmlmessage .= sprintf("<font color=\"green\">"); + } + $htmlmessage .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.\n", $nameA[$idA], 100*$bsA[$idA]); + $htmlmessage .= sprintf("Alternative seed ortholog is %s (%d bits away from this cluster)\n", $nameA[$idH], $confA[$i]) if ($bsA[$idA] < 0.75); + $htmlmessage .= sprintf("</font>"); + } + printf (FF "%s\t%d\t%d\n", $nameA[$idA], $confA[$i], 100*$bsA[$idA]) if ($use_bootstrap and $debug); + } + ######## + $idA = $ortoA[$i]; + $nA = $hitnAB[$idA]; + for $idB(@membersB){ + next if ($confPB[$idB] != 1.0); + $nB = $hitnBA[$idB]; + $confB[$i] = $ortoS[$i]; # default + $bsB[$idB] = 1.0; + + for $j(1..$nA){ # For all AB hits of given ortholog + $idH = $hitAB[$idA][$j]; + # ############### Some checks for alternative orthologs: + # 1. Don't consider sequences that are already in this cluster + next if (vec($is_paralogB[$i],$idH,1)); + next if ($confPB[$idH] > 0); # If $conf_cutoff > 0 idH might be incide circle, but not paralog + + # 2. Check if candidate for alternative ortholog is already clustered in stronger clusters + $in_other_cluster = 0; + for $k(1..($i-1)){ + if (vec($is_paralogB[$k],$idH,1)){ + $in_other_cluster = $k; + last; # out from this cycle + } + } +# next if ($in_other_cluster); # This hit is clustered in cluster $k. It cannot be alternative ortholog + + # 3. The best hit of candidate ortholog should be ortoA + @besthits = split (/ /,$besthitBA[$idH]); + $this_family = 0; + for $bh (@besthits){ + $this_family = 1 if ($idA == $bh); + } +# next unless ($this_family); # There was an alternative BA match but it's best match did not belong here + # ################ Done with checks - if sequence passed, then it could be an alternative ortholog + $confB[$i] = $ortoS[$i] - $scoreAB{"$idA:$idH"}; + if ($use_bootstrap){ + if ($confB[$i] < $ortoS[$i]){ + $bsB[$idB] = &bootstrap($fasta_seq_fileA,$idA,$idB,$idH); + } + else { + $bsB[$idB] = 1.0; + } + } + last; + } + $message .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.", $nameB[$idB], 100*$bsB[$idB]); + $message .= sprintf(" Alternative seed ortholog is %s (%d bits away from this cluster)", $nameB[$idH],$confB[$i]) if ($bsB[$idB] < 0.75); + $message .= sprintf("\n"); + if ($html){ + if ($bsB[$idB] < 0.75){ + $htmlmessage .= sprintf("<font color=\"red\">"); + } + elsif ($bsB[$idB] < 0.95){ + $htmlmessage .= sprintf("<font color=\"\#FFCC00\">"); + } + else { + $htmlmessage .= sprintf("<font color=\"green\">"); + } + $htmlmessage .= sprintf("Bootstrap support for %s as seed ortholog is %d%%.\n", $nameB[$idB], 100*$bsB[$idB]); + $htmlmessage .= sprintf("Alternative seed ortholog is %s (%d bits away from this cluster)\n", $nameB[$idH],$confB[$i]) if ($bsB[$idB] < 0.75); + $htmlmessage .= sprintf("</font>"); + } + printf (FF "%s\t%d\t%d\n", $nameB[$idB], $confB[$i], 100*$bsB[$idB]) if ($use_bootstrap and $debug); + } + close FF; + ########### Print header ############### + if ($output){ + print OUTPUT "___________________________________________________________________________________\n"; + print OUTPUT "Group of orthologs #" . $i .". Best score $ortoS[$i] bits\n"; + print OUTPUT "Score difference with first non-orthologous sequence - "; + printf (OUTPUT "%s:%d %s:%d\n", $fasta_seq_fileA,$confA[$i],$fasta_seq_fileB,$confB[$i]); + } + + if ($html){ + print HTML "</pre>\n"; + print HTML "<hr WIDTH=\"100%\">"; + print HTML "<h3>"; + print HTML "Group of orthologs #" . $i .". Best score $ortoS[$i] bits<br>\n"; + print HTML "Score difference with first non-orthologous sequence - "; + printf (HTML "%s:%d %s:%d</h3><pre>\n", $fasta_seq_fileA,$confA[$i],$fasta_seq_fileB,$confB[$i]); + } + ########### Sort and print members of A ############ + $nA = @membersA; + $nB = @membersB; + $nMAX = ($nA > $nB) ? $nA : $nB; + # Sort membersA inside the cluster by confidence: + for $m (0..($nA-1)){ + while($confPA[$membersA[$m]] < $confPA[$membersA[$m+1]]){ + $temp = $membersA[$m]; + $membersA[$m] = $membersA[$m+1]; + $membersA[$m+1] = $temp; + --$m if ($m > 1); + } + } + $paralogsA[$i] = join(' ',@membersA); # Put them back together + # Sort membersB inside the cluster by confidence: + for $m (0..($nB-1)){ + while($confPB[$membersB[$m]] < $confPB[$membersB[$m+1]]){ + $temp = $membersB[$m]; + $membersB[$m] = $membersB[$m+1]; + $membersB[$m+1] = $temp; + --$m if ($m > 1); + } + } + $paralogsB[$i] = join(' ',@membersB); # Put them back together + # Print to text file and to HTML file + for $m (0..($nMAX-1)){ + if ($m < $nA){ + if ($output){ + printf (OUTPUT "%-20s\t%.2f%%\t\t", $nameA[$membersA[$m]], (100*$confPA[$membersA[$m]])); + } + if ($html){ + print HTML "<B>" if ($confPA[$membersA[$m]] == 1); + printf (HTML "%-20s\t%.2f%%\t\t", $nameA[$membersA[$m]], (100*$confPA[$membersA[$m]])); + print HTML "</B>" if ($confPA[$membersA[$m]] == 1); + } + } + else { + printf (OUTPUT "%-20s\t%-7s\t\t", " ", " "); + printf (HTML "%-20s\t%-7s\t\t", " ", " ") if ($html); + } + if ($m < $nB){ + if ($output){ + printf (OUTPUT "%-20s\t%.2f%%\n", $nameB[$membersB[$m]], (100*$confPB[$membersB[$m]])); + } + if ($html){ + print HTML "<B>" if ($confPB[$membersB[$m]] == 1); + printf (HTML "%-20s\t%.2f%%", $nameB[$membersB[$m]], (100*$confPB[$membersB[$m]])); + print HTML "</B>" if ($confPB[$membersB[$m]] == 1); + print HTML "\n"; + } + } + else { + printf (OUTPUT "%-20s\t%-7s\n", " ", " ") if($output); + print HTML "\n" if ($html); + } + } + print OUTPUT $message if ($use_bootstrap and $output); + print HTML "$htmlmessage" if ($use_bootstrap and $html); + } + if ($output) { + close OUTPUT; + print "Output saved to file $outputfile\n"; + } + if ($html){ + close HTML; + print "HTML output saved to $htmlfile\n"; + } + + if ($table){ + $filename = "table." . $ARGV[0] . "-" . $ARGV[1]; + open F, ">$filename" or die; + print F "OrtoID\tScore\tOrtoA\tOrtoB\n"; + for $i(1..$o){ + print F "$i\t$ortoS[$i]\t"; + @members = split(/ /, $paralogsA[$i]); + for $m (@members){ + $m =~ s/://g; + printf (F "%s %.3f ", $nameA[$m], $confPA[$m]); + } + print F "\t"; + @members = split(/ /, $paralogsB[$i]); + for $m (@members){ + $m =~ s/://g; + printf (F "%s %.3f ", $nameB[$m], $confPB[$m]); + } + print F "\n"; + } + close F; + print "Table output saved to $filename\n"; + } + if ($mysql_table){ + $filename2 = "sqltable." . $ARGV[0] . "-" . $ARGV[1]; + open F2, ">$filename2" or die; + for $i(1..$o){ + @membersA = split(/ /, $paralogsA[$i]); + for $m (@membersA){ + # $m =~ s/://g; + if ($use_bootstrap && $bsA[$m]) { + printf (F2 "%d\t%d\t%s\t%.3f\t%s\t%d%\n", $i, $ortoS[$i], $ARGV[0], $confPA[$m], $nameA[$m], 100*$bsA[$m]); + } else { + printf (F2 "%d\t%d\t%s\t%.3f\t%s\n", $i, $ortoS[$i], $ARGV[0], $confPA[$m], $nameA[$m]); + } + } + @membersB = split(/ /, $paralogsB[$i]); + for $m (@membersB){ + # $m =~ s/://g; + if ($use_bootstrap && $bsB[$m]) { + printf (F2 "%d\t%d\t%s\t%.3f\t%s\t%d%\n", $i, $ortoS[$i], $ARGV[1], $confPB[$m], $nameB[$m], 100*$bsB[$m]); + }else { + printf (F2 "%d\t%d\t%s\t%.3f\t%s\n", $i, $ortoS[$i], $ARGV[1], $confPB[$m], $nameB[$m]); + } + } + } + close F2; + print "mysql output saved to $filename2\n"; + } + if ($show_times){ + ($user_time,,,) = times; + printf ("Finding bootstrap values and printing took %.2f seconds\n", ($user_time - $prev_time)); + printf ("The overall execution time: %.2f seconds\n", $user_time); + } + if ($run_blast) { + unlink "formatdb.log"; + unlink "$fasta_seq_fileA.phr", "$fasta_seq_fileA.pin", "$fasta_seq_fileA.psq"; + unlink "$fasta_seq_fileB.phr", "$fasta_seq_fileB.pin", "$fasta_seq_fileB.psq" if (@ARGV >= 2); + unlink "$fasta_seq_fileC.phr", "$fasta_seq_fileC.pin", "$fasta_seq_fileC.psq" if ($use_outgroup); + } + } + +############################################################## +# Functions: +############################################################## +sub clean_up { # Sort members within cluster and clusters by size +############################################################################################### Modification by Isabella 3 + + # Sort on index arrays with perl's built in sort instead of using bubble sort. + + $var = shift; + $totalA = $totalB = 0; + # First pass: count members within each cluster + foreach $i (1..$o) { + @membersA = split(/ /, $paralogsA[$i]); + $clusnA[$i] = @membersA; # Number of members in this cluster + $totalA += $clusnA[$i]; + $paralogsA[$i] = join(' ',@membersA); + + @membersB = split(/ /, $paralogsB[$i]); + $clusnB[$i] = @membersB; # Number of members in this cluster + $totalB += $clusnB[$i]; + $paralogsB[$i] = join(' ',@membersB); + + $clusn[$i] = $clusnB[$i] + $clusnA[$i]; # Number of members in given group + } + + # Create an array used to store the position each element shall have in the final array + # The elements are initialized with the position numbers + my @position_index_array = (1..$o); + + # Sort the position list according to cluster size + my @cluster_sorted_position_list = sort { $clusn[$b] <=> $clusn[$a]} @position_index_array; + + # Create new arrays for the sorted information + my @new_paralogsA; + my @new_paralogsB; + my @new_is_paralogA; + my @new_is_paralogB; + my @new_clusn; + my @new_ortoS; + my @new_ortoA; + my @new_ortoB; + + + # Add the information to the new arrays in the orer specifeid by the index array + for (my $index_in_list = 0; $index_in_list < scalar @cluster_sorted_position_list; $index_in_list++) { + + my $old_index = $cluster_sorted_position_list[$index_in_list]; + + if (!$clusn[$old_index]) { + $o = (scalar @new_ortoS) - 1; + last; + } + + $new_paralogsA[$index_in_list + 1] = $paralogsA[$old_index]; + $new_paralogsB[$index_in_list + 1] = $paralogsB[$old_index]; + $new_is_paralogA[$index_in_list + 1] = $is_paralogA[$old_index]; + $new_is_paralogB[$index_in_list + 1] = $is_paralogB[$old_index]; + $new_clusn[$index_in_list + 1] = $clusn[$old_index]; + $new_ortoA[$index_in_list + 1] = $ortoA[$old_index]; + $new_ortoB[$index_in_list + 1] = $ortoB[$old_index]; + $new_ortoS[$index_in_list + 1] = $ortoS[$old_index]; + } + + @paralogsA = @new_paralogsA; + @paralogsB = @new_paralogsB; + @is_paralogA = @new_is_paralogA; + @is_paralogB = @new_is_paralogB; + @clusn = @new_clusn; + @ortoS = @new_ortoS; + @ortoA = @new_ortoA; + @ortoB = @new_ortoB; + + # Create an array used to store the position each element shall have in the final array + # The elements are initialized with the position numbers + @position_index_array = (1..$o); + + # Sort the position list according to score + @score_sorted_position_list = sort { $ortoS[$b] <=> $ortoS[$a] } @position_index_array; + + # Create new arrays for the sorted information + my @new_paralogsA2 = (); + my @new_paralogsB2 = (); + my @new_is_paralogA2 = (); + my @new_is_paralogB2 = (); + my @new_clusn2 = (); + my @new_ortoS2 = (); + my @new_ortoA2 = (); + my @new_ortoB2 = (); + + # Add the information to the new arrays in the orer specifeid by the index array + for (my $index_in_list = 0; $index_in_list < scalar @score_sorted_position_list; $index_in_list++) { + + my $old_index = $score_sorted_position_list[$index_in_list]; + $new_paralogsA2[$index_in_list + 1] = $paralogsA[$old_index]; + $new_paralogsB2[$index_in_list + 1] = $paralogsB[$old_index]; + $new_is_paralogA2[$index_in_list + 1] = $is_paralogA[$old_index]; + $new_is_paralogB2[$index_in_list + 1] = $is_paralogB[$old_index]; + $new_clusn2[$index_in_list + 1] = $clusn[$old_index]; + $new_ortoA2[$index_in_list + 1] = $ortoA[$old_index]; + $new_ortoB2[$index_in_list + 1] = $ortoB[$old_index]; + $new_ortoS2[$index_in_list + 1] = $ortoS[$old_index]; + } + + @paralogsA = @new_paralogsA2; + @paralogsB = @new_paralogsB2; + @is_paralogA = @new_is_paralogA2; + @is_paralogB = @new_is_paralogB2; + @clusn = @new_clusn2; + @ortoS = @new_ortoS2; + @ortoA = @new_ortoA2; + @ortoB = @new_ortoB2; + +#################################################################################### End modification by Isabella 3 + +} +sub bootstrap{ + my $species = shift; + my $seq_id1 = shift; # Query ID from $species + my $seq_id2 = shift; # Best hit ID from other species + my $seq_id3 = shift; # Second best hit + # Retrieve sequence 1 from $species and sequence 2 from opposite species + my $significance = 0.0; + + if ($species eq $fasta_seq_fileA){ + $file1 = $fasta_seq_fileA; + $file2 = $fasta_seq_fileB; + } + elsif ($species eq $fasta_seq_fileB){ + $file1 = $fasta_seq_fileB; + $file2 = $fasta_seq_fileA; + } + else { + print "Bootstrap values for ortholog groups are not calculated\n"; + return 0; + } + + open A, $file1 or die; + $id = 0; + $print_this_seq = 0; + $seq1 = ""; + $seq2 = ""; + + $query_file = $seq_id1 . ".faq"; + open Q, ">$query_file" or die; + + while (<A>){ + if(/^\>/){ + ++$id; + $print_this_seq = ($id == $seq_id1)?1:0; + } + print Q if ($print_this_seq); + } + close A; + close Q; + ### + open B, $file2 or die; + $db_file = $seq_id2 . ".fas"; + open DB, ">$db_file" or die; + $id = 0; + $print_this_seq = 0; + + while (<B>){ + if(/^\>/){ + ++$id; + $print_this_seq = (($id == $seq_id2) or ($id == $seq_id3))?1:0; + } + print DB if ($print_this_seq); + } + close B; + close DB; + + system "$formatdb -i $db_file"; + # Use soft masking in 1-pass mode for simplicity. + system "$blastall -F\"m S\" -i $query_file -z 5000000 -d $db_file -p blastp -M $matrix -m7 | ./$blastParser 0 -a > $seq_id2.faa"; + + # Note: Changed score cutoff 50 to 0 for blast2faa.pl (060402). + # Reason: after a cluster merger a score can be less than the cutoff (50) + # which will remove the sequence in blast2faa.pl. The bootstrapping will + # then fail. + # AGAIN, updaye + + if (-s("$seq_id2.faa")){ + + system("java -jar $seqstat -m $matrix -n 1000 -i $seq_id2.faa > $seq_id2.bs"); # Can handle U, u + + if (-s("$seq_id2.bs")){ + open BS, "$seq_id2.bs" or die "pac failed\n"; + $_ = <BS>; + ($dummy1,$dummy2,$dummy3,$dummy4,$significance) = split(/\s+/); + close BS; + } + else{ + print STDERR "pac failed\n"; # if ($debug); + $significance = -0.01; + } + } + else{ + print STDERR "blast2faa for $query_file / $db_file failed\n"; # if ($debug); + $significance = 0.0; + } + + unlink "$seq_id2.fas", "$seq_id2.faa", "$seq_id2.bs", "$seq_id1.faq"; + unlink "formatdb.log", "$seq_id2.fas.psq", "$seq_id2.fas.pin", "$seq_id2.fas.phr"; + + return $significance; +} + +sub overlap_test{ + my @Fld = @_; + + # Filter out fragmentary hits by: + # Ignore hit if aggregate matching area covers less than $seq_overlap_cutoff of sequence. + # Ignore hit if local matching segments cover less than $segment_coverage_cutoff of sequence. + # + # $Fld[3] and $Fld[4] are query and subject lengths. + # $Fld[5] and $Fld[6] are lengths of the aggregate matching region on query and subject. (From start of first matching segment to end of last matching segment). + # $Fld[7] and $Fld[8] are local matching length on query and subject (Sum of all segments length's on query). + + $retval = 1; +# if ($Fld[3] >= $Fld[4]) { + if ($Fld[5] < ($seq_overlap_cutoff * $Fld[3])) {$retval = 0}; + if ($Fld[7] < ($segment_coverage_cutoff * $Fld[3])) {$retval = 0}; +# } + +# if ($Fld[4] >= $Fld[3]) { + if ($Fld[6] < ($seq_overlap_cutoff * $Fld[4])) {$retval = 0}; + if ($Fld[8] < ($segment_coverage_cutoff * $Fld[4])) {$retval = 0}; +# } + + # print "$Fld[3] $Fld[5] $Fld[7]; $Fld[4] $Fld[6] $Fld[8]; retval=$retval\n"; + + return $retval; +} + +sub do_blast + { + my @parameter=@_; + my $resultfile=@parameter[@parameter-1]; + my $go_to_blast=1; + my $resultfilesize; + if (-e $resultfile) + { + $resultfilesize= -s "$resultfile"; + if ($resultfilesize >10240) + { + $go_to_blast=0; + } + } + if ($go_to_blast) + { + if ($blast_two_passes) + { + do_blast_2pass(@parameter); + } else + { + do_blast_1pass(@parameter); + } + } + } + +sub do_blast_1pass { + my @Fld = @_; + + # $Fld [0] is query + # $Fld [1] is database + # $Fld [2] is query size + # $Fld [3] is database size + # $Fld [4] is output name + + # Use soft masking (low complexity masking by SEG in search phase, not in alignment phase). + system ("$blastall -F\"m S\" -i $Fld[0] -d $Fld[1] -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff > $Fld[4]"); +} + +sub do_blast_2pass { + + my @Fld = @_; + + # $Fld [0] is query + # $Fld [1] is database + # $Fld [2] is query size + # $Fld [3] is database size + # $Fld [4] is output name + + # assume the script has already formatted the database + # we will now do 2-pass approach + + # load sequences + + %sequencesA = (); + %sequencesB = (); + + open (FHA, $Fld [0]); + while (<FHA>) { + + $aLine = $_; + chomp ($aLine); + + $seq = ""; + + if ($aLine =~ />/) { + @words = split (/\s/, $aLine); + $seqID = $words[0]; + $sequencesA {$seqID} = ""; + } + else { + $sequencesA {$seqID} = $sequencesA {$seqID}.$aLine; + } + } + close (FHA); + + open (FHB, $Fld [1]); + while (<FHB>) { + $aLine = $_; + chomp ($aLine); + + $seq = ""; + + if ($aLine =~ />/) { + @words = split (/\s/, $aLine); + $seqID = $words[0]; + $sequencesB {$seqID} = ""; + } + else { + $sequencesB {$seqID} = $sequencesB {$seqID}.$aLine; + } + } + close (FHB); + + # Do first pass with compositional adjustment on and soft masking. + # This efficiently removes low complexity matches but truncates alignments, + # making a second pass necessary. + print STDERR "\nStarting first BLAST pass for $Fld[0] - $Fld[1] on "; + system("date"); + open FHR, "$blastall -C3 -F\"m S\" -i $Fld[0] -d $Fld[1] -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff|"; + + %theHits = (); + while (<FHR>) { + $aLine = $_; + chomp ($aLine); + @words = split (/\s+/, $aLine); + + if (exists ($theHits {$words [0]})) { + $theHits {$words [0]} = $theHits {$words [0]}." ".$words [1]; + } + else { + $theHits {$words [0]} = $words [1]; + } + + } + close (FHR); + + $tmpdir = "."; # May be slightly (5%) faster using the RAM disk "/dev/shm". + $tmpi = "$tmpdir/tmpi"; + $tmpd = "$tmpdir/tmpd"; + + # Do second pass with compositional adjustment off to get full-length alignments. + print STDERR "\nStarting second BLAST pass for $Fld[0] - $Fld[1] on "; + system("date"); + unlink "$Fld[4]"; + foreach $aQuery (keys % theHits) { + + # Create single-query file + open (FHT, ">$tmpi"); + print FHT ">$aQuery\n".$sequencesA {">$aQuery"}."\n"; + close (FHT); + + # Create mini-database of hit sequences + open (FHT, ">$tmpd"); + foreach $aHit (split (/\s/, $theHits {$aQuery})) { + print FHT ">$aHit\n".$sequencesB {">$aHit"}."\n"; + } + close (FHT); + + # Run Blast and add to output + system ("$formatdb -i $tmpd"); + system ("$blastall -C0 -FF -i $tmpi -d $tmpd -p blastp -v $Fld[3] -b $Fld[3] -M $matrix -z 5000000 -m7 | ./$blastParser $score_cutoff >> $Fld[4]"); + } + + unlink "$tmpi", "$tmpd", "formatdb.log", "$tmpd.phr", "$tmpd.pin", "$tmpd.psq"; +} + + +# Date Modification +# -------- --------------------------------------------------- +# +# 2006-04-02 [1.36] - Changed score cutoff 50 to 0 for blast2faa.pl. +# Reason: after a cluster merger a score can be less than the cutoff (50) +# which will remove the sequence in blast2faa.pl. The bootstrapping will +# then fail. +# - Fixed bug with index variable in bootstrap routine. +# +# 2006-06-01 [2.0] - Fixed bug in blast_parser.pl: fields 7 and 8 were swapped, +# it was supposed to print match_area before HSP_length. +# - Fixed bug in blastall call: -v param was wrong for the A-B +# and B-A comparisons. +# - +# - Changed "cluster" to "group" consistently in output. +# - Changed "main ortholog" to "seed ortholog" in output. +# - Replace U -> X before running seqstat.jar, otherwise it crashes. +# 2006-08-04 [2.0] - In bootstrap subroutine, replace U with X, otherwise seqstat +# will crash as this is not in the matrix (should fix this in seqstat) +# 2006-08-04 [2.1] - Changed to writing default output to file. +# - Added options to run blast only. +# - Fixed some file closing bugs. +# 2007-12-14 [3.0] - Sped up sorting routines (by Isabella). +# - New XML-based blast_parser. +# - New seqstat.jar to handle u and U. +# - Modified overlap criterion for rejecting matches. Now it agrees with the paper. +# 2009-04-01 [4.0] - Further modification of overlap criteria (require that they are met for both query and subject). +# - Changed bit score cutoff to 40, which is suitable for compositionally adjusted BLAST. +# - Added in 2-pass algorithm. +# 2009-06-11 [4.0] - Moved blasting out to subroutine. +# - Changed blasting in bootstrap subroutine to use unconditional score matrix adjustment and SEG hard masking, +# to be the same as first step of 2-pass blast. +# 2009-06-17 [4.0] - Compensated a Blast "bug" that sometimes gives a self-match lower score than a non-identical match. +# This can happen with score matrix adjustment and can lead to missed orthologs. +# 2009-08-18 [4.0] - Consolidated Blast filtering parameters for 2-pass (-C3 -F\"m S\"; -C0 -FF) +# 2009-10-09 [4.1] - Fixed bug that caused failure if Fasta header lines had more than one word.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/multiparanoid.pl Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,728 @@ +#!/usr/bin/perl +# +# Copyright (C) Andrey Alexeyenko and Erik Sonnhammer, 2006 +# For a license to use this program, contact Erik.Sonnhammer@cbc.su.se +############################################################################### +# Andrey Alexeyenko, Stockholm Bioinformatics Center, Albanova, Stockholm 2006 +# mailto:avalex99@mail.ru +############################################################################### +# The program is a postprocessor of ortholog tables produced by InParanoid. It combines pairwise InParanoid outputs (say, A-B, B-C, A-C) and makes multi-species clusters of orthologs ABC. + +use strict vars; +use DBI; +use Cwd; +require 5.002; + +our($inputdir, $makeunique, $indirect, $Pair, +@fields, @cutoffs, $output, @fields_out, $out, $algo, $debug, $SQLprefix, +%place, $nplaces, $source_by_cluster, $source_by_gene, $source_by_taken, $seeds, $use_bootstrap, $maxINPscore, $toMainOrthologOfSelf); +my($npairs, $i, $j, $ii, $ee, $gg, $nn, $oo, $s2, $s1, $nspec, $as, $newmain, @distances, @species, $sp1, $sp2, $clusno, $specletters, $a_pair, $a_cluster, $pms, $input, $output, $inputfile, $new, $thepair); +############################################################################### + +#FIRST OF ALL, set the variables $inputdir and $output below. MultiParanoid may not work without it!!! + +#Parameters to submit (may be abbreviated to the first letter): +# OBLIGATORY: +# -species: a list of several species connected with "+"; e.g. mouse+cat+dog; names of the genomes exactly how they are called both in file names and inside of files. NOTE: due to phylogenetic tree conflicts, order of the genomes slightly changes the result. The input files (the InParanoid output) should thus be called: +# sqltable.dog-cat, sqltable.dog-mouse etc. +#To use other file names, one should change the variable $SQLprefix below. + +# OPTIONAL: +# -cutoff: deprecated. But one can use it to tighten the clusters (to filter out weaker in paralogs by restricting with the distance to the seed orthologs); Example: " -cutoff 0.5 " or, for doing 3 different cluster versions at the same time: 0+0.2+0.5. Default: 0 (no cutoff). +# -algo: how to define between-paralogs distances while clustering. +# Options: onlymains (default and only reasonable) +# toallsorted +# -debug: 0 or 1; default: 0 +# -input: deprecated, now defined by -species parameter, and the input files shall have conventional names +# -output: deprecated, special message will tell the name of the output file +# -unique: 0 or 1; set to 1, if the duplicates (genes clustered twice) should be removed; default: 0 + +#The MultiParanoid output header: +### clusterID species gene is_seed_ortholog confidence_score species_in_cluster tree_conflict### + +#is (hopefully) self-explanatory. Though: + +# "is_seed_ortholog" means the protein was a seed ortholog in at least one InParanoid cluster. +# "confidence_score" is an average InParanoid score across the input clusters +# "tree_conflict" indicates that, from the point of view of different species, the number of inparalogs varied in at least one other species ("diff.numbers") or the numbers were the same, but the IDs differed ("diff.names"). + +#NOTE: s.c. 'distances' between cluster members are, in fact, similarities. Variables are called 'distances' for historical reasons. +# 'main orthologs' (defined by InParanoid) are better to be called 'seed orthologs'. + +# Settings: + +$algo = 'onlymains'; +$debug = 0; +$use_bootstrap = 0; +$maxINPscore = 1; +$toMainOrthologOfSelf = 0; +$indirect = 1; #this option allows using the currently default InParanoid output, were seed orthologs are recognized by the feature "IPscore = 1.00". Otherwise ($indirect = 0) a special format should be used (needs a change in the InParanoid script and is now deprecated). + +if ($indirect) { +$SQLprefix = 'sqltable.'; +$inputdir = './'; #change it respectively +$place{'clusterID'} = 0; #these should correspond to the current InParanoid output format +$place{'BLASTscore'} = 1; +$place{'species'} = 2; +$place{'IPscore'} = 3; +$place{'gene'} = 4; +$place{'main'} = 5; +$place{'pair'} = 6; +} +else { +$SQLprefix = 'ava_sql'; +$inputdir = 'disk/dir/multiparanoid/'; #change it respectively +###The older, ava_sql, version: +$place{'clusterID'} = 0; +$place{'BLASTscore'} = 1; +$place{'pair'} = 2; +$place{'species'} = 3; +$place{'gene'} = 4; +$place{'IPscore'} = 5; +$place{'main'} = 6; +#$indirect = 1 if !defined($place{'main'}); +} + +$pms = parseParameters(join(' ', @ARGV)); +if (!$pms->{'s'}) {die "No '+'-delimited input species list (pointing to INPARANOID output) specified. Check parameter '-species'... ";} +@species = split(/\+/, $pms->{'s'}); #list of species (genomes) as they appear in the file names. Default: no default +print "Species list: $pms->{'s'}\n is treated as species ".join(', ', @species)."\n"; +#print "Genomes @species are being processed...\n"; +$makeunique = 0;#also, mind the variables $inputdir and $output below +$makeunique = $pms->{'u'}; + +$specletters = fstLetters(@species); +# $output = 'diskZ/MultiparanoidOuput/'.$specletters.'.MP.sql' if !$output; #change it respectively + +$output = './MP.Cluster' ; #change it respectively + + +@cutoffs = split(/\+/, $pms->{'c'}); #Can restrict included in-paralogs by their distance to the main ones (see example below). Default: 0 (no cutooff) +#@cutoffs = (0, 0.25, 0.5, 0.75, 1); +@cutoffs = (0) if !exists($cutoffs[0]); + +$algo = $pms->{'a'} if !$algo; #which algorithm to use. Default: onlymains +$debug = $pms->{'d'}; #degree of debug messages' output. Default: no +$output = $pms->{'o'} if !$output; +$makeunique = $pms->{'u'} if !$makeunique; + +$nplaces = scalar(keys(%place)); +@fields_out = (clusterID, species, gene, is_seed_ortholog, confidence_score, species_in_cluster, tree_conflict); +push @fields_out, 'cutoff' if scalar(@cutoffs) > 1; +$nspec = scalar(@species); +open OUT, ">$output"; + +print "Genome pairs found and used: \n" if $debug > 0; +for ($s1 = 0; $s1 < $nspec; $s1++) { +for ($s2 = 0; $s2 < $nspec; $s2++) { + next if ($species[$s1] eq $species[$s2]); +$Pair = $species[$s1].'-'.$species[$s2]; +$inputfile = $inputdir.$SQLprefix.$Pair; +if (!system("ls $inputfile")) { +loadData($inputfile); +$npairs++; +}}} +die "Not enough (or too many) species pair files for ".join(", ", @species)."\n" if $npairs != ($nspec * ($nspec - 1)) / 2; +$clusno = 1; + +for $sp1(@species) { +for $sp2(@species) { +$thepair = "$sp1-$sp2"; + +next if (!$source_by_cluster->{$thepair} or ($sp1 eq $sp2)); +print "Analyzing $sp1 vs. $sp2: " if $debug; +print scalar(keys(%{$source_by_cluster->{$thepair}}))." clusters for this pair...\n" if $debug; +for $a_cluster(keys(%{$source_by_cluster->{$thepair}})) { +undef $new; undef $seeds; +$seeds = findOrthos($a_cluster, $thepair); + +#find recordset of orthologs of the both species (for cluster a) from the pairwise subset i-j +next if !defined($seeds->[0]); +LINE1:for $gg(@{$seeds}) { +next if (!$gg->[$place{'main'}] or $gg->[$nplaces]); +$gg->[$nplaces] = 1; +for $a_pair(keys(%{$source_by_cluster})) { +if (($a_pair =~ $gg->[$place{'species'}]) and ($gg->[$place{'pair'}] ne $a_pair)) { +$new = findNewOrtho($gg->[$place{'gene'}], $a_pair); +undef $newmain; +if (defined($new)) { +for $nn(@{$new}) { +push(@{$seeds}, $nn) if isNew($seeds, $nn); +$newmain = 1 if $nn->[$place{'main'}]; +} +redo LINE1 if $newmain; +}}}} + +makeClusters($seeds, \@cutoffs, \@species, $clusno++, treeConflict($seeds), $specletters); +}}} +if ($clusno > 20) { +print OUT join("\t", @fields_out)."\n"; +for $oo(@{$out->{'byRecord'}}) { +print OUT join("\t", @{$oo})."\n" if defined($oo); +} +print "Making ortholog clusters of ".join(', ', @species)." done. The result is in file $output\n"; +} +else { +die "No output has been produced..." +} +########################################################################### + +sub parseParameters { +my($parameters) = @_; +my($key, $value, $pars, %pars); + +#print "$parameters\n"; +$_ = $parameters; + while (m/\-(\w+)\s+([a-zA-Z0-9._=+]+)/g) { +next if ((lc($2) eq 'f') or (lc($2) eq 'false') or (lc($2) eq 'no') or (lc($2) eq '0')); +$pars{substr($1, 0, 1)} = $2; +} +if (scalar(keys(%pars)) < 1 or !$pars{'s'} or m/\-h/) { +print "\nNOT ENOUGH PARAMETERS!\nOBLIGATORY: +-species: a list of several species connected with '+'; e.g. mouse+cat+dog; names of the genomes exactly how they are called both in file names and inside of files. NOTE: due to phylogenetic tree conflicts, order of the genomes slightly changes the result. The input files (the InParanoid output) should thus be called: +sqltable.dog-cat, sqltable.dog-mouse etc. +To use other file names, one should change the variable \$SQLprefix inside the scrtipt. +\nOPTIONAL: +\n-cutoff: deprecated. But one can use it to tighten the clusters (to filter out weaker in paralogs by restricting with the distance to the seed orthologs); Example: ' -cutoff 0.5 ' or, for doing 3 different cluster versions at the same time: 0+0.2+0.5. Default: 0 (no cutoff). +\n-algo: how to define between-paralogs distances while clustering. + Options: onlymains (default and only reasonable) + toallsorted +\n-debug: 0 or 1; default: 0 +\n-input: deprecated, now defined by -species parameter and the variables \$inputdir and \$SQLprefix, and the input files shall have conventional names +\n-output: deprecated, special message will tell the name of the output file +\n-unique: 0 or 1; set to '0' if the duplicates (genes clustered twice) should BE REMOVED, to '1' otherwise; default: 0\n\n"; exit; +} +while (($key, $value) = each(%pars)) { +print "Parameter $key = $value;\n"; +} + +return(\%pars); +} + +sub treeConflict { #tells if there is a discrepancy, for species B, between 2-species trees A-B and B-C +my($seeds) = @_; +my($i, $ii, $jj, $kk, $sets); + +for $ii(@{$seeds}) { +push @{$sets->{$ii->[$place{'species'}]}->{$ii->[$place{'pair'}]}}, $ii->[$place{'gene'}]; +} + +for $ii(keys(%{$sets})) { +for $jj(keys(%{$sets->{$ii}})) { +for $kk(keys(%{$sets->{$ii}})) { +next if $jj eq $kk; +if (scalar(@{$sets->{$ii}->{$jj}}) != scalar(@{$sets->{$ii}->{$kk}})) {return('diff. numbers');} +sort {$a <=> $b} @{$jj}; +sort {$a <=> $b} @{$kk}; +for ($i = 0; $i < scalar(@{$sets->{$ii}->{$jj}}); $i++) { +if ($sets->{$ii}->{$jj}->[$i] ne $sets->{$ii}->{$kk}->[$i]) { +return('diff. names'); +}}}}} +return(undef); +} + +sub makeClusters { +my($seeds, $cutoffs, $species, $clusno, $conflict, $param) = @_; +my($clo, $le, $gg, @included, $distances, $specset, $cco); + +array2StdOut($seeds, "Primers") if $debug; +if ($algo eq 'onlymains') { +$distances = pairWiseDist2Main($seeds); +} +elsif ($algo eq 'toallsorted') { +$distances = pairWiseDist2All($seeds); +} +else {die "Algorithm is not set...";} +array2StdOut(setUnique($seeds), "Unique") if $debug; +for $cco(@{$cutoffs}) { +undef @included; + +@included = firstMains($seeds); +@included = @{setUnique(\@included)} if defined(@included); +array2StdOut(\@included, 'Mains') if $debug; +goto LINE3 if $cco == 1; + +if ($algo eq 'onlymains') { +push @included, @{getOthers( + setUnique($seeds), + $distances, + \@included, + $cco)}; +} +elsif ($algo eq 'toallsorted') { +@included = @{getClosests( + setUnique($seeds), + $distances, + \@included, + $cco)};} +LINE3:array2StdOut(\@included, "Cluster with cut-off $cco") if $debug; +LINE4: $specset = join('', fstLetters(sort(listSpecies(\@included, $species)))); +$conflict = (($conflict) ? $conflict : 'No'); +for $gg(@included) { +next if $makeunique and alreadyClustered($gg, $cco); +$ii = scalar(@{$out->{'byRecord'}}); +$out->{'byName'}->{$gg->[$place{'gene'}]}->{$cco} = $ii; + +for $ee($clusno, + $gg->[$place{'species'}], + $gg->[$place{'gene'}], + $gg->[$place{'main'}], + $gg->[$place{'IPscore'}], + $specset, + $conflict) {push @{$out->{'byRecord'}->[$ii]}, $ee;} +push @{$out->{'byRecord'}->[$ii]}, $cco if scalar(@cutoffs) > 1; + +} +#$out .= join("\t", (100 * $cco, $clusno, $gg->[$place{'species'}], $gg->[$place{'gene'}], $gg->[$place{'main'}], $gg->[$place{'IPscore'}], $specset, $conflict, $param, $algo))."\n" if !$makeunique or !alreadyClustered($gg); +} +return; +} + + +sub fstLetters { +my(@list) = @_; +my($letters, $ii, $firstLetter, $lastLetter); + +return(join('-', sort {$a cmp $b} @list)); +$firstLetter = 0; +for $ii(@list) { +$lastLetter = length($ii) - $firstLetter; +$letters .= "_".substr($ii, $firstLetter, $lastLetter); +} + +return $letters; +} + +sub listSpecies { +my($ar, $species) = @_; +my($already, %already, $ii, @flags); +# checks for species present in the current cluster + +for $ii(@{$ar}) { +push(@flags, $ii->[$place{'species'}]) if !$already{$ii->[$place{'species'}]}; +$already{$ii->[$place{'species'}]} = 1; +} +return(@flags); +} + +sub loadData { +my($inputfile) = @_; +my($line, @line, $i); + +open(IN, $inputfile); +print "Loading $inputfile...\n"; +while (<IN>) { +@line = split /\s+/; +#$place{'BLASTscore'}, $place{'species'}, $place{'IPscore'}, $place{'main'} +for $i(0..scalar(keys(%place))) { +if ($indirect) { +if ($i == $place{'main'}) { +$source_by_cluster->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = ($line[$place{'IPscore'}] == $maxINPscore) ? 1 : 0; +$source_by_gene->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = ($line[$place{'IPscore'}] == $maxINPscore) ? 1 : 0; +next; +} +if ($i == $place{'pair'}) { +$source_by_cluster->{$Pair}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = $Pair; +$source_by_gene->{$Pair}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = $Pair; +next; +}} +$source_by_cluster->{$indirect ? $Pair : $line[$place{'pair'}]}->{$line[$place{'clusterID'}]}->{$line[$place{'gene'}]}->[$i] = $line[$i]; +$source_by_gene->{$indirect ? $Pair :$line[$place{'pair'}]}->{$line[$place{'gene'}]}->{$line[$place{'clusterID'}]}->[$i] = $line[$i]; +}} +return; +} + +sub findOrthos { #initially picks up orthologs in the first pair of genomes +my($a, $thepair) = @_; +my($set, $sm, $i, $newel); + +return(undef) if $source_by_taken->{$thepair}->{$a}; +for (keys(%{$source_by_cluster->{$thepair}->{$a}})) { +$newel = $#{$seeds} + 1; +for $i(0..($nplaces - 1)) { +$seeds->[$newel]->[$i] = $source_by_cluster->{$thepair}->{$a}->{$_}->[$i]; +} +$source_by_taken->{$thepair}->{$a} = 1; +} +return $seeds; +} + +sub findNewOrtho { #adds orthologs from the remaining genome pairs +my($g, $thepair) = @_; +my($new, $ii, $j, $jj, $sm, $newel); + +for (keys(%{$source_by_gene->{$thepair}->{$g}})) { +next if $source_by_taken->{$thepair}->{$_}; +$newel = $#{$new} + 1; +for $jj(@{$source_by_gene->{$thepair}->{$g}->{$_}}) { +$new->[$newel]->[$j] = $jj; +$j++; +} +} +if (scalar(@{$new}) == 1) { +$new = findOrthos($new->[0]->[$place{'clusterID'}], $new->[0]->[$place{'pair'}]); + if (scalar(@{$new}) > 1) { +return($new);} + else {print "Unpaired seed ortholog...$new->[0]->[$place{'gene'}]\n";} +} +elsif (scalar(@{$new}) > 1) { + print "Multiple seed orthologs...\nPair '$thepair'\tGene '$g'"; +} +return(undef); +} + +sub isNew { +my($seeds, $new) = @_; +my($ii); + +for $ii(@{$seeds}) { +if (($ii->[$place{'gene'}] eq $new->[$place{'gene'}]) and ($ii->[$place{'pair'}] eq $new->[$place{'pair'}])) { +return(undef); +} +} +return(1); +} + +sub pairWiseDist2All { #between-gene distances for algortihm 'ToAllSorted'; deprecated +my($seeds) = @_; +my($i, $j, $ii, $jj, $new, $old, $same, $le, $distances, $ke, $va); + +$le = scalar(@{$seeds}); + +for ($i = 0; $i < $le; $i++) { +$ii = $seeds->[$i]->[$place{'gene'}]; +#next if $distances->{$ii}->{$jj}; +for ($j = 0; $j < $i; $j++) { +$jj = $seeds->[$j]->[$place{'gene'}]; +undef $same; undef $old; undef $new; +$same = 1 if ($seeds->[$i]->[$place{'species'}] eq $seeds->[$j]->[$place{'species'}]); +if ($same) { +$old = $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]};} +else {$old = $distances->{$ii}->{$jj};} + +next if ($seeds->[$i]->[$place{'pair'}] ne $seeds->[$j]->[$place{'pair'}]); +#THEY DESCRIBE THE SAME PAIR OF GENOMES +if ($same) { +#THEY ARE FROM THE SAME SPECIES +#next if $distances-> {$ii}->{$jj}->{$seeds->[$i]->[$place{'species'}]}; + if (!$seeds->[$i]->[$place{'main'}] and !$seeds->[$j]->[$place{'main'}]) { #BOTH ARE IN-PARALOGS + $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$i]->[$place{'IPscore'}] * $seeds->[$j]->[$place{'IPscore'}]; + } + elsif ($seeds->[$i]->[$place{'main'}]) { #THIS ONE IS IN-PARALOG + $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$j]->[$place{'IPscore'}]; + } + elsif ($seeds->[$j]->[$place{'main'}]) { #THIS ONE IS IN-PARALOG + $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $seeds->[$i]->[$place{'IPscore'}]; + } + else { #BOTH ARE SEED ORTHOLOGS + $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]} = $distances->{$jj}->{$ii}->{$seeds->[$i]->[$place{'pair'}]} = $maxINPscore; + } +$new = $distances->{$ii}->{$jj}->{$seeds->[$i]->[$place{'pair'}]}; +} +else { +#THEY ARE FROM THE DIFFERENT SPECIES +if ($seeds->[$i]->[$place{'main'}] and $seeds->[$j]->[$place{'main'}]) #BOTH ARE SEED ORTHOLOGS +{ + $distances->{$ii}->{$jj} = inpDist($seeds, $i, $j); + $distances->{$jj}->{$ii} = inpDist($seeds, $j, $i); +} +elsif ($seeds->[$i]->[$place{'main'}] or $seeds->[$j]->[$place{'main'}]) #STILL, ONE OF THEM IS IN-PARALOG (NO MATTER WHICH) +{ + $distances->{$ii}->{$jj} = + $distances->{$jj}->{$ii} = +inpDist ($seeds, $i, $j) * inpDist ($seeds, $j, $i) ; +} +else +#BOTH ARE IN-PARALOGS +{ + $distances->{$ii}->{$jj} = +mainDist($seeds, $i, $j) * +inpDist ($seeds, $i, $j) * +inpDist ($seeds, $j, $i) ; + $distances->{$jj}->{$ii} = +mainDist($seeds, $j, $i) * +inpDist ($seeds, $i, $j) * +inpDist ($seeds, $j, $i) ; +} +} +$new = $distances->{$ii}->{$jj}; +if (!$same) {print "Distance $ii-$jj is $new\n" if $debug; print "Distance $jj-$ii is $new\n" if $debug;} +else { +if ($debug) { +print "Distance $ii-$jj is\n"; +while (($ke, $va) = each(%{$distances->{$ii}->{$jj}})) { +print "\tfor $ke: $va\n"; +} +} +} +print "Distance $ii-$jj is not equal to old one $old\n" if ($old and ($old != $new)); +} +} +#for ($k = 0; $k < $le; $k++) {$distances->[$k]->[$k] = 1;} +return($distances); +} + +sub pairWiseDist2Main ($) {#between-gene distances for algortihm 'OnlyMains' +my($seeds) = @_; +my($le, $i, $j, $ii, $jj, $new, $old, $distances); + +$le = scalar(@{$seeds}); +#$le2 = scalar(@{$species}); +for ($i = 0; $i < $le; $i++) { +$ii = $seeds->[$i]->[$place{'gene'}]; +for ($j = 0; $j < $i; $j++) { +$jj = $seeds->[$j]->[$place{'gene'}]; +undef $old; undef $new; +$old = $distances->{$ii}->{$jj}; +next if ($seeds->[$i]->[$place{'pair'}] ne $seeds->[$j]->[$place{'pair'}]); +next if ($seeds->[$i]->[$place{'main'}] and $seeds->[$j]->[$place{'main'}]); #SEED ORTHOLOGS WILL BE CLUSTERED ANYWAY +if ($seeds->[$i]->[$place{'species'}] eq $seeds->[$j]->[$place{'species'}]) { +#if (!$use_bootstrap) {$distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} = $maxINPscore ;next;} +next if !$toMainOrthologOfSelf; +$distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} = +$seeds->[$j]->[$place{'IPscore'}] if $seeds->[$i]->[$place{'main'}]; +$distances->{$ii}->{$jj} = $distances->{$jj}->{$ii} = +$seeds->[$i]->[$place{'IPscore'}] if $seeds->[$j]->[$place{'main'}]; +} #THEY ARE FROM THE SAME SPECIES AND THE DISTANCE IS NOT USED +else { #THEY ARE FROM DIFFERENT SPECIES +if ($seeds->[$i]->[$place{'main'}] or $seeds->[$j]->[$place{'main'}]) { +#ONE OF THEM IS SEED ORTHOLOG (NO MATTER WHICH) + $distances->{$ii}->{$jj} = + $distances->{$jj}->{$ii} = +inpDist ($seeds, $i, $j) * +inpDist ($seeds, $j, $i) ; +} +else { +#BOTH ARE IN-PARALOGS + $distances->{$ii}->{$jj} = +mainDist($seeds, $i, $j) * +inpDist ($seeds, $i, $j) * +inpDist ($seeds, $j, $i) ; + $distances->{$jj}->{$ii} = +mainDist($seeds, $j, $i) * +inpDist ($seeds, $i, $j) * +inpDist ($seeds, $j, $i) ; +} +} +$new = $distances->{$ii}->{$jj}; +print "Distance $ii-$jj is $new\n" if $debug; +print "Distance $jj-$ii is $new\n" if $debug; +print "Distance $ii-$jj is not equal to old one $old\n" if ($old and ($old != $new)); +} +} +return($distances); +} + +sub mainDist ($$$) { #finds distances between two main (seed) orthologs +my($seeds, $from, $to) = @_; +my($ii, $pair1, $pair2, $sum, $nu, $nc); + +return($maxINPscore) if !$use_bootstrap; #if we think bootstrap values are not relevant in calculating the distance, then every Main-Main distance should bemaxINPscore = 1 (and this is the default) +$pair1 = $seeds->[$from]->[$place{'species'}].'-'.$seeds->[$to]->[$place{'species'}]; +$pair2 = $seeds->[$to]->[$place{'species'}].'-'.$seeds->[$from]->[$place{'species'}]; +$sum = $nc = $nu = 0; + +for $ii(@{$seeds}) { + if ( + ($ii->[$place{'main'}]) and + (($ii->[$place{'pair'}] eq $pair1) or ($ii->[$place{'pair'}] eq $pair2)) and + ($ii->[$place{'species'}]) eq $seeds->[$from]->[$place{'species'}]) { + $nc++; + $sum += $ii->[$place{'IPscore'}]; + } +} + +if (!$nc) { +print "Distance $seeds->[$from]->[$place{'gene'}]-$seeds->[$to]->[$place{'gene'}] (pair $pair1 or $pair2) not found\n"; +return(undef); +} +else +{ +return($sum / $nc); +} +} + +sub inpDist ($$$) { #finds a distance from main (seed) ortholog to in-paralog +my($seeds, $from, $to) = @_; +my($le, $ii, $pair1, $pair2, $sum, $nu, $nc); + +return($maxINPscore) if ($seeds->[$from]->[$place{'main'}] and !$use_bootstrap); +$le = scalar(@{$seeds}); +$pair1 = $seeds->[$from]->[$place{'species'}].'-'.$seeds->[$to]->[$place{'species'}]; +$pair2 = $seeds->[$to]->[$place{'species'}].'-'.$seeds->[$from]->[$place{'species'}]; +$sum = $nc = $nu = 0; +for $ii(@{$seeds}) { + if ( + ($ii->[$place{'gene'}] eq $seeds->[$from]->[$place{'gene'}]) and + (($ii->[$place{'pair'}] eq $pair1) or + ($ii->[$place{'pair'}] eq $pair2))) { +return($ii->[$place{'IPscore'}]); + } +} +print "Distance $seeds->[$from]->[$place{'gene'}]-$seeds->[$to]->[$place{'gene'}] not found\n"; +return undef; +} + +sub sortBy ($$) { +my($num, $ar) = @_; +my($i, $max, $maxnum, @sorted); + +#array2StdOut($ar, 'Before'); + +LINE001: while (scalar(@{$ar}) > 0) { +if (scalar(@{$ar}) == 1 and !$ar->[$i]->[$num]) { +last; +} +$max = 0; +undef $maxnum; +for ($i = 0; $i < scalar(@{$ar}); $i++) { +last if !$ar->[$i]->[$num]; +if ($ar->[$i]->[$num] > $max) { +$max = $ar->[$i]->[$num]; +$maxnum = $i; +} +} +if (defined($maxnum)) { +push @sorted, splice(@{$ar}, $maxnum, 1); +next LINE001; +} +} +return(\@sorted); +} + +sub setUnique ($) { #removes duplicated entries from the (pre-)cluster +my($seeds) = @_; +my($i, $j, $unique, $sds); + +$sds->[$place{'clusterID'}] = $seeds->[$place{'clusterID'}]; +for ($i = 1; $i < scalar(@{$seeds}); $i++) { +$unique = 1; +for ($j = 0; $j < scalar(@{$sds}); $j++) { +if (($seeds->[$i]->[$place{'species'}] eq $sds->[$j]->[$place{'species'}]) and ($seeds->[$i]->[$place{'gene'}] eq $sds->[$j]->[$place{'gene'}])) +{$unique = 0; +last;} +} +if ($unique) {push(@{$sds}, $seeds->[$i]);} +} +return($sds); +} + +sub isAlready ($$) { +my($as, $included) = @_; +my($ai); + +for $ai(@{$included}) { +return(1) if ($ai->[4] eq $as); +} +return(undef); +} + +sub alreadyClustered { #helps to avoid clustering same gene twice. Of two alternatives, the one is chosen where the gene is a main (seed) ortholog or was included first. +my($g, $cco) = @_; +my($ai); + +return(undef) if !defined($out->{'byName'}->{$g->[$place{'gene'}]}->{$cco}); + +if ($out->{'byRecord'}->[$out->{'byName'}->{$g->[$place{'gene'}]}->{$cco}]->[3]) { +return 1; +} +else { +undef $out->{'byRecord'}->[$out->{'byName'}->{$g->[$place{'gene'}]}->{$cco}]; +return(undef); +}} + +sub firstMains { #seed orthologs are included unconditionally +my($seeds) = @_; +my($i, $le, @included); + +$le = scalar(@{$seeds}); +for ($i = 0; $i < $le; $i++) { +# and ($seeds->[$i]->[$place{'IPscore'}] == 1) +if ($seeds->[$i]->[$place{'main'}]) { +push @included, $seeds->[$i]; +} +} +return(@included); +} + +sub getOthers ($$$$) { #retains only genes with distance (well, it is in fact a SIMILARITY) above the cut-off $cco. If it is not set, or equals to 0, the procedure just returns IPscore as a reference - to be saved in the output file. +my($seeds, $distances, $included, $cco) = @_; +my($i, $le, $les2, $nc, $ii, $jj, $dist, $ad, $nai, @others); + +$le = scalar(@{$seeds}); +for $ii(@{$seeds}) { +next if isAlready($ii->[$place{'gene'}], $included); +undef $dist; undef $nc; +for $jj(@{$included}) { +next if (($jj->[$place{'species'}] eq $ii->[$place{'species'}]) and !$toMainOrthologOfSelf); +if (!$distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}) { +print "The distance between $ii->[$place{'gene'}] and $jj->[$place{'gene'}] does not exist...\n" if $debug; +next; +} +if ($jj->[$place{'main'}]) { +print "( $ii->[$place{'gene'}] - $jj->[$place{'gene'}] ) + " if $debug; +$dist += $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}; +$nc++; +} +} +print " / $nc = ".$dist/$nc."\n" if $debug; +return(undef) if !$nc; +if ($dist / $nc >= $cco) { +push @others, $ii; +$others[$#others]->[$place{'IPscore'}] = $dist / $nc; +} +} +return(\@others); +} + +sub getClosests ($$$$) { #used only in the deprecated 'ToAllSorted' algorithm +my($seeds, $distances, $included, $cco) = @_; +my($ni, $ii, $jj, $kk, $dist, $gotnew, $toadd, $nt, $max); + +do { +undef $gotnew; +for $ii(@{$seeds}) { +next if isAlready($ii->[$place{'gene'}], $included); +undef $ni; undef $dist; undef $max; +for $jj(@{$included}) { +undef $nt; undef $toadd; +print "$ii->[$place{'gene'}] vs $jj->[$place{'gene'}] \n" if $debug; +if ($ii->[$place{'species'}] ne $jj->[$place{'species'}]) { +$toadd = $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}; +} +else { +for $kk(keys(%{$distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}})) { +$toadd += $distances->{$ii->[$place{'gene'}]}->{$jj->[$place{'gene'}]}->{$kk}; $nt++; +} +$toadd = $toadd / $nt if $nt; +} +$dist += $toadd; $ni++ if $toadd; +print "A distance for $ii->[$place{'gene'}] and $jj->[$place{'gene'}] does not exist...\n" if (!$toadd and $debug); +} +if ($ni and ($dist / $ni > $max->['IPscore'])) { +$max = $ii; $max->['IPscore'] = $dist / $ni; +} +if ($max->['IPscore'] >= $cco) { +print "Included $max->[4] as $dist / $ni > $cco\n" if $debug; +push(@{$included}, $max); +$gotnew = 1; +} +} +} while ($gotnew); +return($included); +} + +sub array2StdOut { #for debug purpose only +my($ar, $name) = @_; +my($ii, $jj); + +print "$name: \n"; +return(0) if !scalar(@{$ar}); +for $ii(@{$ar}) { +for $jj(@{$ii}) { +print "$jj\t" if defined($jj); +} +print "\n"; +} +return undef; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/readme.txt Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,22 @@ +-=- PGAP 1.2.1 README -=- +PGAP is a Pan-Genomes Analysis Pipeline. + +For the installation and usage of PGAP, please refer to manual.pdf. + +There is also a brief tutorial in manual.pdf. + +-=- Change Log 1.2.1 -=- +1.Fix the bug in Converter_NCBINewFormatData.pl. + +-=- Change Log 1.2.0 -=- +1.A new script (Converter_NCBINewFormatData.pl) was developed for converting new format bacterial genomic data from NCBI at ftp://ftp.ncbi.nlm.nih.gov/genomes/all + +-=- Change Log 1.12 -=- +1.Fix the bug in the Perl converter Converter_draft.pl. + +-=- Change Log 1.11 -=- +1.Fix the bug in the two Perl converters (Converter_finished.pl and Converter_draft.pl). + +-=- Change Log 1.1 -=- +1.Add sampling strategy into the pan-genome analysis module. This strategy will efficiently reduce time cost and memory cost. +2.Revise the input data for SNP-based phylogenetic analysis
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP.xml Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,38 @@ +<tool id="PGAP" name="PGAP" version="1.0"> + <description>Pan-Genome Analysis Pipeline </description> + <requirements> + <requirement type="package" version="2.11.0">blast</requirement> + <requirement type="package" version="7.480">mafft</requirement> + <requirement type="package" version="14.137">mcl</requirement> + <requirement type="package" version="3.697">phylip</requirement> + <requirement type="package" version="1.7.2">perl-bioperl</requirement> +</requirements> --> + <command detect_errors="aggressive"><![CDATA[ + #import re + + ## Creates symlinks for each input file based on the Galaxy 'element_identifier' + ## Used so that a human-readable name appears in the output table (instead of 'dataset_xyz.dat') + #set $named_input_files = '' + #for $input_file in $input_files + ## Add single quotes around each input file identifier + #set $_input_file = "'{}'".format($input_file.element_identifier) + ln -s '${input_file}' ${_input_file} && + #set $named_input_files = $named_input_files + ',' + $_input_file + #end for + + perl ${__tool_directory__}/PGAP_wrapper2.pl -g $input_files -p $proteins -o $named_input_files -o $output + ]]></command> + + + + <inputs> + <param format="fasta" name="input_files" type="data" multiple="true" label="Coding genes FASTA files"/> + <param format="fasta" name="proteins" type="data" multiple="true" label="Protein FASTA files"/> + </inputs> + + <outputs> + <data format="txt" name="output" label="Pangenome presence absence matrix"/> + +</outputs> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP_wrapper2.pl Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,65 @@ +#!/usr/bin/perl + +use strict; +use Getopt::Long; +use Bio::SeqIO; +use File::Basename; +my $dirname = dirname(__FILE__); +use Cwd qw(cwd); +my $dir = cwd; + +my $usage = qq~Usage:$0 <args> [<opts>] + +where <args> are: + + -g, --genes <list of gene fasta files. Comma separated list> + -p, --proteins <list of protein fasta files. Comma separated list> + -o, --order <list of samples in the same order. Comma separated list> + -o, --out <output name> +~; +$usage .= "\n"; + +my ($genes,$proteins,$out,$order); + +GetOptions( +"genes=s" => \$genes, +"proteins=s" => \$proteins, +"out=s" => \$out, +"order=s" => \$order +); + + +die $usage + if ( !$proteins || !$genes || !$out || !$order); + +my @names = split(",",$order); +my @list; +mkdir("tmpdir$$"); +my @gene_files = split(/,/,$genes); +my $n = 0; +foreach my $gene_file(@gene_files){ + my $particule = $names[$i]; + system("cp $gene_file tmpdir$$/$particule.nuc"); + $n++; +} +$n = 0; +my @protein_files = split(/,/,$proteins); +foreach my $protein_file(@protein_files){ + my $particule = $names[0]; + system("cp $protein_file tmpdir$$/$particule.pep"); + open(F,"$protein_file"); + open(F2,">tmpdir$$/$particule.function"); + while(<F>){ + if (/>(.*)/){ + print F2 "$1 - unknown\n"; + } + } + close(F); + close(F2); + $n++; +} + +#chdir("$dirname/PGAP-1.2.1"); +my $cmd = "perl $dirname/PGAP-1.2.1/PGAP.pl --input tmpdir$$ --output outdir --cluster --pangenome --variation --evolution --function --strains ".join("+",@list)." --method GF"; +system($cmd); +system("cp -rf outdir/1.Orthologs_Cluster.txt $out");
