Mercurial > repos > dereeper > pgap
diff PGAP-1.2.1/PGAP.pl @ 0:83e62a1aeeeb draft
Uploaded
author | dereeper |
---|---|
date | Thu, 24 Jun 2021 13:51:52 +0000 |
parents | |
children | 70b7a5270968 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PGAP-1.2.1/PGAP.pl Thu Jun 24 13:51:52 2021 +0000 @@ -0,0 +1,2825 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Getopt::Long; + +### programs from BLAST +my $formatdb="formatdb"; +my $blastall="blastall"; + +### programs from mcl +my $mcl="mcl"; + +### programs from mafft +my $mafft="mafft"; + +### programs from PHYLIP +my $seqboot="seqboot"; +my $neighbor="neighbor"; +my $consense="consense"; +my $dnaml="dnaml"; +my $dnadist="dnadist"; +my $dnapars="dnapars"; + +my $count_tree=0; + +my $sampleSize=8000; # when calculate the pan-genome size, we will sample $sampleSize combinations + # if the total combination number is larger than $sampleSize for specific genomes + # Surely, the number of $sampleSize is, the larger, the better. + # However, the larger the $sampleSize is, the more time would be consumed. + # we suggest the range: 5000 ~ 20,000 + +##################################################################### +# DOn't modify the following code, unless you know their functions +##################################################################### + +my %opt=qw(); +GetOptions(\%opt,"strains:s","input:s","output:s","cluster!","pangenome!","variation!","evolution!","function!","method:s","thread:i","score:f","evalue:f","coverage:f","local:f","global:f","identity:f","bootstrap:i","help|h!"); + +my @usage=qq( +====== Pan-Genome Analysis Pipeline (PGAP) ====== + Version 1.2.1 + +Usage: perl PGAP.pl [Options] + +Options: + --strains String Input strains nicknames, and join them with '+', for example: A+B+C + --input String Input data directory + --output String Result output directory + + --cluster Run homologous gene clustering + --pangenome Run pan-genome analysis + --variation Run homologous clusters variation analysis + --evolution Run evolution analysis + --function Run Function analysis + + --method String GF for GeneFamily method, and MP for MultiParanoid method + for GF: fast, but not very accurate + evalue, score, indentity, coverage are employed + for MP: slow, but more accurate + score, coverage, local, global are employed + --thread Int Number of processors to use in blastall. [default:1] + --score Int Minimum score in blastall. [default:40] + --evalue Decimal Maximal E-value in blastall. [default:1e-10] + --coverage Decimal Minimum alignment coverage for two homologous proteins. [default:0.5] + --local Decimal Minimum local alignment overlap in MP method. [default:0.25] + --global Decimal Minimum global alignment overlap in MP method. [default:0.5] + --identity Decimal Minimum alignment indentity for two homologous proteins. [default:0.5] + --bootstrap Int Bootstrap times for phylogenetics tree. [default:1] + + --h or help Display this message +); + + +############# specified variable ############# +my $inputDIR; +my $outputDIR; +my $run_cluster; +my $run_pangenome; +my $run_variation; +my $run_evolution; +my $run_function; +my $method=""; +my $thread; +my $score; +my $identity; +my $evalue; +my $coverage; +my $global; +my $local; +my $bootstrap; + + +my %pep; +my %nuc; +my $spnum; +my @clusters; +my $Cluster; +my @SpecieCombination; +my @spID; +my %genenum; +my %aaAln; +my %ntAln; +my %cog; +my %description; +#my %aa4tree; ### AA sequence for Phylogenetic Tree +my %nt4tree; ### nucleotide sequence for Phylogenetic Tree +my @SNPPosition; ### SNP position +my $dieMessage="You did not run PGAP.pl in the program directory\n"; +my $section; + +######### common temporary variable ############# +my $i; +my $j; +my $line; +my %tmpHash; +my @tmp; +my $tmp; +my $key; +my @row; +my $inparacount; +my $ClusterID; +my $orth; +my @content; +my $clusterName; +my @xdata; +my @ydata; +my @fit; + +my $fit_A; +my $fit_A_interval; +my $fit_B; +my $fit_C; +my $fit_C_interval; +my $fit_Rsquare; + + + +#### check option + +my $opt_error=0; + +if ((scalar(keys %opt) ==0) or (exists($opt{"help"}))) + { + print join("\n",@usage)."\n"; + exit; + } + + +###################### public info +### strains name +my @species; +if (exists($opt{"strains"})) + { + @species=split(/\+/,$opt{"strains"}); + $spnum=scalar(@species); + }else + { + print "Please assign strains nick name!\n"; + exit; + } + +### input data directory + +if (exists($opt{"input"})) + { + $inputDIR=$opt{"input"}; + if ($inputDIR!~/\/$/) + { + $inputDIR=$inputDIR."/"; + } + }else + { + print "Please assign input data directory!\n\n"; + exit; + } +### output data directory + +if (exists($opt{"output"})) + { + $outputDIR=$opt{"output"}; + if ($outputDIR!~/\/$/) + { + $outputDIR=$outputDIR."/"; + } + }else + { + print "Please assign result output directory!\n\n"; + exit; + } + +###################### section info + +if (exists($opt{"cluster"})) + { + $run_cluster=1; + }else + { + $run_cluster=0; + } + +if (exists($opt{"pangenome"})) + { + $run_pangenome=1; + }else + { + $run_pangenome=0; + } + +if (exists($opt{"variation"})) + { + $run_variation=1; + }else + { + $run_variation=0; + } + +if (exists($opt{"evolution"})) + { + $run_evolution=1; + }else + { + $run_evolution=0; + } + +if (exists($opt{"function"})) + { + $run_function=1; + }else + { + $run_function=0; + } + +if ($run_cluster) + { + ### method + if (exists($opt{"method"})) + { + $method=uc($opt{"method"}); + if ($method!~/^GF$/ and $method!~/^MP$/) + { + print "Unknown method: ".$opt{"method"}."\n"; + exit; + } + }else + { + print "Please assign the cluster method!\n\n"; + exit; + } + + ##thread + if (exists($opt{"thread"})) + { + $thread=$opt{"thread"}; + if ($thread==0) + { + print "please assign an applicable thread value.\n"; + exit; + } + }else + { + $thread=1; + } + + ##score + if (exists($opt{"score"})) + { + $score=$opt{"score"}; + if ($score<=0) + { + print "please assign an applicable score value.\n"; + exit; + } + }else + { + $score=40; + } + + if ($method eq "GF") + { + ###identity + if (exists($opt{"identity"})) + { + $identity=$opt{"identity"}; + if ($identity>1 or $identity<=0) + { + print "identity should be 0 ~ 1 \n"; + exit; + } + }else + { + $identity=0.5; + } + + ###evalue + if (exists($opt{"evalue"})) + { + $evalue=$opt{"evalue"}; + }else + { + $evalue=1e-10; + } + + ###coverage + if (exists($opt{"coverage"})) + { + $coverage=$opt{"coverage"}; + if ($coverage>1 or $coverage<=0) + { + print "coverage should be 0 ~ 1 \n"; + exit; + } + }else + { + $coverage=0.5; + } + } + + + if ($method eq "MP") + { + ###global + if (exists($opt{"global"})) + { + $global=$opt{"global"}; + if ($global>1 or $global<=0) + { + print "global coverage should be 0 ~ 1 \n"; + exit; + } + }else + { + $global=0.5; + } + ###local + if (exists($opt{"local"})) + { + $local=$opt{"local"}; + if ($local<=0) + { + print "local coverage should be 0 ~ [global coverage value] \n"; + exit; + } + if ($local>$global) + { + print "local coverage should be less than global coverage!\n"; + exit; + } + }else + { + $local=0.25; + } + } + } + +if ($run_evolution) + { + if (exists($opt{"bootstrap"})) + { + $bootstrap=$opt{"bootstrap"}; + if ($bootstrap<=0) + { + print "please assign an applicable bootstrap value.\n"; + } + }else + { + $bootstrap=1; + } + } + +print "Program begin at ".localtime()."\n"; +print "The following are the parameters for current process:\n"; + print "Strains: ".join(",",@species)."\n"; + print "Input directory: $inputDIR\n"; + print "Output directory: $outputDIR\n"; +if ($run_cluster) + { + print "Cluster analysis: yes\n"; + print " Method: $method\n"; + print " Thread: $thread\n"; + if ($method eq "GF") + { + print " E-value: $evalue\n"; + print " Identity: $identity\n"; + print " Coverage: $coverage\n"; + print " Score: $score\n"; + } + if ($method eq "MP") + { + print " Local: $local\n"; + print " Global: $global\n"; + print " Score: $score\n"; + } + }else + { + print "Cluster analysis: no\n"; + } +if ($run_pangenome) + { + print "Pan-genome analysis: yes\n"; + }else + { + print "Pan-genome analysis: no\n"; + } +if ($run_variation) + { + print "Variation analysis: yes\n"; + }else + { + print "Variation analysis: no\n"; + } +if ($run_evolution) + { + print "Evolution analysis: yes\n"; + print " Bootstrap: $bootstrap\n"; + }else + { + print "Evolution analysis: no\n"; + } +if ($run_function) + { + print "Function analysis: yes\n"; + }else + { + print "Function analysis: no\n"; + } + +$section=$run_cluster.$run_pangenome.$run_variation.$run_evolution.$run_function; + +############################################### +# section 0) check input file and program +############################################### +if (!(-e $outputDIR)) + { + system("mkdir $outputDIR"); + } +system("chmod +rw $outputDIR"); + +if (!(-w $outputDIR)) + { + print "There is no WRITE permission in $outputDIR\n"; + exit; + } +@tmp=qw(); +&CheckInputFile(\@species,$inputDIR,$section,$method,\@tmp); +&CheckExtraProgram($section,$method,\@tmp); + +if (scalar(@tmp)>0) + { + open(R,">".$outputDIR."0.error.message"); + print R join("",@tmp)."\n"; + close(R); + print "error!\nlog are saved in ${outputDIR}0.error.message\n"; + exit; + } + + +############################################ +# section 1) cluster analysis +############################################ + +if ($run_cluster) + { + print "\n\n############################################\n"; + print "# section 1) cluster analysis\n"; + print "############################################\n\n\n"; + +#### cluster gene and return result to the array @clusters + + if ($method eq "MP") + { + print "Begin cluster gene with MP method ...\n"; + &MP(); + }else + { + print "Begin cluster gene with GF method ...\n"; + &GF(); + } + +#### output normal cluster format + + &FormatClusterOutPut(\@species,"${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + +#### Retrieve cluster + + &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + +##### gene distribution in each strains + %tmpHash=(); + &GeneDistribution(\@clusters,\%tmpHash); + + open(R,">${outputDIR}1.Gene_Distribution_By_Conservation.txt"); + print R "SharedBy_Strains\t".join("\t",@species)."\n"; + + for ($i=$spnum;$i>0;$i--) + { + print R $i; + for ($j=0;$j<$spnum;$j++) + { + if (exists($tmpHash{$i."|".$j})) + { + print R "\t".$tmpHash{$i."|".$j}; + }else + { + print R "\t0"; + } + } + print R "\n"; + } + + close(R); + + + }else + { + print "Homologous gene clustering is skipped!\n"; + } + + +if ($run_pangenome) + { + print "\n\n############################################\n"; + print "# section 2) Pan-genome analysis\n"; + print "############################################\n\n\n"; + +#### Retrieve cluster + &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + chomp(@clusters); + +#### convert file into 0-1 matrix + for ($line=0;$line<@clusters;$line++) + { + @row=split(/\t/,$clusters[$line]); + splice(@row,0,1); + for ($i=0;$i<@row;$i++) + { + if ($row[$i] eq "-") + { + $row[$i]=0; + }else + { + $row[$i]=1; + } + } + $clusters[$line]=join("\t",@row); + } + +#### fetch gene number of each strains +for ($i=0;$i<$spnum;$i++) + { + open(F,"$inputDIR$species[$i].pep"); + @tmp=<F>; + close(F); + @tmp=grep(/^>/,@tmp); + $genenum{$species[$i]}=scalar(@tmp); + } + +#### pan genome size and core genome size +print "Deducing pan genome size and core genome size for each composition...\n\n"; + +open(PAN,">${outputDIR}2.PanGenome.Data.txt"); +print PAN "ClusterConservation\tTotalGeneNumber\tPanGenome\tCoreGenome\n"; + +for ($i=1;$i<=scalar(@species);$i++) + { + #@SpecieCombination=&Combination(\@species,$i); + #@SpecieCombination=&Combination($spnum,$i); + if (&ChkCombinationValue($spnum,$i) !=0) ### transfer the array reference to the subroutine + { + &Combination($spnum,$i,\@SpecieCombination); ## if the combination number is less than sampleSize, then fecth all, else sample + }else + { + &SampleCombination($spnum,$i,\@SpecieCombination); + } + + foreach $key (@SpecieCombination) + { + ##### count total gene number in current combination + $tmp=0; + @spID=split(/\t/,$key); #### speices id in current combination + foreach (@spID) + { + $tmp=$tmp+$genenum{$species[$_]}; + } + ##### scan pangenome and coregenome + @tmp=split(/\t/,&PanGenomeNumber(\@spID)); + print PAN "$i\t$tmp\t".join("\t",@tmp)."\n"; + + } + } + +close(PAN); + +#### data fit + +#### for model A + +if ($spnum<3) + { + print "There are $spnum strains. For pan-genome function fitting, at least 3 strains data are required.\n"; + }else + { + open(R,">${outputDIR}2.PanGenome.Profile.txt"); + ##### genome number & pan-genome size + @xdata=qw(); + @ydata=qw(); + &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,2); + &SumData(\@xdata,\@ydata,"mean"); + ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_A(\@xdata,\@ydata); + print R "The relation bewteen genome number and pan-genome size\n\n"; + print R "Function model: y=A*x**B +C \n"; + print R "\ty denotes pan-genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n"; + print R "Fitting result:\n"; + print R "\ty = $fit_A *x**$fit_B + $fit_C\n"; + print R "\tR-square = $fit_Rsquare\n"; + print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n"; + print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n"; + + ##### total gene number & pan-genome size + #@xdata=qw(); + #@ydata=qw(); + #&ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,1,\@ydata,2); + #&SumDataByMedian(\@xdata,\@ydata); + #($fit_Rsquare, $fit_A, $fit_B, $fit_C)=&fit_model_A(\@xdata,\@ydata); + #print R "The relation bewteen total gene number and pan-genome size\n\n"; + #print R "$fit_Rsquare, $fit_A, $fit_B, $fit_C\n"; + #print R "\ty = $fit_A *x**$fit_B + $fit_C R-square = $fit_Rsquare\n"; + #print R "\tx: total gene number\n"; + #print R "\ty: pan-genome size\n\n\n\n\n"; + + ##### genome number & core genome + @xdata=qw(); + @ydata=qw(); + &ReadData2Array("${outputDIR}2.PanGenome.Data.txt",\@xdata,0,\@ydata,3); + &SumData(\@xdata,\@ydata,"mean"); + ($fit_Rsquare, $fit_A, $fit_A_interval, $fit_B, $fit_C, $fit_C_interval)=&fit_model_B(\@xdata,\@ydata); + print R "The relation bewteen genome number and core genome size\n\n"; + print R "Function model: y=A*exp(B*x) +C \n"; + print R "\ty denotes pan-genome size, x denotes genome number, and A, B, C are fitting parameters.\n\n"; + print R "Fitting result:\n"; + print R "\ty = $fit_A *exp($fit_B * x) + $fit_C R-square = $fit_Rsquare\n"; + print R "\tR-square = $fit_Rsquare\n"; + print R "\tA 95% confidence interval: ($fit_A - $fit_A_interval , $fit_A + $fit_A_interval)\n"; + print R "\tC 95% confidence interval: ($fit_C - $fit_C_interval , $fit_C + $fit_C_interval)\n\n\n\n\n"; + close(R); + } + + } + + + ############################################ + # section 3) CDS variation analysis + ############################################ + +if ($run_variation) + { + print "\n\n############################################\n"; + print "# section 3) CDS variation analysis\n"; + print "############################################\n\n\n"; + + #### Retrieve cluster + &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + chomp(@clusters); + + ## protein + system("rm -rf *.pep"); + &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file + system("cat *.pep > All.faa && rm -rf *.pep && mv All.faa All.pep"); + &ReadSequenceInToHash("All.pep",\%pep); + ## nucleic + system("rm -rf *.nuc"); + &PrepareFasta(\@species,$inputDIR,".nuc"); ###prepare nuc file + system("cat *.nuc > All.ffn && rm -rf *.nuc && mv All.ffn All.nuc"); + &ReadSequenceInToHash("All.nuc",\%nuc); + + ## scanning SNP + %nt4tree=(); + for ($i=0;$i<$spnum;$i++) + { + $nt4tree{"S".$i}=""; + } + + open(VAR,">${outputDIR}3.CDS.variation.txt"); + print VAR "ClusterID\tStrains_Number\tGene_Number\tPosition\taaType\tntType\tntProfile\tVariation type\n"; + + open(VA,">${outputDIR}3.CDS.variation.analysis.txt"); + print VA "ClusterID\tInDel Base\tNonsynonymous mutation\tSynonymous mutation\n"; + + for ($line=0;$line<@clusters;$line++) + { + @row=split(/\t|\,/,$clusters[$line]); + $ClusterID=$row[0]; + splice(@row,0,1); + @row=grep(/^S/,@row); + if (scalar(@row) >=2) + { + open(PEP,">$ClusterID.pep"); + open(NUC,">$ClusterID.nuc"); + foreach $key (@row) + { + print PEP ">$key\n$pep{$key}\n"; + print NUC ">$key\n$nuc{$key}\n"; + } + close(PEP); + close(NUC); + system("$mafft --quiet $ClusterID.pep > $ClusterID.pal"); + #system("perl ./pal2nal.pl $ClusterID.pal $ClusterID.nuc -output fasta > $ClusterID.nal"); + $tmp=&pal2nal("$ClusterID.pal","$ClusterID.nuc","$ClusterID.nal"); + if ($tmp == 0) + { + system("rm -rf $ClusterID.*"); + next; + } + + @tmp=&DetectSNP(); + if (scalar(@tmp)>0) + { + print VA $ClusterID."\t".&VarAnalysis(\@tmp)."\n"; + print VAR join("",@tmp); + ### core orthologs + @row=split(/\t/,$clusters[$line]); + splice(@row,0,1); + if ((&CountGeneInCluster(join("\t",@row)) ==$spnum) and (&CountSpeicesInCluster(join("\t",@row)) == $spnum) ) + { + $count_tree++; + %tmpHash=(); + foreach (@row) + { + $tmpHash{$_}=""; + } + &RemoveHeadGap("$ClusterID.nal",\%tmpHash); + &ExtractSNP4tree(\%tmpHash,\%nt4tree); + } + } + system("rm -rf $ClusterID.*"); + } + } + close(VAR); + close(VA); + + open(R,">${outputDIR}3.CDS.variation.for.evolution.txt"); + foreach $key (keys %nt4tree) + { + $_=$key; + s/s//gi; + print R ">$species[$_]\n$nt4tree{$key}\n"; + } + close(R); + print $count_tree."\n\n"; + +### + system("rm All.nuc All.pep"); + }else + { + print "CDS variation is skipped.\n"; + } + + ############################################ + # section 4) CDS variation analysis + ############################################ + +if ($run_evolution) + { +#### Retrieve cluster + &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + chomp(@clusters); + + + +################## +## +## Distance based +## +################## + +### caculate the distance between each two strains by cluster + &ClusterProfil4Specie(\%tmpHash); # caculate Clusters profile for each specie + &DistanceMatrix($spnum,\%tmpHash); # caculate distance matrix acoording to Clusters profile + ###output distance + open(DIST,">${outputDIR}4.Species_Distance_Clusters_Based.txt"); + ###header + printf DIST "%5d", $spnum; + print DIST "\n"; + foreach $i (0..($spnum-1)) + { + $key="sp".$i."sp"; + printf DIST "%-10s",$key; + foreach $j (0..($spnum-1)) + { + printf DIST " %8f",$tmpHash{$i."-".$j}; + } + print DIST "\n"; + } + close(DIST); + + %tmpHash=(); + + ### based on pan genome (distance) + print "\nDraw pangenome based phylogenetic tree ...\n\n"; + + &PanBasedTree("${outputDIR}4.Species_Distance_Clusters_Based.txt","${outputDIR}4.PanBased"); + +################## +## +## SNP based +## +################## + %tmpHash=(); + if (!(-e "${outputDIR}3.CDS.variation.for.evolution.txt")) + { + print "Variation in core orthologs cluster is not found from ${outputDIR}3.CDS.variation.for.evolution.txt.\n"; + print "Maybe you have skipped CDS variation analysis.\n"; + }else + { + &ReadSequenceInToHash("${outputDIR}3.CDS.variation.for.evolution.txt",\%tmpHash); + open(R,">mlst.aln"); + for ($i=0;$i<@species;$i++) + { + print R ">sp${i}sp\n".$tmpHash{$species[$i]}."\n"; + } + close(R); + &fasta2phylip("mlst.aln","mlst.phylip"); + system("rm -rf mlst.aln"); + + print "\nDraw SNP based phylogenetic tree ...\n\n"; + &SNPBasedTree("mlst.phylip","${outputDIR}4.SNPBased"); + system("rm -rf mlst.phylip") + } + +######## +# replace speices name +######## + + opendir(DIR,"${outputDIR}"); + @tmp=readdir(DIR); + closedir(DIR); + @tmp=grep(/^4/,@tmp); + foreach $tmp (@tmp) + { + &ReplaceName(\@species,"${outputDIR}$tmp"); + } + }else + { + print "Evolution analysis is skipped.\n"; + } + + + ############################################ + # section 4) Function analysis + ############################################ + +if ($run_function) + { +#### Retrieve cluster + &RetrieveClusterFromFile("${outputDIR}1.Orthologs_Cluster.txt",\@clusters); + chomp(@clusters); + +#### prepare annotation file + &PrepareTable(\@species,$inputDIR,".function"); ###prepare location file + &ReadAnnotation(\@species,\%cog,\%description); + + +#### assign function + open(R,">${outputDIR}5.Orthologs_Cluster_Function.txt"); + print R "ClusterID\tConservation_Level\tCOG\tDescription\n"; + for ($i=0;$i<@clusters;$i++) + { + @row=split(/\t/,$clusters[$i]); + $ClusterID=$row[0]; + splice(@row,0,1); + print R $ClusterID."\t".&CountSpeicesInCluster(join("\t",@row))."\t".&getCOG(\@row,\%cog)."\t".&getDescription(\@row,\%description)."\n"; + } + close(R); + +#### COG distribution + +###Whole Clusters COG Distribution +&outputCOGStatistic("${outputDIR}5.Orthologs_Whole_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",$spnum,1)); + +###Core Clusters COG Distribution +&outputCOGStatistic("${outputDIR}5.Orthologs_Core_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",$spnum,$spnum)); + +###Dispensable Clusters COG Distribution +&outputCOGStatistic("${outputDIR}5.Orthologs_Dispensable_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",($spnum-1),2)); + +###strains specifc Clusters COG Distribution +&outputCOGStatistic("${outputDIR}5.Orthologs_specifc_Cluster_COG_Distribution.txt",&scanCOG("${outputDIR}5.Orthologs_Cluster_Function.txt",1,1)); + +system("rm -rf *.function"); + + }else + { + print "Function analysis is skipped.\n"; + } + +sub outputCOGStatistic() + { + (my $file,my $subcogcount)=@_; + my @cogcat=("J A K L B","D Y V T M N Z W U O","C G E F H I P Q","R S -"); + my @cogdesc=("INFORMATION STORAGE AND PROCESSING","CELLULAR PROCESSES AND SIGNALING","METABOLISM","POORLY CHARACTERIZED"); + my @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S -); + my @subcogdesc=("[J] Translation, ribosomal structure and biogenesis","[A] RNA processing and modification","[K] Transcription","[L] Replication, recombination and repair","[B] Chromatin structure and dynamics","[D] Cell cycle control, cell division, chromosome partitioning","[Y] Nuclear structure","[V] Defense mechanisms","[T] Signal transduction mechanisms","[M] Cell wall/membrane/envelope biogenesis","[N] Cell motility","[Z] Cytoskeleton","[W] Extracellular structures","[U] Intracellular trafficking, secretion, and vesicular transport","[O] Posttranslational modification, protein turnover, chaperones","[C] Energy production and conversion","[G] Carbohydrate transport and metabolism","[E] Amino acid transport and metabolism","[F] Nucleotide transport and metabolism","[H] Coenzyme transport and metabolism","[I] Lipid transport and metabolism","[P] Inorganic ion transport and metabolism","[Q] Secondary metabolites biosynthesis, transport and catabolism","[R] General function prediction only","[S] Function unknown","[-] Unclassified"); + my %subcogdesc; + my $key; + my @cog; + my $i; + my $cognum; + + for ($i=0;$i<@subcogcat;$i++) + { + $subcogdesc{$subcogcat[$i]}=$subcogdesc[$i]; + } + + open(R,">$file"); + for ($i=0;$i<@cogcat;$i++) + { + $cognum=0; + foreach $key (split(" ",$cogcat[$i])) + { + $cognum=$cognum+$$subcogcount{$key}; + } + print R $cogdesc[$i]." ( ".$cognum." )\n"; + foreach $key (split(" ",$cogcat[$i])) + { + printf R "%-6d %s\n",$$subcogcount{$key},$subcogdesc{$key}; + } + print R "\n"; + + } + close(R); + + } + +sub scanCOG() + { + (my $file,my $max_orth,my $min_orth)=@_; + my @row; + my @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S -); + my %subcogcount; + my $cog; + my $key; + + foreach $key (@subcogcat) + { + $subcogcount{$key}=0; + } + + @subcogcat=qw(J A K L B D Y V T M N Z W U O C G E F H I P Q R S); + + open(F,"$file"); + $_=<F>; + while (<F>) + { + @row=split(/\t/,$_); + if ($row[1]>=$min_orth and $row[1]<=$max_orth) + { + if ($row[2] eq "-") + { + $subcogcount{"-"}++; + }else + { + $_=uc($row[2]); + s/COG//gi; + $cog=$_; + foreach $key (@subcogcat) + { + if ($cog=~/$key/) + { + $subcogcount{$key}++; + } + } + } + } + } + close(F); + + return \%subcogcount; + } + + + + +sub getCOG() + { + (my $data,my $coghash)=@_; + my $cog=""; + my @cog; + my $key; + my %hash; + my @gene=split(/\t|\,/,join("\t",@$data)); + @gene=grep(/^S/,@gene); + + foreach $key (@gene) + { + if (($$coghash{$key} ne "-") and ($$coghash{$key} ne "")) + { + $cog=$cog.",".$$coghash{$key}; + } + } + @cog=split(/,/,$cog); + foreach $cog (@cog) + { + if ($cog ne "") + { + $hash{$cog}=1; + } + } + + $cog=join(",",(keys %hash)); + if ($cog eq "") + { + $cog="-"; + } + return $cog; + } + +sub getDescription() + { + (my $data,my $deschash)=@_; + my $desc=""; + my $key; + my @gene=split(/\t|\,/,join("\t",@$data)); + @gene=grep(/^S/,@gene); + + foreach $key (@gene) + { + if ( ($$deschash{$key} ne "") and ($$deschash{$key} ne "-") and ($$deschash{$key}!~/hypothetical/)) + { + $desc=$$deschash{$key}; + } + } + + if ($desc eq "") + { + $desc="hypothetical protein"; + } + + return $desc; + } + + + +sub ReadAnnotation() + { + (my $species,my $cog,my $description)=@_; + my $i; + my @row; + + for ($i=0;$i<@$species;$i++) + { + open(F,"$$species[$i].function"); + while (<F>) + { + chomp($_); + @row=split(/\t/,$_); + if (scalar(@row)>=2) + { + $$cog{$row[0]}=$row[1]; + }else + { + $$cog{$row[0]}="-"; + } + + if (scalar(@row)>=3) + { + $$description{$row[0]}=$row[2]; + }else + { + $$description{$row[0]}="hypothetical protein"; + } + } + close(F); + } + } + + +sub SNPBasedTree() + { + (my $infile,my $outfileprefix)=@_; + my $tmpin=$infile; + my $tmpout; + + +#### boootstrap +print "\n#### seqboot ...\n\n"; + open(R,">seqboot.cmd"); + #print R "$tmpin\n"; + print R "R\n"; + print R "$bootstrap\n"; + print R "Y\n"; + print R "1\n"; + close(R); + + system("cp $tmpin infile"); + system("$seqboot < seqboot.cmd"); + system("mv outfile 100dnaseq"); + system("rm -rf infile"); + system("rm seqboot.cmd"); # 100dnasseq + +#### dnaml +print "\n#### dnaml ...\n\n"; + open(R,">dnaml.cmd"); + #print R "100dnaseq\n"; + print R "T\n"; + print R "25\n"; + if ($bootstrap>1) + { + print R "M\n"; + print R "D\n"; + #print R "100\n"; + print R "$bootstrap\n"; + print R "1\n"; # Random number seed (must be odd)? + print R "5\n"; # Number of times to jumble? + } + print R "Y\n"; + close(R); + + system("cp 100dnaseq infile"); + system("$dnaml < dnaml.cmd"); + system("rm -rf outfile"); + system("rm -rf infile"); + system("mv outtree 100dnaseqtree"); # 100dnaseq, 100dnaseqtree + +#### consense +print "\n#### dnaml consense ...\n\n"; + + open(R,">consense.cmd"); + #print R "100dnaseqtree\n"; + print R "Y\n"; + close(R); + + system("cp 100dnaseqtree intree"); + system("$consense < consense.cmd"); + system("mv outfile ${outfileprefix}.ML.outfile"); + system("mv outtree ${outfileprefix}.ML.tree"); + system("rm -rf infile"); + system("rm -rf 100dnaseqtree"); # 100dnaseq + +#### dnadist +print "\n#### dnadist ...\n\n"; + open(R,">dnadist.cmd"); + #print R "100dnaseq\n"; + print R "T\n"; + print R "25\n"; + if ($bootstrap>1) + { + print R "M\n"; + print R "D\n"; + #print R "100\n"; + print R "$bootstrap\n"; + } + print R "Y\n"; + close(R); + + system("cp 100dnaseq infile"); + system("$dnadist < dnadist.cmd"); + system("rm -rf 100dnaseq"); + system("rm -rf infile"); + system("mv outfile 100dnadist"); # 100dnadist + +#### Neighbor-joining tree +print "\n#### Neighbor-joining ...\n\n"; + open(R,">NJ.cmd"); + if ($bootstrap>1) + { + #print R "100dnadist\n"; + print R "M\n"; + #print R "100\n"; + print R "$bootstrap\n"; + print R "1\n"; + } + print R "Y\n"; + close(R); + + system("cp 100dnadist infile"); + system("$neighbor < NJ.cmd"); + system("mv outtree 100dnadistNJtree"); + system("rm outfile"); + system("rm -rf infile"); + system("rm -rf NJ.cmd"); # 100dnadist,100dnadistNJtree + +#### NJ-consense +print "\n#### NJ-consense ...\n\n"; + open(R,">NJ-consense.cmd"); + #print R "100dnadistNJtree\n"; + print R "Y\n"; + close(R); + + system("cp 100dnadistNJtree intree"); + system("$consense < NJ-consense.cmd"); + system("mv outfile ${outfileprefix}.Neighbor-joining.outfile"); + system("mv outtree ${outfileprefix}.Neighbor-joining.tree"); + system("rm -rf NJ-consense.cmd"); + system("rm -rf intree"); + system("rm -rf 100dnadistNJtree"); + + +#### UPGMA tree +print "\n#### UPGMA ...\n\n"; + open(R,">UPGMA.cmd"); + #print R "100dnadist\n"; + print R "N\n"; + if ($bootstrap>1) + { + print R "M\n"; + #print R "100\n"; + print R "$bootstrap\n"; + print R "1\n"; + } + print R "Y\n"; + close(R); + + system("cp 100dnadist infile"); + system("$neighbor < UPGMA.cmd"); + system("mv outtree 100dnadistUPGMAtree"); + system("rm -rf outfile"); + system("rm -rf infile"); + system("rm -rf UPGMA.cmd"); + +#### UPGMA-consense +print "\n#### UPGMA-consense ...\n\n"; + open(R,">UPGMA-consense.cmd"); + #print R "100dnadistUPGMAtree\n"; + print R "Y\n"; + close(R); + + system("cp 100dnadistUPGMAtree intree"); + system("$consense < UPGMA-consense.cmd"); + system("mv outfile ${outfileprefix}.UPGMA.outfile"); + system("mv outtree ${outfileprefix}.UPGMA.tree"); + system("rm -rf UPGMA-consense.cmd"); + system("rm -rf 100dnadistUPGMAtree"); + system("rm -rf intree"); + +###CLEAN TMP FILE + + system("rm -rf *.cmd"); + system("rm -rf 100dnadist"); + } + +sub PanBasedTree() + { + (my $infile,my $outfileprefix)=@_; + my $tmpin; + my $tmpout; + + $tmpin=$infile; +#### Neighbor-joining tree + + open(R,">NJ.cmd"); + #print R "$tmpin\n"; + print R "Y\n"; + close(R); + + system("cp $tmpin infile"); + system("$neighbor < NJ.cmd"); + system("mv outfile ${outfileprefix}.Neighbor-joining.outfile"); + system("mv outtree ${outfileprefix}.Neighbor-joining.tree"); + system("rm -rf NJ.cmd"); + system("rm -rf infile"); + +#### UPGMA tree + + open(R,">UPGMA.cmd"); + #print R "$tmpin\n"; + print R "N\n"; + print R "Y\n"; + close(R); + + system("cp $tmpin infile"); + system("$neighbor < UPGMA.cmd"); + system("mv outfile ${outfileprefix}.UPGMA.outfile"); + system("mv outtree ${outfileprefix}.UPGMA.tree"); + system("rm -rf UPGMA.cmd"); + system("rm -rf infile"); + + +###CLEAN TMP FILE + system("rm -rf *.cmd"); + } + +sub DistanceMatrix() + { + (my $spnum,my $hash)=@_; + my $i; + my $j; + my $k; + my $dist; + my $ref; + my $query; + foreach $i (0..($spnum-1)) + { + foreach $j ($i..($spnum-1)) + { + $ref=$$hash{$i}; + $query=$$hash{$j}; + $dist=0; + for ($k=0;$k<length($ref);$k++) + { + if (substr($ref,$k,1) ne substr($query,$k,1)) + { + $dist++; + } + } + $$hash{$i."-".$j}=$dist; + $$hash{$j."-".$i}=$dist; + } + } + } + + +sub ClusterProfil4Specie + { + (my $hash)=@_; + my @row; + my $i; + + foreach (0..($spnum-1)) #initialization Hash + { + $$hash{$_}=""; + } + + foreach (@clusters) + { + @row=split(/\t/,$_); + splice(@row,0,1); + if (&CountSpeicesInCluster(join("\t",@row))>1) + { + for ($i=0;$i<@row;$i++) + { + if ($row[$i] eq "-") + { + $$hash{$i}=$$hash{$i}."0"; + }else + { + $$hash{$i}=$$hash{$i}."1"; + } + } + } + } + } + +# &ExtractSNP4tree(\%tmpHash,\%nt4tree); + +sub ExtractSNP4tree() + { + (my $hash,my $nt4treeRef)=@_; + my $key; + my @row; + my $i; + my $len; + my @tribases; + foreach $key (keys %$hash) + { + $$hash{substr($key,0,index($key,"G"))}=$$hash{$key}; + delete($$hash{$key}); + } + + for ($i=0;$i<$spnum;$i++) + { + $nt4tree{"S".$i}=$nt4tree{"S".$i}.$$hash{"S".$i}; + } + } + + +=pod +sub ExtractSNP4tree() + { + (my $hash,my $nt4treeRef)=@_; + my $key; + my @row; + my $i; + my $len; + my @tribases; + foreach $key (keys %$hash) + { + $$hash{substr($key,0,index($key,"G"))}=$$hash{$key}; + delete($$hash{$key}); + } + @_=(keys %$hash); + $len=length($_[0]); + for ($j=0;3*$j<$len;$j++) + { +##### scanning each codon + for ($i=0;$i<$spnum;$i++) + { + $tribases[$i]=substr($$hash{"S".$i},3*$j,3); + } +##### checking each codon + if (&IsTheSame(@tribases) ==0) + { + for ($i=0;$i<@tribases;$i++) + { + $nt4tree{"S".$i}=$nt4tree{"S".$i}.$tribases[$i]; + } + } + } + } +=cut + + +sub pal2nal() + { + (my $pal,my $nuc, my $nal)=@_; + my %aaAln=(); + my %ffn=(); + my %ntAln=(); + my %nt; + my $dna; + my $nt; + my $key; + my $flag=1; + my $i=0; + my $j; + +### read protein aligment result + &ReadAlignmentToHash("$pal",\%aaAln); +### read nt sequences + &ReadSequenceInToHash("$nuc",\%ffn); + foreach $key (keys %ffn) + { + $dna=$ffn{$key}; + #if (int(length($nt{$key})/3)*3 ne length($nt{$key})) + if (int(length($dna)/3)*3 ne length($dna)) + { + $flag=0; + print "The length of nucleotide sequence is not 3 integer times.\n"; + last; + }else + { + for ($i=0;$i<(length($dna)/3);$i++) + { + $nt{$key."|".$i}=substr($dna,$i*3,3); + } + } + } + + if ($flag==0) + { + return 0; + }else + { + foreach $key (keys %aaAln) ### replace aa with corresponding nt + { + $nt=""; + $i=0; + for ($j=0;$j<length($aaAln{$key});$j++) + { + if (substr($aaAln{$key},$j,1) eq "-") + { + $nt=$nt."---"; + }else + { + $nt=$nt.$nt{$key."|".$i}; + $i++; + } + } + $ntAln{$key}=$nt; + } + + ### output + open(R,">$nal"); + foreach (keys %ntAln) + { + print R ">$_\n".$ntAln{$_}."\n"; + } + close(R); + + return 1; + } + } + + + + +sub DetectSNP() + { + my %faa; + my %ffn; + my @row; + my $count_gene; + my $count_sp; + my @genelist; + my $i; + my $j; + my $pepalnlen; + my @cdsvar=qw(); + my $cdi=0; + my @tribases; + my @bases; + my @aa; + + +### fetch gene list + open(F,"$ClusterID.pep"); + @genelist=<F>; + close(F); + @genelist=grep(/^>/,@genelist); + chomp(@genelist); + $_=join("\t",@genelist); + s/>//g; + @genelist=split(/\t/,$_); + +### count gene number and species number + @row=split(/\t/,$clusters[$ClusterID-1]); + splice(@row,0,1); + $count_sp=&CountSpeicesInCluster(join("\t",@row)); + $count_gene=&CountGeneInCluster(join("\t",@row)); + +### read alignment sequences + &ReadAlignmentToHash("$ClusterID.pal",\%faa); + &ReadAlignmentToHash("$ClusterID.nal",\%ffn); + +@_=(keys %faa); +$pepalnlen=length($faa{$_[0]}); +### scan SNP + for ($i=1;$i<=$pepalnlen;$i++) + { + @tmp=qw(); + @tribases=qw(); + for ($j=0;$j<@genelist;$j++) ### fetch triplet codon + { + $tribases[$j]=substr($ffn{$genelist[$j]},3*($i-1),3); + } + if (&IsTheSame(@tribases) ==0) ### if triplet codon is not consistent + { + @aa=qw(); + for ($j=0;$j<@genelist;$j++) + { + $aa[$j]=substr($faa{$genelist[$j]},($i-1),1); + } + if (&IsTheSame(@aa) ==0) ### aa is not consistent + { + if (join("",@aa) =~/-/) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".$i."\t".&CharType(\@aa)."\t-\t-\tInDel\n"; + }else + { + #### base 1 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1),1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.1)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n"; + } + #### base 2 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+1,1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.2)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n"; + } + #### base 3 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+2,1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.3)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tNonsynonymous mutation\n"; + } + } + }else + { + #### base 1 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1),1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.1)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n"; + } + #### base 2 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+1,1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.2)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n"; + } + #### base 3 + for ($j=0;$j<@genelist;$j++) + { + $bases[$j]=substr($ffn{$genelist[$j]},3*($i-1)+2,1); + } + if (&IsTheSame(@bases) ==0) + { + $cdsvar[$cdi++]=$ClusterID."\t".$count_sp."\t".$count_gene."\t".($i+0.3)."\t".&CharType(\@aa)."\t".&CharType(\@bases)."\t".join("",@bases)."\tSynonymous mutation\n"; + } + } + } + } + return @cdsvar; + } + + +sub VarAnalysis() + { + (my $data)=@_; + my @data=@$data; + my $indel=0; + my $syn=0; + my $nonsyn=0; + my @tmp; + $indel=scalar(grep(/InDel$/,@data)); + $nonsyn=scalar(grep(/Nonsynonymous mutation$/,@data));; + $syn=scalar(grep(/Synonymous mutation$/,@data)); + return "$indel\t$nonsyn\t$syn"; + } + + + +sub CharType() + { + (my $str)=@_; + my %hash; + my @data=@$str; + foreach (@data) + { + $hash{$_}=1; + } + return join(",",(keys %hash)); + } + +sub IsTheSame() + { + (my @data)=@_; + my %hash; + foreach (@data) + { + $hash{$_}=1; + } + if (scalar(keys %hash) ==1) + { + return 1; + }else + { + return 0; + } + } + + + +sub FormatClusterOutPut() + { + (my $speices,my $file,my $cluster)=@_; + my @row; + my $gid=1; + my $key; + my %hash; + my $gene; + my @tmp; + my $i; + my $j; + open(R,">$file"); + print R "ClutserID\t".join("\t",@$speices)."\n"; + foreach $key (@$cluster) + { + @row=split(/\t/,$key); + for ($i=0;$i<@row;$i++) + { + if ($row[$i] ne "-") + { + @tmp=split(/,/,$row[$i]); + for ($j=0;$j<@tmp;$j++) + { + $_=$tmp[$j]; + s/^S[0-9]+G//; + $tmp[$j]=$_; + } + $row[$i]=join(",",@tmp); + } + } + print R $gid."\t".join("\t",@row)."\n"; + $gid++; + } + close(R); + } +sub RetrieveClusterFromFile() + { + (my $file,my $clusters)=@_; + my @content; + my @row; + my $spid; + my $line=0; + my $i=0; + my $j; + my @tmp; + open(F,$file) or die "Could open $file\n"; + @content=<F>; + close(F); + splice(@content,0,1); + chomp(@content); + foreach (@content) + { + @row=split(/\t/,$_); + $$clusters[$line]=$row[0]; + splice(@row,0,1); + for ($i=0;$i<@row;$i++) + { + if ($row[$i] ne "-") + { + @tmp=split(/,/,$row[$i]); + for ($j=0;$j<@tmp;$j++) + { + $tmp[$j]="S${i}G".$tmp[$j]; + } + $row[$i]=join(",",@tmp); + } + } + $$clusters[$line]=$$clusters[$line]."\t".join("\t",@row)."\n"; + $line++; + } + } + + + + +sub GeneDistribution() + { + (my $clusters,my $hash)=@_; + my @row; + my $spid; + my $orth; + my $key; + foreach (@$clusters) + { + @row=split(/\t/,$_); + splice(@row,0,1); + $orth=&CountSpeicesInCluster(join("\t",@row)); + @row=split(/\t|\,/,join("\t",@row)); + foreach $key (@row) + { + if ($key ne "-") + { + $spid=substr($key,1,(index($key,'G')-1)); ###extract strains id + if (exists($$hash{$orth."|".$spid})) + { + $$hash{$orth."|".$spid}++; + }else + { + $$hash{$orth."|".$spid}=1; + } + } + } + } + } + +sub CountSpeicesInCluster() + { + (my $str)=@_; + chomp($str); + my @list=split(/\t/,$str); + my $key; + my $count=0; + + foreach $key (@list) + { + if ($key ne "-") + { + $count++; + } + } + return $count; + } + +sub CountGeneInCluster() + { + (my $str)=@_; + chomp(); + my @list=split(/\t|\,/,$str); + my $key; + my $count=0; + foreach $key (@list) + { + if ($key ne "-") + { + $count++; + } + } + return $count; + } + + + + +sub GF() + { + &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file + system("cat ".join(".pep ",@species).".pep > All.pep"); + system("grep '>' All.pep > genelist"); + system("$formatdb -p T -i All.pep"); + system("$blastall -p blastp -i All.pep -d All.pep -M BLOSUM45 -m9 -e $evalue -o All.blastp -a $thread"); + system("perl ./Blast_Filter.pl All.blastp All.pep $coverage $identity $score | $mcl - --abc -I 2.0 -o All.cluster"); + &FormatCluster("All.cluster","genelist",$spnum,\@clusters); + #system("rm -rf *.pep* All.blastp All.cluster genelist"); + } + +sub MP() + { +# (my $species,my $inputDIR,my $thread,my $evalue,my $score,my $coverage,my $identity)=@_; + my $i; + my $j; + &PrepareFasta(\@species,$inputDIR,".pep"); ###prepare pep file + system("cat ".join(".pep ",@species).".pep > All.pep"); + system("grep '>' All.pep > genelist"); + system("rm -rf All.pep"); + for ($i=0;$i<$spnum;$i++) + { + for ($j=$i+1;$j<$spnum;$j++) + { + system("perl ./inparanoid.pl $blastall $thread $formatdb $score $global $local $species[$i].pep $species[$j].pep"); + } + } + system("perl ./multiparanoid.pl -species ".join(".pep+",@species).".pep -unique 1"); +###convert the MP result to table list based on gene + &MP_Result_to_Table("MP.Cluster","All.cluster"); + &FormatCluster("All.cluster","genelist",$spnum,\@clusters); + system("rm -rf sqltable.* *.pep* MP.Cluster genelist"); + } + +sub fasta2phylip() + { + (my $input,my $output)=@_; + use Bio::AlignIO; + my $inputfilename = "10.aln"; + my $in= Bio::AlignIO->new(-file => $input , + -format => 'fasta'); + my $out = Bio::AlignIO->new(-file => ">$output" , + -format => 'phylip'); + while ( my $aln = $in->next_aln() ) + { + $out->write_aln($aln); + } + } + + +sub RemoveHeadGap() + { + (my $nal,my $hash)=@_; + my %aln; + my $key; + my $gaplength=0; + my $len1; + my $len2; + &ReadSequenceInToHash("$nal",\%aln); + foreach $key (keys %aln) + { + $len1=length($aln{$key}); + $_=$aln{$key}; + s/^-+//; + $len2=length($_); + if (($len1-$len2)>$gaplength) + { + $gaplength=$len1-$len2; + } + } + foreach $key (keys %aln) + { + $$hash{$key}=$$hash{$key}.substr($aln{$key},$gaplength,(length($aln{$key})-$gaplength)); + } + } + +sub PrepareFasta() + { + (my $species,my $inputDIR,my $extention)=@_; + my $sp; + my $file; + my $i; + my %hash; + my $key; + for ($i=0;$i<@$species;$i++) + { + $file=$inputDIR.$$species[$i].$extention; + %hash=(); + &ReadSequenceInToHash($file,\%hash); + open(R,">$$species[$i]${extention}") or die "Could write into $file\n"; + foreach $key (keys %hash) + { + print R ">S${i}G$key\n"; + print R $hash{$key}."\n"; + } + close(R); + } + } + +sub PrepareTable() + { + (my $species,my $inputDIR,my $extention)=@_; + my @content; + my $i; + my @row; + my $file; + for ($i=0;$i<@$species;$i++) + { + $file=$inputDIR.$$species[$i].$extention; + open(F,$file) or die "Could open $file\n"; + @content=<F>; + close(F); + chomp(@content); + open(R,">$$species[$i]${extention}") or die "Could write into $file\n"; + foreach (@content) + { + @row=split(/\t/,$_); + $row[0]="S${i}G$row[0]"; + if ($extention eq ".location") + { + $row[0]=$row[0]."\t".$row[0]; + } + print R join("\t",@row)."\n"; + } + close(R); + } + } + +sub CheckExtraProgram + { + #(my $section, my $method, my $tmparray)=@_; + my @error; + my $ei=0; + +#####cluster gene + if (substr($section,0,1) eq "1") + { +###MP: blastall formatdb +###GF: blastall formatdb mcl + + if (!(-e $formatdb)) + { + $error[$ei++]="formatdb is not found at $formatdb\n"; + } + + if (!(-X $formatdb)) + { + $error[$ei++]="there is not premission to execute $formatdb\n"; + } + + if (!(-e $blastall)) + { + $error[$ei++]="blastall is not found at $blastall\n"; + } + + if (!(-X $blastall)) + { + $error[$ei++]="there is not premission to execute $blastall\n"; + } + + if ($method eq "GF") + { + if (!(-e $mcl)) + { + $error[$ei++]="mcl is not found at $mcl\n"; + } + if (!(-X $mcl)) + { + $error[$ei++]="there is not premission to execute $mcl\n"; + } + } + } + +#####CDS variation + if (substr($section,2,1) eq "1") + { + if (!(-e $mafft)) + { + $error[$ei++]="mafft is not found at $mafft\n"; + } + if (!(-X $mafft)) + { + $error[$ei++]="there is not premission to execute $mafft\n"; + } + } + +#####CDS variation + if (substr($section,3,1) eq "1") + { + if (!(-e $mafft)) + { + $error[$ei++]="mafft is not found at $mafft\n"; + } + if (!(-X $mafft)) + { + $error[$ei++]="there is not premission to execute $mafft\n"; + } + } +#####Evolution analysis + if (substr($section,3,1) eq "1") + { + if (-e $seqboot) + { + $error[$ei++]="there is not premission to execute $seqboot\n" if(!(-X $seqboot)); + }else + { + $error[$ei++]="seqboot is not found at $seqboot\n"; + } + if (-e $dnaml) + { + $error[$ei++]="there is not premission to execute $dnaml\n" if(!(-X $dnaml)); + }else + { + $error[$ei++]="dnaml is not found at $dnaml\n"; + } + if (-e $dnadist) + { + $error[$ei++]="there is not premission to execute $dnadist\n" if(!(-X $dnadist)); + }else + { + $error[$ei++]="dnadist is not found at $dnadist\n"; + } + if (-e $neighbor) + { + $error[$ei++]="there is not premission to execute $neighbor\n" if(!(-X $neighbor)); + }else + { + $error[$ei++]="neighbor is not found at $neighbor\n"; + } + if (-e $consense) + { + $error[$ei++]="there is not premission to execute $consense\n" if(!(-X $consense)); + }else + { + $error[$ei++]="consense is not found at $consense\n"; + } + if (-e $dnapars) + { + $error[$ei++]="there is not premission to execute $dnapars\n" if(!(-X $dnapars)); + }else + { + $error[$ei++]="dnapars is not found at $dnapars\n"; + } + } + #@$tmparray=(@$tmparray,@error); + @tmp=(@tmp,@error); + } + + +sub CheckInputFile() + { + (my $species,my $inputDIR,my $section,my $method,my $tmparray)=@_; +####cluster + if (substr($section,0,1) eq "1") + { + if ($method eq "MM") + { + @$tmparray=(@$tmparray,&chk2SEQ($species,$inputDIR)); ### check pep and nuc + @$tmparray=(@$tmparray,&chktab($species,$inputDIR,".location"));### chk pep nuc location + }else + { + @$tmparray=(@$tmparray,&chk1SEQ($species,$inputDIR)); + } + } +###CDS variation + if (substr($section,2,1) eq "1") + { + @$tmparray=(@$tmparray,&chk2SEQ($species,$inputDIR)); + } +###function analysis + if (substr($section,4,1) eq "1") + { + @$tmparray=(@$tmparray,&chktab($species,$inputDIR,".function")); + } + } + + +sub chk1SEQ() + { + (my $species,my $inputDIR)=@_; + my @error; + my $ei=0; + my $sp; + my $pepfile; + my %pep; + foreach $sp (@$species) + { + %pep=(); + $pepfile=$inputDIR.$sp.".pep"; + &ReadSequenceInToHash($pepfile,\%pep); + if (scalar(keys %pep)<2) + { + $error[$ei++]="format error in $pepfile\n"; + } + } + return @error; + } + +sub chk2SEQ() + { + (my $species,my $inputDIR)=@_; + my $sp; + my %pep; + my %nuc; + my $pepfile; + my $nucfile; + my $key; + my @error; + my $ei=0; + foreach $sp (@$species) + { + $pepfile=$inputDIR.$sp.".pep"; + $nucfile=$inputDIR.$sp.".nuc"; + %pep=(); + %nuc=(); + &ReadSequenceInToHash("$pepfile",\%pep); + &ReadSequenceInToHash("$nucfile",\%nuc); + if (scalar(keys %pep) ne scalar(keys %nuc)) + { + $error[$ei++]="Sequences number is not consistent in the following two file:\n\t$pepfile\n\t$nucfile\n"; + }else + { + foreach $key (keys %pep) + { + if (exists($nuc{$key})) + { + if (length($nuc{$key}) ne ((length($pep{$key})+1)*3)) + { + $error[$ei++]="the length of $key in $nucfile is not consistent with its corresponding protein length\n"; + } + }else + { + $error[$ei++]="$key lost in $nucfile\n"; + } + } + + foreach $key (keys %nuc) + { + if (!exists($pep{$key})) + { + $error[$ei++]="1048 $key lost in $pepfile\n"; + } + } + } + } + return @error; + } + + +sub chktab() + { + (my $species,my $inputDIR,my $extention)=@_; + my %pep; + my @row; + my $key; + my %tab; + my @error; + my $ei=0; + my $sp; + my $tabfile; + my $pepfile; + foreach $sp (@$species) + { + %tab=(); + %pep=(); + $tabfile=$inputDIR.$sp.$extention; + open(F,"$tabfile"); + while (<F>) + { + chomp(); + @row=split(/\t/,$_); + if (scalar(@row)<3) + { + $error[$ei++]="format error in $tabfile\n"; + }else + { + $tab{$row[0]}=$row[1]; + } + } + close(F); + $pepfile=$inputDIR.$sp.".pep"; + &ReadSequenceInToHash($pepfile,\%pep); + foreach $key (keys %pep) + { + if (!exists($tab{$key})) + { + $error[$ei++]="sequence $key lost infomation in $tabfile\n"; + } + } + } + return @error; + } + + + +sub ReadSequenceInToHash() + { + use Bio::SeqIO; + (my $file,my $hash)=@_; + my $seq; + my $in=Bio::SeqIO->new(-file=>"$file",-format=>"fasta"); + while ($seq=$in->next_seq()) + { + #$$hash{$id."|".$seq->id}=$seq->seq(); + $$hash{$seq->id}=$seq->seq(); + } + } + +sub ReadAlignmentToHash() + { + (my $file,my $hash)=@_; + my $name=""; + my $seq=""; + my @content; + my $line; + open(F,"$file"); + @content=<F>; + close(F); + chomp(@content); + for ($line=0;$line<@content;$line++) + { + if ($content[$line]=~/^>/) + { + if ($line>0) + { + $$hash{$name}=$seq; + $name=""; + } + + $_=$content[$line]; + s/^>//; + $name=$_; + $seq=""; + }else + { + if ($name ne "") + { + $seq=$seq.$content[$line]; + } + } + } + $$hash{$name}=$seq; + } + + + +sub Combination() + { + (my $m,my $n,my $comRef)=@_; + my $str=""; + my %hash; + my $fpos; + my $num0; + my $rest; + my $tmp; + my $i; + my $j; + my $key; + #my $m=scalar(@$array); + my @combination; + + for ($i=1;$i<=$n;$i++) + { + $str="1".$str; + } + + for ($i=1;$i<=($m-$n);$i++) + { + $str=$str."0"; + } + + $hash{$str}=1; + while ($str=~/10/) + { + $fpos=index($str,"10"); + $_=$str; + s/10/01/; + $str=$_; + $tmp=substr($str,0,$fpos); + $_=$tmp; + s/0//g; + $rest=$_; + $num0=$fpos-length($_); + for ($i=1;$i<=$num0;$i++) + { + $rest="$rest"."0"; + } + $str="$rest".substr($str,$fpos,$m-$fpos); + $hash{$str}=1; + } + $j=0; + foreach $key (keys %hash) + { + $combination[$j]=""; + for ($i=0;$i<$m;$i++) + { + if (substr($key,$i,1) eq "1") + { + if ($combination[$j] ne "") + { + #$combination[$j]=$combination[$j]."\t".$$array[$i]; + $combination[$j]=$combination[$j]."\t".$i; + }else + { + #$combination[$j]=$$array[$i]; ### For return species ID + $combination[$j]=$i; + } + } + } + $j++; + } + @$comRef=@combination; ### update the data through the physic address + } + +sub ChkCombinationValue() +{ + (my $m,my $n)=@_; + my %hash; + my %vhash; + my $value=0; + my $key; + my @row; + my @sdA; + my @sdB; + + ### initialization + $hash{$m."-".$n}=1; + + ### split combination + while (scalar(keys %hash)>0 and $value<=$sampleSize) + { + foreach $key (keys %hash) + { + if ($value > $sampleSize) ### threshold + { + last; + } + if (!exists($hash{$key})) + { + next; + } + @row=split(/-/,$key); + #print $row[0]."|".$row[1]."\n"; + if ($row[0] eq $row[1]) + { + $value=$value+$hash{$key}; + }else + { + ##split + $sdA[0]=$row[0]-1; + $sdA[1]=$row[1]; + $sdB[0]=$row[0]-1; + $sdB[1]=$row[1]-1; + ##storing A + if (($sdA[0] eq $sdA[1]) or $sdA[1] ==0) + { + $value=$value+$hash{$key}; + }else + { + if (exists($hash{$sdA[0]."-".$sdA[1]})) + { + $hash{$sdA[0]."-".$sdA[1]}=$hash{$sdA[0]."-".$sdA[1]}+$hash{$key}; + }else + { + $hash{$sdA[0]."-".$sdA[1]}=$hash{$key}; + } + } + + ##storing B + if (($sdB[0] eq $sdB[1]) or $sdB[1]==0) + { + $value=$value+$hash{$key}; + }else + { + if (exists($hash{$sdB[0]."-".$sdB[1]})) + { + $hash{$sdB[0]."-".$sdB[1]}=$hash{$sdB[0]."-".$sdB[1]}+$hash{$key}; + }else + { + $hash{$sdB[0]."-".$sdB[1]}=$hash{$key}; + } + } + } + #delete original combination + delete($hash{$key}); + } + } + + if ($value>$sampleSize) + { + return 0; + }else + { + return $value; + } +} + + +sub SampleCombination() +{ + (my $m,my $n,my $comRef)=@_; + my %hash; + my $sampleTimes=0; + my @randNum; + my @sortID; + my $i; + my $j; + my $tmp; + while ( scalar(keys %hash)<$sampleSize and $sampleTimes<($sampleSize*2)) + { + for ($i=0;$i<$m;$i++) # generate random data + { + $randNum[$i]=int(100000 * rand(100)); + $sortID[$i]=$i; + } + + for ($i=0;$i<$m;$i++) # sorting random data + { + for ($j=0;$j<$m;$j++) + { + if ($randNum[$sortID[$i]]<$randNum[$sortID[$j]]) + { + $tmp=$sortID[$i]; + $sortID[$i]=$sortID[$j]; + $sortID[$j]=$tmp; + } + } + } + + #storing data + $tmp=join("\t",sort {$a<=>$b} (splice(@sortID,0,$n))); + $hash{$tmp}=1; + $sampleTimes++; + } + @$comRef=keys %hash; +} + + +sub PanGenomeNumber() + { + (my $spID)=@_; + my $pan=0; + my $core=0; + my $count; #### counter; + my @row; + + foreach (@clusters) + { + $count=0; + @row=split(/\t/,$_); + + foreach (@$spID) + { + $count=$count+$row[$_]; + } + + if ($count>0) + { + $pan++; + if ($count == scalar(@$spID)) + { + $core++; + } + } + } + return $pan."\t".$core; + } + +sub fit_model_A() + { +### model y = A * x**B + C + (my $xdata,my $ydata)=@_; + my $i; + my $b; + my $max_B=0; + my $max_R=0; + my $max_A=0; + my $max_A_interval; + my $max_C=0; + my $max_C_interval; + my $R=1e-100; + my $start; + my $end; + my $step; + my @xValues; + my @yValues; + + $start=1; + $step=0.001; + $b=$start; + $max_R=0; + $R=1e-100; + + use Statistics::LineFit; + use Statistics::Distributions; + + while ($max_R<=$R) + { + if (($b < 0.02) and ($b >-0.02)) + { + $b=-0.02; + } + + for ($i=0;$i<@$xdata;$i++) + { + $xValues[$i]=$$xdata[$i]**$b; + } + @yValues=@$ydata; + my $lineFit = Statistics::LineFit->new(); + $lineFit->setData (\@xValues, \@yValues) or die "Invalid data"; + (my $intercept, my $slope) = $lineFit->coefficients(); + my $rSquared = $lineFit->rSquared(); + my $meanSquaredError = $lineFit->meanSqError(); + my $durbinWatson = $lineFit->durbinWatson(); + my $sigma = $lineFit->sigma(); + (my $tStatIntercept, my $tStatSlope) = $lineFit->tStatistics(); + (my $varianceIntercept,my $varianceSlope) = $lineFit->varianceOfEstimates(); + + $max_R=$R; + $R=$rSquared; + if ($max_R<=$R) + { + $max_R=$R; + ($max_C,$max_A)=$lineFit->coefficients(); + $max_A_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceSlope); + $max_C_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceIntercept); + } + $b=$b-$step; + } + $max_B=$b; + return ($max_R,$max_A,$max_A_interval,$max_B,$max_C,$max_C_interval); + + } + +sub fit_model_B() + { +### model y = A * exp(x*B) + C + (my $xdata,my $ydata)=@_; + my $i; + my $b; + my $max_B=0; + my $max_R=0; + my $max_A=0; + my $max_A_interval; + my $max_C=0; + my $max_C_interval; + my $R=1e-100; + my $start; + my $end; + my $step; + my @xValues; + my @yValues; + + $start=0; + $step=0.001; + $b=$start; + $max_R=0; + $R=1e-100; + + use Statistics::LineFit; + use Statistics::Distributions; + + while ($max_R<=$R) + { + if (($b < 0.02) and ($b >-0.02)) + { + $b=-0.02; + } + + for ($i=0;$i<@$xdata;$i++) + { + $xValues[$i]=exp($$xdata[$i]*$b); + } + @yValues=@$ydata; + my $lineFit = Statistics::LineFit->new(); + $lineFit->setData (\@xValues, \@yValues) or die "Invalid data"; + (my $intercept, my $slope) = $lineFit->coefficients(); + my $rSquared = $lineFit->rSquared(); + my $meanSquaredError = $lineFit->meanSqError(); + my $durbinWatson = $lineFit->durbinWatson(); + my $sigma = $lineFit->sigma(); + (my $tStatIntercept, my $tStatSlope) = $lineFit->tStatistics(); + (my $varianceIntercept,my $varianceSlope) = $lineFit->varianceOfEstimates(); + + $max_R=$R; + $R=$rSquared; + if ($max_R<=$R) + { + $max_R=$R; + ($max_C,$max_A)=$lineFit->coefficients(); + $max_A_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceSlope); + $max_C_interval=Statistics::Distributions::tdistr (($spnum-2),.025)*sqrt($varianceIntercept); + } + $b=$b-$step; + } + $max_B=$b; + return ($max_R,$max_A,$max_A_interval,$max_B,$max_C,$max_C_interval); + } + + +sub ReadData2Array() + { + (my $file, my $array1,my $col1,my $array2,my $col2)=@_; + my $i=0; + open(F,$file); + $_=<F>; + while (<F>) + { + chomp(); + @_=split(/\t/,$_); + $$array1[$i]=$_[$col1]; + $$array2[$i]=$_[$col2]; + $i++; + } + close(F); + } + +sub SumData() + { + (my $xdata,my $ydata,my $SumMethod)=@_; + my %hash; + my $i; + my $key; + my $max=0; + for ($i=0;$i<@$xdata;$i++) + { + if (exists($hash{$$xdata[$i]})) + { + $hash{$$xdata[$i]}=$hash{$$xdata[$i]}." ".$$ydata[$i]; + }else + { + $hash{$$xdata[$i]}=$$ydata[$i]; + if ($$xdata[$i]>$max) + { + $max=$$xdata[$i]; + } + } + } + @$xdata=qw(); + @$ydata=qw(); + $i=0; + foreach $i (1..$max) + { + $$xdata[$i-1]=$i; + if ($SumMethod eq "median") + { + $$ydata[$i-1]=&median($hash{$i}); + }else + { + $$ydata[$i-1]=&mean($hash{$i}); + } + } + #print join(",",@$xdata)."\n"; + #print join(",",@$ydata)."\n"; + } + +sub median() + { + (my $data)=@_; + my @data=split(/ /,$data); + my $arraylen=scalar(@data); + @data=sort{$a<=>$b} @data; + if (int($arraylen/2)*2 == $arraylen) + { + return ($data[$arraylen/2]+$data[$arraylen/2-1])/2; + }else + { + return $data[int($arraylen/2)]; + } + } + +sub mean() + { + (my $data)=@_; + my @data=split(/ /,$data); + my $sum=0; + foreach (@data) + { + $sum=$sum+$_; + } + return int(($sum/scalar(@data))*1000)/1000; + } + +sub ReplaceName() + { + (my $sp,my $file)=@_; + my @content; + my $line; + my $i; + my $target; + open(F,$file); + @content=<F>; + close(F); + for ($line=0;$line<@content;$line++) + { + for ($i=0;$i<@$sp;$i++) + { + $_=$content[$line]; + $target="sp".$i."sp"; + s/$target/$$sp[$i]/; + $content[$line]=$_; + } + } + open(R,">$file"); + print R @content; + close(R); + } + +sub MP_Result_to_Table() + { + (my $MPresult, my $outputfile)=@_; + my %hash; + my $maxid=0; + my $i; + my @row; + + open(F,"$MPresult"); + $_=<F>; + while (<F>) + { + @row=split(/\t/,$_); + if (exists($hash{$row[0]})) + { + $hash{$row[0]}=$hash{$row[0]}."\t".$row[2]; + }else + { + $hash{$row[0]}=$row[2]; + if ($row[0]>$maxid) + { + $maxid=$row[0]; + } + } + } + close(F); + + open(R,">$outputfile"); + foreach $i (1..$maxid) + { + print R $hash{$i}."\n"; + } + close(R); + } + + + +sub FormatCluster() + { + (my $infile,my $genelist,my $spnum,my $cluster)=@_; + my %hash; + my %gene; + my $key; + my @row; + my $sp; + my $line; + my $i=0; + my $j=0; + my @content; + +### record gene in clusters + open(F,"$infile"); + @content=<F>; + close(F); + chomp(@content); + for ($line=0;$line<@content;$line++) + { + @row=split(/\t/,$content[$line]); + foreach $key (@row) + { + $gene{$key}=1; + } + } +###retrieves gene which is not in clutsers + + open(F,"$genelist"); + while ($key=<F>) + { + if ($key=~/^>/) + { + chomp($key); + $_=$key; + s/^>//; + $key=$_; + if (!exists($gene{$key})) + { + $content[$line]=$key; + $line++; + } + } + } + close(F); + +#### initialization @cluster + @$cluster=qw(); + $j=0; + + foreach $line (@content) + { + if ($line ne "") + { + %hash=(); + @row=split(/\t/,$line); + foreach $key (@row) + { + $sp=substr($key,0,index($key,"G")); + $gene{$key}=1; + if (exists($hash{$sp})) + { + $hash{$sp}=$hash{$sp}.",".$key; + }else + { + $hash{$sp}=$key; + } + } + + $i=0; + @row=qw(); + + foreach $i (0..($spnum-1)) + { + if (exists($hash{"S$i"})) + { + $row[$i]=$hash{"S$i"}; + }else + { + $row[$i]="-"; + } + } + $$cluster[$j++]=join("\t",@row); + } + } + } + +