Mercurial > repos > dcouvin > getsequenceinfo
view getSequenceInfo.pl @ 0:19ae17458c14 draft default tip
Uploaded
author | dcouvin |
---|---|
date | Wed, 15 Sep 2021 21:35:09 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl ################################################################################ ## "Copyright 2019 Vincent Moco and David Couvin" ## licence GPL-3.0-or-later ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation, either version 3 of the License, or ## (at your option) any later version. ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <https://www.gnu.org/licenses/>. ################################################################################ use strict; use warnings; my $version = "1.0.1"; # version #my $number = 50; # Date and time of the current day (Beginning) #my ($start_year,$start_month,$start_day, $start_hour,$start_min,$start_sec) = Today_and_Now(); my $start = time(); print "##################################################################\n"; print "## ---> Welcome to $0 (version $version)!\n"; #print "## Start Date (yyyy-mm-dd, hh:min:sec): $start_year-$start_month-$start_day, $start_hour:$start_min:$start_sec\n"; print "##################################################################\n\n"; #BioPerl, Date::Calc, File::Log, have been removed from the @modules my @modules = qw( Archive::Tar Bio::SeqIO Bio::Species File::Copy File::Path Net::FTP IO::Uncompress::Gunzip LWP::Simple POSIX ); foreach my $module (@modules) { if (isModuleInstalled($module)) { print "$module is.................installed!\n"; } else { print "$module is not installed. Please install it and try again.\n"; print "You can reinstall the $0 as shown on the README page or use the following command to install the module:\n"; print "cpan -i -f $module\n"; #system("cpan -i -f $module"); exit 1; } } print "\n"; use Archive::Tar; use Bio::SeqIO; use Bio::Species; #use Date::Calc qw(:all); use File::Copy qw(cp move); use File::Path qw(rmtree); use Net::FTP; use IO::Uncompress::Gunzip qw(gunzip $GunzipError); use LWP::Simple qw(get); use POSIX qw(floor); #use File::Log; #################################################################################################### ## A Perl script allowing to get sequence information from GenBank, RefSeq or ENA repositories. ## #################################################################################################### ### main program ### parameters my $directory = "genbank"; my $kingdom = ""; # kingdom of organism my $releaseDate = "0000-00-00"; # sequences are downloaded from this release date my $components; # components of the assembly my $species = ""; # species name my $getSummaries; # indicates if a new assembly summary is required my $assemblyLevel = "Complete Genome,Chromosome,Scaffold,Contig"; # assembly level of the genome my $quantity; # limit number of assemblies to download my $sequenceID; my $ftpServor = "ftp.ncbi.nlm.nih.gov"; my $enaFtpServor = "ftp.sra.ebi.ac.uk"; my $fldSep = "/"; # folder seperation change by OS my @availableKingdoms = ( "archaea", "bacteria", "fungi", "invertebrate", "plant", "protozoa", "vertebrate_mammalian", "vertebrate_other", "viral" ); # list of available kingdoms my $actualOS = "Unix"; # OS of the computer my $mainFolder; # folder in which the assemblies results are stored my $assemblyTaxid = ""; # taxid for assembly my $sraID; # SRA sequence ID my $assemblyPrjID; # assembly or prj ID my $log; # log my $path = ""; if (@ARGV<1) { help_user_simple($0); exit 0; } if ($ARGV[0] eq "-help" || $ARGV[0] eq "-h") { help_user_advance($0); exit 0; } if ($ARGV[0] eq "-version" || $ARGV[0] eq "-v") { program_version($0); exit 0; } ##requirements for (my $i = 0; $i <= $#ARGV; $i++) { if ($ARGV[$i]=~/-kingdom/i or $ARGV[$i]=~/-k/i) { $kingdom = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-path/i or $ARGV[$i]=~/-pathSummary/i) { $path = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-directory/i or $ARGV[$i]=~/-dir/i) { $directory = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-date/i) { $releaseDate = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-getSummaries/i or $ARGV[$i]=~/-get/i) { $getSummaries = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-species/i or $ARGV[$i]=~/-s/i) { $species = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-level/i or $ARGV[$i]=~/-le/i) { $assemblyLevel = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-components/i or $ARGV[$i]=~/-c/i) { $components = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-quantity/i or $ARGV[$i]=~/-q/i or $ARGV[$i]=~/-number/i or $ARGV[$i]=~/-n/i) { $quantity = int($ARGV[$i+1]); } elsif ($ARGV[$i]=~/-ena/i) { $sequenceID = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) { $mainFolder = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-taxid/i) { $assemblyTaxid = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-fastq/i) { $sraID = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-assembly_or_project/i) { $assemblyPrjID = $ARGV[$i+1]; } elsif ($ARGV[$i]=~/-log/i) { $log = 1; # $log = File::Log->new({ # debug => 4, # logFileName => 'myLogFile.log', # logFileMode => '>', # dateTimeStamp => 1, # stderrRedirect => 1, # defaultFile => 0, # logFileDateTime => 1, # appName => 'getSequenceInfo', # PIDstamp => 0, # storeExpText => 1, # msgprepend => '', # say => 1, # }); } } #define folder separator and OS if ($^O =~ /MSWin32/) { $fldSep = "\\"; $actualOS = "MSWin32"; } #LOG file if ($log) { open (LOG, "log.txt") or die " error open log.txt $!:"; } print "Working ...\n"; if ($kingdom eq "viruses") { $kingdom = "viral"; } if (grep(/^$kingdom$/i, @availableKingdoms)) { my @patternParametersList; my @levelList = split /,/, $assemblyLevel; if ($species ne "") { my @speciesList = split(/,/, $species); foreach my $actualSpecies (@speciesList) { get_assembly_summary_species( $quantity, $releaseDate, $directory,$kingdom, $actualSpecies, \@levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log, $getSummaries, $components, $ftpServor); } } elsif ($assemblyTaxid ne "") { my @taxidList = split(/,/, $assemblyTaxid); foreach my $actualID (@taxidList) { get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species, \@levelList, $fldSep, $actualOS, $mainFolder, $actualID, $log, $getSummaries, $components, $ftpServor); } } else { get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species, \@levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log, $getSummaries, $components, $ftpServor); } } if ($sequenceID) { my @sequenceIDList = split /,/, $sequenceID; foreach my $enaID (@sequenceIDList) { sequence_ena($enaID, $log); } } if ($sraID) { my @sraIDList = split /,/, $sraID; foreach my $sra (@sraIDList) { download_ena_fastq($enaFtpServor, $sra, $log); } } if ($assemblyPrjID) { download_assembly_or_project($assemblyPrjID, $ftpServor, $fldSep, $directory, $log); } #my ($end_year,$end_month,$end_day, $end_hour,$end_min,$end_sec) = Today_and_Now(); #my ($D_y,$D_m,$D_d, $Dh,$Dm,$Ds) = # Delta_YMDHMS($start_year,$start_month,$start_day, $start_hour, $start_min, $start_sec, # $end_year, $end_month, $end_day, $end_hour,$end_min,$end_sec); #print "End Date: $end_year-$end_month-$end_day, $end_hour:$end_min:$end_sec\n"; print "Thank you for using $0!\n"; #print "Execution time: $D_y years, $D_m months, $D_d days, $Dh:$Dm:$Ds (hours:minutes:seconds)\n"; my $end = time(); my $total = $end - $start; my $min = $total / 60; my $hrs = $min / 60; print "Total time: $total seconds OR $min minutes OR $hrs hours ! \n"; ### subroutine # display global help document sub help_user_simple { my $programme = shift @_; print STDERR "Usage : perl $programme [options] \n"; print "Type perl $programme -version or perl $programme -v to get the current version\n"; print "Type perl $programme -help or perl $programme -h to get full help\n"; } #------------------------------------------------------------------------------ # display full help document sub help_user_advance { print <<HEREDOC; Name: $0 Synopsis: A Perl script allowing to get sequence information from GenBank RefSeq or ENA repositories. Usage: perl $0 [options] examples: perl $0 -k bacteria -s "Helicobacter pylori" -l "Complete Genome" -date 2019-06-01 perl $0 -k viruses -n 5 -date 2019-06-01 perl $0 -k "bacteria" -taxid 9,24 -n 10 -c plasmid -dir genbank -o Results perl $0 -ena BN000065 perl $0 -fastq ERR818002 perl $0 -fastq ERR818002,ERR818004 Kingdoms: archaea bacteria fungi invertebrate plant protozoa vertebrate_mammalian vertebrate_other viral Assembly levels: "Complete Genome" Chromosome Scaffold Contig General: -help or -h displays this help -version or -v displays the current version of the program Options ([XXX] represents the expected value): -directory or -dir [XXX] allows to indicate the NCBI's nucleotide sequences repository (default: $directory) -get or -getSummaries [XXX] allows to obtain a new assembly summary file in function of given kingdoms (bacteria,fungi,protozoa...) -k or -kingdom [XXX] allows to indicate kingdom of the organism (see the examples above) -s or -species [XXX] allows to indicate the species (must be combined with -k option) -taxid [XXX] allows to indicate a specific taxid (must be combined with -k option) -assembly_or_project [XXX] allows to indicate a specific assembly accession or bioproject (must be combined with -k option) -date [XXX] indicates the release date (with format yyyy-mm-dd) from which sequence information are available -l or -level [XXX] allows to select a specific assembly level (e.g. "Complete Genome") -o or -output [XXX] allows users to name the output result folder -n or -number [XXX] allows to limit the total number of assemblies to be downloaded -c or -components [XXX] allows to select specific components of the assembly (e.g. plasmid, chromosome, ...) -ena [XXX] allows to download report and fasta file given a ENA sequence ID -fastq [XXX] allows to download FASTQ sequences from ENA given a run accession (https://ena-docs.readthedocs.io/en/latest/faq/archive-generated-files.html) -log allows to create a log file HEREDOC } #------------------------------------------------------------------------------ # display program version sub program_version { my $programme = shift @_; print "\n $programme, version : $version\n"; print "\n A perl script to get sequences informations\n"; } #------------------------------------------------------------------------------ sub get_assembly_summary_species { my ($quantity, $releaseDate, $directory, $kingdom, $species, $levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log, $getSummaries, $components, $ftpServor) = @_; # assembly_summary.txt file from NCBI FTP site #my $assemblySummary = get_summaries($ftpServor, $kingdom, $getSummaries, $directory); my $assemblySummary = ""; if($path){ $assemblySummary = $path; } else{ $assemblySummary = download_summaries($directory, $kingdom, $ftpServor, $fldSep, $getSummaries); } #lineage folder # my $lineage_file = "/pub/taxonomy/new_taxdump/new_taxdump.tar.gz"; # allow to check old summary download my $oldKingdom = ""; # start of output file if ($log) { print LOG "...Downloading assembly_summary.txt...\n"; } # if ($actualOS =~ /Unix/i) { #initialiaze tar manipulation # my $tar = Archive::Tar->new; #download taxdump folder # download_file($ftpServor, $lineage_file); # $tar->read("new_taxdump.tar.gz"); # $tar->extract_file("rankedlineage.dmp"); # } #my $kingdomRep = $kingdom."_".$start_year."_".$start_month."_".$start_day; #my $kingdomRep = $kingdom."_folder"; my $kingdomRep = "folder"; if ($mainFolder) { $kingdomRep = $mainFolder; } mkdir $kingdomRep unless -d $kingdomRep; # Repository for request #my $repositoryAssembly = "assembly_repository_".$assemblyTaxid; my $repositoryAssembly = "result"; mkdir $repositoryAssembly unless -d $repositoryAssembly; my $oldAssemblyRep = "." . $fldSep . $kingdomRep . $fldSep . $repositoryAssembly; if (-d $oldAssemblyRep) { rmtree($oldAssemblyRep) } # Repository for fna file my $repositoryFNA = "Assembly"; mkdir $repositoryFNA unless -e $repositoryFNA; # Repository for genbank file my $repositoryGenbank = "GenBank"; mkdir $repositoryGenbank unless -e $repositoryGenbank; # Reposotiry for report file my $repositoryReport = "Report"; mkdir $repositoryReport unless -e $repositoryReport ; # Repositories for required components my %componentsRepHash; if ($components) { for my $component (split /,/, $components) { my $specificRep = $component."_folder"; #my $specificRep = $component."_".$species."_".$kingdom."_".$start_year."_".$start_month."_".$start_day; mkdir $specificRep unless -d $specificRep; $componentsRepHash{$component} = $specificRep; } } if ($log) { print LOG "...Create kingdom and components repository...\n"; } my %assemblyReportList; my @assemblyRepresentationList = qw/Full Partial/; my @levelList = @{$levelList}; my $checkCompleteGenome = grep(/complete genome/i, @levelList); if ($checkCompleteGenome > 0) {@assemblyRepresentationList = qw/Full/;} if (-e $assemblySummary) { # Read file open (SUM, $assemblySummary) or die " error open assembly_summary.txt $!:"; while(<SUM>) { chomp; if ($_ !~ m/^#/) { my @infoList = split /\t/, $_; my $foundAssemRep = grep (/$infoList[13]/i, @assemblyRepresentationList); my $checkLevel = grep(/$infoList[11]/i, @levelList); #replace 11 by 10 if ($foundAssemRep > 0 && $checkLevel > 0) { my $indexInfo = 0; my $searchPattern = ""; my $regex = ""; if ($species !~ /^$/) { $indexInfo = 7; $searchPattern = $species; $regex = qr/$searchPattern/i; } elsif ($assemblyTaxid !~ /^$/) { $indexInfo = 5; $searchPattern = $assemblyTaxid; $regex = qr/^$searchPattern$/i; } if (($infoList[$indexInfo] =~ $regex) or ($kingdom !~ /^$/ && $searchPattern =~ /^$/)) { my @gcfInfo = split(/\//, $infoList[19]); my $gcfName = pop(@gcfInfo); my $realDate = $infoList[14]; $realDate =~ s/\//-/g; my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz"; my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz"; my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt"; if ($realDate gt $releaseDate) { $dnaFile = obtain_file($ftpServor, $dnaFile); $genbankFile = obtain_file($ftpServor, $genbankFile); $assemblyReport = obtain_file($ftpServor, $assemblyReport); download_file($ftpServor, $dnaFile); download_file($ftpServor, $genbankFile); download_file($ftpServor, $assemblyReport); if ($log) { print LOG "...download FASTA and GenBank report files...\n"; } # download sequences and check number of "N" characters my $fileFasta = $gcfName."_genomic.fna.gz"; my $ucpFasta = $gcfName."_genomic.fna"; if (-e $fileFasta) { gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n"; move($ucpFasta, $repositoryFNA) or die "move failed: $!"; } if ($log) { print LOG "...Unzip FASTA file named $fileFasta ...\n"; } # download genome report my $fileReport = $gcfName."_assembly_report.txt"; if (-e $fileReport) { my $fileInformations = $gcfName."_informations.xls"; move($fileReport, $repositoryReport) or die "move failed: $!"; } # download genbank files my $fileGenbank = $gcfName."_genomic.gbff.gz"; my $ucpGenbank = $gcfName."_genomic.gbff"; if (-e $fileGenbank) { gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n"; move($ucpGenbank, $repositoryGenbank) or die "move failed: $!"; } if ($log) { print LOG "...Unzip GenBank file $fileGenbank ...\n"; } # association report and fasta my $fileFastaGenbank = $ucpFasta . "," . $ucpGenbank; $assemblyReportList{$fileReport} = $fileFastaGenbank; if ($quantity) { $quantity -= 1; } } } } } if (defined $quantity && $quantity == 0) { $quantity = undef; last; } } close(SUM) or die "close file error : $!"; if (!keys %assemblyReportList) { print "##################################################################\n"; print "No results were found for the following query:\n"; print "perl $0 @ARGV\n"; print "##################################################################\n\n"; if ($actualOS =~ /unix/i) { unlink glob "*.dmp *.gz" or die "for file *.dmp *.gz $!:"; } if (empty_folder($kingdomRep)) { rmdir $kingdomRep or die "fail remove directory $!:"; } rmdir $repositoryAssembly or die "failed to remove directory $!:"; rmdir $repositoryFNA or die "failed to remove directory $!:"; rmdir $repositoryGenbank or die "failed to remove directory $!:"; rmdir $repositoryReport or die "failed to remove directory $!:"; if ($components) { for my $componentRep (values %componentsRepHash) { rmdir $componentRep or die "failed to remove directory $!:"; } } if ($log) { print LOG "...No results were found for the following query: "; print LOG "perl $0 @ARGV \n"; } } else { if ($log) { print LOG "...Results were found for the following query: "; print LOG "perl $0 @ARGV \n"; } # write summary files my %componentsSumHash; my @keysList = keys %assemblyReportList; my $summary = "summary.xls"; my $htmlSummary = "summary.html"; if ($components) { for my $component (split /,/, $components) { my $specificSummary = $component. "_summary.xls"; $componentsSumHash{$component} = $specificSummary; } } my $fileReport = "." . $fldSep. $repositoryReport . $fldSep . $keysList[0]; my @header = (); open(FILE, $fileReport) or die "error open file : $!"; while(<FILE>) { chomp; if($_ =~ /:/){ $_ =~ s/^#*//; my @ligne = split /:/, $_; push(@header, $ligne[0]); } } close(FILE) or die "error close file : $!"; open(HEAD, ">", $summary) or die " error open file : $!"; foreach(@header) { print HEAD $_ . "\t"; } print HEAD "Pubmed\tNucleScore\tClassification\t"; print HEAD "Country\tHost\tIsolation source\tA percent\t"; print HEAD "T percent\tG percent\tC percent\tN percent\tGC percent\t"; print HEAD "ATGC ratio\tLength\tShape\n"; close(HEAD) or die "error close file : $!"; if ($components) { foreach my $componentSummary (values %componentsSumHash) { open(SUM, ">>", $componentSummary) or die "error open file : $!"; print SUM "Id\tAssembly\tDescription\tLength\tStatus\tLevel\t"; print SUM "GC percent\tA percent\tT percent\tG percent\tC percent\n"; close(SUM) or die "error close file : $!"; } } for my $file (@keysList) { my $reportFile = $repositoryReport . $fldSep . $file; my @fastaGenbank = split /,/, $assemblyReportList{$file}; my $extFasta = $fastaGenbank[0]; my $extGenbank = $fastaGenbank[1]; my $fnaFile = $repositoryFNA . $fldSep . $extFasta; my $genbankFile = $repositoryGenbank . $fldSep . $extGenbank; write_assembly($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly, \%componentsSumHash, $kingdom, $actualOS, \@header, $log); if ($log) { print LOG "...Call write_assembly subroutine...\n"; } } write_html_summary($summary); if ($log) { print LOG "...Call write_html_summary subroutine...\n"; } if ($components) { my @componentList = keys %componentsSumHash; my %componentFastaHash = create_component_sequence_file($fldSep, $repositoryFNA, \@componentList); if (keys %componentFastaHash && $log) { $log->msg(1,"call create_component_sequence_file subroutine");} foreach my $component (keys %componentFastaHash) { move($componentFastaHash{$component}, $componentsRepHash{$component}) or die "move failed: $!"; } } move($summary, $repositoryAssembly) or die "move failed: $!"; move($htmlSummary, $repositoryAssembly) or die "move failed: $!"; move($repositoryFNA, $repositoryAssembly . $fldSep . $repositoryFNA) or die "move failed: $!"; move($repositoryGenbank, $repositoryAssembly . $fldSep . $repositoryGenbank) or die "move failed: $!"; move($repositoryReport , $repositoryAssembly . $fldSep . $repositoryReport) or die "move failed: $!"; if ($log) { print LOG "...move fasta, genbank and report to query folder \n"; } if ($components) { for my $component (keys %componentsSumHash) { move($componentsSumHash{$component}, $componentsRepHash{$component}) or die "move failed: $!"; move($componentsRepHash{$component}, $repositoryAssembly . $fldSep . $componentsRepHash{$component}) or die "move failed: $!" } if ($log) { print LOG "...move component files to folders\n"; } } move($repositoryAssembly, $kingdomRep . $fldSep . $repositoryAssembly) or die "move failed: $!"; if ($log) { print LOG "...move query folder to main folder\n"; } # if ($actualOS =~ /unix/i) { unlink glob "*.dmp" or die "for file *.dmp $!:"; } unlink glob "*.gz sequence.txt" or die "$!: for file *.gz sequence.txt"; } } } #write general assembly file sub write_assembly { my ($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly, $componentsSumHashRef, $kingdom, $actualOS, $headerRef, $log) = @_; my %componentsSumHash = %{$componentsSumHashRef}; my @header = @{$headerRef}; my %hashInformations = (); my $seq = ""; my $genomeName = ""; my $country = "na"; my $GCpercent = -1; my $taxId = "na"; my $assemblyLine; my $pubmedId = "na"; my $host = "na"; my $isoSource = "na"; # my $species = "na"; # my $genus = "na"; # my $family = "na"; # my $order = "na"; # my $class = "na"; # my $phylum = "na"; my $shape = "na"; my $geneNumber = "na"; open(REP, "<", $reportFile) or die "error open file $!:"; while (<REP>) { chomp; $_ =~ s/^#*//; if ($_ =~ /assembled-molecule/i) { $assemblyLine = $_; } if ($_ =~ /:/) { my @line = split /:/, $_; if ($line[1]) { $hashInformations{$line[0]} = trim($line[1]); } } } close(REP) or die "error close file $!:"; open(SUM, ">>", $summary) or die "error open file $!:"; foreach my $k(@header) { if (grep $_ eq $k, keys %hashInformations) { my $information = $hashInformations{$k}; if ($k =~ /Assembly name/i) { $genomeName = $information; } if ($information =~ /^\s*$/) { print SUM "na\t"; } else { print SUM $information . "\t"; } } else { print SUM "na\t"; } } close(SUM) or die "error close file $!:"; open(FNA, "<", $fnaFile) or die "Could not open $!:"; while (<FNA>) { chomp; if ($_ !~ /^>/) { $seq .= $_; } } close(FNA) or die "error close file :$!"; foreach my $summaryKey (keys %hashInformations) { if ($summaryKey =~ /taxid/i) { $taxId = $hashInformations{$summaryKey}; } } my $classification = get_taxonomic_rank_genbank($genbankFile); if ($log) { print LOG "...get taxonomic rank from genbank file\n"; } $GCpercent = gc_percent($seq); my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($fnaFile); my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length); my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt); my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent); my $variance = shift_data_variance(@percentList); my $nucleScore = nucle_score($variance, $GCpercent, $atgcRatio, $length); if ($log) { print LOG "compute GC percent nucleotid percent ATGC ratio NucleScore\n"; } open(GBFF, "<", $genbankFile) or die "Error open file $!:"; while(<GBFF>) { chomp; if ($_ =~ /\/country="(.*)"/) { $country = trim($1); } if ($_ =~ /PUBMED(.*)/) { $pubmedId = trim($1); } if ($_ =~ /\/host="(.*)"/) { $host = trim($1); } if ($_ =~ /\/isolation_source="(.*)"/) { $isoSource = trim($1); } if ($_ =~ /\(Genes \(total\)\s+::(.*)/) { $geneNumber = trim($1); } if ($_ =~ /LOCUS.*\s+([a-z]{1,})\s+[a-z]{1,}\s+[0-9]{2,}-[a-z]{1,}-[0-9]{4,}$/i) { $shape = trim($1); } } close(GBFF) or die "error close file $!:"; open(SUM, ">>", $summary) or die "error open file $!:"; print SUM $pubmedId . "\t" . $nucleScore . "\t" . $classification ."\t" ; print SUM $country . "\t" . $host . "\t" . $isoSource . "\t" . $aPercent . "\t" ; print SUM $tPercent . "\t" . $gPercent . "\t" . $cPercent ."\t" . $nPercent . "\t"; print SUM $GCpercent ."\t". $atgcRatio ."\t". $length . "\t". $shape."\n"; close(SUM) or die "error close file: $!"; if (%componentsSumHash) { write_assembly_component($fnaFile, $genomeName, \%componentsSumHash, $log); } } #------------------------------------------------------------------------------ # get assembly component sub write_assembly_component { my($fnaFile, $genomeName, $componentsSumHashRef, $log) = @_; my %componentsSumHash = %{$componentsSumHashRef}; my $status = "na"; my $level = "na"; my $gcpercent; my $tmp_fasta_file = "sequence.txt"; my @desc = (); # check each sequence from (multi-)fasta file my ($seq, $inputfile); if ($log) { print LOG "...search specific components\n"; } # extract sequence details my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$fnaFile); while ($seq = $seqIO->next_seq()) { my $seqID = $seq->id; # ID of sequence my $seqDesc = $seq->desc; # Description of sequence my $globalSeq = $seq->seq; my $seqLength = $seq->length(); open(TSEQ, ">", $tmp_fasta_file) or die "Error open file: $!"; print TSEQ $globalSeq; close(TSEQ); my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($tmp_fasta_file); my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length); $gcpercent = gc_percent($globalSeq); @desc = split /,/, $seqDesc; if ($desc[1]) { $level = $desc[1]; } foreach my $component (keys %componentsSumHash) { if ($desc[0] =~ /$component/) { $status = $component; my $info = $seqID . "\t" . $genomeName ."\t" . $seqDesc . "\t" . $seqLength . "\t" . $status . "\t" . $level ."\t"; $info.= $gcpercent."\t". $aPercent ."\t". $tPercent ."\t". $gPercent ."\t". $cPercent . "\n"; add_to_file($componentsSumHash{$component}, $info); if ($log) { print LOG "...found component $component \n"; } } } } } #------------------------------------------------------------------------------ # download fasta sequence and report on ENA with assembly ID sub get_fasta_and_report_sequence_ena_assembly { my($sequenceID) = @_; my $tmp_file = "fichier.txt"; my @id_list = (); my $id_chain = ""; my $fasta_file = ""; my $report_file = ""; my $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=xml"; my $output = get($url); open(TMP, ">", $tmp_file) or die("could not open $!"); print TMP $output; close(TMP) or die("could not close $!"); open(TMP, "<", $tmp_file) or die("could not open $!"); while(<TMP>){ if($_ =~ /<CHROMOSOME accession="(.*)">/){ push(@id_list, $1) } } close(TMP) or die("could not close $!"); $id_chain = join(",", @id_list); $url = "https://www.ebi.ac.uk/ena/data/view/$id_chain&display=fasta"; $output = get($url); $fasta_file = $sequenceID . ".fasta"; open(FILE, ">", $fasta_file) or die("could not open $!"); print FILE $output; close(FILE) or die("could not close $!"); $report_file = $sequenceID . "_report.txt"; for my $id (@id_list) { $url = "https://www.ebi.ac.uk/ena/data/view/$id&display=text&header=true"; $output = get($url); open(FILE, ">>", $report_file) or die("could not open $!"); print FILE $output; print FILE "##########################################################################\n\n"; close(FILE) or die("could not close $!"); } unlink "fichier.txt" or die "error delete file :$!"; return ($fasta_file, $report_file); } #------------------------------------------------------------------------------ # download ENA sub sequence_ena { my($sequenceID, $log) = @_; #my $assemblyRep = $sequenceID . "_folder"; my $fastaFile; my $reportFile; #if(-d $assemblyRep) { rmtree($assemblyRep); } #mkdir $assemblyRep; if ($log) { print LOG "...ENA sequence downloading ...\n"; } #if ($log) {$log->msg(1, "Creation of repository: $assemblyRep\n");} if($sequenceID =~ /^GCA_.*/){ ($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_assembly($sequenceID); } else { ($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_other($sequenceID); } if ($log) { print LOG "...Downloading of ENA FASTA and report files for sequence: $sequenceID\n"; } #move($fastaFile, $assemblyRep) or die "move failed: $!"; #move($reportFile, $assemblyRep) or die "move failed: $!"; if ($log) { print LOG "...Move fasta and report files to folder\n"; } } #------------------------------------------------------------------------------ # download fasta sequence and report on ENA with ENA ID sub get_fasta_and_report_sequence_ena_other { my($sequenceID) = @_; my $fasta_file = ""; my $report_file = ""; my $url; my $output; $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=fasta"; #if($actualOS eq "MSWin32"){ $output = get($url); $fasta_file = $sequenceID . ".fasta"; open(FILE, ">", $fasta_file) or die("could not open $!"); print FILE $output; close(FILE) or die "could not close $!"; #} #else{ # system("wget $url"); #} $output = ""; $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=text&header=true"; #if($actualOS eq "MSWin32"){ $output = get($url); # replace by wget???? $report_file = $sequenceID . "_report.txt"; open(FILE, ">>", $report_file) or die "could not open: $!"; print FILE $output; close(FILE) or die "could not close $!"; #} #else{ # system("wget $url"); #} return ($fasta_file, $report_file); } #------------------------------------------------------------------------------ # add information to file sub add_to_file { my ($file, $info) = @_; open(FILE, ">>", $file) or die ("Could not open $!"); print FILE $info; close(FILE); } #------------------------------------------------------------------------------ # return taxonomic rank of species by tax id sub get_taxonomic_rank { my($tax_id, $taxonomic_file) = @_; my $species = "na"; my $genus = "na"; my $family = "na"; my $order = "na"; my $class = "na"; my $phylum = "na"; # my ($species,$genus,$family,$order,$class,$phylum); my @tmp_array = ($species, $genus, $family, $order, $class, $phylum); open(TFILE, "<", $taxonomic_file) or die("Could not open $taxonomic_file: $!"); while(<TFILE>) { chomp; my @tax_info = split(/\|/, $_); if ($tax_info[0] == $tax_id) { @tax_info = trim_array(@tax_info); $tmp_array[0] = $tax_info[1]; splice(@tax_info, 0, 3); for(my $i = 1; $i < $#tmp_array + 1; $i++) { if (length($tax_info[$i-1]) > 0) { $tmp_array[$i] = $tax_info[$i-1]; } } close(TFILE) or die "error close $taxonomic_file $!:"; return @tmp_array; } } close(TFILE) or die "error close $taxonomic_file $!:"; } #------------------------------------------------------------------------------ # write html summary file sub write_html_summary { my($summary) = @_; my $htmlFile = "summary.html"; my $header = ""; my @fileToRead = (); open(HTML, ">", $htmlFile) or die "error open HTML summary $!"; print HTML "<!DOCTYPE html>\n"; print HTML "<html>\n"; print HTML " <head>\n"; print HTML " <title>Assembly summary</title>\n"; print HTML " </head>\n"; print HTML " <body>\n"; print HTML " <h2>Assembly Summary</h2>\n"; close(HTML) or die "error close HTML summary $!"; open(SUM, "<", $summary) or die "error open tsv summary $!"; @fileToRead = <SUM>; close(SUM) or die "error close tsv summary $!"; $header = splice(@fileToRead, 0, 1); for my $line (@fileToRead) { write_html_table($line, $htmlFile, $header); } open(HTML, ">>", $htmlFile) or die "error open HTML summary $!"; print HTML " </body>\n"; print HTML "</html>\n"; close(HTML) or die "error close HTML summary $!"; } #------------------------------------------------------------------------------ # write html table for summary sub write_html_table { my ($line, $htmlFile, $header) = @_; open(HTML, ">>", $htmlFile) or die "error open HTML summary $!"; print HTML " <table border=\"1\" style=\"margin-bottom: 20px;\">\n"; close(HTML) or die "error close HTML summary $!"; add_table_content($line, $htmlFile, $header); } #------------------------------------------------------------------------------ # add information to table sub add_table_content { my ($line, $htmlFile, $headers) = @_; my @assemblyHeader = split(/\t/, $headers); my @assemblyInfo = split(/\t/, $line); my %hashHeaderInfo; my $nbOfCell = 7; my $fullLine = floor(($#assemblyHeader + 1) / $nbOfCell); my $restCell = $#assemblyHeader + 1 - $fullLine * $nbOfCell; for (my $i = 0; $i < $#assemblyHeader + 1; $i++) { $hashHeaderInfo{trim($assemblyHeader[$i])} = $assemblyInfo[$i]; } my @keysHeaderInfo = keys %hashHeaderInfo; my $cellIndex = 0; open(HTML, ">>", $htmlFile) or die "error open HTML summary $!"; for (my $turn = 0; $turn < $fullLine; $turn++) { print HTML " <tr>\n"; for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) { print HTML " <th>$header</th>\n"; } print HTML " </tr>\n"; print HTML " <tr>\n"; for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) { if ($header =~ /PUBMED/i && $hashHeaderInfo{$header} ne "na") { print HTML " <td><a href=https://www.ncbi.nlm.nih.gov/pubmed/?term=". "$hashHeaderInfo{$header} target=\"_blank\">$hashHeaderInfo{trim($header)}</a></td>"; } else { print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n"; } } print HTML " </tr>\n"; $cellIndex += $nbOfCell; } print HTML " <tr>\n"; for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) { print HTML " <th>$header</th>\n"; } print HTML " <tr>\n"; print HTML " <tr>\n"; for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) { print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n"; } print HTML " <tr>\n"; print HTML " </table>\n"; close(HTML) or die "error close HTML summary $!"; } #------------------------------------------------------------------------------ #getTaxonomicRanks (function allowing to get taxonomic ranks from Genbank file) sub get_taxonomic_rank_genbank { my ($genbank) = @_; my $seqio_object = Bio::SeqIO->new(-file => $genbank); my $seq_object = $seqio_object->next_seq; # legible and long my $species_object = $seq_object->species; my $species_string = $species_object->node_name; # get all taxa from the ORGANISM section in an array my @classification = $seq_object->species->classification; # my $arraySize = @classification; # print "@classification\n"; # if($arraySize == 7){ # ($species,$genus,$family,$order,$class,$phylum,$kingdomGB) = @classification; # } # elsif($arraySize == 4){ # ($species,$class,$phylum,$kingdomGB) = @classification; # } my $classification = join(",", @classification); return ($classification); } #------------------------------------------------------------------------------ #add all sequences components to file sub create_component_sequence_file { my ($fldSep, $repository, $listComponentRef) = @_; my @listFnaFile; my @listComponent = @{$listComponentRef}; opendir(my $dh, $repository) || die "Can't opendir $repository: $!"; @listFnaFile = grep{/fna$/} readdir($dh); closedir $dh; my %componentFastaHash; foreach my $component (@listComponent) { my $componentFasta = $component.".fasta"; foreach my $fnaFile (@listFnaFile) { # my $actualFile = $repository . $fldSep . $fnaFile; my $seq; my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$repository . $fldSep . $fnaFile); while ($seq = $seqIO->next_seq()) { my $seqDesc = $seq->desc; if ($seqDesc =~ /$component/) { my $seqID = $seq->id; my $seqNuc = $seq->seq; my $shift = 60; my @seqArray = split //, $seqNuc; my $newSeqNuc = ""; if (length $seqNuc <= $shift) { $newSeqNuc = $seqNuc; } else { for(my $i = 0; $i < $#seqArray + 1; $i ++) { $newSeqNuc .= $seqArray[$i]; if (($i + 1) % $shift == 0) { $newSeqNuc .= ","; } } } open(FASTA, ">>", $componentFasta) or die "error open file $!:"; print FASTA ">$seqID $seqDesc\n"; foreach my $subSeqNuc (split /,/, $newSeqNuc) { print FASTA "$subSeqNuc\n"; } close(FASTA) or die "error close file $!:"; } } } if (-e $componentFasta) { $componentFastaHash{$component} = $componentFasta; } } return %componentFastaHash; } # remove back and front spaces sub trim { my ($string) = @_; $string =~ s/^\s+//; $string =~ s/\s+$//; return $string; } #------------------------------------------------------------------------------ # use trim in array sub trim_array { my (@array) = @_; foreach my $value (@array) { $value = trim($value); } return @array; } #------------------------------------------------------------------------------ # check if folder is empty sub empty_folder { my $dirname = shift; opendir(my $dholder, $dirname) or die "error not a directory"; my $isEmpty = scalar(grep { $_ ne "." && $_ ne ".." } readdir($dholder)); if ($isEmpty == 0) { return $isEmpty; } } #------------------------------------------------------------------------------ # number nucleotid and length sub number_nuc_length_seq { my ($fastaFile) = @_; my $ade = 0; my $thy = 0; my $gua = 0; my $cyt = 0; my $n = 0; my $length = 0; open (FASTA, "<", $fastaFile) or die "Could not open $!"; while (<FASTA>) { chomp; if ($_ !~ />/) { my @seq = split //, $_; for my $nuc (@seq) { $length +=1 ; if ($nuc =~ /a/i) {$ade+=1;} elsif ($nuc =~ /t/i) {$thy+=1;} elsif ($nuc =~ /g/i) {$gua+=1;} elsif ($nuc =~ /c/i) {$cyt+=1;} elsif ($nuc =~ /n/i) {$n+=1;} } } } close(FASTA) or die "Error close file :$!"; return ($ade, $thy, $gua, $cyt, $n, $length); } #------------------------------------------------------------------------------ # compute percentage of nucleotid sub nucleotid_percent { my($ade, $thy, $gua, $cyt, $n, $length) = @_; my $adePercent = $ade / $length * 100; my $thyPercent = $thy / $length * 100; my $guaPercent = $gua / $length * 100; my $cytPercent = $cyt / $length * 100; my $nPercent = $n / $length * 100; return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent); } #------------------------------------------------------------------------------ # compute ATGC ratio sub atgc_ratio { my ($ade, $thy, $gua, $cyt) = @_; return (($ade + $thy) / ($gua + $cyt)); } #------------------------------------------------------------------------------ # variance sub shift_data_variance { my (@data) = @_; if ($#data + 1 < 2) { return 0.0; } my $K = $data[0]; my ($n, $Ex, $Ex2) = 0.0; for my $x (@data) { $n = $n + 1; $Ex += $x - $K; $Ex2 += ($x - $K) * ($x - $K); } my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1) return $variance; } #------------------------------------------------------------------------------ # nucle score sub nucle_score { my ($variance, $gcPercent, $atgcRatio, $length) = @_; #return log2(($variance * $gcPercent * $atgcRatio) / sqrt($length)); return log2(($variance * $gcPercent * $atgcRatio ** (3)) / sqrt($length)); } #------------------------------------------------------------------------------ sub log2 { my $n = shift; return (log($n) / log(2)); } #------------------------------------------------------------------------------ # compute GC pourcent sub gc_percent { my ($seq) = @_; my @charSeq = split(//, uc($seq)); my %hashFlank = (); foreach my $v (@charSeq) { $hashFlank{$v} += 1; } if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;} if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;} if(length($seq) == 0) { return 0; } else { return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100; } } #------------------------------------------------------------------------------ # download file from ftp protocol sub download_file { my($servor, $file) = @_; #if($actualOS eq "MSWin32"){ my $ftp = Net::FTP->new($servor, Debug => 0) or die "Cannot connect to $servor"; $ftp->login("anonymous", "-anonymous@") or die "Cannot login ", $ftp->message; $ftp->binary; $ftp->get($file) or die "get failed ", $ftp->message; $ftp->quit; #} #else{ # system("wget $file"); #} } #------------------------------------------------------------------------------ # obtain file directory sub obtain_file { my($servor, $link) = @_; if ($link =~ /$servor(.*)/) { return ($1); } } #------------------------------------------------------------------------------ # download fastq file from ENA sub download_ena_fastq { my ($enaFtpServor, $sraId, $log) = @_; my $fastqDir = "/vol1/fastq/"; my $dir1 = substr $sraId, 0, 6; my $dir2 = "000"; my $digits = substr $sraId, 3; my $fastqRep = $sraId . "_folder"; if ($log) { print LOG "...Downloading fastq file from ENA\n"; } if (length $digits == 6) { $dir2 = $sraId; $fastqDir .= $dir1 . "/" . $dir2 . "/"; } elsif (length $digits > 6) { my $digitsNumber = 0; my @digitsList = split //, (substr $digits, 6); foreach my $char (@digitsList) { if (length $dir2 >= 1) { $dir2 = substr $dir2, 0, (length $dir2) - 1; $digitsNumber += 1; } } $dir2 .= substr $digits, -$digitsNumber; $fastqDir .= $dir1 . "/" . $dir2 . "/" . $sraId . "/"; } if ($log) { print LOG "...recreate database folder path for FASTQ downloading\n"; } my $ftp = Net::FTP->new($enaFtpServor, Debug => 0) or die "Cannot connect to $enaFtpServor"; $ftp->login("anonymous", "-anonymous@") or die "Cannot login ", $ftp->message; $ftp->binary; $ftp->cwd($fastqDir) or die "maybe undefined sequence id, can't go to $fastqDir: ", $ftp->message; my @fastqFiles = $ftp->ls("$sraId*"); if ($log) { print LOG "...Searching fastq files in path\n"; } if (!grep(/^$/, @fastqFiles)) { if (-d $fastqRep) { rmtree($fastqRep) } mkdir $fastqRep; foreach my $fastqFile (@fastqFiles) { #if($actualOS eq "MSWin32"){ $ftp->get($fastqFile) or die "get failed ", $ftp->message; #} #else{ # system("wget $fastqFile"); #} #my @baseAndExt = split /\./, $fastqFile; #my $unzipFastq = $baseAndExt[0] . ".fastq"; #gunzip $fastqFile => $unzipFastq or die "gunzip failed: $GunzipError\n"; move($fastqFile, $fastqRep) or die "move failed: $!"; # DC replaced $unzipFastq by $fastqFile } #unlink glob "*fastq.gz" or die "$!: for file *fastq.gz"; if ($log) { print LOG "...Finalizing download of FASTQ file\n"; } } if ($log) { print LOG "End of download\n"; } $ftp->quit; } #------------------------------------------------------------------------------ # download fastq file from ENA sub get_assembly_or_project { my ($file, $sequence, $ftpServor, $fldSep, $log) = @_; my $pattern; my $indexInfo; my %folderHash; # Repository for fna file my $repositoryFNA = "Assembly"; # Repository for genbank file my $repositoryGenbank = "GenBank"; # Reposotiry for report file my $repositoryReport = "Report"; # global repository my $repositorySequence = $sequence; if ($sequence =~ /^GC[AF]_(.*)/) { $indexInfo = 0; $pattern = $1; } elsif ($sequence =~ /^PRJ/) { $indexInfo = 1; $pattern = $sequence; } open(SUM, $file) or die "error open file $!:"; while(<SUM>) { chomp; if ($_ !~ /^#/) { my @infoList = split /\t/, $_; if ($infoList[$indexInfo] =~ /$pattern/) { my @gcfInfo = split(/\//, $infoList[19]); my $gcfName = pop(@gcfInfo); my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz"; my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz"; my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt"; $dnaFile = obtain_file($ftpServor, $dnaFile); $genbankFile = obtain_file($ftpServor, $genbankFile); $assemblyReport = obtain_file($ftpServor, $assemblyReport); download_file($ftpServor, $dnaFile); download_file($ftpServor, $genbankFile); download_file($ftpServor, $assemblyReport); # download sequences and check number of "N" characters my $fileFasta = $gcfName."_genomic.fna.gz"; my $ucpFasta = $gcfName."_genomic.fna"; if (-e $fileFasta) { gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n"; $folderHash{$ucpFasta} = $repositoryFNA; } # download genome report my $fileReport = $gcfName."_assembly_report.txt"; if (-e $fileReport) { $folderHash{$fileReport} = $repositoryReport; } # download genbank files my $fileGenbank = $gcfName."_genomic.gbff.gz"; my $ucpGenbank = $gcfName."_genomic.gbff"; if (-e $fileGenbank) { gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n"; $folderHash{$ucpGenbank} = $repositoryGenbank; } } } } close(SUM) or die "error close file $!"; if (keys %folderHash) { if (-e $repositorySequence) {rmtree($repositorySequence);} if ($log) { print LOG "...Download files from GenBank or RefSeq \n"; } mkdir $repositorySequence; mkdir $repositoryFNA; mkdir $repositoryGenbank; mkdir $repositoryReport; for my $ucpFile (keys %folderHash) { move($ucpFile, $folderHash{$ucpFile}) or die "error move file $!:"; } move($repositoryFNA, $repositorySequence . $fldSep. $repositoryFNA) or die "error move file $!:"; move($repositoryGenbank, $repositorySequence . $fldSep. $repositoryGenbank) or die "error move file $!:"; move($repositoryReport, $repositorySequence . $fldSep. $repositoryReport) or die "error move file $!:"; unlink glob "*.gz" or die "for file *.gz $!:"; if ($log) { print LOG "...move GenBank/RefSeq sequence files to dedicated folders\n"; } } } #------------------------------------------------------------------------------ sub download_assembly_or_project { my ($sequenceId, $ftpServor, $fldSep, $directory, $log) = @_; my $assemblySummary; my @sequenceIdList = split /,/, $sequenceId; if ($directory =~ /refseq/) { $assemblySummary = "assembly_summary_refseq.txt"; } elsif ($directory =~ /genbank/) { $assemblySummary = "assembly_summary_genbank.txt"; } my $assemblySummaryPath = "/genomes/ASSEMBLY_REPORTS/" . $assemblySummary; download_file($ftpServor, $assemblySummaryPath); foreach my $sequence (@sequenceIdList) { get_assembly_or_project($assemblySummary, $sequence, $ftpServor, $fldSep, $log); } unlink $assemblySummary or die "error remove file $!:"; } #------------------------------------------------------------------------------ # check if all required module are install sub isModuleInstalled { my $mod = shift; #eval("use $mod"); my $commandModule = `perldoc -l $mod`; if ($commandModule) { return(1); } else { return(0); } } #------------------------------------------------------------------------------ # download assembly summary sub download_summaries { my ($database, $kingdom, $ftpServor, $fldSep, $getSummaries) = @_; my $assemblySummary = "assembly_summary.txt"; my $assemblySummaryLink; my $fileName; opendir my $workingDirectory, "." . $fldSep or die "error open dir $!:"; my @filesList = readdir $workingDirectory; closedir $workingDirectory; if ($getSummaries) { foreach my $summaryKingdom (split /,/, $getSummaries) { foreach my $file (@filesList) { if ($file =~ /assembly_summary.txt/i && $file =~ /$summaryKingdom/i && $file =~ /$database/) { unlink $file or die "error remove file $!:"; $assemblySummaryLink = "/genomes/$database/$summaryKingdom/assembly_summary.txt"; download_file($ftpServor, $assemblySummaryLink); $fileName = $database . "_" . $summaryKingdom . "_" . "assembly_summary.txt"; rename $assemblySummary, $fileName; print "replace assembly_summary file\n"; } } } } foreach my $file (@filesList) { if ($file =~ /assembly_summary.txt/i && $file =~ /$kingdom/i && $file =~ /$database/) { return $file; } } $assemblySummaryLink = "/genomes/$database/$kingdom/assembly_summary.txt"; $fileName = $database . "_" . $kingdom . "_" . "assembly_summary.txt"; download_file($ftpServor, $assemblySummaryLink); rename $assemblySummary, $fileName; return $fileName; } #--------------------- sub bioseqio { my ($keyword, $file) = @_; local $/ = "\n>"; # read by FASTA record my $count = 0; open FASTA, $file; while (<FASTA>) { chomp; my $seq = $_; #my ($id) = $seq =~ /^>*(\S+)/; # parse ID as first word in FASTA header if ($seq =~ /^>*.*$keyword/) { #$seq =~ s/^>*.+\n//; # remove FASTA header #$seq =~ s/\n//g; # remove endlines $count++; print "\nThe sequence number is: $count \n"; print ">$seq"; } } close FASTA; }