0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 ################################################################################
|
|
4 ## "Copyright 2019 Vincent Moco and David Couvin"
|
|
5 ## licence GPL-3.0-or-later
|
|
6 ## This program is free software: you can redistribute it and/or modify
|
|
7 ## it under the terms of the GNU General Public License as published by
|
|
8 ## the Free Software Foundation, either version 3 of the License, or
|
|
9 ## (at your option) any later version.
|
|
10 ## This program is distributed in the hope that it will be useful,
|
|
11 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 ## GNU General Public License for more details.
|
|
14 ## You should have received a copy of the GNU General Public License
|
|
15 ## along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
16 ################################################################################
|
|
17
|
|
18 use strict;
|
|
19 use warnings;
|
|
20
|
|
21 my $version = "1.0.1"; # version
|
|
22
|
|
23 #my $number = 50;
|
|
24
|
|
25 # Date and time of the current day (Beginning)
|
|
26 #my ($start_year,$start_month,$start_day, $start_hour,$start_min,$start_sec) = Today_and_Now();
|
|
27 my $start = time();
|
|
28
|
|
29 print "##################################################################\n";
|
|
30 print "## ---> Welcome to $0 (version $version)!\n";
|
|
31 #print "## Start Date (yyyy-mm-dd, hh:min:sec): $start_year-$start_month-$start_day, $start_hour:$start_min:$start_sec\n";
|
|
32 print "##################################################################\n\n";
|
|
33
|
|
34 #BioPerl, Date::Calc, File::Log, have been removed from the @modules
|
|
35 my @modules = qw(
|
|
36 Archive::Tar
|
|
37 Bio::SeqIO
|
|
38 Bio::Species
|
|
39 File::Copy
|
|
40 File::Path
|
|
41 Net::FTP
|
|
42 IO::Uncompress::Gunzip
|
|
43 LWP::Simple
|
|
44 POSIX
|
|
45
|
|
46 );
|
|
47
|
|
48 foreach my $module (@modules) {
|
|
49 if (isModuleInstalled($module)) {
|
|
50 print "$module is.................installed!\n";
|
|
51 }
|
|
52 else {
|
|
53 print "$module is not installed. Please install it and try again.\n";
|
|
54 print "You can reinstall the $0 as shown on the README page or use the following command to install the module:\n";
|
|
55 print "cpan -i -f $module\n";
|
|
56 #system("cpan -i -f $module");
|
|
57 exit 1;
|
|
58 }
|
|
59 }
|
|
60 print "\n";
|
|
61
|
|
62 use Archive::Tar;
|
|
63 use Bio::SeqIO;
|
|
64 use Bio::Species;
|
|
65 #use Date::Calc qw(:all);
|
|
66 use File::Copy qw(cp move);
|
|
67 use File::Path qw(rmtree);
|
|
68 use Net::FTP;
|
|
69 use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
|
|
70 use LWP::Simple qw(get);
|
|
71 use POSIX qw(floor);
|
|
72 #use File::Log;
|
|
73
|
|
74 ####################################################################################################
|
|
75 ## A Perl script allowing to get sequence information from GenBank, RefSeq or ENA repositories.
|
|
76 ##
|
|
77 ####################################################################################################
|
|
78
|
|
79 ### main program
|
|
80 ### parameters
|
|
81 my $directory = "genbank";
|
|
82
|
|
83 my $kingdom = ""; # kingdom of organism
|
|
84
|
|
85 my $releaseDate = "0000-00-00"; # sequences are downloaded from this release date
|
|
86
|
|
87 my $components; # components of the assembly
|
|
88
|
|
89 my $species = ""; # species name
|
|
90
|
|
91 my $getSummaries; # indicates if a new assembly summary is required
|
|
92
|
|
93 my $assemblyLevel = "Complete Genome,Chromosome,Scaffold,Contig"; # assembly level of the genome
|
|
94
|
|
95 my $quantity; # limit number of assemblies to download
|
|
96
|
|
97 my $sequenceID;
|
|
98
|
|
99 my $ftpServor = "ftp.ncbi.nlm.nih.gov";
|
|
100
|
|
101 my $enaFtpServor = "ftp.sra.ebi.ac.uk";
|
|
102
|
|
103 my $fldSep = "/"; # folder seperation change by OS
|
|
104
|
|
105 my @availableKingdoms = (
|
|
106 "archaea",
|
|
107 "bacteria",
|
|
108 "fungi",
|
|
109 "invertebrate",
|
|
110 "plant",
|
|
111 "protozoa",
|
|
112 "vertebrate_mammalian",
|
|
113 "vertebrate_other",
|
|
114 "viral"
|
|
115 ); # list of available kingdoms
|
|
116
|
|
117 my $actualOS = "Unix"; # OS of the computer
|
|
118
|
|
119 my $mainFolder; # folder in which the assemblies results are stored
|
|
120
|
|
121 my $assemblyTaxid = ""; # taxid for assembly
|
|
122
|
|
123 my $sraID; # SRA sequence ID
|
|
124
|
|
125 my $assemblyPrjID; # assembly or prj ID
|
|
126
|
|
127 my $log; # log
|
|
128
|
|
129 my $path = "";
|
|
130
|
|
131
|
|
132 if (@ARGV<1) {
|
|
133 help_user_simple($0);
|
|
134 exit 0;
|
|
135 }
|
|
136
|
|
137 if ($ARGV[0] eq "-help" || $ARGV[0] eq "-h") {
|
|
138 help_user_advance($0);
|
|
139 exit 0;
|
|
140 }
|
|
141
|
|
142 if ($ARGV[0] eq "-version" || $ARGV[0] eq "-v") {
|
|
143 program_version($0);
|
|
144 exit 0;
|
|
145 }
|
|
146
|
|
147 ##requirements
|
|
148 for (my $i = 0; $i <= $#ARGV; $i++) {
|
|
149 if ($ARGV[$i]=~/-kingdom/i or $ARGV[$i]=~/-k/i) {
|
|
150 $kingdom = $ARGV[$i+1];
|
|
151 }
|
|
152 elsif ($ARGV[$i]=~/-path/i or $ARGV[$i]=~/-pathSummary/i) {
|
|
153 $path = $ARGV[$i+1];
|
|
154 }
|
|
155 elsif ($ARGV[$i]=~/-directory/i or $ARGV[$i]=~/-dir/i) {
|
|
156 $directory = $ARGV[$i+1];
|
|
157 }
|
|
158 elsif ($ARGV[$i]=~/-date/i) {
|
|
159 $releaseDate = $ARGV[$i+1];
|
|
160 }
|
|
161 elsif ($ARGV[$i]=~/-getSummaries/i or $ARGV[$i]=~/-get/i) {
|
|
162 $getSummaries = $ARGV[$i+1];
|
|
163 }
|
|
164 elsif ($ARGV[$i]=~/-species/i or $ARGV[$i]=~/-s/i) {
|
|
165 $species = $ARGV[$i+1];
|
|
166 }
|
|
167 elsif ($ARGV[$i]=~/-level/i or $ARGV[$i]=~/-le/i) {
|
|
168 $assemblyLevel = $ARGV[$i+1];
|
|
169 }
|
|
170 elsif ($ARGV[$i]=~/-components/i or $ARGV[$i]=~/-c/i) {
|
|
171 $components = $ARGV[$i+1];
|
|
172 }
|
|
173 elsif ($ARGV[$i]=~/-quantity/i or $ARGV[$i]=~/-q/i or $ARGV[$i]=~/-number/i or $ARGV[$i]=~/-n/i) {
|
|
174 $quantity = int($ARGV[$i+1]);
|
|
175 }
|
|
176 elsif ($ARGV[$i]=~/-ena/i) {
|
|
177 $sequenceID = $ARGV[$i+1];
|
|
178 }
|
|
179 elsif ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) {
|
|
180 $mainFolder = $ARGV[$i+1];
|
|
181 }
|
|
182 elsif ($ARGV[$i]=~/-taxid/i) {
|
|
183 $assemblyTaxid = $ARGV[$i+1];
|
|
184 }
|
|
185 elsif ($ARGV[$i]=~/-fastq/i) {
|
|
186 $sraID = $ARGV[$i+1];
|
|
187 }
|
|
188 elsif ($ARGV[$i]=~/-assembly_or_project/i) {
|
|
189 $assemblyPrjID = $ARGV[$i+1];
|
|
190 }
|
|
191 elsif ($ARGV[$i]=~/-log/i) {
|
|
192 $log = 1;
|
|
193 # $log = File::Log->new({
|
|
194 # debug => 4,
|
|
195 # logFileName => 'myLogFile.log',
|
|
196 # logFileMode => '>',
|
|
197 # dateTimeStamp => 1,
|
|
198 # stderrRedirect => 1,
|
|
199 # defaultFile => 0,
|
|
200 # logFileDateTime => 1,
|
|
201 # appName => 'getSequenceInfo',
|
|
202 # PIDstamp => 0,
|
|
203 # storeExpText => 1,
|
|
204 # msgprepend => '',
|
|
205 # say => 1,
|
|
206 # });
|
|
207 }
|
|
208 }
|
|
209
|
|
210 #define folder separator and OS
|
|
211 if ($^O =~ /MSWin32/) {
|
|
212 $fldSep = "\\";
|
|
213 $actualOS = "MSWin32";
|
|
214 }
|
|
215
|
|
216 #LOG file
|
|
217 if ($log) { open (LOG, "log.txt") or die " error open log.txt $!:"; }
|
|
218
|
|
219
|
|
220 print "Working ...\n";
|
|
221
|
|
222 if ($kingdom eq "viruses") { $kingdom = "viral"; }
|
|
223
|
|
224 if (grep(/^$kingdom$/i, @availableKingdoms)) {
|
|
225
|
|
226 my @patternParametersList;
|
|
227
|
|
228 my @levelList = split /,/, $assemblyLevel;
|
|
229
|
|
230 if ($species ne "") {
|
|
231 my @speciesList = split(/,/, $species);
|
|
232
|
|
233 foreach my $actualSpecies (@speciesList) {
|
|
234 get_assembly_summary_species( $quantity, $releaseDate, $directory,$kingdom, $actualSpecies,
|
|
235 \@levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log,
|
|
236 $getSummaries, $components, $ftpServor);
|
|
237 }
|
|
238 }
|
|
239 elsif ($assemblyTaxid ne "") {
|
|
240 my @taxidList = split(/,/, $assemblyTaxid);
|
|
241
|
|
242 foreach my $actualID (@taxidList) {
|
|
243 get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species,
|
|
244 \@levelList, $fldSep, $actualOS, $mainFolder, $actualID, $log,
|
|
245 $getSummaries, $components, $ftpServor);
|
|
246 }
|
|
247 }
|
|
248 else {
|
|
249 get_assembly_summary_species($quantity, $releaseDate, $directory,$kingdom,$species,
|
|
250 \@levelList, $fldSep, $actualOS, $mainFolder, $assemblyTaxid, $log,
|
|
251 $getSummaries, $components, $ftpServor);
|
|
252 }
|
|
253 }
|
|
254
|
|
255
|
|
256 if ($sequenceID) {
|
|
257 my @sequenceIDList = split /,/, $sequenceID;
|
|
258
|
|
259 foreach my $enaID (@sequenceIDList) {
|
|
260 sequence_ena($enaID, $log);
|
|
261 }
|
|
262 }
|
|
263
|
|
264 if ($sraID) {
|
|
265 my @sraIDList = split /,/, $sraID;
|
|
266
|
|
267 foreach my $sra (@sraIDList) {
|
|
268 download_ena_fastq($enaFtpServor, $sra, $log);
|
|
269 }
|
|
270 }
|
|
271
|
|
272 if ($assemblyPrjID) {
|
|
273 download_assembly_or_project($assemblyPrjID, $ftpServor, $fldSep, $directory, $log);
|
|
274 }
|
|
275
|
|
276 #my ($end_year,$end_month,$end_day, $end_hour,$end_min,$end_sec) = Today_and_Now();
|
|
277
|
|
278 #my ($D_y,$D_m,$D_d, $Dh,$Dm,$Ds) =
|
|
279 # Delta_YMDHMS($start_year,$start_month,$start_day, $start_hour, $start_min, $start_sec,
|
|
280 # $end_year, $end_month, $end_day, $end_hour,$end_min,$end_sec);
|
|
281
|
|
282 #print "End Date: $end_year-$end_month-$end_day, $end_hour:$end_min:$end_sec\n";
|
|
283 print "Thank you for using $0!\n";
|
|
284 #print "Execution time: $D_y years, $D_m months, $D_d days, $Dh:$Dm:$Ds (hours:minutes:seconds)\n";
|
|
285 my $end = time();
|
|
286
|
|
287 my $total = $end - $start;
|
|
288 my $min = $total / 60;
|
|
289 my $hrs = $min / 60;
|
|
290
|
|
291 print "Total time: $total seconds OR $min minutes OR $hrs hours ! \n";
|
|
292
|
|
293 ### subroutine
|
|
294 # display global help document
|
|
295 sub help_user_simple {
|
|
296 my $programme = shift @_;
|
|
297 print STDERR "Usage : perl $programme [options] \n";
|
|
298 print "Type perl $programme -version or perl $programme -v to get the current version\n";
|
|
299 print "Type perl $programme -help or perl $programme -h to get full help\n";
|
|
300 }
|
|
301 #------------------------------------------------------------------------------
|
|
302 # display full help document
|
|
303 sub help_user_advance {
|
|
304 print <<HEREDOC;
|
|
305
|
|
306 Name:
|
|
307 $0
|
|
308
|
|
309 Synopsis:
|
|
310 A Perl script allowing to get sequence information from GenBank RefSeq or ENA repositories.
|
|
311
|
|
312 Usage:
|
|
313 perl $0 [options]
|
|
314 examples:
|
|
315 perl $0 -k bacteria -s "Helicobacter pylori" -l "Complete Genome" -date 2019-06-01
|
|
316 perl $0 -k viruses -n 5 -date 2019-06-01
|
|
317 perl $0 -k "bacteria" -taxid 9,24 -n 10 -c plasmid -dir genbank -o Results
|
|
318 perl $0 -ena BN000065
|
|
319 perl $0 -fastq ERR818002
|
|
320 perl $0 -fastq ERR818002,ERR818004
|
|
321
|
|
322 Kingdoms:
|
|
323 archaea
|
|
324 bacteria
|
|
325 fungi
|
|
326 invertebrate
|
|
327 plant
|
|
328 protozoa
|
|
329 vertebrate_mammalian
|
|
330 vertebrate_other
|
|
331 viral
|
|
332
|
|
333 Assembly levels:
|
|
334 "Complete Genome"
|
|
335 Chromosome
|
|
336 Scaffold
|
|
337 Contig
|
|
338
|
|
339 General:
|
|
340 -help or -h displays this help
|
|
341 -version or -v displays the current version of the program
|
|
342
|
|
343 Options ([XXX] represents the expected value):
|
|
344 -directory or -dir [XXX] allows to indicate the NCBI's nucleotide sequences repository (default: $directory)
|
|
345 -get or -getSummaries [XXX] allows to obtain a new assembly summary file in function of given kingdoms (bacteria,fungi,protozoa...)
|
|
346 -k or -kingdom [XXX] allows to indicate kingdom of the organism (see the examples above)
|
|
347 -s or -species [XXX] allows to indicate the species (must be combined with -k option)
|
|
348 -taxid [XXX] allows to indicate a specific taxid (must be combined with -k option)
|
|
349 -assembly_or_project [XXX] allows to indicate a specific assembly accession or bioproject (must be combined with -k option)
|
|
350 -date [XXX] indicates the release date (with format yyyy-mm-dd) from which sequence information are available
|
|
351 -l or -level [XXX] allows to select a specific assembly level (e.g. "Complete Genome")
|
|
352 -o or -output [XXX] allows users to name the output result folder
|
|
353 -n or -number [XXX] allows to limit the total number of assemblies to be downloaded
|
|
354 -c or -components [XXX] allows to select specific components of the assembly (e.g. plasmid, chromosome, ...)
|
|
355 -ena [XXX] allows to download report and fasta file given a ENA sequence ID
|
|
356 -fastq [XXX] allows to download FASTQ sequences from ENA given a run accession (https://ena-docs.readthedocs.io/en/latest/faq/archive-generated-files.html)
|
|
357 -log allows to create a log file
|
|
358 HEREDOC
|
|
359 }
|
|
360 #------------------------------------------------------------------------------
|
|
361 # display program version
|
|
362 sub program_version {
|
|
363 my $programme = shift @_;
|
|
364 print "\n $programme, version : $version\n";
|
|
365 print "\n A perl script to get sequences informations\n";
|
|
366 }
|
|
367 #------------------------------------------------------------------------------
|
|
368 sub get_assembly_summary_species {
|
|
369 my ($quantity, $releaseDate, $directory, $kingdom, $species, $levelList, $fldSep,
|
|
370 $actualOS, $mainFolder, $assemblyTaxid, $log, $getSummaries, $components, $ftpServor) = @_;
|
|
371
|
|
372 # assembly_summary.txt file from NCBI FTP site
|
|
373 #my $assemblySummary = get_summaries($ftpServor, $kingdom, $getSummaries, $directory);
|
|
374
|
|
375 my $assemblySummary = "";
|
|
376 if($path){
|
|
377 $assemblySummary = $path;
|
|
378 }
|
|
379 else{
|
|
380 $assemblySummary = download_summaries($directory, $kingdom, $ftpServor, $fldSep, $getSummaries);
|
|
381 }
|
|
382
|
|
383 #lineage folder
|
|
384 # my $lineage_file = "/pub/taxonomy/new_taxdump/new_taxdump.tar.gz";
|
|
385
|
|
386 # allow to check old summary download
|
|
387 my $oldKingdom = "";
|
|
388
|
|
389 # start of output file
|
|
390 if ($log) {
|
|
391 print LOG "...Downloading assembly_summary.txt...\n";
|
|
392 }
|
|
393
|
|
394 # if ($actualOS =~ /Unix/i) {
|
|
395 #initialiaze tar manipulation
|
|
396 # my $tar = Archive::Tar->new;
|
|
397
|
|
398 #download taxdump folder
|
|
399 # download_file($ftpServor, $lineage_file);
|
|
400 # $tar->read("new_taxdump.tar.gz");
|
|
401 # $tar->extract_file("rankedlineage.dmp");
|
|
402 # }
|
|
403
|
|
404
|
|
405 #my $kingdomRep = $kingdom."_".$start_year."_".$start_month."_".$start_day;
|
|
406 #my $kingdomRep = $kingdom."_folder";
|
|
407 my $kingdomRep = "folder";
|
|
408
|
|
409 if ($mainFolder) { $kingdomRep = $mainFolder; }
|
|
410 mkdir $kingdomRep unless -d $kingdomRep;
|
|
411
|
|
412 # Repository for request
|
|
413
|
|
414 #my $repositoryAssembly = "assembly_repository_".$assemblyTaxid;
|
|
415 my $repositoryAssembly = "result";
|
|
416 mkdir $repositoryAssembly unless -d $repositoryAssembly;
|
|
417
|
|
418 my $oldAssemblyRep = "." . $fldSep . $kingdomRep . $fldSep . $repositoryAssembly;
|
|
419 if (-d $oldAssemblyRep) { rmtree($oldAssemblyRep) }
|
|
420
|
|
421 # Repository for fna file
|
|
422 my $repositoryFNA = "Assembly";
|
|
423 mkdir $repositoryFNA unless -e $repositoryFNA;
|
|
424
|
|
425 # Repository for genbank file
|
|
426 my $repositoryGenbank = "GenBank";
|
|
427 mkdir $repositoryGenbank unless -e $repositoryGenbank;
|
|
428
|
|
429 # Reposotiry for report file
|
|
430 my $repositoryReport = "Report";
|
|
431 mkdir $repositoryReport unless -e $repositoryReport ;
|
|
432
|
|
433 # Repositories for required components
|
|
434 my %componentsRepHash;
|
|
435
|
|
436 if ($components) {
|
|
437 for my $component (split /,/, $components) {
|
|
438 my $specificRep = $component."_folder";
|
|
439 #my $specificRep = $component."_".$species."_".$kingdom."_".$start_year."_".$start_month."_".$start_day;
|
|
440 mkdir $specificRep unless -d $specificRep;
|
|
441 $componentsRepHash{$component} = $specificRep;
|
|
442 }
|
|
443 }
|
|
444
|
|
445 if ($log) {
|
|
446 print LOG "...Create kingdom and components repository...\n";
|
|
447 }
|
|
448
|
|
449 my %assemblyReportList;
|
|
450 my @assemblyRepresentationList = qw/Full Partial/;
|
|
451 my @levelList = @{$levelList};
|
|
452 my $checkCompleteGenome = grep(/complete genome/i, @levelList);
|
|
453
|
|
454 if ($checkCompleteGenome > 0) {@assemblyRepresentationList = qw/Full/;}
|
|
455
|
|
456 if (-e $assemblySummary) {
|
|
457 # Read file
|
|
458 open (SUM, $assemblySummary) or die " error open assembly_summary.txt $!:";
|
|
459 while(<SUM>) {
|
|
460 chomp;
|
|
461 if ($_ !~ m/^#/) {
|
|
462 my @infoList = split /\t/, $_;
|
|
463 my $foundAssemRep = grep (/$infoList[13]/i, @assemblyRepresentationList);
|
|
464 my $checkLevel = grep(/$infoList[11]/i, @levelList); #replace 11 by 10
|
|
465
|
|
466 if ($foundAssemRep > 0 && $checkLevel > 0) {
|
|
467 my $indexInfo = 0;
|
|
468 my $searchPattern = "";
|
|
469 my $regex = "";
|
|
470
|
|
471 if ($species !~ /^$/) {
|
|
472 $indexInfo = 7;
|
|
473 $searchPattern = $species;
|
|
474 $regex = qr/$searchPattern/i;
|
|
475 }
|
|
476 elsif ($assemblyTaxid !~ /^$/) {
|
|
477 $indexInfo = 5;
|
|
478 $searchPattern = $assemblyTaxid;
|
|
479 $regex = qr/^$searchPattern$/i;
|
|
480 }
|
|
481
|
|
482 if (($infoList[$indexInfo] =~ $regex) or ($kingdom !~ /^$/ && $searchPattern =~ /^$/)) {
|
|
483 my @gcfInfo = split(/\//, $infoList[19]);
|
|
484 my $gcfName = pop(@gcfInfo);
|
|
485 my $realDate = $infoList[14];
|
|
486 $realDate =~ s/\//-/g;
|
|
487
|
|
488 my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz";
|
|
489 my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz";
|
|
490 my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt";
|
|
491
|
|
492 if ($realDate gt $releaseDate) {
|
|
493
|
|
494 $dnaFile = obtain_file($ftpServor, $dnaFile);
|
|
495 $genbankFile = obtain_file($ftpServor, $genbankFile);
|
|
496 $assemblyReport = obtain_file($ftpServor, $assemblyReport);
|
|
497
|
|
498 download_file($ftpServor, $dnaFile);
|
|
499 download_file($ftpServor, $genbankFile);
|
|
500 download_file($ftpServor, $assemblyReport);
|
|
501
|
|
502 if ($log) {
|
|
503 print LOG "...download FASTA and GenBank report files...\n";
|
|
504 }
|
|
505
|
|
506 # download sequences and check number of "N" characters
|
|
507 my $fileFasta = $gcfName."_genomic.fna.gz";
|
|
508 my $ucpFasta = $gcfName."_genomic.fna";
|
|
509 if (-e $fileFasta) {
|
|
510 gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n";
|
|
511 move($ucpFasta, $repositoryFNA) or die "move failed: $!";
|
|
512 }
|
|
513
|
|
514 if ($log) {
|
|
515 print LOG "...Unzip FASTA file named $fileFasta ...\n";
|
|
516 }
|
|
517
|
|
518 # download genome report
|
|
519 my $fileReport = $gcfName."_assembly_report.txt";
|
|
520 if (-e $fileReport) {
|
|
521 my $fileInformations = $gcfName."_informations.xls";
|
|
522 move($fileReport, $repositoryReport) or die "move failed: $!";
|
|
523 }
|
|
524
|
|
525 # download genbank files
|
|
526 my $fileGenbank = $gcfName."_genomic.gbff.gz";
|
|
527 my $ucpGenbank = $gcfName."_genomic.gbff";
|
|
528 if (-e $fileGenbank) {
|
|
529 gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n";
|
|
530 move($ucpGenbank, $repositoryGenbank) or die "move failed: $!";
|
|
531 }
|
|
532
|
|
533 if ($log) {
|
|
534 print LOG "...Unzip GenBank file $fileGenbank ...\n";
|
|
535 }
|
|
536
|
|
537 # association report and fasta
|
|
538 my $fileFastaGenbank = $ucpFasta . "," . $ucpGenbank;
|
|
539 $assemblyReportList{$fileReport} = $fileFastaGenbank;
|
|
540
|
|
541 if ($quantity) { $quantity -= 1; }
|
|
542
|
|
543 }
|
|
544 }
|
|
545 }
|
|
546 }
|
|
547 if (defined $quantity && $quantity == 0) {
|
|
548 $quantity = undef;
|
|
549 last;
|
|
550 }
|
|
551 }
|
|
552 close(SUM) or die "close file error : $!";
|
|
553
|
|
554 if (!keys %assemblyReportList) {
|
|
555 print "##################################################################\n";
|
|
556 print "No results were found for the following query:\n";
|
|
557 print "perl $0 @ARGV\n";
|
|
558 print "##################################################################\n\n";
|
|
559
|
|
560 if ($actualOS =~ /unix/i) { unlink glob "*.dmp *.gz" or die "for file *.dmp *.gz $!:"; }
|
|
561
|
|
562 if (empty_folder($kingdomRep)) { rmdir $kingdomRep or die "fail remove directory $!:"; }
|
|
563 rmdir $repositoryAssembly or die "failed to remove directory $!:";
|
|
564 rmdir $repositoryFNA or die "failed to remove directory $!:";
|
|
565 rmdir $repositoryGenbank or die "failed to remove directory $!:";
|
|
566 rmdir $repositoryReport or die "failed to remove directory $!:";
|
|
567
|
|
568 if ($components) {
|
|
569 for my $componentRep (values %componentsRepHash) {
|
|
570 rmdir $componentRep or die "failed to remove directory $!:";
|
|
571 }
|
|
572 }
|
|
573 if ($log) {
|
|
574 print LOG "...No results were found for the following query: ";
|
|
575 print LOG "perl $0 @ARGV \n";
|
|
576 }
|
|
577
|
|
578 }
|
|
579 else {
|
|
580
|
|
581 if ($log) {
|
|
582 print LOG "...Results were found for the following query: ";
|
|
583 print LOG "perl $0 @ARGV \n";
|
|
584 }
|
|
585 # write summary files
|
|
586 my %componentsSumHash;
|
|
587 my @keysList = keys %assemblyReportList;
|
|
588 my $summary = "summary.xls";
|
|
589 my $htmlSummary = "summary.html";
|
|
590
|
|
591 if ($components) {
|
|
592 for my $component (split /,/, $components) {
|
|
593 my $specificSummary = $component. "_summary.xls";
|
|
594 $componentsSumHash{$component} = $specificSummary;
|
|
595 }
|
|
596 }
|
|
597
|
|
598 my $fileReport = "." . $fldSep. $repositoryReport . $fldSep . $keysList[0];
|
|
599 my @header = ();
|
|
600
|
|
601 open(FILE, $fileReport) or die "error open file : $!";
|
|
602 while(<FILE>) {
|
|
603 chomp;
|
|
604 if($_ =~ /:/){
|
|
605 $_ =~ s/^#*//;
|
|
606 my @ligne = split /:/, $_;
|
|
607 push(@header, $ligne[0]);
|
|
608 }
|
|
609 }
|
|
610 close(FILE) or die "error close file : $!";
|
|
611
|
|
612 open(HEAD, ">", $summary) or die " error open file : $!";
|
|
613 foreach(@header) {
|
|
614 print HEAD $_ . "\t";
|
|
615 }
|
|
616
|
|
617 print HEAD "Pubmed\tNucleScore\tClassification\t";
|
|
618 print HEAD "Country\tHost\tIsolation source\tA percent\t";
|
|
619 print HEAD "T percent\tG percent\tC percent\tN percent\tGC percent\t";
|
|
620 print HEAD "ATGC ratio\tLength\tShape\n";
|
|
621 close(HEAD) or die "error close file : $!";
|
|
622
|
|
623 if ($components) {
|
|
624 foreach my $componentSummary (values %componentsSumHash) {
|
|
625 open(SUM, ">>", $componentSummary) or die "error open file : $!";
|
|
626 print SUM "Id\tAssembly\tDescription\tLength\tStatus\tLevel\t";
|
|
627 print SUM "GC percent\tA percent\tT percent\tG percent\tC percent\n";
|
|
628 close(SUM) or die "error close file : $!";
|
|
629 }
|
|
630 }
|
|
631
|
|
632 for my $file (@keysList) {
|
|
633 my $reportFile = $repositoryReport . $fldSep . $file;
|
|
634 my @fastaGenbank = split /,/, $assemblyReportList{$file};
|
|
635 my $extFasta = $fastaGenbank[0];
|
|
636 my $extGenbank = $fastaGenbank[1];
|
|
637 my $fnaFile = $repositoryFNA . $fldSep . $extFasta;
|
|
638 my $genbankFile = $repositoryGenbank . $fldSep . $extGenbank;
|
|
639
|
|
640 write_assembly($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly,
|
|
641 \%componentsSumHash, $kingdom, $actualOS, \@header, $log);
|
|
642
|
|
643 if ($log) {
|
|
644 print LOG "...Call write_assembly subroutine...\n";
|
|
645 }
|
|
646 }
|
|
647
|
|
648 write_html_summary($summary);
|
|
649
|
|
650 if ($log) {
|
|
651 print LOG "...Call write_html_summary subroutine...\n";
|
|
652 }
|
|
653
|
|
654 if ($components) {
|
|
655 my @componentList = keys %componentsSumHash;
|
|
656 my %componentFastaHash = create_component_sequence_file($fldSep, $repositoryFNA, \@componentList);
|
|
657
|
|
658 if (keys %componentFastaHash && $log) { $log->msg(1,"call create_component_sequence_file subroutine");}
|
|
659
|
|
660 foreach my $component (keys %componentFastaHash) {
|
|
661 move($componentFastaHash{$component}, $componentsRepHash{$component}) or die "move failed: $!";
|
|
662 }
|
|
663 }
|
|
664
|
|
665 move($summary, $repositoryAssembly) or die "move failed: $!";
|
|
666 move($htmlSummary, $repositoryAssembly) or die "move failed: $!";
|
|
667 move($repositoryFNA, $repositoryAssembly . $fldSep . $repositoryFNA) or die "move failed: $!";
|
|
668 move($repositoryGenbank, $repositoryAssembly . $fldSep . $repositoryGenbank) or die "move failed: $!";
|
|
669 move($repositoryReport , $repositoryAssembly . $fldSep . $repositoryReport) or die "move failed: $!";
|
|
670 if ($log) {
|
|
671 print LOG "...move fasta, genbank and report to query folder \n";
|
|
672 }
|
|
673
|
|
674 if ($components) {
|
|
675 for my $component (keys %componentsSumHash) {
|
|
676 move($componentsSumHash{$component}, $componentsRepHash{$component}) or die "move failed: $!";
|
|
677 move($componentsRepHash{$component}, $repositoryAssembly . $fldSep . $componentsRepHash{$component}) or die "move failed: $!"
|
|
678 }
|
|
679 if ($log) {
|
|
680 print LOG "...move component files to folders\n";
|
|
681 }
|
|
682 }
|
|
683
|
|
684 move($repositoryAssembly, $kingdomRep . $fldSep . $repositoryAssembly) or die "move failed: $!";
|
|
685
|
|
686 if ($log) {
|
|
687 print LOG "...move query folder to main folder\n";
|
|
688 }
|
|
689
|
|
690 # if ($actualOS =~ /unix/i) { unlink glob "*.dmp" or die "for file *.dmp $!:"; }
|
|
691 unlink glob "*.gz sequence.txt" or die "$!: for file *.gz sequence.txt";
|
|
692 }
|
|
693 }
|
|
694 }
|
|
695 #write general assembly file
|
|
696 sub write_assembly {
|
|
697 my ($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly,
|
|
698 $componentsSumHashRef, $kingdom, $actualOS, $headerRef, $log) = @_;
|
|
699
|
|
700 my %componentsSumHash = %{$componentsSumHashRef};
|
|
701 my @header = @{$headerRef};
|
|
702 my %hashInformations = ();
|
|
703 my $seq = "";
|
|
704 my $genomeName = "";
|
|
705 my $country = "na";
|
|
706 my $GCpercent = -1;
|
|
707 my $taxId = "na";
|
|
708 my $assemblyLine;
|
|
709 my $pubmedId = "na";
|
|
710 my $host = "na";
|
|
711 my $isoSource = "na";
|
|
712 # my $species = "na";
|
|
713 # my $genus = "na";
|
|
714 # my $family = "na";
|
|
715 # my $order = "na";
|
|
716 # my $class = "na";
|
|
717 # my $phylum = "na";
|
|
718 my $shape = "na";
|
|
719 my $geneNumber = "na";
|
|
720
|
|
721 open(REP, "<", $reportFile) or die "error open file $!:";
|
|
722 while (<REP>) {
|
|
723 chomp;
|
|
724 $_ =~ s/^#*//;
|
|
725 if ($_ =~ /assembled-molecule/i) { $assemblyLine = $_; }
|
|
726 if ($_ =~ /:/) {
|
|
727 my @line = split /:/, $_;
|
|
728 if ($line[1]) { $hashInformations{$line[0]} = trim($line[1]); }
|
|
729 }
|
|
730 }
|
|
731 close(REP) or die "error close file $!:";
|
|
732
|
|
733
|
|
734 open(SUM, ">>", $summary) or die "error open file $!:";
|
|
735 foreach my $k(@header) {
|
|
736 if (grep $_ eq $k, keys %hashInformations) {
|
|
737
|
|
738 my $information = $hashInformations{$k};
|
|
739
|
|
740 if ($k =~ /Assembly name/i) { $genomeName = $information; }
|
|
741
|
|
742 if ($information =~ /^\s*$/) {
|
|
743 print SUM "na\t";
|
|
744 } else {
|
|
745 print SUM $information . "\t";
|
|
746 }
|
|
747 } else {
|
|
748 print SUM "na\t";
|
|
749 }
|
|
750 }
|
|
751 close(SUM) or die "error close file $!:";
|
|
752
|
|
753 open(FNA, "<", $fnaFile) or die "Could not open $!:";
|
|
754 while (<FNA>) {
|
|
755 chomp;
|
|
756 if ($_ !~ /^>/) { $seq .= $_; }
|
|
757 }
|
|
758 close(FNA) or die "error close file :$!";
|
|
759
|
|
760 foreach my $summaryKey (keys %hashInformations) {
|
|
761 if ($summaryKey =~ /taxid/i) {
|
|
762 $taxId = $hashInformations{$summaryKey};
|
|
763 }
|
|
764 }
|
|
765
|
|
766 my $classification = get_taxonomic_rank_genbank($genbankFile);
|
|
767
|
|
768 if ($log) {
|
|
769 print LOG "...get taxonomic rank from genbank file\n";
|
|
770 }
|
|
771
|
|
772 $GCpercent = gc_percent($seq);
|
|
773 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($fnaFile);
|
|
774 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);
|
|
775 my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt);
|
|
776
|
|
777 my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent);
|
|
778
|
|
779 my $variance = shift_data_variance(@percentList);
|
|
780 my $nucleScore = nucle_score($variance, $GCpercent, $atgcRatio, $length);
|
|
781
|
|
782 if ($log) {
|
|
783 print LOG "compute GC percent nucleotid percent ATGC ratio NucleScore\n";
|
|
784 }
|
|
785
|
|
786 open(GBFF, "<", $genbankFile) or die "Error open file $!:";
|
|
787 while(<GBFF>) {
|
|
788 chomp;
|
|
789 if ($_ =~ /\/country="(.*)"/) { $country = trim($1); }
|
|
790 if ($_ =~ /PUBMED(.*)/) { $pubmedId = trim($1); }
|
|
791 if ($_ =~ /\/host="(.*)"/) { $host = trim($1); }
|
|
792 if ($_ =~ /\/isolation_source="(.*)"/) { $isoSource = trim($1); }
|
|
793 if ($_ =~ /\(Genes \(total\)\s+::(.*)/) { $geneNumber = trim($1); }
|
|
794 if ($_ =~ /LOCUS.*\s+([a-z]{1,})\s+[a-z]{1,}\s+[0-9]{2,}-[a-z]{1,}-[0-9]{4,}$/i) { $shape = trim($1); }
|
|
795 }
|
|
796 close(GBFF) or die "error close file $!:";
|
|
797
|
|
798
|
|
799 open(SUM, ">>", $summary) or die "error open file $!:";
|
|
800 print SUM $pubmedId . "\t" . $nucleScore . "\t" . $classification ."\t" ;
|
|
801 print SUM $country . "\t" . $host . "\t" . $isoSource . "\t" . $aPercent . "\t" ;
|
|
802 print SUM $tPercent . "\t" . $gPercent . "\t" . $cPercent ."\t" . $nPercent . "\t";
|
|
803 print SUM $GCpercent ."\t". $atgcRatio ."\t". $length . "\t". $shape."\n";
|
|
804 close(SUM) or die "error close file: $!";
|
|
805
|
|
806 if (%componentsSumHash) {
|
|
807 write_assembly_component($fnaFile, $genomeName, \%componentsSumHash, $log);
|
|
808 }
|
|
809 }
|
|
810 #------------------------------------------------------------------------------
|
|
811 # get assembly component
|
|
812 sub write_assembly_component {
|
|
813 my($fnaFile, $genomeName, $componentsSumHashRef, $log) = @_;
|
|
814
|
|
815 my %componentsSumHash = %{$componentsSumHashRef};
|
|
816 my $status = "na";
|
|
817 my $level = "na";
|
|
818 my $gcpercent;
|
|
819 my $tmp_fasta_file = "sequence.txt";
|
|
820 my @desc = ();
|
|
821
|
|
822 # check each sequence from (multi-)fasta file
|
|
823 my ($seq, $inputfile);
|
|
824
|
|
825 if ($log) {
|
|
826 print LOG "...search specific components\n";
|
|
827 }
|
|
828
|
|
829 # extract sequence details
|
|
830 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$fnaFile);
|
|
831
|
|
832 while ($seq = $seqIO->next_seq()) {
|
|
833 my $seqID = $seq->id; # ID of sequence
|
|
834 my $seqDesc = $seq->desc; # Description of sequence
|
|
835 my $globalSeq = $seq->seq;
|
|
836 my $seqLength = $seq->length();
|
|
837
|
|
838 open(TSEQ, ">", $tmp_fasta_file) or die "Error open file: $!";
|
|
839 print TSEQ $globalSeq;
|
|
840 close(TSEQ);
|
|
841
|
|
842 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($tmp_fasta_file);
|
|
843
|
|
844 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);
|
|
845
|
|
846 $gcpercent = gc_percent($globalSeq);
|
|
847
|
|
848 @desc = split /,/, $seqDesc;
|
|
849
|
|
850 if ($desc[1]) { $level = $desc[1]; }
|
|
851
|
|
852 foreach my $component (keys %componentsSumHash) {
|
|
853 if ($desc[0] =~ /$component/) {
|
|
854 $status = $component;
|
|
855 my $info = $seqID . "\t" . $genomeName ."\t" . $seqDesc . "\t" . $seqLength . "\t" . $status . "\t" . $level ."\t";
|
|
856 $info.= $gcpercent."\t". $aPercent ."\t". $tPercent ."\t". $gPercent ."\t". $cPercent . "\n";
|
|
857 add_to_file($componentsSumHash{$component}, $info);
|
|
858 if ($log) {
|
|
859 print LOG "...found component $component \n";
|
|
860 }
|
|
861 }
|
|
862 }
|
|
863 }
|
|
864 }
|
|
865 #------------------------------------------------------------------------------
|
|
866 # download fasta sequence and report on ENA with assembly ID
|
|
867 sub get_fasta_and_report_sequence_ena_assembly {
|
|
868 my($sequenceID) = @_;
|
|
869 my $tmp_file = "fichier.txt";
|
|
870 my @id_list = ();
|
|
871 my $id_chain = "";
|
|
872 my $fasta_file = "";
|
|
873 my $report_file = "";
|
|
874 my $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=xml";
|
|
875 my $output = get($url);
|
|
876
|
|
877 open(TMP, ">", $tmp_file) or die("could not open $!");
|
|
878 print TMP $output;
|
|
879 close(TMP) or die("could not close $!");
|
|
880
|
|
881 open(TMP, "<", $tmp_file) or die("could not open $!");
|
|
882 while(<TMP>){
|
|
883 if($_ =~ /<CHROMOSOME accession="(.*)">/){
|
|
884 push(@id_list, $1)
|
|
885 }
|
|
886 }
|
|
887 close(TMP) or die("could not close $!");
|
|
888
|
|
889 $id_chain = join(",", @id_list);
|
|
890 $url = "https://www.ebi.ac.uk/ena/data/view/$id_chain&display=fasta";
|
|
891 $output = get($url);
|
|
892 $fasta_file = $sequenceID . ".fasta";
|
|
893 open(FILE, ">", $fasta_file) or die("could not open $!");
|
|
894 print FILE $output;
|
|
895 close(FILE) or die("could not close $!");
|
|
896
|
|
897
|
|
898 $report_file = $sequenceID . "_report.txt";
|
|
899 for my $id (@id_list) {
|
|
900 $url = "https://www.ebi.ac.uk/ena/data/view/$id&display=text&header=true";
|
|
901 $output = get($url);
|
|
902 open(FILE, ">>", $report_file) or die("could not open $!");
|
|
903 print FILE $output;
|
|
904 print FILE "##########################################################################\n\n";
|
|
905 close(FILE) or die("could not close $!");
|
|
906 }
|
|
907
|
|
908 unlink "fichier.txt" or die "error delete file :$!";
|
|
909
|
|
910 return ($fasta_file, $report_file);
|
|
911 }
|
|
912 #------------------------------------------------------------------------------
|
|
913 # download ENA
|
|
914 sub sequence_ena {
|
|
915 my($sequenceID, $log) = @_;
|
|
916 #my $assemblyRep = $sequenceID . "_folder";
|
|
917 my $fastaFile;
|
|
918 my $reportFile;
|
|
919
|
|
920 #if(-d $assemblyRep) { rmtree($assemblyRep); }
|
|
921 #mkdir $assemblyRep;
|
|
922
|
|
923 if ($log) {
|
|
924 print LOG "...ENA sequence downloading ...\n";
|
|
925 }
|
|
926 #if ($log) {$log->msg(1, "Creation of repository: $assemblyRep\n");}
|
|
927
|
|
928 if($sequenceID =~ /^GCA_.*/){
|
|
929 ($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_assembly($sequenceID);
|
|
930 }
|
|
931 else {
|
|
932 ($fastaFile, $reportFile) = get_fasta_and_report_sequence_ena_other($sequenceID);
|
|
933 }
|
|
934
|
|
935 if ($log) {
|
|
936 print LOG "...Downloading of ENA FASTA and report files for sequence: $sequenceID\n";
|
|
937 }
|
|
938
|
|
939 #move($fastaFile, $assemblyRep) or die "move failed: $!";
|
|
940 #move($reportFile, $assemblyRep) or die "move failed: $!";
|
|
941
|
|
942 if ($log) {
|
|
943 print LOG "...Move fasta and report files to folder\n";
|
|
944 }
|
|
945 }
|
|
946 #------------------------------------------------------------------------------
|
|
947 # download fasta sequence and report on ENA with ENA ID
|
|
948 sub get_fasta_and_report_sequence_ena_other {
|
|
949 my($sequenceID) = @_;
|
|
950 my $fasta_file = "";
|
|
951 my $report_file = "";
|
|
952 my $url;
|
|
953 my $output;
|
|
954
|
|
955 $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=fasta";
|
|
956 #if($actualOS eq "MSWin32"){
|
|
957 $output = get($url);
|
|
958 $fasta_file = $sequenceID . ".fasta";
|
|
959 open(FILE, ">", $fasta_file) or die("could not open $!");
|
|
960 print FILE $output;
|
|
961 close(FILE) or die "could not close $!";
|
|
962 #}
|
|
963 #else{
|
|
964 # system("wget $url");
|
|
965 #}
|
|
966
|
|
967 $output = "";
|
|
968
|
|
969 $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=text&header=true";
|
|
970 #if($actualOS eq "MSWin32"){
|
|
971 $output = get($url); # replace by wget????
|
|
972 $report_file = $sequenceID . "_report.txt";
|
|
973 open(FILE, ">>", $report_file) or die "could not open: $!";
|
|
974 print FILE $output;
|
|
975 close(FILE) or die "could not close $!";
|
|
976 #}
|
|
977 #else{
|
|
978 # system("wget $url");
|
|
979 #}
|
|
980
|
|
981 return ($fasta_file, $report_file);
|
|
982 }
|
|
983 #------------------------------------------------------------------------------
|
|
984 # add information to file
|
|
985 sub add_to_file {
|
|
986 my ($file, $info) = @_;
|
|
987 open(FILE, ">>", $file) or die ("Could not open $!");
|
|
988 print FILE $info;
|
|
989 close(FILE);
|
|
990 }
|
|
991 #------------------------------------------------------------------------------
|
|
992 # return taxonomic rank of species by tax id
|
|
993 sub get_taxonomic_rank {
|
|
994 my($tax_id, $taxonomic_file) = @_;
|
|
995 my $species = "na";
|
|
996 my $genus = "na";
|
|
997 my $family = "na";
|
|
998 my $order = "na";
|
|
999 my $class = "na";
|
|
1000 my $phylum = "na";
|
|
1001
|
|
1002 # my ($species,$genus,$family,$order,$class,$phylum);
|
|
1003 my @tmp_array = ($species, $genus, $family, $order, $class, $phylum);
|
|
1004
|
|
1005 open(TFILE, "<", $taxonomic_file) or
|
|
1006 die("Could not open $taxonomic_file: $!");
|
|
1007
|
|
1008 while(<TFILE>) {
|
|
1009 chomp;
|
|
1010 my @tax_info = split(/\|/, $_);
|
|
1011
|
|
1012 if ($tax_info[0] == $tax_id) {
|
|
1013 @tax_info = trim_array(@tax_info);
|
|
1014
|
|
1015 $tmp_array[0] = $tax_info[1];
|
|
1016 splice(@tax_info, 0, 3);
|
|
1017
|
|
1018 for(my $i = 1; $i < $#tmp_array + 1; $i++) {
|
|
1019 if (length($tax_info[$i-1]) > 0) { $tmp_array[$i] = $tax_info[$i-1]; }
|
|
1020 }
|
|
1021 close(TFILE) or die "error close $taxonomic_file $!:";
|
|
1022 return @tmp_array;
|
|
1023 }
|
|
1024 }
|
|
1025 close(TFILE) or die "error close $taxonomic_file $!:";
|
|
1026 }
|
|
1027 #------------------------------------------------------------------------------
|
|
1028 # write html summary file
|
|
1029 sub write_html_summary {
|
|
1030 my($summary) = @_;
|
|
1031 my $htmlFile = "summary.html";
|
|
1032 my $header = "";
|
|
1033 my @fileToRead = ();
|
|
1034
|
|
1035 open(HTML, ">", $htmlFile) or die "error open HTML summary $!";
|
|
1036 print HTML "<!DOCTYPE html>\n";
|
|
1037 print HTML "<html>\n";
|
|
1038 print HTML " <head>\n";
|
|
1039 print HTML " <title>Assembly summary</title>\n";
|
|
1040 print HTML " </head>\n";
|
|
1041 print HTML " <body>\n";
|
|
1042 print HTML " <h2>Assembly Summary</h2>\n";
|
|
1043 close(HTML) or die "error close HTML summary $!";
|
|
1044
|
|
1045 open(SUM, "<", $summary) or die "error open tsv summary $!";
|
|
1046 @fileToRead = <SUM>;
|
|
1047 close(SUM) or die "error close tsv summary $!";
|
|
1048
|
|
1049 $header = splice(@fileToRead, 0, 1);
|
|
1050
|
|
1051 for my $line (@fileToRead) {
|
|
1052 write_html_table($line, $htmlFile, $header);
|
|
1053 }
|
|
1054
|
|
1055 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!";
|
|
1056 print HTML " </body>\n";
|
|
1057 print HTML "</html>\n";
|
|
1058 close(HTML) or die "error close HTML summary $!";
|
|
1059 }
|
|
1060 #------------------------------------------------------------------------------
|
|
1061 # write html table for summary
|
|
1062 sub write_html_table {
|
|
1063 my ($line, $htmlFile, $header) = @_;
|
|
1064
|
|
1065 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!";
|
|
1066 print HTML " <table border=\"1\" style=\"margin-bottom: 20px;\">\n";
|
|
1067 close(HTML) or die "error close HTML summary $!";
|
|
1068 add_table_content($line, $htmlFile, $header);
|
|
1069 }
|
|
1070 #------------------------------------------------------------------------------
|
|
1071 # add information to table
|
|
1072 sub add_table_content {
|
|
1073 my ($line, $htmlFile, $headers) = @_;
|
|
1074
|
|
1075 my @assemblyHeader = split(/\t/, $headers);
|
|
1076 my @assemblyInfo = split(/\t/, $line);
|
|
1077 my %hashHeaderInfo;
|
|
1078 my $nbOfCell = 7;
|
|
1079 my $fullLine = floor(($#assemblyHeader + 1) / $nbOfCell);
|
|
1080 my $restCell = $#assemblyHeader + 1 - $fullLine * $nbOfCell;
|
|
1081
|
|
1082
|
|
1083 for (my $i = 0; $i < $#assemblyHeader + 1; $i++) {
|
|
1084 $hashHeaderInfo{trim($assemblyHeader[$i])} = $assemblyInfo[$i];
|
|
1085 }
|
|
1086
|
|
1087 my @keysHeaderInfo = keys %hashHeaderInfo;
|
|
1088 my $cellIndex = 0;
|
|
1089
|
|
1090 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!";
|
|
1091 for (my $turn = 0; $turn < $fullLine; $turn++) {
|
|
1092
|
|
1093 print HTML " <tr>\n";
|
|
1094 for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) {
|
|
1095 print HTML " <th>$header</th>\n";
|
|
1096 }
|
|
1097 print HTML " </tr>\n";
|
|
1098
|
|
1099 print HTML " <tr>\n";
|
|
1100 for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) {
|
|
1101 if ($header =~ /PUBMED/i && $hashHeaderInfo{$header} ne "na") {
|
|
1102 print HTML " <td><a href=https://www.ncbi.nlm.nih.gov/pubmed/?term=".
|
|
1103 "$hashHeaderInfo{$header} target=\"_blank\">$hashHeaderInfo{trim($header)}</a></td>";
|
|
1104 }
|
|
1105 else {
|
|
1106 print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n";
|
|
1107 }
|
|
1108 }
|
|
1109 print HTML " </tr>\n";
|
|
1110
|
|
1111 $cellIndex += $nbOfCell;
|
|
1112 }
|
|
1113
|
|
1114 print HTML " <tr>\n";
|
|
1115 for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) {
|
|
1116 print HTML " <th>$header</th>\n";
|
|
1117 }
|
|
1118 print HTML " <tr>\n";
|
|
1119
|
|
1120 print HTML " <tr>\n";
|
|
1121 for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) {
|
|
1122 print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n";
|
|
1123 }
|
|
1124 print HTML " <tr>\n";
|
|
1125
|
|
1126 print HTML " </table>\n";
|
|
1127 close(HTML) or die "error close HTML summary $!";
|
|
1128 }
|
|
1129 #------------------------------------------------------------------------------
|
|
1130 #getTaxonomicRanks (function allowing to get taxonomic ranks from Genbank file)
|
|
1131 sub get_taxonomic_rank_genbank {
|
|
1132 my ($genbank) = @_;
|
|
1133
|
|
1134 my $seqio_object = Bio::SeqIO->new(-file => $genbank);
|
|
1135 my $seq_object = $seqio_object->next_seq;
|
|
1136
|
|
1137 # legible and long
|
|
1138 my $species_object = $seq_object->species;
|
|
1139 my $species_string = $species_object->node_name;
|
|
1140
|
|
1141 # get all taxa from the ORGANISM section in an array
|
|
1142 my @classification = $seq_object->species->classification;
|
|
1143 # my $arraySize = @classification;
|
|
1144
|
|
1145 # print "@classification\n";
|
|
1146
|
|
1147 # if($arraySize == 7){
|
|
1148 # ($species,$genus,$family,$order,$class,$phylum,$kingdomGB) = @classification;
|
|
1149 # }
|
|
1150 # elsif($arraySize == 4){
|
|
1151 # ($species,$class,$phylum,$kingdomGB) = @classification;
|
|
1152 # }
|
|
1153
|
|
1154 my $classification = join(",", @classification);
|
|
1155
|
|
1156 return ($classification);
|
|
1157 }
|
|
1158 #------------------------------------------------------------------------------
|
|
1159 #add all sequences components to file
|
|
1160 sub create_component_sequence_file {
|
|
1161 my ($fldSep, $repository, $listComponentRef) = @_;
|
|
1162
|
|
1163 my @listFnaFile;
|
|
1164 my @listComponent = @{$listComponentRef};
|
|
1165
|
|
1166 opendir(my $dh, $repository) || die "Can't opendir $repository: $!";
|
|
1167 @listFnaFile = grep{/fna$/} readdir($dh);
|
|
1168 closedir $dh;
|
|
1169
|
|
1170 my %componentFastaHash;
|
|
1171
|
|
1172 foreach my $component (@listComponent) {
|
|
1173
|
|
1174 my $componentFasta = $component.".fasta";
|
|
1175
|
|
1176 foreach my $fnaFile (@listFnaFile) {
|
|
1177
|
|
1178 # my $actualFile = $repository . $fldSep . $fnaFile;
|
|
1179
|
|
1180 my $seq;
|
|
1181 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$repository . $fldSep . $fnaFile);
|
|
1182
|
|
1183 while ($seq = $seqIO->next_seq()) {
|
|
1184
|
|
1185 my $seqDesc = $seq->desc;
|
|
1186
|
|
1187 if ($seqDesc =~ /$component/) {
|
|
1188 my $seqID = $seq->id;
|
|
1189 my $seqNuc = $seq->seq;
|
|
1190 my $shift = 60;
|
|
1191 my @seqArray = split //, $seqNuc;
|
|
1192 my $newSeqNuc = "";
|
|
1193
|
|
1194 if (length $seqNuc <= $shift) {
|
|
1195 $newSeqNuc = $seqNuc;
|
|
1196 }
|
|
1197 else {
|
|
1198 for(my $i = 0; $i < $#seqArray + 1; $i ++) {
|
|
1199 $newSeqNuc .= $seqArray[$i];
|
|
1200 if (($i + 1) % $shift == 0) { $newSeqNuc .= ","; }
|
|
1201 }
|
|
1202 }
|
|
1203
|
|
1204 open(FASTA, ">>", $componentFasta) or die "error open file $!:";
|
|
1205 print FASTA ">$seqID $seqDesc\n";
|
|
1206 foreach my $subSeqNuc (split /,/, $newSeqNuc) {
|
|
1207 print FASTA "$subSeqNuc\n";
|
|
1208 }
|
|
1209 close(FASTA) or die "error close file $!:";
|
|
1210 }
|
|
1211 }
|
|
1212 }
|
|
1213 if (-e $componentFasta) { $componentFastaHash{$component} = $componentFasta; }
|
|
1214 }
|
|
1215 return %componentFastaHash;
|
|
1216 }
|
|
1217 # remove back and front spaces
|
|
1218 sub trim {
|
|
1219 my ($string) = @_;
|
|
1220 $string =~ s/^\s+//;
|
|
1221 $string =~ s/\s+$//;
|
|
1222 return $string;
|
|
1223 }
|
|
1224 #------------------------------------------------------------------------------
|
|
1225 # use trim in array
|
|
1226 sub trim_array {
|
|
1227 my (@array) = @_;
|
|
1228 foreach my $value (@array) {
|
|
1229 $value = trim($value);
|
|
1230 }
|
|
1231 return @array;
|
|
1232 }
|
|
1233 #------------------------------------------------------------------------------
|
|
1234 # check if folder is empty
|
|
1235 sub empty_folder {
|
|
1236 my $dirname = shift;
|
|
1237 opendir(my $dholder, $dirname) or die "error not a directory";
|
|
1238 my $isEmpty = scalar(grep { $_ ne "." && $_ ne ".." } readdir($dholder));
|
|
1239 if ($isEmpty == 0) { return $isEmpty; }
|
|
1240 }
|
|
1241 #------------------------------------------------------------------------------
|
|
1242 # number nucleotid and length
|
|
1243 sub number_nuc_length_seq {
|
|
1244 my ($fastaFile) = @_;
|
|
1245 my $ade = 0;
|
|
1246 my $thy = 0;
|
|
1247 my $gua = 0;
|
|
1248 my $cyt = 0;
|
|
1249 my $n = 0;
|
|
1250 my $length = 0;
|
|
1251
|
|
1252 open (FASTA, "<", $fastaFile) or die "Could not open $!";
|
|
1253 while (<FASTA>) {
|
|
1254 chomp;
|
|
1255 if ($_ !~ />/) {
|
|
1256 my @seq = split //, $_;
|
|
1257
|
|
1258 for my $nuc (@seq) {
|
|
1259 $length +=1 ;
|
|
1260 if ($nuc =~ /a/i) {$ade+=1;}
|
|
1261 elsif ($nuc =~ /t/i) {$thy+=1;}
|
|
1262 elsif ($nuc =~ /g/i) {$gua+=1;}
|
|
1263 elsif ($nuc =~ /c/i) {$cyt+=1;}
|
|
1264 elsif ($nuc =~ /n/i) {$n+=1;}
|
|
1265 }
|
|
1266 }
|
|
1267 }
|
|
1268 close(FASTA) or die "Error close file :$!";
|
|
1269 return ($ade, $thy, $gua, $cyt, $n, $length);
|
|
1270
|
|
1271 }
|
|
1272 #------------------------------------------------------------------------------
|
|
1273 # compute percentage of nucleotid
|
|
1274 sub nucleotid_percent {
|
|
1275 my($ade, $thy, $gua, $cyt, $n, $length) = @_;
|
|
1276
|
|
1277 my $adePercent = $ade / $length * 100;
|
|
1278 my $thyPercent = $thy / $length * 100;
|
|
1279 my $guaPercent = $gua / $length * 100;
|
|
1280 my $cytPercent = $cyt / $length * 100;
|
|
1281 my $nPercent = $n / $length * 100;
|
|
1282
|
|
1283 return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent);
|
|
1284
|
|
1285 }
|
|
1286 #------------------------------------------------------------------------------
|
|
1287 # compute ATGC ratio
|
|
1288 sub atgc_ratio {
|
|
1289 my ($ade, $thy, $gua, $cyt) = @_;
|
|
1290 return (($ade + $thy) / ($gua + $cyt));
|
|
1291 }
|
|
1292 #------------------------------------------------------------------------------
|
|
1293 # variance
|
|
1294 sub shift_data_variance {
|
|
1295 my (@data) = @_;
|
|
1296
|
|
1297 if ($#data + 1 < 2) { return 0.0; }
|
|
1298
|
|
1299 my $K = $data[0];
|
|
1300 my ($n, $Ex, $Ex2) = 0.0;
|
|
1301
|
|
1302 for my $x (@data) {
|
|
1303 $n = $n + 1;
|
|
1304 $Ex += $x - $K;
|
|
1305 $Ex2 += ($x - $K) * ($x - $K);
|
|
1306 }
|
|
1307
|
|
1308 my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1)
|
|
1309
|
|
1310 return $variance;
|
|
1311
|
|
1312 }
|
|
1313 #------------------------------------------------------------------------------
|
|
1314 # nucle score
|
|
1315 sub nucle_score {
|
|
1316 my ($variance, $gcPercent, $atgcRatio, $length) = @_;
|
|
1317 #return log2(($variance * $gcPercent * $atgcRatio) / sqrt($length));
|
|
1318 return log2(($variance * $gcPercent * $atgcRatio ** (3)) / sqrt($length));
|
|
1319 }
|
|
1320 #------------------------------------------------------------------------------
|
|
1321 sub log2 {
|
|
1322 my $n = shift;
|
|
1323 return (log($n) / log(2));
|
|
1324 }
|
|
1325 #------------------------------------------------------------------------------
|
|
1326 # compute GC pourcent
|
|
1327 sub gc_percent {
|
|
1328 my ($seq) = @_;
|
|
1329
|
|
1330 my @charSeq = split(//, uc($seq));
|
|
1331 my %hashFlank = ();
|
|
1332
|
|
1333 foreach my $v (@charSeq) {
|
|
1334 $hashFlank{$v} += 1;
|
|
1335 }
|
|
1336
|
|
1337 if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;}
|
|
1338 if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;}
|
|
1339
|
|
1340 if(length($seq) == 0) {
|
|
1341 return 0;
|
|
1342 }
|
|
1343 else {
|
|
1344 return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100;
|
|
1345 }
|
|
1346
|
|
1347 }
|
|
1348 #------------------------------------------------------------------------------
|
|
1349 # download file from ftp protocol
|
|
1350 sub download_file {
|
|
1351 my($servor, $file) = @_;
|
|
1352
|
|
1353 #if($actualOS eq "MSWin32"){
|
|
1354 my $ftp = Net::FTP->new($servor, Debug => 0)
|
|
1355 or die "Cannot connect to $servor";
|
|
1356
|
|
1357 $ftp->login("anonymous", "-anonymous@")
|
|
1358 or die "Cannot login ", $ftp->message;
|
|
1359 $ftp->binary;
|
|
1360 $ftp->get($file) or die "get failed ", $ftp->message;
|
|
1361
|
|
1362 $ftp->quit;
|
|
1363 #}
|
|
1364 #else{
|
|
1365 # system("wget $file");
|
|
1366 #}
|
|
1367 }
|
|
1368 #------------------------------------------------------------------------------
|
|
1369 # obtain file directory
|
|
1370 sub obtain_file {
|
|
1371 my($servor, $link) = @_;
|
|
1372 if ($link =~ /$servor(.*)/) { return ($1); }
|
|
1373 }
|
|
1374 #------------------------------------------------------------------------------
|
|
1375 # download fastq file from ENA
|
|
1376 sub download_ena_fastq {
|
|
1377 my ($enaFtpServor, $sraId, $log) = @_;
|
|
1378
|
|
1379 my $fastqDir = "/vol1/fastq/";
|
|
1380 my $dir1 = substr $sraId, 0, 6;
|
|
1381 my $dir2 = "000";
|
|
1382 my $digits = substr $sraId, 3;
|
|
1383 my $fastqRep = $sraId . "_folder";
|
|
1384
|
|
1385 if ($log) {
|
|
1386 print LOG "...Downloading fastq file from ENA\n";
|
|
1387 }
|
|
1388
|
|
1389 if (length $digits == 6) {
|
|
1390 $dir2 = $sraId;
|
|
1391 $fastqDir .= $dir1 . "/" . $dir2 . "/";
|
|
1392 }
|
|
1393 elsif (length $digits > 6) {
|
|
1394 my $digitsNumber = 0;
|
|
1395 my @digitsList = split //, (substr $digits, 6);
|
|
1396
|
|
1397 foreach my $char (@digitsList) {
|
|
1398 if (length $dir2 >= 1) {
|
|
1399 $dir2 = substr $dir2, 0, (length $dir2) - 1;
|
|
1400 $digitsNumber += 1;
|
|
1401 }
|
|
1402 }
|
|
1403 $dir2 .= substr $digits, -$digitsNumber;
|
|
1404 $fastqDir .= $dir1 . "/" . $dir2 . "/" . $sraId . "/";
|
|
1405 }
|
|
1406
|
|
1407 if ($log) {
|
|
1408 print LOG "...recreate database folder path for FASTQ downloading\n";
|
|
1409 }
|
|
1410
|
|
1411 my $ftp = Net::FTP->new($enaFtpServor, Debug => 0)
|
|
1412 or die "Cannot connect to $enaFtpServor";
|
|
1413
|
|
1414 $ftp->login("anonymous", "-anonymous@")
|
|
1415 or die "Cannot login ", $ftp->message;
|
|
1416 $ftp->binary;
|
|
1417
|
|
1418 $ftp->cwd($fastqDir)
|
|
1419 or die "maybe undefined sequence id, can't go to $fastqDir: ", $ftp->message;
|
|
1420
|
|
1421 my @fastqFiles = $ftp->ls("$sraId*");
|
|
1422
|
|
1423 if ($log) {
|
|
1424 print LOG "...Searching fastq files in path\n";
|
|
1425 }
|
|
1426
|
|
1427 if (!grep(/^$/, @fastqFiles)) {
|
|
1428
|
|
1429 if (-d $fastqRep) { rmtree($fastqRep) }
|
|
1430 mkdir $fastqRep;
|
|
1431
|
|
1432 foreach my $fastqFile (@fastqFiles) {
|
|
1433 #if($actualOS eq "MSWin32"){
|
|
1434 $ftp->get($fastqFile) or die "get failed ", $ftp->message;
|
|
1435 #}
|
|
1436 #else{
|
|
1437 # system("wget $fastqFile");
|
|
1438 #}
|
|
1439
|
|
1440 #my @baseAndExt = split /\./, $fastqFile;
|
|
1441 #my $unzipFastq = $baseAndExt[0] . ".fastq";
|
|
1442
|
|
1443 #gunzip $fastqFile => $unzipFastq or die "gunzip failed: $GunzipError\n";
|
|
1444 move($fastqFile, $fastqRep) or die "move failed: $!"; # DC replaced $unzipFastq by $fastqFile
|
|
1445 }
|
|
1446 #unlink glob "*fastq.gz" or die "$!: for file *fastq.gz";
|
|
1447
|
|
1448 if ($log) {
|
|
1449 print LOG "...Finalizing download of FASTQ file\n";
|
|
1450 }
|
|
1451 }
|
|
1452
|
|
1453 if ($log) {
|
|
1454 print LOG "End of download\n";
|
|
1455 }
|
|
1456
|
|
1457 $ftp->quit;
|
|
1458 }
|
|
1459 #------------------------------------------------------------------------------
|
|
1460 # download fastq file from ENA
|
|
1461 sub get_assembly_or_project {
|
|
1462 my ($file, $sequence, $ftpServor, $fldSep, $log) = @_;
|
|
1463
|
|
1464 my $pattern;
|
|
1465 my $indexInfo;
|
|
1466 my %folderHash;
|
|
1467
|
|
1468 # Repository for fna file
|
|
1469 my $repositoryFNA = "Assembly";
|
|
1470
|
|
1471 # Repository for genbank file
|
|
1472 my $repositoryGenbank = "GenBank";
|
|
1473
|
|
1474 # Reposotiry for report file
|
|
1475 my $repositoryReport = "Report";
|
|
1476
|
|
1477 # global repository
|
|
1478 my $repositorySequence = $sequence;
|
|
1479
|
|
1480
|
|
1481 if ($sequence =~ /^GC[AF]_(.*)/) {
|
|
1482 $indexInfo = 0;
|
|
1483 $pattern = $1;
|
|
1484 }
|
|
1485 elsif ($sequence =~ /^PRJ/) {
|
|
1486 $indexInfo = 1;
|
|
1487 $pattern = $sequence;
|
|
1488 }
|
|
1489
|
|
1490 open(SUM, $file) or die "error open file $!:";
|
|
1491 while(<SUM>) {
|
|
1492 chomp;
|
|
1493 if ($_ !~ /^#/) {
|
|
1494 my @infoList = split /\t/, $_;
|
|
1495 if ($infoList[$indexInfo] =~ /$pattern/) {
|
|
1496 my @gcfInfo = split(/\//, $infoList[19]);
|
|
1497 my $gcfName = pop(@gcfInfo);
|
|
1498
|
|
1499
|
|
1500 my $genbankFile = $infoList[19] . "/" . $gcfName . "_genomic.gbff.gz";
|
|
1501 my $dnaFile = $infoList[19] . "/" . $gcfName . "_genomic.fna.gz";
|
|
1502 my $assemblyReport = $infoList[19] . "/" . $gcfName . "_assembly_report.txt";
|
|
1503
|
|
1504 $dnaFile = obtain_file($ftpServor, $dnaFile);
|
|
1505 $genbankFile = obtain_file($ftpServor, $genbankFile);
|
|
1506 $assemblyReport = obtain_file($ftpServor, $assemblyReport);
|
|
1507
|
|
1508 download_file($ftpServor, $dnaFile);
|
|
1509 download_file($ftpServor, $genbankFile);
|
|
1510 download_file($ftpServor, $assemblyReport);
|
|
1511
|
|
1512 # download sequences and check number of "N" characters
|
|
1513 my $fileFasta = $gcfName."_genomic.fna.gz";
|
|
1514 my $ucpFasta = $gcfName."_genomic.fna";
|
|
1515 if (-e $fileFasta) {
|
|
1516 gunzip $fileFasta => $ucpFasta or die "gunzip failed: $GunzipError\n";
|
|
1517 $folderHash{$ucpFasta} = $repositoryFNA;
|
|
1518 }
|
|
1519
|
|
1520 # download genome report
|
|
1521 my $fileReport = $gcfName."_assembly_report.txt";
|
|
1522 if (-e $fileReport) {
|
|
1523 $folderHash{$fileReport} = $repositoryReport;
|
|
1524 }
|
|
1525
|
|
1526 # download genbank files
|
|
1527 my $fileGenbank = $gcfName."_genomic.gbff.gz";
|
|
1528 my $ucpGenbank = $gcfName."_genomic.gbff";
|
|
1529 if (-e $fileGenbank) {
|
|
1530 gunzip $fileGenbank => $ucpGenbank or die "gunzip failed: $GunzipError\n";
|
|
1531 $folderHash{$ucpGenbank} = $repositoryGenbank;
|
|
1532 }
|
|
1533
|
|
1534 }
|
|
1535 }
|
|
1536 }
|
|
1537 close(SUM) or die "error close file $!";
|
|
1538
|
|
1539 if (keys %folderHash) {
|
|
1540 if (-e $repositorySequence) {rmtree($repositorySequence);}
|
|
1541
|
|
1542 if ($log) {
|
|
1543 print LOG "...Download files from GenBank or RefSeq \n";
|
|
1544 }
|
|
1545
|
|
1546 mkdir $repositorySequence;
|
|
1547 mkdir $repositoryFNA;
|
|
1548 mkdir $repositoryGenbank;
|
|
1549 mkdir $repositoryReport;
|
|
1550
|
|
1551 for my $ucpFile (keys %folderHash) {
|
|
1552 move($ucpFile, $folderHash{$ucpFile}) or die "error move file $!:";
|
|
1553 }
|
|
1554 move($repositoryFNA, $repositorySequence . $fldSep. $repositoryFNA) or die "error move file $!:";
|
|
1555 move($repositoryGenbank, $repositorySequence . $fldSep. $repositoryGenbank) or die "error move file $!:";
|
|
1556 move($repositoryReport, $repositorySequence . $fldSep. $repositoryReport) or die "error move file $!:";
|
|
1557 unlink glob "*.gz" or die "for file *.gz $!:";
|
|
1558
|
|
1559 if ($log) {
|
|
1560 print LOG "...move GenBank/RefSeq sequence files to dedicated folders\n";
|
|
1561 }
|
|
1562 }
|
|
1563
|
|
1564 }
|
|
1565 #------------------------------------------------------------------------------
|
|
1566 sub download_assembly_or_project {
|
|
1567 my ($sequenceId, $ftpServor, $fldSep, $directory, $log) = @_;
|
|
1568
|
|
1569 my $assemblySummary;
|
|
1570 my @sequenceIdList = split /,/, $sequenceId;
|
|
1571
|
|
1572 if ($directory =~ /refseq/) {
|
|
1573 $assemblySummary = "assembly_summary_refseq.txt";
|
|
1574 } elsif ($directory =~ /genbank/) {
|
|
1575 $assemblySummary = "assembly_summary_genbank.txt";
|
|
1576 }
|
|
1577
|
|
1578 my $assemblySummaryPath = "/genomes/ASSEMBLY_REPORTS/" . $assemblySummary;
|
|
1579 download_file($ftpServor, $assemblySummaryPath);
|
|
1580
|
|
1581 foreach my $sequence (@sequenceIdList) {
|
|
1582 get_assembly_or_project($assemblySummary, $sequence, $ftpServor, $fldSep, $log);
|
|
1583 }
|
|
1584
|
|
1585 unlink $assemblySummary or die "error remove file $!:";
|
|
1586 }
|
|
1587 #------------------------------------------------------------------------------
|
|
1588 # check if all required module are install
|
|
1589 sub isModuleInstalled {
|
|
1590 my $mod = shift;
|
|
1591
|
|
1592 #eval("use $mod");
|
|
1593 my $commandModule = `perldoc -l $mod`;
|
|
1594
|
|
1595 if ($commandModule) {
|
|
1596 return(1);
|
|
1597 } else {
|
|
1598 return(0);
|
|
1599 }
|
|
1600 }
|
|
1601 #------------------------------------------------------------------------------
|
|
1602 # download assembly summary
|
|
1603 sub download_summaries {
|
|
1604 my ($database, $kingdom, $ftpServor, $fldSep, $getSummaries) = @_;
|
|
1605
|
|
1606 my $assemblySummary = "assembly_summary.txt";
|
|
1607 my $assemblySummaryLink;
|
|
1608 my $fileName;
|
|
1609
|
|
1610 opendir my $workingDirectory, "." . $fldSep or die "error open dir $!:";
|
|
1611 my @filesList = readdir $workingDirectory;
|
|
1612 closedir $workingDirectory;
|
|
1613
|
|
1614 if ($getSummaries) {
|
|
1615 foreach my $summaryKingdom (split /,/, $getSummaries) {
|
|
1616 foreach my $file (@filesList) {
|
|
1617 if ($file =~ /assembly_summary.txt/i && $file =~ /$summaryKingdom/i && $file =~ /$database/) {
|
|
1618 unlink $file or die "error remove file $!:";
|
|
1619 $assemblySummaryLink = "/genomes/$database/$summaryKingdom/assembly_summary.txt";
|
|
1620 download_file($ftpServor, $assemblySummaryLink);
|
|
1621 $fileName = $database . "_" . $summaryKingdom . "_" . "assembly_summary.txt";
|
|
1622 rename $assemblySummary, $fileName;
|
|
1623 print "replace assembly_summary file\n";
|
|
1624 }
|
|
1625 }
|
|
1626 }
|
|
1627 }
|
|
1628
|
|
1629 foreach my $file (@filesList) {
|
|
1630 if ($file =~ /assembly_summary.txt/i && $file =~ /$kingdom/i && $file =~ /$database/) {
|
|
1631 return $file;
|
|
1632 }
|
|
1633 }
|
|
1634
|
|
1635 $assemblySummaryLink = "/genomes/$database/$kingdom/assembly_summary.txt";
|
|
1636 $fileName = $database . "_" . $kingdom . "_" . "assembly_summary.txt";
|
|
1637 download_file($ftpServor, $assemblySummaryLink);
|
|
1638 rename $assemblySummary, $fileName;
|
|
1639
|
|
1640 return $fileName;
|
|
1641 }
|
|
1642 #---------------------
|
|
1643 sub bioseqio {
|
|
1644
|
|
1645 my ($keyword, $file) = @_;
|
|
1646 local $/ = "\n>"; # read by FASTA record
|
|
1647
|
|
1648 my $count = 0;
|
|
1649
|
|
1650 open FASTA, $file;
|
|
1651 while (<FASTA>) {
|
|
1652 chomp;
|
|
1653 my $seq = $_;
|
|
1654 #my ($id) = $seq =~ /^>*(\S+)/; # parse ID as first word in FASTA header
|
|
1655 if ($seq =~ /^>*.*$keyword/) {
|
|
1656 #$seq =~ s/^>*.+\n//; # remove FASTA header
|
|
1657 #$seq =~ s/\n//g; # remove endlines
|
|
1658 $count++;
|
|
1659 print "\nThe sequence number is: $count \n";
|
|
1660 print ">$seq";
|
|
1661 }
|
|
1662 }
|
|
1663 close FASTA;
|
|
1664 }
|