comparison COG/bac-genomics-scripts/ncbi_ftp_download/ncbi_ftp_concat_unpack.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/usr/bin/perl
2 use warnings;
3 use strict;
4 use File::Find; # module to traverse directory trees
5
6 my $usage = << "USAGE";
7
8 #######################################################################
9 # $0 genbank|refseq [y] #
10 # #
11 # Unpacks and concatenates all draft and complete genomes (in genbank #
12 # and fasta format) downloaded from NCBI's FTP server #
13 # (ftp://ftp.ncbi.nlm.nih.gov/). The script traverses the downloaded #
14 # NCBI FTP folder structure and thus has to be called from the top #
15 # level (containing the folder './ftp.ncbi.nlm.nih.gov'). #
16 # Therefore, use the bash-shell wrapper script 'ncbi_ftp_download.sh',#
17 # which employs 'wget' to download the genomes and mirrors the NCBI #
18 # FTP server folder structure locally (with 'ftp.ncbi.nlm.nih.gov' #
19 # being the top folder). Afterwards, 'ncbi_ftp_download.sh' runs #
20 # 'ncbi_ftp_concat_unpack.pl' with both 'genbank' and 'refseq' #
21 # options, as well as option 'y' to overwrite the old result folders #
22 # (see below). Both scripts have to be in the same directory (or in #
23 # the path) to run 'ncbi_ftp_download.sh'. #
24 # For COMPLETE genomes PLASMIDS are concatenated to the CHROMOSOMES to#
25 # create multi-genbank/-fasta files (script 'split_multi-seq_file.pl' #
26 # can be used to split the multi-seq file to single-seq files). #
27 # In DRAFT genomes, SCAFFOLD and/or CONTIG files, designated by #
28 # 'draft_scaf' or 'draft_con', are controlled for annotation (i.e. if #
29 # gene primary feature tags exist). Usually only one file contains #
30 # annotations. The one with annotation is then used to create multi- #
31 # genbank files. Multi-fasta files are created for the corresponding #
32 # genbank-file or, if no annotation exists, for the file which #
33 # contains more sequence information (either contigs or scaffolds), #
34 # and if equal, scaffolds are preferred. #
35 # #
36 # Use option 'genbank' to extract/copy GenBank genomes and 'refseq' #
37 # for RefSeq genomes. Concatenated files are stored in the result #
38 # folders './genbank' and './refseq', respectively. Set option 'y' in #
39 # the program call to delete previous result folders and create new #
40 # ones (otherwise, the script will ask user if to proceed). #
41 # If sequence size discrepancies between a genbank and its #
42 # corresponding fasta file are found, error file 'seq_errors.txt' will#
43 # contain warnings. #
44 # #
45 # version 0.2.1, update: 13.07.2015 Andreas Leimbach #
46 # 15.09.2012 aleimba[at]gmx[dot]de #
47 #######################################################################
48
49 USAGE
50 ;
51
52
53 ### Print usage if -h|--h|--help is given as argument or options are not given
54 my ($db, $ask) = @ARGV; # if $ask not given, run subroutine 'ask_exit' (below) to get input from STDIN
55 if (!defined $db) {
56 die $usage;
57 } elsif ($db =~ m/-h/) {
58 die $usage;
59 }
60 my $err = 'seq_errors.txt'; # Error file will be created if sequence discrepancies between genbank and corresponding fasta files will be found (see sub 'warn_seq')
61 if (-e $err) { # remove error file from previous run, as it will be opened/created in append mode
62 unlink $err or die "Can't remove previous error file \'$err\' for new script run: $!\n";
63 }
64
65
66 ### Set the correct directories to start traversion of the folder structure
67 my $dir_complete; # for complete genomes
68 my $dir_draft; # for draft genomes
69 if ($db =~ /genbank/i) {
70 $db = 'genbank';
71 $dir_complete = './ftp.ncbi.nlm.nih.gov/genbank/genomes/Bacteria';
72 $dir_draft = './ftp.ncbi.nlm.nih.gov/genbank/genomes/Bacteria_DRAFT';
73 ask_exit($db, $ask); # subroutine to ask if the current result folder should be deleted and created new, or exit the script
74 rm_dir($db); # subroutine to remove the result directory and all its contents prior filling it with new files (in case files have changed)
75 } elsif ($db =~ /refseq/i) {
76 $db = 'refseq';
77 $dir_complete = './ftp.ncbi.nlm.nih.gov/genomes/Bacteria';
78 $dir_draft = './ftp.ncbi.nlm.nih.gov/genomes/Bacteria_DRAFT';
79 ask_exit($db, $ask);
80 rm_dir($db);
81 } else {
82 die "\nFatal error: wrong database name, give either 'genbank' or 'refseq' as first argument!\n\n";
83 }
84
85
86 ### Concatenate all genbank and fasta files for COMPLETE genomes in subdirectories and write to the genbank/refseq folders
87 find ({wanted => \&completes, no_chdir => 1}, $dir_complete); # the function 'find' from File::Find takes a reference to a subroutine and a starting directory to walk through recursively (e.g. 'find (\$concat, $dirname)')
88 # For each file/directory/link in the tree (also in the current directory) it calls the referenced function and changes to that directory with 'chdir()' --> this can be stopped with 'no_chdir => 1', parameters to find are passed as a hash reference
89
90
91 ### Unzip and unpack all DRAFT files with *.tgz in subdirectories and concat in result folders
92 my %draft_files; # store the filenames of the concatenated files in here (in subroutine 'drafts')
93 find ({wanted => \&drafts, no_chdir => 1}, $dir_draft);
94
95
96 ### For DRAFTS go through all the files in the result folder and keep only gbk files with annotation (either only in contig or scaffold) and the corresponding fastas for each strain; if no annotation exists keep the fasta file with the most sequence information (contig or scaffold); delete the other files (gbk and corresponding fasta) in the result folder
97 my ($A, $C, $G, $T); # to calculate the total base count (subs 'seq_size' and 'base_count')
98 my $skip = ''; # not possible to delete elements from a hash while iterating through it, thus need a variable to skip files
99 foreach my $file (sort{$b cmp $a} keys %draft_files) { # reverse sort to get scaf files first if existent
100 $skip =~ s/(\S+)\_draft\_(scaf|con)\.\w*$/$1/; # get rid of file-extension to skip for scaffold genomes corresponding contig gbk
101 if ($file =~ /$skip/) { # $skip stores $file from previous foreach
102 next;
103 }
104 if ($file =~ /\_draft\_scaf\.gbk$/) { # if scaffold file exists
105 print "\nProcessing DRAFT genome: $file\n"; # status message
106 my $scaf_gbkfile = my $scaf_fasfile = $file;
107 $scaf_fasfile =~ s/gbk$/fasta/;
108 my $con_gbkfile = $scaf_gbkfile;
109 $con_gbkfile =~ s/\_scaf\./\_con\./;
110 my $con_fasfile = $con_gbkfile;
111 $con_fasfile =~ s/gbk$/fasta/;
112 my $scaf_size = -s "./$db/$scaf_gbkfile"; # file size of the scaffold gbk file
113 my $con_size;
114 if (-e "./$db/$con_gbkfile") { # only a scaffold file might exist and not a contig file
115 $con_size = -s "./$db/$con_gbkfile";
116 } else {
117 $con_size = 0;
118 }
119 my $scaf_gene = count_gene($scaf_gbkfile); # subroutine to count genes in genbank files
120 # print "scaf file size: $scaf_size\tscaf_gene: $scaf_gene\n"; # print statement to test output
121 my $con_gene = count_gene($con_gbkfile); # 0 if '$con_gbkfile' doesn't exist
122 # print "con file size: $con_size\tcon_gene: $con_gene\n"; # print statement to test output
123 if ($scaf_gene > $con_gene && $scaf_size > $con_size && $con_gene == 0) { # so many conditions needed? Do they always work?
124 print "Annotation in scaffolds: Keeping \'$scaf_gbkfile\' and \'$scaf_fasfile\' and removing contig files (if they exist)!\n"; # status message
125 rm_exist_file($db, $con_gbkfile); # remove the contig-gbk without annotation; subroutine, if only a scaffold file exists and not a contig file
126 $skip = $file; # scaffold file with annotation found, skip a corresponding contig gbk in %draft_files (if existent)
127 my $scaf_gbkcount = seq_size($scaf_gbkfile); # subroutine to calculate total bases (A, C, G, T) of multi-genbank or -fasta files
128 rm_exist_file($db, $con_fasfile); # also delete the corresponding fasta
129 my $scaf_fascount = seq_size($scaf_fasfile); # count total bases of fasta
130 # print "scaf_gbkcount: $scaf_gbkcount\tscaf_fascount: $scaf_fascount\n"; # print statement to test output
131 warn_seq($scaf_fasfile, $scaf_gbkcount, $scaf_fascount); # subroutine to warn if the genbank and the fasta file have differing sequence sizes
132 next;
133 } elsif ($con_gene > $scaf_gene && $con_size > $scaf_size && $scaf_gene == 0) {
134 print "Annotation in contigs: Keeping \'$con_gbkfile\' and \'$con_fasfile\' and removing scaffold files!\n";
135 unlink "./$db/$scaf_gbkfile" or die "Can't remove file \'$scaf_gbkfile\': $!\n";
136 $skip = $file;
137 my $con_gbkcount = seq_size($con_gbkfile);
138 unlink "./$db/$scaf_fasfile" or die "Can't remove file \'$scaf_fasfile\': $!\n";
139 my $con_fascount = seq_size($con_fasfile);
140 # print "con_gbkcount: $con_gbkcount\tcon_fascount: $con_fascount\n"; # print statement to test output
141 warn_seq($con_fasfile, $con_gbkcount, $con_fascount);
142 next;
143 } elsif ($con_gene == 0 && $scaf_gene == 0) { # no annotation in both genbanks, just keep one fasta
144 print "No annotation in either scaffold or contig file, but ";
145 unlink "./$db/$scaf_gbkfile" or die "Can't remove file \'$scaf_gbkfile\': $!\n";
146 rm_exist_file($db, $con_gbkfile);
147 $skip = $file;
148 my $scaf_fascount = seq_size($scaf_fasfile);
149 my $con_fascount = seq_size($con_fasfile); # 0 if '$con_gbkfile' doesn't exist
150 # print "\nscaf_fascount: $scaf_fascount\tcon_fascount: $con_fascount\n"; # print statement to test output
151 if ($scaf_fascount > $con_fascount || $scaf_fascount == $con_fascount) { # keep scaffold file if both same base count
152 print "more/equal sequence information in scaffolds (or contigs don't exist). Thus keeping \'$scaf_fasfile\' and removing contig fasta file!\n";
153 rm_exist_file($db, $con_fasfile);
154 } elsif ($con_fascount > $scaf_fascount) {
155 print "more sequence information in contigs. Thus keeping \'$con_fasfile\' and removing scaffold fasta file!\n";
156 unlink "./$db/$scaf_fasfile" or die "Can't remove file \'$scaf_fasfile\': $!\n";
157 }
158 next;
159 }
160 } elsif ($file =~ /\_draft\_con\.gbk$/) { # if no scaffold file exists, contig file exists
161 print "\nProcessing DRAFT genome: $file\n"; # status message
162 print "Only contigs exist ";
163 $skip = $file;
164 my $con_gene = count_gene($file);
165 my $con_fasfile = $file;
166 $con_fasfile =~ s/gbk$/fasta/;
167 if ($con_gene == 0) { # no annotation exists, only keep the fasta
168 unlink "./$db/$file" or die "Can't remove file \'$file\': $!\n";
169 print "but not annotated, thus keeping only fasta file \'$con_fasfile\'!\n";
170 } else {
171 print "and annotated, thus keeping contig genbank, \'$file\', and fasta files, \'$con_fasfile\'!\n";
172 my $con_gbkcount = seq_size($file);
173 my $con_fascount = seq_size($con_fasfile);
174 # print "con_gbkcount: $con_gbkcount\tcon_fascount: $con_fascount\n"; # print statement to test output
175 warn_seq($con_fasfile, $con_gbkcount, $con_fascount);
176 }
177 }
178 }
179
180
181 ### Print result folder
182 print "\n#### All genome files have been written to the folder \'./$db\'!\n";
183 if (-e $err) {
184 print "#### Sequence discrepancies between genbank and corresponding fasta files have been found, see file \'$err\'!\n";
185 }
186
187 exit;
188
189
190 ###############
191 # Subroutines #
192 ###############
193
194 ### Subroutine to ask if the script should proceed with deleting the previous result folder or exit, if $ask not given as ARGV
195 sub ask_exit {
196 my ($db, $ask) = @_;
197 if (!defined($ask) && -e $db) {
198 print "The script will delete an already existing result folder \'./$db\', proceed [y|n]? ";
199 $ask = <STDIN>;
200 } elsif (!defined($ask) && !-e $db) {
201 $ask = 'y';
202 }
203 if ($ask !~ /y/i) {
204 die "Script aborted!\n";
205 }
206 return 1;
207 }
208
209
210 ### Subroutine that counts all bases
211 sub base_count {
212 my $seq = shift;
213 $A = ($seq =~ tr/[aA]//) + $A; # pattern match modifier like 'i' don't work with transliterations
214 $C = ($seq =~ tr/[cC]//) + $C;
215 $G = ($seq =~ tr/[gG]//) + $G;
216 $T = ($seq =~ tr/[tT]//) + $T;
217 return 1;
218 }
219
220
221 ### Subroutine to concatenate all genbank and fasta files for COMPLETE genomes and store in the result folder
222 sub completes { # subroutine gets one argument $_, the file/directory seen by 'find'
223 if (/^\.+$/ || /.*\/Bacteria$/) { # skip system folders (. and ..) and 'Bacteria' folder
224 return 1;
225 } elsif (-d) { # '-d' = only directories
226 my $file = $_; # because of 'no_chdir => 1', $_ contains the full path to the file/dir
227 $file =~ s/.*\/(\S*)$/$1/; # get the folder name to create the filename
228 print "\nProcessing COMPLETE genome: $file\n"; # status message
229 $file =~ s/Escherichia_coli_/Ecoli_/; # shorten final E. coli file names
230 # $file =~ s/Shigella_/S/; # 'Shigella_D9' ends up as 'SD9'
231 my $gbk_file = "$file.gbk";
232 my $fas_file = "$file.fasta";
233 dir_check($db); # subroutine to test if result directory exists, if not create; ALWAYS for first file since above 'rm_dir'
234 system("cat $_/*.gbk > ./$db/$gbk_file"); # unix system call for concatenation
235 system("cat $_/*.fna > ./$db/$fas_file");
236 my $gbk_count = seq_size($gbk_file); # subroutine to calculate total bases (A, C, G, and T) of genbank or fasta files
237 my $fas_count = seq_size($fas_file);
238 # print "gbk_count: $gbk_count\tfas_count: $fas_count\n"; # print statement to test output
239 warn_seq($fas_file, $gbk_count, $fas_count); # subroutine to warn if the genbank and the fasta file have differing sequence sizes
240 }
241 return 1;
242 }
243
244
245 ### Count the genes in genbank files
246 sub count_gene {
247 my $file = shift;
248 my $gene_count = 0;
249 if (-e "./$db/$file") { # if the file doesn't exist (e.g. only scaffold file and not contig file) $gene_count will remain 0
250 open(IN, "<./$db/$file") or die "Can't open file \'$file\': $!\n";
251 while (<IN>) {
252 $gene_count++ if /\s{3,}gene\s{3,}/; # Some Genbank files only annotated with 'gene' tags (instead of 'gene' and 'CDS'); '\s{3,}' at least three whitespaces
253 }
254 close IN;
255 }
256 return $gene_count;
257 }
258
259
260 ### Subroutine to check for result directories (./genbank and ./refseq), if they don't exist create; ALWAYS as the prior directory with its contents is deleted at the start of the script with 'rm_dir'
261 sub dir_check {
262 my $dir = shift;
263 if (!-e $db) { # mkdir if not existent
264 mkdir $db or die "Couldn't create result directory \'$db\': $!\n";
265 }
266 return 1;
267 }
268
269
270 ### Subroutine to work through all possible DRAFT files and call next subroutine 'unpack_concat'
271 sub drafts {
272 my @result; # stores results of sub 'unpack_concat'
273 push(@result, unpack_concat('contig.gbk')); # subroutine to unpack and concat respective files; returns nothing if argument (here 'contig.gbk') doesn't fit to filename and $result[x] will be undefined
274 push(@result, unpack_concat('scaffold.gbk'));
275 push(@result, unpack_concat('contig.fna'));
276 push(@result, unpack_concat('scaffold.fna'));
277 foreach (@result) { # for each succesful subroutine run @result contains one element, i.e. the concatenated filename
278 $draft_files{$_} = 1; # save filename in hash '%draft_files' if array element '$result[x]' defined
279 }
280 return 1;
281 }
282
283
284 ### Subroutine to remove the prior result directories (./genbank and ./refseq) and all their contents, before new files are written in
285 sub rm_dir {
286 my $directory = shift;
287 if (-e $directory) {
288 opendir (DIR, $directory) or die "Can't opendir $directory: $!"; # system unix call 'rm -rf' is too dangerous
289 while (defined(my $file = readdir(DIR))) {
290 if ($file =~ /^\.+$/) { # skip system folders (. and ..)
291 next;
292 }
293 unlink "./$directory/$file" or die "Can't remove file \'$file\': $!\n";
294 }
295 close DIR;
296 rmdir $directory or die "Can't remove directory \'$directory\': $!\n"; # delete empty directory
297 return 1;
298 }
299 return 0;
300 }
301
302
303 ### Test for file existence and remove file
304 sub rm_exist_file {
305 my ($db, $file) = @_;
306 if (-e "./$db/$file") {
307 unlink "./$db/$file" or die "Can't remove file \'$file\': $!\n";
308 }
309 return 1;
310 }
311
312
313 ### Subroutine to remove the unpacked files and clean up
314 sub rm_files {
315 my ($directory, $assembly) = @_;
316 opendir (DIR, $directory) or die "Can't opendir \'$directory\' to remove the unpacked files: $!";
317 $assembly =~ s/.*(\..*)$/$1/; # get the file-extension to remove files
318 while (defined(my $file = readdir(DIR))) {
319 if ($file =~ /.*$assembly$/) { # Why does it not work to include: '-f $file &&'?
320 unlink "$directory/$file";
321 }
322 }
323 close DIR;
324 return 1;
325 }
326
327
328 ### Calculate the total sequence in multi-genbanks and -fastas
329 sub seq_size {
330 my $file = shift;
331 my $seq_count = 0;
332 if (-e "./$db/$file") { # if the file doesn't exist (e.g. only scaffold file and not contig file) $seq_count will remain 0
333 $A = $C = $G = $T = 0;
334 open(IN, "<./$db/$file") or die "Can't open file \'$file\': $!\n";
335 if ($file =~ /.*\.gbk/) {
336 while (my $seq = <IN>) {
337 if ($seq =~ /^ORIGIN\s*$/) {
338 $seq = <IN>; # don't want the ORIGIN line, but the next one onwards until '//'
339 while ($seq !~ /\/\//) {
340 base_count($seq);
341 $seq = <IN>;
342 }
343 }
344 }
345 } elsif ($file =~ /.*\.fasta$/) {
346 while (my $seq = <IN>) {
347 if ($seq =~ /^>/) {
348 next;
349 }
350 base_count($seq);
351 }
352 }
353 close IN;
354 $seq_count = $A + $C + $G + $T;
355 }
356 return $seq_count;
357 }
358
359
360 ### Subroutine to unzip, unpack all DRAFT files with *.tgz, and concatenate the contigs/scaffolds in the result folder
361 sub unpack_concat {
362 my $assembly = shift;
363 if (-f && /.*\.$assembly\.tgz/) { # only if $_ is a file (-f) and fits to $assembly.tgz; because of 'no_chdir => 1' $_ contains the full path to the file (with filename)
364 my $file = $_;
365 my $directory = $File::Find::dir; # internal variable for File::Find, contains the directory path to the file
366 system("tar xfz $file --directory=$directory"); # system call to unpack the $file.tgz
367 $file = $File::Find::dir; # replace path+file to just directory path
368 $file =~ s/.*\/(\S*)$/$1/; # get the folder name to create the filename
369 $file =~ s/Escherichia_coli_/Ecoli_/; # shorten final E. coli file names
370 # $file =~ s/Shigella_/S/; # 'Shigella_D9' ends up as 'SD9'
371 dir_check($db); # subroutine to test if result directory exists, if not create; ALWAYS for first file since above 'rm_dir'
372 if ($assembly =~ /contig.gbk/) {
373 $file = $file . '_draft_con.gbk';
374 system("cat $directory/*.gbk > ./$db/$file");
375 rm_files($directory, $assembly); # subroutine to remove unpacked files
376 return $file; # @results in sub 'drafts' defined
377 } elsif ($assembly =~ /scaffold.gbk/) {
378 $file = $file . '_draft_scaf.gbk';
379 system("cat $directory/*.gbk > ./$db/$file");
380 rm_files($directory, $assembly);
381 return $file;
382 } elsif ($assembly =~ /contig.fna/) {
383 $file = $file . '_draft_con.fasta';
384 system("cat $directory/*.fna > ./$db/$file");
385 rm_files($directory, $assembly);
386 return $file;
387 } elsif ($assembly =~ /scaffold.fna/) {
388 $file = $file . '_draft_scaf.fasta';
389 system("cat $directory/*.fna > ./$db/$file");
390 rm_files($directory, $assembly);
391 return $file;
392 }
393 }
394 return; # return undefined to scan @results for the concatenated file in subroutine 'drafts'
395 }
396
397
398 ### Subroutine to warn if the genbank and corresponding fasta file have differing sequence
399 sub warn_seq {
400 my ($file, $gbk_size, $fasta_size) = @_;
401 if ($gbk_size != $fasta_size) {
402 $file =~ s/\.fasta//; # get rid of file-extension
403 open (ERR, ">>$err");
404 print ERR "There is a difference in the sequence length of corresponding files \'$file\.gbk: $gbk_size\' and \'$file\.fasta: $fasta_size\'!\n\n";
405 close ERR;
406 }
407 return 1;
408 }