annotate COG/bac-genomics-scripts/ncbi_ftp_download/ncbi_ftp_concat_unpack.pl @ 13:152d7c43478b draft default tip

Uploaded
author dereeper
date Thu, 30 May 2024 20:07:55 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2 use warnings;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 use File::Find; # module to traverse directory trees
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6 my $usage = << "USAGE";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8 #######################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 # $0 genbank|refseq [y] #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10 # #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 # Unpacks and concatenates all draft and complete genomes (in genbank #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12 # and fasta format) downloaded from NCBI's FTP server #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 # (ftp://ftp.ncbi.nlm.nih.gov/). The script traverses the downloaded #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14 # NCBI FTP folder structure and thus has to be called from the top #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 # level (containing the folder './ftp.ncbi.nlm.nih.gov'). #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 # Therefore, use the bash-shell wrapper script 'ncbi_ftp_download.sh',#
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17 # which employs 'wget' to download the genomes and mirrors the NCBI #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 # FTP server folder structure locally (with 'ftp.ncbi.nlm.nih.gov' #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 # being the top folder). Afterwards, 'ncbi_ftp_download.sh' runs #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 # 'ncbi_ftp_concat_unpack.pl' with both 'genbank' and 'refseq' #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 # options, as well as option 'y' to overwrite the old result folders #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 # (see below). Both scripts have to be in the same directory (or in #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 # the path) to run 'ncbi_ftp_download.sh'. #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 # For COMPLETE genomes PLASMIDS are concatenated to the CHROMOSOMES to#
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 # create multi-genbank/-fasta files (script 'split_multi-seq_file.pl' #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 # can be used to split the multi-seq file to single-seq files). #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 # In DRAFT genomes, SCAFFOLD and/or CONTIG files, designated by #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 # 'draft_scaf' or 'draft_con', are controlled for annotation (i.e. if #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 # gene primary feature tags exist). Usually only one file contains #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 # annotations. The one with annotation is then used to create multi- #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 # genbank files. Multi-fasta files are created for the corresponding #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 # genbank-file or, if no annotation exists, for the file which #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 # contains more sequence information (either contigs or scaffolds), #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 # and if equal, scaffolds are preferred. #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 # #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 # Use option 'genbank' to extract/copy GenBank genomes and 'refseq' #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 # for RefSeq genomes. Concatenated files are stored in the result #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 # folders './genbank' and './refseq', respectively. Set option 'y' in #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 # the program call to delete previous result folders and create new #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 # ones (otherwise, the script will ask user if to proceed). #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 # If sequence size discrepancies between a genbank and its #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 # corresponding fasta file are found, error file 'seq_errors.txt' will#
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 # contain warnings. #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 # #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 # version 0.2.1, update: 13.07.2015 Andreas Leimbach #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 # 15.09.2012 aleimba[at]gmx[dot]de #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 #######################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 USAGE
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 ;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53 ### Print usage if -h|--h|--help is given as argument or options are not given
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 my ($db, $ask) = @ARGV; # if $ask not given, run subroutine 'ask_exit' (below) to get input from STDIN
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 if (!defined $db) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 die $usage;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 } elsif ($db =~ m/-h/) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 die $usage;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 my $err = 'seq_errors.txt'; # Error file will be created if sequence discrepancies between genbank and corresponding fasta files will be found (see sub 'warn_seq')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61 if (-e $err) { # remove error file from previous run, as it will be opened/created in append mode
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 unlink $err or die "Can't remove previous error file \'$err\' for new script run: $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 ### Set the correct directories to start traversion of the folder structure
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67 my $dir_complete; # for complete genomes
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 my $dir_draft; # for draft genomes
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 if ($db =~ /genbank/i) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 $db = 'genbank';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 $dir_complete = './ftp.ncbi.nlm.nih.gov/genbank/genomes/Bacteria';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 $dir_draft = './ftp.ncbi.nlm.nih.gov/genbank/genomes/Bacteria_DRAFT';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 ask_exit($db, $ask); # subroutine to ask if the current result folder should be deleted and created new, or exit the script
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 rm_dir($db); # subroutine to remove the result directory and all its contents prior filling it with new files (in case files have changed)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 } elsif ($db =~ /refseq/i) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 $db = 'refseq';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 $dir_complete = './ftp.ncbi.nlm.nih.gov/genomes/Bacteria';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78 $dir_draft = './ftp.ncbi.nlm.nih.gov/genomes/Bacteria_DRAFT';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 ask_exit($db, $ask);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 rm_dir($db);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 die "\nFatal error: wrong database name, give either 'genbank' or 'refseq' as first argument!\n\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86 ### Concatenate all genbank and fasta files for COMPLETE genomes in subdirectories and write to the genbank/refseq folders
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 find ({wanted => \&completes, no_chdir => 1}, $dir_complete); # the function 'find' from File::Find takes a reference to a subroutine and a starting directory to walk through recursively (e.g. 'find (\$concat, $dirname)')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 # For each file/directory/link in the tree (also in the current directory) it calls the referenced function and changes to that directory with 'chdir()' --> this can be stopped with 'no_chdir => 1', parameters to find are passed as a hash reference
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 ### Unzip and unpack all DRAFT files with *.tgz in subdirectories and concat in result folders
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 my %draft_files; # store the filenames of the concatenated files in here (in subroutine 'drafts')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 find ({wanted => \&drafts, no_chdir => 1}, $dir_draft);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 ### For DRAFTS go through all the files in the result folder and keep only gbk files with annotation (either only in contig or scaffold) and the corresponding fastas for each strain; if no annotation exists keep the fasta file with the most sequence information (contig or scaffold); delete the other files (gbk and corresponding fasta) in the result folder
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 my ($A, $C, $G, $T); # to calculate the total base count (subs 'seq_size' and 'base_count')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 my $skip = ''; # not possible to delete elements from a hash while iterating through it, thus need a variable to skip files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 foreach my $file (sort{$b cmp $a} keys %draft_files) { # reverse sort to get scaf files first if existent
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100 $skip =~ s/(\S+)\_draft\_(scaf|con)\.\w*$/$1/; # get rid of file-extension to skip for scaffold genomes corresponding contig gbk
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 if ($file =~ /$skip/) { # $skip stores $file from previous foreach
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 if ($file =~ /\_draft\_scaf\.gbk$/) { # if scaffold file exists
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 print "\nProcessing DRAFT genome: $file\n"; # status message
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106 my $scaf_gbkfile = my $scaf_fasfile = $file;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 $scaf_fasfile =~ s/gbk$/fasta/;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 my $con_gbkfile = $scaf_gbkfile;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 $con_gbkfile =~ s/\_scaf\./\_con\./;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 my $con_fasfile = $con_gbkfile;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 $con_fasfile =~ s/gbk$/fasta/;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 my $scaf_size = -s "./$db/$scaf_gbkfile"; # file size of the scaffold gbk file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 my $con_size;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 if (-e "./$db/$con_gbkfile") { # only a scaffold file might exist and not a contig file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 $con_size = -s "./$db/$con_gbkfile";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 $con_size = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 my $scaf_gene = count_gene($scaf_gbkfile); # subroutine to count genes in genbank files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120 # print "scaf file size: $scaf_size\tscaf_gene: $scaf_gene\n"; # print statement to test output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 my $con_gene = count_gene($con_gbkfile); # 0 if '$con_gbkfile' doesn't exist
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122 # print "con file size: $con_size\tcon_gene: $con_gene\n"; # print statement to test output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 if ($scaf_gene > $con_gene && $scaf_size > $con_size && $con_gene == 0) { # so many conditions needed? Do they always work?
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 print "Annotation in scaffolds: Keeping \'$scaf_gbkfile\' and \'$scaf_fasfile\' and removing contig files (if they exist)!\n"; # status message
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 rm_exist_file($db, $con_gbkfile); # remove the contig-gbk without annotation; subroutine, if only a scaffold file exists and not a contig file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126 $skip = $file; # scaffold file with annotation found, skip a corresponding contig gbk in %draft_files (if existent)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 my $scaf_gbkcount = seq_size($scaf_gbkfile); # subroutine to calculate total bases (A, C, G, T) of multi-genbank or -fasta files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 rm_exist_file($db, $con_fasfile); # also delete the corresponding fasta
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 my $scaf_fascount = seq_size($scaf_fasfile); # count total bases of fasta
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 # print "scaf_gbkcount: $scaf_gbkcount\tscaf_fascount: $scaf_fascount\n"; # print statement to test output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 warn_seq($scaf_fasfile, $scaf_gbkcount, $scaf_fascount); # subroutine to warn if the genbank and the fasta file have differing sequence sizes
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132 next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 } elsif ($con_gene > $scaf_gene && $con_size > $scaf_size && $scaf_gene == 0) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 print "Annotation in contigs: Keeping \'$con_gbkfile\' and \'$con_fasfile\' and removing scaffold files!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 unlink "./$db/$scaf_gbkfile" or die "Can't remove file \'$scaf_gbkfile\': $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136 $skip = $file;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 my $con_gbkcount = seq_size($con_gbkfile);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 unlink "./$db/$scaf_fasfile" or die "Can't remove file \'$scaf_fasfile\': $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 my $con_fascount = seq_size($con_fasfile);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 # print "con_gbkcount: $con_gbkcount\tcon_fascount: $con_fascount\n"; # print statement to test output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 warn_seq($con_fasfile, $con_gbkcount, $con_fascount);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143 } elsif ($con_gene == 0 && $scaf_gene == 0) { # no annotation in both genbanks, just keep one fasta
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144 print "No annotation in either scaffold or contig file, but ";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 unlink "./$db/$scaf_gbkfile" or die "Can't remove file \'$scaf_gbkfile\': $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 rm_exist_file($db, $con_gbkfile);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 $skip = $file;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 my $scaf_fascount = seq_size($scaf_fasfile);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 my $con_fascount = seq_size($con_fasfile); # 0 if '$con_gbkfile' doesn't exist
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 # print "\nscaf_fascount: $scaf_fascount\tcon_fascount: $con_fascount\n"; # print statement to test output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 if ($scaf_fascount > $con_fascount || $scaf_fascount == $con_fascount) { # keep scaffold file if both same base count
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152 print "more/equal sequence information in scaffolds (or contigs don't exist). Thus keeping \'$scaf_fasfile\' and removing contig fasta file!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153 rm_exist_file($db, $con_fasfile);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154 } elsif ($con_fascount > $scaf_fascount) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155 print "more sequence information in contigs. Thus keeping \'$con_fasfile\' and removing scaffold fasta file!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 unlink "./$db/$scaf_fasfile" or die "Can't remove file \'$scaf_fasfile\': $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158 next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160 } elsif ($file =~ /\_draft\_con\.gbk$/) { # if no scaffold file exists, contig file exists
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 print "\nProcessing DRAFT genome: $file\n"; # status message
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162 print "Only contigs exist ";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163 $skip = $file;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 my $con_gene = count_gene($file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165 my $con_fasfile = $file;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166 $con_fasfile =~ s/gbk$/fasta/;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167 if ($con_gene == 0) { # no annotation exists, only keep the fasta
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
168 unlink "./$db/$file" or die "Can't remove file \'$file\': $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
169 print "but not annotated, thus keeping only fasta file \'$con_fasfile\'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171 print "and annotated, thus keeping contig genbank, \'$file\', and fasta files, \'$con_fasfile\'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
172 my $con_gbkcount = seq_size($file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
173 my $con_fascount = seq_size($con_fasfile);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
174 # print "con_gbkcount: $con_gbkcount\tcon_fascount: $con_fascount\n"; # print statement to test output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
175 warn_seq($con_fasfile, $con_gbkcount, $con_fascount);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
176 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
177 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
178 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
179
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
180
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
181 ### Print result folder
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
182 print "\n#### All genome files have been written to the folder \'./$db\'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
183 if (-e $err) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
184 print "#### Sequence discrepancies between genbank and corresponding fasta files have been found, see file \'$err\'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
185 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
186
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
187 exit;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
188
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
189
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
190 ###############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
191 # Subroutines #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
192 ###############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
193
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
194 ### Subroutine to ask if the script should proceed with deleting the previous result folder or exit, if $ask not given as ARGV
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
195 sub ask_exit {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
196 my ($db, $ask) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
197 if (!defined($ask) && -e $db) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
198 print "The script will delete an already existing result folder \'./$db\', proceed [y|n]? ";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
199 $ask = <STDIN>;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
200 } elsif (!defined($ask) && !-e $db) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
201 $ask = 'y';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
202 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
203 if ($ask !~ /y/i) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
204 die "Script aborted!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
205 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
206 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
207 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
208
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
209
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
210 ### Subroutine that counts all bases
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
211 sub base_count {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
212 my $seq = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
213 $A = ($seq =~ tr/[aA]//) + $A; # pattern match modifier like 'i' don't work with transliterations
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
214 $C = ($seq =~ tr/[cC]//) + $C;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
215 $G = ($seq =~ tr/[gG]//) + $G;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
216 $T = ($seq =~ tr/[tT]//) + $T;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
217 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
218 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
219
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
220
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
221 ### Subroutine to concatenate all genbank and fasta files for COMPLETE genomes and store in the result folder
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
222 sub completes { # subroutine gets one argument $_, the file/directory seen by 'find'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
223 if (/^\.+$/ || /.*\/Bacteria$/) { # skip system folders (. and ..) and 'Bacteria' folder
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
224 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
225 } elsif (-d) { # '-d' = only directories
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
226 my $file = $_; # because of 'no_chdir => 1', $_ contains the full path to the file/dir
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
227 $file =~ s/.*\/(\S*)$/$1/; # get the folder name to create the filename
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
228 print "\nProcessing COMPLETE genome: $file\n"; # status message
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
229 $file =~ s/Escherichia_coli_/Ecoli_/; # shorten final E. coli file names
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
230 # $file =~ s/Shigella_/S/; # 'Shigella_D9' ends up as 'SD9'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
231 my $gbk_file = "$file.gbk";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
232 my $fas_file = "$file.fasta";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
233 dir_check($db); # subroutine to test if result directory exists, if not create; ALWAYS for first file since above 'rm_dir'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
234 system("cat $_/*.gbk > ./$db/$gbk_file"); # unix system call for concatenation
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
235 system("cat $_/*.fna > ./$db/$fas_file");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
236 my $gbk_count = seq_size($gbk_file); # subroutine to calculate total bases (A, C, G, and T) of genbank or fasta files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
237 my $fas_count = seq_size($fas_file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
238 # print "gbk_count: $gbk_count\tfas_count: $fas_count\n"; # print statement to test output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
239 warn_seq($fas_file, $gbk_count, $fas_count); # subroutine to warn if the genbank and the fasta file have differing sequence sizes
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
240 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
241 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
242 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
243
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
244
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
245 ### Count the genes in genbank files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
246 sub count_gene {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
247 my $file = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
248 my $gene_count = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
249 if (-e "./$db/$file") { # if the file doesn't exist (e.g. only scaffold file and not contig file) $gene_count will remain 0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
250 open(IN, "<./$db/$file") or die "Can't open file \'$file\': $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
251 while (<IN>) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
252 $gene_count++ if /\s{3,}gene\s{3,}/; # Some Genbank files only annotated with 'gene' tags (instead of 'gene' and 'CDS'); '\s{3,}' at least three whitespaces
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
253 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
254 close IN;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
255 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
256 return $gene_count;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
257 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
258
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
259
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
260 ### Subroutine to check for result directories (./genbank and ./refseq), if they don't exist create; ALWAYS as the prior directory with its contents is deleted at the start of the script with 'rm_dir'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
261 sub dir_check {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
262 my $dir = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
263 if (!-e $db) { # mkdir if not existent
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
264 mkdir $db or die "Couldn't create result directory \'$db\': $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
265 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
266 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
267 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
268
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
269
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
270 ### Subroutine to work through all possible DRAFT files and call next subroutine 'unpack_concat'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
271 sub drafts {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
272 my @result; # stores results of sub 'unpack_concat'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
273 push(@result, unpack_concat('contig.gbk')); # subroutine to unpack and concat respective files; returns nothing if argument (here 'contig.gbk') doesn't fit to filename and $result[x] will be undefined
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
274 push(@result, unpack_concat('scaffold.gbk'));
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
275 push(@result, unpack_concat('contig.fna'));
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
276 push(@result, unpack_concat('scaffold.fna'));
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
277 foreach (@result) { # for each succesful subroutine run @result contains one element, i.e. the concatenated filename
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
278 $draft_files{$_} = 1; # save filename in hash '%draft_files' if array element '$result[x]' defined
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
279 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
280 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
281 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
282
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
283
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
284 ### Subroutine to remove the prior result directories (./genbank and ./refseq) and all their contents, before new files are written in
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
285 sub rm_dir {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
286 my $directory = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
287 if (-e $directory) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
288 opendir (DIR, $directory) or die "Can't opendir $directory: $!"; # system unix call 'rm -rf' is too dangerous
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
289 while (defined(my $file = readdir(DIR))) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
290 if ($file =~ /^\.+$/) { # skip system folders (. and ..)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
291 next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
292 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
293 unlink "./$directory/$file" or die "Can't remove file \'$file\': $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
294 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
295 close DIR;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
296 rmdir $directory or die "Can't remove directory \'$directory\': $!\n"; # delete empty directory
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
297 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
298 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
299 return 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
300 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
301
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
302
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
303 ### Test for file existence and remove file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
304 sub rm_exist_file {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
305 my ($db, $file) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
306 if (-e "./$db/$file") {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
307 unlink "./$db/$file" or die "Can't remove file \'$file\': $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
308 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
309 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
310 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
311
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
312
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
313 ### Subroutine to remove the unpacked files and clean up
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
314 sub rm_files {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
315 my ($directory, $assembly) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
316 opendir (DIR, $directory) or die "Can't opendir \'$directory\' to remove the unpacked files: $!";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
317 $assembly =~ s/.*(\..*)$/$1/; # get the file-extension to remove files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
318 while (defined(my $file = readdir(DIR))) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
319 if ($file =~ /.*$assembly$/) { # Why does it not work to include: '-f $file &&'?
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
320 unlink "$directory/$file";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
321 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
322 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
323 close DIR;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
324 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
325 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
326
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
327
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
328 ### Calculate the total sequence in multi-genbanks and -fastas
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
329 sub seq_size {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
330 my $file = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
331 my $seq_count = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
332 if (-e "./$db/$file") { # if the file doesn't exist (e.g. only scaffold file and not contig file) $seq_count will remain 0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
333 $A = $C = $G = $T = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
334 open(IN, "<./$db/$file") or die "Can't open file \'$file\': $!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
335 if ($file =~ /.*\.gbk/) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
336 while (my $seq = <IN>) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
337 if ($seq =~ /^ORIGIN\s*$/) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
338 $seq = <IN>; # don't want the ORIGIN line, but the next one onwards until '//'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
339 while ($seq !~ /\/\//) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
340 base_count($seq);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
341 $seq = <IN>;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
342 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
343 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
344 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
345 } elsif ($file =~ /.*\.fasta$/) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
346 while (my $seq = <IN>) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
347 if ($seq =~ /^>/) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
348 next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
349 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
350 base_count($seq);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
351 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
352 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
353 close IN;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
354 $seq_count = $A + $C + $G + $T;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
355 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
356 return $seq_count;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
357 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
358
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
359
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
360 ### Subroutine to unzip, unpack all DRAFT files with *.tgz, and concatenate the contigs/scaffolds in the result folder
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
361 sub unpack_concat {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
362 my $assembly = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
363 if (-f && /.*\.$assembly\.tgz/) { # only if $_ is a file (-f) and fits to $assembly.tgz; because of 'no_chdir => 1' $_ contains the full path to the file (with filename)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
364 my $file = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
365 my $directory = $File::Find::dir; # internal variable for File::Find, contains the directory path to the file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
366 system("tar xfz $file --directory=$directory"); # system call to unpack the $file.tgz
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
367 $file = $File::Find::dir; # replace path+file to just directory path
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
368 $file =~ s/.*\/(\S*)$/$1/; # get the folder name to create the filename
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
369 $file =~ s/Escherichia_coli_/Ecoli_/; # shorten final E. coli file names
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
370 # $file =~ s/Shigella_/S/; # 'Shigella_D9' ends up as 'SD9'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
371 dir_check($db); # subroutine to test if result directory exists, if not create; ALWAYS for first file since above 'rm_dir'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
372 if ($assembly =~ /contig.gbk/) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
373 $file = $file . '_draft_con.gbk';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
374 system("cat $directory/*.gbk > ./$db/$file");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
375 rm_files($directory, $assembly); # subroutine to remove unpacked files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
376 return $file; # @results in sub 'drafts' defined
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
377 } elsif ($assembly =~ /scaffold.gbk/) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
378 $file = $file . '_draft_scaf.gbk';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
379 system("cat $directory/*.gbk > ./$db/$file");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
380 rm_files($directory, $assembly);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
381 return $file;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
382 } elsif ($assembly =~ /contig.fna/) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
383 $file = $file . '_draft_con.fasta';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
384 system("cat $directory/*.fna > ./$db/$file");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
385 rm_files($directory, $assembly);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
386 return $file;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
387 } elsif ($assembly =~ /scaffold.fna/) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
388 $file = $file . '_draft_scaf.fasta';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
389 system("cat $directory/*.fna > ./$db/$file");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
390 rm_files($directory, $assembly);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
391 return $file;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
392 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
393 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
394 return; # return undefined to scan @results for the concatenated file in subroutine 'drafts'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
395 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
396
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
397
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
398 ### Subroutine to warn if the genbank and corresponding fasta file have differing sequence
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
399 sub warn_seq {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
400 my ($file, $gbk_size, $fasta_size) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
401 if ($gbk_size != $fasta_size) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
402 $file =~ s/\.fasta//; # get rid of file-extension
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
403 open (ERR, ">>$err");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
404 print ERR "There is a difference in the sequence length of corresponding files \'$file\.gbk: $gbk_size\' and \'$file\.fasta: $fasta_size\'!\n\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
405 close ERR;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
406 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
407 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
408 }