1
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4
|
|
5 use File::Basename;
|
|
6 my $dirname = dirname(__FILE__);
|
|
7
|
|
8 system("wget https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt");
|
|
9 system("wget https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt");
|
|
10
|
|
11 my %continents;
|
|
12 open(F,"countries.txt");
|
|
13 <F>;
|
|
14 while(my $line =<F>){
|
|
15 chomp($line);
|
|
16 my ($continent,$country) = split(/,/,$line);
|
|
17 $continents{$country} = $continent;
|
|
18 }
|
|
19 close(F);
|
|
20
|
|
21 my $input = $ARGV[0];
|
|
22 my $outdir = $ARGV[1];
|
|
23 my $private_genomes = $ARGV[2];
|
|
24
|
|
25 system("cat $input $private_genomes >$outdir/list.txt");
|
|
26
|
|
27 my $concat = "";
|
|
28 open(O2,">$outdir/genbanks.txt");
|
|
29 open(O,">$outdir/strains.txt");
|
|
30 open(GENES,">$outdir/genes.txt");
|
|
31 open(L,">$outdir/list_genomes.txt");
|
|
32 open(L2,">$outdir/list_genomes2.txt");
|
|
33 open(L3,">$outdir/genomes.txt");
|
|
34 open(L4,">$outdir/genomes2.txt");
|
|
35 open(SEQFILE,">$outdir/seqfile");
|
|
36 open(PanSN,">$outdir/all_genomes.fa");
|
|
37 open(METADATA,">$outdir/metadata_strains.txt");
|
|
38
|
|
39 open(F,"$outdir/list.txt");
|
|
40 #open(TEST,">$outdir/test");
|
|
41 while(my $line =<F>){
|
|
42 chomp($line);
|
|
43 my $genbank = $line;
|
|
44 if (!-e "$genbank"){
|
|
45 my $grep = `grep '$line' prokaryotes.txt`;
|
|
46 #print "$genbank $line aaa $grep\n";exit;
|
|
47 my @infos = split(/\t/,$grep);
|
|
48 my $status = $infos[15];
|
|
49 if ($status !~/Complete Genome/ && $status !~/Chromosome/){
|
|
50 #next;
|
|
51 }
|
|
52 my $ftp_path = $infos[$#infos -2];
|
|
53
|
|
54 $ftp_path =~s/ftp:/http:/g;
|
|
55 my @table = split(/\//,$ftp_path);
|
|
56 my $name = $table[$#table];
|
|
57 my $prot_file = "$ftp_path/$name"."_protein.faa.gz";
|
|
58 my $gbff = "$ftp_path/$name"."_genomic.gbff.gz";
|
|
59 my $gff = "$ftp_path/$name"."_genomic.gff.gz";
|
|
60 my $genome_fasta = "$ftp_path/$name"."_genomic.fna.gz";
|
|
61 my @particules = split(/_/,$name);
|
|
62
|
|
63 `wget -O $outdir/$genbank.fasta.gz $genome_fasta`;
|
|
64 `gunzip $outdir/$genbank.fasta.gz`;
|
|
65 `wget -O $outdir/$genbank.gb.gz $gbff`;
|
|
66 system("gunzip $outdir/$genbank.gb.gz");
|
|
67
|
|
68
|
|
69 ################################################################
|
|
70 # for eukaryotes
|
|
71 ################################################################
|
|
72 if (!$grep){
|
|
73 $grep = `grep '$line' eukaryotes.txt`;
|
|
74 my @infos = split(/\t/,$grep);
|
|
75 my $gca = $infos[8];
|
|
76 if ($gca =~/GCA_(\d\d\d)(\d\d\d)(\d\d\d)/){
|
|
77 my $part1 = $1;
|
|
78 $ftp_path = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/$1/$2/$3";
|
|
79 `wget -O $outdir/$gca.index.html $ftp_path`;
|
|
80 my $grep_name = `grep '$gca' $outdir/$gca.index.html`;
|
|
81 unlink("$outdir/$gca.index.html");
|
|
82 my $name;
|
|
83 if ($grep_name =~/href=\"(.*)\"/){
|
|
84 $name = $1;
|
|
85 $name=~s/\///g;
|
|
86 }
|
|
87 $ftp_path = $ftp_path."/$name";
|
|
88 my $prot_file = "$ftp_path/$name"."_protein.faa.gz";
|
|
89 my $gbff = "$ftp_path/$name"."_genomic.gbff.gz";
|
|
90 my $gff = "$ftp_path/$name"."_genomic.gff.gz";
|
|
91 my $genome_fasta = "$ftp_path/$name"."_genomic.fna.gz";
|
|
92 my @particules = split(/_/,$name);
|
|
93
|
|
94 `wget -O $outdir/$genbank.fasta.gz $genome_fasta`;
|
|
95 `gunzip $outdir/$genbank.fasta.gz`;
|
|
96 `wget -O $outdir/$genbank.gb.gz $gbff`;
|
|
97 `wget -O $outdir/$genbank.faa.gz $prot_file`;
|
|
98 system("gunzip $outdir/$genbank.gb.gz");
|
|
99 system("gunzip $outdir/$genbank.faa.gz");
|
|
100
|
|
101 }
|
|
102 }
|
|
103 }
|
|
104 else{
|
|
105 my $genbank_file = $genbank;
|
|
106 my $grep = `grep 'LOCUS' $genbank_file`;
|
|
107 $genbank = "unknown";
|
|
108 if ($grep =~/LOCUS\s+([\-\:\w]+)/){$genbank = $1;}
|
|
109
|
|
110 #$genbank =~s/\:/_/g;
|
|
111
|
|
112 my $cmd = "cp -rf $genbank_file $outdir/$genbank.gb";
|
|
113 system($cmd);
|
|
114
|
|
115 my %genome_seqs;
|
|
116 my $current_chr;
|
|
117 my $go = 0;
|
|
118 open(G,"$outdir/$genbank.gb");
|
|
119 while(<G>){
|
|
120 if ($go == 1 && /(\d+) (.*)$/){
|
|
121 my $line = $2;
|
|
122 $line =~s/ //g;
|
|
123 $genome_seqs{$current_chr}.=$line;
|
|
124 }
|
|
125 if (/LOCUS ([^\s]+)/){
|
|
126 $current_chr = $1;
|
|
127 }
|
|
128 if (/ORIGIN/){$go = 1;}
|
|
129 if (/^\/\//){$go = 0;}
|
|
130 }
|
|
131 close(G);
|
|
132
|
|
133 open(FASTA,">$outdir/$genbank.fasta");
|
|
134 foreach my $ch(keys(%genome_seqs)){
|
|
135 print FASTA ">$ch\n";
|
|
136 my $seq = $genome_seqs{$ch};
|
|
137 print FASTA "$seq\n";
|
|
138 }
|
|
139 close(FASTA);
|
|
140 }
|
|
141 #my $get_organism_line = `head -10 $outdir/$genbank.gb | grep DEFINITION `;
|
|
142 my $get_organism_line = `head -10 $outdir/$genbank.gb | grep -A 1 DEFINITION `;
|
|
143
|
|
144 # if several lines for DEFINITION, concatenate the lines
|
|
145 my @lines_organism = split(/\n/,$get_organism_line);
|
|
146 my $first_line = $lines_organism[0];
|
|
147 my $second_line = $lines_organism[1];
|
|
148 if ($second_line =~/^ (.*)/){
|
|
149 $get_organism_line = $first_line. " ".$1;
|
|
150 }
|
|
151 else{
|
|
152 $get_organism_line = $first_line;
|
|
153 }
|
|
154
|
|
155 my $strain;
|
|
156 my $genus;
|
|
157 if ($get_organism_line =~/DEFINITION (.*)$/){
|
|
158 $strain = $1;
|
|
159 ($genus) = split(/\s/,$strain);
|
|
160 }
|
|
161 my $country = `head -100 $outdir/$genbank.gb | grep country `;
|
|
162 $country =~s/^\s+//g;
|
|
163 $country =~s/\/country=//g;
|
|
164 $country =~s/\"//g;
|
|
165 $country =~s/\n//g;$country =~s/\r//g;
|
|
166 if ($country =~/:/){
|
|
167 my $city;
|
|
168 ($country,$city) = split(/:/,$country);
|
|
169 }
|
|
170 if ($country eq ""){$country = "unresolved";}
|
|
171 my $continent = "unresolved";
|
|
172 if ($continents{$country}){
|
|
173 $continent = $continents{$country};
|
|
174 }
|
|
175 $strain =~s/\.//g;
|
|
176 my ($info1,$info2 ) = split(",",$strain);
|
|
177 $strain = $info1;
|
|
178 $strain =~s/ /_/g;
|
|
179 $strain =~s/strain_//g;
|
|
180 $strain =~s/_chromosome//g;
|
|
181 $strain =~s/_genome//g;
|
|
182 $strain =~s/_str_/_/g;
|
|
183 $strain =~s/[^\w\-\_]//g;
|
|
184 $strain =~s/\-/_/g;
|
|
185
|
|
186 print O "$genbank $strain\n";
|
|
187 $concat .= "$genbank,";
|
|
188 print L "$genbank $outdir/$genbank.gb\n";
|
|
189 print L2 "$genbank\n";
|
|
190 print L3 "$outdir/$genbank.fasta\n";
|
|
191 print L4 "$outdir/$genbank.fasta $strain\n";
|
|
192 print SEQFILE "$genbank $outdir/$genbank.fasta\n";
|
|
193 print METADATA "$strain\t$genus\t$country\t$continent\n";
|
|
194
|
|
195
|
|
196 my %genome_sequences;
|
|
197 my $genome = "";
|
|
198 my $seqid = "";
|
|
199 open(GENOME,"$outdir/$genbank.fasta");
|
|
200 while(<GENOME>){
|
|
201 if (!/^>/){
|
|
202 my $line = $_;
|
|
203 $line =~s/\n//g;$line =~s/\r//g;
|
|
204 $genome_sequences{$seqid} .= $line;
|
|
205 $genome .= $line;
|
|
206 print PanSN $_;
|
|
207 }
|
|
208 elsif (/>([^\s]+)/){
|
|
209 $seqid = $1;
|
|
210 $seqid =~s/\.\d+$//g;
|
|
211 print PanSN ">$strain#$seqid\n";
|
|
212 }
|
|
213 }
|
|
214 close(GENOME);
|
|
215
|
|
216
|
|
217 open(N,">$outdir/$genbank.nuc");
|
|
218 open(P,">$outdir/$genbank.pep");
|
|
219 open(FUNC,">$outdir/$genbank.func");
|
|
220 my $go = 0;
|
|
221 my $start;
|
|
222 my $end;
|
|
223 my $product;
|
|
224 my $complement = 0;
|
|
225 my $end_gene = "no";
|
|
226 my $protein = "";
|
|
227
|
|
228 #`sed -i "s/'//g" $outdir/$genbank.gb`;
|
|
229
|
|
230 my $has_translation = `grep -c 'translation=' $outdir/$genbank.gb`;
|
|
231 $has_translation =~s/\n//g;$has_translation =~s/\r//g;
|
|
232 open(G,"$outdir/$genbank.gb");
|
|
233 my $current_gene;
|
|
234 my $current_chrom;
|
|
235 while(<G>){
|
|
236 if (/^\s+ORGANISM\s+(.*)$/){
|
|
237 }
|
|
238 if (/protein_id=\"(.*)\"/){
|
|
239 $current_gene = $1;
|
|
240 }
|
|
241 if (/LOCUS ([^\s]+)/){
|
|
242 $current_chrom = $1;
|
|
243 }
|
|
244 if (/locus_tag=\"(.*)\"/){
|
|
245 $current_gene = $1;
|
|
246 }
|
|
247 if ($go == 1){
|
|
248 my $line = $_;
|
|
249 $line =~s/ //g;
|
|
250 $line =~s/\n//g;$line =~s/\r//g;
|
|
251 $protein .= $line;
|
|
252 if (/\"$/){
|
|
253 $protein =~s/\"//g;
|
|
254 $end_gene = "yes";
|
|
255
|
|
256 }
|
|
257 }
|
|
258 if (/\/translation=\"(.*)/ or ($has_translation == 0 && /protein_id=/)){
|
|
259 $go = 1;
|
|
260 $protein .= $1;
|
|
261 print P ">$current_gene\n";
|
|
262 print N ">$current_gene\n";
|
|
263 print GENES "$current_gene $product [$strain]\n";
|
|
264
|
|
265 if ($protein =~/\"$/){
|
|
266 $end_gene = "yes";
|
|
267 }
|
|
268 if ($has_translation == 0){$end_gene = "yes";}
|
|
269 }
|
|
270 if ($end_gene eq "yes"){
|
|
271 $protein =~s/\"//g;
|
|
272 print P "$protein\n";
|
|
273 $protein = "";
|
|
274 my $length = $end - $start + 1;
|
|
275
|
|
276 #print "okkkk $current_chrom\n";
|
|
277 #my $geneseq = substr($genome,$start-1,$length);
|
|
278 my $geneseq = substr($genome_sequences{$current_chrom},$start-1,$length);
|
|
279
|
|
280
|
|
281 if ($complement){
|
|
282 my $revcomp = reverse $geneseq;
|
|
283 $revcomp =~ tr/ATGCatgc/TACGtacg/;
|
|
284 $geneseq = $revcomp;
|
|
285 }
|
|
286 print N "$geneseq\n";
|
|
287 print FUNC "$current_gene - $product\n";
|
|
288 $go = 0;
|
|
289 $end_gene = "no";
|
|
290 }
|
|
291 if (/\/product=\"(.*)\"/){
|
|
292 $product = $1;
|
|
293 }
|
|
294 if (/^\s+CDS\s+(\d+)\.\.(\d+)\s*$/){
|
|
295 $start = $1;
|
|
296 $end = $2;
|
|
297 $complement = 0;
|
|
298 }
|
|
299 if (/^\s+CDS\s+complement\((\d+)\.\.(\d+)\)\s*$/){
|
|
300 $start = $1;
|
|
301 $end = $2;
|
|
302 $complement = 1;
|
|
303 }
|
|
304 }
|
|
305 close(G);
|
|
306 close(P);
|
|
307 close(N);
|
|
308 close(FUNC);
|
|
309
|
|
310 if ($has_translation == 0){
|
|
311 system("perl $dirname/translate.pl $outdir/$genbank.nuc $outdir/$genbank.pep");
|
|
312 }
|
|
313
|
|
314 my $prot_num = 0;
|
|
315 open(PRT,">$outdir/$genbank.prt");
|
|
316 open(P,"$outdir/$genbank.pep");
|
|
317 while(<P>){
|
|
318 if (/>(.*)/){
|
|
319 my $prot_id = $1;
|
|
320 $prot_num++;
|
|
321 my $new_id = "$strain"."_".$prot_num;
|
|
322 print PRT ">$new_id\n";
|
|
323 }
|
|
324 else{
|
|
325 print PRT $_;
|
|
326 }
|
|
327 }
|
|
328 close(P);
|
|
329 close(PRT);
|
|
330 }
|
|
331 close(F);
|
|
332 close(O);
|
|
333 close(METADATA);
|
|
334 chop($concat);
|
|
335 print O2 $concat;
|
|
336 close(O2);
|
|
337 close(L);
|
|
338 close(L2);
|
|
339 close(L3);
|
|
340 close(L4);
|
|
341 close(GENES);
|
|
342 close(SEQFILE);
|
|
343 close(PanSN);
|
|
344 #close(TEST);
|
|
345
|
|
346 unlink("prokaryotes.txt");
|
|
347 unlink("eukaryotes.txt");
|