Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/Perl/wget.pl @ 1:032f6b3806a3 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:16:08 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:3cbb01081cde | 1:032f6b3806a3 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 | |
5 use File::Basename; | |
6 my $dirname = dirname(__FILE__); | |
7 | |
8 system("wget https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt"); | |
9 system("wget https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"); | |
10 | |
11 my %continents; | |
12 open(F,"countries.txt"); | |
13 <F>; | |
14 while(my $line =<F>){ | |
15 chomp($line); | |
16 my ($continent,$country) = split(/,/,$line); | |
17 $continents{$country} = $continent; | |
18 } | |
19 close(F); | |
20 | |
21 my $input = $ARGV[0]; | |
22 my $outdir = $ARGV[1]; | |
23 my $private_genomes = $ARGV[2]; | |
24 | |
25 system("cat $input $private_genomes >$outdir/list.txt"); | |
26 | |
27 my $concat = ""; | |
28 open(O2,">$outdir/genbanks.txt"); | |
29 open(O,">$outdir/strains.txt"); | |
30 open(GENES,">$outdir/genes.txt"); | |
31 open(L,">$outdir/list_genomes.txt"); | |
32 open(L2,">$outdir/list_genomes2.txt"); | |
33 open(L3,">$outdir/genomes.txt"); | |
34 open(L4,">$outdir/genomes2.txt"); | |
35 open(SEQFILE,">$outdir/seqfile"); | |
36 open(PanSN,">$outdir/all_genomes.fa"); | |
37 open(METADATA,">$outdir/metadata_strains.txt"); | |
38 | |
39 open(F,"$outdir/list.txt"); | |
40 #open(TEST,">$outdir/test"); | |
41 while(my $line =<F>){ | |
42 chomp($line); | |
43 my $genbank = $line; | |
44 if (!-e "$genbank"){ | |
45 my $grep = `grep '$line' prokaryotes.txt`; | |
46 #print "$genbank $line aaa $grep\n";exit; | |
47 my @infos = split(/\t/,$grep); | |
48 my $status = $infos[15]; | |
49 if ($status !~/Complete Genome/ && $status !~/Chromosome/){ | |
50 #next; | |
51 } | |
52 my $ftp_path = $infos[$#infos -2]; | |
53 | |
54 $ftp_path =~s/ftp:/http:/g; | |
55 my @table = split(/\//,$ftp_path); | |
56 my $name = $table[$#table]; | |
57 my $prot_file = "$ftp_path/$name"."_protein.faa.gz"; | |
58 my $gbff = "$ftp_path/$name"."_genomic.gbff.gz"; | |
59 my $gff = "$ftp_path/$name"."_genomic.gff.gz"; | |
60 my $genome_fasta = "$ftp_path/$name"."_genomic.fna.gz"; | |
61 my @particules = split(/_/,$name); | |
62 | |
63 `wget -O $outdir/$genbank.fasta.gz $genome_fasta`; | |
64 `gunzip $outdir/$genbank.fasta.gz`; | |
65 `wget -O $outdir/$genbank.gb.gz $gbff`; | |
66 system("gunzip $outdir/$genbank.gb.gz"); | |
67 | |
68 | |
69 ################################################################ | |
70 # for eukaryotes | |
71 ################################################################ | |
72 if (!$grep){ | |
73 $grep = `grep '$line' eukaryotes.txt`; | |
74 my @infos = split(/\t/,$grep); | |
75 my $gca = $infos[8]; | |
76 if ($gca =~/GCA_(\d\d\d)(\d\d\d)(\d\d\d)/){ | |
77 my $part1 = $1; | |
78 $ftp_path = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/$1/$2/$3"; | |
79 `wget -O $outdir/$gca.index.html $ftp_path`; | |
80 my $grep_name = `grep '$gca' $outdir/$gca.index.html`; | |
81 unlink("$outdir/$gca.index.html"); | |
82 my $name; | |
83 if ($grep_name =~/href=\"(.*)\"/){ | |
84 $name = $1; | |
85 $name=~s/\///g; | |
86 } | |
87 $ftp_path = $ftp_path."/$name"; | |
88 my $prot_file = "$ftp_path/$name"."_protein.faa.gz"; | |
89 my $gbff = "$ftp_path/$name"."_genomic.gbff.gz"; | |
90 my $gff = "$ftp_path/$name"."_genomic.gff.gz"; | |
91 my $genome_fasta = "$ftp_path/$name"."_genomic.fna.gz"; | |
92 my @particules = split(/_/,$name); | |
93 | |
94 `wget -O $outdir/$genbank.fasta.gz $genome_fasta`; | |
95 `gunzip $outdir/$genbank.fasta.gz`; | |
96 `wget -O $outdir/$genbank.gb.gz $gbff`; | |
97 `wget -O $outdir/$genbank.faa.gz $prot_file`; | |
98 system("gunzip $outdir/$genbank.gb.gz"); | |
99 system("gunzip $outdir/$genbank.faa.gz"); | |
100 | |
101 } | |
102 } | |
103 } | |
104 else{ | |
105 my $genbank_file = $genbank; | |
106 my $grep = `grep 'LOCUS' $genbank_file`; | |
107 $genbank = "unknown"; | |
108 if ($grep =~/LOCUS\s+([\-\:\w]+)/){$genbank = $1;} | |
109 | |
110 #$genbank =~s/\:/_/g; | |
111 | |
112 my $cmd = "cp -rf $genbank_file $outdir/$genbank.gb"; | |
113 system($cmd); | |
114 | |
115 my %genome_seqs; | |
116 my $current_chr; | |
117 my $go = 0; | |
118 open(G,"$outdir/$genbank.gb"); | |
119 while(<G>){ | |
120 if ($go == 1 && /(\d+) (.*)$/){ | |
121 my $line = $2; | |
122 $line =~s/ //g; | |
123 $genome_seqs{$current_chr}.=$line; | |
124 } | |
125 if (/LOCUS ([^\s]+)/){ | |
126 $current_chr = $1; | |
127 } | |
128 if (/ORIGIN/){$go = 1;} | |
129 if (/^\/\//){$go = 0;} | |
130 } | |
131 close(G); | |
132 | |
133 open(FASTA,">$outdir/$genbank.fasta"); | |
134 foreach my $ch(keys(%genome_seqs)){ | |
135 print FASTA ">$ch\n"; | |
136 my $seq = $genome_seqs{$ch}; | |
137 print FASTA "$seq\n"; | |
138 } | |
139 close(FASTA); | |
140 } | |
141 #my $get_organism_line = `head -10 $outdir/$genbank.gb | grep DEFINITION `; | |
142 my $get_organism_line = `head -10 $outdir/$genbank.gb | grep -A 1 DEFINITION `; | |
143 | |
144 # if several lines for DEFINITION, concatenate the lines | |
145 my @lines_organism = split(/\n/,$get_organism_line); | |
146 my $first_line = $lines_organism[0]; | |
147 my $second_line = $lines_organism[1]; | |
148 if ($second_line =~/^ (.*)/){ | |
149 $get_organism_line = $first_line. " ".$1; | |
150 } | |
151 else{ | |
152 $get_organism_line = $first_line; | |
153 } | |
154 | |
155 my $strain; | |
156 my $genus; | |
157 if ($get_organism_line =~/DEFINITION (.*)$/){ | |
158 $strain = $1; | |
159 ($genus) = split(/\s/,$strain); | |
160 } | |
161 my $country = `head -100 $outdir/$genbank.gb | grep country `; | |
162 $country =~s/^\s+//g; | |
163 $country =~s/\/country=//g; | |
164 $country =~s/\"//g; | |
165 $country =~s/\n//g;$country =~s/\r//g; | |
166 if ($country =~/:/){ | |
167 my $city; | |
168 ($country,$city) = split(/:/,$country); | |
169 } | |
170 if ($country eq ""){$country = "unresolved";} | |
171 my $continent = "unresolved"; | |
172 if ($continents{$country}){ | |
173 $continent = $continents{$country}; | |
174 } | |
175 $strain =~s/\.//g; | |
176 my ($info1,$info2 ) = split(",",$strain); | |
177 $strain = $info1; | |
178 $strain =~s/ /_/g; | |
179 $strain =~s/strain_//g; | |
180 $strain =~s/_chromosome//g; | |
181 $strain =~s/_genome//g; | |
182 $strain =~s/_str_/_/g; | |
183 $strain =~s/[^\w\-\_]//g; | |
184 $strain =~s/\-/_/g; | |
185 | |
186 print O "$genbank $strain\n"; | |
187 $concat .= "$genbank,"; | |
188 print L "$genbank $outdir/$genbank.gb\n"; | |
189 print L2 "$genbank\n"; | |
190 print L3 "$outdir/$genbank.fasta\n"; | |
191 print L4 "$outdir/$genbank.fasta $strain\n"; | |
192 print SEQFILE "$genbank $outdir/$genbank.fasta\n"; | |
193 print METADATA "$strain\t$genus\t$country\t$continent\n"; | |
194 | |
195 | |
196 my %genome_sequences; | |
197 my $genome = ""; | |
198 my $seqid = ""; | |
199 open(GENOME,"$outdir/$genbank.fasta"); | |
200 while(<GENOME>){ | |
201 if (!/^>/){ | |
202 my $line = $_; | |
203 $line =~s/\n//g;$line =~s/\r//g; | |
204 $genome_sequences{$seqid} .= $line; | |
205 $genome .= $line; | |
206 print PanSN $_; | |
207 } | |
208 elsif (/>([^\s]+)/){ | |
209 $seqid = $1; | |
210 $seqid =~s/\.\d+$//g; | |
211 print PanSN ">$strain#$seqid\n"; | |
212 } | |
213 } | |
214 close(GENOME); | |
215 | |
216 | |
217 open(N,">$outdir/$genbank.nuc"); | |
218 open(P,">$outdir/$genbank.pep"); | |
219 open(FUNC,">$outdir/$genbank.func"); | |
220 my $go = 0; | |
221 my $start; | |
222 my $end; | |
223 my $product; | |
224 my $complement = 0; | |
225 my $end_gene = "no"; | |
226 my $protein = ""; | |
227 | |
228 #`sed -i "s/'//g" $outdir/$genbank.gb`; | |
229 | |
230 my $has_translation = `grep -c 'translation=' $outdir/$genbank.gb`; | |
231 $has_translation =~s/\n//g;$has_translation =~s/\r//g; | |
232 open(G,"$outdir/$genbank.gb"); | |
233 my $current_gene; | |
234 my $current_chrom; | |
235 while(<G>){ | |
236 if (/^\s+ORGANISM\s+(.*)$/){ | |
237 } | |
238 if (/protein_id=\"(.*)\"/){ | |
239 $current_gene = $1; | |
240 } | |
241 if (/LOCUS ([^\s]+)/){ | |
242 $current_chrom = $1; | |
243 } | |
244 if (/locus_tag=\"(.*)\"/){ | |
245 $current_gene = $1; | |
246 } | |
247 if ($go == 1){ | |
248 my $line = $_; | |
249 $line =~s/ //g; | |
250 $line =~s/\n//g;$line =~s/\r//g; | |
251 $protein .= $line; | |
252 if (/\"$/){ | |
253 $protein =~s/\"//g; | |
254 $end_gene = "yes"; | |
255 | |
256 } | |
257 } | |
258 if (/\/translation=\"(.*)/ or ($has_translation == 0 && /protein_id=/)){ | |
259 $go = 1; | |
260 $protein .= $1; | |
261 print P ">$current_gene\n"; | |
262 print N ">$current_gene\n"; | |
263 print GENES "$current_gene $product [$strain]\n"; | |
264 | |
265 if ($protein =~/\"$/){ | |
266 $end_gene = "yes"; | |
267 } | |
268 if ($has_translation == 0){$end_gene = "yes";} | |
269 } | |
270 if ($end_gene eq "yes"){ | |
271 $protein =~s/\"//g; | |
272 print P "$protein\n"; | |
273 $protein = ""; | |
274 my $length = $end - $start + 1; | |
275 | |
276 #print "okkkk $current_chrom\n"; | |
277 #my $geneseq = substr($genome,$start-1,$length); | |
278 my $geneseq = substr($genome_sequences{$current_chrom},$start-1,$length); | |
279 | |
280 | |
281 if ($complement){ | |
282 my $revcomp = reverse $geneseq; | |
283 $revcomp =~ tr/ATGCatgc/TACGtacg/; | |
284 $geneseq = $revcomp; | |
285 } | |
286 print N "$geneseq\n"; | |
287 print FUNC "$current_gene - $product\n"; | |
288 $go = 0; | |
289 $end_gene = "no"; | |
290 } | |
291 if (/\/product=\"(.*)\"/){ | |
292 $product = $1; | |
293 } | |
294 if (/^\s+CDS\s+(\d+)\.\.(\d+)\s*$/){ | |
295 $start = $1; | |
296 $end = $2; | |
297 $complement = 0; | |
298 } | |
299 if (/^\s+CDS\s+complement\((\d+)\.\.(\d+)\)\s*$/){ | |
300 $start = $1; | |
301 $end = $2; | |
302 $complement = 1; | |
303 } | |
304 } | |
305 close(G); | |
306 close(P); | |
307 close(N); | |
308 close(FUNC); | |
309 | |
310 if ($has_translation == 0){ | |
311 system("perl $dirname/translate.pl $outdir/$genbank.nuc $outdir/$genbank.pep"); | |
312 } | |
313 | |
314 my $prot_num = 0; | |
315 open(PRT,">$outdir/$genbank.prt"); | |
316 open(P,"$outdir/$genbank.pep"); | |
317 while(<P>){ | |
318 if (/>(.*)/){ | |
319 my $prot_id = $1; | |
320 $prot_num++; | |
321 my $new_id = "$strain"."_".$prot_num; | |
322 print PRT ">$new_id\n"; | |
323 } | |
324 else{ | |
325 print PRT $_; | |
326 } | |
327 } | |
328 close(P); | |
329 close(PRT); | |
330 } | |
331 close(F); | |
332 close(O); | |
333 close(METADATA); | |
334 chop($concat); | |
335 print O2 $concat; | |
336 close(O2); | |
337 close(L); | |
338 close(L2); | |
339 close(L3); | |
340 close(L4); | |
341 close(GENES); | |
342 close(SEQFILE); | |
343 close(PanSN); | |
344 #close(TEST); | |
345 | |
346 unlink("prokaryotes.txt"); | |
347 unlink("eukaryotes.txt"); |