comparison PanExplorer_workflow/Perl/get_data.pl @ 1:032f6b3806a3 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:16:08 +0000
parents
children
comparison
equal deleted inserted replaced
0:3cbb01081cde 1:032f6b3806a3
1 #!/usr/bin/perl
2
3 use strict;
4
5 use YAML qw(LoadFile);
6 use Data::Dumper qw(Dumper);
7 use File::Basename;
8 my $dirname = dirname(__FILE__);
9
10
11 my %continents;
12 open(F,"countries.txt");
13 <F>;
14 while(my $line =<F>){
15 chomp($line);
16 my ($continent,$country) = split(/,/,$line);
17 $continents{$country} = $continent;
18 }
19 close(F);
20
21 my $input_yml = $ARGV[0];
22 my $outdir = $ARGV[1];
23
24 open(LIST,">$outdir/list.txt");
25 my $data = LoadFile($input_yml);
26 my %hashtable = %$data;
27
28 if ($hashtable{"ids"}){
29 my $ref_ids = $hashtable{"ids"};
30 my @ids = @$ref_ids;
31 foreach my $id(@ids){
32 print LIST "$id\n";
33 }
34 }
35 if ($hashtable{"input_genbanks"}){
36 my $ref_gbs = $hashtable{"input_genbanks"};
37 my @gbs = @$ref_gbs;
38 foreach my $gb(@gbs){
39 print LIST "$gb\n";
40 }
41 }
42 close(LIST);
43
44
45
46 my $concat = "";
47 open(O2,">$outdir/genbanks.txt");
48 open(O,">$outdir/strains.txt");
49 open(GENES,">$outdir/genes.txt");
50 open(L,">$outdir/list_genomes.txt");
51 open(L2,">$outdir/list_genomes2.txt");
52 open(L3,">$outdir/genomes.txt");
53 open(L4,">$outdir/genomes2.txt");
54 open(SEQFILE,">$outdir/seqfile");
55 open(PanSN,">$outdir/all_genomes.fa");
56 open(METADATA,">$outdir/metadata_strains.txt");
57
58 open(F,"$outdir/list.txt");
59 #open(TEST,">$outdir/test");
60 while(my $line =<F>){
61 chomp($line);
62 my $genbank = $line;
63 if (!-e "$genbank"){
64
65 my $assembly_accession = $genbank;
66 system("datasets download genome accession --include-gbff --filename $outdir/$assembly_accession.zip $assembly_accession");
67 system("unzip -o $outdir/$assembly_accession.zip");
68 system("cp -rf ncbi_dataset/data/$assembly_accession/$assembly_accession*genomic.fna $outdir/$genbank.fasta");
69 system("cp -rf ncbi_dataset/data/$assembly_accession/genomic.gbff $outdir/$genbank.gb");
70
71
72 }
73 else{
74 my $genbank_file = $genbank;
75 my $grep = `grep 'LOCUS' $genbank_file`;
76 $genbank = "unknown";
77 if ($grep =~/LOCUS\s+([\-\:\w]+)/){$genbank = $1;}
78
79 #$genbank =~s/\:/_/g;
80
81 my $cmd = "cp -rf $genbank_file $outdir/$genbank.gb";
82 system($cmd);
83
84 my %genome_seqs;
85 my $current_chr;
86 my $go = 0;
87 open(G,"$outdir/$genbank.gb");
88 while(<G>){
89 if ($go == 1 && /(\d+) (.*)$/){
90 my $line = $2;
91 $line =~s/ //g;
92 $genome_seqs{$current_chr}.=$line;
93 }
94 if (/LOCUS ([^\s]+)/){
95 $current_chr = $1;
96 }
97 if (/ORIGIN/){$go = 1;}
98 if (/^\/\//){$go = 0;}
99 }
100 close(G);
101
102 open(FASTA,">$outdir/$genbank.fasta");
103 foreach my $ch(keys(%genome_seqs)){
104 print FASTA ">$ch\n";
105 my $seq = $genome_seqs{$ch};
106 print FASTA "$seq\n";
107 }
108 close(FASTA);
109 }
110 #my $get_organism_line = `head -10 $outdir/$genbank.gb | grep DEFINITION `;
111
112 # remove single quote in genbank file
113 `sed -i "s/'//g" $outdir/$genbank.gb`;
114 my $get_organism_line = `head -10 $outdir/$genbank.gb | grep -A 1 DEFINITION `;
115
116 # if several lines for DEFINITION, concatenate the lines
117 my @lines_organism = split(/\n/,$get_organism_line);
118 my $first_line = $lines_organism[0];
119 my $second_line = $lines_organism[1];
120 if ($second_line =~/^ (.*)/){
121 $get_organism_line = $first_line. " ".$1;
122 }
123 else{
124 $get_organism_line = $first_line;
125 }
126
127 my $strain;
128 my $genus;
129 if ($get_organism_line =~/DEFINITION (.*)$/){
130 $strain = $1;
131 ($genus) = split(/\s/,$strain);
132 }
133 my $country = `head -100 $outdir/$genbank.gb | grep country `;
134 $country =~s/^\s+//g;
135 $country =~s/\/country=//g;
136 $country =~s/\"//g;
137 $country =~s/\n//g;$country =~s/\r//g;
138 if ($country =~/:/){
139 my $city;
140 ($country,$city) = split(/:/,$country);
141 }
142 if ($country eq ""){$country = "unresolved";}
143 my $continent = "unresolved";
144 if ($continents{$country}){
145 $continent = $continents{$country};
146 }
147 $strain =~s/\.//g;
148 my ($info1,$info2 ) = split(",",$strain);
149 $strain = $info1;
150 $strain =~s/ /_/g;
151 $strain =~s/strain_//g;
152 $strain =~s/_chromosome//g;
153 $strain =~s/_genome//g;
154 $strain =~s/_str_/_/g;
155 $strain =~s/[^\w\-\_]//g;
156 $strain =~s/\-/_/g;
157
158 print O "$genbank $strain\n";
159 $concat .= "$genbank,";
160 print L "$genbank $outdir/$genbank.gb\n";
161 print L2 "$genbank\n";
162 print L3 "$outdir/$genbank.fasta\n";
163 print L4 "$outdir/$genbank.fasta $strain\n";
164 print SEQFILE "$genbank $outdir/$genbank.fasta\n";
165 print METADATA "$strain\t$genus\t$country\t$continent\n";
166
167
168 my %genome_sequences;
169 my $genome = "";
170 my $seqid = "";
171 open(GENOME,"$outdir/$genbank.fasta");
172 while(<GENOME>){
173 if (!/^>/){
174 my $line = $_;
175 $line =~s/\n//g;$line =~s/\r//g;
176 $genome_sequences{$seqid} .= $line;
177 $genome .= $line;
178 print PanSN $_;
179 }
180 elsif (/>([^\s]+)/){
181 $seqid = $1;
182 $seqid =~s/\.\d+$//g;
183 print PanSN ">$strain#$seqid\n";
184 }
185 }
186 close(GENOME);
187
188
189 open(N,">$outdir/$genbank.nuc");
190 open(P,">$outdir/$genbank.pep");
191 open(FUNC,">$outdir/$genbank.func");
192 my $go = 0;
193 my $start;
194 my $end;
195 my $product;
196 my $complement = 0;
197 my $end_gene = "no";
198 my $protein = "";
199 my $has_translation = `grep -c 'translation=' $outdir/$genbank.gb`;
200 $has_translation =~s/\n//g;$has_translation =~s/\r//g;
201 open(G,"$outdir/$genbank.gb");
202 my $current_gene;
203 my $current_chrom;
204 while(<G>){
205 if (/^\s+ORGANISM\s+(.*)$/){
206 }
207 if (/protein_id=\"(.*)\"/){
208 $current_gene = $1;
209 }
210 if (/LOCUS ([^\s]+)/){
211 $current_chrom = $1;
212 }
213 if (/locus_tag=\"(.*)\"/){
214 $current_gene = $1;
215 }
216 if ($go == 1){
217 my $line = $_;
218 $line =~s/ //g;
219 $line =~s/\n//g;$line =~s/\r//g;
220 $protein .= $line;
221 if (/\"$/){
222 $protein =~s/\"//g;
223 $end_gene = "yes";
224
225 }
226 }
227 if (/\/translation=\"(.*)/ or ($has_translation == 0 && /protein_id=/)){
228 $go = 1;
229 $protein .= $1;
230 print P ">$current_gene\n";
231 print N ">$current_gene\n";
232 print GENES "$current_gene $product [$strain]\n";
233
234 if ($protein =~/\"$/){
235 $end_gene = "yes";
236 }
237 if ($has_translation == 0){$end_gene = "yes";}
238 }
239 if ($end_gene eq "yes"){
240 $protein =~s/\"//g;
241 print P "$protein\n";
242 $protein = "";
243 my $length = $end - $start + 1;
244
245 #print "okkkk $current_chrom\n";
246 #my $geneseq = substr($genome,$start-1,$length);
247 my $geneseq = substr($genome_sequences{$current_chrom},$start-1,$length);
248
249
250 if ($complement){
251 my $revcomp = reverse $geneseq;
252 $revcomp =~ tr/ATGCatgc/TACGtacg/;
253 $geneseq = $revcomp;
254 }
255 print N "$geneseq\n";
256 print FUNC "$current_gene - $product\n";
257 $go = 0;
258 $end_gene = "no";
259 }
260 if (/\/product=\"(.*)\"/){
261 $product = $1;
262 }
263 if (/^\s+CDS\s+(\d+)\.\.(\d+)\s*$/){
264 $start = $1;
265 $end = $2;
266 $complement = 0;
267 }
268 if (/^\s+CDS\s+complement\((\d+)\.\.(\d+)\)\s*$/){
269 $start = $1;
270 $end = $2;
271 $complement = 1;
272 }
273 }
274 close(G);
275 close(P);
276 close(N);
277 close(FUNC);
278
279 if ($has_translation == 0){
280 system("perl $dirname/translate.pl $outdir/$genbank.nuc $outdir/$genbank.pep");
281 }
282
283 my $prot_num = 0;
284 open(PRT,">$outdir/$genbank.prt");
285 open(P,"$outdir/$genbank.pep");
286 while(<P>){
287 if (/>(.*)/){
288 my $prot_id = $1;
289 $prot_num++;
290 my $new_id = "$strain"."_".$prot_num;
291 print PRT ">$new_id\n";
292 }
293 else{
294 print PRT $_;
295 }
296 }
297 close(P);
298 close(PRT);
299 }
300 close(F);
301 close(O);
302 close(METADATA);
303 chop($concat);
304 print O2 $concat;
305 close(O2);
306 close(L);
307 close(L2);
308 close(L3);
309 close(L4);
310 close(GENES);
311 close(SEQFILE);
312 close(PanSN);
313 #close(TEST);
314
315 unlink("prokaryotes.txt");
316 unlink("eukaryotes.txt");