Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/Perl/get_data.pl @ 1:032f6b3806a3 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:16:08 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:3cbb01081cde | 1:032f6b3806a3 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 | |
5 use YAML qw(LoadFile); | |
6 use Data::Dumper qw(Dumper); | |
7 use File::Basename; | |
8 my $dirname = dirname(__FILE__); | |
9 | |
10 | |
11 my %continents; | |
12 open(F,"countries.txt"); | |
13 <F>; | |
14 while(my $line =<F>){ | |
15 chomp($line); | |
16 my ($continent,$country) = split(/,/,$line); | |
17 $continents{$country} = $continent; | |
18 } | |
19 close(F); | |
20 | |
21 my $input_yml = $ARGV[0]; | |
22 my $outdir = $ARGV[1]; | |
23 | |
24 open(LIST,">$outdir/list.txt"); | |
25 my $data = LoadFile($input_yml); | |
26 my %hashtable = %$data; | |
27 | |
28 if ($hashtable{"ids"}){ | |
29 my $ref_ids = $hashtable{"ids"}; | |
30 my @ids = @$ref_ids; | |
31 foreach my $id(@ids){ | |
32 print LIST "$id\n"; | |
33 } | |
34 } | |
35 if ($hashtable{"input_genbanks"}){ | |
36 my $ref_gbs = $hashtable{"input_genbanks"}; | |
37 my @gbs = @$ref_gbs; | |
38 foreach my $gb(@gbs){ | |
39 print LIST "$gb\n"; | |
40 } | |
41 } | |
42 close(LIST); | |
43 | |
44 | |
45 | |
46 my $concat = ""; | |
47 open(O2,">$outdir/genbanks.txt"); | |
48 open(O,">$outdir/strains.txt"); | |
49 open(GENES,">$outdir/genes.txt"); | |
50 open(L,">$outdir/list_genomes.txt"); | |
51 open(L2,">$outdir/list_genomes2.txt"); | |
52 open(L3,">$outdir/genomes.txt"); | |
53 open(L4,">$outdir/genomes2.txt"); | |
54 open(SEQFILE,">$outdir/seqfile"); | |
55 open(PanSN,">$outdir/all_genomes.fa"); | |
56 open(METADATA,">$outdir/metadata_strains.txt"); | |
57 | |
58 open(F,"$outdir/list.txt"); | |
59 #open(TEST,">$outdir/test"); | |
60 while(my $line =<F>){ | |
61 chomp($line); | |
62 my $genbank = $line; | |
63 if (!-e "$genbank"){ | |
64 | |
65 my $assembly_accession = $genbank; | |
66 system("datasets download genome accession --include-gbff --filename $outdir/$assembly_accession.zip $assembly_accession"); | |
67 system("unzip -o $outdir/$assembly_accession.zip"); | |
68 system("cp -rf ncbi_dataset/data/$assembly_accession/$assembly_accession*genomic.fna $outdir/$genbank.fasta"); | |
69 system("cp -rf ncbi_dataset/data/$assembly_accession/genomic.gbff $outdir/$genbank.gb"); | |
70 | |
71 | |
72 } | |
73 else{ | |
74 my $genbank_file = $genbank; | |
75 my $grep = `grep 'LOCUS' $genbank_file`; | |
76 $genbank = "unknown"; | |
77 if ($grep =~/LOCUS\s+([\-\:\w]+)/){$genbank = $1;} | |
78 | |
79 #$genbank =~s/\:/_/g; | |
80 | |
81 my $cmd = "cp -rf $genbank_file $outdir/$genbank.gb"; | |
82 system($cmd); | |
83 | |
84 my %genome_seqs; | |
85 my $current_chr; | |
86 my $go = 0; | |
87 open(G,"$outdir/$genbank.gb"); | |
88 while(<G>){ | |
89 if ($go == 1 && /(\d+) (.*)$/){ | |
90 my $line = $2; | |
91 $line =~s/ //g; | |
92 $genome_seqs{$current_chr}.=$line; | |
93 } | |
94 if (/LOCUS ([^\s]+)/){ | |
95 $current_chr = $1; | |
96 } | |
97 if (/ORIGIN/){$go = 1;} | |
98 if (/^\/\//){$go = 0;} | |
99 } | |
100 close(G); | |
101 | |
102 open(FASTA,">$outdir/$genbank.fasta"); | |
103 foreach my $ch(keys(%genome_seqs)){ | |
104 print FASTA ">$ch\n"; | |
105 my $seq = $genome_seqs{$ch}; | |
106 print FASTA "$seq\n"; | |
107 } | |
108 close(FASTA); | |
109 } | |
110 #my $get_organism_line = `head -10 $outdir/$genbank.gb | grep DEFINITION `; | |
111 | |
112 # remove single quote in genbank file | |
113 `sed -i "s/'//g" $outdir/$genbank.gb`; | |
114 my $get_organism_line = `head -10 $outdir/$genbank.gb | grep -A 1 DEFINITION `; | |
115 | |
116 # if several lines for DEFINITION, concatenate the lines | |
117 my @lines_organism = split(/\n/,$get_organism_line); | |
118 my $first_line = $lines_organism[0]; | |
119 my $second_line = $lines_organism[1]; | |
120 if ($second_line =~/^ (.*)/){ | |
121 $get_organism_line = $first_line. " ".$1; | |
122 } | |
123 else{ | |
124 $get_organism_line = $first_line; | |
125 } | |
126 | |
127 my $strain; | |
128 my $genus; | |
129 if ($get_organism_line =~/DEFINITION (.*)$/){ | |
130 $strain = $1; | |
131 ($genus) = split(/\s/,$strain); | |
132 } | |
133 my $country = `head -100 $outdir/$genbank.gb | grep country `; | |
134 $country =~s/^\s+//g; | |
135 $country =~s/\/country=//g; | |
136 $country =~s/\"//g; | |
137 $country =~s/\n//g;$country =~s/\r//g; | |
138 if ($country =~/:/){ | |
139 my $city; | |
140 ($country,$city) = split(/:/,$country); | |
141 } | |
142 if ($country eq ""){$country = "unresolved";} | |
143 my $continent = "unresolved"; | |
144 if ($continents{$country}){ | |
145 $continent = $continents{$country}; | |
146 } | |
147 $strain =~s/\.//g; | |
148 my ($info1,$info2 ) = split(",",$strain); | |
149 $strain = $info1; | |
150 $strain =~s/ /_/g; | |
151 $strain =~s/strain_//g; | |
152 $strain =~s/_chromosome//g; | |
153 $strain =~s/_genome//g; | |
154 $strain =~s/_str_/_/g; | |
155 $strain =~s/[^\w\-\_]//g; | |
156 $strain =~s/\-/_/g; | |
157 | |
158 print O "$genbank $strain\n"; | |
159 $concat .= "$genbank,"; | |
160 print L "$genbank $outdir/$genbank.gb\n"; | |
161 print L2 "$genbank\n"; | |
162 print L3 "$outdir/$genbank.fasta\n"; | |
163 print L4 "$outdir/$genbank.fasta $strain\n"; | |
164 print SEQFILE "$genbank $outdir/$genbank.fasta\n"; | |
165 print METADATA "$strain\t$genus\t$country\t$continent\n"; | |
166 | |
167 | |
168 my %genome_sequences; | |
169 my $genome = ""; | |
170 my $seqid = ""; | |
171 open(GENOME,"$outdir/$genbank.fasta"); | |
172 while(<GENOME>){ | |
173 if (!/^>/){ | |
174 my $line = $_; | |
175 $line =~s/\n//g;$line =~s/\r//g; | |
176 $genome_sequences{$seqid} .= $line; | |
177 $genome .= $line; | |
178 print PanSN $_; | |
179 } | |
180 elsif (/>([^\s]+)/){ | |
181 $seqid = $1; | |
182 $seqid =~s/\.\d+$//g; | |
183 print PanSN ">$strain#$seqid\n"; | |
184 } | |
185 } | |
186 close(GENOME); | |
187 | |
188 | |
189 open(N,">$outdir/$genbank.nuc"); | |
190 open(P,">$outdir/$genbank.pep"); | |
191 open(FUNC,">$outdir/$genbank.func"); | |
192 my $go = 0; | |
193 my $start; | |
194 my $end; | |
195 my $product; | |
196 my $complement = 0; | |
197 my $end_gene = "no"; | |
198 my $protein = ""; | |
199 my $has_translation = `grep -c 'translation=' $outdir/$genbank.gb`; | |
200 $has_translation =~s/\n//g;$has_translation =~s/\r//g; | |
201 open(G,"$outdir/$genbank.gb"); | |
202 my $current_gene; | |
203 my $current_chrom; | |
204 while(<G>){ | |
205 if (/^\s+ORGANISM\s+(.*)$/){ | |
206 } | |
207 if (/protein_id=\"(.*)\"/){ | |
208 $current_gene = $1; | |
209 } | |
210 if (/LOCUS ([^\s]+)/){ | |
211 $current_chrom = $1; | |
212 } | |
213 if (/locus_tag=\"(.*)\"/){ | |
214 $current_gene = $1; | |
215 } | |
216 if ($go == 1){ | |
217 my $line = $_; | |
218 $line =~s/ //g; | |
219 $line =~s/\n//g;$line =~s/\r//g; | |
220 $protein .= $line; | |
221 if (/\"$/){ | |
222 $protein =~s/\"//g; | |
223 $end_gene = "yes"; | |
224 | |
225 } | |
226 } | |
227 if (/\/translation=\"(.*)/ or ($has_translation == 0 && /protein_id=/)){ | |
228 $go = 1; | |
229 $protein .= $1; | |
230 print P ">$current_gene\n"; | |
231 print N ">$current_gene\n"; | |
232 print GENES "$current_gene $product [$strain]\n"; | |
233 | |
234 if ($protein =~/\"$/){ | |
235 $end_gene = "yes"; | |
236 } | |
237 if ($has_translation == 0){$end_gene = "yes";} | |
238 } | |
239 if ($end_gene eq "yes"){ | |
240 $protein =~s/\"//g; | |
241 print P "$protein\n"; | |
242 $protein = ""; | |
243 my $length = $end - $start + 1; | |
244 | |
245 #print "okkkk $current_chrom\n"; | |
246 #my $geneseq = substr($genome,$start-1,$length); | |
247 my $geneseq = substr($genome_sequences{$current_chrom},$start-1,$length); | |
248 | |
249 | |
250 if ($complement){ | |
251 my $revcomp = reverse $geneseq; | |
252 $revcomp =~ tr/ATGCatgc/TACGtacg/; | |
253 $geneseq = $revcomp; | |
254 } | |
255 print N "$geneseq\n"; | |
256 print FUNC "$current_gene - $product\n"; | |
257 $go = 0; | |
258 $end_gene = "no"; | |
259 } | |
260 if (/\/product=\"(.*)\"/){ | |
261 $product = $1; | |
262 } | |
263 if (/^\s+CDS\s+(\d+)\.\.(\d+)\s*$/){ | |
264 $start = $1; | |
265 $end = $2; | |
266 $complement = 0; | |
267 } | |
268 if (/^\s+CDS\s+complement\((\d+)\.\.(\d+)\)\s*$/){ | |
269 $start = $1; | |
270 $end = $2; | |
271 $complement = 1; | |
272 } | |
273 } | |
274 close(G); | |
275 close(P); | |
276 close(N); | |
277 close(FUNC); | |
278 | |
279 if ($has_translation == 0){ | |
280 system("perl $dirname/translate.pl $outdir/$genbank.nuc $outdir/$genbank.pep"); | |
281 } | |
282 | |
283 my $prot_num = 0; | |
284 open(PRT,">$outdir/$genbank.prt"); | |
285 open(P,"$outdir/$genbank.pep"); | |
286 while(<P>){ | |
287 if (/>(.*)/){ | |
288 my $prot_id = $1; | |
289 $prot_num++; | |
290 my $new_id = "$strain"."_".$prot_num; | |
291 print PRT ">$new_id\n"; | |
292 } | |
293 else{ | |
294 print PRT $_; | |
295 } | |
296 } | |
297 close(P); | |
298 close(PRT); | |
299 } | |
300 close(F); | |
301 close(O); | |
302 close(METADATA); | |
303 chop($concat); | |
304 print O2 $concat; | |
305 close(O2); | |
306 close(L); | |
307 close(L2); | |
308 close(L3); | |
309 close(L4); | |
310 close(GENES); | |
311 close(SEQFILE); | |
312 close(PanSN); | |
313 #close(TEST); | |
314 | |
315 unlink("prokaryotes.txt"); | |
316 unlink("eukaryotes.txt"); |