annotate Perl/wget.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 use File::Basename;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6 my $dirname = dirname(__FILE__);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8 system("wget https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 system("wget https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 my %continents;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12 open(F,"countries.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 <F>;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14 while(my $line =<F>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 chomp($line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 my ($continent,$country) = split(/,/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17 $continents{$country} = $continent;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 close(F);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 my $input = $ARGV[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 my $outdir = $ARGV[1];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 my $private_genomes = $ARGV[2];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 system("cat $input $private_genomes >$outdir/list.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 my $concat = "";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 open(O2,">$outdir/genbanks.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 open(O,">$outdir/strains.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 open(GENES,">$outdir/genes.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 open(L,">$outdir/list_genomes.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 open(L2,">$outdir/list_genomes2.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 open(L3,">$outdir/genomes.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 open(L4,">$outdir/genomes2.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 open(SEQFILE,">$outdir/seqfile");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 open(PanSN,">$outdir/all_genomes.fa");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 open(METADATA,">$outdir/metadata_strains.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 open(F,"$outdir/list.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 #open(TEST,">$outdir/test");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 while(my $line =<F>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 chomp($line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 my $genbank = $line;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 if (!-e "$genbank"){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 my $grep = `grep '$line' prokaryotes.txt`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 #print "$genbank $line aaa $grep\n";exit;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 my @infos = split(/\t/,$grep);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 my $status = $infos[15];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 if ($status !~/Complete Genome/ && $status !~/Chromosome/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 #next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 my $ftp_path = $infos[$#infos -2];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 $ftp_path =~s/ftp:/http:/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 my @table = split(/\//,$ftp_path);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 my $name = $table[$#table];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 my $prot_file = "$ftp_path/$name"."_protein.faa.gz";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 my $gbff = "$ftp_path/$name"."_genomic.gbff.gz";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59 my $gff = "$ftp_path/$name"."_genomic.gff.gz";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 my $genome_fasta = "$ftp_path/$name"."_genomic.fna.gz";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61 my @particules = split(/_/,$name);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 `wget -O $outdir/$genbank.fasta.gz $genome_fasta`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 `gunzip $outdir/$genbank.fasta.gz`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65 `wget -O $outdir/$genbank.gb.gz $gbff`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 system("gunzip $outdir/$genbank.gb.gz");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 ################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 # for eukaryotes
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 ################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 if (!$grep){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 $grep = `grep '$line' eukaryotes.txt`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 my @infos = split(/\t/,$grep);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 my $gca = $infos[8];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 if ($gca =~/GCA_(\d\d\d)(\d\d\d)(\d\d\d)/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 my $part1 = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78 $ftp_path = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/$1/$2/$3";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 `wget -O $outdir/$gca.index.html $ftp_path`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 my $grep_name = `grep '$gca' $outdir/$gca.index.html`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 unlink("$outdir/$gca.index.html");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 my $name;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 if ($grep_name =~/href=\"(.*)\"/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84 $name = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 $name=~s/\///g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 $ftp_path = $ftp_path."/$name";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 my $prot_file = "$ftp_path/$name"."_protein.faa.gz";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89 my $gbff = "$ftp_path/$name"."_genomic.gbff.gz";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 my $gff = "$ftp_path/$name"."_genomic.gff.gz";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 my $genome_fasta = "$ftp_path/$name"."_genomic.fna.gz";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 my @particules = split(/_/,$name);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 `wget -O $outdir/$genbank.fasta.gz $genome_fasta`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 `gunzip $outdir/$genbank.fasta.gz`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 `wget -O $outdir/$genbank.gb.gz $gbff`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 `wget -O $outdir/$genbank.faa.gz $prot_file`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 system("gunzip $outdir/$genbank.gb.gz");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 system("gunzip $outdir/$genbank.faa.gz");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 else{
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 my $genbank_file = $genbank;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106 my $grep = `grep 'LOCUS' $genbank_file`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 $genbank = "unknown";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 if ($grep =~/LOCUS\s+([\-\:\w]+)/){$genbank = $1;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 #$genbank =~s/\:/_/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 my $cmd = "cp -rf $genbank_file $outdir/$genbank.gb";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 system($cmd);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 my %genome_seqs;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116 my $current_chr;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 my $go = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118 open(G,"$outdir/$genbank.gb");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 while(<G>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120 if ($go == 1 && /(\d+) (.*)$/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 my $line = $2;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122 $line =~s/ //g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 $genome_seqs{$current_chr}.=$line;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 if (/LOCUS ([^\s]+)/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126 $current_chr = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 if (/ORIGIN/){$go = 1;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 if (/^\/\//){$go = 0;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 close(G);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 open(FASTA,">$outdir/$genbank.fasta");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 foreach my $ch(keys(%genome_seqs)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 print FASTA ">$ch\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136 my $seq = $genome_seqs{$ch};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 print FASTA "$seq\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 close(FASTA);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 #my $get_organism_line = `head -10 $outdir/$genbank.gb | grep DEFINITION `;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 my $get_organism_line = `head -10 $outdir/$genbank.gb | grep -A 1 DEFINITION `;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144 # if several lines for DEFINITION, concatenate the lines
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 my @lines_organism = split(/\n/,$get_organism_line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 my $first_line = $lines_organism[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 my $second_line = $lines_organism[1];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 if ($second_line =~/^ (.*)/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 $get_organism_line = $first_line. " ".$1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 else{
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152 $get_organism_line = $first_line;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155 my $strain;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 my $genus;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157 if ($get_organism_line =~/DEFINITION (.*)$/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158 $strain = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 ($genus) = split(/\s/,$strain);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 my $country = `head -100 $outdir/$genbank.gb | grep country `;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162 $country =~s/^\s+//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163 $country =~s/\/country=//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 $country =~s/\"//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165 $country =~s/\n//g;$country =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166 if ($country =~/:/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167 my $city;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
168 ($country,$city) = split(/:/,$country);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
169 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170 if ($country eq ""){$country = "unresolved";}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171 my $continent = "unresolved";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
172 if ($continents{$country}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
173 $continent = $continents{$country};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
174 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
175 $strain =~s/\.//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
176 my ($info1,$info2 ) = split(",",$strain);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
177 $strain = $info1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
178 $strain =~s/ /_/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
179 $strain =~s/strain_//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
180 $strain =~s/_chromosome//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
181 $strain =~s/_genome//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
182 $strain =~s/_str_/_/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
183 $strain =~s/[^\w\-\_]//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
184 $strain =~s/\-/_/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
185
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
186 print O "$genbank $strain\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
187 $concat .= "$genbank,";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
188 print L "$genbank $outdir/$genbank.gb\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
189 print L2 "$genbank\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
190 print L3 "$outdir/$genbank.fasta\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
191 print L4 "$outdir/$genbank.fasta $strain\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
192 print SEQFILE "$genbank $outdir/$genbank.fasta\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
193 print METADATA "$strain\t$genus\t$country\t$continent\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
194
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
195
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
196 my %genome_sequences;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
197 my $genome = "";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
198 my $seqid = "";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
199 open(GENOME,"$outdir/$genbank.fasta");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
200 while(<GENOME>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
201 if (!/^>/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
202 my $line = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
203 $line =~s/\n//g;$line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
204 $genome_sequences{$seqid} .= $line;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
205 $genome .= $line;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
206 print PanSN $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
207 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
208 elsif (/>([^\s]+)/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
209 $seqid = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
210 $seqid =~s/\.\d+$//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
211 print PanSN ">$strain#$seqid\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
212 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
213 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
214 close(GENOME);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
215
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
216
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
217 open(N,">$outdir/$genbank.nuc");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
218 open(P,">$outdir/$genbank.pep");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
219 open(FUNC,">$outdir/$genbank.func");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
220 my $go = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
221 my $start;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
222 my $end;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
223 my $product;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
224 my $complement = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
225 my $end_gene = "no";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
226 my $protein = "";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
227
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
228 #`sed -i "s/'//g" $outdir/$genbank.gb`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
229
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
230 my $has_translation = `grep -c 'translation=' $outdir/$genbank.gb`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
231 $has_translation =~s/\n//g;$has_translation =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
232 open(G,"$outdir/$genbank.gb");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
233 my $current_gene;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
234 my $current_chrom;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
235 while(<G>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
236 if (/^\s+ORGANISM\s+(.*)$/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
237 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
238 if (/protein_id=\"(.*)\"/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
239 $current_gene = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
240 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
241 if (/LOCUS ([^\s]+)/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
242 $current_chrom = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
243 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
244 if (/locus_tag=\"(.*)\"/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
245 $current_gene = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
246 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
247 if ($go == 1){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
248 my $line = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
249 $line =~s/ //g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
250 $line =~s/\n//g;$line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
251 $protein .= $line;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
252 if (/\"$/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
253 $protein =~s/\"//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
254 $end_gene = "yes";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
255
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
256 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
257 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
258 if (/\/translation=\"(.*)/ or ($has_translation == 0 && /protein_id=/)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
259 $go = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
260 $protein .= $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
261 print P ">$current_gene\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
262 print N ">$current_gene\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
263 print GENES "$current_gene $product [$strain]\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
264
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
265 if ($protein =~/\"$/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
266 $end_gene = "yes";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
267 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
268 if ($has_translation == 0){$end_gene = "yes";}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
269 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
270 if ($end_gene eq "yes"){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
271 $protein =~s/\"//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
272 print P "$protein\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
273 $protein = "";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
274 my $length = $end - $start + 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
275
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
276 #print "okkkk $current_chrom\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
277 #my $geneseq = substr($genome,$start-1,$length);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
278 my $geneseq = substr($genome_sequences{$current_chrom},$start-1,$length);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
279
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
280
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
281 if ($complement){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
282 my $revcomp = reverse $geneseq;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
283 $revcomp =~ tr/ATGCatgc/TACGtacg/;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
284 $geneseq = $revcomp;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
285 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
286 print N "$geneseq\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
287 print FUNC "$current_gene - $product\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
288 $go = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
289 $end_gene = "no";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
290 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
291 if (/\/product=\"(.*)\"/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
292 $product = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
293 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
294 if (/^\s+CDS\s+(\d+)\.\.(\d+)\s*$/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
295 $start = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
296 $end = $2;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
297 $complement = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
298 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
299 if (/^\s+CDS\s+complement\((\d+)\.\.(\d+)\)\s*$/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
300 $start = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
301 $end = $2;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
302 $complement = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
303 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
304 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
305 close(G);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
306 close(P);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
307 close(N);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
308 close(FUNC);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
309
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
310 if ($has_translation == 0){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
311 system("perl $dirname/translate.pl $outdir/$genbank.nuc $outdir/$genbank.pep");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
312 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
313
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
314 my $prot_num = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
315 open(PRT,">$outdir/$genbank.prt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
316 open(P,"$outdir/$genbank.pep");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
317 while(<P>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
318 if (/>(.*)/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
319 my $prot_id = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
320 $prot_num++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
321 my $new_id = "$strain"."_".$prot_num;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
322 print PRT ">$new_id\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
323 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
324 else{
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
325 print PRT $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
326 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
327 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
328 close(P);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
329 close(PRT);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
330 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
331 close(F);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
332 close(O);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
333 close(METADATA);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
334 chop($concat);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
335 print O2 $concat;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
336 close(O2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
337 close(L);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
338 close(L2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
339 close(L3);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
340 close(L4);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
341 close(GENES);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
342 close(SEQFILE);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
343 close(PanSN);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
344 #close(TEST);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
345
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
346 unlink("prokaryotes.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
347 unlink("eukaryotes.txt");