annotate getSequenceInfo.pl @ 0:19ae17458c14 draft default tip

Uploaded
author dcouvin
date Wed, 15 Sep 2021 21:35:09 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1 #!/usr/bin/perl
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
2
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
3 ################################################################################
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
4 ## "Copyright 2019 Vincent Moco and David Couvin"
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
5 ## licence GPL-3.0-or-later
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
6 ## This program is free software: you can redistribute it and/or modify
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
7 ## it under the terms of the GNU General Public License as published by
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
8 ## the Free Software Foundation, either version 3 of the License, or
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
9 ## (at your option) any later version.
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
10 ## This program is distributed in the hope that it will be useful,
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
11 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
13 ## GNU General Public License for more details.
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
14 ## You should have received a copy of the GNU General Public License
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
15 ## along with this program. If not, see <https://www.gnu.org/licenses/>.
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
16 ################################################################################
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
17
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
18 use strict;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
19 use warnings;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
20
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Release number shown in the welcome banner and by the -version option.
my $version = '1.0.1';

# Start of the run in epoch seconds; the total run time is derived from it
# at the end of the script.
# (The former Date::Calc based start date, Today_and_Now(), is no longer used.)
my $start = time();

# Welcome banner.
print
    "##################################################################\n",
    "## ---> Welcome to $0 (version $version)!\n",
    "##################################################################\n\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
33
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Perl modules this script needs at run time.
# (Date::Calc and File::Log were dropped from this list.)
my @modules = qw(
    Archive::Tar
    Bio::SeqIO
    Bio::Species
    File::Copy
    File::Path
    Net::FTP
    IO::Uncompress::Gunzip
    LWP::Simple
    POSIX
);

# Report the status of every required module. The first missing module ends
# the run with exit status 1 after telling the user how to install it.
# NOTE(review): the "use" statements that follow this loop are resolved at
# compile time, so a missing module presumably aborts the script before this
# run-time check is reached -- confirm and consider a BEGIN block.
for my $module (@modules) {
    unless (isModuleInstalled($module)) {
        print "$module is not installed. Please install it and try again.\n";
        print "You can reinstall the $0 as shown on the README page or use the following command to install the module:\n";
        print "cpan -i -f $module\n";
        exit 1;
    }

    print "$module is.................installed!\n";
}
print "\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
61
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
62 use Archive::Tar;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
63 use Bio::SeqIO;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
64 use Bio::Species;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
65 #use Date::Calc qw(:all);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
66 use File::Copy qw(cp move);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
67 use File::Path qw(rmtree);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
68 use Net::FTP;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
69 use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
70 use LWP::Simple qw(get);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
71 use POSIX qw(floor);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
72 #use File::Log;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
73
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
74 ####################################################################################################
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
75 ## A Perl script allowing to get sequence information from GenBank, RefSeq or ENA repositories.
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
76 ##
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
77 ####################################################################################################
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
78
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
### main program
### parameters
# Defaults for everything that can be set on the command line, plus a few
# fixed settings. Variables declared without a value stay undefined until
# the matching option is given.

my $directory = 'genbank';          # NCBI nucleotide sequences repository (-directory / -dir)

my $kingdom = '';                   # kingdom of the organism (-kingdom / -k)

my $releaseDate = '0000-00-00';     # sequences are downloaded from this release date (-date)

my $components;                     # components of the assembly (-components / -c)

my $species = '';                   # species name, possibly a comma-separated list (-species / -s)

my $getSummaries;                   # indicates if a new assembly summary is required (-getSummaries / -get)

my $assemblyLevel = 'Complete Genome,Chromosome,Scaffold,Contig';   # comma-separated assembly levels (-level)

my $quantity;                       # limit on the number of assemblies to download (-quantity / -number)

my $sequenceID;                     # ENA sequence ID(s), comma-separated (-ena)

my $ftpServor = 'ftp.ncbi.nlm.nih.gov';     # NCBI FTP server

my $enaFtpServor = 'ftp.sra.ebi.ac.uk';     # ENA FTP server used for the FASTQ downloads

my $fldSep = '/';                   # folder separator; switched to "\" on Windows further down

# list of available kingdoms
my @availableKingdoms = qw(
    archaea
    bacteria
    fungi
    invertebrate
    plant
    protozoa
    vertebrate_mammalian
    vertebrate_other
    viral
);

my $actualOS = 'Unix';              # OS of the computer; switched to "MSWin32" on Windows further down

my $mainFolder;                     # folder in which the assembly results are stored (-output / -o)

my $assemblyTaxid = '';             # taxid(s) for the assembly, comma-separated (-taxid)

my $sraID;                          # SRA run accession(s), comma-separated (-fastq)

my $assemblyPrjID;                  # assembly accession or project ID (-assembly_or_project)

my $log;                            # true when a log file was requested (-log)

my $path = '';                      # path of an existing assembly summary (-path / -pathSummary)
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
130
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
131
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Requests that are answered immediately, before the regular option parsing:
# no argument at all, full help, or the program version. Each of them ends
# the script with exit status 0.
{
    # Nothing on the command line: show the short usage reminder.
    unless (@ARGV >= 1) {
        help_user_simple($0);
        exit 0;
    }

    # Only the first argument is inspected for the help / version switches.
    my $firstArgument = $ARGV[0];

    if ($firstArgument eq '-help' or $firstArgument eq '-h') {
        help_user_advance($0);
        exit 0;
    }

    if ($firstArgument eq '-version' or $firstArgument eq '-v') {
        program_version($0);
        exit 0;
    }
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
146
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
##requirements
# Read the command line into the parameter variables declared above.
#
# The previous loop tested every argument with unanchored patterns such as
# /-k/i, /-s/i, /-c/i, /-n/i or /-o/i. Any argument that merely contained
# those characters (a species called "strain-k12", an output folder called
# "my-output") was therefore taken for an option, option values were scanned
# again as if they were options, and the documented short form "-l" was never
# recognised (only "-le" was). Option names are now matched exactly
# (case-insensitively) and the value that follows an option is consumed.
{
    # Where each parsed setting is stored.
    my %variableFor = (
        kingdom       => \$kingdom,
        path          => \$path,
        directory     => \$directory,
        releaseDate   => \$releaseDate,
        getSummaries  => \$getSummaries,
        species       => \$species,
        assemblyLevel => \$assemblyLevel,
        components    => \$components,
        quantity      => \$quantity,
        sequenceID    => \$sequenceID,
        mainFolder    => \$mainFolder,
        assemblyTaxid => \$assemblyTaxid,
        sraID         => \$sraID,
        assemblyPrjID => \$assemblyPrjID,
        log           => \$log,
    );

    my $parsed = _parse_command_line(@ARGV);

    # Only settings that were actually given overwrite the defaults.
    foreach my $setting (keys %{$parsed}) {
        ${ $variableFor{$setting} } = $parsed->{$setting};
    }
}

# Parse a list of command-line arguments.
#
# Parameters:
#   @_ - the arguments, normally @ARGV
# Returns:
#   a hash reference with one entry per option that was given; the keys are
#   kingdom, path, directory, releaseDate, getSummaries, species,
#   assemblyLevel, components, quantity, sequenceID, mainFolder,
#   assemblyTaxid, sraID, assemblyPrjID and log.
#
# Rules:
#   * option names are case-insensitive; a leading "--" is accepted as well;
#   * -log is a plain switch, every other option takes the next argument as
#     its value; the value of -quantity/-q/-number/-n is converted with int();
#   * an option whose value is missing (end of the list, or directly followed
#     by another known option) is reported with a warning and ignored;
#   * an unknown argument that looks like an option is reported with a
#     warning; any other stray argument is ignored silently;
#   * when an option is repeated, the last occurrence wins.
sub _parse_command_line {
    my @arguments = @_;

    my %settingFor = (
        '-kingdom'             => 'kingdom',
        '-k'                   => 'kingdom',
        '-path'                => 'path',
        '-pathsummary'         => 'path',
        '-directory'           => 'directory',
        '-dir'                 => 'directory',
        '-date'                => 'releaseDate',
        '-getsummaries'        => 'getSummaries',
        '-get'                 => 'getSummaries',
        '-species'             => 'species',
        '-s'                   => 'species',
        '-level'               => 'assemblyLevel',
        '-le'                  => 'assemblyLevel',    # short form required by the old parser
        '-l'                   => 'assemblyLevel',    # short form announced in the help text
        '-components'          => 'components',
        '-c'                   => 'components',
        '-quantity'            => 'quantity',
        '-q'                   => 'quantity',
        '-number'              => 'quantity',
        '-n'                   => 'quantity',
        '-ena'                 => 'sequenceID',
        '-output'              => 'mainFolder',
        '-o'                   => 'mainFolder',
        '-taxid'               => 'assemblyTaxid',
        '-fastq'               => 'sraID',
        '-assembly_or_project' => 'assemblyPrjID',
        '-log'                 => 'log',
    );
    my %isSwitch  = (log      => 1);
    my %isInteger = (quantity => 1);

    # Lower-case an argument and reduce a leading "--" to "-".
    my $normalise = sub {
        my ($argument) = @_;
        (my $name = lc $argument) =~ s/^--/-/;
        return $name;
    };

    my %parsed;
    my $index = 0;

    while ($index < @arguments) {
        my $argument = $arguments[$index++];
        next unless defined $argument;

        my $name    = $normalise->($argument);
        my $setting = $settingFor{$name};

        unless (defined $setting) {
            # Negative numbers and plain words are not reported.
            warn "Unknown option '$argument' ignored.\n" if $name =~ /^-[a-z]/;
            next;
        }

        if ($isSwitch{$setting}) {
            $parsed{$setting} = 1;
            next;
        }

        # The value is the next argument, unless that one is itself an option.
        my $hasValue = $index < @arguments && defined $arguments[$index];
        if ($hasValue && exists $settingFor{ $normalise->($arguments[$index]) }) {
            $hasValue = 0;
        }
        unless ($hasValue) {
            warn "Option '$argument' expects a value; option ignored.\n";
            next;
        }

        my $value = $arguments[$index++];
        $parsed{$setting} = $isInteger{$setting} ? int($value) : $value;
    }

    return \%parsed;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
209
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
#define folder separator and OS
if ($^O =~ /MSWin32/) {
    $fldSep   = "\\";
    $actualOS = "MSWin32";
}

#LOG file
# The subroutines report their progress with "print LOG ...", so the handle
# has to be opened for writing. The former open(LOG, "log.txt") opened the
# file read-only: it died when log.txt did not exist and could not be written
# to when it did. The file is truncated at the start of every run.
if ($log) {
    open(LOG, '>', 'log.txt') or die " error open log.txt $!:";
}


print "Working ...\n";

# Guard against an undefined kingdom (option given without a value).
$kingdom = "" unless defined $kingdom;

# "viruses" is accepted as another name for the "viral" kingdom; like the
# kingdom check below, the comparison ignores case.
if (lc($kingdom) eq "viruses") { $kingdom = "viral"; }

# The kingdom is user input: quote it so that regex metacharacters in it are
# matched literally.
if (grep { m{\A\Q$kingdom\E\z}i } @availableKingdoms) {

    my @levelList = split /,/, $assemblyLevel;

    # One download run per requested species; otherwise one per requested
    # taxid; otherwise a single run without species or taxid filter.
    # Each entry is [ species, taxid ].
    my @runs;
    if ($species ne "") {
        @runs = map { [ $_, $assemblyTaxid ] } split(/,/, $species);
    }
    elsif ($assemblyTaxid ne "") {
        @runs = map { [ $species, $_ ] } split(/,/, $assemblyTaxid);
    }
    else {
        @runs = ( [ $species, $assemblyTaxid ] );
    }

    foreach my $run (@runs) {
        my ($actualSpecies, $actualID) = @{$run};

        get_assembly_summary_species($quantity, $releaseDate, $directory, $kingdom, $actualSpecies,
                                     \@levelList, $fldSep, $actualOS, $mainFolder, $actualID, $log,
                                     $getSummaries, $components, $ftpServor);
    }
}
elsif ($kingdom ne "") {
    # A kingdom was requested but is not in the list: say so instead of
    # silently downloading nothing.
    warn "Unknown kingdom '$kingdom'. Available kingdoms: "
        . join(", ", @availableKingdoms) . "\n";
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
254
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
255
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Downloads that are driven by explicit identifiers rather than by a kingdom.

# -ena: one call per comma-separated ENA sequence ID.
foreach my $enaID ($sequenceID ? split(/,/, $sequenceID) : ()) {
    sequence_ena($enaID, $log);
}

# -fastq: one call per comma-separated run accession, against the ENA FTP server.
foreach my $sra ($sraID ? split(/,/, $sraID) : ()) {
    download_ena_fastq($enaFtpServor, $sra, $log);
}

# -assembly_or_project: the identifier is handed over as given.
download_assembly_or_project($assemblyPrjID, $ftpServor, $fldSep, $directory, $log)
    if $assemblyPrjID;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
275
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
276 #my ($end_year,$end_month,$end_day, $end_hour,$end_min,$end_sec) = Today_and_Now();
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
277
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
278 #my ($D_y,$D_m,$D_d, $Dh,$Dm,$Ds) =
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
279 # Delta_YMDHMS($start_year,$start_month,$start_day, $start_hour, $start_min, $start_sec,
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
280 # $end_year, $end_month, $end_day, $end_hour,$end_min,$end_sec);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
281
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
282 #print "End Date: $end_year-$end_month-$end_day, $end_hour:$end_min:$end_sec\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
print "Thank you for using $0!\n";

# Elapsed time since $start, reported in seconds, minutes and hours
# (plain divisions, not rounded).
my $end   = time();
my $total = $end - $start;
my $min   = $total / 60;
my $hrs   = $min / 60;

printf "Total time: %s seconds OR %s minutes OR %s hours ! \n", $total, $min, $hrs;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
292
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
293 ### subroutine
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Display the short usage reminder.
#
# Parameters:
#   $programme - program name inserted into the messages
#
# The "Usage" line is written to STDERR; the two hints about -version and
# -help are written to STDOUT.
sub help_user_simple {
    my ($programme) = @_;

    print STDERR "Usage : perl $programme [options] \n";
    print "Type perl $programme -version or perl $programme -v to get the current version\n",
          "Type perl $programme -help or perl $programme -h to get full help\n";
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
301 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Display the full help document on STDOUT.
#
# Parameters:
#   $programme - program name inserted into the text; defaults to $0 when the
#                sub is called without argument. (The argument used to be
#                ignored in favour of $0, unlike in the other help subs.)
#
# The text also interpolates the current value of $directory as the default
# of the -directory option, and now lists the -path / -pathSummary option,
# which the command line accepts but the help did not mention.
sub help_user_advance {
    my $programme = @_ ? $_[0] : $0;

    print <<HEREDOC;

Name:
$programme

Synopsis:
A Perl script allowing to get sequence information from GenBank RefSeq or ENA repositories.

Usage:
perl $programme [options]
examples:
perl $programme -k bacteria -s "Helicobacter pylori" -l "Complete Genome" -date 2019-06-01
perl $programme -k viruses -n 5 -date 2019-06-01
perl $programme -k "bacteria" -taxid 9,24 -n 10 -c plasmid -dir genbank -o Results
perl $programme -ena BN000065
perl $programme -fastq ERR818002
perl $programme -fastq ERR818002,ERR818004

Kingdoms:
archaea
bacteria
fungi
invertebrate
plant
protozoa
vertebrate_mammalian
vertebrate_other
viral

Assembly levels:
"Complete Genome"
Chromosome
Scaffold
Contig

General:
-help or -h displays this help
-version or -v displays the current version of the program

Options ([XXX] represents the expected value):
-directory or -dir [XXX] allows to indicate the NCBI's nucleotide sequences repository (default: $directory)
-get or -getSummaries [XXX] allows to obtain a new assembly summary file in function of given kingdoms (bacteria,fungi,protozoa...)
-path or -pathSummary [XXX] allows to use an existing assembly summary file instead of downloading a new one
-k or -kingdom [XXX] allows to indicate kingdom of the organism (see the examples above)
-s or -species [XXX] allows to indicate the species (must be combined with -k option)
-taxid [XXX] allows to indicate a specific taxid (must be combined with -k option)
-assembly_or_project [XXX] allows to indicate a specific assembly accession or bioproject (must be combined with -k option)
-date [XXX] indicates the release date (with format yyyy-mm-dd) from which sequence information are available
-l or -level [XXX] allows to select a specific assembly level (e.g. "Complete Genome")
-o or -output [XXX] allows users to name the output result folder
-n or -number [XXX] allows to limit the total number of assemblies to be downloaded
-c or -components [XXX] allows to select specific components of the assembly (e.g. plasmid, chromosome, ...)
-ena [XXX] allows to download report and fasta file given a ENA sequence ID
-fastq [XXX] allows to download FASTQ sequences from ENA given a run accession (https://ena-docs.readthedocs.io/en/latest/faq/archive-generated-files.html)
-log allows to create a log file
HEREDOC
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
360 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Display the program name together with the current version ($version)
# and a one-line description, on STDOUT.
#
# Parameters:
#   $programme - program name inserted into the message
sub program_version {
    my ($programme) = @_;

    print "\n $programme, version : $version\n",
          "\n A perl script to get sequences informations\n";
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
367 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Count how many entries of @candidates contain $field, compared literally
# and case-insensitively.
#
# An undefined or empty $field never matches. (Interpolating such a value
# into a regex would produce an empty pattern, which Perl silently replaces
# with the last successfully matched pattern.)
sub _field_in_list {
    my ($field, @candidates) = @_;

    return 0 unless defined $field && length $field;

    my $needle = lc $field;
    return scalar grep { index(lc $_, $needle) >= 0 } @candidates;
}

# Download the FASTA, GenBank and assembly-report files of one assembly,
# uncompress the two gzip archives and move the results into their folders.
#
# Parameters:
#   $ftpServor         - server handle/name handed to obtain_file/download_file
#   $ftpPath           - remote directory of the assembly
#   $gcfName           - last path component of $ftpPath (file name prefix)
#   $repositoryFNA     - folder receiving the uncompressed FASTA file
#   $repositoryGenbank - folder receiving the uncompressed GenBank file
#   $repositoryReport  - folder receiving the assembly report
#   $log               - true when progress must be written to the LOG handle
#
# Returns a two-element list: the report file name and the string
# "<fasta name>,<genbank name>" (uncompressed names).
# Dies when gunzip or move fails.
sub _download_assembly_files {
    my ($ftpServor, $ftpPath, $gcfName, $repositoryFNA, $repositoryGenbank,
        $repositoryReport, $log) = @_;

    my $fileFasta   = $gcfName . "_genomic.fna.gz";
    my $ucpFasta    = $gcfName . "_genomic.fna";
    my $fileGenbank = $gcfName . "_genomic.gbff.gz";
    my $ucpGenbank  = $gcfName . "_genomic.gbff";
    my $fileReport  = $gcfName . "_assembly_report.txt";

    # obtain_file is called for all three remote paths before any
    # download_file call (FASTA, GenBank, report - in that order).
    my $dnaFile        = obtain_file($ftpServor, $ftpPath . "/" . $fileFasta);
    my $genbankFile    = obtain_file($ftpServor, $ftpPath . "/" . $fileGenbank);
    my $assemblyReport = obtain_file($ftpServor, $ftpPath . "/" . $fileReport);

    download_file($ftpServor, $dnaFile);
    download_file($ftpServor, $genbankFile);
    download_file($ftpServor, $assemblyReport);

    print LOG "...download FASTA and GenBank report files...\n" if $log;

    # FASTA sequences
    if (-e $fileFasta) {
        gunzip($fileFasta => $ucpFasta) or die "gunzip failed: $GunzipError\n";
        move($ucpFasta, $repositoryFNA) or die "move failed: $!";
    }

    print LOG "...Unzip FASTA file named $fileFasta ...\n" if $log;

    # Assembly report
    if (-e $fileReport) {
        move($fileReport, $repositoryReport) or die "move failed: $!";
    }

    # GenBank records
    if (-e $fileGenbank) {
        gunzip($fileGenbank => $ucpGenbank) or die "gunzip failed: $GunzipError\n";
        move($ucpGenbank, $repositoryGenbank) or die "move failed: $!";
    }

    print LOG "...Unzip GenBank file $fileGenbank ...\n" if $log;

    return ($fileReport, $ucpFasta . "," . $ucpGenbank);
}

# Select assemblies from an assembly summary table, download their files and
# write the summary files for the query.
#
# Parameters (positional):
#   $quantity      - maximum number of assemblies to download (false = no limit)
#   $releaseDate   - only assemblies whose date compares (as a string) greater
#                    than this value are kept
#   $directory, $getSummaries - forwarded to download_summaries
#   $kingdom       - kingdom name; when neither $species nor $assemblyTaxid is
#                    given, a non-empty kingdom selects every line that passed
#                    the level/representation filters
#   $species       - pattern (used as a case-insensitive regex) matched
#                    against column 7 of the summary
#   $levelList     - array reference of accepted assembly levels
#   $fldSep        - path separator used to build local paths
#   $actualOS      - OS name; "unix" enables the *.dmp/*.gz cleanup when
#                    nothing was found, and is forwarded to write_assembly
#   $mainFolder    - name of the main output folder (default "folder")
#   $assemblyTaxid - taxid matched exactly against column 5 of the summary
#   $log           - true when progress must be written to the LOG handle
#   $components    - comma-separated list of components to extract
#   $ftpServor     - server handle/name handed to the download helpers
#
# The file-level $path variable, when true, is used as the summary file
# instead of downloading one.
#
# Summary columns used here (0-based): 5 taxid, 7 organism name,
# 11 assembly level, 13 genome representation, 14 release date, 19 FTP path.
# NOTE(review): this presumably follows NCBI's assembly_summary.txt layout -
# confirm against the NCBI README if the format changes.
#
# Returns nothing meaningful. Dies on I/O errors.
sub get_assembly_summary_species {
    my ($quantity, $releaseDate, $directory, $kingdom, $species, $levelList, $fldSep,
        $actualOS, $mainFolder, $assemblyTaxid, $log, $getSummaries, $components, $ftpServor) = @_;

    # Summary table: user-supplied file, or a fresh download.
    my $assemblySummary = $path
        ? $path
        : download_summaries($directory, $kingdom, $ftpServor, $fldSep, $getSummaries);

    print LOG "...Downloading assembly_summary.txt...\n" if $log;

    # Main output folder
    my $kingdomRep = $mainFolder ? $mainFolder : "folder";
    mkdir $kingdomRep unless -d $kingdomRep;

    # Folder holding the results of this query
    my $repositoryAssembly = "result";
    mkdir $repositoryAssembly unless -d $repositoryAssembly;

    # Drop the result folder of a previous run
    my $oldAssemblyRep = "." . $fldSep . $kingdomRep . $fldSep . $repositoryAssembly;
    rmtree($oldAssemblyRep) if -d $oldAssemblyRep;

    # Folders for FASTA, GenBank and report files
    my $repositoryFNA = "Assembly";
    mkdir $repositoryFNA unless -e $repositoryFNA;

    my $repositoryGenbank = "GenBank";
    mkdir $repositoryGenbank unless -e $repositoryGenbank;

    my $repositoryReport = "Report";
    mkdir $repositoryReport unless -e $repositoryReport;

    # One folder per requested component
    my %componentsRepHash;
    if ($components) {
        for my $component (split /,/, $components) {
            my $specificRep = $component . "_folder";
            mkdir $specificRep unless -d $specificRep;
            $componentsRepHash{$component} = $specificRep;
        }
    }

    print LOG "...Create kingdom and components repository...\n" if $log;

    my %assemblyReportList;
    my @levelList = @{$levelList};

    # Asking for complete genomes restricts the genome representation to "Full".
    my @assemblyRepresentationList =
        (grep { /complete genome/i } @levelList) ? qw/Full/ : qw/Full Partial/;

    # The search criterion does not depend on the line being read, so it is
    # built once. Species takes precedence over taxid.
    my ($indexInfo, $regex);
    if (defined $species && $species !~ /^$/) {
        $indexInfo = 7;
        $regex     = qr/$species/i;
    }
    elsif (defined $assemblyTaxid && $assemblyTaxid !~ /^$/) {
        $indexInfo = 5;
        $regex     = qr/^$assemblyTaxid$/i;
    }

    # Without species/taxid, a non-empty kingdom selects every filtered line.
    my $kingdomOnly = !defined $regex && defined $kingdom && $kingdom !~ /^$/;

    return unless -e $assemblySummary;

    open(my $summaryFh, "<", $assemblySummary) or die " error open assembly_summary.txt $!:";
    while (my $line = <$summaryFh>) {
        chomp $line;

        if ($line !~ m/^#/) {
            my @infoList = split /\t/, $line;

            # Lines too short to carry an FTP path (column 19) are ignored.
            my $selected =
                   defined $infoList[19]
                && _field_in_list($infoList[13], @assemblyRepresentationList)
                && _field_in_list($infoList[11], @levelList)
                && (defined $regex
                        ? (defined $infoList[$indexInfo] && $infoList[$indexInfo] =~ $regex)
                        : $kingdomOnly);

            if ($selected) {
                my @gcfInfo = split(/\//, $infoList[19]);
                my $gcfName = pop(@gcfInfo);

                my $realDate = $infoList[14];
                $realDate =~ s/\//-/g;

                # Dates are compared as strings.
                if ($realDate gt $releaseDate) {
                    my ($fileReport, $fileFastaGenbank) = _download_assembly_files(
                        $ftpServor, $infoList[19], $gcfName,
                        $repositoryFNA, $repositoryGenbank, $repositoryReport, $log);

                    # Associate the report with its FASTA and GenBank files.
                    $assemblyReportList{$fileReport} = $fileFastaGenbank;

                    $quantity -= 1 if $quantity;
                }
            }
        }

        # Evaluated after every line (comment lines included): stop as soon
        # as the requested number of assemblies is reached.
        last if defined $quantity && $quantity == 0;
    }
    close($summaryFh) or die "close file error : $!";

    if (!keys %assemblyReportList) {
        print "##################################################################\n";
        print "No results were found for the following query:\n";
        print "perl $0 @ARGV\n";
        print "##################################################################\n\n";

        if ($actualOS =~ /unix/i) {
            # Only call unlink when there is something to delete: with no
            # matching file unlink returns 0 and the script would die here.
            my @staleFiles = grep { -e $_ } glob("*.dmp *.gz");
            if (@staleFiles) {
                unlink(@staleFiles) or die "for file *.dmp *.gz $!:";
            }
        }

        if (empty_folder($kingdomRep)) { rmdir $kingdomRep or die "fail remove directory $!:"; }
        rmdir $repositoryAssembly or die "failed to remove directory $!:";
        rmdir $repositoryFNA      or die "failed to remove directory $!:";
        rmdir $repositoryGenbank  or die "failed to remove directory $!:";
        rmdir $repositoryReport   or die "failed to remove directory $!:";

        if ($components) {
            for my $componentRep (values %componentsRepHash) {
                rmdir $componentRep or die "failed to remove directory $!:";
            }
        }

        if ($log) {
            print LOG "...No results were found for the following query: ";
            print LOG "perl $0 @ARGV \n";
        }

        return;
    }

    if ($log) {
        print LOG "...Results were found for the following query: ";
        print LOG "perl $0 @ARGV \n";
    }

    # Write summary files
    my %componentsSumHash;
    my @keysList    = keys %assemblyReportList;
    my $summary     = "summary.xls";
    my $htmlSummary = "summary.html";

    if ($components) {
        for my $component (split /,/, $components) {
            $componentsSumHash{$component} = $component . "_summary.xls";
        }
    }

    # The column titles of the summary come from the "key: value" lines of
    # one downloaded assembly report (the first key of the hash).
    my $fileReport = "." . $fldSep . $repositoryReport . $fldSep . $keysList[0];
    my @header = ();

    open(my $reportFh, "<", $fileReport) or die "error open file : $!";
    while (my $reportLine = <$reportFh>) {
        chomp $reportLine;
        if ($reportLine =~ /:/) {
            $reportLine =~ s/^#*//;
            my @ligne = split /:/, $reportLine;
            push(@header, $ligne[0]);
        }
    }
    close($reportFh) or die "error close file : $!";

    open(my $headFh, ">", $summary) or die " error open file : $!";
    print {$headFh} map { $_ . "\t" } @header;
    print {$headFh} "Pubmed\tNucleScore\tClassification\t";
    print {$headFh} "Country\tHost\tIsolation source\tA percent\t";
    print {$headFh} "T percent\tG percent\tC percent\tN percent\tGC percent\t";
    print {$headFh} "ATGC ratio\tLength\tShape\n";
    close($headFh) or die "error close file : $!";

    if ($components) {
        foreach my $componentSummary (values %componentsSumHash) {
            open(my $componentFh, ">>", $componentSummary) or die "error open file : $!";
            print {$componentFh} "Id\tAssembly\tDescription\tLength\tStatus\tLevel\t";
            print {$componentFh} "GC percent\tA percent\tT percent\tG percent\tC percent\n";
            close($componentFh) or die "error close file : $!";
        }
    }

    for my $file (@keysList) {
        my $reportFile = $repositoryReport . $fldSep . $file;
        my ($extFasta, $extGenbank) = split /,/, $assemblyReportList{$file};
        my $fnaFile     = $repositoryFNA . $fldSep . $extFasta;
        my $genbankFile = $repositoryGenbank . $fldSep . $extGenbank;

        write_assembly($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly,
            \%componentsSumHash, $kingdom, $actualOS, \@header, $log);

        print LOG "...Call write_assembly subroutine...\n" if $log;
    }

    write_html_summary($summary);

    print LOG "...Call write_html_summary subroutine...\n" if $log;

    if ($components) {
        my @componentList = keys %componentsSumHash;
        my %componentFastaHash = create_component_sequence_file($fldSep, $repositoryFNA, \@componentList);

        # $log is a plain flag in this function (all logging goes through the
        # LOG handle); calling a method on it would abort the script.
        if ($log && %componentFastaHash) {
            print LOG "...Call create_component_sequence_file subroutine...\n";
        }

        foreach my $component (keys %componentFastaHash) {
            move($componentFastaHash{$component}, $componentsRepHash{$component}) or die "move failed: $!";
        }
    }

    # Gather everything in the query folder
    move($summary, $repositoryAssembly) or die "move failed: $!";
    move($htmlSummary, $repositoryAssembly) or die "move failed: $!";
    move($repositoryFNA, $repositoryAssembly . $fldSep . $repositoryFNA) or die "move failed: $!";
    move($repositoryGenbank, $repositoryAssembly . $fldSep . $repositoryGenbank) or die "move failed: $!";
    move($repositoryReport, $repositoryAssembly . $fldSep . $repositoryReport) or die "move failed: $!";

    print LOG "...move fasta, genbank and report to query folder \n" if $log;

    if ($components) {
        for my $component (keys %componentsSumHash) {
            move($componentsSumHash{$component}, $componentsRepHash{$component}) or die "move failed: $!";
            move($componentsRepHash{$component}, $repositoryAssembly . $fldSep . $componentsRepHash{$component}) or die "move failed: $!";
        }

        print LOG "...move component files to folders\n" if $log;
    }

    # Finally move the query folder into the main folder
    move($repositoryAssembly, $kingdomRep . $fldSep . $repositoryAssembly) or die "move failed: $!";

    print LOG "...move query folder to main folder\n" if $log;

    # Remove downloaded archives; skip unlink entirely when nothing is left,
    # otherwise its 0 return value would be mistaken for a failure.
    my @leftovers = grep { -e $_ } glob("*.gz sequence.txt");
    if (@leftovers) {
        unlink(@leftovers) or die "$!: for file *.gz sequence.txt";
    }

    return;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
695 #write general assembly file
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
696 sub write_assembly {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
697 my ($reportFile, $fnaFile, $genbankFile, $summary, $repositoryAssembly,
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
698 $componentsSumHashRef, $kingdom, $actualOS, $headerRef, $log) = @_;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
699
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
700 my %componentsSumHash = %{$componentsSumHashRef};
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
701 my @header = @{$headerRef};
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
702 my %hashInformations = ();
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
703 my $seq = "";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
704 my $genomeName = "";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
705 my $country = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
706 my $GCpercent = -1;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
707 my $taxId = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
708 my $assemblyLine;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
709 my $pubmedId = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
710 my $host = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
711 my $isoSource = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
712 # my $species = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
713 # my $genus = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
714 # my $family = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
715 # my $order = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
716 # my $class = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
717 # my $phylum = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
718 my $shape = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
719 my $geneNumber = "na";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
720
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
721 open(REP, "<", $reportFile) or die "error open file $!:";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
722 while (<REP>) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
723 chomp;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
724 $_ =~ s/^#*//;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
725 if ($_ =~ /assembled-molecule/i) { $assemblyLine = $_; }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
726 if ($_ =~ /:/) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
727 my @line = split /:/, $_;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
728 if ($line[1]) { $hashInformations{$line[0]} = trim($line[1]); }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
729 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
730 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
731 close(REP) or die "error close file $!:";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
732
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
733
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
734 open(SUM, ">>", $summary) or die "error open file $!:";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
735 foreach my $k(@header) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
736 if (grep $_ eq $k, keys %hashInformations) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
737
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
738 my $information = $hashInformations{$k};
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
739
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
740 if ($k =~ /Assembly name/i) { $genomeName = $information; }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
741
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
742 if ($information =~ /^\s*$/) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
743 print SUM "na\t";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
744 } else {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
745 print SUM $information . "\t";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
746 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
747 } else {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
748 print SUM "na\t";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
749 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
750 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
751 close(SUM) or die "error close file $!:";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
752
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
753 open(FNA, "<", $fnaFile) or die "Could not open $!:";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
754 while (<FNA>) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
755 chomp;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
756 if ($_ !~ /^>/) { $seq .= $_; }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
757 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
758 close(FNA) or die "error close file :$!";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
759
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
760 foreach my $summaryKey (keys %hashInformations) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
761 if ($summaryKey =~ /taxid/i) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
762 $taxId = $hashInformations{$summaryKey};
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
763 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
764 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
765
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
766 my $classification = get_taxonomic_rank_genbank($genbankFile);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
767
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
768 if ($log) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
769 print LOG "...get taxonomic rank from genbank file\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
770 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
771
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
772 $GCpercent = gc_percent($seq);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
773 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($fnaFile);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
774 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
775 my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
776
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
777 my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
778
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
779 my $variance = shift_data_variance(@percentList);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
780 my $nucleScore = nucle_score($variance, $GCpercent, $atgcRatio, $length);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
781
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
782 if ($log) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
783 print LOG "compute GC percent nucleotid percent ATGC ratio NucleScore\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
784 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
785
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
786 open(GBFF, "<", $genbankFile) or die "Error open file $!:";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
787 while(<GBFF>) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
788 chomp;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
789 if ($_ =~ /\/country="(.*)"/) { $country = trim($1); }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
790 if ($_ =~ /PUBMED(.*)/) { $pubmedId = trim($1); }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
791 if ($_ =~ /\/host="(.*)"/) { $host = trim($1); }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
792 if ($_ =~ /\/isolation_source="(.*)"/) { $isoSource = trim($1); }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
793 if ($_ =~ /\(Genes \(total\)\s+::(.*)/) { $geneNumber = trim($1); }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
794 if ($_ =~ /LOCUS.*\s+([a-z]{1,})\s+[a-z]{1,}\s+[0-9]{2,}-[a-z]{1,}-[0-9]{4,}$/i) { $shape = trim($1); }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
795 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
796 close(GBFF) or die "error close file $!:";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
797
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
798
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
799 open(SUM, ">>", $summary) or die "error open file $!:";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
800 print SUM $pubmedId . "\t" . $nucleScore . "\t" . $classification ."\t" ;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
801 print SUM $country . "\t" . $host . "\t" . $isoSource . "\t" . $aPercent . "\t" ;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
802 print SUM $tPercent . "\t" . $gPercent . "\t" . $cPercent ."\t" . $nPercent . "\t";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
803 print SUM $GCpercent ."\t". $atgcRatio ."\t". $length . "\t". $shape."\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
804 close(SUM) or die "error close file: $!";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
805
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
806 if (%componentsSumHash) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
807 write_assembly_component($fnaFile, $genomeName, \%componentsSumHash, $log);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
808 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
809 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
810 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
811 # get assembly component
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
sub write_assembly_component {
    # Scan every record of a (multi-)FASTA file and, for each component
    # pattern that matches the first comma-separated field of the record
    # description, append one tab-separated statistics line to the summary
    # file registered for that component.
    #
    # Parameters:
    #   $fnaFile              - path of the FASTA file to scan
    #   $genomeName           - assembly name written in the second column
    #   $componentsSumHashRef - hash ref: component pattern => summary file
    #   $log                  - true to write progress messages to LOG
    my ($fnaFile, $genomeName, $componentsSumHashRef, $log) = @_;

    my %componentsSumHash = %{$componentsSumHashRef};
    my $tmp_fasta_file    = "sequence.txt";

    if ($log) {
        print LOG "...search specific components\n";
    }

    # extract sequence details
    my $seqIO = Bio::SeqIO->new(-format => 'Fasta', -file => $fnaFile);

    while (my $seq = $seqIO->next_seq()) {
        my $seqID     = $seq->id;        # ID of sequence
        my $seqDesc   = $seq->desc;      # Description of sequence
        my $globalSeq = $seq->seq;
        my $seqLength = $seq->length();

        # number_nuc_length_seq() is called with a file path, so the raw
        # sequence of this record is written to a scratch file first.
        # NOTE(review): sequence.txt is left in the working directory;
        # confirm whether another part of the program removes it.
        open(my $tseq, ">", $tmp_fasta_file) or die "Error open file: $!";
        print {$tseq} $globalSeq;
        close($tseq) or die "Error close file: $!";

        my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($tmp_fasta_file);
        my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) =
            nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);
        my $gcpercent = gc_percent($globalSeq);

        my @desc  = split /,/, (defined $seqDesc ? $seqDesc : "");
        my $title = defined $desc[0] ? $desc[0] : "";

        # Reset the level for every record: a description without a second
        # comma-separated field must report "na", not the level of the
        # previous record.
        my $level = $desc[1] ? $desc[1] : "na";

        foreach my $component (keys %componentsSumHash) {
            # $component is interpolated as a regular expression
            next unless $title =~ /$component/;

            my $status = $component;
            my $info = join("\t",
                $seqID, $genomeName, $seqDesc, $seqLength, $status, $level,
                $gcpercent, $aPercent, $tPercent, $gPercent, $cPercent) . "\n";
            add_to_file($componentsSumHash{$component}, $info);

            if ($log) {
                print LOG "...found component $component \n";
            }
        }
    }
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
865 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
866 # download fasta sequence and report on ENA with assembly ID
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
sub get_fasta_and_report_sequence_ena_assembly {
    # Download the FASTA sequences and the text reports of every chromosome
    # listed in the ENA XML record of an assembly accession.
    #
    # Parameter:
    #   $sequenceID - assembly accession used to build the ENA URLs
    # Returns:
    #   ($fasta_file, $report_file) - "<ID>.fasta" and "<ID>_report.txt",
    #   both written in the current working directory.
    #
    # NOTE(review): get() is presumably LWP::Simple::get, which returns undef
    # when the request fails - confirm against the file's imports. A failed
    # download is reported with a warning and treated as empty content so the
    # best-effort flow of the original is kept.
    my ($sequenceID) = @_;
    my @id_list = ();

    my $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=xml";
    my $output = get($url);
    if (!defined $output) {
        warn "warning: no data downloaded from $url\n";
        $output = "";
    }

    # Collect chromosome accessions directly from the downloaded XML; no
    # scratch file is needed for this.
    for my $xml_line (split /\n/, $output) {
        while ($xml_line =~ /<CHROMOSOME accession="([^"]+)">/g) {
            push(@id_list, $1);
        }
    }

    my $id_chain = join(",", @id_list);
    $url = "https://www.ebi.ac.uk/ena/data/view/$id_chain&display=fasta";
    $output = get($url);
    if (!defined $output) {
        warn "warning: no data downloaded from $url\n";
        $output = "";
    }

    my $fasta_file = $sequenceID . ".fasta";
    open(my $fasta_out, ">", $fasta_file) or die("could not open $fasta_file: $!");
    print {$fasta_out} $output;
    close($fasta_out) or die("could not close $fasta_file: $!");

    # One report per chromosome, appended to a single file and separated by
    # a line of '#'. The file is only created when at least one accession
    # was found.
    my $report_file = $sequenceID . "_report.txt";
    for my $id (@id_list) {
        $url = "https://www.ebi.ac.uk/ena/data/view/$id&display=text&header=true";
        $output = get($url);
        if (!defined $output) {
            warn "warning: no data downloaded from $url\n";
            $output = "";
        }
        open(my $report_out, ">>", $report_file) or die("could not open $report_file: $!");
        print {$report_out} $output;
        print {$report_out} "##########################################################################\n\n";
        close($report_out) or die("could not close $report_file: $!");
    }

    return ($fasta_file, $report_file);
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
912 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
913 # download ENA
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
sub sequence_ena {
    # Download the FASTA and report files of one ENA entry, choosing the
    # download routine from the shape of the identifier, and trace each step
    # in LOG when logging is enabled.
    #
    # Parameters:
    #   $sequenceID - ENA identifier
    #   $log        - true to write progress messages to LOG
    my ($sequenceID, $log) = @_;

    print LOG "...ENA sequence downloading ...\n" if $log;

    # Identifiers starting with "GCA_" go through the assembly routine,
    # everything else through the generic one.
    my $is_assembly = ($sequenceID =~ /^GCA_.*/);
    my ($fastaFile, $reportFile) = $is_assembly
        ? get_fasta_and_report_sequence_ena_assembly($sequenceID)
        : get_fasta_and_report_sequence_ena_other($sequenceID);

    print LOG "...Downloading of ENA FASTA and report files for sequence: $sequenceID\n" if $log;

    # The step that moved both files into a per-sequence folder is disabled;
    # only its log message remains.
    print LOG "...Move fasta and report files to folder\n" if $log;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
946 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
947 # download fasta sequence and report on ENA with ENA ID
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
sub get_fasta_and_report_sequence_ena_other {
    # Download the FASTA sequence and the text report of a single ENA entry.
    #
    # Parameter:
    #   $sequenceID - ENA identifier used to build the URLs
    # Returns:
    #   ($fasta_file, $report_file) - "<ID>.fasta" (overwritten) and
    #   "<ID>_report.txt" (appended to), both in the working directory.
    #
    # NOTE(review): get() is presumably LWP::Simple::get, which returns undef
    # when the request fails - confirm against the file's imports. A failed
    # download is reported with a warning and treated as empty content so the
    # best-effort flow of the original is kept.
    my ($sequenceID) = @_;

    my $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=fasta";
    my $output = get($url);
    if (!defined $output) {
        warn "warning: no data downloaded from $url\n";
        $output = "";
    }
    my $fasta_file = $sequenceID . ".fasta";
    open(my $fasta_out, ">", $fasta_file) or die("could not open $fasta_file: $!");
    print {$fasta_out} $output;
    close($fasta_out) or die "could not close $fasta_file: $!";

    $url = "https://www.ebi.ac.uk/ena/data/view/$sequenceID&display=text&header=true";
    $output = get($url);
    if (!defined $output) {
        warn "warning: no data downloaded from $url\n";
        $output = "";
    }
    my $report_file = $sequenceID . "_report.txt";
    # Append mode is kept from the original implementation.
    open(my $report_out, ">>", $report_file) or die "could not open $report_file: $!";
    print {$report_out} $output;
    close($report_out) or die "could not close $report_file: $!";

    return ($fasta_file, $report_file);
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
983 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
984 # add information to file
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
sub add_to_file {
    # Append a string to a file, creating the file when it does not exist.
    #
    # Parameters:
    #   $file - path of the file to append to
    #   $info - text to append (written as-is, no newline added)
    # Returns 1 on success; dies with the file name and the system error
    # when the file cannot be opened or the buffered write fails at close.
    my ($file, $info) = @_;

    # A lexical handle avoids clashing with the global FILE handle used by
    # other subs of this script.
    open(my $out, ">>", $file) or die("Could not open $file: $!");
    print {$out} $info;
    close($out) or die("Could not close $file: $!");

    return 1;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
991 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
992 # return taxonomic rank of species by tax id
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
sub get_taxonomic_rank {
    # Look up a taxonomy identifier in a '|'-separated lineage file and
    # return its ranks.
    #
    # Parameters:
    #   $tax_id         - numeric taxonomy identifier to search for
    #   $taxonomic_file - path of the lineage file
    # Returns:
    #   a six-element list (species, genus, family, order, class, phylum);
    #   every rank that is missing or empty is "na". When no row matches the
    #   identifier the list is all "na" (the original fell off the end of the
    #   sub and returned the result of close() instead).
    #
    # NOTE(review): the column positions used below look like the NCBI
    # rankedlineage.dmp layout (tax_id | name | species | genus | ...) -
    # confirm against the file actually supplied by the caller.
    my ($tax_id, $taxonomic_file) = @_;

    my @tmp_array = ("na") x 6;

    open(my $tfile, "<", $taxonomic_file) or
        die("Could not open $taxonomic_file: $!");

    while (my $row = <$tfile>) {
        chomp $row;
        my @tax_info = split(/\|/, $row);

        # Blank rows have no identifier field at all.
        next unless defined $tax_info[0];
        next unless $tax_info[0] == $tax_id;

        @tax_info = trim_array(@tax_info);

        # The second column is reported in the species slot.
        if (defined $tax_info[1] && length($tax_info[1]) > 0) {
            $tmp_array[0] = $tax_info[1];
        }

        # Drop the first three columns so that the remaining ones line up
        # with slots 1..5 of @tmp_array.
        splice(@tax_info, 0, 3);
        for my $i (1 .. $#tmp_array) {
            my $rank = $tax_info[$i - 1];
            if (defined $rank && length($rank) > 0) {
                $tmp_array[$i] = $rank;
            }
        }
        last;
    }
    close($tfile) or die "error close $taxonomic_file $!:";

    return @tmp_array;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1027 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1028 # write html summary file
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
sub write_html_summary {
    # Build "summary.html" in the working directory from a tab-separated
    # summary file: an HTML preamble, one table per data row (delegated to
    # write_html_table), then the closing tags.
    #
    # Parameter:
    #   $summary - path of the TSV summary; its first line is the header
    my ($summary) = @_;
    my $htmlFile = "summary.html";

    my @preamble = (
        "<!DOCTYPE html>\n",
        "<html>\n",
        " <head>\n",
        " <title>Assembly summary</title>\n",
        " </head>\n",
        " <body>\n",
        " <h2>Assembly Summary</h2>\n",
    );

    # The page is (re)created here and closed straight away because
    # write_html_table appends to the same file by name.
    open(my $html_out, ">", $htmlFile) or die "error open HTML summary $!";
    for my $html_line (@preamble) {
        print {$html_out} $html_line;
    }
    close($html_out) or die "error close HTML summary $!";

    open(my $sum_in, "<", $summary) or die "error open tsv summary $!";
    my @rows = <$sum_in>;
    close($sum_in) or die "error close tsv summary $!";

    # First line holds the column names, the rest are data rows.
    my $header = shift @rows;

    for my $row (@rows) {
        write_html_table($row, $htmlFile, $header);
    }

    open($html_out, ">>", $htmlFile) or die "error open HTML summary $!";
    for my $html_line (" </body>\n", "</html>\n") {
        print {$html_out} $html_line;
    }
    close($html_out) or die "error close HTML summary $!";
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1060 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1061 # write html table for summary
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
sub write_html_table {
    # Append the opening tag of one table to the HTML file, then let
    # add_table_content write the cells for this summary row.
    #
    # Parameters:
    #   $line     - one tab-separated data row of the summary
    #   $htmlFile - path of the HTML file being built
    #   $header   - tab-separated header line of the summary
    my ($line, $htmlFile, $header) = @_;

    my $table_open = " <table border=\"1\" style=\"margin-bottom: 20px;\">\n";

    # The handle is closed before delegating because add_table_content
    # receives the file name, not a handle.
    open(my $html_out, ">>", $htmlFile) or die "error open HTML summary $!";
    print {$html_out} $table_open;
    close($html_out) or die "error close HTML summary $!";

    return add_table_content($line, $htmlFile, $header);
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1070 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1071 # add information to table
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1072 sub add_table_content {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1073 my ($line, $htmlFile, $headers) = @_;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1074
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1075 my @assemblyHeader = split(/\t/, $headers);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1076 my @assemblyInfo = split(/\t/, $line);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1077 my %hashHeaderInfo;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1078 my $nbOfCell = 7;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1079 my $fullLine = floor(($#assemblyHeader + 1) / $nbOfCell);
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1080 my $restCell = $#assemblyHeader + 1 - $fullLine * $nbOfCell;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1081
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1082
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1083 for (my $i = 0; $i < $#assemblyHeader + 1; $i++) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1084 $hashHeaderInfo{trim($assemblyHeader[$i])} = $assemblyInfo[$i];
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1085 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1086
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1087 my @keysHeaderInfo = keys %hashHeaderInfo;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1088 my $cellIndex = 0;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1089
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1090 open(HTML, ">>", $htmlFile) or die "error open HTML summary $!";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1091 for (my $turn = 0; $turn < $fullLine; $turn++) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1092
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1093 print HTML " <tr>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1094 for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1095 print HTML " <th>$header</th>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1096 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1097 print HTML " </tr>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1098
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1099 print HTML " <tr>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1100 for my $header (@assemblyHeader[$cellIndex..$cellIndex + $nbOfCell - 1]) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1101 if ($header =~ /PUBMED/i && $hashHeaderInfo{$header} ne "na") {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1102 print HTML " <td><a href=https://www.ncbi.nlm.nih.gov/pubmed/?term=".
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1103 "$hashHeaderInfo{$header} target=\"_blank\">$hashHeaderInfo{trim($header)}</a></td>";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1104 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1105 else {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1106 print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1107 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1108 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1109 print HTML " </tr>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1110
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1111 $cellIndex += $nbOfCell;
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1112 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1113
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1114 print HTML " <tr>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1115 for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1116 print HTML " <th>$header</th>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1117 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1118 print HTML " <tr>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1119
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1120 print HTML " <tr>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1121 for my $header(@assemblyHeader[$cellIndex..$#keysHeaderInfo]) {
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1122 print HTML " <td>$hashHeaderInfo{trim($header)}</td>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1123 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1124 print HTML " <tr>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1125
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1126 print HTML " </table>\n";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1127 close(HTML) or die "error close HTML summary $!";
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1128 }
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1129 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1130 #getTaxonomicRanks (function allowing to get taxonomic ranks from Genbank file)
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Return the taxonomic classification stored in a GenBank file.
#
# Parameters:
#   $genbank - path of the GenBank file; it is handed to Bio::SeqIO without an
#              explicit -format, so Bio::SeqIO selects the parser itself.
# Returns:
#   the elements returned by species->classification for the FIRST record of
#   the file, joined with "," into one string.
#   (presumably ordered from the most specific rank to the kingdom - TODO
#   confirm against the BioPerl documentation)
# Dies when the file holds no record or the record has no species data.
sub get_taxonomic_rank_genbank {
    my ($genbank) = @_;

    my $seqio_object = Bio::SeqIO->new(-file => $genbank);

    # Only the first record of the file is inspected.
    my $seq_object = $seqio_object->next_seq
        or die "no sequence record found in $genbank";

    my $species_object = $seq_object->species
        or die "no species information found in $genbank";

    # All taxa of the ORGANISM section, as a list.
    my @classification = $species_object->classification;

    return join(",", @classification);
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1158 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1159 #add all sequences components to file
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Collect, per component, every sequence whose description matches the
# component and append it to "<component>.fasta".
#
# Parameters:
#   $fldSep           - path separator placed between $repository and a file
#                       name
#   $repository       - directory scanned for files whose name ends in "fna"
#   $listComponentRef - reference to an array of component names; each name is
#                       used as a regular expression against the description
#                       of every sequence (NOTE(review): the name is NOT
#                       escaped, so regex metacharacters are interpreted -
#                       confirm this is intended by the callers)
# Returns:
#   a hash (as a flat list) mapping each component to its "<component>.fasta"
#   file name, limited to the components whose file exists once the scan is
#   done.
# Side effects:
#   appends to "<component>.fasta", a path relative to the current working
#   directory; the files are opened in append mode, so content that is already
#   there is kept.
# Dies when the directory or an output file cannot be opened or closed.
sub create_component_sequence_file {
    my ($fldSep, $repository, $listComponentRef) = @_;

    # Number of bases written per FASTA line.
    my $lineWidth = 60;

    my @listComponent = @{$listComponentRef};

    opendir(my $dh, $repository) || die "Can't opendir $repository: $!";
    my @listFnaFile = grep { /fna$/ } readdir($dh);
    closedir $dh;

    my %componentFastaHash;

    foreach my $component (@listComponent) {
        my $componentFasta = $component . ".fasta";

        foreach my $fnaFile (@listFnaFile) {
            my $seqIO = Bio::SeqIO->new(
                -format => 'Fasta',
                -file   => $repository . $fldSep . $fnaFile,
            );

            while (my $seq = $seqIO->next_seq()) {
                my $seqDesc = $seq->desc;
                next unless $seqDesc =~ /$component/;

                my $seqID  = $seq->id;
                my $seqNuc = $seq->seq;

                # A lexical handle keeps this sub from sharing a global FASTA
                # handle with the other subs of the file.
                open(my $fastaOut, ">>", $componentFasta)
                    or die "error open file $componentFasta: $!";
                print {$fastaOut} ">$seqID $seqDesc\n";

                # Cut the sequence into chunks of $lineWidth characters in one
                # pass instead of building a one-element-per-base array.
                foreach my $subSeqNuc (unpack("(a$lineWidth)*", $seqNuc)) {
                    print {$fastaOut} "$subSeqNuc\n";
                }
                close($fastaOut)
                    or die "error close file $componentFasta: $!";
            }
        }

        if (-e $componentFasta) {
            $componentFastaHash{$component} = $componentFasta;
        }
    }

    return %componentFastaHash;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1217 # remove back and front spaces
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Return a copy of the given string without its leading and trailing
# whitespace; the caller's variable is left untouched.
sub trim {
    my ($text) = @_;

    # Alias $_ to the local copy so both substitutions act on it.
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1224 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1225 # use trim in array
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Apply trim() to every element of the argument list and return the trimmed
# copies in the same order; the caller's values are not modified.
sub trim_array {
    my @trimmed = map { trim($_) } @_;
    return @trimmed;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1233 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1234 # check if folder is empty
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Inspect a directory and report on its content.
#
# Parameters:
#   the directory path (first argument)
# Returns:
#   0  when the directory holds no entry besides "." and "..";
#   Perl's canonical false value (empty string / numeric 0) otherwise.
# NOTE(review): both outcomes are false in boolean context, so a plain
# truth test on the result cannot tell an empty folder from a filled one -
# verify how the callers interpret the value before changing it.
# Dies when the path cannot be opened as a directory.
sub empty_folder {
    my $dirname = shift;

    opendir(my $dholder, $dirname) or die "error not a directory";
    my $entryCount = grep { $_ ne "." && $_ ne ".." } readdir($dholder);
    closedir($dholder);

    return $entryCount if $entryCount == 0;

    # Same value the failed "== 0" comparison used to yield implicitly.
    return !1;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1241 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1242 # number nucleotid and length
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Count the nucleotides of a FASTA file.
#
# Parameters:
#   $fastaFile - path of the file to read
# Returns:
#   the list ($ade, $thy, $gua, $cyt, $n, $length): the case-insensitive
#   counts of A, T, G, C and N, followed by the total number of characters
#   found on the sequence lines (every character counts towards the length,
#   not only A/T/G/C/N).
# Lines containing ">" are treated as headers and skipped.
# Dies when the file cannot be opened or closed.
sub number_nuc_length_seq {
    my ($fastaFile) = @_;

    my ($ade, $thy, $gua, $cyt, $n, $length) = (0, 0, 0, 0, 0, 0);

    open(my $fasta, "<", $fastaFile) or die "Could not open $fastaFile: $!";
    while (my $line = <$fasta>) {
        # Drop the line ending, including the "\r" of CRLF files, so that it
        # is not counted as part of the sequence.
        $line =~ s/[\r\n]+\z//;

        next if $line =~ />/;

        $length += length $line;

        # tr/// with an empty replacement only counts the listed characters.
        $ade += ($line =~ tr/aA//);
        $thy += ($line =~ tr/tT//);
        $gua += ($line =~ tr/gG//);
        $cyt += ($line =~ tr/cC//);
        $n   += ($line =~ tr/nN//);
    }
    close($fasta) or die "Error close file :$!";

    return ($ade, $thy, $gua, $cyt, $n, $length);
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1272 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1273 # compute percentage of nucleotid
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Convert nucleotide counts into percentages of the sequence length.
#
# Parameters:
#   $ade, $thy, $gua, $cyt, $n - counts of A, T, G, C and N
#   $length                    - total sequence length used as denominator
# Returns:
#   the list ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent).
#   When $length is 0 (or undefined) every percentage is 0, mirroring what
#   gc_percent does for an empty sequence, instead of dying on a division by
#   zero.
sub nucleotid_percent {
    my ($ade, $thy, $gua, $cyt, $n, $length) = @_;

    return (0, 0, 0, 0, 0) unless $length;

    my $adePercent = $ade / $length * 100;
    my $thyPercent = $thy / $length * 100;
    my $guaPercent = $gua / $length * 100;
    my $cytPercent = $cyt / $length * 100;
    my $nPercent   = $n   / $length * 100;

    return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent);
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1286 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1287 # compute ATGC ratio
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Compute the AT/GC ratio: (A + T) divided by (G + C).
#
# Parameters:
#   $ade, $thy, $gua, $cyt - counts of A, T, G and C
# Returns:
#   the ratio as a number.
# No guard is applied: Perl dies with a division-by-zero error when
# $gua + $cyt is 0.
sub atgc_ratio {
    my ($ade, $thy, $gua, $cyt) = @_;

    my $atCount = $ade + $thy;
    my $gcCount = $gua + $cyt;

    return $atCount / $gcCount;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1292 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1293 # variance
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Compute the population variance of a list of numbers with the shifted-data
# formula: every value is offset by the first one ($K) before the sums are
# accumulated.
#
# Parameters:
#   @data - the numbers
# Returns:
#   the variance, dividing by n (not n - 1); 0.0 when fewer than two values
#   are given.
sub shift_data_variance {
    my (@data) = @_;

    return 0.0 if @data < 2;

    my $K = $data[0];

    # Each accumulator gets its own initial value (a single scalar on the
    # right-hand side would only initialise the first variable).
    my ($n, $Ex, $Ex2) = (0, 0.0, 0.0);

    for my $x (@data) {
        my $shifted = $x - $K;
        $n   += 1;
        $Ex  += $shifted;
        $Ex2 += $shifted * $shifted;
    }

    my $variance = ($Ex2 - ($Ex * $Ex) / $n) / $n;

    return $variance;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1313 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1314 # nucle score
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Compute the "nucle score":
#   log2( variance * gcPercent * atgcRatio^3 / sqrt(length) )
# (the ratio is cubed; an earlier variant of the formula used it to the
# power one).
#
# Parameters:
#   $variance, $gcPercent, $atgcRatio, $length - the four terms of the formula
# Returns:
#   the base-2 logarithm computed by log2().
sub nucle_score {
    my ($variance, $gcPercent, $atgcRatio, $length) = @_;

    my $numerator = $variance * $gcPercent * $atgcRatio ** 3;

    return log2($numerator / sqrt($length));
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1320 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Base-2 logarithm of the first argument, derived from the natural logarithm.
sub log2 {
    my ($value) = @_;

    my $naturalLogOfTwo = log(2);

    return log($value) / $naturalLogOfTwo;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1325 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1326 # compute GC percentage
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Percentage of G and C characters in a sequence.
#
# Parameters:
#   $seq - the sequence string; letter case is ignored
# Returns:
#   (G + C) / length * 100, where the length counts every character of the
#   string; 0 for an empty sequence.
sub gc_percent {
    my ($seq) = @_;

    my $total = length($seq);
    return 0 if $total == 0;

    # Count on an upper-cased copy; tr/// with an empty replacement only
    # counts the listed characters.
    my $upperSeq = uc($seq);
    my $gcCount  = ($upperSeq =~ tr/GC//);

    return ($gcCount / $total) * 100;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1348 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1349 # download file from ftp protocol
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Download one file from an FTP server with an anonymous login, in binary
# mode.
#
# Parameters:
#   $servor - host name handed to Net::FTP->new
#   $file   - remote path handed to $ftp->get (no local name is given, so
#             Net::FTP chooses where the copy is stored)
# Returns:
#   the result of $ftp->quit.
# Dies when the connection, the login or the transfer fails.
sub download_file {
    my ($servor, $file) = @_;

    # On failure Net::FTP->new returns undef and leaves the reason in $@.
    my $ftp = Net::FTP->new($servor, Debug => 0)
        or die "Cannot connect to $servor: $@";

    $ftp->login("anonymous", '-anonymous@')
        or die "Cannot login ", $ftp->message;
    $ftp->binary;
    $ftp->get($file) or die "get failed ", $ftp->message;

    $ftp->quit;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1368 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1369 # obtain file directory
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Extract the part of a link that follows the server name.
#
# Parameters:
#   $servor - server name searched for in the link; it is matched literally
#             (\Q...\E), so the dots of a host name no longer match arbitrary
#             characters
#   $link   - full link
# Returns:
#   everything after the first occurrence of $servor up to the end of the
#   line, or an empty string when the link does not contain the server name.
sub obtain_file {
    my ($servor, $link) = @_;

    if ($link =~ /\Q$servor\E(.*)/) {
        return ($1);
    }

    return '';
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1374 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1375 # download fastq file from ENA
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Download the FASTQ files of a run accession from an ENA FTP server into the
# local folder "<sraId>_folder".
#
# Parameters:
#   $enaFtpServor - host name handed to Net::FTP->new
#   $sraId        - run accession; it selects the remote directory and the
#                   "<sraId>*" listing pattern
#   $log          - when true, progress messages are printed to the LOG file
#                   handle (opened elsewhere in the file)
# Returns:
#   the result of $ftp->quit.
# Side effects:
#   unless the remote listing contains an empty entry, an existing
#   "<sraId>_folder" is removed and recreated, and every listed file is
#   fetched into the current directory and then moved into that folder.
# Dies when the connection, login, directory change, folder creation,
# transfer or move fails.
sub download_ena_fastq {
    my ($enaFtpServor, $sraId, $log) = @_;

    my $fastqRep = $sraId . "_folder";

    print LOG "...Downloading fastq file from ENA\n" if $log;

    my $fastqDir = _ena_fastq_dir($sraId);

    print LOG "...recreate database folder path for FASTQ downloading\n" if $log;

    # On failure Net::FTP->new returns undef and leaves the reason in $@.
    my $ftp = Net::FTP->new($enaFtpServor, Debug => 0)
        or die "Cannot connect to $enaFtpServor: $@";

    $ftp->login("anonymous", '-anonymous@')
        or die "Cannot login ", $ftp->message;
    $ftp->binary;

    $ftp->cwd($fastqDir)
        or die "maybe undefined sequence id, can't go to $fastqDir: ", $ftp->message;

    my @fastqFiles = $ftp->ls("$sraId*");

    print LOG "...Searching fastq files in path\n" if $log;

    if (!grep(/^$/, @fastqFiles)) {

        if (-d $fastqRep) { rmtree($fastqRep) }

        # Without the folder, move() would rename every download onto the
        # same plain file, so a failed mkdir must stop the run.
        mkdir($fastqRep) or die "cannot create folder $fastqRep: $!";

        foreach my $fastqFile (@fastqFiles) {
            $ftp->get($fastqFile) or die "get failed ", $ftp->message;
            move($fastqFile, $fastqRep) or die "move failed: $!";
        }

        print LOG "...Finalizing download of FASTQ file\n" if $log;
    }

    print LOG "End of download\n" if $log;

    $ftp->quit;
}

# Build the remote FASTQ directory of a run accession.
#   "/vol1/fastq/<first 6 chars>/<id>/"        when 6 characters follow the
#                                              3-character prefix;
#   "/vol1/fastq/<first 6 chars>/<sub>/<id>/"  when more than 6 follow, <sub>
#       being the last 1-3 extra characters left-padded with "0" to 3;
#   "/vol1/fastq/"                             when fewer than 6 follow.
sub _ena_fastq_dir {
    my ($sraId) = @_;

    my $fastqDir = "/vol1/fastq/";
    my $dir1     = substr $sraId, 0, 6;
    my $digits   = substr $sraId, 3;

    my $digitsLength = length $digits;
    $digitsLength = 0 unless defined $digitsLength;
    my $extra = $digitsLength - 6;

    return $fastqDir if $extra < 0;
    return $fastqDir . $dir1 . "/" . $sraId . "/" if $extra == 0;

    # At most three trailing characters take part in the sub-directory name.
    my $kept = $extra > 3 ? 3 : $extra;
    my $dir2 = ("0" x (3 - $kept)) . substr($digits, -$kept);

    return $fastqDir . $dir1 . "/" . $dir2 . "/" . $sraId . "/";
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1459 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1460 # look up an assembly (GCA_/GCF_) or project (PRJ) identifier in the summary file
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Download the FASTA (.fna), GenBank (.gbff) and assembly report files of every
# row of an assembly summary file that matches one identifier, then sort them
# into <sequence>/Assembly, <sequence>/GenBank and <sequence>/Report.
#
# Parameters:
#   $file      - path of the local, tab-separated assembly summary file
#   $sequence  - assembly accession (GCA_.../GCF_...) or project id (PRJ...)
#   $ftpServor - server handle/name handed to obtain_file() and download_file()
#   $fldSep    - folder separator used to build the destination paths
#   $log       - when true, progress messages are printed to the LOG handle
#
# Returns nothing useful. Identifiers that are neither an accession nor a
# project are ignored. Dies when the summary cannot be opened/closed or when a
# gunzip, move or archive removal fails.
sub get_assembly_or_project {
    my ($file, $sequence, $ftpServor, $fldSep, $log) = @_;

    my $pattern;
    my $indexInfo;
    my %folderHash;

    # Repository for fna file
    my $repositoryFNA = "Assembly";

    # Repository for genbank file
    my $repositoryGenbank = "GenBank";

    # Repository for report file
    my $repositoryReport = "Report";

    # global repository
    my $repositorySequence = $sequence;

    return unless defined $sequence;

    if ($sequence =~ /^GC[AF]_(.+)/) {
        # Compare only the part after the GCA_/GCF_ prefix (summary column 1),
        # quoted so that "." in the version is not a regex wildcard.
        $indexInfo = 0;
        $pattern = qr/\Q$1\E/;
    }
    elsif ($sequence =~ /^PRJ/) {
        # Projects (summary column 2) must match exactly, otherwise PRJNA12
        # would also select PRJNA123.
        $indexInfo = 1;
        $pattern = qr/\A\Q$sequence\E\z/;
    }
    else {
        # Without this guard an undefined pattern would be applied to every
        # row of the summary and could trigger unwanted downloads.
        return;
    }

    open(my $summaryFh, '<', $file) or die "error open file $!:";
    while (my $line = <$summaryFh>) {
        chomp $line;
        next if $line =~ /^#/;

        my @infoList = split /\t/, $line;
        next unless defined $infoList[$indexInfo]
            && $infoList[$indexInfo] =~ $pattern;

        # Column 20 holds the remote folder; its last path element is also the
        # prefix of every file name inside that folder.
        my $remoteFolder = $infoList[19];
        next unless defined $remoteFolder
            && length $remoteFolder
            && $remoteFolder ne 'na';

        my @gcfInfo = split(/\//, $remoteFolder);
        my $gcfName = pop(@gcfInfo);

        my $genbankFile = $remoteFolder . "/" . $gcfName . "_genomic.gbff.gz";
        my $dnaFile = $remoteFolder . "/" . $gcfName . "_genomic.fna.gz";
        my $assemblyReport = $remoteFolder . "/" . $gcfName . "_assembly_report.txt";

        $dnaFile = obtain_file($ftpServor, $dnaFile);
        $genbankFile = obtain_file($ftpServor, $genbankFile);
        $assemblyReport = obtain_file($ftpServor, $assemblyReport);

        download_file($ftpServor, $dnaFile);
        download_file($ftpServor, $genbankFile);
        download_file($ftpServor, $assemblyReport);

        # uncompress the sequence file when it was downloaded
        my $fileFasta = $gcfName . "_genomic.fna.gz";
        my $ucpFasta = $gcfName . "_genomic.fna";
        if (-e $fileFasta) {
            gunzip($fileFasta => $ucpFasta) or die "gunzip failed: $GunzipError\n";
            $folderHash{$ucpFasta} = $repositoryFNA;
        }

        # register the assembly report when it was downloaded
        my $fileReport = $gcfName . "_assembly_report.txt";
        if (-e $fileReport) {
            $folderHash{$fileReport} = $repositoryReport;
        }

        # uncompress the genbank file when it was downloaded
        my $fileGenbank = $gcfName . "_genomic.gbff.gz";
        my $ucpGenbank = $gcfName . "_genomic.gbff";
        if (-e $fileGenbank) {
            gunzip($fileGenbank => $ucpGenbank) or die "gunzip failed: $GunzipError\n";
            $folderHash{$ucpGenbank} = $repositoryGenbank;
        }
    }
    close($summaryFh) or die "error close file $!";

    if (keys %folderHash) {
        if (-e $repositorySequence) { rmtree($repositorySequence); }

        if ($log) {
            print LOG "...Download files from GenBank or RefSeq \n";
        }

        mkdir $repositorySequence;
        mkdir $repositoryFNA;
        mkdir $repositoryGenbank;
        mkdir $repositoryReport;

        for my $ucpFile (keys %folderHash) {
            move($ucpFile, $folderHash{$ucpFile}) or die "error move file $!:";
        }
        move($repositoryFNA, $repositorySequence . $fldSep . $repositoryFNA) or die "error move file $!:";
        move($repositoryGenbank, $repositorySequence . $fldSep . $repositoryGenbank) or die "error move file $!:";
        move($repositoryReport, $repositorySequence . $fldSep . $repositoryReport) or die "error move file $!:";

        # Remove leftover archives one by one: a run that produced no .gz file
        # (report only) must not abort, but a real removal failure still does.
        foreach my $archive (glob "*.gz") {
            unlink $archive or die "for file $archive $!:";
        }

        if ($log) {
            print LOG "...move GenBank/RefSeq sequence files to dedicated folders\n";
        }
    }

    return;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1565 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Download the global assembly summary of a database (refseq or genbank) and
# fetch the files of every requested assembly accession or project.
#
# Parameters:
#   $sequenceId - comma-separated list of assembly accessions / project ids
#   $ftpServor  - server handle/name handed to download_file()
#   $fldSep     - folder separator forwarded to get_assembly_or_project()
#   $directory  - database name; must contain "refseq" or "genbank"
#   $log        - logging flag forwarded to get_assembly_or_project()
#
# Dies when $directory names no known database or when the downloaded
# summary file cannot be removed afterwards.
sub download_assembly_or_project {
    my ($sequenceId, $ftpServor, $fldSep, $directory, $log) = @_;

    my $assemblySummary;

    if (defined $directory && $directory =~ /refseq/) {
        $assemblySummary = "assembly_summary_refseq.txt";
    }
    elsif (defined $directory && $directory =~ /genbank/) {
        $assemblySummary = "assembly_summary_genbank.txt";
    }
    else {
        # Fail before any network access: continuing with an undefined summary
        # name would request a bogus path and die later with an unrelated
        # message.
        die "error unknown database directory, expected refseq or genbank:";
    }

    # Tolerate blanks around the commas and drop empty identifiers.
    my @sequenceIdList;
    foreach my $identifier (split /,/, defined $sequenceId ? $sequenceId : '') {
        $identifier =~ s/^\s+//;
        $identifier =~ s/\s+$//;
        push @sequenceIdList, $identifier if length $identifier;
    }

    my $assemblySummaryPath = "/genomes/ASSEMBLY_REPORTS/" . $assemblySummary;
    download_file($ftpServor, $assemblySummaryPath);

    foreach my $sequence (@sequenceIdList) {
        get_assembly_or_project($assemblySummary, $sequence, $ftpServor, $fldSep, $log);
    }

    unlink $assemblySummary or die "error remove file $!:";
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1587 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1588 # check whether a required module is installed
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Tell whether a Perl module is available to this interpreter.
#
# Parameter:
#   $mod - module name such as "Net::FTP"
#
# Returns 1 when the module is already loaded or its .pm file is found in one
# of the @INC directories, 0 otherwise (including for malformed names).
#
# The lookup is done in-process: the name is never handed to a shell, and the
# answer does not depend on perldoc being installed or on the module shipping
# documentation.
sub isModuleInstalled {
    my $mod = shift;

    # Only plain package names are accepted.
    return(0) unless defined $mod && $mod =~ /\A[A-Za-z_]\w*(?:::\w+)*\z/;

    (my $relativePath = $mod . ".pm") =~ s{::}{/}g;

    return(1) if $INC{$relativePath};

    foreach my $libraryDir (@INC) {
        next if ref $libraryDir;    # code/object hooks cannot be probed with -f
        return(1) if -f $libraryDir . "/" . $relativePath;
    }

    return(0);
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1601 #------------------------------------------------------------------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1602 # download assembly summary
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Return the name of the local assembly summary file for a database/kingdom
# pair, downloading it first when no matching file is present in the current
# directory.
#
# Parameters:
#   $database     - database name used in the remote path and the file name
#   $kingdom      - kingdom name used in the remote path and the file name
#   $ftpServor    - server handle/name handed to download_file()
#   $fldSep       - folder separator appended to "." to open the directory
#   $getSummaries - optional comma-separated kingdoms whose existing local
#                   summaries must be deleted and downloaded again
#
# Returns the summary file name (<database>_<kingdom>_assembly_summary.txt for
# a fresh download). Dies when the directory cannot be read or when a summary
# cannot be removed or renamed.
sub download_summaries {
    my ($database, $kingdom, $ftpServor, $fldSep, $getSummaries) = @_;

    my $assemblySummary = "assembly_summary.txt";
    my $assemblySummaryLink;
    my $fileName;

    opendir(my $workingDirectory, "." . $fldSep) or die "error open dir $!:";
    my @filesList = readdir $workingDirectory;
    closedir $workingDirectory;

    if ($getSummaries) {
        foreach my $summaryKingdom (split /,/, $getSummaries) {
            foreach my $file (@filesList) {
                next unless $file =~ /assembly_summary\.txt/i
                    && $file =~ /\Q$summaryKingdom\E/i
                    && $file =~ /\Q$database\E/;

                # The listing was taken before any refresh; skip entries that a
                # previous iteration already removed.
                next unless -e $file;

                unlink $file or die "error remove file $!:";
                $assemblySummaryLink = "/genomes/$database/$summaryKingdom/assembly_summary.txt";
                download_file($ftpServor, $assemblySummaryLink);
                $fileName = $database . "_" . $summaryKingdom . "_" . "assembly_summary.txt";
                rename($assemblySummary, $fileName) or die "error rename file $!:";
                print "replace assembly_summary file\n";
            }
        }
    }

    foreach my $file (@filesList) {
        if (   $file =~ /assembly_summary\.txt/i
            && $file =~ /\Q$kingdom\E/i
            && $file =~ /\Q$database\E/
            && -e $file)    # ignore names deleted by the refresh above
        {
            return $file;
        }
    }

    $assemblySummaryLink = "/genomes/$database/$kingdom/assembly_summary.txt";
    $fileName = $database . "_" . $kingdom . "_" . "assembly_summary.txt";
    download_file($ftpServor, $assemblySummaryLink);

    # A failed rename would hand the caller a file name that does not exist.
    rename($assemblySummary, $fileName) or die "error rename file $!:";

    return $fileName;
}
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
1642 #---------------------
19ae17458c14 Uploaded
dcouvin
parents:
diff changeset
# Print every FASTA record of a file whose header line matches a keyword.
#
# Parameters:
#   $keyword - regular expression searched in the header line of each record
#   $file    - path of the FASTA file to scan
#
# For each matching record a running counter line is printed, followed by
# the record itself with a single leading ">". Returns the result of closing
# the file. Dies when the file cannot be opened.
sub bioseqio {

    my ($keyword, $file) = @_;
    local $/ = "\n>";    # read by FASTA record

    my $count = 0;

    open(my $fastaFh, '<', $file) or die "error open file $!:";
    while (my $seq = <$fastaFh>) {
        chomp $seq;

        # The first record still carries its own ">" (the separator only
        # strips the one that introduces the next record); drop it so the
        # record is not printed as ">>header".
        $seq =~ s/\A>//;

        if ($seq =~ /^>*.*$keyword/) {
            $count++;
            print "\nThe sequence number is: $count \n";
            print ">$seq";
        }
    }
    return close($fastaFh);
}