Mercurial > repos > fgiacomoni > massbank_ws_searchspectrum
comparison massbank_ws_searchspectrum.pl @ 0:023c380900ef draft default tip
Init repository with last massbank_ws_searchspectrum master version
author | fgiacomoni |
---|---|
date | Wed, 19 Apr 2017 11:31:58 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:023c380900ef |
---|---|
1 #!perl | |
2 | |
3 ## script : XXX.pl | |
4 | |
5 ## Notes : | |
6 # -> manage score sorting : Cleaned_pcGroups done but not in outputs ! | |
7 | |
8 #============================================================================= | |
9 # Included modules and versions | |
10 #============================================================================= | |
11 ## Perl modules | |
12 use strict ; | |
13 use warnings ; | |
14 use Carp qw (cluck croak carp) ; | |
15 | |
16 use threads; | |
17 use threads::shared; | |
18 use Thread::Queue; | |
19 | |
20 use Data::Dumper ; | |
21 use Getopt::Long ; | |
22 use POSIX ; | |
23 use FindBin ; ## Allows you to locate the directory of original perl script | |
24 | |
25 ## Specific Perl Modules (PFEM) | |
26 use lib $FindBin::Bin ; | |
27 my $binPath = $FindBin::Bin ; | |
28 use lib::csv qw( :ALL ) ; | |
29 use lib::conf qw( :ALL ) ; | |
30 | |
31 ## Dedicate Perl Modules (Home made...) | |
32 use lib::massbank_api qw( :ALL ) ; | |
33 use lib::threader qw(:ALL) ; | |
34 use lib::mapper qw(:ALL) ; | |
35 use lib::writter qw(:ALL) ; | |
36 use lib::massbank_parser qw(:ALL) ; | |
37 | |
38 | |
39 | |
## Command line parameters : every one stays undefined until GetOptions fills it.
my ( $help, $mzs_file, $col_mz, $col_int, $col_pcgroup, $line_header ) ;
my ( $server, $ion_mode, $score_threshold, $instruments, $max, $unit, $tol, $cutoff ) ;
my ( $output_json, $output_tabular, $output_xlsx, $output_html ) ;

## Local values ONLY FOR TEST :
#my $server = 'JP' ;
#my $threading_threshold = 6 ;

#=============================================================================
#                                Manage EXCEPTIONS
#=============================================================================
## Option table : Getopt::Long specification => variable receiving the value.
my @option_spec = (
    "help|h"            => \$help,             # HELP
    "masses:s"          => \$mzs_file,         # input tabular file
    "col_mz:i"          => \$col_mz,
    "col_int:i"         => \$col_int,          # optional
    "col_pcgroup:i"     => \$col_pcgroup,
    "lineheader:i"      => \$line_header,
    "mode:s"            => \$ion_mode,
    "score_threshold:f" => \$score_threshold,
    "instruments:s"     => \$instruments,      # advanced - comma separated string, split later on
    "max:i"             => \$max,              # advanced
    "unit:s"            => \$unit,             # advanced
    "tolerance:f"       => \$tol,
    "cutoff:i"          => \$cutoff,           # advanced : intensity cutoff
    "server:s"          => \$server,           # advanced
    "output_json:s"     => \$output_json,
    "output_xlsx:s"     => \$output_xlsx,
    "output_tabular:s"  => \$output_tabular,
    "output_html:s"     => \$output_html,
) ;
GetOptions( @option_spec ) ;

## -help or -h : print the usage (the help sub ends the program)
help() if defined $help ;
74 | |
75 #============================================================================= | |
76 # MAIN SCRIPT | |
77 #============================================================================= | |
78 | |
## -------------- Conf file ------------------------ :
## Each *.cfg file found next to the script is parsed in glob order; when several
## files exist, the configuration returned for the last one is the one kept.
my $CONF ;
for my $cfg_file ( glob("$binPath/*.cfg") ) {
    my $oConf = lib::conf::new() ;
    $CONF = $oConf->as_conf($cfg_file) ;
}

## -------------- HTML template file ------------------------ :
## Same rule for the *.tmpl files : the last one found is recorded in the conf.
my @html_templates = glob("$binPath/*.tmpl") ;
$CONF->{'HTML_TEMPLATE'} = $html_templates[-1] if @html_templates ;
88 | |
## Main variables : columns read from the input file ($pcs, $mzs, $into), the raw
## rows kept for the tabular output ($complete_rows) and the pcgroup hash that
## receives the MassBank annotations ($pcgroups).
my ( $pcs, $mzs, $into, $complete_rows, $pcgroups ) ;

## The search only runs when -masses names an existing file.
if ( ( defined $mzs_file ) and ( $mzs_file ne "" ) and ( -e $mzs_file ) ) {

    ## parse the tabular input : pcgroup, mz and (optional) intensity columns
    my $is_header = undef ;
    my $ocsv = lib::csv->new() ;
    my $csv = $ocsv->get_csv_object( "\t" ) ;
    if ( ( defined $line_header ) and ( $line_header > 0 ) ) { $is_header = 'yes' ; }
    $pcs = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_pcgroup, $is_header, $line_header ) ; ## retrieve pc values on csv
    $mzs = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_mz, $is_header, $line_header ) ; ## retrieve mz values on csv
    $complete_rows = undef ;

    if ( defined $col_int ) {
        ## retrieve into values on csv // optional in input files
        $into = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_int, $is_header, $line_header ) ;
    }
    $complete_rows = $ocsv->parse_csv_object($csv, \$mzs_file) ; ## parse all csv for output csv build

    ## input file without intensity column : one default intensity of 10 per mz
    if ( !defined $col_int ) {
        $into = [ (10) x scalar( @{$mzs} ) ] ;
    }

    ## manage instruments string to array_ref
    if ( defined $instruments ) {
        if ( $instruments eq '' ) { ## in xml : can select nothing...
            $instruments = ['all'] ;
        }
        else {
            my @instruments = split(/,/, $instruments) ;
            $instruments = \@instruments ;
        }
    }

    ## Build pcgroups with their features :
    my $omap = lib::mapper->new() ;
    $pcgroups = $omap->get_pcgroups($pcs, $mzs, $into ) ;
    my $pcgroup_list = $omap->get_pcgroup_list($pcs ) ;

#    print Dumper $pcgroups ;

    my $pc_num = scalar( @{$pcgroup_list} ) ;

    ## manage a list of query pc_group dependant:
    if ($pcgroups) {
        ## A worker pool is only usable with a strictly positive threshold : the same
        ## value is also the number of workers, so a missing or null value would
        ## start no thread at all and every query would stay unanswered.
        my $threading_threshold = $CONF->{'THREADING_THRESHOLD'} ;
        my $use_threads = ( ( defined $threading_threshold ) and ( $threading_threshold > 0 ) and ( $pc_num > $threading_threshold ) ) ;

        ## - - - - - - - - - - - - - Multithreading mode if pcgroups > threshold - - - - - - - - - - - - -
        if ($use_threads) {
            print $server."\n" ;
            print "\n------  ** ** ** Using multithreading mode ** ** ** --------\n\n" ;
            my $time_start = time ;

            our $NBTHREADS = $threading_threshold ;

            my $Qworks = Thread::Queue->new();
            my @threads = () ;
            my @queries = () ;
            my @Qresults = () ;

            ## one query per (non empty) pcgroup
            foreach my $pc_group_id (keys %{$pcgroups}) {
                push (@queries, $pcgroups->{$pc_group_id}) if $pcgroups->{$pc_group_id} ;
            }

            ## workers are started first, then fed through the shared queue
            for (1..$NBTHREADS) {
                my $oworker = lib::threader->new ;
                push @threads, threads->create(sub { $oworker->searchSpectrumWorker($Qworks, $server, $ion_mode, $instruments, $max, $unit, $tol, $cutoff) ; } ) ;
            }

            $Qworks->enqueue(@queries);
            ## one undef per worker - presumably the end-of-queue marker read by searchSpectrumWorker
            $Qworks->enqueue(undef) for 1..$NBTHREADS;
            push @Qresults, $_->join foreach @threads;

            my $time_end = time ;
            my $seconds = $time_end-$time_start ;
            print "\n------  Time used in multithreading mode : $seconds seconds --------\n\n" ;

#            print Dumper @Qresults ;

            ## controle number of returned queries :
            my $massbank_results_num = scalar @Qresults ;

            if ( $massbank_results_num == $pc_num ) {
                ## Map @Qresults with annotation hash : pcgroup_id in @Qresults // id in $pcgroups
                foreach my $result (@Qresults) {
                    my $pcgroup_id = $result->{'pcgroup_id'} ;

                    ## a result that cannot be tied to a known pcgroup is reported and skipped,
                    ## so that nothing is stored under an undefined or unknown key
                    if ( ( !defined $pcgroup_id ) or ( $pcgroup_id eq '' ) ) {
                        carp "Carefull : no pcgroup id defined in massbank results\n";
                        next ;
                    }
                    if ( !$pcgroups->{$pcgroup_id} ) {
                        carp "Carefull : no mapping possible between massbank results and initial pcgroups data\n";
                        next ;
                    }

                    ## manage annotation part
                    $pcgroups->{$pcgroup_id}{'annotation'} = $result ;

                    ## manage massbank_ids part
                    if ($result->{'res'}) {
                        my @tmp_res = map {$_->{'id'}} @{$result->{'res'}} ;
                        $pcgroups->{$pcgroup_id}{'massbank_ids'} = \@tmp_res ;
                    }
                }
            }
            else {
                croak "[ERROR] : problem between massbank results number and pcgroups number\n";
            }
        }
        ## - - - - - - - - - - - - - mono thread mode if pcgroups <= threshold - - - - - - - - - - - - -
        else {
            ## connexion
#            print $server."\n" ;
            my $omassbank = lib::massbank_api->new() ;
            my $soap = $omassbank->selectMassBank($server) ;
            print "\n------  ** ** ** Using batch mode ** ** ** --------\n\n" ;
            my $time_start = time ;
            foreach my $pcgroup (keys %{$pcgroups}) {
                ## searchSpectrum via SOAP
                print "Annot pcgroup n-$pcgroup\n" ;
                my $oquery = lib::massbank_api->new() ;
                my ($results, $num) = $oquery->searchSpectrum($soap, $pcgroups->{$pcgroup}{'id'}, $pcgroups->{$pcgroup}{'mzmed'}, $pcgroups->{$pcgroup}{'into'}, $ion_mode, $instruments, $max, $unit, $tol, $cutoff) ;
                $pcgroups->{$pcgroup}{'annotation'} = $results ;
#                print Dumper $results ;
            }
            my $time_end = time ;
            my $seconds = $time_end-$time_start ;
            print "\n------  Time used in foreach mode: $seconds seconds --------\n\n" ;
        }
    }
    else {
        croak "The pcgroup object is not defined\n" ;
    }
#    print "Init pcGroups results are\n" ;
#    print Dumper $pcgroups ;

} ## End of if "defined $mzs_file"
else {
    ## no usable input file : print the usage (the help sub ends the program)
    warn "[WARN] Can't use Massbank WS service without an existing input tabular file\n" ;
    help() ;
}
231 | |
## Clean zone - use threshold on massbank entry returned score
my $omap = lib::mapper->new() ;
my $cleaned_pcgroups = $omap->filter_pcgroup_res($pcgroups, $score_threshold) ;

#print "Cleaned_pcGroups are\n" ;
#print Dumper $cleaned_pcgroups ;

## add min/max value of each mzmed in the pc_group
my $pcgroups_with_intervales = $omap->add_min_max_for_pcgroup_res($cleaned_pcgroups, $tol ) ;

#print "pcGroups_with_intervales are\n" ;
#print Dumper $pcgroups_with_intervales ;

## search in the local indexed db - - - TODO - - -

## OR search new ones

## get all unique Massbank Ids found
my $all_massbank_ids = $omap->compute_ids_from_pcgroups_res($cleaned_pcgroups) ;

## get entries on the MassBank server by ID by pieces of 10
my $omapper = lib::mapper->new() ;
my $recordList = $omapper->get_massbank_records_by_chunk ($server, $all_massbank_ids, 10) ;
#print "\n\nRecords are\n" ;
#print Dumper $recordList ;
#print Dumper $all_massbank_ids ;

## foreach record - get id and the parsed fields - index everything by record id
my %records = ();
foreach my $record (@$recordList) {
    ## parse record handles
    my $parser = lib::massbank_parser->new() ;
    my $id = $parser->getIdFromString($record) ;

    ## a record without id cannot be mapped back to a pcgroup : report it and skip
    ## it instead of storing its fields under an undefined hash key
    if ( !defined $id ) {
        carp "[WARN] No id found in a MassBank record : this record is skipped\n" ;
        next ;
    }

    $records{$id} = {
        'peaks'           => $parser->getPeaksFromString($record),
        'names'           => $parser->getChemNamesFromString($record),
        'instrument_type' => $parser->getInstrumentTypeFromString($record),
        'precursor_type'  => $parser->getPrecursorTypeFromString($record),
        'ms_type'         => $parser->getMsTypeFromString($record),
        'formula'         => $parser->getFormulaFromString($record),
        'exact_mz'        => $parser->getExactMzFromString($record),
        'inchi'           => $parser->getInchiFromString($record),
    } ;
}
#print Dumper %records ;

## Map pc_groups and records
my $well_annoted_pcGroups = $omapper->mapGroupsWithRecords($pcgroups_with_intervales, \%records) ;
280 | |
281 #print Dumper $well_annoted_pcGroups ; | |
282 | |
## Output writting :
my ( $massbank_matrix ) = ( undef ) ;

## every writer needs the annotated pcgroups plus the mz and pcgroup columns
my $can_write_outputs = ( (defined $well_annoted_pcGroups) and (defined $mzs) and (defined $pcs) ) ;

## XLS OUTPUT -- new format
if ( (defined $output_xlsx) and $can_write_outputs ) {
    my $owritter = lib::writter->new() ;
    $owritter->write_xls_skel(\$output_xlsx, $mzs, $pcs, $well_annoted_pcGroups, \%records) ;
}

## CSV OUTPUT
if ( (defined $output_tabular) and $can_write_outputs ) {
    my $omapper = lib::mapper->new() ;
    ## Same rule as for the input parsing : -lineheader > 0 means "the file has a
    ## header", anything else (0 or option not given) means "no header". The matrix
    ## is therefore always built before being merged with the input rows.
    ## NOTE(review): with more than one header line a single 'massbank' header cell
    ## is still requested - confirm the row alignment done by add_massbank_matrix_to_input_matrix.
    my $header_label = ( ( defined $line_header ) and ( $line_header > 0 ) ) ? 'massbank' : undef ;
    $massbank_matrix = $omapper->set_massbank_matrix_object($header_label, $pcs, $mzs, $well_annoted_pcGroups, \%records ) ;

    $massbank_matrix = $omapper->add_massbank_matrix_to_input_matrix($complete_rows, $massbank_matrix) ;
    my $owritter = lib::writter->new() ;
    $owritter->write_csv_skel(\$output_tabular, $massbank_matrix) ;
}

## JSON OUTPUT
## The generic json object is also the data source of the html report, so it is
## built as soon as one of the two outputs is requested; the json file itself is
## only written when -output_json is given.
my $json_scalar = undef ;
if ( ( (defined $output_json) or (defined $output_html) ) and $can_write_outputs ) {
    my $omapper = lib::mapper->new() ;
    $json_scalar = $omapper->map_pc_to_generic_json($pcs, $well_annoted_pcGroups, \%records) ;
    if ( defined $output_json ) {
        my $owritter = lib::writter->new() ;
        $owritter->write_json_skel(\$output_json, $json_scalar) ;
    }
}

## HTML OUTPUT -- TODO
if ( (defined $output_html) and (defined $json_scalar) ) {

#    print Dumper $json_scalar ;

    ## Uses N mz and theirs entries per page (see config file).
    my $entries_per_page = $CONF->{HTML_ENTRIES_PER_PAGE} ;
    if ( ( !defined $entries_per_page ) or ( $entries_per_page <= 0 ) ) {
        croak "[ERROR] HTML_ENTRIES_PER_PAGE must be a positive number in the conf file\n" ;
    }

    # how many pages you need with your input mz list?
    my $nb_pages_for_html_out = ceil( scalar(@{$mzs} ) / $entries_per_page ) ;

    ## Search condition: parameters that were not given are printed as empty strings
    my ( $mode_txt, $unit_txt, $tol_txt, $score_txt, $max_txt ) = map { defined $_ ? $_ : '' } ( $ion_mode, $unit, $tol, $score_threshold, $max ) ;
    my $search_condition = "Search params : Molecular specie = $mode_txt / delta ($unit_txt) = $tol_txt / Score threshold = $score_txt and max hit = $max_txt per pcgroup" ;

    my $oHtml = lib::mapper->new() ;
    my ($tbody_object) = $oHtml->set_html_tbody_object( $nb_pages_for_html_out, $entries_per_page ) ;
    ($tbody_object) = $oHtml->add_mz_to_tbody_object($tbody_object, $entries_per_page, $mzs, $json_scalar) ;
    ($tbody_object) = $oHtml->add_entries_to_tbody_object($tbody_object, $entries_per_page, $mzs, $json_scalar) ;

    my $oWritter = lib::writter->new() ;
    $oWritter->write_html_skel(\$output_html, $tbody_object, $nb_pages_for_html_out, $search_condition, $CONF->{'HTML_TEMPLATE'}, $CONF->{'JS_GALAXY_PATH'}, $CONF->{'CSS_GALAXY_PATH'}) ;
}
else {
    warn "[WARN] The html output file or the json is not defined\n" ;
}
335 | |
336 | |
337 | |
338 | |
339 | |
340 | |
341 | |
#====================================================================================
# Help subroutine called with -h option (or when no usable input file is given)
# number of arguments : 0
# Argument(s) : none
# Return : never returns - prints the usage on STDERR then exits with status 1
#====================================================================================
sub help {
    ## The option names listed here must stay in sync with the GetOptions table.
    print STDERR "
massbank_ws_searchspectrum.pl

# massbank_ws_searchspectrum.pl is a script to use SOAP massbank webservice and send specific queries about spectra searches.
# Input : a list of mzs, intensities, pcgroup.
# Author : Franck Giacomoni
# Email : franck.giacomoni\@clermont.inra.fr
# Version : 1.0
# Created : 20/01/2017
USAGE :
        massbank_ws_searchspectrum.pl -help OR

        massbank_ws_searchspectrum.pl
            -masses [name of input file] -col_mz -col_int -col_pcgroup -lineheader
            -mode [ion mode : Positive, Negative or Both ]
            -score_threshold [Ignore Massbank results with a score lower than the defined threshold]
            -instruments [comma separated list of strings: all or values obtained by getInstrumentTypes method]
            -max [0 is all results or int]
            -unit [unit or ppm]
            -tolerance [Tolerance of values of m/z of peaks: 0.3 unit or 50 ppm]
            -cutoff [Ignore peaks whose intensity is not larger than the value of cutoff. Default: 50]
            -server [name of the massbank server : EU or JP only]
            -output_json [ouput file for JSON]
            -output_xlsx [ouput file for XLSX]
            -output_tabular [ouput file for TABULAR]
            -output_html [ouput file for HTML]

";
    exit(1);
}
378 | |
379 ## END of script - F Giacomoni | |
380 | |
381 __END__ | |
382 | |
383 =head1 NAME | |
384 | |
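385 massbank_ws_searchspectrum.pl -- script for querying the MassBank SOAP web service (spectra searches) from a list of mzs, intensities and pcgroups | |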
386 | |
387 =head1 USAGE | |
388 | |
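389 massbank_ws_searchspectrum.pl -masses [input file] -col_mz -col_pcgroup -lineheader -mode [-col_int] [-output_json] [-output_xlsx] [-output_tabular] [-output_html] | |
390 or massbank_ws_searchspectrum.pl -help | |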
391 | |
392 =head1 SYNOPSIS | |
393 | |
394 This script manage ... | |
395 | |
396 =head1 DESCRIPTION | |
397 | |
398 This main program is a ... | |
399 | |
400 =over 4 | |
401 | |
402 =item B<function01> | |
403 | |
404 =item B<function02> | |
405 | |
406 =back | |
407 | |
408 =head1 AUTHOR | |
409 | |
410 Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt> | |
411 Yann Guitton | |
412 | |
413 =head1 LICENSE | |
414 | |
415 This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. | |
416 | |
417 =head1 VERSION | |
418 | |
419 version 1 : 05 / 01 / 2016 | |
420 | |
421 version 2 : ?? | |
422 | |
423 =cut |