comparison massbank_ws_searchspectrum.pl @ 0:023c380900ef draft default tip

Init repository with last massbank_ws_searchspectrum master version
author fgiacomoni
date Wed, 19 Apr 2017 11:31:58 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:023c380900ef
1 #!perl
2
3 ## script : massbank_ws_searchspectrum.pl
4
5 ## Notes :
6 # -> manage score sorting : Cleaned_pcGroups done but not in outputs !
7
8 #=============================================================================
9 # Included modules and versions
10 #=============================================================================
11 ## Perl modules
12 use strict ;
13 use warnings ;
14 use Carp qw (cluck croak carp) ;
15
16 use threads;
17 use threads::shared;
18 use Thread::Queue;
19
20 use Data::Dumper ;
21 use Getopt::Long ;
22 use POSIX ;
23 use FindBin ; ## Allows you to locate the directory of original perl script
24
25 ## Specific Perl Modules (PFEM)
26 use lib $FindBin::Bin ;
27 my $binPath = $FindBin::Bin ;
28 use lib::csv qw( :ALL ) ;
29 use lib::conf qw( :ALL ) ;
30
31 ## Dedicate Perl Modules (Home made...)
32 use lib::massbank_api qw( :ALL ) ;
33 use lib::threader qw(:ALL) ;
34 use lib::mapper qw(:ALL) ;
35 use lib::writter qw(:ALL) ;
36 use lib::massbank_parser qw(:ALL) ;
37
38
39
## Command-line option holders : all of them stay undefined until GetOptions
## finds the matching option on the command line.
my ( $help, $mzs_file, $col_mz, $col_int, $col_pcgroup, $line_header ) ;
my ( $server, $ion_mode, $score_threshold, $instruments, $max, $unit, $tol, $cutoff ) ;
my ( $output_json, $output_tabular, $output_xlsx, $output_html ) ;

## Local values ONLY FOR TEST :
#my $server = 'JP' ;
#my $threading_threshold = 6 ;

#=============================================================================
#                                Manage EXCEPTIONS
#=============================================================================
## Every option value is declared optional (":" specifier) ; the type letter
## only tells Getopt::Long how to read it (s = string, i = integer, f = float).
GetOptions(
    'help|h'            => \$help,             # HELP
    'masses:s'          => \$mzs_file,         # input tabular file
    'col_mz:i'          => \$col_mz,
    'col_int:i'         => \$col_int,          # optional
    'col_pcgroup:i'     => \$col_pcgroup,
    'lineheader:i'      => \$line_header,
    'mode:s'            => \$ion_mode,
    'score_threshold:f' => \$score_threshold,
    'instruments:s'     => \$instruments,      # advanced : comma separated string, split later on
    'max:i'             => \$max,              # advanced
    'unit:s'            => \$unit,             # advanced
    'tolerance:f'       => \$tol,
    'cutoff:i'          => \$cutoff,           # advanced : intensity cutoff
    'server:s'          => \$server,           # advanced : by default JP
    'output_json:s'     => \$output_json,
    'output_xlsx:s'     => \$output_xlsx,
    'output_tabular:s'  => \$output_tabular,
    'output_html:s'     => \$output_html,
) ;

## -help / -h : print the usage and leave (help() never returns)
help() if ( defined $help ) ;
74
75 #=============================================================================
76 # MAIN SCRIPT
77 #=============================================================================
78
## -------------- Conf file ------------------------ :
## The configuration is read from the *.cfg file(s) located next to this script ;
## when several files match, the last one read wins.
## File::Glob::bsd_glob is used instead of the <...> operator because the latter
## splits its pattern on whitespace and would break on an install path containing spaces.
require File::Glob ;
my ( $CONF ) = ( undef ) ;
foreach my $conf ( File::Glob::bsd_glob("$binPath/*.cfg") ) {
    my $oConf = lib::conf->new() ; ## method call, like every other constructor of this script
    $CONF = $oConf->as_conf($conf) ;
}

## Without configuration the threading threshold and the HTML settings are unknown :
## stop here with a clear message instead of failing later on an unrelated error.
croak "[ERROR] No usable configuration file (*.cfg) could be loaded from $binPath\n" unless ( defined $CONF ) ;

## -------------- HTML template file ------------------------ :
## Same rule as above : the last *.tmpl file found is recorded in the configuration.
foreach my $html_template ( File::Glob::bsd_glob("$binPath/*.tmpl") ) {
    $CONF->{'HTML_TEMPLATE'} = $html_template ;
}
88
## Main variables :
##   $pcs / $mzs / $into : values of the pcgroup, mz and intensity columns of the input file
##   $complete_rows      : all parsed rows of the input file (reused by the tabular output)
##   $pcgroups           : pcgroups built by lib::mapper, completed below with the MassBank answers
my ($pcs, $mzs, $into, $complete_rows, $pcgroups) = (undef, undef, undef, undef, undef) ;

## The search only runs when the given input file exists ; otherwise the usage
## is printed and the script stops (see the last else of this section).
if ( ( defined $mzs_file ) and ( $mzs_file ne "" ) and ( -e $mzs_file ) ) {

    ## parse csv ids and masses (tab separated)
    my $ocsv = lib::csv->new() ;
    my $csv  = $ocsv->get_csv_object( "\t" ) ;
    ## a strictly positive -lineheader value flags the presence of header line(s)
    my $is_header = ( ( defined $line_header ) and ( $line_header > 0 ) ) ? 'yes' : undef ;

    $pcs = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_pcgroup, $is_header, $line_header ) ; ## retrieve pc values on csv
    $mzs = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_mz, $is_header, $line_header ) ; ## retrieve mz values on csv

    if ( defined $col_int ) {
        ## retrieve into values on csv // optional in input files
        $into = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_int, $is_header, $line_header ) ;
    }
    else {
        ## input file without intensity column : every mz gets a default intensity of 10
        $into = [ (10) x scalar( @{$mzs} ) ] ;
    }

    $complete_rows = $ocsv->parse_csv_object($csv, \$mzs_file) ; ## parse all csv for output csv build

    ## manage instruments string to array_ref
    if ( defined $instruments ) {
        if ( $instruments eq '' ) { ## in xml : can select nothing...
            $instruments = ['all'] ;
        }
        else {
            my @instruments = split(/,/, $instruments) ;
            $instruments = \@instruments ;
        }
    }

    ## Build pcgroups with their features :
    my $omap = lib::mapper->new() ;
    $pcgroups = $omap->get_pcgroups($pcs, $mzs, $into ) ;
    my $pcgroup_list = $omap->get_pcgroup_list($pcs ) ;

#    print Dumper $pcgroups ;

    my $pc_num = scalar(@{$pcgroup_list}) ;

    ## manage a list of query pc_group dependant:
    if ($pcgroups) {
        ## - - - - - - - Multithreading mode when there are more pcgroups than CONF{THREADING_THRESHOLD} - - - - - - -
        if ($pc_num > $CONF->{'THREADING_THRESHOLD'}) {
            print $server."\n" ;
            print "\n------  ** ** ** Using multithreading mode ** ** **  --------\n\n" ;
            my $time_start = time ;

            ## the threshold is also used as the number of worker threads
            our $NBTHREADS = $CONF->{'THREADING_THRESHOLD'} ;

            my $Qworks   = Thread::Queue->new();
            my @threads  = () ;
            my @queries  = () ;
            my @Qresults = () ;

            ## one query per (non empty) pcgroup
            foreach my $pc_group_id (keys %{$pcgroups}) {
                push (@queries, $pcgroups->{$pc_group_id}) if $pcgroups->{$pc_group_id} ;
            }

            ## workers are created in list context, so join() below returns everything a worker returned
            for (1..$NBTHREADS) {
                my $oworker = lib::threader->new ;
                push @threads, threads->create(sub { $oworker->searchSpectrumWorker($Qworks, $server, $ion_mode, $instruments, $max, $unit, $tol, $cutoff) ; } ) ;
            }

            $Qworks->enqueue(@queries);
            ## one undef per worker -- presumably the end-of-queue marker awaited by searchSpectrumWorker (to be confirmed in lib::threader)
            $Qworks->enqueue(undef) for 1..$NBTHREADS;
            push @Qresults, $_->join foreach @threads;

            my $time_end = time ;
            my $seconds = $time_end-$time_start ;
            print "\n------  Time used in multithreading mode : $seconds seconds --------\n\n" ;

#            print Dumper @Qresults ;

            ## control the number of returned queries : one result is expected per pcgroup
            my $massbank_results_num = scalar @Qresults ;

            if ( $massbank_results_num != $pc_num ) {
                croak "[ERROR] : problem between massbank results number and pcgroups number\n";
            }

            ## Map @Qresults with annotation hash : pcgroup_id in @Qresults (pcgroup2) // id in $pcgroups (pcgroup2)
            foreach my $result (@Qresults) {
                my $pcgroup_id = $result->{'pcgroup_id'} ;

                ## A result that cannot be linked to an existing pcgroup is reported and skipped :
                ## writing its massbank ids anyway would create a bogus entry in $pcgroups.
                if ( !$pcgroup_id ) {
                    carp "Carefull : no pcgroup id defined in massbank results\n";
                    next ;
                }
                if ( !$pcgroups->{$pcgroup_id} ) {
                    carp "Carefull : no mapping possible between massbank results and initial pcgroups data\n";
                    next ;
                }

                ## manage annotation part
                $pcgroups->{$pcgroup_id}{'annotation'} = $result ;

                ## manage massbank_ids part
                if ($result->{'res'}) {
                    my @tmp_res = map {$_->{'id'}} @{$result->{'res'}} ;
                    $pcgroups->{$pcgroup_id}{'massbank_ids'} = \@tmp_res ;
                }
            }
        }
        ## - - - - - - - mono thread mode when the number of pcgroups does not exceed the threshold - - - - - - -
        else {
            ## connexion
#            print $server."\n" ;
            my $omassbank = lib::massbank_api->new() ;
            my $soap = $omassbank->selectMassBank($server) ;
            print "\n------  ** ** ** Using batch mode ** ** **  --------\n\n" ;
            my $time_start = time ;
            foreach my $pcgroup (keys %{$pcgroups}) {
                ## searchSpectrum via SOAP
                print "Annot pcgroup n-$pcgroup\n" ;
                my $oquery = lib::massbank_api->new() ;
                ## list assignment on purpose : only the first returned value is kept
                my ($results) = $oquery->searchSpectrum($soap, $pcgroups->{$pcgroup}{'id'}, $pcgroups->{$pcgroup}{'mzmed'}, $pcgroups->{$pcgroup}{'into'}, $ion_mode, $instruments, $max, $unit, $tol, $cutoff) ;
                $pcgroups->{$pcgroup}{'annotation'} = $results ;
#                print Dumper $results ;
            }
            my $time_end = time ;
            my $seconds = $time_end-$time_start ;
            print "\n------  Time used in foreach mode: $seconds seconds --------\n\n" ;
        }
    }
    else {
        croak "The pcgroup object is not defined\n" ;
    }
#    print "Init pcGroups results are\n" ;
#    print Dumper $pcgroups ;

} ## End of if "defined $mzs_file"
else {
    warn "[WARN] Can't use Massbank WS service without an existing input tabular file\n" ;
    help() ; ## prints the usage and exits
}
231
## Clean zone - use threshold on massbank entry returned score
my $omap = lib::mapper->new() ;
my $cleaned_pcgroups = $omap->filter_pcgroup_res($pcgroups, $score_threshold) ;

#print "Cleaned_pcGroups are\n" ;
#print Dumper $cleaned_pcgroups ;

## add min/max value of each mzmed in the pc_group (computed from the -tolerance value)
my $pcgroups_with_intervales = $omap->add_min_max_for_pcgroup_res($cleaned_pcgroups, $tol ) ;

#print "pcGroups_with_intervales are\n" ;
#print Dumper $pcgroups_with_intervales ;


## search in the local indexed db - - - TODO - - -

## OR search new ones

## get all unique Massbank Ids found
my $all_massbank_ids = $omap->compute_ids_from_pcgroups_res($cleaned_pcgroups) ;

## get entries on the MassBank server by ID by pieces of 10
my $omapper = lib::mapper->new() ;
my $recordList = $omapper->get_massbank_records_by_chunk ($server, $all_massbank_ids, 10) ;
#print "\n\nRecords are\n" ;
#print Dumper $recordList ;
#print Dumper $all_massbank_ids ;

## foreach record - get id and peaks - create a object
## %records : massbank id => { peaks, names, instrument_type, precursor_type, ms_type, formula, exact_mz, inchi }
my %records = ();
foreach my $record (@$recordList) {
    ## parse record handles
    my $parser = lib::massbank_parser->new() ;
    my $id = $parser->getIdFromString($record) ;

    ## a record without id can't be linked to any pcgroup : report it and go on
    if ( !defined $id ) {
        carp "Carefull : no id found in a massbank record, the record is skipped\n" ;
        next ;
    }

    $records{$id}{'peaks'}           = $parser->getPeaksFromString($record) ;
    $records{$id}{'names'}           = $parser->getChemNamesFromString($record) ;
    $records{$id}{'instrument_type'} = $parser->getInstrumentTypeFromString($record) ;
    $records{$id}{'precursor_type'}  = $parser->getPrecursorTypeFromString($record) ;
    $records{$id}{'ms_type'}         = $parser->getMsTypeFromString($record) ;
    $records{$id}{'formula'}         = $parser->getFormulaFromString($record) ;
    $records{$id}{'exact_mz'}        = $parser->getExactMzFromString($record) ;
    $records{$id}{'inchi'}           = $parser->getInchiFromString($record) ;
}
#print Dumper %records ;

## Map pc_groups and records
my $well_annoted_pcGroups = $omapper->mapGroupsWithRecords($pcgroups_with_intervales, \%records) ;
280
281 #print Dumper $well_annoted_pcGroups ;
282
## Output writting :
my ( $massbank_matrix ) = ( undef ) ;

## Every output needs the annotated pcgroups and the mz / pcgroup columns of the input file
my $has_results = ( (defined $well_annoted_pcGroups) and (defined $mzs) and (defined $pcs) ) ;

## XLS OUTPUT -- new format
if ( (defined $output_xlsx) and $has_results ) {
    my $owritter = lib::writter->new() ;
    $owritter->write_xls_skel(\$output_xlsx, $mzs, $pcs, $well_annoted_pcGroups, \%records) ;
}

## CSV OUTPUT
if ( (defined $output_tabular) and $has_results ) {
    my $omapper = lib::mapper->new() ;
    ## Same rule as the one used to read the input file : a strictly positive -lineheader
    ## means "with header", anything else (0 or option not given) means "no header".
    ## The matrix is therefore always built before being merged with the input rows.
    my $header_label = ( ( defined $line_header ) and ( $line_header > 0 ) ) ? 'massbank' : undef ;
    $massbank_matrix = $omapper->set_massbank_matrix_object($header_label, $pcs, $mzs, $well_annoted_pcGroups, \%records ) ;

    $massbank_matrix = $omapper->add_massbank_matrix_to_input_matrix($complete_rows, $massbank_matrix) ;
    my $owritter = lib::writter->new() ;
    $owritter->write_csv_skel(\$output_tabular, $massbank_matrix) ;
}

my $json_scalar = undef ;
## JSON OUTPUT
## The json object is also the data source of the html output : it is built as soon
## as one of the two outputs is asked, but only written when -output_json is given.
if ( ( (defined $output_json) or (defined $output_html) ) and $has_results ) {
    my $omapper = lib::mapper->new() ;
    $json_scalar = $omapper->map_pc_to_generic_json($pcs, $well_annoted_pcGroups, \%records) ;
    if ( defined $output_json ) {
        my $owritter = lib::writter->new() ;
        $owritter->write_json_skel(\$output_json, $json_scalar) ;
    }
}

## HTML OUTPUT -- TODO
if ( defined $output_html ) {
    if ( defined $json_scalar ) {

#        print Dumper $json_scalar ;

        ## Uses N mz and theirs entries per page (see config file).
        my $entries_per_page = $CONF->{HTML_ENTRIES_PER_PAGE} ;
        ## the value is used as a divisor : refuse a missing or null one with an explicit message
        if ( ( !$entries_per_page ) or ( $entries_per_page <= 0 ) ) {
            croak "[ERROR] HTML_ENTRIES_PER_PAGE must be a positive number in the configuration file\n" ;
        }

        # how many pages you need with your input mz list?
        my $nb_pages_for_html_out = ceil( scalar(@{$mzs} ) / $entries_per_page ) ;

        ## Search condition:
        my $search_condition = "Search params : Molecular specie = $ion_mode / delta ($unit) = $tol / Score threshold = $score_threshold and max hit = $max per pcgroup" ;

        my $oHtml = lib::mapper->new() ;
        my ($tbody_object) = $oHtml->set_html_tbody_object( $nb_pages_for_html_out, $entries_per_page ) ;
        ($tbody_object) = $oHtml->add_mz_to_tbody_object($tbody_object, $entries_per_page, $mzs, $json_scalar) ;
        ($tbody_object) = $oHtml->add_entries_to_tbody_object($tbody_object, $entries_per_page, $mzs, $json_scalar) ;

        my $oWritter = lib::writter->new() ;
        $oWritter->write_html_skel(\$output_html, $tbody_object, $nb_pages_for_html_out, $search_condition, $CONF->{'HTML_TEMPLATE'}, $CONF->{'JS_GALAXY_PATH'}, $CONF->{'CSS_GALAXY_PATH'}) ;
    }
    else {
        ## only reported when the html output was really asked for
        warn "[WARN] The html output file is asked but the json is not defined\n" ;
    }
}
335
336
337
338
339
340
341
#====================================================================================
# Help subroutine called with -h option (and when no usable input file is given)
# number of arguments : 0
# Argument(s)         : none
# Return              : never returns, the script exits with status 1
# Note                : the option list below mirrors the GetOptions declaration
#====================================================================================
sub help {
    print STDERR "
massbank_ws_searchspectrum.pl

# massbank_ws_searchspectrum.pl is a script to use SOAP massbank webservice and send specific queries about spectra searches.
# Input : a list of mzs, intensities, pcgroup.
# Author : Franck Giacomoni
# Email : franck.giacomoni\@clermont.inra.fr
# Version : 1.0
# Created : 20/01/2017
USAGE :
        massbank_ws_searchspectrum.pl -help OR

        massbank_ws_searchspectrum.pl
            -masses [name of input file]
            -col_mz [column number of the mz values]
            -col_int [column number of the intensities - optional]
            -col_pcgroup [column number of the pcgroups]
            -lineheader [number of header lines in the input file]
            -mode [ion mode : Positive, Negative or Both ]
            -score_threshold [Ignore Massbank results with a score lower than the defined threshold]
            -instruments [array of string: all or values obtained by getInstrumentTypes method]
            -max [0 is all results or int]
            -unit [unit or ppm]
            -tolerance [Tolerance of values of m/z of peaks: 0.3 unit or 50 ppm]
            -cutoff [Ignore peaks whose intensity is not larger than the value of cutoff. Default: 50)]
            -server [name of the massbank server : EU or JP only]
            -output_json [output file for JSON]
            -output_xlsx [output file for XLSX]
            -output_tabular [output file for TABULAR]
            -output_html [output file for HTML]

";
    exit(1);
}
378
379 ## END of script - F Giacomoni
380
381 __END__
382
383 =head1 NAME
384
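385 massbank_ws_searchspectrum.pl -- script for querying the MassBank SOAP web service with spectra (pcgroups of m/z values and intensities)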
386
387 =head1 USAGE
388
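389 massbank_ws_searchspectrum.pl -masses <file> -col_mz <int> -col_pcgroup <int> [-col_int <int>] [other options, see -help]
390 or massbank_ws_searchspectrum.pl -help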
391
392 =head1 SYNOPSIS
393
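394 This script manages spectra searches on the MassBank SOAP web service from a tabular file of m/z values, intensities and pcgroups.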
395
396 =head1 DESCRIPTION
397
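398 This main program reads the input tabular file, builds the pcgroups, queries MassBank for each pcgroup (with worker threads when the number of pcgroups exceeds the configured threshold), filters the hits on their score, downloads the matching records and writes the JSON, XLSX, tabular and HTML outputs.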
399
400 =over 4
401
402 =item B<function01>
403
404 =item B<function02>
405
406 =back
407
408 =head1 AUTHOR
409
410 Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt>
411 Yann Guitton
412
413 =head1 LICENSE
414
415 This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
416
417 =head1 VERSION
418
419 version 1 : 05 / 01 / 2016
420
421 version 2 : ??
422
423 =cut