Mercurial > repos > fgiacomoni > massbank_ws_searchspectrum
diff massbank_ws_searchspectrum.pl @ 0:023c380900ef draft default tip
Init repository with last massbank_ws_searchspectrum master version
author | fgiacomoni |
---|---|
date | Wed, 19 Apr 2017 11:31:58 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/massbank_ws_searchspectrum.pl Wed Apr 19 11:31:58 2017 -0400 @@ -0,0 +1,423 @@ +#!perl + +## script : XXX.pl + +## Notes : +# -> manage score sorting : Cleaned_pcGroups done but not in outputs ! + +#============================================================================= +# Included modules and versions +#============================================================================= +## Perl modules +use strict ; +use warnings ; +use Carp qw (cluck croak carp) ; + +use threads; +use threads::shared; +use Thread::Queue; + +use Data::Dumper ; +use Getopt::Long ; +use POSIX ; +use FindBin ; ## Allows you to locate the directory of original perl script + +## Specific Perl Modules (PFEM) +use lib $FindBin::Bin ; +my $binPath = $FindBin::Bin ; +use lib::csv qw( :ALL ) ; +use lib::conf qw( :ALL ) ; + +## Dedicate Perl Modules (Home made...) +use lib::massbank_api qw( :ALL ) ; +use lib::threader qw(:ALL) ; +use lib::mapper qw(:ALL) ; +use lib::writter qw(:ALL) ; +use lib::massbank_parser qw(:ALL) ; + + + +## Initialized values +my ($help, $mzs_file, $col_mz, $col_int, $col_pcgroup, $line_header ) = ( undef, undef, undef, undef, undef,undef, undef ) ; +my ($server, $ion_mode, $score_threshold, $instruments, $max, $unit, $tol, $cutoff) = ( undef, undef, undef, undef, undef, undef, undef ) ; +my ($output_json, $output_tabular, $output_xlsx, $output_html ) = ( undef, undef, undef, undef ) ; + +## Local values ONLY FOR TEST : +#my $server = 'JP' ; +#my $threading_threshold = 6 ; + +#============================================================================= +# Manage EXCEPTIONS +#============================================================================= +&GetOptions ( "help|h" => \$help, # HELP + "masses:s" => \$mzs_file, + "col_mz:i" => \$col_mz, + "col_int:i" => \$col_int, ## optionnal + "col_pcgroup:i" => \$col_pcgroup, + "lineheader:i" => \$line_header, + "mode:s" => \$ion_mode, + "score_threshold:f" => \$score_threshold, + "instruments:s" => \$instruments, # advanced -> to transform into string with comma => done ! + "max:i" => \$max, # advanced + "unit:s" => \$unit, # advanced + "tolerance:f" => \$tol, + "cutoff:i" => \$cutoff, # advanced : intensity cutoff + "server:s" => \$server, ## by default JP and # advanced + "output_json:s" => \$output_json, + "output_xlsx:s" => \$output_xlsx, + "output_tabular:s" => \$output_tabular, + "output_html:s" => \$output_html, + ) ; + +## if you put the option -help or -h function help is started +if ( defined($help) ){ &help ; } + +#============================================================================= +# MAIN SCRIPT +#============================================================================= + +## -------------- Conf file ------------------------ : +my ( $CONF ) = ( undef ) ; +foreach my $conf ( <$binPath/*.cfg> ) { + my $oConf = lib::conf::new() ; + $CONF = $oConf->as_conf($conf) ; +} + +## -------------- HTML template file ------------------------ : +foreach my $html_template ( <$binPath/*.tmpl> ) { $CONF->{'HTML_TEMPLATE'} = $html_template ; } + +## Main variables : +my ($pcs, $mzs, $into, $complete_rows, $pcgroups) = (undef, undef, undef, undef, undef) ; + +## manage csv file containing list of masses (every thing is manage in jar) +if ( ( defined $mzs_file ) and ( $mzs_file ne "" ) and ( -e $mzs_file ) ) { + + ## parse csv ids and masses + my $is_header = undef ; + my $ocsv = lib::csv->new() ; + my $csv = $ocsv->get_csv_object( "\t" ) ; + if ( ( defined $line_header ) and ( $line_header > 0 ) ) { $is_header = 'yes' ; } + $pcs = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_pcgroup, $is_header, $line_header ) ; ## retrieve pc values on csv + $mzs = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_mz, $is_header, $line_header ) ; ## retrieve mz values on csv + $into = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_int, $is_header, $line_header ) if ( defined $col_int ); ## retrieve into values on csv // optionnal in input files + $complete_rows = $ocsv->parse_csv_object($csv, \$mzs_file) ; ## parse all csv for output csv build + + ## manage input file with no into colunm / init into with a default value of 10 + if ( !defined $col_int ) { + my $nb_mzs = scalar(@{$mzs}) ; + my @intos = map {10} (0..$nb_mzs-1) ; + my $nb_intos = scalar(@intos) ; + if ($nb_intos == $nb_mzs) { $into = \@intos ; } + else { carp "A difference exists between intensity and mz values\n" } + } + + ## manage instruments string to array_ref + if (defined $instruments ) { + if ($instruments eq '') { ## in xml : can select nothing... + $instruments = ['all'] ; + } + else { + my @instruments = split(/,/, $instruments) ; + $instruments = \@instruments ; + } + } + + + ## Build pcgroups with their features : + my $omap = lib::mapper->new() ; + $pcgroups = $omap->get_pcgroups($pcs, $mzs, $into ) ; + my $pcgroup_list = $omap->get_pcgroup_list($pcs ) ; + +# print Dumper $pcgroups ; + + my $pc_num = 0 ; + $pc_num = scalar(@{$pcgroup_list}) ; + + ## manage a list of query pc_group dependant: + if ($pcgroups) { + ## - - - - - - - - - - - - - - - - - - - - - Multithreadind mode if pcgroups > 6 - - - - - - - - - - - - - - - - + if ($pc_num > $CONF->{'THREADING_THRESHOLD'}) { + print $server."\n" ; + print "\n------ ** ** ** Using multithreading mode ** ** ** --------\n\n" ; + my $time_start = time ; + + our $NBTHREADS = $CONF->{'THREADING_THRESHOLD'} ; + +# use constant THREADS => 6 ; + my $Qworks = Thread::Queue->new(); + my @threads = () ; + my @queries = () ; + my @Qresults = () ; + + foreach my $pc_group_id (keys %{$pcgroups}) { + push (@queries, $pcgroups->{$pc_group_id}) if $pcgroups->{$pc_group_id} ; + } + + for (1..$NBTHREADS) { + my $oworker = lib::threader->new ; + push @threads, threads->create(sub { $oworker->searchSpectrumWorker($Qworks, $server, $ion_mode, $instruments, $max, $unit, $tol, $cutoff) ; } ) ; + } + + $Qworks->enqueue(@queries); + $Qworks->enqueue(undef) for 1..$NBTHREADS; + push @Qresults, $_->join foreach @threads; + + + my $time_end = time ; + my $seconds = $time_end-$time_start ; + print "\n------ Time used in multithreading mode : $seconds seconds --------\n\n" ; + +# print Dumper @Qresults ; + + ## controle number of returned queries : + my $massbank_results_num = 0 ; + $massbank_results_num = scalar @Qresults ; + + if ( $massbank_results_num == $pc_num ) { + ## Map @Qresults with annotation hash : pcgroup_id in @Qresults (pcgroup2) // id in $pcgroups (pcgroup2) + foreach my $result (@Qresults) { + ## manage annotation part + if ($result->{'pcgroup_id'}) { + if ($pcgroups->{$result->{'pcgroup_id'}}) { + $pcgroups->{$result->{'pcgroup_id'}}{'annotation'} = $result ; + } + else { carp "Carefull : no mapping possible between massbank results and initial pcgroups data\n";} + } + else { carp "Carefull : no pcgroup id defined in massbank results\n"; } + + ## manage massbank_ids part + if ($result->{'res'}) { + my @tmp_res = map {$_->{'id'}} @{$result->{'res'}} ; + $pcgroups->{$result->{'pcgroup_id'}}{'massbank_ids'} = \@tmp_res ; + } + } + } + else { + croak "[ERROR] : problem between massbank results number and pcgroups number\n"; + } + } + ## - - - - - - - - - - - - - - - - - - - - - mono thread mode if pcgroups <= 6 - - - - - - - - - - - - - - - - + else { + ## connexion +# print $server."\n" ; + my $omassbank = lib::massbank_api->new() ; + my $soap = $omassbank->selectMassBank($server) ; + print "\n------ ** ** ** Using batch mode ** ** ** --------\n\n" ; + my $time_start = time ; + foreach my $pcgroup (keys %{$pcgroups}) { + ## searchSpectrum via SOAP + print "Annot pcgroup n-$pcgroup\n" ; + my $oquery = lib::massbank_api->new() ; + my ($results, $num) = $oquery->searchSpectrum($soap, $pcgroups->{$pcgroup}{'id'}, $pcgroups->{$pcgroup}{'mzmed'}, $pcgroups->{$pcgroup}{'into'}, $ion_mode, $instruments, $max, $unit, $tol, $cutoff) ; + $pcgroups->{$pcgroup}{'annotation'} = $results ; + # print Dumper $results ; + } + my $time_end = time ; + my $seconds = $time_end-$time_start ; + print "\n------ Time used in foreach mode: $seconds seconds --------\n\n" ; + } + } + else { + croak "The pcgroup object is not defined\n" ; + } +# print "Init pcGroups results are\n" ; +# print Dumper $pcgroups ; + +} ## End of elsif "defined $mzs_file" +else { + warn "[WARN] Can't use Massbank WS service without an existing input tabular file\n" ; + &help ; +} + +## Clean zone - use threshold on massbank entry returned score +my $omap = lib::mapper->new() ; +my $cleaned_pcgroups = $omap->filter_pcgroup_res($pcgroups, $score_threshold) ; + +#print "Cleaned_pcGroups are\n" ; +#print Dumper $cleaned_pcgroups ; + +## add min/max value of each mzmed in the pc_group +my $pcgroups_with_intervales = $omap->add_min_max_for_pcgroup_res($cleaned_pcgroups, $tol ) ; + +#print "pcGroups_with_intervales are\n" ; +#print Dumper $pcgroups_with_intervales ; + + +## search in the local indexed db - - - TODO - - - + +## OR search new ones + +## get all unique Massbank Ids found +my $oids = lib::mapper->new() ; +my $all_massbank_ids = $omap->compute_ids_from_pcgroups_res($cleaned_pcgroups) ; + +## get entries on the MassBank server by ID by pieces of 10 +my $omapper = lib::mapper->new() ; +my $recordList = $omapper->get_massbank_records_by_chunk ($server, $all_massbank_ids, 10) ; +#print "\n\nRecords are\n" ; +#print Dumper $recordList ; +#print Dumper $all_massbank_ids ; + +## foreach record - get id and peaks - create a object +my %records = (); +foreach (@$recordList) { + ## parse record handles + my $parser = lib::massbank_parser->new() ; + my $id = $parser->getIdFromString($_) ; + $records{$id}{'peaks'} = $parser->getPeaksFromString($_) ; + $records{$id}{'names'} = $parser->getChemNamesFromString($_) ; + $records{$id}{'instrument_type'} = $parser->getInstrumentTypeFromString($_) ; + $records{$id}{'precursor_type'} = $parser->getPrecursorTypeFromString($_) ; + $records{$id}{'ms_type'} = $parser->getMsTypeFromString($_) ; + $records{$id}{'formula'} = $parser->getFormulaFromString($_) ; + $records{$id}{'exact_mz'} = $parser->getExactMzFromString($_) ; + $records{$id}{'inchi'} = $parser->getInchiFromString($_) ; +} +#print Dumper %records ; + +## Map pc_groups and records +my $well_annoted_pcGroups = $omapper->mapGroupsWithRecords($pcgroups_with_intervales, \%records) ; + +#print Dumper $well_annoted_pcGroups ; + +## Output writting : +my ( $massbank_matrix ) = ( undef ) ; + +## XLS OUTPUT -- new format +if ( (defined $output_xlsx) and (defined $well_annoted_pcGroups) and (defined $mzs) and (defined $pcs) ) { + my $owritter = lib::writter->new() ; + $owritter->write_xls_skel(\$output_xlsx, $mzs, $pcs, $well_annoted_pcGroups, \%records) ; +} + +## CSV OUTPUT +if ( (defined $output_tabular) and (defined $well_annoted_pcGroups) and (defined $pcs) and (defined $mzs) ) { + my $omapper = lib::mapper->new() ; + if ( ( defined $line_header ) and ( $line_header == 1 ) ) { $massbank_matrix = $omapper->set_massbank_matrix_object('massbank', $pcs, $mzs, $well_annoted_pcGroups, \%records ) ; } + elsif ( ( defined $line_header ) and ( $line_header == 0 ) ) { $massbank_matrix = $omapper->set_massbank_matrix_object(undef, $pcs, $mzs, $well_annoted_pcGroups, \%records ) ; } + + $massbank_matrix = $omapper->add_massbank_matrix_to_input_matrix($complete_rows, $massbank_matrix) ; + my $owritter = lib::writter->new() ; + $owritter->write_csv_skel(\$output_tabular, $massbank_matrix) ; +} + +my $json_scalar = undef ; +## JSON OUTPUT +if ( (defined $output_json) and (defined $well_annoted_pcGroups) and (defined $mzs) and (defined $pcs) ) { + my $omapper = lib::mapper->new() ; + $json_scalar = $omapper->map_pc_to_generic_json($pcs, $well_annoted_pcGroups, \%records) ; + my $owritter = lib::writter->new() ; + $owritter->write_json_skel(\$output_json, $json_scalar) ; +} + +## HTML OUTPUT -- TODO +if ( (defined $output_html) and (defined $json_scalar) ) { + +# print Dumper $json_scalar ; + + ## Uses N mz and theirs entries per page (see config file). + # how many pages you need with your input mz list? + my $nb_pages_for_html_out = ceil( scalar(@{$mzs} ) / $CONF->{HTML_ENTRIES_PER_PAGE} ) ; + + ## Search condition: + my $search_condition = "Search params : Molecular specie = $ion_mode / delta ($unit) = $tol / Score threshold = $score_threshold and max hit = $max per pcgroup" ; + + my $oHtml = lib::mapper->new() ; + my ($tbody_object) = $oHtml->set_html_tbody_object( $nb_pages_for_html_out, $CONF->{HTML_ENTRIES_PER_PAGE} ) ; + ($tbody_object) = $oHtml->add_mz_to_tbody_object($tbody_object, $CONF->{HTML_ENTRIES_PER_PAGE}, $mzs, $json_scalar) ; + ($tbody_object) = $oHtml->add_entries_to_tbody_object($tbody_object, $CONF->{HTML_ENTRIES_PER_PAGE}, $mzs, $json_scalar) ; + + my $oWritter = lib::writter->new() ; + $oWritter->write_html_skel(\$output_html, $tbody_object, $nb_pages_for_html_out, $search_condition, $CONF->{'HTML_TEMPLATE'}, $CONF->{'JS_GALAXY_PATH'}, $CONF->{'CSS_GALAXY_PATH'}) ; +} +else { + warn "[WARN] The html output file or the json iss not defined\n" ; +} + + + + + + + +#==================================================================================== +# Help subroutine called with -h option +# number of arguments : 0 +# Argument(s) : +# Return : 1 +#==================================================================================== +sub help { + print STDERR " +massbank_ws_searchspectrum.pl + +# massbank_ws_searchspectrum.pl is a script to use SOAP massbank webservice and send specific queries about spectra searches. +# Input : a list of mzs, intensities, pcgroup. +# Author : Franck Giacomoni +# Email : franck.giacomoni\@clermont.inra.fr +# Version : 1.0 +# Created : 20/01/2017 +USAGE : + massbank_ws_searchspectrum.pl -help OR + + massbank_ws_searchspectrum.pl + -masses [name of input file] -col_id -col_mz -col_int -col_pcgroup -lineheader + -mode [ion mode : Positive, Negative or Both ] + -score_threshold [Ignore Massbank results with a score lower than the defined threshold] + -instruments [array of string: all or values obtained by getInstrumentTypes method] + -max [0 is all results or int] + -unit [unit or ppm] + -tolerance [Tolerance of values of m/z of peaks: 0.3 unit or 50 ppm] + -cutoff [Ignore peaks whose intensity is not larger than the value of cutoff. Default: 50)] + -server [name of the massbank server : EU or JP only] + -output_json [ouput file for JSON] + -output_xls [ouput file for XLS] + -output_tabular [ouput file for TABULAR] + + "; + exit(1); +} + +## END of script - F Giacomoni + +__END__ + +=head1 NAME + + XXX.pl -- script for + +=head1 USAGE + + XXX.pl -precursors -arg1 [-arg2] + or XXX.pl -help + +=head1 SYNOPSIS + +This script manage ... + +=head1 DESCRIPTION + +This main program is a ... + +=over 4 + +=item B<function01> + +=item B<function02> + +=back + +=head1 AUTHOR + +Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt> +Yann Guitton + +=head1 LICENSE + +This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. + +=head1 VERSION + +version 1 : 05 / 01 / 2016 + +version 2 : ?? + +=cut \ No newline at end of file