Mercurial > repos > fgiacomoni > hmdb_ms_search
view wsdl_hmdb.pl @ 28:6d796927335d draft
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
author | fgiacomoni |
---|---|
date | Tue, 04 Jun 2024 08:54:22 +0000 |
parents | 8f7546d0b925 |
children |
line wrap: on
line source
#!perl

## script  : wsdl_hmdb.pl
##
## Command-line client that submits one or several m/z values to the HMDB
## "ms search" tool and writes the candidates returned by HMDB as HTML,
## tabular and/or xls-like outputs.
##
## Overall flow (top to bottom):
##   1. parse the command-line options ;
##   2. load the *.cfg configuration and the *.tmpl HTML template found
##      next to this script ;
##   3. read the masses either from -mass or from the -masses tabular file ;
##   4. query HMDB cluster by cluster and optionally enrich the results with
##      metabocard features ;
##   5. write the requested outputs.
#=============================================================================
#                              Included modules and versions
#=============================================================================
## Perl modules
use strict ;
use warnings ;
use Carp qw (cluck croak carp) ;

use Data::Dumper ;
use Getopt::Long ;
use POSIX ;
use FindBin ; ## Locates the directory of the running perl script

## Specific Modules (Home made...)
use lib $FindBin::Bin ;
my $binPath = $FindBin::Bin ;
use lib::hmdb qw( :ALL ) ;
## PFEM Perl Modules
use lib::conf qw( :ALL ) ;
use lib::csv qw( :ALL ) ;

## Initialized values
my ( $help ) = undef ;
my ( $mass ) = undef ;
my ( $masses_file, $col_mass, $header_choice, $nbline_header ) = ( undef, undef, undef, undef ) ;
my $max_query = undef ;
my ( $adductType, $delta, $molecular_species, $out_tab, $out_html, $out_xls ) = ( undef, undef, undef, undef, undef, undef ) ;
my $advancedFeatures = 0 ;
my $VERBOSE = ( 3 ) ;

#=============================================================================
#                                Manage EXCEPTIONS
#=============================================================================
GetOptions (
    "h"                  => \$help,              # HELP
    "mass:s"             => \$mass,              ## option : one mass (or a space separated list of masses)
    "masses:s"           => \$masses_file,       ## option : path to the input
    "header_choice:s"    => \$header_choice,     ## Presence or not of header in tabular file
    "nblineheader:i"     => \$nbline_header,     ## number of header lines present in file
    "colfactor:i"        => \$col_mass,          ## Column id used to retrieve the mz list in tabular file
    "adduct_type:s"      => \$adductType,        ## A list of selected adducts
    "delta:f"            => \$delta,
    "mode:s"             => \$molecular_species, ## Molecular species (positive/negative/neutral)
    "maxquery:i"         => \$max_query,         ## Maximum query return (default is 20 entries by query // min 1 & max 50 )
    "output_tabular:s"   => \$out_tab,           ## option : path to the output (tabular : input+results )
    "output_html|v:s"    => \$out_html,          ## option : path to the results view (output2)
    "output_xlsx:s"      => \$out_xls,           ## option : path to the xls-like format output
    "advancedFeatures:i" => \$advancedFeatures,  ## option : set to 1 to get advanced options or 0 to get first level only.
    "verbose:i"          => \$VERBOSE,           ## VERBOSE Of the tool
) ;

#=============================================================================
#                                EXCEPTIONS
#=============================================================================
help() if $help ;

#=============================================================================
#                                MAIN SCRIPT
#=============================================================================

print "* * * The hmdb client program is launched: * * *\n" if ($VERBOSE>1) ;

## -------------- Conf file ------------------------ :
## Every *.cfg found next to the script is loaded ; the last one wins.
my ( $CONF ) = ( undef ) ;
foreach my $conf ( <$binPath/*.cfg> ) {
    my $oConf = lib::conf::new() ;
    $CONF = $oConf->as_conf($conf) ;
}
## Fail fast : without configuration the script would only die much later
## (division by an undefined HTML_ENTRIES_PER_PAGE), after querying HMDB.
if ( !defined $CONF ) {
    croak "[error] No configuration file (*.cfg) could be loaded from $binPath\n" ;
}

## -------------- HTML template file ------------------------ :
foreach my $html_template ( <$binPath/*.tmpl> ) {
    $CONF->{'HTML_TEMPLATE'} = $html_template ;
}

if (!defined $max_query) {
    $max_query = $CONF->{'HMDB_MAX_QUERY'} ;
}

## --------------- Global parameters ---------------- :
my ( $ids, $masses, $results ) = ( undef, undef, undef ) ;
my ( $complete_rows, $nb_pages_for_html_out ) = ( undef, 1 ) ;
my $metabocard_features = undef ;
## Remembers which input was really used so that the tabular output follows
## the same precedence as the input parsing (-mass first, then -masses).
my $input_is_file = 0 ;

## -mode and -delta are only validated further down ; use empty strings here
## so that a missing option does not raise "uninitialized value" warnings.
my $species_label = ( defined $molecular_species ) ? $molecular_species : '' ;
my $delta_label   = ( defined $delta )             ? $delta             : '' ;
my $search_condition = "Search params : Molecular specie = $species_label / delta (mass-to-charge ratio) = $delta_label" ;
print "\twith $search_condition\n" if ($VERBOSE>1) ;

## $adductType
if (defined $adductType ) {
    print "\twith Adducts: $adductType\n" if ($VERBOSE>1) ;
}

## --------------- retrieve input data -------------- :

## manage only one mass (or a space separated list given on the command line)
if ( ( defined $mass ) and ( $mass ne '' ) ) {
    my @masses = split(" ", $mass) ;
    $masses = \@masses ;
    $ids = _build_mz_ids( scalar @masses ) ;
} ## END IF
## manage csv file containing list of masses
elsif ( ( defined $masses_file ) and ( $masses_file ne "" ) and ( -e $masses_file ) ) {

    $input_is_file = 1 ;

    ## parse all csv for later : output csv build
    my $ocsv_input  = lib::csv->new() ;
    my $complete_csv = $ocsv_input->get_csv_object( "\t" ) ;
    $complete_rows = $ocsv_input->parse_csv_object($complete_csv, \$masses_file) ;

    ## parse masses and set ids
    my $ocsv = lib::csv->new() ;
    my $csv = $ocsv->get_csv_object( "\t" ) ;
    if ( ( !defined $nbline_header ) or ( $nbline_header < 0 ) ) {
        $nbline_header = 0 ;
    }
    $masses = $ocsv->get_value_from_csv_multi_header( $csv, $masses_file, $col_mass, $header_choice, $nbline_header ) ; ## retrieve mz values on csv
    if ( !defined $masses ) {
        croak "[error] No mass could be read from the input file $masses_file\n" ;
    }
    ## one id per mass (the previous loop bound created one id too many)
    $ids = _build_mz_ids( scalar @{$masses} ) ;
}
else {
    warn "[warning] Input data are missing : none mass or file of masses\n" ;
    help() ;
}
#print Dumper $masses ;

## ---------------- launch queries -------------------- :

if ( ( defined $delta ) and ( $delta > 0 ) and ( defined $molecular_species ) and ( $molecular_species ne '' ) ) {

    ## prepare masses list and execute query
    my $oHmdb = lib::hmdb::new() ;
    my $hmdb_pages = undef ;
    my $status = undef ;
    my ($hmdb_ids, $idsNumber) = (undef, 0) ;

    $results = [] ; # prepare arrays ref
    my $submasses = $oHmdb->extract_sub_mz_lists($masses, $CONF->{HMDB_LIMITS} ) ;

    print "\tand ".scalar(@$masses)." masses are submitted as ".scalar(@$submasses)." queries to HMDB \n\n" if ($VERBOSE>1) ;

    ## get the hmdb server status by a test query - continuous queries or kill script.
    $status = $oHmdb->testMatchesFromHmdb5WithUA() ;
    $oHmdb->check_state_from_hmdb_ua($status) ; ## can kill the script execution

    my $cluster = 1 ;

    foreach my $mzs ( @{$submasses} ) {

#        print Dumper $mzs ;
        my $result = undef ;
        my $cleanedResult = undef ;
        my ( $hmdb_masses, $nb_masses_to_submit ) = $oHmdb->prepare_multi_masses_query($mzs) ;
        my ( $hmdb_adducts, $nb_selected_adducts ) = $oHmdb->prepareAdductListFormat($adductType) ;

        print "\n\tSubmission of m/z cluster ".sprintf('%04s', $cluster) if ($VERBOSE>1) ;

        ($hmdb_pages, $status) = $oHmdb->getMatchesFromHmdb5WithUA($hmdb_masses, $delta, $molecular_species, $hmdb_adducts) ;
        print "...HMDB reply results with status: $status\n" if ($VERBOSE>1) ;

#        print Dumper $hmdb_pages ;
        sleep(1) ; ## pause between two submissions to HMDB

        ## The parser is called with a hard-coded limit of 1000 entries (to be
        ## refactored) : the real cutoff on $max_query is applied by the next
        ## method, once each HMDB entry URL has been checked.
        ($result) = $oHmdb->parseHmdb5CSVResults($hmdb_pages, $mzs, 1000) ; ## hash format result

#        print Dumper $result ;

        ## clean/max result by testing each HMDB_ID page in HMDB
        ($cleanedResult) = $oHmdb->checkHmdbUrlEntries($CONF->{'HMDB_METABOCARD_URL'}, $result, $max_query ) ;

        ## append in place instead of rebuilding the whole list at each cluster
        if ( defined $cleanedResult ) {
            push ( @{$results}, @{$cleanedResult} ) ;
        }
#        sleep(1) ;
        $cluster ++ ;
    }

    ## Add more information of each found metabolite (1 for extra or 0 by default)
    if ($advancedFeatures > 0) {
        ## foreach metabolite get its own metabocard
        ($hmdb_ids, $idsNumber) = $oHmdb->get_unik_ids_from_results($results) ;
        print "\tComplementary annotation: asking for $idsNumber metabocards\n" if ($VERBOSE>1) ;
#        $hmdb_ids->{'HMDB03125'} = 1 ,
        $metabocard_features = $oHmdb->get_hmdb_metabocard_from_id($hmdb_ids, $CONF->{'HMDB_METABOCARD_URL'}) ; ## Try to multithread the querying

#        print Dumper $results ;
#        print Dumper $hmdb_ids ;
#        print Dumper $metabocard_features ;

        ## Map metabocards with results (add supplementary data)
        if ( ( defined $results ) and ( defined $metabocard_features ) ) {
            $results = $oHmdb->map_suppl_data_on_hmdb_results($results, $metabocard_features) ;
        }
    }
    else {
        print "\tNo complementary annotation asked\n" if ($VERBOSE>1) ;
        ## Fill with msg not asked advanced annotation
        $results = $oHmdb->map_suppl_data_on_hmdb_results($results, undef) ;
    }

    ## Uses N mz and theirs entries per page (see config file).
    # how many pages you need with your input mz list?
    $nb_pages_for_html_out = ceil( scalar(@{$masses} ) / $CONF->{HTML_ENTRIES_PER_PAGE} ) ;

#    print Dumper $results ;
}
else {
    croak "Can't work with HMDB : missing paramaters (list of masses, delta or molecular species)\n" ;
} ## end ELSE

## -------------- Produce HTML/CSV output ------------------ :
print "\n\tProducing html and tabular outputs\n" if ($VERBOSE>1) ;

if ( ( defined $out_html ) and ( defined $results ) ) {
    my $oHtml = lib::hmdb::new() ;
    my ($tbody_object) = $oHtml->set_html_tbody_object( $nb_pages_for_html_out, $CONF->{HTML_ENTRIES_PER_PAGE} ) ;
    ($tbody_object) = $oHtml->add_mz_to_tbody_object($tbody_object, $CONF->{HTML_ENTRIES_PER_PAGE}, $masses, $ids) ;
    ($tbody_object) = $oHtml->add_entries_to_tbody_object($tbody_object, $CONF->{HTML_ENTRIES_PER_PAGE}, $masses, $results) ;
#    print Dumper $tbody_object ;
    my $output_html = $oHtml->write_html_skel(\$out_html, $tbody_object, $nb_pages_for_html_out, $search_condition, $CONF->{'HTML_TEMPLATE'}, $CONF->{'JS_GALAXY_PATH'}, $CONF->{'CSS_GALAXY_PATH'}) ;
} ## END IF
else {
    warn "Can't create a HTML output for HMDB : no result found or your output file is not defined\n" ;
}

if ( ( defined $out_tab ) and ( defined $results ) ) {
    # produce a csv based on METLIN format
    my $ocsv = lib::hmdb::new() ;
    if ( $input_is_file ) {
        ## results are merged with the rows of the input file
        my $lm_matrix = undef ;
        my $header_mode = ( defined $header_choice ) ? $header_choice : '' ;
        if ( ( defined $nbline_header ) and ( $header_mode eq 'yes' ) ) {
#            $lm_matrix = $ocsv->set_lm_matrix_object('hmdb', $masses, $results ) ;
            $lm_matrix = $ocsv->set_hmdb_matrix_object_with_ids('hmdb', $masses, $results ) ;
            $lm_matrix = $ocsv->add_lm_matrix_to_input_matrix($complete_rows, $lm_matrix, $nbline_header-1) ;
        }
        elsif ( $header_mode eq 'no' ) {
#            $lm_matrix = $ocsv->set_lm_matrix_object(undef, $masses, $results ) ;
            $lm_matrix = $ocsv->set_hmdb_matrix_object_with_ids(undef, $masses, $results ) ;
            $lm_matrix = $ocsv->add_lm_matrix_to_input_matrix($complete_rows, $lm_matrix, 0) ;
        }
        else {
            ## neither 'yes' nor 'no' : no matrix can be built
            warn "[warning] -header_choice should be 'yes' or 'no' : the tabular output has no result matrix\n" ;
        }
        $ocsv->write_csv_skel(\$out_tab, $lm_matrix) ;
    }
    else {
        ## masses were given with -mass
        $ocsv->write_csv_one_mass($masses, $ids, $results, $out_tab) ;
    }
} ## END IF
else {
    warn "Can't create a tabular output for HMDB : no result found or your output file is not defined\n" ;
}

## Write XLS like format
if ( ( defined $out_xls ) and ( defined $results ) ) {
    my $ocsv = lib::hmdb::new() ;
    $ocsv->write_csv_one_mass($masses, $ids, $results, $out_xls) ;
}

print "\n* * * The hmdb client program ended * * *\n" if ($VERBOSE>1) ;

#====================================================================================
# Build the list of mz identifiers (mz_00001, mz_00002, ...)
# number of arguments : 1
# Argument(s)        : number of masses to label
# Return             : ref of an array holding one id per mass, or undef when
#                      there is no mass to label
#====================================================================================
sub _build_mz_ids {
    my ( $nb_masses ) = @_ ;
    my $mz_ids = undef ;
    foreach my $rank ( 1 .. $nb_masses ) {
        push ( @{$mz_ids}, "mz_0".sprintf("%04s", $rank ) ) ;
    }
    return $mz_ids ;
}

#====================================================================================
# Help subroutine called with -h option (or when the input data are missing)
# number of arguments : 0
# Argument(s)        :
# Return             : never returns - prints the usage on STDERR and exits
#                      with status 1
#====================================================================================
sub help {
    print STDERR "
help of wsdl_hmdb

# wsdl_hmdb is a script to query HMDB website using mz and return a list of candidates sent by HMDB based on the ms search tool.
# Input : accurate mz or list of accurate masses
# Author : Franck Giacomoni and Marion Landi
# Email : fgiacomoni\@inra.fr
# Version : 1.6
# Created : 08/07/2012
# Updated : 22/06/2022 - 1.7.2
USAGE :
        wsdl_hmdb.pl -mass [one mass or a string list of exact masses] -delta [mz delta] -mode [molecular species: positive|negative|neutral] -output_tabular [output tabular file] -output_html [output html file]
        or
        wsdl_hmdb.pl -masses [an input file of mzs] -colfactor [col of mz] -header_choice [yes|no] -nblineheader [nb of lines containing file header : 0-n]
                     -delta [mz delta] -mode [molecular species: positive|negative|neutral] -output_tabular [output tabular file] -output_html [output html file] -advancedFeatures [Default 0, set to 1 for advanced features as logp, inchi, ...]
        or
        wsdl_hmdb.pl -h for help
" ;
    exit(1) ;
}

## END of script - F Giacomoni

__END__

=head1 NAME

 wsdl_hmdb.pl -- script to query HMDB website using mz and return a list of candidates sent by HMDB based on the ms search tool.

=head1 USAGE

 wsdl_hmdb.pl -mass [one mass or a string list of exact masses] -delta [mz delta] -mode [molecular species: positive|negative|neutral] -output_tabular [output tabular file] -output_html [output html file]
 or
 wsdl_hmdb.pl -masses [an input file of mzs] -colfactor [col of mz] -header_choice [yes|no] -nblineheader [nb of lines containing file header : 0-n]
              -delta [mz delta] -mode [molecular species: positive|negative|neutral] -output_tabular [output tabular file] -output_html [output html file]

=head1 SYNOPSIS

This script manages batch queries on HMDB server (v5.0).

=head1 DESCRIPTION

This main program is a script to query HMDB website using mz and return a list of candidates sent by HMDB based on the ms search tool.

=over 4

=item B<function01>

=item B<function02>

=back

=head1 AUTHOR

Franck Giacomoni E<lt>franck.giacomoni@inra.frE<gt>

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=head1 VERSION

version 1.0 : 06 / 06 / 2013

version 1.2 : 27 / 01 / 2014

version 1.3 : 19 / 11 / 2014

version 1.4 : 21 / 01 / 2016 - a clean version for community

version 1.5 : 19 / 01 / 2018 - modify parser to match HMDB V4.0 formats

version 1.6 : 23 / 01 / 2019 - Manage UA http error and advanced feature from hmdb.

version 1.6.1 : 30 / 01 / 2019 - Adding adducts and fixing minor bugs and requirements

version 1.7.0 : 19 / 05 / 2022 - Update HMDB API client - compliant with HMDB 5.0 web portal

version 1.7.2 : 22 / 06 / 2022 - Fix issue with neg result parsing

=cut