view massbank_ws_searchspectrum.pl @ 0:023c380900ef draft default tip

Init repository with last massbank_ws_searchspectrum master version
author fgiacomoni
date Wed, 19 Apr 2017 11:31:58 -0400
parents
children
line wrap: on
line source

#!perl

## script  : XXX.pl

## Notes :
#	-> manage score sorting : Cleaned_pcGroups done but not in outputs !

#=============================================================================
#                              Included modules and versions
#=============================================================================
## Perl modules
use strict ;
use warnings ;
use Carp qw (cluck croak carp) ;

use threads;
use threads::shared;
use Thread::Queue;

use Data::Dumper ;
use Getopt::Long ;
use POSIX ;
use FindBin ; ## Allows you to locate the directory of original perl script

## Specific Perl Modules (PFEM)
use lib $FindBin::Bin ;
my $binPath = $FindBin::Bin ;
use lib::csv  qw( :ALL ) ;
use lib::conf  qw( :ALL ) ;

## Dedicate Perl Modules (Home made...)
use lib::massbank_api qw( :ALL ) ;
use lib::threader qw(:ALL) ;
use lib::mapper qw(:ALL) ;
use lib::writter qw(:ALL) ;
use lib::massbank_parser qw(:ALL) ;



## Initialized values
my ($help, $mzs_file, $col_mz, $col_int, $col_pcgroup, $line_header ) = ( undef, undef, undef, undef, undef,undef, undef ) ;
my ($server, $ion_mode, $score_threshold, $instruments, $max, $unit, $tol, $cutoff) = ( undef, undef, undef, undef, undef, undef, undef ) ;
my ($output_json, $output_tabular, $output_xlsx, $output_html ) = ( undef, undef, undef, undef ) ;

## Local values ONLY FOR TEST :
#my $server = 'JP' ;
#my $threading_threshold = 6 ;

#=============================================================================
#                                Manage EXCEPTIONS
#=============================================================================
&GetOptions ( 	"help|h"     		=> \$help,       # HELP
				"masses:s"			=> \$mzs_file,
				"col_mz:i"			=> \$col_mz,
				"col_int:i"			=> \$col_int, ## optionnal
				"col_pcgroup:i"		=> \$col_pcgroup,
				"lineheader:i"		=> \$line_header,
				"mode:s"			=> \$ion_mode, 
				"score_threshold:f"	=> \$score_threshold, 
				"instruments:s"		=> \$instruments, # advanced -> to transform into string with comma => done !
				"max:i"				=> \$max, # advanced
				"unit:s"			=> \$unit, # advanced
				"tolerance:f"		=> \$tol, 
				"cutoff:i"			=> \$cutoff, # advanced : intensity cutoff
				"server:s"			=> \$server, ## by default JP and # advanced
				"output_json:s"		=> \$output_json,
				"output_xlsx:s"		=> \$output_xlsx,
				"output_tabular:s"	=> \$output_tabular,
				"output_html:s"		=> \$output_html,
            ) ;
         
## if you put the option -help or -h function help is started
if ( defined($help) ){ &help ; }

#=============================================================================
#                                MAIN SCRIPT
#=============================================================================

## -------------- Conf file ------------------------ :
my ( $CONF ) = ( undef ) ;
foreach my $conf ( <$binPath/*.cfg> ) {
	my $oConf = lib::conf::new() ;
	$CONF = $oConf->as_conf($conf) ;
}

## -------------- HTML template file ------------------------ :
foreach my $html_template ( <$binPath/*.tmpl> ) { $CONF->{'HTML_TEMPLATE'} = $html_template ; }

## Main variables :
my ($pcs, $mzs, $into, $complete_rows, $pcgroups) = (undef, undef, undef, undef, undef) ;

## manage csv file containing list of masses (every thing is manage in jar)
if ( ( defined $mzs_file ) and ( $mzs_file ne "" ) and ( -e $mzs_file ) ) {
	
	## parse csv ids and masses
	my $is_header = undef ;
	my $ocsv = lib::csv->new() ;
	my $csv = $ocsv->get_csv_object( "\t" ) ;
	if ( ( defined $line_header ) and ( $line_header > 0 ) ) { $is_header = 'yes' ;    }
	$pcs = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_pcgroup, $is_header, $line_header ) ; ## retrieve pc values on csv
	$mzs = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_mz, $is_header, $line_header ) ; ## retrieve mz values on csv
	$into = $ocsv->get_value_from_csv_multi_header( $csv, $mzs_file, $col_int, $is_header, $line_header ) if ( defined $col_int ); ## retrieve into values on csv // optionnal in input files
	$complete_rows = $ocsv->parse_csv_object($csv, \$mzs_file) ; ## parse all csv for output csv build

	## manage input file with no into colunm / init into with a default value of 10
	if ( !defined $col_int ) {
		my $nb_mzs = scalar(@{$mzs}) ;
		my @intos = map {10} (0..$nb_mzs-1) ;
		my $nb_intos = scalar(@intos) ;
		if ($nb_intos == $nb_mzs) { $into = \@intos ;	}
		else { carp "A difference exists between intensity and mz values\n" }
	}
	
	## manage instruments string to array_ref
	if (defined $instruments ) {
		if ($instruments eq '') { ## in xml : can select nothing...
			$instruments = ['all'] ;
		}
		else {
			my @instruments = split(/,/, $instruments) ;
			$instruments = \@instruments ;
		}
	}
	
	
	## Build pcgroups with their features :
	my $omap = lib::mapper->new() ;
	$pcgroups = $omap->get_pcgroups($pcs, $mzs, $into ) ;
	my $pcgroup_list = $omap->get_pcgroup_list($pcs ) ;
	
#	print Dumper $pcgroups ;
	
	my $pc_num = 0 ;
	$pc_num = scalar(@{$pcgroup_list}) ;
	
	## manage a list of query pc_group dependant:
	if ($pcgroups) {
		## - - - - - - -  - - - - -  - - - -  - - - - - Multithreadind mode if pcgroups > 6 - - - - - - - - - - - - - - - - 
		if ($pc_num > $CONF->{'THREADING_THRESHOLD'}) {
			print $server."\n" ;
			print "\n------  ** ** ** Using multithreading mode ** ** ** --------\n\n" ;
			my $time_start = time ;
			
			our $NBTHREADS = $CONF->{'THREADING_THRESHOLD'} ;

#			use constant THREADS => 6 ;
			my $Qworks = Thread::Queue->new();
			my @threads = () ;
			my @queries = () ;
			my @Qresults = () ;
			
			foreach my $pc_group_id (keys %{$pcgroups}) {
				push (@queries, $pcgroups->{$pc_group_id}) if $pcgroups->{$pc_group_id} ;
			}
			
			for (1..$NBTHREADS) {
				my $oworker = lib::threader->new ;
			    push @threads, threads->create(sub { $oworker->searchSpectrumWorker($Qworks, $server, $ion_mode, $instruments, $max, $unit, $tol, $cutoff) ; } ) ;
			}
			
			$Qworks->enqueue(@queries);
			$Qworks->enqueue(undef) for 1..$NBTHREADS;
			push @Qresults, $_->join foreach @threads;

			
			my $time_end = time ;
			my $seconds = $time_end-$time_start ;
			print "\n------  Time used in multithreading mode : $seconds seconds --------\n\n" ;
			
#			print Dumper @Qresults ;
			
			## controle number of returned queries :
			my $massbank_results_num = 0 ;
			$massbank_results_num = scalar @Qresults ;
			
			if ( $massbank_results_num == $pc_num ) {
				## Map @Qresults with annotation hash : pcgroup_id in @Qresults (pcgroup2) // id in $pcgroups (pcgroup2)
				foreach my $result (@Qresults) {
					## manage annotation part
					if ($result->{'pcgroup_id'}) {
						if ($pcgroups->{$result->{'pcgroup_id'}}) {
							$pcgroups->{$result->{'pcgroup_id'}}{'annotation'} = $result ;
						}
						else { carp "Carefull : no mapping possible between massbank results and initial pcgroups data\n";}
					}
					else { carp "Carefull : no pcgroup id defined in massbank results\n"; }
					
					## manage massbank_ids part
					if ($result->{'res'}) {
						my @tmp_res = map {$_->{'id'}} @{$result->{'res'}} ;
						$pcgroups->{$result->{'pcgroup_id'}}{'massbank_ids'} = \@tmp_res ;					
					}
				}
			}
			else {
				croak "[ERROR] : problem between massbank results number and pcgroups number\n";
			}
		}
		## - - - - - - -  - - - - -  - - - -  - - - - - mono thread mode if pcgroups <= 6 - - - - - - - - - - - - - - - - 
		else {
			## connexion
#			print $server."\n" ;
			my $omassbank = lib::massbank_api->new() ;
			my $soap = $omassbank->selectMassBank($server) ;
			print "\n------  ** ** ** Using batch mode ** ** ** --------\n\n" ;
			my $time_start = time ;
			foreach my $pcgroup (keys %{$pcgroups}) {
				## searchSpectrum via SOAP
				print "Annot pcgroup n-$pcgroup\n" ;
				my $oquery = lib::massbank_api->new() ;
				my ($results, $num) = $oquery->searchSpectrum($soap, $pcgroups->{$pcgroup}{'id'}, $pcgroups->{$pcgroup}{'mzmed'}, $pcgroups->{$pcgroup}{'into'}, $ion_mode, $instruments, $max, $unit, $tol, $cutoff) ;
				$pcgroups->{$pcgroup}{'annotation'} = $results ;
	#			print Dumper $results ;
			}
			my $time_end = time ;
			my $seconds = $time_end-$time_start ;
			print "\n------  Time used in foreach mode: $seconds seconds --------\n\n" ;
		}
	}
	else {
		croak "The pcgroup object is not defined\n" ;
	}
#	print "Init pcGroups results are\n"  ;
#	print Dumper $pcgroups ;
	
} ## End of elsif "defined $mzs_file"
else {
	warn "[WARN] Can't use Massbank WS service without an existing input tabular file\n" ;
	&help ;
}

## Clean zone - use threshold on massbank entry returned score
my $omap = lib::mapper->new() ;
my $cleaned_pcgroups = $omap->filter_pcgroup_res($pcgroups, $score_threshold) ;

#print "Cleaned_pcGroups are\n" ;
#print Dumper $cleaned_pcgroups ;

## add min/max value of each mzmed in the pc_group
my $pcgroups_with_intervales = $omap->add_min_max_for_pcgroup_res($cleaned_pcgroups, $tol ) ;

#print "pcGroups_with_intervales are\n" ;
#print Dumper $pcgroups_with_intervales ;


## search in the local indexed db - - - TODO - - - 

## OR search new ones 

## get all unique Massbank Ids found
my $oids = lib::mapper->new() ;
my $all_massbank_ids = $omap->compute_ids_from_pcgroups_res($cleaned_pcgroups) ;

## get entries on the MassBank server by ID by pieces of 10
my $omapper = lib::mapper->new() ;
my $recordList = $omapper->get_massbank_records_by_chunk ($server, $all_massbank_ids, 10) ;
#print "\n\nRecords are\n" ;
#print Dumper $recordList ;
#print Dumper $all_massbank_ids ;

## foreach record - get id and peaks - create a object
my %records = ();
foreach (@$recordList) {
	## parse record handles
	my $parser = lib::massbank_parser->new() ;
	my $id = $parser->getIdFromString($_) ;
	$records{$id}{'peaks'} = $parser->getPeaksFromString($_) ;
	$records{$id}{'names'} = $parser->getChemNamesFromString($_) ;
	$records{$id}{'instrument_type'} = $parser->getInstrumentTypeFromString($_) ;
	$records{$id}{'precursor_type'} = $parser->getPrecursorTypeFromString($_) ;
	$records{$id}{'ms_type'} = $parser->getMsTypeFromString($_) ;
	$records{$id}{'formula'} = $parser->getFormulaFromString($_) ;
	$records{$id}{'exact_mz'} = $parser->getExactMzFromString($_) ;
	$records{$id}{'inchi'} = $parser->getInchiFromString($_) ;
}
#print Dumper %records ;

## Map pc_groups and records
my $well_annoted_pcGroups = $omapper->mapGroupsWithRecords($pcgroups_with_intervales, \%records) ;

#print Dumper $well_annoted_pcGroups ;

## Output writting :
my ( $massbank_matrix ) = ( undef ) ;

## XLS OUTPUT -- new format
if (  (defined $output_xlsx) and  (defined $well_annoted_pcGroups) and  (defined $mzs) and  (defined $pcs)  ) {
	my $owritter = lib::writter->new() ;
	$owritter->write_xls_skel(\$output_xlsx, $mzs, $pcs, $well_annoted_pcGroups, \%records) ;
}

## CSV OUTPUT
if (  (defined $output_tabular) and  (defined $well_annoted_pcGroups) and  (defined $pcs) and  (defined $mzs) ) {
	my $omapper = lib::mapper->new() ;
	if ( ( defined $line_header ) and ( $line_header == 1 ) ) { $massbank_matrix = $omapper->set_massbank_matrix_object('massbank', $pcs, $mzs, $well_annoted_pcGroups, \%records ) ; }
	elsif ( ( defined $line_header ) and ( $line_header == 0 ) ) { $massbank_matrix = $omapper->set_massbank_matrix_object(undef, $pcs, $mzs, $well_annoted_pcGroups, \%records ) ; }

	$massbank_matrix = $omapper->add_massbank_matrix_to_input_matrix($complete_rows, $massbank_matrix) ;
	my $owritter = lib::writter->new() ;
	$owritter->write_csv_skel(\$output_tabular, $massbank_matrix) ;
}

my $json_scalar = undef ;
## JSON OUTPUT
if (  (defined $output_json) and  (defined $well_annoted_pcGroups) and  (defined $mzs) and  (defined $pcs)  ) {
	my $omapper = lib::mapper->new() ;
	$json_scalar = $omapper->map_pc_to_generic_json($pcs, $well_annoted_pcGroups, \%records) ;
	my $owritter = lib::writter->new() ;
	$owritter->write_json_skel(\$output_json, $json_scalar) ;
}

## HTML OUTPUT -- TODO
if (  (defined $output_html) and  (defined $json_scalar)  ) {
	
#	print Dumper $json_scalar ;
	
	## Uses N mz and theirs entries per page (see config file).
	# how many pages you need with your input mz list?
	my $nb_pages_for_html_out = ceil( scalar(@{$mzs} ) / $CONF->{HTML_ENTRIES_PER_PAGE} )  ;
	
	## Search condition:
	my $search_condition = "Search params : Molecular specie = $ion_mode / delta ($unit) = $tol / Score threshold = $score_threshold and max hit = $max per pcgroup" ;
	
	my $oHtml = lib::mapper->new() ;
	my ($tbody_object) = $oHtml->set_html_tbody_object( $nb_pages_for_html_out, $CONF->{HTML_ENTRIES_PER_PAGE} ) ;
	($tbody_object) = $oHtml->add_mz_to_tbody_object($tbody_object, $CONF->{HTML_ENTRIES_PER_PAGE}, $mzs, $json_scalar) ;
	($tbody_object) = $oHtml->add_entries_to_tbody_object($tbody_object, $CONF->{HTML_ENTRIES_PER_PAGE}, $mzs, $json_scalar) ;
	
	my $oWritter = lib::writter->new() ;
	$oWritter->write_html_skel(\$output_html, $tbody_object, $nb_pages_for_html_out, $search_condition, $CONF->{'HTML_TEMPLATE'}, $CONF->{'JS_GALAXY_PATH'}, $CONF->{'CSS_GALAXY_PATH'}) ;
}
else {
	warn "[WARN] The html output file or the json iss not defined\n" ;
}







#====================================================================================
# Help subroutine called with -h option
# number of arguments : 0
# Argument(s)        :
# Return           : 1
#====================================================================================
sub help {
	print STDERR "
massbank_ws_searchspectrum.pl

# massbank_ws_searchspectrum.pl is a script to use SOAP massbank webservice and send specific queries about spectra searches. 
# Input : a list of mzs, intensities, pcgroup.
# Author : Franck Giacomoni
# Email : franck.giacomoni\@clermont.inra.fr
# Version : 1.0
# Created : 20/01/2017
USAGE :		 
		massbank_ws_searchspectrum.pl -help OR
		
		massbank_ws_searchspectrum.pl 
			-masses [name of input file] -col_id -col_mz -col_int -col_pcgroup -lineheader
			-mode [ion mode : Positive, Negative or Both ]
			-score_threshold [Ignore Massbank results with a score lower than the defined threshold]
			-instruments [array of string: all or values obtained by getInstrumentTypes method]
			-max [0 is all results or int]
			-unit [unit or ppm]
			-tolerance [Tolerance of values of m/z of peaks: 0.3 unit or 50 ppm]
			-cutoff [Ignore peaks whose intensity is not larger than the value of cutoff. Default: 50)]
			-server [name of the massbank server : EU or JP only]
			-output_json [ouput file for JSON]
			-output_xls [ouput file for XLS]
			-output_tabular [ouput file for TABULAR]
		
		";
	exit(1);
}

## END of script - F Giacomoni 

__END__

=head1 NAME

 XXX.pl -- script for

=head1 USAGE

 XXX.pl -precursors -arg1 [-arg2] 
 or XXX.pl -help

=head1 SYNOPSIS

This script manage ... 

=head1 DESCRIPTION

This main program is a ...

=over 4

=item B<function01>

=item B<function02>

=back

=head1 AUTHOR

Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt>
Yann Guitton 

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=head1 VERSION

version 1 : 05 / 01 / 2016

version 2 : ??

=cut