diff lib/hmdb_api.pm @ 0:7c9269bded0e draft

Init repository for [downloader_bank_hmdb]
author fgiacomoni
date Tue, 14 Jan 2020 05:21:23 -0500
parents
children be504ccbc41c
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/hmdb_api.pm	Tue Jan 14 05:21:23 2020 -0500
@@ -0,0 +1,342 @@
+package hmdb_api ;
+
+use strict;
+use warnings ;
+use Exporter ;
+use Carp ;
+
+use Data::Dumper ;
+use XML::Twig ;
+
+use csv ;
+
+use vars qw($VERSION @ISA @EXPORT %EXPORT_TAGS);
+
+our $VERSION = "1.0";
+our @ISA = qw(Exporter);
+our @EXPORT = qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz);
+our %EXPORT_TAGS = ( ALL => [qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz)] );
+
+=head1 NAME
+
+hmdb_api - parse HMDB / CowMetDB metabolite records (MetaboCard flat files and XML files) into Perl data structures
+
+=head1 SYNOPSIS
+
+    use hmdb_api ;
+    my $api = hmdb_api->new() ;
+    my ( $metabolites, $count ) = $api->getMetaboliteFeatures( $xmlFile ) ;
+
+=head1 DESCRIPTION
+
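+This module reads metabolite records from MetaboCard-style flat files
+or from XML files, computes the [M+H]+ and [M-H]- masses from the
+monoisotopic mass, and arranges the records as arrays of rows (header row first).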
+
+=head1 METHODS
+
+Methods are :
+
+=head2 METHOD new
+
+	## Description : constructor - builds an empty object blessed into the invoking class
+	## Input : $class (class name or existing object ; defaults to this package when omitted)
+	## Output : $self (blessed reference on an empty hash)
+	## Usage : my $api = hmdb_api->new() ;
+
+=cut
+
+sub new {
+    ## Retrieve Values
+    my $class = shift ;
+
+    ## Accept a class name, an existing object, or nothing at all (legacy
+    ## plain-function call "new()") ; in the last case fall back to this package.
+    $class = ref($class) || $class || __PACKAGE__ ;
+
+    ## Two-argument bless : a subclass calling the inherited constructor gets
+    ## an object of its own class instead of the package where new was compiled.
+    my $self = {} ;
+    bless( $self, $class ) ;
+    return $self ;
+}
+### END of SUB
+
+
+=head2 METHOD cowmetdb_handle
+
+	## Description : open a flat file and push its content in memory (one chomped line per element) - compute entries number.
+	## Input : $flat (path of the flat file)
+	## Output : $handler (ARRAY ref of lines), $entries (SCALAR ref on the number of #BEGIN_METABOCARD ... #END_METABOCARD pairs)
+	## Usage : my ( $handler, $entries ) = $api->cowmetdb_handle( $flat ) ;
+	## Errors : croaks when $flat is undefined, does not exist or cannot be opened
+
+=cut
+## START of SUB
+sub cowmetdb_handle {
+    ## Retrieve Values
+    my $self = shift ;
+    my ( $flat ) = @_ ;
+
+    my @handle = () ;
+    my $entries = 0 ;
+    my $in_card = 0 ; ## true between a #BEGIN_METABOCARD line and the next #END_METABOCARD line
+
+    ## Test definedness first so that -e never runs on undef.
+    if ( ( !defined $flat ) or ( !-e $flat ) ) {
+        croak "Can't find the source file " . ( defined $flat ? $flat : '(undefined path)' ) . "\n" ;
+    }
+
+    ## Three-argument open with a lexical handle : the path is never parsed for
+    ## a mode and no package-global FILE handle is clobbered.
+    open( my $fh, '<', $flat ) or croak "Can't read the file $flat: $!\n" ;
+    while ( my $line = <$fh> ) {
+        chomp $line ;
+        push( @handle, $line ) ;
+        if ( $line =~ /^#BEGIN_METABOCARD/ ) {
+            $in_card = 1 ;
+        }
+        elsif ( ( $line =~ /^#END_METABOCARD/ ) and ( $in_card == 1 ) ) {
+            ## count entries : an END line only counts when a BEGIN line is pending
+            $entries++ ;
+            $in_card = 0 ;
+        }
+    }
+    close($fh) ;
+
+    ## The counter is returned as a SCALAR reference, as callers expect.
+    return( \@handle, \$entries ) ;
+}
+## END of SUB
+
+=head2 METHOD cowmetdb_hash
+
+	## Description : walk through the lines of a flat text handler and build one hash per entry with the selected fields (the value of a field is the line following its tag)
+	## Input : $handler (ARRAY ref of lines)
+	## Output : $entries (ARRAY ref of HASH refs)
+	## Usage : my ( $entries ) = $api->cowmetdb_hash( $handler ) ;
+
+=cut
+## START of SUB
+sub cowmetdb_hash {
+    ## Retrieve Values
+    my $self = shift ;
+    my ( $handle ) = @_ ;
+
+    croak "Handle is not defined : parsing step impossible\n" unless ( defined $handle ) ;
+
+    ## tag found in the flat file => key used in the entry hash
+    my %key_for = (
+        'name'                 => 'COMMON_NAME',
+        'iupac'                => 'IUPAC',
+        'kegg_compound_id'     => 'KEGG_ID',
+        'chemical_formula'     => 'FORMULA',
+        'taxonomy_super_class' => 'TAXONOMY',
+        'cas_number'           => 'CAS',
+        'biofluid_location'    => 'LOCATION',
+        'inchi_identifier'     => 'INCHI',
+        'weight_average'       => 'MZ_AVERAGE',
+        'weight_mono'          => 'MZ_MONO',
+        'biocyc_id'            => 'BIOCYC_ID',
+        'hmdb_id'              => 'HMDB_ID',
+    ) ;
+
+    my @cards = () ;
+    my %card = () ;
+
+    for my $idx ( 0 .. $#{$handle} ) {
+        my $line = $handle->[$idx] ;
+
+        if ( $line =~ /^#BEGIN_METABOCARD/ ) {
+            ## a new card starts : forget the fields of the previous one
+            %card = () ;
+        }
+        elsif ( $line =~ /^#END_METABOCARD/ ) {
+            ## store a copy, %card itself is reused for the next card
+            push( @cards, { %card } ) ;
+        }
+        elsif ( ( $line =~ /^# ([^:]*):/ ) and ( exists $key_for{$1} ) ) {
+            ## known tag : its value is the next line of the handler
+            $card{ $key_for{$1} } = $handle->[ $idx + 1 ] ;
+        }
+    }
+
+    return( \@cards ) ;
+}
+## END of SUB
+
+
+
+=head2 METHOD getMetaboliteFeatures
+
+	## Description : get metabolites features (name, chemical formula, monoisotopic mass, inchikey) from a xml file, indexed by accession
+	## Input : $xmlFile
+	## Output : $metabolites (HASH ref : accession => features), $X (number of accessions found)
+	## Usage : my ( $metabolites, $X ) = $api->getMetaboliteFeatures( $xmlFile ) ;
+	## Note : when $xmlFile is undefined or does not exist nothing is parsed : an empty HASH ref and 0 are returned
+
+=cut
+sub getMetaboliteFeatures {
+    ## Retrieve Values
+    my $self = shift ;
+    my ( $xmlFile ) = @_ ;
+
+    my %metabolites = () ;
+    my $id = undef ; ## accession of the metabolite being parsed
+
+    ## Build a handler storing the text of a direct child of <metabolite>
+    ## under the given feature name of the current accession.
+    my $store_as = sub {
+        my ( $feature ) = @_ ;
+        return sub {
+            my ( $t, $elt ) = @_ ;
+            ## no accession seen yet for this record : nothing to attach the value to
+            return 1 unless ( defined $id ) ;
+            $metabolites{$id}{$feature} = $elt->text_only ;
+            return 1 ;
+        } ;
+    } ;
+
+    ## Test definedness first so that -e never runs on undef.
+    if ( ( defined $xmlFile ) and ( -e $xmlFile ) ) {
+
+        my $twig = XML::Twig->nparse_ppe(
+
+            twig_handlers => {
+                'metabolite/accession' => sub {
+                    my ( $t, $elt ) = @_ ;
+                    $id = $elt->text_only ;
+                    $metabolites{$id} = undef ;
+                    return 1 ;
+                },
+                # metabolite name
+                'metabolite/name' => $store_as->('metabolite_name'),
+                # metabolite chemical_formula
+                'metabolite/chemical_formula' => $store_as->('chemical_formula'),
+                # metabolite monisotopic_molecular_weight
+                'metabolite/monisotopic_molecular_weight' => $store_as->('monisotopic_molecular_weight'), ## general case
+                'metabolite/monisotopic_moleculate_weight' => $store_as->('monisotopic_molecular_weight'), ## misspelled tag, stored under the same key
+                # metabolite inchikey
+                'metabolite/inchikey' => $store_as->('inchikey'),
+                # end of a record : its features are already copied in %metabolites,
+                # so free the parsed elements now instead of keeping the whole
+                # document in memory until the end of the file
+                'metabolite' => sub {
+                    my ( $t ) = @_ ;
+                    $t->purge ;
+                    $id = undef ; ## the next record must provide its own accession
+                    return 1 ;
+                },
+            },
+            pretty_print => 'indented',
+            error_context => 1, $xmlFile
+        ) ;
+
+        $twig->purge ;
+    }
+
+    ## get number of entries:
+    my $X = keys %metabolites ;
+
+    return ( \%metabolites, $X ) ;
+}
+### END of SUB
+
+=head2 METHOD setMetaboliteAcurrateMzToModesMz
+
+	## Description : set M+H and M-H masses from a metabolite (M) accurate mass - the given structure is updated in place
+	## Input : $format ('XML' : HASH ref indexed by id / 'CARD' : ARRAY ref of entries), $metabolites, $proton_mass, $electron_mass, $charge (optional, default 1)
+	## Output : $mzsMetabolites (the same reference as $metabolites ; returned untouched for any other $format)
+	## Usage : my ( $mzsMetabolites ) = $api->setMetaboliteAcurrateMzToModesMz ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) ;
+
+=cut
+## START of SUB
+sub setMetaboliteAcurrateMzToModesMz {
+    ## Retrieve Values
+    my $self = shift ;
+    my ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) = @_;
+
+    ## An omitted charge used to be treated as 0 (undef in a product), which
+    ## zeroed every computed mass : default to a single charge instead.
+    $charge = 1 unless ( defined $charge ) ;
+    ## An omitted format matches no branch below ; avoid comparing undef.
+    $format = '' unless ( defined $format ) ;
+
+    ## NOTE(review): the adduct mass is multiplied by $charge as in the original
+    ## code ; confirm this is the intended formula for charges other than 1.
+    if ( $format eq 'XML' ) {
+        foreach my $id ( sort keys %{$metabolites} ) {
+            if ( $metabolites->{$id}{'monisotopic_molecular_weight'} ) {
+                my $tmp_mass = $metabolites->{$id}{'monisotopic_molecular_weight'} ;
+                $metabolites->{$id}{'[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass ) * $charge ;
+                $metabolites->{$id}{'[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass ) * $charge ;
+            }
+            else {
+                warn "No monisotopic_molecular_weight field exists with metabolite $id\n " ;
+            }
+        }
+    }
+    elsif ( $format eq 'CARD' ) {
+        foreach my $entry ( @{$metabolites} ) {
+            if ( $entry->{'MZ_MONO'} ) {
+                my $tmp_mass = $entry->{'MZ_MONO'} ;
+                $entry->{'MZ_[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass ) * $charge ;
+                $entry->{'MZ_[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass ) * $charge ;
+            }
+            else {
+                ## an entry may also lack its identifier : keep the message readable
+                my $label = ( defined $entry->{'HMDB_ID'} ) ? $entry->{'HMDB_ID'} : '(unknown HMDB_ID)' ;
+                warn "No MZ_MONO field exists with metabolite $label\n " ;
+            }
+        }
+    }
+
+    return ($metabolites) ;
+}
+### END of SUB
+
+=head2 METHOD buildMetabolitesArray
+
+	## Description : build a list of rows (header row first, then one row per metabolite sorted by id) from the xml extraction
+	## Input : $metabolites (HASH ref : id => features), $headers (optional ARRAY ref used as first row)
+	## Output : $metabolitesSorted (ARRAY ref of ARRAY refs)
+	## Usage : my ( $metabolitesSorted ) = $api->buildMetabolitesArray ( $metabolites, $headers ) ;
+
+=cut
+## START of SUB
+sub buildMetabolitesArray {
+    ## Retrieve Values
+    my $self = shift ;
+    my ( $metabolites, $headers ) = @_;
+
+    ## features written after the id, in column order
+    my @features = ( 'monisotopic_molecular_weight', '[M+H]+', '[M-H]-', 'metabolite_name', 'chemical_formula', 'inchikey' ) ;
+
+    ## first row : the given headers, or the default ones
+    my @rows = (
+        ( defined $headers )
+            ? $headers
+            : ['HMDB_ID','MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey']
+    ) ;
+
+    ## one row per metabolite, ids in string sort order
+    for my $accession ( sort keys %{$metabolites} ) {
+        my @row = ( $accession, map { $metabolites->{$accession}{$_} } @features ) ;
+        push( @rows, \@row ) ;
+    }
+
+    return ( \@rows ) ;
+}
+### END of SUB
+
+=head2 METHOD cowmetdb_hash_to_inhouse_format
+
+	## Description : adaptor from the cowmetdb entry hashes to the inhouse format : a list of rows, field names first, then one row per entry
+	## Input : $entries (ARRAY ref of HASH refs)
+	## Output : $tsv_handler (ARRAY ref of ARRAY refs)
+	## Usage : my ( $tsv_handler ) = $api->cowmetdb_hash_to_inhouse_format( $entries ) ;
+
+=cut
+## START of SUB
+sub cowmetdb_hash_to_inhouse_format {
+    ## Retrieve Values
+    my $self = shift ;
+    my ( $entries ) = @_ ;
+
+    ## columns of the inhouse format, in output order
+    my @fields_name = (
+        'HMDB_ID', 'COMMON_NAME', 'CAS', 'FORMULA', 'MZ_MONO', 'MZ_AVERAGE', 'MZ_[M+H]+',
+        'MZ_[M-H]-', 'KEGG_ID', 'BIOCYC_ID', 'INCHI', 'LOCATION', 'TAXONOMY', 'IUPAC',
+    ) ;
+
+    ## first line : the field names themselves
+    my @rows = ( \@fields_name ) ;
+
+    ## one entry by one line, values picked in the order of the field names
+    for my $entry ( @{$entries} ) {
+        push( @rows, [ map { $entry->{$_} } @fields_name ] ) ;
+    }
+
+    return( \@rows ) ;
+}
+## END of SUB
+
+
+1 ;
+
+
+__END__
+
+=head1 SUPPORT
+
+You can find documentation for this module with the perldoc command.
+
+ perldoc hmdb_api.pm
+
+=head1 Exports
+
+=over 4
+
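+=item :ALL is getMetaboliteFeatures, cowmetdb_handle, cowmetdb_hash, cowmetdb_hash_to_inhouse_format, buildMetabolitesArray and setMetaboliteAcurrateMzToModesMz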
+
+=back
+
+=head1 AUTHOR
+
+Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt>
+
+=head1 LICENSE
+
+This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
+
+=head1 VERSION
+
+version 1 : xx / xx / 201x
+
+version 2 : ??
+
+=cut
\ No newline at end of file