view query_mass_repos.py @ 23:85fd05d0d16c

New tool to Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers
author pieter.lukasse@wur.nl
date Thu, 03 Apr 2014 16:44:11 +0200
parents
children 60b53f2aa48a
line wrap: on
line source

#!/usr/bin/env python
# encoding: utf-8
'''
Module to query a set of accurate mass values detected by high-resolution mass spectrometers
against various repositories/services such as METabolomics EXPlorer database or the 
MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).

It will take the input file and for each record it will query the 
molecular mass in the selected repository/service. If one or more compounds are found 
then extra information regarding these compounds is added to the output file.

The output file is thus the input file enriched with information about 
related items found in the selected repository/service.   

The service should implement the following interface: 

http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth   (txth means there is guaranteed to be a header line before the data)

The output should be tab separated and should contain the following columns (in this order)
db-name    molecular-formula    dbe    formula-weight    id    description


'''
import csv
import sys
import fileinput
import urllib2
import time
from collections import OrderedDict

__author__ = "Pieter Lukasse"
__contact__ = "pieter.lukasse@wur.nl"
__copyright__ = "Copyright, 2014, Plant Research International, WUR"
__license__ = "Apache v2"

def _process_file(in_xsv, delim='\t'):
    '''
    Generic method to parse a tab-separated file returning a dictionary with named columns
    @param in_csv: input filename to be parsed
    '''
    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
    return _process_data(data)
    
def _process_data(data):
    
    header = data.pop(0)
    # Create dictionary with column name as key
    output = OrderedDict()
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]
    return output


def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
    
    '''
    This method will iterate over the record in the input_data and
    will enrich them with the related information found (if any) in the 
    chosen repository/service
    
    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
    '''
    merged = []
    
    for i in xrange(len(input_data[input_data.keys()[0]])):
        # Get the record in same dictionary format as input_data, but containing
        # a value at each column instead of a list of all values of all records:
        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
        
        # read the molecular mass :
        molecular_mass = input_data_record[molecular_mass_col]
        
        
        # search for related records in repository/service:
        data_found = None
        if molecular_mass != "": 
            molecular_mass = float(molecular_mass)
            
            # 1- search for data around this MM:
            query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"
            
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "MM"
        
                
        if data_found == None:
            # If still nothing found, just add empty columns
            extra_cols = ['', '','','','','']
        else:
            # Add info found:
            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
        
        # Take all data and merge it into a "flat"/simple array of values:
        field_values_list = _merge_data(input_data_record, extra_cols)
    
        merged.append(field_values_list)

    # return the merged/enriched records:
    return merged


def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
    '''
    This method will go over the data found and will return a 
    list with the following items:
    - details of hits found :
         db-name    molecular-formula    dbe    formula-weight    id    description
    - Link that executes same query
        
    '''
    
    # set() makes a unique list:
    db_name_set = []
    molecular_formula_set = []
    id_set = []
    description_set = []
    
    
    if 'db-name' in data_found:
        db_name_set = set(data_found['db-name'])
    elif '# db-name' in data_found:
        db_name_set = set(data_found['# db-name'])    
    if 'molecular-formula' in data_found:
        molecular_formula_set = set(data_found['molecular-formula'])
    if 'id' in data_found:
        id_set = set(data_found['id'])
    if 'description' in data_found:
        description_set = set(data_found['description'])
    
    result = [data_type_found,
              _to_xsv(db_name_set),
              _to_xsv(molecular_formula_set),
              _to_xsv(id_set),
              _to_xsv(description_set),
              #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): 
              "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
    return result


def _to_xsv(data_set):
    result = ""
    for item in data_set:
        result = result + str(item) + "|"    
    return result


def _fire_query_and_return_dict(url):
    '''
    This method will fire the query as a web-service call and 
    return the results as a list of dictionary objects
    '''
    
    try:
        data = urllib2.urlopen(url).read()
        
        # transform to dictionary:
        result = []
        data_rows = data.split("\n")
        
        # remove comment lines if any (only leave the one that has "molecular-formula" word in it...compatible with kazusa service):
        data_rows_to_remove = []
        for data_row in data_rows:
            if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
                data_rows_to_remove.append(data_row)
                
        for data_row in data_rows_to_remove:
            data_rows.remove(data_row)
        
        # check if there is any data in the response:
        if len(data_rows) <= 1 or data_rows[1].strip() == '': 
            # means there is only the header row...so no hits:
            return None
        
        for data_row in data_rows:
            if not data_row.strip() == '':
                row_as_list = _str_to_list(data_row, delimiter='\t')
                result.append(row_as_list)
        
        # return result processed into a dict:
        return _process_data(result)
        
    except urllib2.HTTPError, e:
        raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
    except urllib2.URLError, e:
        raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")

def _str_to_list(data_row, delimiter='\t'):    
    result = []
    for column in data_row.split(delimiter):
        result.append(column)
    return result
    
    
# alternative: ?    
#     s = requests.Session()
#     s.verify = False
#     #s.auth = (token01, token02)
#     resp = s.get(url, params={'name': 'anonymous'}, stream=True)
#     content = resp.content
#     # transform to dictionary:
    
    
    
    
def _merge_data(input_data_record, extra_cols):
    '''
    Adds the extra information to the existing data record and returns
    the combined new record.
    '''
    record = []
    for column in input_data_record:
        record.append(input_data_record[column])
    
    
    # add extra columns
    for column in extra_cols:
        record.append(column)    
    
    return record  
    

def _save_data(data_rows, headers, out_csv):
    '''
    Writes tab-separated data to file
    @param data_rows: dictionary containing merged/enriched dataset
    @param out_csv: output csv file
    '''

    # Open output file for writing
    outfile_single_handle = open(out_csv, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

    # Write headers
    output_single_handle.writerow(headers)

    # Write one line for each row
    for data_row in data_rows:
        output_single_handle.writerow(data_row)

def _get_repository_URL(repository_file):
    '''
    Read out and return the URL stored in the given file.
    '''
    file_input = fileinput.input(repository_file)
    try:
        for line in file_input:
            if line[0] != '#':
                # just return the first line that is not a comment line:
                return line
    finally:
        file_input.close()
    

def main():
    '''
    Query main function
    
    The input file can be any tabular file, as long as it contains a column for the molecular mass.
    This column is then used to query against the chosen repository/service Database.   
    '''
    seconds_start = int(round(time.time()))
    
    input_file = sys.argv[1]
    molecular_mass_col = sys.argv[2]
    repository_file = sys.argv[3]
    error_margin = float(sys.argv[4])
    margin_unit = sys.argv[5]
    output_result = sys.argv[6]

    # Parse repository_file to find the URL to the service:
    repository_dblink = _get_repository_URL(repository_file)
    
    # Parse tabular input file into dictionary/array:
    input_data = _process_file(input_file)
    
    # Query data against repository :
    enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
    headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ',
                                   'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits']

    _save_data(enriched_data, headers, output_result)
    
    seconds_end = int(round(time.time()))
    print "Took " + str(seconds_end - seconds_start) + " seconds"
                      
                      

if __name__ == '__main__':
    main()