Mercurial > repos > pieterlukasse > prims_metabolomics
view query_mass_repos.py @ 39:455d650e3dbc
small fixes
author | pieter.lukasse@wur.nl |
---|---|
date | Fri, 19 Sep 2014 12:56:00 +0200 |
parents | 60b53f2aa48a |
children |
line wrap: on
line source
#!/usr/bin/env python # encoding: utf-8 ''' Module to query a set of accurate mass values detected by high-resolution mass spectrometers against various repositories/services such as METabolomics EXPlorer database or the MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/). It will take the input file and for each record it will query the molecular mass in the selected repository/service. If one or more compounds are found then extra information regarding these compounds is added to the output file. The output file is thus the input file enriched with information about related items found in the selected repository/service. The service should implement the following interface: http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data) The output should be tab separated and should contain the following columns (in this order) db-name molecular-formula dbe formula-weight id description ''' import csv import sys import fileinput import urllib2 import time from collections import OrderedDict __author__ = "Pieter Lukasse" __contact__ = "pieter.lukasse@wur.nl" __copyright__ = "Copyright, 2014, Plant Research International, WUR" __license__ = "Apache v2" def _process_file(in_xsv, delim='\t'): ''' Generic method to parse a tab-separated file returning a dictionary with named columns @param in_csv: input filename to be parsed ''' data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) return _process_data(data) def _process_data(data): header = data.pop(0) # Create dictionary with column name as key output = OrderedDict() for index in xrange(len(header)): output[header[index]] = [row[index] for row in data] return output def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit): ''' This method will iterate over the record in the input_data and will enrich them with the related information found (if any) in the chosen repository/service # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies ''' merged = [] for i in xrange(len(input_data[input_data.keys()[0]])): # Get the record in same dictionary format as input_data, but containing # a value at each column instead of a list of all values of all records: input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) # read the molecular mass : molecular_mass = input_data_record[molecular_mass_col] # search for related records in repository/service: data_found = None if molecular_mass != "": molecular_mass = float(molecular_mass) # 1- search for data around this MM: query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth" data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") data_type_found = "MM" if data_found == None: # If still nothing found, just add empty columns extra_cols = ['', '','','','',''] else: # Add info found: extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) # Take all data and merge it into a "flat"/simple array of values: field_values_list = _merge_data(input_data_record, extra_cols) merged.append(field_values_list) # return the merged/enriched records: return merged def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): ''' This method will go over the data found and will return a list with the following items: - details of hits found : db-name molecular-formula dbe formula-weight id description - Link that executes same query ''' # set() makes a unique list: db_name_set = [] molecular_formula_set = [] id_set = [] description_set = [] if 'db-name' in data_found: db_name_set = set(data_found['db-name']) elif '# db-name' in data_found: db_name_set = set(data_found['# db-name']) if 'molecular-formula' in data_found: molecular_formula_set = set(data_found['molecular-formula']) if 'id' in data_found: id_set = set(data_found['id']) if 'description' in data_found: description_set = set(data_found['description']) result = [data_type_found, _to_xsv(db_name_set), _to_xsv(molecular_formula_set), _to_xsv(id_set), _to_xsv(description_set), #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] return result def _to_xsv(data_set): result = "" for item in data_set: result = result + str(item) + "|" return result def _fire_query_and_return_dict(url): ''' This method will fire the query as a web-service call and return the results as a list of dictionary objects ''' try: data = urllib2.urlopen(url).read() # transform to dictionary: result = [] data_rows = data.split("\n") # remove comment lines if any (only leave the one that has "molecular-formula" word in it...compatible with kazusa service): data_rows_to_remove = [] for data_row in data_rows: if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row): data_rows_to_remove.append(data_row) for data_row in data_rows_to_remove: data_rows.remove(data_row) # check if there is any data in the response: if len(data_rows) <= 1 or data_rows[1].strip() == '': # means there is only the header row...so no hits: return None for data_row in data_rows: if not data_row.strip() == '': row_as_list = _str_to_list(data_row, delimiter='\t') result.append(row_as_list) # return result processed into a dict: return _process_data(result) except urllib2.HTTPError, e: raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) except urllib2.URLError, e: raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ") def _str_to_list(data_row, delimiter='\t'): result = [] for column in data_row.split(delimiter): result.append(column) return result # alternative: ? # s = requests.Session() # s.verify = False # #s.auth = (token01, token02) # resp = s.get(url, params={'name': 'anonymous'}, stream=True) # content = resp.content # # transform to dictionary: def _merge_data(input_data_record, extra_cols): ''' Adds the extra information to the existing data record and returns the combined new record. ''' record = [] for column in input_data_record: record.append(input_data_record[column]) # add extra columns for column in extra_cols: record.append(column) return record def _save_data(data_rows, headers, out_csv): ''' Writes tab-separated data to file @param data_rows: dictionary containing merged/enriched dataset @param out_csv: output csv file ''' # Open output file for writing outfile_single_handle = open(out_csv, 'wb') output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") # Write headers output_single_handle.writerow(headers) # Write one line for each row for data_row in data_rows: output_single_handle.writerow(data_row) def _get_repository_URL(repository_file): ''' Read out and return the URL stored in the given file. ''' file_input = fileinput.input(repository_file) try: for line in file_input: if line[0] != '#': # just return the first line that is not a comment line: return line finally: file_input.close() def main(): ''' Query main function The input file can be any tabular file, as long as it contains a column for the molecular mass. This column is then used to query against the chosen repository/service Database. ''' seconds_start = int(round(time.time())) input_file = sys.argv[1] molecular_mass_col = sys.argv[2] repository_file = sys.argv[3] error_margin = float(sys.argv[4]) margin_unit = sys.argv[5] output_result = sys.argv[6] # Parse repository_file to find the URL to the service: repository_dblink = _get_repository_URL(repository_file) # Parse tabular input file into dictionary/array: input_data = _process_file(input_file) # Query data against repository : enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit) headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ', 'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits'] #TODO - add min and max formula weigth columns _save_data(enriched_data, headers, output_result) seconds_end = int(round(time.time())) print "Took " + str(seconds_end - seconds_start) + " seconds" if __name__ == '__main__': main()