diff utils.py @ 0:cce6989ed423

new NIST wrapper demo tools
author pieter.lukasse@wur.nl
date Thu, 22 Jan 2015 16:14:57 +0100
parents
children 8c20185752da
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.py	Thu Jan 22 16:14:57 2015 +0100
@@ -0,0 +1,163 @@
+'''
+Created on 31 dec. 2014
+
+@author: lukas007
+'''
+import shutil
+import subprocess
+import csv
+from collections import OrderedDict
+    
+def copy_dir(src, dst):
+    '''
+    Copies a directory tree by delegating to shutil.copytree.
+    @param src: path of the directory to copy
+    @param dst: destination path handed to shutil.copytree
+    '''
+    shutil.copytree(src=src, dst=dst)
+
+    
+def copy_file(src, dst):
+    '''
+    Copies a single file by delegating to shutil.copy.
+    @param src: path of the file to copy
+    @param dst: destination path handed to shutil.copy
+    '''
+    shutil.copy(src=src, dst=dst)
+
+def get_process_list():
+    '''
+    Runs "ps -A" and returns what it printed on standard output.
+    @return: list with one entry per line of the "ps -A" output
+    '''
+    ps_process = subprocess.Popen(['ps', '-A'], stdout=subprocess.PIPE)
+    stdout_data = ps_process.communicate()[0]
+    return stdout_data.splitlines()
+
+def get_process_pid(process_name):
+    '''
+    Looks up the pid of a process in the lines returned by get_process_list().
+    A line matches when process_name occurs anywhere in it; if several lines
+    match, the pid found on the last matching line is returned.
+    @param process_name: (part of) the name of the process to look for
+    @return: pid as int, or -1 if no process line matches
+    '''
+    pid = -1
+    for line in get_process_list():
+        # subprocess output is bytes on Python 3; decode it so a text name can be matched
+        if isinstance(line, bytes) and not isinstance(process_name, bytes):
+            line = line.decode('utf-8', 'replace')
+        if process_name not in line:
+            continue
+        fields = line.split(None, 1)
+        # Skip the header line and empty lines: only a numeric first column is a pid
+        if fields and fields[0].isdigit():
+            pid = int(fields[0])
+    return pid
+
+
+def get_as_dict(in_tsv):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_tsv: input filename to be parsed
+    @return: ordered dictionary (columns in file order) mapping each header name to the
+             list of values found in that column; empty if the file contains no rows at all
+    '''
+    import sys
+
+    # 'rU' gives universal newlines on Python 2; Python 3 text mode does that by
+    # default and no longer accepts the 'U' flag (removed in 3.11)
+    mode = 'rU' if sys.version_info[0] < 3 else 'r'
+
+    # Read everything inside a with-block so the file handle is not leaked
+    with open(in_tsv, mode) as in_file:
+        data = list(csv.reader(in_file, delimiter='\t'))
+
+    output = OrderedDict()
+    if not data:
+        # Empty file: no header, hence no columns
+        return output
+
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    for index, column_name in enumerate(header):
+        output[column_name] = [row[index] for row in data]
+    return output
+
+def save_dict_as_tsv(dict, out_tsv):
+    '''
+    Writes tab-separated data to file: a header row with the dictionary keys,
+    followed by one row per record.
+    @param dict: dictionary mapping column name to the list of values of that column;
+                 the length of the first column determines how many rows are written
+    @param out_tsv: output tsv file
+    '''
+    import sys
+
+    # NOTE: the parameter name shadows the builtin; it is kept so keyword callers keep working
+    columns = list(dict.keys())
+
+    # The csv module wants a binary file on Python 2 and a text file without
+    # newline translation on Python 3
+    if sys.version_info[0] < 3:
+        out_file = open(out_tsv, 'wb')
+    else:
+        out_file = open(out_tsv, 'w', newline='')
+
+    # The with-block guarantees that the data is flushed and the handle closed
+    with out_file:
+        output_writer = csv.writer(out_file, delimiter="\t")
+
+        # Write headers
+        output_writer.writerow(columns)
+
+        # Write records; an empty dictionary yields only the (empty) header row
+        record_count = len(dict[columns[0]]) if columns else 0
+        for record_index in range(record_count):
+            output_writer.writerow([dict[k][record_index] for k in columns])
+            
+            
+            
+
+def get_nist_out_as_dict(nist_result_file):
+    '''
+    Method to parse NIST specific output into a dictionary.
+    Every line starting with "Hit" adds one entry to each returned column; the 'id'
+    column holds the identifier taken from the most recent line starting with "Unknown".
+    @param nist_result_file: result file as produced by NIST nistms$.exe
+    @return: ordered dictionary mapping column name to a list of values (one per hit)
+    @raise Exception: if a hit line contains an unexpected field, lacks one of the
+                      expected fields, or is not preceded by an "Unknown" line
+    '''
+    import sys
+
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for column in ('id', 'compound_name', 'formula', 'lib_name', 'id_in_lib',
+                   'mf', 'rmf', 'prob', 'cas', 'mw'):
+        output[column] = []
+
+    # (prefix identifying the field, marker preceding its value, output column)
+    field_specs = (
+        (' MF: ', 'MF: ', 'mf'),
+        (' RMF: ', 'RMF: ', 'rmf'),
+        (' Prob: ', 'Prob: ', 'prob'),
+        (' CAS:', 'CAS:', 'cas'),
+        (' Mw: ', 'Mw: ', 'mw'),
+        (' Id: ', 'Id: ', 'id_in_lib'),
+    )
+
+    # Python 2 reads byte strings (the compound name is decoded below); Python 3
+    # decodes while reading, replacing invalid bytes just like the explicit decode
+    if sys.version_info[0] < 3:
+        nist_file = open(nist_result_file)
+    else:
+        nist_file = open(nist_result_file, encoding='utf-8', errors='replace')
+
+    title_row = None
+    with nist_file:
+        for line in nist_file:
+            row = line.split('<<')
+            if row[0].startswith('Unknown'):
+                title_row = row[0]
+                continue
+            if not row[0].startswith('Hit'):
+                continue
+            if title_row is None:
+                raise Exception('Error: hit found before any "Unknown" line in NIST output')
+
+            hit = row
+            compound_name = hit[1].split('>>')[0]
+            if isinstance(compound_name, bytes):
+                # see http://blog.webforefront.com/archives/2011/02/python_ascii_co.html
+                compound_name = compound_name.decode('utf-8', 'replace')
+
+            output['id'].append(title_row.split(': ')[1].split('  ')[0])
+            output['compound_name'].append(compound_name)
+            output['formula'].append(hit[2].split('>>')[0])
+            output['lib_name'].append(hit[3].split('>>')[0])
+
+            # The remaining fields are ';'-separated and follow the formula and the library name
+            other_fields_list = (hit[2].split('>>')[1] + hit[3].split('>>')[1]).split(';')
+            count = 0
+            for field in other_fields_list:
+                for prefix, marker, column in field_specs:
+                    if field.startswith(prefix):
+                        value = field.split(marker)[1]
+                        if column == 'id_in_lib':
+                            # drop the line ending and the closing '.', whatever the
+                            # newline style (also when the final newline is missing)
+                            value = value.rstrip().rstrip('.')
+                        output[column].append(value)
+                        count += 1
+                        break
+                else:
+                    if field != '' and field != ' Lib: ':
+                        raise Exception('Error: unexpected field in NIST output: ' + field)
+
+            if count != 6:
+                raise Exception('Error: did not find all expected fields in NIST output')
+
+    return output
+
+def get_spectra_file_as_dict(spectrum_file):
+    '''
+    Method to parse spectra file in NIST MSP input format into a dictionary.
+    The idea is to parse the following :
+
+        Name: spectrum1
+        DB#: 1
+        Num Peaks: 87
+        14 8; 15 15; 27 18; 28 15; 29 15; 
+        30 11; 32 19; 39 32; 40 12; 41 68;
+
+    into:
+
+        dict['spectrum1'] = "14 8; 15 15; 27 18; 28 15; 29 15; 30 11; 32 19; 39 32; 40 12; 41 68;"
+
+    Lines starting with a digit are treated as peak lines and are concatenated as-is
+    (minus the line ending); peak lines that do not belong to a named spectrum are ignored.
+
+    @param spectrum_file: spectra file in MSP format (e.g. also the format returned by MsClust)
+    @return: ordered dictionary mapping spectrum name to its concatenated peak lines;
+             empty if the file contains no named spectrum
+    '''
+    name_prefix = 'Name: '
+    output = OrderedDict()
+    name = ''
+    peak_lines = []
+
+    # The with-block makes sure the file handle is closed again
+    with open(spectrum_file) as spectra:
+        for line in spectra:
+            if line.startswith(name_prefix):
+                if name != '':
+                    # store spectrum:
+                    output[name] = ''.join(peak_lines)
+                # take everything after the prefix, so names containing 'Name: ' stay intact
+                name = line[len(name_prefix):].rstrip('\r\n')
+                peak_lines = []
+            elif line[0].isdigit():
+                # parse spectra:
+                peak_lines.append(line.rstrip('\r\n'))
+
+    # store also last spectrum (but no nameless entry when nothing was found):
+    if name != '':
+        output[name] = ''.join(peak_lines)
+
+    return output
+    
\ No newline at end of file