comparison utils.py @ 0:cce6989ed423

new NIST wrapper demo tools
author pieter.lukasse@wur.nl
date Thu, 22 Jan 2015 16:14:57 +0100
parents
children 8c20185752da
comparison
equal deleted inserted replaced
-1:000000000000 0:cce6989ed423
1 '''
2 Created on 31 dec. 2014
3
4 @author: lukas007
5 '''
6 import shutil
7 import subprocess
8 import csv
9 from collections import OrderedDict
10
def copy_dir(src, dst):
    '''
    Copies the directory tree found at src to dst (delegates to shutil.copytree).
    @param src: path of the directory to copy
    @param dst: path of the copy to create
    '''
    shutil.copytree(src, dst)
13
14
def copy_file(src, dst):
    '''
    Copies the file src to dst (delegates to shutil.copy).
    @param src: path of the file to copy
    @param dst: destination file name or directory
    '''
    shutil.copy(src, dst)
17
def get_process_list():
    '''
    Runs "ps -A" and returns its standard output as a list of text lines.
    '''
    # universal_newlines=True makes communicate() hand back text instead of
    # bytes on Python 3, so callers can do plain substring checks on each
    # line; on Python 2 the result is a str exactly as before.
    process = subprocess.Popen(['ps', '-A'], stdout=subprocess.PIPE,
                               universal_newlines=True)
    out, _ = process.communicate()
    return out.splitlines()
22
def get_process_pid(process_name):
    '''
    Looks up a process id in the list returned by get_process_list().
    The first whitespace-separated token of a matching line is taken as the pid.
    @param process_name: text to search for in each process list line
    @return: pid of the last line containing process_name, or -1 when no line matches
    '''
    matching_pids = [int(entry.split(None, 1)[0])
                     for entry in get_process_list()
                     if process_name in entry]
    if not matching_pids:
        return -1
    # When several lines match, the last one wins.
    return matching_pids[-1]
29
30
def get_as_dict(in_tsv):
    '''
    Generic method to parse a tab-separated file returning a dictionary with named columns.
    The first row is used as header; every header name becomes a key whose value is the
    list of that column's values (as strings) over all remaining rows.
    An empty file results in an empty dictionary.
    Raises IndexError when a data row has fewer fields than the header.
    @param in_tsv: input filename to be parsed
    '''
    import sys

    # The 'U' flag is needed for universal newlines on Python 2 only; on
    # Python 3 plain text mode already behaves that way and 'U' was removed
    # in Python 3.11.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(in_tsv, mode) as in_file:
        data = list(csv.reader(in_file, delimiter='\t'))

    if not data:
        return {}

    header = data.pop(0)
    # Create dictionary with column name as key
    output = {}
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in data]
    return output
43
def save_dict_as_tsv(dict, out_tsv):
    '''
    Writes tab-separated data to file: one header row holding the dictionary keys,
    followed by one row per record. The number of records is the length of the
    first column; an empty dictionary only produces an (empty) header row.
    Raises IndexError when another column is shorter than the first one.
    @param dict: dictionary mapping each column name to the list of its values
                 (the name shadows the builtin but is kept for backward compatibility)
    @param out_tsv: output tsv file
    '''
    import sys

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        out_file = open(out_tsv, 'wb')
    else:
        out_file = open(out_tsv, 'w', newline='')

    # The with-block guarantees the data is flushed and the handle closed.
    with out_file:
        output_writer = csv.writer(out_file, delimiter="\t")

        column_names = list(dict.keys())
        columns = [dict[name] for name in column_names]

        # Write headers
        output_writer.writerow(column_names)

        # Write records
        record_count = len(columns[0]) if columns else 0
        for record_index in range(record_count):
            output_writer.writerow([column[record_index] for column in columns])
62
63
64
65
def get_nist_out_as_dict(nist_result_file):
    '''
    Method to parse NIST specific output into a dictionary.
    Every line starting with 'Hit' adds one entry to each of the output columns
    (id, compound_name, formula, lib_name, id_in_lib, mf, rmf, prob, cas, mw);
    the 'id' is taken from the most recent line starting with 'Unknown'.
    Lines starting with neither 'Unknown' nor 'Hit' are ignored.
    Raises Exception when a hit appears before any 'Unknown' line, when a hit
    contains an unexpected field or when not all 6 expected fields are found.
    @param nist_result_file: result file as produced by NIST nistms$.exe
    @return: OrderedDict with the column name as key and the list of values as value
    '''
    import sys

    # Create dictionary with column name as key
    output = OrderedDict()
    for column in ('id', 'compound_name', 'formula', 'lib_name', 'id_in_lib',
                   'mf', 'rmf', 'prob', 'cas', 'mw'):
        output[column] = []

    # (prefix identifying the field, marker preceding its value, output column)
    known_fields = (
        (' MF: ', 'MF: ', 'mf'),
        (' RMF: ', 'RMF: ', 'rmf'),
        (' Prob: ', 'Prob: ', 'prob'),
        (' CAS:', 'CAS:', 'cas'),
        (' Mw: ', 'Mw: ', 'mw'),
        (' Id: ', 'Id: ', 'id_in_lib'),
    )

    # Compound names may contain bytes that are not valid UTF-8. On Python 3
    # the replacement of such bytes is done while reading; on Python 2 the
    # name is decoded explicitly further below.
    if sys.version_info[0] < 3:
        nist_file = open(nist_result_file)
    else:
        nist_file = open(nist_result_file, encoding='utf-8', errors='replace')

    title_row = None
    with nist_file:
        for line in nist_file:
            row = line.split('<<')
            if row[0].startswith('Unknown'):
                title_row = row[0]
                continue
            if not row[0].startswith('Hit'):
                continue
            if title_row is None:
                raise Exception('Error: found a hit before any "Unknown" line in NIST output')
            hit = row

            compound_name = hit[1].split('>>')[0]
            if isinstance(compound_name, bytes):
                # Python 2 only, see http://blog.webforefront.com/archives/2011/02/python_ascii_co.html
                compound_name = compound_name.decode('utf-8', errors='replace')

            output['id'].append(title_row.split(': ')[1].split(' ')[0])
            output['compound_name'].append(compound_name)
            output['formula'].append(hit[2].split('>>')[0])
            output['lib_name'].append(hit[3].split('>>')[0])

            other_fields_list = (hit[2].split('>>')[1] + hit[3].split('>>')[1]).split(';')
            count = 0
            for field in other_fields_list:
                for prefix, marker, column in known_fields:
                    if field.startswith(prefix):
                        value = field.split(marker)[1]
                        if column == 'id_in_lib':
                            # drop the line ending (if any) and the closing '.'
                            value = value.strip().rstrip('.')
                        output[column].append(value)
                        count += 1
                        break
                else:
                    # no known prefix matched this field
                    if field != '' and field != ' Lib: ':
                        raise Exception('Error: unexpected field in NIST output: ' + field)

            if count != 6:
                raise Exception('Error: did not find all expected fields in NIST output')

    return output
126
def get_spectra_file_as_dict(spectrum_file):
    '''
    Method to parse spectra file in NIST MSP input format into a dictionary.
    The idea is to parse the following :

    Name: spectrum1
    DB#: 1
    Num Peaks: 87
    14 8; 15 15; 27 18; 28 15; 29 15;
    30 11; 32 19; 39 32; 40 12; 41 68;

    into:

    dict['spectrum1'] = "14 8; 15 15; 27 18; 28 15; 29 15;30 11; 32 19; 39 32; 40 12; 41 68;"

    Only lines starting with 'Name: ' or with a digit are used. The peak lines of
    a spectrum are concatenated without their line endings and without any
    separator being added. Peak lines found before the first 'Name: ' line are
    dropped, and a file without any 'Name: ' line gives an empty dictionary.

    @param spectrum_file: spectra file in MSP format (e.g. also the format returned by MsClust)
    @return: OrderedDict with the spectrum name as key and its peaks string as value
    '''

    output = OrderedDict()
    name = ''
    peak_lines = []
    with open(spectrum_file) as spectra:
        for line in spectra:
            if line.startswith('Name: '):
                if name != '':
                    # store previous spectrum:
                    output[name] = ''.join(peak_lines)
                # rstrip also removes the '\r' of Windows line endings
                name = line[len('Name: '):].rstrip('\r\n')
                peak_lines = []
            elif line[0].isdigit():
                # parse spectra:
                peak_lines.append(line.rstrip('\r\n'))

    # store also last spectrum (if there was one at all):
    if name != '':
        output[name] = ''.join(peak_lines)

    return output
163