0
|
1 """
|
|
2 Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
|
|
3
|
|
4 Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
5 of this software and associated documentation files (the "Software"), to deal
|
|
6 in the Software without restriction, including without limitation the rights
|
|
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
8 copies of the Software, and to permit persons to whom the Software is
|
|
9 furnished to do so, subject to the following conditions:
|
|
10
|
|
11 The above copyright notice and this permission notice shall be included in
|
|
12 all copies or substantial portions of the Software.
|
|
13
|
|
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
20 THE SOFTWARE.
|
|
21 """
|
|
22
|
|
23 import sys
|
|
24 import csv
|
|
25
|
|
# Characters after which a formula starting with 'C' is separated from the
# compound name by inserting a space (e.g. 'HexanalC6H12O' -> 'Hexanal C6H12O').
_GLUED_FORMULA_PREFIXES = ('l', ']', 's', '9', '.', ')', 'e', 'y', 'o', '-', 'd', 'r')


def _parse_hit(cell):
    '''
    Extracts the per-hit output columns from one hit of a search report
    @param cell: list with the ';'-separated fields of the hit
    @return: dict mapping column name to value, or None when the hit has
             fewer than 6 fields or its second field does not contain
             exactly one ':'
    @raise IndexError: when a field lacks the ':' or ' ' separator it is split on
    '''
    if len(cell) < 6 or len(cell[1].split(':')) != 2:
        return None

    # Exactly seven fields: the fifth one is the CAS number. Any other count
    # of six or more is handled as a hit without CAS number.
    has_cas = len(cell) == 7
    lib_index = 5 if has_cas else 4

    # Everything after the first ':' is the name; the name itself may contain ':'.
    name_tmp = cell[0].split(':', 1)[1]
    for prefix in _GLUED_FORMULA_PREFIXES:
        name_tmp = name_tmp.replace(prefix + 'C', prefix + ' C')

    # The last word is taken as the formula when it starts with C, F or H.
    # startswith() also copes with an empty word (blank name).
    last_word = name_tmp.strip().split(' ')[-1]
    if last_word.startswith(('C', 'F', 'H')):
        formule = last_word
    else:
        formule = 'not_def'

    # NOTE(review): the slice bound below is the length of the string, not of
    # the word list, so in practice no word is dropped and the formula stays in
    # the name; likewise the replace() maps a space onto a space. Both are kept
    # as they were so the Name column does not change - confirm the intent.
    compound_name = (' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")

    return {
        'Name': compound_name,
        'CAS': cell[4].split(':')[1] if has_cas else 'undef',
        'Formula': formule,
        'Forward': cell[1].split(':')[1],
        'Reverse': cell[2].split(':')[1],
        'Probability': cell[3].split(' ')[2].replace('%', ''),
        'Library': cell[lib_index].split(':')[1],
        'Library ID': cell[lib_index + 1].split(':')[1].replace('.', '').strip(),
    }


def getPDF(filename, print_progress):
    '''
    Parses the text of a NIST search report.
    The text is split into pages on '** Search Report Page 1 of 1 **' (the
    text before the first marker is ignored) and every page is split into
    hits on 'Hit'. Hits that cannot be parsed are collected separately
    instead of aborting the whole parse.
    @param filename: report file to parse
    @param print_progress: when true, every hit is echoed to stdout
    @return: tuple (hits, missed). hits maps the column names 'ID', 'R.T.',
             'Name', 'CAS', 'Formula', 'Forward', 'Reverse', 'Probability',
             'Library' and 'Library ID' to equally long lists of strings.
             missed maps 'Missed Compounds' and 'RT missed Compounds' to the
             raw text of each unparsable hit and the ID of its page.
    '''
    NistInput = {}
    for column in ('ID', 'R.T.', 'Name', 'CAS', 'Formula', 'Forward',
                   'Reverse', 'Probability', 'Library', 'Library ID'):
        NistInput[column] = []
    missed_compounds = []
    rt_missed_compounds = []

    with open(filename, 'r') as report:
        nist_input = report.read()

    line_id = 0
    for page in nist_input.split('** Search Report Page 1 of 1 **')[1:]:
        page = page.strip().replace('\r', '')
        if not page:
            continue
        hits = page.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')

        # The text before the first hit holds the spectrum ID as its second word.
        spec_id = hits.pop(0).split(' ')[1]
        parsed = 0
        for hh in hits:
            cell = hh.split(';')
            if print_progress:
                # Same text the former print statement produced, written via
                # sys.stdout so it does not depend on the print syntax.
                sys.stdout.write('Processing line:  %s  with length:  %s :\n\t%s\n'
                                 % (line_id, len(cell), cell))
                line_id += 1
            try:
                record = _parse_hit(cell)
            except IndexError:
                # A field without the expected separator: report it as missed.
                record = None
            if record is None:
                missed_compounds.append(hh)
                rt_missed_compounds.append(spec_id)
                continue
            for column, value in record.items():
                NistInput[column].append(value)
            parsed += 1

        if parsed:
            # The fourth '-'-separated part of the ID, divided by 1e6, is the R.T.
            retention_time = str(float(spec_id.split('-')[3]) / 1e+06)
            NistInput['ID'].extend([spec_id] * parsed)
            NistInput['R.T.'].extend([retention_time] * parsed)

    NistInput_missed = {}
    NistInput_missed['Missed Compounds'] = missed_compounds
    NistInput_missed['RT missed Compounds'] = rt_missed_compounds

    return NistInput, NistInput_missed
|
|
147
|
|
148
|
|
def _as_bytes(text):
    '''
    Returns text unchanged when it already is a byte string, otherwise its
    UTF-8 encoding, so it can be written to a file opened in binary mode
    '''
    if isinstance(text, bytes):
        return text
    return text.encode('utf-8')


def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
    '''
    Converts a NIST search report to tabular format
    @param filename: report file to parse (handed to getPDF)
    @param output_file: output file for the hits: a tab-separated header line
                        followed by one line per hit, no trailing newline
    @param error_file: output file for failed hits: one line per hit holding
                       the page ID and the raw hit text, tab-separated
    @param print_progress: handed to getPDF unchanged
    @raise ValueError: when getPDF returns no result
    '''
    parsed = getPDF(filename, print_progress)
    if parsed is None:
        # Fail with a clear message instead of an unpacking TypeError.
        raise ValueError("could not parse '%s': a hit with missing columns was found" % filename)
    HitList, HitList_missed = parsed

    # save Hitlist as tab separated file; the key order is fixed once so the
    # header and the columns are guaranteed to line up
    column_names = list(HitList.keys())
    rows = zip(*[HitList[column] for column in column_names])
    Hitlist_as_text = "\t".join(column_names) + "\n"
    Hitlist_as_text += "\n".join(["\t".join(row) for row in rows])

    # Binary mode keeps the '\n' line ends untouched on every platform.
    with open(output_file, 'wb') as output_fh:
        output_fh.write(_as_bytes(Hitlist_as_text))

    with open(error_file, 'wb') as out_missed_pdf:
        for compound, spec_id in zip(HitList_missed['Missed Compounds'],
                                     HitList_missed['RT missed Compounds']):
            out_missed_pdf.write(_as_bytes('%s\n' % '\t'.join([spec_id, compound])))
|
|
169
|
|
170
|
|
def read_tabular(in_csv):
    '''
    Parses a tab-separated file returning a dictionary with named columns
    @param in_csv: input filename to be parsed
    @return: dict mapping every name of the first line to the list of values
             found in that column on the remaining lines; an empty dict when
             the file is empty
    @raise IndexError: when a data line has fewer fields than the header
    '''
    # 'rU' asks Python 2 for universal newlines; Python 3 does that by
    # default in text mode and no longer accepts the 'U' flag.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(in_csv, mode) as handle:
        data = list(csv.reader(handle, delimiter='\t'))
    if not data:
        return {}
    header = data.pop(0)
    # Create dictionary with column name as key
    output = {}
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in data]
    return output
|
|
183
|
|
184
|
|
def read_tabular_old(filename):
    '''
    Function to read tabular format (created by convert_pdftotext2tabular)
    and output a dict with header of columns as key and value is columns of tabular as tuple
    @param filename: tabular file to read
    @return: dict mapping every column name of the first line to a tuple with
             the values of that column; empty tuples when the file has no data
             lines, an empty dict when the file is empty
    @raise IOError: when the file cannot be opened
    '''
    with open(filename, 'r') as input_fh:
        header_line = input_fh.readline()
        # Every line is stripped of surrounding whitespace before it is split.
        cells = [line.strip().split('\t') for line in input_fh]

    if not header_line:
        return {}
    colnames = header_line.strip().split('\t')
    if not cells:
        return {colname: () for colname in colnames}

    # transform from row oriented structure to column oriented structure;
    # list() because zip() is not indexable on Python 3
    columns = list(zip(*cells))
    # store the columns in form of final output
    RankFilterGC_format = {}
    for colnumber, colname in enumerate(colnames):
        RankFilterGC_format[colname] = columns[colnumber]
    return RankFilterGC_format
|
|
207
|
|
208
|
|
if __name__ == '__main__':
    # Command line use: <report file> <output file> <error file>.
    # Progress printing is always switched on; extra arguments are ignored.
    if len(sys.argv) < 4:
        sys.exit('Usage: %s <report_file> <output_file> <error_file>' % sys.argv[0])
    convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)
|