comparison rankfilter_GCMS/pdfread.py @ 0:9d5f4f5f764b

Initial commit to toolshed
author pieter.lukasse@wur.nl
date Thu, 16 Jan 2014 13:10:00 +0100
parents
children 637830ac8bcd
comparison
equal deleted inserted replaced
-1:000000000000 0:9d5f4f5f764b
1 """
2 Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 """
22
23 import sys
24 import csv
25
# Characters that get a space inserted between them and a directly following
# 'C' (presumably to separate a molecular formula that the PDF-to-text
# conversion glued onto the compound name -- TODO confirm).
_FORMULA_GLUE_CHARS = ('l', ']', 's', '9', '.', ')', 'e', 'y', 'o', '-', 'd', 'r')


def _split_name_and_formula(first_cell):
    '''
    Splits the first cell of a hit ("<hit number> : <name> <formula>") into
    the compound name and its molecular formula
    @param first_cell: text of the hit up to the first ";"
    @return: tuple (name, formula); formula is 'not_def' when the last word
             of the name does not start with 'C', 'F' or 'H'
    '''
    fields = first_cell.split(':')
    # The name itself may contain ":"; keep everything after the hit number
    if len(fields) > 2:
        name_tmp = ':'.join(fields[1:])
    else:
        name_tmp = fields[1]
    for glue in _FORMULA_GLUE_CHARS:
        name_tmp = name_tmp.replace(glue + 'C', glue + ' C')
    # NOTE(review): the slice end is the length of the string (not the number
    # of words), so in practice every word - including the formula - is kept,
    # and the replace() is a no-op as written. Both are kept unchanged because
    # downstream consumers may rely on the current 'Name' values - confirm.
    name = (' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")
    # Strip first, so a trailing blank does not leave an empty last word
    # (indexing its first character used to raise IndexError)
    last_word = name_tmp.strip().split(' ')[-1]
    if last_word.startswith(('C', 'F', 'H')):
        formula = last_word
    else:
        formula = 'not_def'
    return name, formula


def _parse_hit(cell):
    '''
    Extracts the fields of a single hit
    @param cell: the hit text split on ";" (at least 6 cells)
    @return: dictionary column name -> value, or None when the second cell
             does not have the form "<label>:<value>"
    '''
    forward_fields = cell[1].split(':')
    if len(forward_fields) != 2:
        return None
    # Exactly 7 cells: the hit has a CAS number, which shifts the library cells
    has_cas = len(cell) == 7
    shift = 1 if has_cas else 0
    name, formula = _split_name_and_formula(cell[0])
    return {
        'Name': name,
        'Formula': formula,
        'Forward': forward_fields[1],
        'Reverse': cell[2].split(':')[1],
        'Probability': cell[3].split(' ')[2].replace('%', ''),
        'CAS': cell[4].split(':')[1] if has_cas else 'undef',
        'Library': cell[4 + shift].split(':')[1],
        'Library ID': cell[5 + shift].split(':')[1].replace('.', '').strip(),
    }


def getPDF(filename, print_progress):
    '''
    Parses NIST PDF file (text version)
    @param filename: PDF file to parse
    @param print_progress: when true, every processed hit is printed
    @return: tuple (NistInput, NistInput_missed). NistInput maps each column
             name to a list with one string per parsed hit; NistInput_missed
             holds the raw text of the hits that could not be parsed and the
             ID of the spectrum they belong to. None is returned as soon as a
             hit with fewer than 6 ";"-separated cells is found.
    '''
    columns = ('ID', 'R.T.', 'Name', 'CAS', 'Formula', 'Forward', 'Reverse',
               'Probability', 'Library', 'Library ID')
    NistInput = {}
    for column in columns:
        NistInput[column] = []
    missed_compounds = []
    rt_missed_compounds = []

    with open(filename, 'r') as input_fh:
        nist_input = input_fh.read()

    # Everything before the first report marker is not part of any report
    reports = nist_input.split('** Search Report Page 1 of 1 **')[1:]
    line_id = 0
    for report in reports:
        report = report.strip().replace('\r', '')
        if report == '':
            continue
        parts = report.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')
        # The text before the first hit carries the spectrum ID as second word
        spec_id = parts[0].split(' ')[1]
        parsed = 0
        for hh in parts[1:]:
            cell = hh.split(';')
            if print_progress:
                print('Processing line:  %s  with length:  %s :\n\t%s' % (line_id, len(cell), cell))
            line_id += 1
            if len(cell) < 6:  # Missing columns, report and quit
                sys.stderr.write('Hit of spectrum %s has only %d columns, stopping: %s\n'
                                 % (spec_id, len(cell), hh))
                return None
            record = _parse_hit(cell)
            if record is None:
                missed_compounds.append(hh)
                rt_missed_compounds.append(spec_id)
                continue
            for column, value in record.items():
                NistInput[column].append(value)
            parsed += 1

        if parsed:
            # The retention time is the fourth "-"-separated field of the ID,
            # scaled down by 1e6; only evaluated when a hit was parsed
            retention_time = str(float(spec_id.split('-')[3]) / 1e+06)
            NistInput['ID'].extend([str(spec_id)] * parsed)
            NistInput['R.T.'].extend([retention_time] * parsed)

    NistInput_missed = {}
    NistInput_missed['Missed Compounds'] = missed_compounds
    NistInput_missed['RT missed Compounds'] = rt_missed_compounds

    return NistInput, NistInput_missed
147
148
def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
    '''
    Converts NIST PDF file to tabular format
    @param filename: PDF file to parse
    @param output_file: output file for the hits
    @param error_file: output file for failed hits
    @param print_progress: passed on to getPDF
    '''
    # NOTE: getPDF returns None when it meets a hit with missing columns; the
    # unpacking below then raises TypeError before any file is created.
    HitList, HitList_missed = getPDF(filename, print_progress)

    # save Hitlist as tab separated file: one header line with the column
    # names, then one line per hit (no newline after the last hit)
    column_names = list(HitList.keys())
    hit_rows = zip(*[HitList[column] for column in column_names])
    Hitlist_as_text = "\t".join(column_names) + "\n"
    Hitlist_as_text += "\n".join(["\t".join(row) for row in hit_rows])
    # Text mode: writing str to a file opened with 'wb' fails on Python 3.
    # The with-statement closes the file even when the write raises.
    with open(output_file, 'w') as output_fh:
        output_fh.write(Hitlist_as_text)

    # One line per hit that could not be parsed: spectrum ID, tab, raw hit text
    with open(error_file, 'w') as out_missed_pdf:
        for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
            out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
169
170
def read_tabular(in_csv):
    '''
    Parses a tab-separated file returning a dictionary with named columns
    @param in_csv: input filename to be parsed
    @return: dictionary with the column names of the first line as keys and
             the list of values found in that column as value
    '''
    # Python 2 needs 'rU' for universal newlines; Python 3 translates newlines
    # by default and rejects the 'U' flag from version 3.11 on.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(in_csv, mode) as input_fh:
        data = list(csv.reader(input_fh, delimiter='\t'))
    header = data.pop(0)
    # Create dictionary with column name as key
    output = {}
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in data]
    return output
183
184
def read_tabular_old(filename):
    '''
    Function to read tabular format (created by convert_pdftotext2tabular)
    and output a dict with header of columns as key and value is columns of tabular as list
    @param filename: tabular file to read
    @return: dictionary with the column names as keys and a tuple with the
             values of that column as value
    '''
    # An IOError raised by open() simply propagates to the caller
    with open(filename, 'r') as input_fh:
        colnames = input_fh.readline().strip().split('\t')
        # NOTE: strip() also removes leading/trailing tabs, so empty cells at
        # the start or end of a line are dropped (kept as in the original)
        cells = [line.strip().split('\t') for line in input_fh.readlines()]
    #transform from row oriented structure to column oriented structure
    #(list() because zip returns an iterator on Python 3)
    columns = list(zip(*cells))
    #store the list of list in form of final output
    RankFilterGC_format = {}
    for colnumber, colname in enumerate(colnames):
        # A file with only a header line yields empty columns
        RankFilterGC_format[colname] = columns[colnumber] if cells else ()
    return RankFilterGC_format
207
208
if __name__ == '__main__':
    # Command line: <text version of NIST PDF> <hits output> <failed hits output>
    # Progress printing is always switched on when run as a script.
    pdf_text_path = sys.argv[1]
    hits_path = sys.argv[2]
    missed_hits_path = sys.argv[3]
    convert_pdftotext2tabular(pdf_text_path, hits_path, missed_hits_path, True)