Mercurial > repos > pieterlukasse > prims_metabolomics
comparison rankfilter_GCMS/pdfread.py @ 0:9d5f4f5f764b
Initial commit to toolshed
author | pieter.lukasse@wur.nl |
---|---|
date | Thu, 16 Jan 2014 13:10:00 +0100 |
parents | |
children | 637830ac8bcd |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9d5f4f5f764b |
---|---|
1 """ | |
2 Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University | |
3 | |
4 Permission is hereby granted, free of charge, to any person obtaining a copy | |
5 of this software and associated documentation files (the "Software"), to deal | |
6 in the Software without restriction, including without limitation the rights | |
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
8 copies of the Software, and to permit persons to whom the Software is | |
9 furnished to do so, subject to the following conditions: | |
10 | |
11 The above copyright notice and this permission notice shall be included in | |
12 all copies or substantial portions of the Software. | |
13 | |
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
20 THE SOFTWARE. | |
21 """ | |
22 | |
23 import sys | |
24 import csv | |
25 | |
def _split_name_and_formula(name_cell):
    '''
    Splits the first cell of a hit into the compound name and its formula
    @param name_cell: first ';'-separated cell of a hit ("<hit number>:<name>")
    @return: tuple (name, formula); formula is 'not_def' when the last word of
             the name does not start with 'C', 'F' or 'H'
    @raise IndexError: when name_cell contains no ':'
    '''
    # Everything after the first ':' belongs to the name, which may itself
    # contain further ':' characters.
    name_tmp = name_cell.split(':', 1)[1]
    # Insert a space in front of a 'C' that directly follows one of these
    # characters (presumably a formula glued to the name - confirm).
    for glued in ('lC', ']C', 'sC', '9C', '.C',
                  ')C', 'eC', 'yC', 'oC', '-C', 'dC', 'rC'):
        name_tmp = name_tmp.replace(glued, glued[0] + ' C')
    # NOTE(review): the slice bound is the character count of name_tmp, not its
    # word count, so for any name with two or more non-space characters every
    # word (including the trailing formula) is kept. Presumably the formula
    # was meant to be dropped; left unchanged because consumers of the 'Name'
    # column may rely on the current value - confirm.
    # NOTE(review): replace(" ", " ") is a no-op as written; it may have been
    # meant to collapse double spaces - confirm against the original file.
    name = ' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1]).replace(" ", " ")
    # Strip before taking the last word so that a name ending in whitespace
    # (or an empty name) yields 'not_def'/the real last word instead of an
    # IndexError on an empty token.
    last_word = name_tmp.strip().split(' ')[-1]
    if last_word.startswith(('C', 'F', 'H')):
        formule = last_word
    else:
        formule = 'not_def'
    return name, formule


def _parse_hit(cell, has_cas):
    '''
    Extracts the output fields of one hit from its ';'-separated cells
    @param cell: list of cells of the hit (at least 6, or 7 when has_cas)
    @param has_cas: True when cell[4] holds the CAS number, so that library
                    and library ID are found in cell[5] and cell[6]
    @return: dict mapping output column name to the value of this hit
    @raise IndexError: when a cell lacks the expected ':' or ' ' separators
    '''
    name, formule = _split_name_and_formula(cell[0])
    if has_cas:
        cas_number = cell[4].split(':')[1]
        lib_cell, id_cell = cell[5], cell[6]
    else:
        cas_number = 'undef'
        lib_cell, id_cell = cell[4], cell[5]
    return {
        'Name': name,
        'CAS': cas_number,
        'Formula': formule,
        'Forward': cell[1].split(':')[1],
        'Reverse': cell[2].split(':')[1],
        # third space-separated token of the cell, without the percent sign
        'Probability': cell[3].split(' ')[2].replace('%', ''),
        'Library': lib_cell.split(':')[1],
        'Library ID': id_cell.split(':')[1].replace('.', '').strip(),
    }


def getPDF(filename, print_progress):
    '''
    Parses the text of a NIST search report (presumably the pdftotext output
    of the NIST PDF file - confirm)
    @param filename: file to parse
    @param print_progress: when true, every hit is printed while it is parsed
    @return: tuple (NistInput, NistInput_missed). NistInput maps the column
             names 'ID', 'R.T.', 'Name', 'CAS', 'Formula', 'Forward',
             'Reverse', 'Probability', 'Library' and 'Library ID' to lists
             with one string per parsed hit. NistInput_missed maps
             'Missed Compounds' and 'RT missed Compounds' to the raw text and
             the spectrum id of every hit whose second cell is not of the
             form "<label>:<value>". None is returned as soon as a hit with
             fewer than 6 cells is found.
    '''
    hit_fields = ('Name', 'CAS', 'Formula', 'Forward', 'Reverse',
                  'Probability', 'Library', 'Library ID')
    # 'with' guarantees that the handle is closed, also when reading fails
    with open(filename, 'r') as nist_fh:
        nist_input = nist_fh.read()

    hitid = []
    rt = []
    columns = dict((field, []) for field in hit_fields)
    missed_compounds = []
    rt_missed_compounds = []

    hit_list = nist_input.split('** Search Report Page 1 of 1 **')
    # text in front of the first page marker is not part of any report page
    hit_list.pop(0)
    line_id = 0
    for line in hit_list:
        line = line.strip().replace('\r', '')
        if line == '':
            continue
        hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')
        # the text before the first 'Hit' holds the spectrum id as second word
        spec_id = hits.pop(0).split(' ')[1]
        parsed_hits = 0
        for hh in hits:
            cell = hh.split(';')
            if print_progress:
                print('Processing line:  %s  with length:  %s :\n\t%s'
                      % (line_id, len(cell), cell))
            line_id += 1
            if len(cell) < 6:
                # Missing columns: parsing is abandoned and None is returned.
                # NOTE(review): nothing is reported here and callers that
                # unpack the result fail with a TypeError - consider raising.
                return None
            if len(cell[1].split(':')) != 2:
                missed_compounds.append(hh)
                rt_missed_compounds.append(spec_id)
                continue
            # exactly 7 cells: the compound has a CAS number; any other count
            # of 6 or more is read with the layout without CAS number
            fields = _parse_hit(cell, len(cell) == 7)
            for field in hit_fields:
                columns[field].append(fields[field])
            parsed_hits += 1

        if parsed_hits:
            # every parsed hit of this spectrum shares its id and the value
            # derived from the fourth '-'-separated part of that id
            rt_value = str(float(spec_id.split('-')[3]) / 1e+06)
            hitid.extend([str(spec_id)] * parsed_hits)
            rt.extend([rt_value] * parsed_hits)

    NistInput = {}
    NistInput['ID'] = hitid
    NistInput['R.T.'] = rt
    for field in hit_fields:
        NistInput[field] = columns[field]
    NistInput_missed = {}
    NistInput_missed['Missed Compounds'] = missed_compounds
    NistInput_missed['RT missed Compounds'] = rt_missed_compounds

    return NistInput, NistInput_missed
147 | |
148 | |
def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
    '''
    Converts NIST PDF file to tabular format
    @param filename: PDF file to parse
    @param output_file: output file for the hits
    @param error_file: output file for failed hits
    @param print_progress: passed on to getPDF
    @raise TypeError: when getPDF returns None instead of its two dictionaries
    '''
    [HitList, HitList_missed] = getPDF(filename, print_progress)
    # Take the column names once so that the header and the data columns are
    # guaranteed to be written in the same order.
    column_names = list(HitList.keys())
    # zip(*columns) turns the per-column lists into one tuple per hit
    rows = ["\t".join(e) for e in zip(*[HitList[name] for name in column_names])]
    # save Hitlist as tab separated file: a header line followed by one line
    # per hit; no newline is written after the last line.
    # NOTE(review): mode 'wb' accepts these str values on Python 2 only.
    with open(output_file, 'wb') as output_fh:
        output_fh.write("\t".join(column_names) + "\n" + "\n".join(rows))

    # one line per failed hit: spectrum id, tab, raw text of the hit
    with open(error_file, 'wb') as out_missed_pdf:
        for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
            out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
169 | |
170 | |
def read_tabular(in_csv):
    '''
    Parses a tab-separated file returning a dictionary with named columns
    @param in_csv: input filename to be parsed
    @return: dict mapping every name of the header line to the list of the
             values found in that column, one per data row
    @raise IndexError: when the file is empty or a data row has fewer cells
                       than the header
    '''
    # Python 2 needs 'rU' for universal newlines; Python 3 translates newlines
    # in plain 'r' mode and no longer accepts the 'U' flag since 3.11.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    # 'with' closes the handle as soon as all rows have been read
    with open(in_csv, mode) as in_fh:
        data = list(csv.reader(in_fh, delimiter='\t'))
    header = data.pop(0)
    # Create dictionary with column name as key
    output = {}
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in data]
    return output
183 | |
184 | |
def read_tabular_old(filename):
    '''
    Function to read tabular format (created by convert_pdftotext2tabular)
    and output a dict with header of columns as key and value is columns of tabular as list
    @param filename: tabular file to read
    @return: dict mapping every column name of the first line to a tuple with
             the values of that column (empty tuples when there are no data
             lines)
    @raise IOError: when the file cannot be opened
    @raise IndexError: when a data line has fewer cells than the header
    '''
    # An IOError from open() simply propagates; 'with' closes the handle.
    with open(filename, 'r') as input_fh:
        colnames = input_fh.readline().strip().split('\t')
        # strip() also removes leading/trailing tabs, so empty cells at the
        # start or end of a line are dropped before splitting
        cells = [line.strip().split('\t') for line in input_fh.readlines()]
    if cells:
        #transform from row oriented structure to column oriented structure
        cells = list(zip(*cells))
    else:
        # header without data lines: every column is empty
        cells = [()] * len(colnames)
    #store the list of list in form of final output
    RankFilterGC_format = {}
    for colnumber, colname in enumerate(colnames):
        RankFilterGC_format[colname] = cells[colnumber]
    return RankFilterGC_format
207 | |
208 | |
if __name__ == '__main__':
    # Command line use: <file to parse> <hits output file> <failed hits file>;
    # progress printing is always switched on.
    nist_report = sys.argv[1]
    hits_table = sys.argv[2]
    missed_table = sys.argv[3]
    convert_pdftotext2tabular(nist_report, hits_table, missed_table, True)