Mercurial > repos > pieterlukasse > prims_metabolomics
annotate rankfilter_GCMS/pdfread.py @ 62:9bd2597c8851 default tip
r
author | pieter.lukasse@wur.nl |
---|---|
date | Fri, 06 Feb 2015 15:49:26 +0100 |
parents | d685210eef3e |
children |
rev | line source |
---|---|
0 | 1 """ |
2 Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University | |
3 | |
4 Permission is hereby granted, free of charge, to any person obtaining a copy | |
5 of this software and associated documentation files (the "Software"), to deal | |
6 in the Software without restriction, including without limitation the rights | |
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
8 copies of the Software, and to permit persons to whom the Software is | |
9 furnished to do so, subject to the following conditions: | |
10 | |
11 The above copyright notice and this permission notice shall be included in | |
12 all copies or substantial portions of the Software. | |
13 | |
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
20 THE SOFTWARE. | |
21 """ | |
22 | |
23 import sys | |
24 import csv | |
25 | |
def _formula_from_name(name_text):
    '''
    Guesses the molecular formula from the last word of a compound name
    @param name_text: compound name as found in the hit (may be empty)
    @return: the last space-separated word when it starts with C, F or H,
             otherwise 'not_def'
    '''
    last_word = name_text.strip().split(' ')[-1]
    # split() always yields at least one (possibly empty) word, so test with
    # startswith() instead of indexing [0], which fails on an empty name.
    if last_word.startswith(('C', 'F', 'H')):
        return last_word
    return 'not_def'


def _parse_hit(cell):
    '''
    Extracts the fields of a single hit
    @param cell: the hit text split on ';' -- 7 fields when the compound has a
                 CAS number, 6 fields when it has none
    @return: dict keyed by output column name, or None when the hit does not
             have the expected layout
    '''
    if len(cell) not in (6, 7) or len(cell[1].split(':')) != 2:
        return None
    has_cas = len(cell) == 7
    try:
        # The name itself may contain ':', so only split off the first part.
        name_text = cell[0].split(':', 1)[1]
        return {
            # NOTE(review): as written this replace() maps a single space onto
            # a single space, i.e. it is a no-op; it presumably was meant to
            # collapse double spaces -- confirm against the repository copy.
            'Name': name_text.replace(" ", " ").strip(),
            'CAS': cell[4].split(':')[1].strip() if has_cas else 'undef',
            'Formula': _formula_from_name(name_text),
            'Forward': cell[1].split(':')[1].strip(),
            'Reverse': cell[2].split(':')[1].strip(),
            'Probability': cell[3].split(' ')[2].replace('%', ''),
            # Library and library id are always the last two fields.
            'Library': cell[-2].split(':')[1].strip(),
            'Library ID': cell[-1].split(':')[1].replace('.', '').strip(),
        }
    except IndexError:
        # A field lacks the expected ':' or ' ' separator: report the hit as
        # missed instead of aborting the whole conversion.
        return None


def getPDF(filename, print_progress):
    '''
    Parses NIST PDF file (the text version of it)
    @param filename: PDF file to parse
    @param print_progress: when true, every processed hit is echoed to stdout
    @return: tuple (hits, missed); hits maps each output column name to a list
             with one entry per parsed hit, missed holds the raw text of the
             hits that could not be parsed ('Missed Compounds') and the
             matching spectrum ids ('RT missed Compounds')
    '''
    with open(filename, 'r') as nist_fh:
        nist_input = nist_fh.read()

    NistInput = {}
    for column in ('ID', 'R.T.', 'Name', 'CAS', 'Formula', 'Forward',
                   'Reverse', 'Probability', 'Library', 'Library ID'):
        NistInput[column] = []
    missed_compounds = []
    rt_missed_compounds = []

    # Everything before the first report marker is not a search report.
    hit_list = nist_input.split('** Search Report Page 1 of 1 **')
    hit_list.pop(0)
    line_id = 0
    for report in hit_list:
        report = report.strip().replace('\r', '')
        if report == '':
            continue
        # Possible improvement: replacing '\n' by a special marker instead of
        # ' ' would make reading the formula simpler.
        # NOTE: a line break that is visible in the PDF does not always end up
        # in the text file (perhaps a pdf2text issue); see
        # test/data/download.pdf.
        hits = (report.replace('\n', ' ').replace('\x0c', '')
                .replace('^L', '').split('Hit'))
        # The text before the first 'Hit' describes the spectrum; its second
        # space-separated word is used as the spectrum id.
        spec_id = hits.pop(0).split(' ')[1]

        parsed_hits = []
        for hit_text in hits:
            cell = hit_text.split(';')
            if print_progress:
                sys.stdout.write(
                    'Processing line:  %s  with length:  %s :\n\t%s\n'
                    % (line_id, len(cell), cell))
            line_id += 1
            fields = _parse_hit(cell)
            if fields is None:
                missed_compounds.append(hit_text)
                rt_missed_compounds.append(spec_id)
            else:
                parsed_hits.append(fields)

        if parsed_hits:
            # The fourth '-'-separated part of the spectrum id, scaled by
            # 1e-6, is reported in the R.T. column.
            retention_time = str(float(spec_id.split('-')[3]) / 1e+06)
            for fields in parsed_hits:
                NistInput['ID'].append(spec_id)
                NistInput['R.T.'].append(retention_time)
                for column, value in fields.items():
                    NistInput[column].append(value)

    NistInput_missed = {}
    NistInput_missed['Missed Compounds'] = missed_compounds
    NistInput_missed['RT missed Compounds'] = rt_missed_compounds

    return NistInput, NistInput_missed
149 | |
150 | |
def _open_for_writing(path):
    '''
    Opens a file for writing text without any newline translation
    @param path: file to create or overwrite
    '''
    if sys.version_info[0] < 3:
        return open(path, 'wb')
    # On Python 3 a binary handle rejects str; text mode with newline='\n'
    # writes the same bytes that 'wb' produced on Python 2.
    return open(path, 'w', newline='\n')


def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
    '''
    Converts NIST PDF file to tabular format
    @param filename: PDF file to parse
    @param output_file: output file for the hits
    @param error_file: output file for failed hits
    @param print_progress: passed on to getPDF
    '''
    hit_list, hit_list_missed = getPDF(filename, print_progress)

    # Take one snapshot of the column names so that the header and the data
    # columns are guaranteed to be in the same order.
    column_names = list(hit_list.keys())
    rows = zip(*[hit_list[column] for column in column_names])
    # Tab separated: one header line followed by one line per hit.
    hitlist_as_text = "\t".join(column_names) + "\n"
    hitlist_as_text += "\n".join(["\t".join(row) for row in rows])
    with _open_for_writing(output_file) as output_fh:
        output_fh.write(hitlist_as_text)

    with _open_for_writing(error_file) as out_missed_pdf:
        for hit_text, spec_id in zip(hit_list_missed['Missed Compounds'],
                                     hit_list_missed['RT missed Compounds']):
            out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n")
            out_missed_pdf.write('%s\n' % '\t'.join([spec_id, hit_text]))
172 | |
173 | |
def read_tabular(in_csv):
    '''
    Parses a tab-separated file returning a dictionary with named columns
    @param in_csv: input filename to be parsed
    @return: dict mapping each header name to the list of values in that
             column, in file order
    @raise IndexError: when the file is empty, or when a row has fewer fields
                       than the header
    '''
    # The 'U' flag was removed in Python 3.11; Python 3 text mode already
    # applies universal newline handling by default.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(in_csv, mode) as input_fh:
        data = list(csv.reader(input_fh, delimiter='\t'))
    header = data.pop(0)
    # Create dictionary with column name as key
    output = {}
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in data]
    return output
186 | |
187 | |
def read_tabular_old(filename):
    '''
    Function to read tabular format (created by convert_pdftotext2tabular)
    and output a dict with header of columns as key and value is columns of tabular as list
    @param filename: tabular file to read
    @return: dict mapping each header name to a tuple with the values of that
             column; a file with only a header line gives empty tuples
    @raise IOError: when the file cannot be opened
    @raise IndexError: when the shortest data row has fewer fields than the
                       header
    '''
    with open(filename, 'r') as input_fh:
        colnames = input_fh.readline().strip().split('\t')
        rows = [line.strip().split('\t') for line in input_fh]
    # transform from row oriented structure to column oriented structure;
    # list() makes the result indexable on Python 3 as well
    columns = list(zip(*rows))
    if not rows:
        # no data lines at all: every column is simply empty
        columns = [()] * len(colnames)
    # store the columns under their header name
    RankFilterGC_format = {}
    for colnumber, colname in enumerate(colnames):
        RankFilterGC_format[colname] = columns[colnumber]
    return RankFilterGC_format
210 | |
211 | |
if __name__ == '__main__':
    # Command line use: <input text file> <hits output> <errors output>;
    # progress printing is always switched on here.
    if len(sys.argv) < 4:
        sys.exit('Usage: %s <nist_text_file> <output_file> <error_file>' % sys.argv[0])
    convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)