Mercurial > repos > pieterlukasse > prims_metabolomics
annotate rankfilter_GCMS/pdfread.py @ 50:93102202ab79
Added more options and better documentation.
Added MsClust support for parsing XCMS alignment results.
Improved output reports for XCMS wrappers.
New tools.
author | pieter.lukasse@wur.nl |
---|---|
date | Wed, 10 Dec 2014 22:04:33 +0100 |
parents | 637830ac8bcd |
children | 35f506f30ae4 |
rev | line source |
---|---|
0 | 1 """ |
2 Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University | |
3 | |
4 Permission is hereby granted, free of charge, to any person obtaining a copy | |
5 of this software and associated documentation files (the "Software"), to deal | |
6 in the Software without restriction, including without limitation the rights | |
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
8 copies of the Software, and to permit persons to whom the Software is | |
9 furnished to do so, subject to the following conditions: | |
10 | |
11 The above copyright notice and this permission notice shall be included in | |
12 all copies or substantial portions of the Software. | |
13 | |
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
20 THE SOFTWARE. | |
21 """ | |
22 | |
23 import sys | |
24 import csv | |
25 | |
# Marker that starts every search report in the input text.
_REPORT_SEPARATOR = '** Search Report Page 1 of 1 **'

# Ordered (glued, separated) pairs that re-insert the blank which is sometimes
# missing between the compound name and the formula. This is an exhaustive list
# of known cases, applied in this order.
_NAME_FORMULA_SEPARATIONS = (
    ('lC', 'l C'), (']C', '] C'), ('sC', 's C'), ('9C', '9 C'), ('.C', '. C'),
    (')C', ') C'), ('eC', 'e C'), ('yC', 'y C'), ('oC', 'o C'), ('-C', '- C'),
    ('dC', 'd C'), ('rC', 'r C'),
    # workaround for the L and D compound types
    ('-, LC', '-, L C'), ('-, DC', '-, D C'),
)


def _split_name_and_formula(name_cell):
    '''
    Extracts compound name and formula from the first cell of a hit
    @param name_cell: first ';'-separated cell of a hit; everything after the
                      first ':' is treated as "<name> <formula>"
    @return: tuple (name, formula); formula is 'not_def' when the last
             blank-separated token does not start with 'C', 'F' or 'H'
    '''
    # The name itself may contain ':', so only the first ':' is a separator.
    name_tmp = name_cell.split(':', 1)[1]
    for glued, separated in _NAME_FORMULA_SEPARATIONS:
        name_tmp = name_tmp.replace(glued, separated)

    # NOTE(review): the slice bound is the length of the *string*, not of the
    # token list, so in practice the trailing formula token stays part of the
    # name. Kept unchanged because consumers of the 'Name' column may rely on
    # it - confirm the intent before "fixing" it.
    # NOTE(review): as written, replace(" ", " ") is a no-op; it may originally
    # have collapsed double blanks - confirm against the upstream file.
    compound_name = ' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1]).replace(" ", " ")

    # Strip first so a trailing blank cannot produce an empty last token;
    # startswith() is also safe on an empty token (no IndexError).
    last_token = name_tmp.strip().split(' ')[-1]
    if last_token.startswith(('C', 'F', 'H')):
        formule = last_token
    else:
        formule = 'not_def'
    return compound_name, formule


def getPDF(filename, print_progress):
    '''
    Parses the text version of a NIST search report
    (presumably produced by pdftotext from the NIST PDF - confirm with caller)
    @param filename: text file to parse
    @param print_progress: when True, print every hit that is being processed
    @return: tuple (NistInput, NistInput_missed). NistInput maps the column
             names 'ID', 'R.T.', 'Name', 'CAS', 'Formula', 'Forward',
             'Reverse', 'Probability', 'Library' and 'Library ID' to parallel
             lists of strings (one entry per parsed hit). NistInput_missed maps
             'Missed Compounds' and 'RT missed Compounds' to parallel lists
             describing the hits whose second cell could not be parsed.
    @raise ValueError: when a hit has fewer than 6 ';'-separated columns
    '''
    with open(filename, 'r') as nist_fh:
        nist_input = nist_fh.read()

    hitid = []
    rt = []
    name = []
    forward = []
    cas = []
    reverse = []
    prob = []
    lib_id = []
    nist_id = []
    missed_compounds = []
    rt_missed_compounds = []
    formula = []

    line_id = 0
    # Everything before the first separator is not a report, hence [1:].
    for line in nist_input.split(_REPORT_SEPARATOR)[1:]:
        line = line.strip().replace('\r', '')
        if line == '':
            continue

        # Flatten the report to one line, drop form feeds (real and the
        # literal "^L" rendering) and cut it into one piece per hit.
        hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')
        # The piece before the first hit holds the spectrum identifier as its
        # second blank-separated token.
        spec_id = hits.pop(0).split(' ')[1]

        parsed_hits = 0
        for hh in hits:
            cell = hh.split(';')
            # Explicit comparison kept on purpose: truthy non-boolean values
            # (e.g. the string 'False') must not switch the output on.
            if print_progress == True:
                print('Processing line:  %s  with length:  %s :\n\t%s'
                      % (line_id, len(cell), cell))
                line_id += 1

            if len(cell) < 6:
                # Missing columns: report and quit.
                raise ValueError(
                    'Hit with only %d column(s) in report %r of %s: %r'
                    % (len(cell), spec_id, filename, hh))

            if len(cell[1].split(':')) != 2:
                missed_compounds.append(hh)
                rt_missed_compounds.append(spec_id)
                continue

            if len(cell) == 7:  # the compound has CAS number
                cas_value = cell[4].split(':')[1]
                lib_cell = cell[5]
                id_cell = cell[6]
            else:  # the compound has no CAS number
                cas_value = 'undef'
                lib_cell = cell[4]
                id_cell = cell[5]

            compound_name, formule = _split_name_and_formula(cell[0])

            name.append(compound_name)
            formula.append(formule)
            forward.append(cell[1].split(':')[1])
            reverse.append(cell[2].split(':')[1])
            prob.append(cell[3].split(' ')[2].replace('%', ''))
            cas.append(cas_value)
            lib_id.append(lib_cell.split(':')[1])
            nist_id.append(id_cell.split(':')[1].replace('.', '').strip())
            parsed_hits += 1

        if parsed_hits:
            # Every parsed hit of a report shares the report's identifier; the
            # retention time is its 4th '-'-separated field divided by 1e6.
            # NOTE(review): replace(" ", " ") is a no-op as written - confirm.
            report_id = str(spec_id.replace(" ", " "))
            report_rt = str(float(spec_id.split('-')[3]) / 1e+06)
            hitid.extend([report_id] * parsed_hits)
            rt.extend([report_rt] * parsed_hits)

    NistInput = {}
    NistInput['ID'] = hitid
    NistInput['R.T.'] = rt
    NistInput['Name'] = name
    NistInput['CAS'] = cas
    NistInput['Formula'] = formula
    NistInput['Forward'] = forward
    NistInput['Reverse'] = reverse
    NistInput['Probability'] = prob
    NistInput['Library'] = lib_id
    NistInput['Library ID'] = nist_id

    NistInput_missed = {}
    NistInput_missed['Missed Compounds'] = missed_compounds
    NistInput_missed['RT missed Compounds'] = rt_missed_compounds

    return NistInput, NistInput_missed
153 | |
154 | |
def _as_bytes(text):
    '''
    Returns text as a byte string so it can be written to a file opened in 'wb' mode
    @param text: native string (already bytes on Python 2, unicode text on Python 3)
    '''
    if isinstance(text, bytes):
        return text
    return text.encode('utf-8')


def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
    '''
    Converts NIST PDF file to tabular format
    @param filename: PDF file to parse
    @param output_file: output file for the hits
    @param error_file: output file for failed hits
    @param print_progress: passed on to getPDF; when True the parser prints its progress
    @raise ValueError: when getPDF gives up on the input and returns no result
    '''
    parsed = getPDF(filename, print_progress)
    if parsed is None:
        # getPDF returns nothing when it meets a hit with missing columns;
        # fail with a clear message instead of an unpacking error.
        raise ValueError('Could not parse %s: a hit with missing columns was found' % filename)
    HitList, HitList_missed = parsed

    # save Hitlist as tab separated file: one header line with the column
    # names, then one line per hit (the columns are parallel lists).
    column_names = list(HitList.keys())
    rows = zip(*[HitList[column] for column in column_names])
    Hitlist_as_text = "\t".join(column_names) + "\n"
    Hitlist_as_text += "\n".join(["\t".join(row) for row in rows])

    # Binary mode keeps the line endings exactly as written on every platform.
    with open(output_file, 'wb') as output_fh:
        output_fh.write(_as_bytes(Hitlist_as_text))

    # One line per hit that could not be parsed: report identifier, then the raw hit text.
    with open(error_file, 'wb') as out_missed_pdf:
        for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
            out_missed_pdf.write(_as_bytes('%s\n' % '\t'.join([y, x])))
175 | |
176 | |
def read_tabular(in_csv):
    '''
    Parses a tab-separated file returning a dictionary with named columns
    @param in_csv: input filename to be parsed
    @return: dictionary with the column name (taken from the first line) as key
             and the list of that column's values as value; an empty dictionary
             when the file is empty
    '''
    # 'rU' (universal newlines) is what Python 2 needs here; on Python 3 plain
    # 'r' already behaves that way and 'rU' is rejected by recent versions.
    mode = 'r' if sys.version_info[0] >= 3 else 'rU'
    with open(in_csv, mode) as input_fh:
        rows = list(csv.reader(input_fh, delimiter='\t'))

    if not rows:
        return {}

    header = rows[0]
    # The csv reader yields an empty list for a blank line; such lines carry
    # no values and would otherwise break the column lookup below.
    data = [row for row in rows[1:] if row]

    # Create dictionary with column name as key
    output = {}
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in data]
    return output
189 | |
190 | |
def read_tabular_old(filename):
    '''
    Function to read tabular format (created by convert_pdftotext2tabular)
    and output a dict with header of columns as key and value is columns of tabular as list
    @param filename: tabular file to read
    @return: dictionary mapping every column name of the first line to a tuple
             with the values of that column
    @raise IOError: when the file cannot be opened
    '''
    with open(filename, 'r') as input_fh:
        colnames = input_fh.readline().strip().split('\t')
        cells = [line.strip().split('\t') for line in input_fh]

    # transform from row oriented structure to column oriented structure;
    # list() because zip() is a one-shot iterator on Python 3
    columns = list(zip(*cells))
    if not cells:
        # header only: every column exists but holds no values
        columns = [()] * len(colnames)

    # store the list of list in form of final output
    RankFilterGC_format = {}
    for colnumber, colname in enumerate(colnames):
        RankFilterGC_format[colname] = columns[colnumber]
    return RankFilterGC_format
213 | |
214 | |
if __name__ == '__main__':
    # Command line use: <input text file> <hits output file> <missed hits output file>.
    # Progress printing is always switched on here.
    if len(sys.argv) < 4:
        sys.exit('Usage: %s <nist_report_file> <output_file> <error_file>' % sys.argv[0])
    convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)