annotate rankfilter_GCMS/pdfread.py @ 31:31e6e2242d33

small fix in doc
author pieter.lukasse@wur.nl
date Sat, 30 Aug 2014 16:21:32 +0200
parents 637830ac8bcd
children 35f506f30ae4
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
1 """
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
2 Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
3
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
4 Permission is hereby granted, free of charge, to any person obtaining a copy
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
5 of this software and associated documentation files (the "Software"), to deal
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
6 in the Software without restriction, including without limitation the rights
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
8 copies of the Software, and to permit persons to whom the Software is
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
9 furnished to do so, subject to the following conditions:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
10
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
11 The above copyright notice and this permission notice shall be included in
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
12 all copies or substantial portions of the Software.
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
13
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
20 THE SOFTWARE.
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
21 """
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
22
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
23 import sys
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
24 import csv
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
25
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
26 def getPDF(filename, print_progress):
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
27 '''
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
28 Parses NIST PDF file
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
29 @param filename: PDF file to parse
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
30 '''
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
31 NistInput = {}
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
32 NistInput_missed = {}
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
33 nist_input = open(filename, 'r').read()
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
34
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
35 hitid = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
36 rt = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
37 name = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
38 forward = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
39 cas = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
40 reverse = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
41 prob = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
42 lib_id = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
43 nist_id = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
44 missed_compounds = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
45 rt_missed_compounds = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
46 formula = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
47
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
48 hit_list = nist_input.split('** Search Report Page 1 of 1 **')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
49 hit_list.pop(0)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
50 #number_hits = range(10)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
51 line_id = 0
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
52 for line in hit_list:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
53 line = line.strip().translate(None, '\r')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
54 if line != '':
26
637830ac8bcd added validation in metexp to tabular tool; added workaround/fix for L and D compound types
pieter.lukasse@wur.nl
parents: 0
diff changeset
55 hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit') #solution? : if we wouldn't replace the \n by ' ' but by some special sign, then reading formula would be simpler!
637830ac8bcd added validation in metexp to tabular tool; added workaround/fix for L and D compound types
pieter.lukasse@wur.nl
parents: 0
diff changeset
56 #strange....code seems fine actually...debug! See test/data/download.pdf
637830ac8bcd added validation in metexp to tabular tool; added workaround/fix for L and D compound types
pieter.lukasse@wur.nl
parents: 0
diff changeset
57 # strange thing is that it looks like the new line does not end up in the text file, eventhough it looks like there is a new line in the pdf...perhaps a bug in the pdf2text command in linux?
0
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
58 spec_id = hits.pop(0).split(' ')[1]
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
59 j = 0
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
60 for hh in hits:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
61 cell = hh.split(';')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
62 if print_progress == True:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
63 print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
64 line_id += 1
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
65 if len(cell) == 7: # the compound has CAS number
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
66 if len(cell[1].split(':')) == 2:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
67 forward.append(cell[1].split(':')[1])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
68 # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
69 if len(cell[0].split(':')) > 2:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
70 name_tmp = ':'.join(cell[0].split(':')[1:])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
71 else:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
72 name_tmp = cell[0].split(':')[1]
26
637830ac8bcd added validation in metexp to tabular tool; added workaround/fix for L and D compound types
pieter.lukasse@wur.nl
parents: 0
diff changeset
73
637830ac8bcd added validation in metexp to tabular tool; added workaround/fix for L and D compound types
pieter.lukasse@wur.nl
parents: 0
diff changeset
74 # uggly workaround for the cases where there ends up to be no space between the name and the formula: exaustive
637830ac8bcd added validation in metexp to tabular tool; added workaround/fix for L and D compound types
pieter.lukasse@wur.nl
parents: 0
diff changeset
75 # replaces of known cases by the same with a white space:
0
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
76 name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
77 name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
26
637830ac8bcd added validation in metexp to tabular tool; added workaround/fix for L and D compound types
pieter.lukasse@wur.nl
parents: 0
diff changeset
78 name_tmp = name_tmp.replace('-, LC', '-, L C').replace('-, DC', '-, D C')
637830ac8bcd added validation in metexp to tabular tool; added workaround/fix for L and D compound types
pieter.lukasse@wur.nl
parents: 0
diff changeset
79
0
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
80 name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " "))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
81 if name_tmp:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
82 if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H':
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
83 formule = (name_tmp.split(' ')[-1])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
84 else:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
85 formule = ('not_def')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
86 else:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
87 formule = ('not_def')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
88 formula.append(formule.replace(" ", " "))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
89 reverse.append(cell[2].split(':')[1])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
90 prob.append(cell[3].split(' ')[2].replace('%', ''))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
91 cas.append(cell[4].split(':')[1])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
92 lib_id.append(cell[5].split(':')[1])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
93 nist_id.append(cell[6].split(':')[1].replace('.', '').strip())
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
94 j = j + 1
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
95 else:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
96 missed_compounds.append(hh)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
97 rt_missed_compounds.append(spec_id)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
98
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
99 elif len(cell) >= 6: # the compound has no CAS number
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
100 if len(cell[1].split(':')) == 2:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
101
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
102 forward.append(cell[1].split(':')[1])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
103 # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
104 if len(cell[0].split(':')) > 2:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
105 name_tmp = ':'.join(cell[0].split(':')[1:])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
106 else:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
107 name_tmp = cell[0].split(':')[1]
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
108 name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
109 name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
110 name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")) # " ", " "
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
111 name_tmp = name_tmp.strip().split(' ')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
112 if name_tmp:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
113 if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
114 formule = (name_tmp[-1])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
115 else:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
116 formule = ('not_def')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
117 else:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
118 formule = ('not_def')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
119 formula.append(formule.replace(" ", " "))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
120 reverse.append(cell[2].split(':')[1])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
121 prob.append(cell[3].split(' ')[2].replace('%', ''))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
122 cas.append('undef')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
123 lib_id.append(cell[4].split(':')[1])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
124 nist_id.append(cell[5].split(':')[1].replace('.', '').strip())
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
125 j = j + 1
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
126
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
127 else:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
128 missed_compounds.append(hh)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
129 rt_missed_compounds.append(spec_id)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
130
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
131 else: # Missing columns, report and quit
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
132
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
133 return
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
134
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
135 for _ in range(j):
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
136 hitid.append(str(spec_id.replace(" ", " ")))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
137 rt.append(str(float(spec_id.split('-')[3]) / 1e+06))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
138
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
139 NistInput['ID'] = hitid
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
140 NistInput['R.T.'] = rt
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
141 NistInput['Name'] = name
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
142 NistInput['CAS'] = cas
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
143 NistInput['Formula'] = formula
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
144 NistInput['Forward'] = forward
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
145 NistInput['Reverse'] = reverse
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
146 NistInput['Probability'] = prob
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
147 NistInput['Library'] = lib_id
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
148 NistInput['Library ID'] = nist_id
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
149 NistInput_missed['Missed Compounds'] = missed_compounds
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
150 NistInput_missed['RT missed Compounds'] = rt_missed_compounds
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
151
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
152 return NistInput, NistInput_missed
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
153
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
154
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
155 def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
156 '''
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
157 Converts NIST PDF file to tabular format
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
158 @param filename: PDF file to parse
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
159 @param output_file: output file for the hits
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
160 @param error_file: output file for failed hits
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
161 '''
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
162 [HitList, HitList_missed] = getPDF(filename, print_progress)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
163 # save Hitlist as tab seperate file
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
164 Hitlist_as_text = "\t".join(HitList.keys()) + "\n"
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
165 Hitlist_array_of_array = ([HitList[row] for row in HitList.keys()])
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
166 Hitlist_as_text += str("\n".join(["\t".join(e) for e in zip(*Hitlist_array_of_array)]))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
167 output_fh = open(output_file, 'wb')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
168 output_fh.write(Hitlist_as_text)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
169 output_fh.close()
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
170
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
171 out_missed_pdf = open(error_file, 'wb')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
172 for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
173 out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
174 out_missed_pdf.close()
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
175
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
176
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
177 def read_tabular(in_csv):
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
178 '''
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
179 Parses a tab-separated file returning a dictionary with named columns
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
180 @param in_csv: input filename to be parsed
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
181 '''
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
182 data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
183 header = data.pop(0)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
184 # Create dictionary with column name as key
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
185 output = {}
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
186 for index in xrange(len(header)):
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
187 output[header[index]] = [row[index] for row in data]
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
188 return output
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
189
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
190
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
191 def read_tabular_old(filename):
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
192 '''
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
193 Function to read tabular format (created by convert_pdftotext2tabular)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
194 and output a dict with header of columns as key and value is columns of tabular as list
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
195 @param filename: tabular file to read
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
196 '''
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
197 input_fh = None
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
198 try:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
199 input_fh = open(filename, 'r')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
200 except IOError, error:
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
201 raise error
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
202 colnames = input_fh.readline().strip().split('\t')
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
203 cells = []
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
204 for line in input_fh.readlines():
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
205 cells.append(line.strip().split('\t'))
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
206 #transform from row oriented structure to column oriented structure
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
207 cells = zip(*cells)
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
208 #store the list of list in form of final output
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
209 RankFilterGC_format = {}
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
210 for colnumber in range(len(colnames)):
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
211 RankFilterGC_format[colnames[colnumber]] = cells[colnumber]
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
212 return RankFilterGC_format
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
213
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
214
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
215 if __name__ == '__main__':
9d5f4f5f764b Initial commit to toolshed
pieter.lukasse@wur.nl
parents:
diff changeset
216 convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)