comparison rankfilter_GCMS/pdfread.py @ 61:d685210eef3e

fix in pdftotabular tool
author pieter.lukasse@wur.nl
date Fri, 19 Dec 2014 15:30:13 +0100
parents 35f506f30ae4
children
comparison
equal deleted inserted replaced
60:35f506f30ae4 61:d685210eef3e
62 if print_progress == True: 62 if print_progress == True:
63 print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell 63 print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell
64 line_id += 1 64 line_id += 1
65 if len(cell) == 7: # the compound has CAS number 65 if len(cell) == 7: # the compound has CAS number
66 if len(cell[1].split(':')) == 2: 66 if len(cell[1].split(':')) == 2:
67 forward.append(cell[1].split(':')[1]) 67 forward.append((cell[1].split(':')[1]).strip())
68 # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end 68 # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
69 if len(cell[0].split(':')) > 2: 69 if len(cell[0].split(':')) > 2:
70 name_tmp = ':'.join(cell[0].split(':')[1:]) 70 name_tmp = ':'.join(cell[0].split(':')[1:])
71 else: 71 else:
72 name_tmp = cell[0].split(':')[1] 72 name_tmp = cell[0].split(':')[1]
73 73
74 # uggly workaround for the cases where there ends up to be no space between the name and the formula: exaustive 74 name.append(name_tmp.replace(" ", " ").strip())
75 # replaces of known cases by the same with a white space: 75 name_tmp = name_tmp.strip().split(' ')
76 name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
77 name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
78 name_tmp = name_tmp.replace(', LC', ', L C').replace(', DC', ', D C')
79
80 name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " "))
81 if name_tmp: 76 if name_tmp:
82 if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H': 77 # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
83 formule = (name_tmp.split(' ')[-1]) 78 if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
79 formule = (name_tmp[-1])
84 else: 80 else:
85 formule = ('not_def') 81 formule = ('not_def')
86 else: 82 else:
87 formule = ('not_def') 83 formule = ('not_def')
88 formula.append(formule.replace(" ", " ")) 84 formula.append(formule.replace(" ", " "))
89 reverse.append(cell[2].split(':')[1]) 85 reverse.append((cell[2].split(':')[1]).strip())
90 prob.append(cell[3].split(' ')[2].replace('%', '')) 86 prob.append(cell[3].split(' ')[2].replace('%', ''))
91 cas.append(cell[4].split(':')[1]) 87 cas.append((cell[4].split(':')[1]).strip())
92 lib_id.append(cell[5].split(':')[1]) 88 lib_id.append((cell[5].split(':')[1]).strip())
93 nist_id.append(cell[6].split(':')[1].replace('.', '').strip()) 89 nist_id.append(cell[6].split(':')[1].replace('.', '').strip())
94 j = j + 1 90 j = j + 1
95 else: 91 else:
96 missed_compounds.append(hh) 92 missed_compounds.append(hh)
97 rt_missed_compounds.append(spec_id) 93 rt_missed_compounds.append(spec_id)
98 94
99 elif len(cell) >= 6: # the compound has no CAS number 95 elif len(cell) == 6: # the compound has no CAS number
100 if len(cell[1].split(':')) == 2: 96 if len(cell[1].split(':')) == 2:
101 97
102 forward.append(cell[1].split(':')[1]) 98 forward.append((cell[1].split(':')[1]).strip())
103 # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end 99 # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
104 if len(cell[0].split(':')) > 2: 100 if len(cell[0].split(':')) > 2:
105 name_tmp = ':'.join(cell[0].split(':')[1:]) 101 name_tmp = ':'.join(cell[0].split(':')[1:])
106 else: 102 else:
107 name_tmp = cell[0].split(':')[1] 103 name_tmp = cell[0].split(':')[1]
108 name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C') 104
109 name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C') 105 name.append(name_tmp.replace(" ", " ").strip())
110 name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")) # " ", " "
111 name_tmp = name_tmp.strip().split(' ') 106 name_tmp = name_tmp.strip().split(' ')
112 if name_tmp: 107 if name_tmp:
108 # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
113 if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H': 109 if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
114 formule = (name_tmp[-1]) 110 formule = (name_tmp[-1])
115 else: 111 else:
116 formule = ('not_def') 112 formule = ('not_def')
117 else: 113 else:
118 formule = ('not_def') 114 formule = ('not_def')
119 formula.append(formule.replace(" ", " ")) 115 formula.append(formule.replace(" ", " "))
120 reverse.append(cell[2].split(':')[1]) 116 reverse.append((cell[2].split(':')[1]).strip())
121 prob.append(cell[3].split(' ')[2].replace('%', '')) 117 prob.append(cell[3].split(' ')[2].replace('%', ''))
122 cas.append('undef') 118 cas.append('undef')
123 lib_id.append(cell[4].split(':')[1]) 119 lib_id.append((cell[4].split(':')[1]).strip())
124 nist_id.append(cell[5].split(':')[1].replace('.', '').strip()) 120 nist_id.append(cell[5].split(':')[1].replace('.', '').strip())
125 j = j + 1 121 j = j + 1
126 122
127 else: 123 else:
128 missed_compounds.append(hh) 124 missed_compounds.append(hh)
129 rt_missed_compounds.append(spec_id) 125 rt_missed_compounds.append(spec_id)
130 126
131 else: # Missing columns, report and quit 127 else: # Missing columns, report and quit
132 128 missed_compounds.append(hh)
133 return 129 rt_missed_compounds.append(spec_id)
134 130
135 for _ in range(j): 131 for _ in range(j):
136 hitid.append(str(spec_id.replace(" ", " "))) 132 hitid.append(str(spec_id.replace(" ", " ")))
137 rt.append(str(float(spec_id.split('-')[3]) / 1e+06)) 133 rt.append(str(float(spec_id.split('-')[3]) / 1e+06))
138 134
168 output_fh.write(Hitlist_as_text) 164 output_fh.write(Hitlist_as_text)
169 output_fh.close() 165 output_fh.close()
170 166
171 out_missed_pdf = open(error_file, 'wb') 167 out_missed_pdf = open(error_file, 'wb')
172 for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']): 168 for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
169 out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n")
173 out_missed_pdf.write('%s\n' % '\t'.join([y, x])) 170 out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
174 out_missed_pdf.close() 171 out_missed_pdf.close()
175 172
176 173
177 def read_tabular(in_csv): 174 def read_tabular(in_csv):