Mercurial > repos > pieterlukasse > prims_metabolomics
comparison rankfilter_GCMS/pdfread.py @ 61:d685210eef3e
fix in pdftotabular tool
author | pieter.lukasse@wur.nl |
---|---|
date | Fri, 19 Dec 2014 15:30:13 +0100 |
parents | 35f506f30ae4 |
children |
comparison
equal
deleted
inserted
replaced
60:35f506f30ae4 | 61:d685210eef3e |
---|---|
62 if print_progress == True: | 62 if print_progress == True: |
63 print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell | 63 print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell |
64 line_id += 1 | 64 line_id += 1 |
65 if len(cell) == 7: # the compound has CAS number | 65 if len(cell) == 7: # the compound has CAS number |
66 if len(cell[1].split(':')) == 2: | 66 if len(cell[1].split(':')) == 2: |
67 forward.append(cell[1].split(':')[1]) | 67 forward.append((cell[1].split(':')[1]).strip()) |
68 # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end | 68 # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end |
69 if len(cell[0].split(':')) > 2: | 69 if len(cell[0].split(':')) > 2: |
70 name_tmp = ':'.join(cell[0].split(':')[1:]) | 70 name_tmp = ':'.join(cell[0].split(':')[1:]) |
71 else: | 71 else: |
72 name_tmp = cell[0].split(':')[1] | 72 name_tmp = cell[0].split(':')[1] |
73 | 73 |
74 # ugly workaround for the cases where there ends up being no space between the name and the formula: exhaustive | 74 name.append(name_tmp.replace(" ", " ").strip()) |
75 # replacement of known cases by the same text with a white space: | 75 name_tmp = name_tmp.strip().split(' ') |
76 name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C') | |
77 name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C') | |
78 name_tmp = name_tmp.replace(', LC', ', L C').replace(', DC', ', D C') | |
79 | |
80 name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")) | |
81 if name_tmp: | 76 if name_tmp: |
82 if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H': | 77 # if the name ends with a word that starts with C, F or H, then assume this last word is a formula: |
83 formule = (name_tmp.split(' ')[-1]) | 78 if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H': |
79 formule = (name_tmp[-1]) | |
84 else: | 80 else: |
85 formule = ('not_def') | 81 formule = ('not_def') |
86 else: | 82 else: |
87 formule = ('not_def') | 83 formule = ('not_def') |
88 formula.append(formule.replace(" ", " ")) | 84 formula.append(formule.replace(" ", " ")) |
89 reverse.append(cell[2].split(':')[1]) | 85 reverse.append((cell[2].split(':')[1]).strip()) |
90 prob.append(cell[3].split(' ')[2].replace('%', '')) | 86 prob.append(cell[3].split(' ')[2].replace('%', '')) |
91 cas.append(cell[4].split(':')[1]) | 87 cas.append((cell[4].split(':')[1]).strip()) |
92 lib_id.append(cell[5].split(':')[1]) | 88 lib_id.append((cell[5].split(':')[1]).strip()) |
93 nist_id.append(cell[6].split(':')[1].replace('.', '').strip()) | 89 nist_id.append(cell[6].split(':')[1].replace('.', '').strip()) |
94 j = j + 1 | 90 j = j + 1 |
95 else: | 91 else: |
96 missed_compounds.append(hh) | 92 missed_compounds.append(hh) |
97 rt_missed_compounds.append(spec_id) | 93 rt_missed_compounds.append(spec_id) |
98 | 94 |
99 elif len(cell) >= 6: # the compound has no CAS number | 95 elif len(cell) == 6: # the compound has no CAS number |
100 if len(cell[1].split(':')) == 2: | 96 if len(cell[1].split(':')) == 2: |
101 | 97 |
102 forward.append(cell[1].split(':')[1]) | 98 forward.append((cell[1].split(':')[1]).strip()) |
103 # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end | 99 # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end |
104 if len(cell[0].split(':')) > 2: | 100 if len(cell[0].split(':')) > 2: |
105 name_tmp = ':'.join(cell[0].split(':')[1:]) | 101 name_tmp = ':'.join(cell[0].split(':')[1:]) |
106 else: | 102 else: |
107 name_tmp = cell[0].split(':')[1] | 103 name_tmp = cell[0].split(':')[1] |
108 name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C') | 104 |
109 name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C') | 105 name.append(name_tmp.replace(" ", " ").strip()) |
110 name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")) # " ", " " | |
111 name_tmp = name_tmp.strip().split(' ') | 106 name_tmp = name_tmp.strip().split(' ') |
112 if name_tmp: | 107 if name_tmp: |
108 # if the name ends with a word that starts with C, F or H, then assume this last word is a formula: | |
113 if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H': | 109 if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H': |
114 formule = (name_tmp[-1]) | 110 formule = (name_tmp[-1]) |
115 else: | 111 else: |
116 formule = ('not_def') | 112 formule = ('not_def') |
117 else: | 113 else: |
118 formule = ('not_def') | 114 formule = ('not_def') |
119 formula.append(formule.replace(" ", " ")) | 115 formula.append(formule.replace(" ", " ")) |
120 reverse.append(cell[2].split(':')[1]) | 116 reverse.append((cell[2].split(':')[1]).strip()) |
121 prob.append(cell[3].split(' ')[2].replace('%', '')) | 117 prob.append(cell[3].split(' ')[2].replace('%', '')) |
122 cas.append('undef') | 118 cas.append('undef') |
123 lib_id.append(cell[4].split(':')[1]) | 119 lib_id.append((cell[4].split(':')[1]).strip()) |
124 nist_id.append(cell[5].split(':')[1].replace('.', '').strip()) | 120 nist_id.append(cell[5].split(':')[1].replace('.', '').strip()) |
125 j = j + 1 | 121 j = j + 1 |
126 | 122 |
127 else: | 123 else: |
128 missed_compounds.append(hh) | 124 missed_compounds.append(hh) |
129 rt_missed_compounds.append(spec_id) | 125 rt_missed_compounds.append(spec_id) |
130 | 126 |
131 else: # Missing columns, report and quit | 127 else: # Missing columns, report and quit |
132 | 128 missed_compounds.append(hh) |
133 return | 129 rt_missed_compounds.append(spec_id) |
134 | 130 |
135 for _ in range(j): | 131 for _ in range(j): |
136 hitid.append(str(spec_id.replace(" ", " "))) | 132 hitid.append(str(spec_id.replace(" ", " "))) |
137 rt.append(str(float(spec_id.split('-')[3]) / 1e+06)) | 133 rt.append(str(float(spec_id.split('-')[3]) / 1e+06)) |
138 | 134 |
168 output_fh.write(Hitlist_as_text) | 164 output_fh.write(Hitlist_as_text) |
169 output_fh.close() | 165 output_fh.close() |
170 | 166 |
171 out_missed_pdf = open(error_file, 'wb') | 167 out_missed_pdf = open(error_file, 'wb') |
172 for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']): | 168 for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']): |
169 out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n") | |
173 out_missed_pdf.write('%s\n' % '\t'.join([y, x])) | 170 out_missed_pdf.write('%s\n' % '\t'.join([y, x])) |
174 out_missed_pdf.close() | 171 out_missed_pdf.close() |
175 | 172 |
176 | 173 |
177 def read_tabular(in_csv): | 174 def read_tabular(in_csv): |