Mercurial > repos > pieterlukasse > prims_metabolomics
changeset 61:d685210eef3e
fix in pdftotabular tool
author | pieter.lukasse@wur.nl |
---|---|
date | Fri, 19 Dec 2014 15:30:13 +0100 |
parents | 35f506f30ae4 |
children | 9bd2597c8851 |
files | rankfilter_GCMS/pdfread.py rankfilter_GCMS/pdftotabular.py rankfilter_GCMS/test/test_pdfread.py static_resources/elements_and_masses.tab |
diffstat | 4 files changed, 75 insertions(+), 49 deletions(-) [+] |
line wrap: on
line diff
--- a/rankfilter_GCMS/pdfread.py Fri Dec 19 11:30:22 2014 +0100 +++ b/rankfilter_GCMS/pdfread.py Fri Dec 19 15:30:13 2014 +0100 @@ -64,52 +64,17 @@ line_id += 1 if len(cell) == 7: # the compound has CAS number if len(cell[1].split(':')) == 2: - forward.append(cell[1].split(':')[1]) + forward.append((cell[1].split(':')[1]).strip()) # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end if len(cell[0].split(':')) > 2: name_tmp = ':'.join(cell[0].split(':')[1:]) else: name_tmp = cell[0].split(':')[1] - # uggly workaround for the cases where there ends up to be no space between the name and the formula: exaustive - # replaces of known cases by the same with a white space: - name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C') - name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C') - name_tmp = name_tmp.replace(', LC', ', L C').replace(', DC', ', D C') - - name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")) - if name_tmp: - if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H': - formule = (name_tmp.split(' ')[-1]) - else: - formule = ('not_def') - else: - formule = ('not_def') - formula.append(formule.replace(" ", " ")) - reverse.append(cell[2].split(':')[1]) - prob.append(cell[3].split(' ')[2].replace('%', '')) - cas.append(cell[4].split(':')[1]) - lib_id.append(cell[5].split(':')[1]) - nist_id.append(cell[6].split(':')[1].replace('.', '').strip()) - j = j + 1 - else: - missed_compounds.append(hh) - rt_missed_compounds.append(spec_id) - - elif len(cell) >= 6: # the compound has no CAS number - if len(cell[1].split(':')) == 2: - - forward.append(cell[1].split(':')[1]) - # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end - if len(cell[0].split(':')) > 2: - name_tmp = ':'.join(cell[0].split(':')[1:]) - else: - name_tmp = cell[0].split(':')[1] - name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C') - name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C') - name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace(" ", " ")) # " ", " " + name.append(name_tmp.replace(" ", " ").strip()) name_tmp = name_tmp.strip().split(' ') if name_tmp: + # if the name ends with a word that starts with C, F or H, then assume this last word is a formula: if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H': formule = (name_tmp[-1]) else: @@ -117,10 +82,41 @@ else: formule = ('not_def') formula.append(formule.replace(" ", " ")) - reverse.append(cell[2].split(':')[1]) + reverse.append((cell[2].split(':')[1]).strip()) + prob.append(cell[3].split(' ')[2].replace('%', '')) + cas.append((cell[4].split(':')[1]).strip()) + lib_id.append((cell[5].split(':')[1]).strip()) + nist_id.append(cell[6].split(':')[1].replace('.', '').strip()) + j = j + 1 + else: + missed_compounds.append(hh) + rt_missed_compounds.append(spec_id) + + elif len(cell) == 6: # the compound has no CAS number + if len(cell[1].split(':')) == 2: + + forward.append((cell[1].split(':')[1]).strip()) + # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end + if len(cell[0].split(':')) > 2: + name_tmp = ':'.join(cell[0].split(':')[1:]) + else: + name_tmp = cell[0].split(':')[1] + + name.append(name_tmp.replace(" ", " ").strip()) + name_tmp = name_tmp.strip().split(' ') + if name_tmp: + # if the name ends with a word that starts with C, F or H, then assume this last word is a formula: + if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H': + formule = (name_tmp[-1]) + else: + formule = ('not_def') + else: + formule = ('not_def') + formula.append(formule.replace(" ", " ")) + reverse.append((cell[2].split(':')[1]).strip()) prob.append(cell[3].split(' ')[2].replace('%', '')) cas.append('undef') - lib_id.append(cell[4].split(':')[1]) + lib_id.append((cell[4].split(':')[1]).strip()) nist_id.append(cell[5].split(':')[1].replace('.', '').strip()) j = j + 1 @@ -129,8 +125,8 @@ rt_missed_compounds.append(spec_id) else: # Missing columns, report and quit - - return + missed_compounds.append(hh) + rt_missed_compounds.append(spec_id) for _ in range(j): hitid.append(str(spec_id.replace(" ", " "))) @@ -170,6 +166,7 @@ out_missed_pdf = open(error_file, 'wb') for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']): + out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n") out_missed_pdf.write('%s\n' % '\t'.join([y, x])) out_missed_pdf.close()
--- a/rankfilter_GCMS/pdftotabular.py Fri Dec 19 11:30:22 2014 +0100 +++ b/rankfilter_GCMS/pdftotabular.py Fri Dec 19 15:30:13 2014 +0100 @@ -27,8 +27,11 @@ @param output_file: output text file for the hits ''' + # "-layout" option in pdftotext call below: Maintain (as best as possible) the original physical layout of the text. The + # default is to 'undo' physical layout (columns, hyphenation, etc.) and output + # the text in reading order. try: - call(["pdftotext", filename, output_file]) + call(["pdftotext", "-layout", filename, output_file]) except: raise Exception("Error while trying to convert PDF to text")
--- a/rankfilter_GCMS/test/test_pdfread.py Fri Dec 19 11:30:22 2014 +0100 +++ b/rankfilter_GCMS/test/test_pdfread.py Fri Dec 19 15:30:13 2014 +0100 @@ -3,13 +3,14 @@ @author: marcelk ''' -from GCMS.rankfilter_GCMS import pdfread # @UnresolvedImport +from GCMS.rankfilter_GCMS import pdfread, pdftotabular # @UnresolvedImport from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 import unittest class Test(unittest.TestCase): + def setUp(self): self.nist_pdf = resource_filename(__name__, "data/NIST_test_PDF.txt") @@ -20,18 +21,44 @@ [hitlist, hitlist_missed] = pdfread.getPDF(self.nist_pdf, True) rows = [hitlist[row] for row in hitlist.keys()] data = [set(row) for row in zip(*rows)] - expected_element = set(('12.3', ' Sucrose ', '14', 'undef', ' standards 2009', ' 660', 'not_def', - '18495-0.142537-21284-2.26544e+07-135', '22.6544', ' 714')) + expected_element = set(('12.3', 'Sucrose', '14', 'undef', 'standards 2009', '660', 'not_def', + '18495-0.142537-21284-2.26544e+07-135', '22.6544', '714')) self.failUnless(expected_element in data) self.failUnless(len(hitlist_missed) != 0) ''' Check for last (dummy) hit: - Hit 6 : (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, LC21H52O6Si5;MF: 658; RMF: 658; Prob 15.6%; CAS: 30645-02-4; Lib: mainlib; ID: 37062. + Hit 6 : (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, L C21H52O6Si5;MF: 658; RMF: 658; Prob 15.6%; CAS: 30645-02-4; Lib: mainlib; ID: 37062. ''' - expected_element = set(['C21H52O6Si5', ' 30645-02-4', ' mainlib', '15.6', ' (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, L C21H52O6Si5', '7298-1-9580-1.29014e+07-9', ' 658', '12.9014', '37062']) + expected_element = set(['C21H52O6Si5', '30645-02-4', 'mainlib', '15.6', '(dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, L C21H52O6Si5', '7298-1-9580-1.29014e+07-9', '658', '12.9014', '37062']) self.failUnless(expected_element in data) + + def test_pdftotabular(self): + #pdftotabular.convert_pdftotext(resource_filename(__name__, "data/Coffee_suntory_without spectra.pdf"), "Coffee_suntory_without spectra2.txt") + pdfread.convert_pdftotext2tabular(resource_filename(__name__, "data/testfile_2.txt"), + resource_filename(__name__, "data/testfile_2.tab"), + resource_filename(__name__, "data/testfile_2.log"), False) + + #read both the reference file and actual output files + expected = _read_file(resource_filename(__name__, "data/testfile_2.tab_ref")) + actual = _read_file(resource_filename(__name__, "data/testfile_2.tab")) + + #convert the read in files to lists we can compare + expected = expected.split() + actual = actual.split() + + for exp, act in zip(expected, actual): + # compare values + self.failUnlessEqual(exp, act) if __name__ == "__main__": #import sys;sys.argv = ['', 'Test.test_getPDF'] unittest.main() + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read()
--- a/static_resources/elements_and_masses.tab Fri Dec 19 11:30:22 2014 +0100 +++ b/static_resources/elements_and_masses.tab Fri Dec 19 15:30:13 2014 +0100 @@ -1,6 +1,5 @@ Name Atomic number Chemical symbol Relative atomic mass Hydrogen 1 H 1.01 -Deutrium ? D 2.014 Helium 2 He 4 Lithium 3 Li 6.94 Beryllium 4 Be 9.01