changeset 61:d685210eef3e

fix in pdftotabular tool
author pieter.lukasse@wur.nl
date Fri, 19 Dec 2014 15:30:13 +0100
parents 35f506f30ae4
children 9bd2597c8851
files rankfilter_GCMS/pdfread.py rankfilter_GCMS/pdftotabular.py rankfilter_GCMS/test/test_pdfread.py static_resources/elements_and_masses.tab
diffstat 4 files changed, 75 insertions(+), 49 deletions(-) [+]
line wrap: on
line diff
--- a/rankfilter_GCMS/pdfread.py	Fri Dec 19 11:30:22 2014 +0100
+++ b/rankfilter_GCMS/pdfread.py	Fri Dec 19 15:30:13 2014 +0100
@@ -64,52 +64,17 @@
                 line_id += 1
                 if len(cell) == 7:  # the compound has CAS number
                     if len(cell[1].split(':')) == 2:
-                        forward.append(cell[1].split(':')[1])
+                        forward.append((cell[1].split(':')[1]).strip())
                         # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
                         if len(cell[0].split(':')) > 2:
                             name_tmp = ':'.join(cell[0].split(':')[1:])
                         else:
                             name_tmp = cell[0].split(':')[1]
                             
-                        # uggly workaround for the cases where there ends up to be no space between the name and the formula: exaustive
-                        # replaces of known cases by the same with a white space:    
-                        name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
-                        name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
-                        name_tmp = name_tmp.replace(', LC', ', L C').replace(', DC', ', D C')
-                        
-                        name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace("  ", " "))
-                        if name_tmp:
-                            if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H':
-                                formule = (name_tmp.split(' ')[-1])
-                            else:
-                                formule = ('not_def')
-                        else:
-                            formule = ('not_def')
-                        formula.append(formule.replace("  ", " "))
-                        reverse.append(cell[2].split(':')[1])
-                        prob.append(cell[3].split(' ')[2].replace('%', ''))
-                        cas.append(cell[4].split(':')[1])
-                        lib_id.append(cell[5].split(':')[1])
-                        nist_id.append(cell[6].split(':')[1].replace('.', '').strip())
-                        j = j + 1
-                    else:
-                        missed_compounds.append(hh)
-                        rt_missed_compounds.append(spec_id)
-
-                elif len(cell) >= 6:  # the compound has no CAS number
-                    if len(cell[1].split(':')) == 2:
-
-                        forward.append(cell[1].split(':')[1])
-                        # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
-                        if len(cell[0].split(':')) > 2:
-                            name_tmp = ':'.join(cell[0].split(':')[1:])
-                        else:
-                            name_tmp = cell[0].split(':')[1]
-                        name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
-                        name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
-                        name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace("  ", " "))  # "  ", " "
+                        name.append(name_tmp.replace("  ", " ").strip())
                         name_tmp = name_tmp.strip().split(' ')
                         if name_tmp:
+                            # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
                             if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
                                 formule = (name_tmp[-1])
                             else:
@@ -117,10 +82,41 @@
                         else:
                             formule = ('not_def')
                         formula.append(formule.replace("  ", " "))
-                        reverse.append(cell[2].split(':')[1])
+                        reverse.append((cell[2].split(':')[1]).strip())
+                        prob.append(cell[3].split(' ')[2].replace('%', ''))
+                        cas.append((cell[4].split(':')[1]).strip())
+                        lib_id.append((cell[5].split(':')[1]).strip())
+                        nist_id.append(cell[6].split(':')[1].replace('.', '').strip())
+                        j = j + 1
+                    else:
+                        missed_compounds.append(hh)
+                        rt_missed_compounds.append(spec_id)
+
+                elif len(cell) == 6:  # the compound has no CAS number
+                    if len(cell[1].split(':')) == 2:
+
+                        forward.append((cell[1].split(':')[1]).strip())
+                        # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
+                        if len(cell[0].split(':')) > 2:
+                            name_tmp = ':'.join(cell[0].split(':')[1:])
+                        else:
+                            name_tmp = cell[0].split(':')[1]
+                        
+                        name.append(name_tmp.replace("  ", " ").strip())
+                        name_tmp = name_tmp.strip().split(' ')
+                        if name_tmp:
+                            # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
+                            if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
+                                formule = (name_tmp[-1])
+                            else:
+                                formule = ('not_def')
+                        else:
+                            formule = ('not_def')
+                        formula.append(formule.replace("  ", " "))
+                        reverse.append((cell[2].split(':')[1]).strip())
                         prob.append(cell[3].split(' ')[2].replace('%', ''))
                         cas.append('undef')
-                        lib_id.append(cell[4].split(':')[1])
+                        lib_id.append((cell[4].split(':')[1]).strip())
                         nist_id.append(cell[5].split(':')[1].replace('.', '').strip())
                         j = j + 1
 
@@ -129,8 +125,8 @@
                         rt_missed_compounds.append(spec_id)
 
                 else: # Missing columns, report and quit
-                    
-                    return
+                    missed_compounds.append(hh)
+                    rt_missed_compounds.append(spec_id)
 
             for _ in range(j):
                 hitid.append(str(spec_id.replace("  ", " ")))
@@ -170,6 +166,7 @@
 
     out_missed_pdf = open(error_file, 'wb')
     for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
+        out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n")
         out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
     out_missed_pdf.close()
 
--- a/rankfilter_GCMS/pdftotabular.py	Fri Dec 19 11:30:22 2014 +0100
+++ b/rankfilter_GCMS/pdftotabular.py	Fri Dec 19 15:30:13 2014 +0100
@@ -27,8 +27,11 @@
     @param output_file: output text file for the hits    
     '''
     
+    # "-layout" option in pdftotext call below: Maintain (as best as possible) the original physical layout of the text. The 
+    #                                           default is to 'undo' physical layout (columns, hyphenation, etc.) and output 
+    #                                           the text in reading order.
     try:
-        call(["pdftotext", filename, output_file])
+        call(["pdftotext", "-layout", filename, output_file])
     except:
         raise Exception("Error while trying to convert PDF to text")
    
--- a/rankfilter_GCMS/test/test_pdfread.py	Fri Dec 19 11:30:22 2014 +0100
+++ b/rankfilter_GCMS/test/test_pdfread.py	Fri Dec 19 15:30:13 2014 +0100
@@ -3,13 +3,14 @@
 
 @author: marcelk
 '''
-from GCMS.rankfilter_GCMS import pdfread  # @UnresolvedImport
+from GCMS.rankfilter_GCMS import pdfread, pdftotabular  # @UnresolvedImport
 from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
 import unittest
 
 
 class Test(unittest.TestCase):
 
+
     def setUp(self):
         self.nist_pdf = resource_filename(__name__, "data/NIST_test_PDF.txt")
 
@@ -20,18 +21,44 @@
         [hitlist, hitlist_missed] = pdfread.getPDF(self.nist_pdf, True)
         rows = [hitlist[row] for row in hitlist.keys()]
         data = [set(row) for row in zip(*rows)]
-        expected_element = set(('12.3', ' Sucrose ', '14', 'undef', ' standards 2009', ' 660', 'not_def',
-        '18495-0.142537-21284-2.26544e+07-135', '22.6544', ' 714'))
+        expected_element = set(('12.3', 'Sucrose', '14', 'undef', 'standards 2009', '660', 'not_def',
+        '18495-0.142537-21284-2.26544e+07-135', '22.6544', '714'))
         self.failUnless(expected_element in data)
         self.failUnless(len(hitlist_missed) != 0)
         '''
         Check for last (dummy) hit:  
-        Hit 6 : (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, LC21H52O6Si5;MF: 658; RMF: 658; Prob 15.6%; CAS: 30645-02-4; Lib: mainlib; ID: 37062.
+        Hit 6 : (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, L C21H52O6Si5;MF: 658; RMF: 658; Prob 15.6%; CAS: 30645-02-4; Lib: mainlib; ID: 37062.
         '''
-        expected_element = set(['C21H52O6Si5', ' 30645-02-4', ' mainlib', '15.6', ' (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, L C21H52O6Si5', '7298-1-9580-1.29014e+07-9', ' 658', '12.9014', '37062'])
+        expected_element = set(['C21H52O6Si5', '30645-02-4', 'mainlib', '15.6', '(dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, L C21H52O6Si5', '7298-1-9580-1.29014e+07-9', '658', '12.9014', '37062'])
         self.failUnless(expected_element in data)
         
+        
+    def test_pdftotabular(self):
+        #pdftotabular.convert_pdftotext(resource_filename(__name__, "data/Coffee_suntory_without spectra.pdf"), "Coffee_suntory_without spectra2.txt")
+        pdfread.convert_pdftotext2tabular(resource_filename(__name__, "data/testfile_2.txt"),
+                                          resource_filename(__name__, "data/testfile_2.tab"),
+                                          resource_filename(__name__, "data/testfile_2.log"), False)
+        
+        # read both the reference file and the actual output file
+        expected = _read_file(resource_filename(__name__, "data/testfile_2.tab_ref"))
+        actual = _read_file(resource_filename(__name__, "data/testfile_2.tab"))
+        
+        # split the contents of both files on whitespace into lists we can compare
+        expected = expected.split()
+        actual = actual.split()
+
+        for exp, act in zip(expected, actual):
+            # compare values
+            self.failUnlessEqual(exp, act)
 
 if __name__ == "__main__":
     #import sys;sys.argv = ['', 'Test.test_getPDF']
     unittest.main()
+    
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename:
+    '''
+    with open(filename) as handle:
+        return handle.read()    
--- a/static_resources/elements_and_masses.tab	Fri Dec 19 11:30:22 2014 +0100
+++ b/static_resources/elements_and_masses.tab	Fri Dec 19 15:30:13 2014 +0100
@@ -1,6 +1,5 @@
 Name	Atomic number	Chemical symbol	Relative atomic mass
 Hydrogen	1	H	1.01
-Deutrium	?	D	2.014
 Helium	2	He	4
 Lithium	3	Li	6.94
 Beryllium	4	Be	9.01