Mercurial > repos > pieterlukasse > prims_metabolomics

diff rankfilter_GCMS/pdfread.py @ 61:d685210eef3e
fix in pdftotabular tool
author: pieter.lukasse@wur.nl
date: Fri, 19 Dec 2014 15:30:13 +0100
parents: 35f506f30ae4
--- a/rankfilter_GCMS/pdfread.py	Fri Dec 19 11:30:22 2014 +0100
+++ b/rankfilter_GCMS/pdfread.py	Fri Dec 19 15:30:13 2014 +0100
@@ -64,52 +64,17 @@
                 line_id += 1
                 if len(cell) == 7:  # the compound has CAS number
                     if len(cell[1].split(':')) == 2:
-                        forward.append(cell[1].split(':')[1])
+                        forward.append((cell[1].split(':')[1]).strip())
                         # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
                         if len(cell[0].split(':')) > 2:
                             name_tmp = ':'.join(cell[0].split(':')[1:])
                         else:
                             name_tmp = cell[0].split(':')[1]
                             
-                        # uggly workaround for the cases where there ends up to be no space between the name and the formula: exaustive
-                        # replaces of known cases by the same with a white space:    
-                        name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
-                        name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
-                        name_tmp = name_tmp.replace(', LC', ', L C').replace(', DC', ', D C')
-                        
-                        name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace("  ", " "))
-                        if name_tmp:
-                            if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H':
-                                formule = (name_tmp.split(' ')[-1])
-                            else:
-                                formule = ('not_def')
-                        else:
-                            formule = ('not_def')
-                        formula.append(formule.replace("  ", " "))
-                        reverse.append(cell[2].split(':')[1])
-                        prob.append(cell[3].split(' ')[2].replace('%', ''))
-                        cas.append(cell[4].split(':')[1])
-                        lib_id.append(cell[5].split(':')[1])
-                        nist_id.append(cell[6].split(':')[1].replace('.', '').strip())
-                        j = j + 1
-                    else:
-                        missed_compounds.append(hh)
-                        rt_missed_compounds.append(spec_id)
-
-                elif len(cell) >= 6:  # the compound has no CAS number
-                    if len(cell[1].split(':')) == 2:
-
-                        forward.append(cell[1].split(':')[1])
-                        # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
-                        if len(cell[0].split(':')) > 2:
-                            name_tmp = ':'.join(cell[0].split(':')[1:])
-                        else:
-                            name_tmp = cell[0].split(':')[1]
-                        name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
-                        name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
-                        name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace("  ", " "))  # "  ", " "
+                        name.append(name_tmp.replace("  ", " ").strip())
                         name_tmp = name_tmp.strip().split(' ')
                         if name_tmp:
+                            # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
                             if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
                                 formule = (name_tmp[-1])
                             else:
@@ -117,10 +82,41 @@
                         else:
                             formule = ('not_def')
                         formula.append(formule.replace("  ", " "))
-                        reverse.append(cell[2].split(':')[1])
+                        reverse.append((cell[2].split(':')[1]).strip())
+                        prob.append(cell[3].split(' ')[2].replace('%', ''))
+                        cas.append((cell[4].split(':')[1]).strip())
+                        lib_id.append((cell[5].split(':')[1]).strip())
+                        nist_id.append(cell[6].split(':')[1].replace('.', '').strip())
+                        j = j + 1
+                    else:
+                        missed_compounds.append(hh)
+                        rt_missed_compounds.append(spec_id)
+
+                elif len(cell) == 6:  # the compound has no CAS number
+                    if len(cell[1].split(':')) == 2:
+
+                        forward.append((cell[1].split(':')[1]).strip())
+                        # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
+                        if len(cell[0].split(':')) > 2:
+                            name_tmp = ':'.join(cell[0].split(':')[1:])
+                        else:
+                            name_tmp = cell[0].split(':')[1]
+                        
+                        name.append(name_tmp.replace("  ", " ").strip())
+                        name_tmp = name_tmp.strip().split(' ')
+                        if name_tmp:
+                            # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
+                            if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
+                                formule = (name_tmp[-1])
+                            else:
+                                formule = ('not_def')
+                        else:
+                            formule = ('not_def')
+                        formula.append(formule.replace("  ", " "))
+                        reverse.append((cell[2].split(':')[1]).strip())
                         prob.append(cell[3].split(' ')[2].replace('%', ''))
                         cas.append('undef')
-                        lib_id.append(cell[4].split(':')[1])
+                        lib_id.append((cell[4].split(':')[1]).strip())
                         nist_id.append(cell[5].split(':')[1].replace('.', '').strip())
                         j = j + 1
 
@@ -129,8 +125,8 @@
                         rt_missed_compounds.append(spec_id)
 
                 else: # Missing columns, report and quit
-                    
-                    return
+                    missed_compounds.append(hh)
+                    rt_missed_compounds.append(spec_id)
 
             for _ in range(j):
                 hitid.append(str(spec_id.replace("  ", " ")))
@@ -170,6 +166,7 @@
 
     out_missed_pdf = open(error_file, 'wb')
     for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
+        out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n")
         out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
     out_missed_pdf.close()
author	pieter.lukasse@wur.nl
date	Fri, 19 Dec 2014 15:30:13 +0100
parents	35f506f30ae4
children