changeset 26:637830ac8bcd

added validation in metexp to tabular tool; added workaround/fix for L and D compound types
author pieter.lukasse@wur.nl
date Thu, 24 Apr 2014 11:28:38 +0200
parents ab7f9ec70ffc
children ecd3f6c9e606
files MsClust.jar export_to_metexp_tabular.xml rankfilter_GCMS/pdfread.py rankfilter_GCMS/test/test_pdfread.py test/test_query_mass_repos.py
diffstat 5 files changed, 33 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
Binary file MsClust.jar has changed
--- a/export_to_metexp_tabular.xml	Fri Apr 04 10:25:19 2014 +0200
+++ b/export_to_metexp_tabular.xml	Thu Apr 24 11:28:38 2014 +0200
@@ -1,6 +1,6 @@
 <tool id="export_to_metexp_tabular" 
     name="METEXP - Tabular file" 
-    version="0.1.0">
+    version="0.2.0">
   <description>Create tabular file for loading into METabolomics EXPlorer database</description>
   <command interpreter="python">
     export_to_metexp_tabular.py $rankfilter_and_caslookup_combi $msclust_quant_file $output_result 
@@ -15,22 +15,33 @@
     	
    <param name="organism" type="text" size="80"
            label="Organism(s) info"
-           help="Metadata information to accompany the results when stored in MetExp DB." /> 	
+           help="Metadata information to accompany the results when stored in MetExp DB." >
+           <validator type="empty_field" message="A value is required."></validator><!-- attribute optional="False" does not seem to work for params so validator is added -->
+    </param>
+            	
    <param name="tissue" type="text" size="80"
            label="Tissue(s) info"
-           help="Metadata information to accompany the results when stored in MetExp DB." />
+           help="Metadata information to accompany the results when stored in MetExp DB."  >
+           <validator type="empty_field" message="A value is required."></validator>
+    </param>
            
    <param name="experiment_name" type="text" size="80"
            label="Experiment name/code"
-           help="Name or code to store the results under. This can help you find the results back in MetExpDB." />  
+           help="Name or code to store the results under. This can help you find the results back in MetExpDB."  >
+           <validator type="empty_field" message="A value is required."></validator>
+    </param>
            
    <param name="user_name" type="text" size="80"
            label="User name"
-           help="User name or code to store the results under. This can help you find the results back in MetExpDB." />  
+           help="User name or code to store the results under. This can help you find the results back in MetExpDB."  >
+           <validator type="empty_field" message="A value is required."></validator>
+    </param>
                    
     <param name="column_type" type="text" size="80"
            label="Column type"
-           help="Column type to report with the results. This can help you find the results back in MetExpDB." />    
+           help="Column type to report with the results. This can help you find the results back in MetExpDB."  >
+           <validator type="empty_field" message="A value is required."></validator>
+    </param>
     
   </inputs>
   <outputs>
--- a/rankfilter_GCMS/pdfread.py	Fri Apr 04 10:25:19 2014 +0200
+++ b/rankfilter_GCMS/pdfread.py	Thu Apr 24 11:28:38 2014 +0200
@@ -52,8 +52,9 @@
     for line in hit_list:
         line = line.strip().translate(None, '\r')
         if line != '':
-            hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')
-
+            hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')  #solution? : if we wouldn't replace the \n by ' ' but by some special sign, then reading formula would be simpler! 
+                                                                                                #strange....code seems fine actually...debug! See test/data/download.pdf 
+                                                                                                # strange thing is that it looks like the new line does not end up in the text file, even though it looks like there is a new line in the pdf... perhaps a bug in the pdf2text command in linux?
             spec_id = hits.pop(0).split(' ')[1]
             j = 0
             for hh in hits:
@@ -69,8 +70,13 @@
                             name_tmp = ':'.join(cell[0].split(':')[1:])
                         else:
                             name_tmp = cell[0].split(':')[1]
+                            
+                        # ugly workaround for the cases where there ends up being no space between the name and the formula: exhaustive
+                        # replacement of known cases by the same text with a white space inserted:
                         name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
                         name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
+                        name_tmp = name_tmp.replace('-, LC', '-, L C').replace('-, DC', '-, D C')
+                        
                         name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace("  ", " "))
                         if name_tmp:
                             if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H':
--- a/rankfilter_GCMS/test/test_pdfread.py	Fri Apr 04 10:25:19 2014 +0200
+++ b/rankfilter_GCMS/test/test_pdfread.py	Thu Apr 24 11:28:38 2014 +0200
@@ -24,6 +24,13 @@
         '18495-0.142537-21284-2.26544e+07-135', '22.6544', ' 714'))
         self.failUnless(expected_element in data)
         self.failUnless(len(hitlist_missed) != 0)
+        '''
+        Check for last (dummy) hit:  
+        Hit 6 : (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, LC21H52O6Si5;MF: 658; RMF: 658; Prob 15.6%; CAS: 30645-02-4; Lib: mainlib; ID: 37062.
+        '''
+        expected_element = set(['C21H52O6Si5', ' 30645-02-4', ' mainlib', '15.6', ' (dummy hit)Sorbopyranose, 1,2,3,4,5-pentakis-O-(trimethylsilyl)-, L C21H52O6Si5', '7298-1-9580-1.29014e+07-9', ' 658', '12.9014', '37062'])
+        self.failUnless(expected_element in data)
+        
 
 if __name__ == "__main__":
     #import sys;sys.argv = ['', 'Test.test_getPDF']
--- a/test/test_query_mass_repos.py	Fri Apr 04 10:25:19 2014 +0200
+++ b/test/test_query_mass_repos.py	Thu Apr 24 11:28:38 2014 +0200
@@ -31,7 +31,7 @@
         
         input_file = resource_filename(__name__, "data/service_query_tabular.txt")
 
-        molecular_mass_col = "MM"
+        molecular_mass_col = "mass (Da)"
         dblink_file = resource_filename(__name__, "data/MFSearcher ExactMassDB service.txt")
         output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt")