changeset 14:346ff9ad8c7a

fix for rankfilter, removed pdf read functionality
author linda.bakker@wur.nl <linda.bakker@wur.nl>
date Fri, 20 Mar 2015 17:10:04 +0100
parents 5a753524e525
children 05ff1c55db84
files GCMS/combine_output.py GCMS/combine_output.xml metaMS/README.txt rankfilter_GCMS/rankfilter.py
diffstat 4 files changed, 54 insertions(+), 54 deletions(-) [+]
line wrap: on
line diff
--- a/GCMS/combine_output.py	Thu Mar 19 15:04:56 2015 +0100
+++ b/GCMS/combine_output.py	Fri Mar 20 17:10:04 2015 +0100
@@ -81,7 +81,6 @@
     # The ID in the RankFilter output contains the following 5 fields:
     rf_id = rankfilter['ID'].split('-')
     try:
-        name, formula = _remove_formula(rankfilter['Name'])
         hit = [rf_id[0], # Centrotype
                rf_id[1], # cent.Factor
                rf_id[2], # scan nr
@@ -89,8 +88,8 @@
                rf_id[4], # nr. Peaks
                # Appending other fields
                rankfilter['R.T.'],
-               name,
-               caslookup['FORMULA'] if not formula else formula,
+               rankfilter['Name'],
+               rankfilter['Formula'],
                rankfilter['Library'].strip(),
                rankfilter['CAS'].strip(),
                rankfilter['Forward'],
@@ -120,18 +119,7 @@
     return hit
 
 
-def _remove_formula(name):
-    '''
-    The RankFilter Name field often contains the Formula as well, this function removes it from the Name
-    @param name: complete name of the compound from the RankFilter output
-    '''
-    name = name.split()
-    poss_formula = name[-1]
-    match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
-    if match:
-        return ' '.join(name[:-1]), poss_formula
-    else:
-        return ' '.join(name), False
+
 
 
 def _get_default_caslookup():
--- a/GCMS/combine_output.xml	Thu Mar 19 15:04:56 2015 +0100
+++ b/GCMS/combine_output.xml	Fri Mar 20 17:10:04 2015 +0100
@@ -17,6 +17,10 @@
   <help>
 Performs a combination of output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files.
 
+Merges data from both input dictionaries based on the Centrotype field.
+In the 'RIQC-RankFilter output' the centrotype is found in the 'ID' field (first part before the "-"). In the 'RIQC-Lookup RI for CAS output'
+the centrotype is found in the 'Centrotype' field. 
+
 The files produced contain either all hits for a compound on a single line (Single) or on separate lines 
 (Multi). 
 
--- a/metaMS/README.txt	Thu Mar 19 15:04:56 2015 +0100
+++ b/metaMS/README.txt	Fri Mar 20 17:10:04 2015 +0100
@@ -1,70 +1,74 @@
 Wrappers for:
-- the metaMS R package by Ron Wehrens.
-- the xcms package. 
+- the metaMS R package by Ron Wehrens (https://github.com/rwehrens/metaMS.git)
+- the xcms package (https://xcmsonline.scripps.edu/, http://www.bioconductor.org/packages/release/bioc/html/xcms.html)
+- the CAMERA tool ( http://www.bioconductor.org/packages/release/bioc/html/CAMERA.html)
 
 Wrappers written by Pieter Lukasse. 
 
 
-Installation  (when updating: close all vignettes [i.e. pdfs/manuals] !)
+=======Installation  (when updating: close all vignettes [i.e. pdfs/manuals] !)============
 
-In R:
+In R execute:
 source("http://bioconductor.org/biocLite.R")
 biocLite("metaMS")
 biocLite("multtest")
-#biocLite("R2HTML")
+
 # for "multi-threading"  (actually starts multiple R processes for parallel processing):
 install.packages("snow")
 install.packages("Cairo")
 
+>> Running the wrappers: go to <this> directory and execute: 
+
 ======Run metaMS_cmd_pick_and_group.r with:=================
 
-E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript metaMS_cmd_pick_and_group.r test/extdata.zip test/example_settings.txt test/out/peakTable.txt test/out/xsAnnotatePrep.rdata positive test/out/html_peaks.html test/out 
+Rscript metaMS_cmd_pick_and_group.r test/extdata.zip test/example_settings.txt test/out/peakTable.txt test/out/xsAnnotatePrep.rdata positive test/out/html_peaks.html test/out 
 
 
 ======Run metaMS_cmd_annotate.r with:=================
 
-E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript metaMS_cmd_annotate.r test/LCDBtest.RData test/out/xsAnnotatePrep.rdata test/example_settings.txt test/out/annotationTable.txt "0" test/out/html_annot.html test/out
+Rscript metaMS_cmd_annotate.r test/LCDBtest.RData test/out/xsAnnotatePrep.rdata test/example_settings.txt test/out/annotationTable.txt "0" test/out/html_annot.html test/out
 
 
 ======Run xcms_differential_analysis.r with:=================
 
-E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_differential_analysis.r test/out/xsAnnotatePrep.rdata "CLASS1" "CLASS2" 10 test/out2/outtable.tsv test/out2/html/html.html test/out2/html
+Rscript xcms_differential_analysis.r test/out/xsAnnotatePrep.rdata "CLASS1" "CLASS2" 10 test/out2/outtable.tsv test/out2/html/html.html test/out2/html
 
 
 ======Run xcms_get_eic.r with:=================
 
 
-E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_get_alignment_eic.r test/out/xsAnnotatePrep.rdata 10 300 3 STDmix_GC_01,STDmix_GC_02 test/out3/html/html.html test/out3/html
+Rscript xcms_get_alignment_eic.r test/out/xsAnnotatePrep.rdata 10 300 3 STDmix_GC_01,STDmix_GC_02 test/out3/html/html.html test/out3/html
 
 OR
 
-E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_get_mass_eic.r test/out/xsAnnotatePrep.rdata 10 3000 -1 -1 "77.98,231.96" 5 STDmix_GC_01,STDmix_GC_02 Yes raw test/out4/html/html.html test/out4/html
+Rscript xcms_get_mass_eic.r test/out/xsAnnotatePrep.rdata 10 3000 -1 -1 "77.98,231.96" 5 STDmix_GC_01,STDmix_GC_02 Yes raw test/out4/html/html.html test/out4/html
 
 
 
 !!!!!!!!!!!!!!!!Troubleshooting:!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
 
-NetCDF is required. If the following is found in the installation.log 
+(1) NetCDF is required. If the following is found in the installation.log : 
 
-In file included from rnetCDF.c:2:0:
-rnetCDF.h:1:20: fatal error: netcdf.h: No such file or directory
-compilation terminated.
+	In file included from rnetCDF.c:2:0:
+	rnetCDF.h:1:20: fatal error: netcdf.h: No such file or directory
+	compilation terminated.
 
 
 then metaMS will not have been installed and running the tool will result in error: 
-<simpleError in library(metaMS): there is no package called 'metaMS'>
+	
+	<simpleError in library(metaMS): there is no package called 'metaMS'>
 
 Possible solution:
-> Install the -dev of those packages to get the headers that are 
-  required to compile the package. In this case, you need libnetcdf-dev, udunits-bin and libudunits2-dev
-  (from http://stackoverflow.com/questions/11319698/how-to-install-r-packages-rnetcdf-and-ncdf-on-ubuntu)
+	> Install the -dev of those packages to get the headers that are 
+	  required to compile the package. In this case, you need libnetcdf-dev, udunits-bin and libudunits2-dev
+	  (from http://stackoverflow.com/questions/11319698/how-to-install-r-packages-rnetcdf-and-ncdf-on-ubuntu)
 So 
->>sudo apt-get install libnetcdf-dev, udunits-bin and libudunits2-dev
+	>>sudo apt-get install libnetcdf-dev udunits-bin libudunits2-dev
   
-Cairo / no X11 mode is required. 
+(2) Cairo / "no X11" (headless) mode is required. 
 
 Possible solution:
-> install of cairo (http://www.cairographics.org/) and/or set CAIRO_CFLAGS/LIBS correspondingly.  
+	> install of cairo (http://www.cairographics.org/) and/or set CAIRO_CFLAGS/LIBS correspondingly.  
 So 
->>sudo apt-get install libcairo2-dev
+	>>sudo apt-get install libcairo2-dev
--- a/rankfilter_GCMS/rankfilter.py	Thu Mar 19 15:04:56 2015 +0100
+++ b/rankfilter_GCMS/rankfilter.py	Fri Mar 20 17:10:04 2015 +0100
@@ -25,9 +25,8 @@
 from numpy import array, linalg, ones
 from numpy.polynomial import polynomial
 import math
-import pdfread
 import sys
-
+import csv
 
 def calibrate(standards):
     '''
@@ -269,6 +268,19 @@
 
         i = i + 1
 
+def read_tabular(in_csv):
+    '''
+    Parses a tab-separated file; returns a dict mapping each name in the first (header) row to the list of that column's values
+    @param in_csv: input filename to be parsed (NOTE: the file is opened inline here and never explicitly closed)
+    '''
+    data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
+    header = data.pop(0)
+    # Build one list per column, keyed by its header name; a data row shorter than the header raises IndexError
+    output = {}
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
 #---------End--------------
 def main():
     #Ranking and filtering procedure
@@ -381,18 +393,15 @@
     if InputData['analysis_type'] == 'NIST':
         #HitList_missed - a variable of type dictionary containing the hits with the symbol ";"
         #in the name
-        if not NDIS_is_tabular:
-            print "Warning; NDIS is not tabular format, reading PDF..\n"
-            [HitList, HitList_missed] = pdfread.getPDF(InputData['sample'])
-        else:
-            HitList = pdfread.read_tabular(InputData['sample'])
+        # HITLIST = the NIST results file given here as input:
+        HitList = read_tabular(InputData['sample'])
 
     #Convert RT to RI
     if InputData['model'] == 'linear':
             HitList = convert_rt(HitList, coeff)
     if InputData['model'] == 'poly':
-            print "Executing convert_rt_poly().."
-            HitList = convert_rt_poly(HitList, poly_cal)
+        print "Executing convert_rt_poly().."
+        HitList = convert_rt_poly(HitList, poly_cal)
 
     #------Read the library data with the predicted RI------
     try:
@@ -415,16 +424,11 @@
     #------Print the ranked and filtered hits------
     #Specify which data to be printed
     if InputData['analysis_type'] == 'AMDIS':
-        keys_to_print = ['R.T.', 'CAS', 'Name', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Weighted', 'Reverse', 'Synonyms']
+        keys_to_print = ['R.T.', 'CAS', 'Name', 'Formula','Rank', 'RIexp', 'RIsvr', '%rel.err', 'Weighted', 'Reverse', 'Synonyms']
     else:
-        keys_to_print = ['ID', 'R.T.', 'Name', 'CAS', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Forward', 'Reverse', 'Synonyms', 'Library']
+        keys_to_print = ['ID', 'R.T.', 'Name', 'Formula', 'CAS', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Forward', 'Reverse', 'Synonyms', 'Library']
 
-    #skip this error output from reading a pdftotext file when file is tabular     
-    if InputData['analysis_type'] == 'NIST' and not NDIS_is_tabular:
-        out_missed_pdf = open(output_files['missed_parse_pdf'], 'w')
-        for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
-            out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
-        out_missed_pdf.close()
+
 
     print_to_file(HitList, output_files, keys_to_print, print_subsets)