changeset 0:9d5f4f5f764b

Initial commit to toolshed
author pieter.lukasse@wur.nl
date Thu, 16 Jan 2014 13:10:00 +0100
parents
children e467a6c83d67
files LICENSE MsClust.jar NOTICE README.rst Rscripts/filter-RIDB.R Rscripts/ridb-regression.R __init__.py combine_output.py combine_output.xml create_model.xml datatypes_conf.xml export_to_metexp_tabular.py library_lookup.py library_lookup.xml match_library.py msclust2.0.1.xml rankfilterGCMS_tabular.xml rankfilter_GCMS/__init__.py rankfilter_GCMS/pdfread.py rankfilter_GCMS/pdftotabular.py rankfilter_GCMS/rankfilter.py rankfilter_GCMS/test/__init__.py rankfilter_GCMS/test/test_pdfread.py rankfilter_GCMS/test/test_rankfilter.py rankfilter_text2tabular.xml select_on_rank.py select_on_rank.xml static/images/confidence_and_slope_params_explain.png static/images/msclust_summary.png static/images/sample_SIM.png static/images/sample_sel_and_peak_height_correction.png test/__init__.py test/integration_tests.py test/test_combine_output.py test/test_export_to_metexp_tabular.py test/test_library_lookup.py test/test_match_library.py
diffstat 36 files changed, 3481 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
Binary file MsClust.jar has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/NOTICE	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,14 @@
+PRIMS-metabolomics toolset & Galaxy wrappers
+============================================
+ 
+Metabolomics module of Plant Research International's Mass Spectrometry (PRIMS) toolsuite. 
+This toolset consists of custom tools to enable metabolite identifications and 
+Retention Index (RI) based Quality Control (RIQC) for Mass Spectrometry metabolomics data. 
+
+Copyright:
+* 2012: NIST_UTIL and RIQC tools: Copyright (c) 2012 Maarten Kooyman and Marcel Kempenaar, NBIC BRS
+* 2013: all tools: Copyright (c) 2013 by Pieter Lukasse, Plant Research International (PRI), 
+  Wageningen, The Netherlands. All rights reserved. See the license text below.
+
+Galaxy wrappers and installation are available from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/pieterlukasse/prims_metabolomics
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,71 @@
+PRIMS-metabolomics toolset & Galaxy wrappers
+============================================
+ 
+Metabolomics module of Plant Research International's Mass Spectrometry (PRIMS) toolsuite. 
+This toolset consists of custom tools to enable metabolite identifications and 
+Retention Index (RI) based Quality Control (RIQC) for Mass Spectrometry metabolomics data. 
+
+Copyright:
+* 2012: NIST_UTIL and RIQC tools: Copyright (c) 2012 Maarten Kooyman and Marcel Kempenaar, NBIC BRS
+* 2013: all tools: Copyright (c) 2013 by Pieter Lukasse, Plant Research International (PRI), 
+  Wageningen, The Netherlands. All rights reserved. See the license text below.
+
+Galaxy wrappers and installation are available from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/pieterlukasse/prims_metabolomics
+
+History
+=======
+
+============== ======================================================================
+Date            Changes
+-------------- ----------------------------------------------------------------------
+January 2014   * first release via Tool Shed, combining the RIQC and MsClust in a 
+                 single package (this package)
+               * integration with METEXP software (data store for metabolomics 
+                 experiments with respective metadata and identifications)  
+2013           * hand-over of the NIST_UTIL and RIQC tools from the NBIC team to  
+                 Plant Research International  
+2012           * development of MsClust 2.0, making it also suitable for Galaxy 
+<2011          * development and publication of MsClust 1.0
+============== ======================================================================
+
+Tool Versioning
+===============
+
+PRIMS tools will have versions of the form X.Y.Z. Versions
+differing only in the last number (Z) should be completely
+compatible with each other; for example, version 1.0.2 can safely
+replace 1.0.1. Breaking changes should result in an increment of
+the X and/or Y number. All tools of version less than 1.0.0 should
+be considered beta.
+
+
+Bug Reports & other questions
+=============================
+
+For the time being, issues can be reported via the contact form at:
+http://www.wageningenur.nl/en/Persons/PNJ-Pieter-Lukasse.htm
+
+Developers, Contributions & Collaborations 
+==========================================
+
+If you wish to join forces and collaborate on some of the 
+tools, do not hesitate to contact Pieter Lukasse via the contact form above. 
+
+
+License (Apache, Version 2.0)
+=============================
+
+Copyright 2013 Pieter Lukasse, Plant Research International (PRI). 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this software except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+     
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Rscripts/filter-RIDB.R	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,56 @@
+##
+#
+# Removes duplicates from a RI-database
+#
+# Usage:
+#       Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt
+#
+##
+
+# Commandline arguments
+args  <- commandArgs(TRUE)
+ridb <- args[1]
+out_file <- args[2]
+
+# Function to check duplicates
+duplicates <- function(dat) { 
+     s <- do.call("order", as.data.frame(dat)) 
+     non.dup <- !duplicated(dat[s, ]) 
+     orig.ind <- s[non.dup] 
+     first.occ <- orig.ind[cumsum(non.dup)] 
+     first.occ[non.dup] <- NA 
+     first.occ[order(s)]
+}
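+
+# A minimal illustration (hypothetical data): for a data frame whose rows are
+# (a, a, b), duplicates() returns c(NA, 1, NA) -- row 2 is a duplicate of
+# row 1, while first occurrences and unique rows map to NA.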
+
+# Load CSV file
+ridb <- read.csv(ridb,header=TRUE, sep="\t")
+## Filters on: CAS FORMULA Column type Column phase type Column name
+filter_cols <- c(1, 3, 5, 6, 7)
+cat("RIDB dimensions: ")
+print(dim(ridb))
+deleted <- NULL
+cat("Checking for duplicates...")
+dups <- duplicates(ridb[,filter_cols])
+cat("\t[DONE]\nRemoving duplicates...")
+newridb <- ridb
+newridb["min"] <- NA
+newridb["max"] <- NA
+newridb["orig.columns"] <- NA
+for (i in unique(dups)) {
+    if (!is.na(i)) {
+        rows <- which(dups == i)
+        duprows <- ridb[c(i, rows),]
+        # Replace duplicate rows with one row containing the median RI value
+        newridb$RI[i] <- median(duprows$RI)
+        newridb$min[i] <- min(duprows$RI)
+        newridb$max[i] <- max(duprows$RI)
+        newridb$orig.columns[i] <- paste(rows, collapse=",")
+        deleted <- c(deleted, rows)
+    }
+}
+cat("\t\t[DONE]\nCreating new dataset...")
+out_ridb <- newridb[-deleted,]
+cat("\t\t[DONE]\nWriting new dataset...")
+write.table(out_ridb, na='', file=out_file, quote=T, sep="\t", row.names=F)
+cat("\t\t[DONE]\n")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Rscripts/ridb-regression.R	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,208 @@
+##
+#
+# Performs regression analysis using either a 3rd-degree polynomial or a linear method
+#
+##
+
+# Commandline arguments
+args  <- commandArgs(TRUE)
+if (length(args) < 9)
+	stop(paste("Missing arguments, usage:\n\tRscript ridb-regression.R RI-database ",
+             "output_file logfile min_residuals range_mod pvalue rsquared method ",
+             "plot(true/false) plot_archive"))
+
+ridb <- args[1]
+out_file <- args[2]
+logfile <- args[3]
+min_residuals <- as.integer(args[4])
+range_mod <- as.integer(args[5])
+pvalue <- as.double(args[6])
+rsquared <- as.double(args[7])
+method <- args[8]
+plot <- tolower(args[9])
+if (plot == 'true')
+	plot_archive = args[10]
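+
+# Example invocation (hypothetical paths):
+#   Rscript ridb-regression.R ridb.txt model.txt run.log 10 0 0.05 0.95 poly true models.zip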
+
+# Do not show warnings etc.
+sink(file='/dev/null')
+
+progress <- c()
+logger <- function(logdata) {
+	## Logs progress, adds a timestamp for each event
+	#cat(paste(Sys.time(), "\t", logdata, "\n", sep="")) ## DEBUG
+	progress <<- c(progress, paste(Sys.time(), "\t", logdata, sep=""))
+}
+
+logger("Reading Retention Index Database..")
+
+# Read Retention Index Database
+ridb <- read.csv(ridb, header=TRUE, sep="\t")
+logger(paste("\t", nrow(ridb), "records read.."))
+# Get a unique list 
+gc_columns <- unique(as.vector(as.matrix(ridb['Column.name'])[,1]))
+cas_numbers <- unique(as.vector(as.matrix(ridb['CAS'])[,1]))
+
+add_poly_fit <- function(fit, gc1_index, gc2_index, range) {
+	pval = anova.lm(fit)$Pr
+	r.squared = summary(fit)$r.squared
+
+	data = rep(NA, 11)
+	# Append results to matrix
+	data[1] = gc_columns[gc1_index] # Column 1
+	data[2] = gc_columns[gc2_index] # Column 2
+	data[3] = coefficients(fit)[1]  # The 4 coefficients
+	data[4] = coefficients(fit)[2]
+	data[5] = coefficients(fit)[3]
+	data[6] = coefficients(fit)[4]
+	data[7] = range[1]              # Left limit
+	data[8] = range[2]              # Right limit
+	data[9] = length(fit$residuals) # Number of datapoints analysed
+	data[10] = pval[1]              # p-value for resulting fitting
+	data[11] = r.squared            # R-squared
+	return(data)
+}
+
+
+add_linear_fit <- function(fit, gc1_index, gc2_index, range) {
+	pval = anova.lm(fit)$Pr
+	r.squared = summary(fit)$r.squared
+	
+	data = rep(NA, 9)
+	# Append results to matrix
+	data[1] = gc_columns[gc1_index] # Column 1
+	data[2] = gc_columns[gc2_index] # Column 2
+	data[3] = coefficients(fit)[1]  # The 2 coefficients
+	data[4] = coefficients(fit)[2]
+	data[7] = length(fit$residuals) # Number of datapoints analysed
+	data[8] = pval[1]               # p-value for resulting fitting
+	data[9] = r.squared             # R-squared
+	return(data)
+}
+
+
+add_fit <- function(fit, gc1_index, gc2_index, range, method) {
+	if (method == 'poly')
+		return(add_poly_fit(fit, gc1_index, gc2_index, range))
+	else
+		return(add_linear_fit(fit, gc1_index, gc2_index, range))
+}
+
+
+plot_fit <- function(ri1, ri2, gc1_index, gc2_index, coeff, range, method) {
+    if (method == 'poly')
+        pol <- function(x) coeff[4]*x^3 + coeff[3]*x^2 + coeff[2]*x + coeff[1]
+    else
+        pol <- function(x) coeff[2]*x + coeff[1]
+    pdf(paste('regression_model_',
+              make.names(gc_columns[gc1_index]), '_vs_', 
+              make.names(gc_columns[gc2_index]), '.pdf', sep=''))
+    curve(pol, from=250, to=3750, col="red", lwd=2.5, main='Regression Model', xlab=gc_columns[gc1_index],
+          ylab=gc_columns[gc2_index], xlim=c(250, 3750), ylim=c(250, 3750))
+    points(ri1, ri2, lwd=0.4)
+    # Add vertical lines showing left- and right limits when using poly method
+    if (method == 'poly')
+        abline(v=range, col="grey", lwd=1.5)
+    dev.off()
+}
+
+# Initialize output dataframe
+if (method == 'poly') {
+    m <- data.frame(matrix(ncol = 11, nrow = 10))
+} else {
+    m <- data.frame(matrix(ncol = 9, nrow = 10))
+}
+
+
+get_fit <- function(gc1, gc2, method) {
+	if (method == 'poly')
+		return(lm(gc1 ~ poly(gc2, 3, raw=TRUE)))
+	else
+		return(lm(gc1 ~ gc2))
+}
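+
+# For reference, the two model forms fitted above are (with c1..c4 as
+# returned by coefficients(fit)):
+#   poly:   gc1 = c1 + c2*gc2 + c3*gc2^2 + c4*gc2^3
+#   linear: gc1 = c1 + c2*gc2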
+
+# Iterate over all pairs of GC columns
+k <- 1
+logger(paste("Comparing all pairs of ", length(gc_columns), " GC-columns..", sep=""))
+
+for (i in 1:(length(gc_columns)-1)) {
+	logger(paste("\tCalculating model for ", gc_columns[i], "..", sep=""))
+	breaks <- 0
+	for (j in (i+1):length(gc_columns)) {
+		col1 = ridb[which(ridb['Column.name'][,1] == gc_columns[i]),]
+		col2 = ridb[which(ridb['Column.name'][,1] == gc_columns[j]),]
+		
+		# Find CAS numbers for which both columns have data (intersect)
+		cas_intersect = intersect(col1[['CAS']], col2[['CAS']])
+		
+		# Skip if number of shared CAS entries is < cutoff
+		if (length(cas_intersect) < min_residuals) {
+			breaks = breaks + 1
+			next
+		}
+		# Gather Retention Indices
+		col1_data = col1[['RI']][match(cas_intersect, col1[['CAS']])]
+		col2_data = col2[['RI']][match(cas_intersect, col2[['CAS']])]
+		
+		# Calculate the range within which regression is possible (and move if 'range_mod' != 0)
+		range = c(min(c(min(col1_data), min(col2_data))), max(c(max(col1_data), max(col2_data))))
+		if (range_mod != 0) {
+			# Calculate percentage and add/subtract from range
+			perc = diff(range) / 100
+			perc_cutoff = range_mod * perc
+			range = as.integer(range + c(perc_cutoff, -perc_cutoff))
+		}
+		
+		# Calculate model for column1 vs column2 and plot if requested
+		fit = get_fit(col1_data, col2_data, method)
+		m[k,] = add_fit(fit, i, j, range, method)
+		
+		if (plot == 'true')
+			plot_fit(col1_data, col2_data, i, j, coefficients(fit), range, method)
+		
+		# Calculate model for column2 vs column1 and plot if requested
+		fit = get_fit(col2_data, col1_data, method)
+		m[k + 1,] = add_fit(fit, j, i, range, method)
+		
+		if (plot == 'true')
+			plot_fit(col2_data, col1_data, j, i, coefficients(fit), range, method)
+		
+		k = k + 2
+	}
+	logger(paste("\t\t", breaks, " comparisons have been skipped due to nr. of datapoints < cutoff", sep=""))
+}
+
+# Filter on pvalue and R-squared
+logger("Filtering on pvalue and R-squared..")
+if (method == 'poly') {
+    pval_index <- which(m[,10] < pvalue)
+    rsquared_index <- which(m[,11] > rsquared)
+} else {
+    pval_index <- which(m[,8] < pvalue)
+    rsquared_index <- which(m[,9] > rsquared)
+}
+logger(paste(nrow(m) - length(pval_index), " models discarded due to pvalue > ", pvalue, sep=""))
+
+logger(paste(nrow(m) - length(rsquared_index), " models discarded due to R-squared < ", rsquared, sep=""))
+
+# Remaining rows: models that pass both the p-value and the R-squared filter
+index = intersect(pval_index, rsquared_index)
+
+# Reduce dataset
+m = m[index,]
+sink()
+
+# Place plots in the history as a ZIP file
+if (plot == 'true') {
+    logger("Creating archive with model graphics..")
+    system("zip -9 -r models.zip *.pdf > /dev/null")
+    system(paste("cp models.zip ", plot_archive, sep=""))
+}
+
+# Save dataframe as tab separated file
+logger("All done, saving data..")
+header = c("Column1", "Column2", "Coefficient1", "Coefficient2", "Coefficient3", "Coefficient4", 
+           "LeftLimit", "RightLimit", "Residuals", "pvalue", "Rsquared")
+if (method != 'poly')
+	header = header[c(1:4, 7:11)]
+write(progress, logfile)
+write.table(m, file=out_file, sep="\t", quote=FALSE, col.names=header, row.names=FALSE)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/__init__.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,6 @@
+'''
+Module containing Galaxy tools for the GC/MS pipeline
+Created on Mar 6, 2012
+
+@author: marcelk
+'''
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_output.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,230 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
+'''
+
+import csv
+import re
+import sys
+import math
+import pprint
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def _process_data(in_csv):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = {}
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
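+
+# A minimal sketch of the return value (hypothetical input): a file with
+# header "ID\tR.T." and one row "12-1\t5.4" yields
+# {'ID': ['12-1'], 'R.T.': ['5.4']}, i.e. one list per column, keyed by header.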
+
+
+def _merge_data(rankfilter, caslookup):
+    '''
+    Merges data from both input dictionaries based on the Centrotype field. This method will
+    build up a new list containing the merged hits as the items. 
+    @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
+    @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
+    '''
+    # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lengths:
+    if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
+        raise Exception('rankfilter and caslookup files should have the same nr of rows/records ')
+    
+    merged = []
+    processed = {}
+    for compound_id_idx in xrange(len(rankfilter['ID'])):
+        compound_id = rankfilter['ID'][compound_id_idx]
+        if compound_id not in processed:
+            # keep track of processed items to not repeat them
+            processed[compound_id] = compound_id
+            # get centrotype nr
+            centrotype = compound_id.split('-')[0]
+            # Get the indices for current compound ID in both data-structures for proper matching
+            rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
+            cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
+            
+            merged_hits = []
+            # Combine hits
+            for hit in xrange(len(rindex)):
+                # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do 
+                # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
+                # corresponding value in the rankfilter or caslookup tables; i.e. 
+                # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
+                #                    represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+                rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
+                cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
+                
+                merged_hit = _add_hit(rf_record, cl_record)
+                merged_hits.append(merged_hit)
+                
+            merged.append(merged_hits)
+
+    return merged, len(rindex)
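+
+# Resulting structure: merged is a list with one entry per unique compound ID;
+# each entry is a list of flat hit records as built by _add_hit(). Note that
+# the returned hit count is taken from the last compound processed.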
+
+
+def _add_hit(rankfilter, caslookup):
+    '''
+    Combines single records from both the RankFilter- and CasLookup-tools
+    @param rankfilter: record (dictionary) of one compound in the RankFilter output
+    @param caslookup: matching record (dictionary) of one compound in the CasLookup output
+    '''
+    # The ID in the RankFilter output contains the following 5 fields:
+    rf_id = rankfilter['ID'].split('-')
+    try:
+        name, formula = _remove_formula(rankfilter['Name'])
+        hit = [rf_id[0], # Centrotype
+               rf_id[1], # cent.Factor
+               rf_id[2], # scan nr
+               rf_id[3], # R.T. (umin)
+               rf_id[4], # nr. Peaks
+               # Appending other fields
+               rankfilter['R.T.'],
+               name,
+               caslookup['FORMULA'] if not formula else formula,
+               rankfilter['Library'].strip(),
+               rankfilter['CAS'].strip(),
+               rankfilter['Forward'],
+               rankfilter['Reverse'],
+               ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2),
+               rankfilter['RIexp'],
+               caslookup['RI'],
+               rankfilter['RIsvr'],
+               # Calculate absolute differences
+               math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])),
+               math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])),
+               caslookup['Regression.Column.Name'],
+               caslookup['min'],
+               caslookup['max'],
+               caslookup['nr.duplicates'],
+               caslookup['Column.phase.type'],
+               caslookup['Column.name'],
+               rankfilter['Rank'],
+               rankfilter['%rel.err'],
+               rankfilter['Synonyms']]
+    except KeyError as error:
+        print "Problem reading in data from input file(s):\n",
+        print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n"
+        print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n"
+        raise error
+
+    return hit
+
+
+def _remove_formula(name):
+    '''
+    The RankFilter Name field often contains the Formula as well, this function removes it from the Name
+    @param name: complete name of the compound from the RankFilter output
+    '''
+    name = name.split()
+    poss_formula = name[-1]
+    match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
+    if match:
+        return ' '.join(name[:-1]), poss_formula
+    else:
+        return ' '.join(name), False
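+
+# Examples: _remove_formula('Alanine C3H7NO2') returns ('Alanine', 'C3H7NO2'),
+# while _remove_formula('Alanine') returns ('Alanine', False).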
+
+
+def _get_default_caslookup():
+    '''
+    The Cas Lookup tool might not have found all compounds in the library searched,
+    this default dict will be used to combine with the Rank Filter output
+    '''
+    return {'FORMULA': 'N/A',
+            'RI': '0.0',
+            'Regression.Column.Name': 'None',
+            'min': '0.0',
+            'max': '0.0',
+            'nr.duplicates': '0',
+            'Column.phase.type': 'N/A',
+            'Column.name': 'N/A'}
+
+
+def _save_data(data, nhits, out_csv_single, out_csv_multi):
+    '''
+    Writes tab-separated data to file
+    @param data: dictionary containing merged dataset
+    @param out_csv: output csv file
+    '''
+    header = ['Centrotype',
+              'cent.Factor',
+              'scan nr.',
+              'R.T. (umin)',
+              'nr. Peaks',
+              'R.T.',
+              'Name',
+              'FORMULA',
+              'Library',
+              'CAS',
+              'Forward',
+              'Reverse',
+              'Avg. (Forward, Reverse)',
+              'RIexp',
+              'RI',
+              'RIsvr',
+              'RIexp - RIsvr',
+              'RI - RIexp',
+              'Regression.Column.Name',
+              'min',
+              'max',
+              'nr.duplicates',
+              'Column.phase.type',
+              'Column.name',
+              'Rank',
+              '%rel.err',
+              'Synonyms']
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv_single, 'wb')
+    outfile_multi_handle = open(out_csv_multi, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+    output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(header)
+    output_multi_handle.writerow(header * nhits)
+    # Combine all hits for each centrotype into one line
+    line = []
+    for centrotype_idx in xrange(len(data)):
+        for hit in data[centrotype_idx]:
+            line.extend(hit)
+        output_multi_handle.writerow(line)
+        line = []
+
+    # Write one line for each centrotype
+    for centrotype_idx in xrange(len(data)):
+        for hit in data[centrotype_idx]:
+            output_single_handle.writerow(hit)
+
+
+def main():
+    '''
+    Combine Output main function
+    It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers".
+    NB: the caslookup_result_file will typically have fewer lines than
+    rankfilter_result_file, so the merge has to consider this as well. The final file
+    should have the same nr of lines as rankfilter_result_file.
+    '''
+    rankfilter_result_file = sys.argv[1]
+    caslookup_result_file = sys.argv[2]
+    output_single_csv = sys.argv[3]
+    output_multi_csv = sys.argv[4]
+
+    # Read RankFilter and CasLookup output files
+    rankfilter = _process_data(rankfilter_result_file)
+    caslookup = _process_data(caslookup_result_file)
+    merged, nhits = _merge_data(rankfilter, caslookup)
+    _save_data(merged, nhits, output_single_csv, output_multi_csv)
+
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_output.xml	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,35 @@
+<tool id="combine_output" name="RIQC-Combine RankFilter and CasLookup output" version="1.0.2">
+  <description>Combine output data from the RankFilter and CasLookup tools</description>
+  <command interpreter="python">
+    combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi
+  </command>
+  <inputs>
+    <param format="tabular" name="caslookup_in" type="data" label="RIQC-Lookup RI for CAS output"
+    	help="Select the output file from the CasLookup tool"/>
+    <param format="tabular" name="rankfilter_in" type="data" label="RIQC-RankFilter output" 
+    	help="Select the output file from the RankFilter tool"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} (Single) on ${on_string}" name="out_single" />
+    <data format="tabular" label="${tool.name} (Multi) on ${on_string}" name="out_multi" />
+  </outputs>
+  <help>
+Combines the output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files.
+
+The files produced contain either all hits for a compound on a single line (Single) or one hit per line 
+(Multi). 
+
+.. class:: infomark
+
+**Notes**
+   
+The input data should be produced by the RankFilter and 'Lookup RI for CAS' tools provided on this Galaxy server with the 
+original headers kept intact. Processing steps include:
+   
+   - Columns are added showing the average of the Forward/Reverse values, and the RIexp - RIsvr and RI - RIexp differences
+   - The ID column of the RankFilter tool output is split into 'Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)'
+     and 'nr. Peaks' fields.
+   - The formula is split off the 'Name' field in the RankFilter output    
+    
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/create_model.xml	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,78 @@
+<tool id="create_poly_model" name="RIQC-Create Regression Model" version="1.0.2">
+  <description>Generate coefficients to enable the regression from one GC-column
+               to another GC-column</description>
+  <command interpreter="Rscript">Rscripts/ridb-regression.R 
+               $ridb
+               $out_model
+               $out_log
+               $min_residuals
+               $range_mod
+               $pvalue
+               $rsquared
+               $method
+               $plot
+               #if $plot
+                   $model_graphics
+               #end if
+  </command>
+  <inputs>
+    <param format="tabular" name="ridb" type="select" label="Retention Index (RI) and GC columns Library file"
+           help="Select the RI library file of which all GC columns and their RI values
+                 will be used to create a model" 
+      		 dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/>                 
+                 
+    <param name="method" type="select" label="Select regression method"
+           help="Method to use for calculating the model" >
+           <option value="poly" selected="True">Polynomial (3rd degree)</option>
+           <option value="linear">Linear</option>
+    </param>
+    <param name="min_residuals" type="integer" value="10" optional="False"
+           label="Minimum number of residuals" help="The minimum number of residuals
+                 (datapoints) that both columns should have in common when calculating
+                 the model" />
+    <param name="range_mod" type="integer" value="0" optional="False"
+           label="Range modifier" help="Moves the range of the usable RI space by the
+                  given percentage. Set to 0 to use the full range of available data." />
+    <param name="pvalue" type="float" value="0.05" optional="False" min="0" max="1"
+           label="Pvalue to filter on" help="Set the upper limit for the p-value (calculated
+                  by performing an ANOVA analysis on the created model). All models with higher
+                  p-values are discarded." />
+    <param name="rsquared" type="float" value="0.95" optional="False" min="0" max="1"
+           label="R-squared to filter on" help="Set the lower limit for the R-squared,
+                  all models with lower values are discarded." />
+    <param name="plot" type="boolean" label="Create a separate plot for each model"
+           help="This will create a ZIP file in the history containing PDF plots" />
+  </inputs>
+  <code file="match_library.py" />
+  <outputs>
+  	<data format="zip" label="Model Graphics of ${on_string}" name="model_graphics" >
+  	    <filter>(plot)</filter>
+  	</data>
+    <data format="tabular" label="Regression logfile of ${on_string}"  name="out_log" />
+    <data format="tabular" label="Regression model of ${on_string}"  name="out_model" />
+  </outputs> 
+  <help>
+Calculates regression models for all pairwise combinations of the GC columns contained in the
+selected RI database file. The model is based on either a 3rd-degree polynomial or a standard
+linear fit.
+
+The *Minimum number of residuals* option only allows regression if the two columns it is based
+on share at least that number of datapoints (compounds measured on both columns). 
+
+Filtering is possible by setting an upper limit for the *p-value* and / or a lower limit for
+the *R squared* value. The produced logfile will state how many models have been discarded due
+to this filtering. The output model file also includes the p-value and R squared value for
+each created model.
+
+Graphical output of the models is available by selecting the plot option which shows the
+data points used for the model as well as the fit itself and the range of data that will
+be usable. 
+
+.. class:: infomark
+
+**Notes**
+
+The output file produced by this tool is required as input for the CasLookup tool when
+selecting to apply regression when finding hits in the RIDB.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<datatypes>
+  <datatype_files>
+  </datatype_files>
+  <registration display_path="display_applications">
+        <!-- type for the pdf -->
+        <datatype extension="pdf"  type="galaxy.datatypes.data:Data" mimetype="application/octet-stream" 
+        display_in_upload="true" subclass="true"/>
+        <datatype extension="msclust.csv" type="galaxy.datatypes.tabular:Tabular" mimetype="text/csv" display_in_upload="true" subclass="true">
+        </datatype>   
+  </registration>
+</datatypes>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the MetExp database.
+
+RankFilter and CasLookup output are already combined by combine_output.py, so
+that result is used here. The MsClust spectra file (.MSP) and one of the
+MsClust quantification files are then combined with the combine_output.py
+result as well.
+
+Extra calculations performed:
+- The column MW is added here, derived from the column FORMULA found in the
+  combine_output.py result.
+
+In total, three files are merged and one new column is calculated.
+'''
+
+import csv
+import sys
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2013, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_data(in_csv, delim='\t'):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim))
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+ONE_TO_ONE = 'one_to_one'
+N_TO_ONE = 'n_to_one'
+
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE):
+    '''
+    Merges data from both input dictionaries based on the link fields. This method will
+    build up a new list containing the merged hits as the items. 
+    @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
+    @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
+    '''
+    # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values):
+    #
+    # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
+    #    raise Exception('input files should have the same nr of key values  ')
+    
+    
+    merged = []
+    processed = {}
+    for link_field_set1_idx in xrange(len(set1[link_field_set1])):
+        link_field_set1_value = set1[link_field_set1][link_field_set1_idx]
+        if link_field_set1_value not in processed:
+            # keep track of processed items to not repeat them
+            processed[link_field_set1_value] = link_field_set1_value
+
+            # Get the indices for current link_field_set1_value in both data-structures for proper matching
+            set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
+            set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)]
+
+            merged_hits = []
+            # Combine hits
+            for hit in xrange(len(set1index)):
+                # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do 
+                # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
+                # corresponding value in the rankfilter or caslookup tables; i.e. 
+                # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
+                #                    represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+                # It just ensures the entry is made available as a plain named array for easy access.
+                rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
+                if relation_type == ONE_TO_ONE :
+                    cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()]))
+                else:
+                    # is N to 1:
+                    cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
+                
+                merged_hit = merge_function(rf_record, cl_record)
+                merged_hits.append(merged_hit)
+                
+            merged.append(merged_hits)
+
+    return merged, len(set1index)
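+
+# As in combine_output.py, merged holds one list of merged hits per unique
+# link-field value; with N_TO_ONE every set1 hit is paired with the first
+# matching set2 record (set2index[0]).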
+
+
+def _compare_records(key1, key2):
+    '''
+    In this case the compare method is trivial, as both keys are expected to
+    contain the same value when the records match.
+    '''
+    return key1 == key2
+    
+    
+    
+def _merge_records(rank_caslookup_combi, msclust_quant_record):
+    '''
+    Combines single records from both the RankFilter+CasLookup combi file and from MsClust file
+    
+    @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
+    @param msclust_quant_record: msclust quantification + spectrum record
+    '''
+    record = []
+    for column in rank_caslookup_combi:
+        record.append(rank_caslookup_combi[column])
+
+    for column in msclust_quant_record:
+        record.append(msclust_quant_record[column])
+
+    return record
+
+
+
+
+def _save_data(data, headers, nhits, out_csv):
+    '''
+    Writes tab-separated data to file
+    @param data: dictionary containing merged dataset
+    @param out_csv: output csv file
+    '''
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(headers)
+
+    # Write one line for each centrotype
+    for centrotype_idx in xrange(len(data)):
+        for hit in data[centrotype_idx]:
+            output_single_handle.writerow(hit)
+
+
+def main():
+    '''
+    Combine Output main function
+    
+    RankFilter and CasLookup output are already combined by combine_output.py, so
+    that result is used here. The MsClust spectra file (.MSP) and one of the
+    MsClust quantification files are then combined with that result as well.
+    '''
+    rankfilter_and_caslookup_combined_file = sys.argv[1]
+    msclust_quantification_and_spectra_file = sys.argv[2]
+    output_csv = sys.argv[3]
+
+    # Read RankFilter and CasLookup output files
+    rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
+    msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
+    
+    merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', 
+                                msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE)
+    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys()
+    _save_data(merged, headers, nhits, output_csv)
+
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/library_lookup.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,327 @@
+'''
+Logic for searching a Retention Index database file given output from NIST
+'''
+import match_library
+import re
+import sys
+import csv
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def create_lookup_table(library_file, column_type_name, statphase):
+    '''
+    Creates a dictionary holding the contents of the library to be searched
+    @param library_file: library to read
+    @param column_type_name: the columns type name
+    @param statphase: the columns stationary phase
+    '''
+    (data, header) = match_library.read_library(library_file)
+    # Test for presence of required columns
+    if ('columntype' not in header or
+        'columnphasetype' not in header or
+        'cas' not in header):
+        raise IOError('Missing columns in ', library_file)
+
+    column_type_column = header.index("columntype")
+    statphase_column = header.index("columnphasetype")
+    cas_column = header.index("cas")
+
+    filtered_library = [line for line in data if line[column_type_column] == column_type_name
+                        and line[statphase_column] == statphase]
+    lookup_dict = {}
+    for element in filtered_library:
+        # Here the cas_number is set to the numeric part of the cas_column value, so if the 
+        # cas_column value is 'C1433' then cas_number will be '1433'
+        cas_number = str(re.findall(r'\d+', (element[cas_column]).strip())[0])
+        try:
+            lookup_dict[cas_number].append(element)
+        except KeyError:
+            lookup_dict[cas_number] = [element]
+    return lookup_dict
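+
+# Sketch of the resulting structure (following the comment above): for library
+# rows whose cas column reads 'C1433', lookup_dict maps '1433' to the list of
+# all library entries sharing that CAS number.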
+
+
+def _preferred(hits, pref, ctype, polar, model, method):
+    '''
+    Returns all entries in the lookup_dict that have the same column name, type and polarity
+    as given by the user, uses regression if selected given the model and method to use. The
+    regression is applied on the column with the best R-squared value in the model
+    @param hits: all entries in the lookup_dict for the given CAS number
+    @param pref: preferred GC-column, can be one or more names
+    @param ctype: column type (capillary etc.)
+    @param polar: polarity (polar / non-polar etc.)
+    @param model: data loaded from file containing regression models
+    @param method: supported regression method (i.e. poly(nomial) or linear)
+    '''
+    match = []
+    for column in pref:
+        for hit in hits:
+            if hit[4] == ctype and hit[5] == polar and hit[6] == column:
+                # Create copy of found hit since it will be altered downstream
+                match.extend(hit)
+                return match, False
+
+    # No hit found for current CAS number, return if not performing regression
+    if not model:
+        return False, False
+
+    # Perform regression
+    for column in pref:
+        if column not in model:
+            break
+        # Order regression candidates by R-squared value (last element)
+        order = sorted(model[column].items(), key=lambda col: col[1][-1])
+        # Create list of regression candidate column names
+        regress_columns = list(reversed([column for (column, _) in order]))
+        # Names of available columns
+        available = [hit[6] for hit in hits]
+        
+        # TODO: combine Rsquared and number of datapoints to get the best regression match
+        '''
+        # Iterate regression columns (in order) and retrieve their models
+        models = {}
+        for col in regress_columns:
+            if col in available:
+                hit = list(hits[available.index(col)])
+                if hit[4] == ctype:
+                    # models contains all model data including residuals [-2] and rsquared [-1]
+                    models[pref[0]] = model[pref[0]][hit[6]] 
+        # Get the combined maximum for residuals and rsquared
+        best_match = models[]
+        # Apply regression
+        if method == 'poly':
+            regressed = _apply_poly_regression(best_match, hit[6], float(hit[3]), model)
+            if regressed:
+                hit[3] = regressed
+            else:
+                return False, False
+            else:
+                hit[3] = _apply_linear_regression(best_match, hit[6], float(hit[3]), model)
+                match.extend(hit)
+            return match, hit[6]
+        '''
+        
+        for col in regress_columns:
+            if col in available:
+                hit = list(hits[available.index(col)])
+                if hit[4] == ctype:
+                    # Perform regression using a column for which regression is possible
+                    if method == 'poly':
+                        # Polynomial is only possible within a set border, if the RI falls outside
+                        # of this border, skip this lookup
+                        regressed = _apply_poly_regression(pref[0], hit[6], float(hit[3]), model)
+                        if regressed:
+                            hit[3] = regressed
+                        else:
+                            return False, False
+                    else:
+                        hit[3] = _apply_linear_regression(pref[0], hit[6], float(hit[3]), model)
+                    match.extend(hit)
+                    return match, hit[6]
+
+    return False, False
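+
+# Return contract of _preferred(): (hit_fields, False) for a direct column
+# match, (hit_fields, regression_column_name) when regression was applied,
+# and (False, False) when no (regressed) RI could be found.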
+
+
+
+def default_hit(row, cas_nr, compound_id):
+    '''
+    This method will return a "default"/empty hit for cases where the
+    method _preferred() returns False (i.e. an RI could not be found
+    for the given CAS nr, not even via regression).
+    '''
+    return [
+            #'CAS', 
+            'C' + cas_nr,
+            #'NAME', 
+            '',
+            #'FORMULA', 
+            '',
+            #'RI', 
+            '0.0',
+            #'Column.type', 
+            '',
+            #'Column.phase.type', 
+            '',
+            #'Column.name', 
+            '',
+            #'phase.coding', 
+            ' ',
+            #'CAS_column.Name', 
+            '',
+            #'Centrotype', -> NOTE THAT compound_id is not ALWAYS centrotype...depends on MsClust algorithm used...for now only one MsClust algorithm is used so it is not an issue, but this should be updated/corrected once that changes
+            compound_id,
+            #'Regression.Column.Name', 
+            '',
+            #'min', 
+            '',
+            #'max', 
+            '',
+            #'nr.duplicates', 
+            '']
+    
+
+def format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method):
+    '''
+    Looks up the compounds in the library lookup table and formats the results
+    @param lookup_dict: dictionary containing the library to be searched
+    @param nist_tabular_filename: NIST output file to be matched
+    @param pref: (list of) column-name(s) to look for
+    @param ctype: column type of interest
+    @param polar: polarity of the used column
+    @param model: data loaded from file containing regression models
+    @param method: supported regression method (i.e. poly(nomial) or linear)
+    '''
+    (nist_tabular_list, header_clean) = match_library.read_library(nist_tabular_filename)
+    # Retrieve indices of the CAS and compound_id columns (exit if not present)
+    try:
+        casi = header_clean.index("cas")
+        idi = header_clean.index("id")
+    except ValueError:
+        raise IOError("'cas' or 'id' column not found in header of library file")
+
+    data = []
+    for row in nist_tabular_list:
+        casf = str(row[casi].replace('-', '').strip())
+        compound_id = str(row[idi].split('-')[0])
+        if casf in lookup_dict:
+            found_hit, regress = _preferred(lookup_dict[casf], pref, ctype, polar, model, method)
+            if found_hit:
+                # Keep cas nr as 'C'+ numeric part:
+                found_hit[0] = 'C' + casf
+                # Add compound id
+                found_hit.insert(9, compound_id)
+                # Add information on regression process
+                found_hit.insert(10, regress if regress else 'None')
+                # Replace the comma-separated duplicates list in the last column by a count
+                dups = len(found_hit[-1].split(','))
+                if dups > 1:
+                    found_hit[-1] = str(dups + 1)
+                else:
+                    found_hit[-1] = '0'
+                data.append(found_hit)
+            else:
+                data.append(default_hit(row, casf, compound_id))
+        else:
+            data.append(default_hit(row, casf, compound_id))
+
+    return data
+
+
+def _save_data(content, outfile):
+    '''
+    Write to output file
+    @param content: content to write
+    @param outfile: file to write to
+    '''
+    # header
+    header = ['CAS',
+              'NAME',
+              'FORMULA',
+              'RI',
+              'Column.type',
+              'Column.phase.type',
+              'Column.name',
+              'phase.coding',
+              'CAS_column.Name',
+              'Centrotype',
+              'Regression.Column.Name',
+              'min',
+              'max',
+              'nr.duplicates']
+    output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t")
+    output_handle.writerow(header)
+    for entry in content:
+        output_handle.writerow(entry)
+
+
+def _read_model(model_file):
+    '''
+    Creates an easy-to-search dictionary holding the regression parameters
+    for each valid combination of GC-columns
+    @param model_file: filename containing the regression models
+    '''
+    regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t'))
+    if len(regress.pop(0)) > 9:
+        method = 'poly'
+    else:
+        method = 'linear'
+
+    model = {}
+    # Create new dictionary for each GC-column
+    for line in regress:
+        model[line[0]] = {}
+
+    # Add data
+    for line in regress:
+        if method == 'poly':
+            model[line[0]][line[1]] = [float(col) for col in line[2:11]]
+        else:  # linear
+            model[line[0]][line[1]] = [float(col) for col in line[2:9]]
+
+    return model, method
+
+
+def _apply_poly_regression(column1, column2, retention_index, model):
+    '''
+    Calculates a new retention index (RI) value using a given 3rd-degree polynomial
+    model based on data from GC columns 1 and 2
+    @param column1: name of the selected GC-column
+    @param column2: name of the GC-column to use for regression
+    @param retention_index: RI to convert
+    @param model: dictionary containing model information for all GC-columns
+    '''
+    coeff = model[column1][column2]
+    # If the retention index to convert is within range of the data the model is based on, perform regression
+    if coeff[4] < retention_index < coeff[5]:
+        return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) + 
+                (retention_index * coeff[1]) + coeff[0])
+    else:
+        return False
+
+
+def _apply_linear_regression(column1, column2, retention_index, model):
+    '''
+    Calculates a new retention index (RI) value using a given linear model based on data
+    from GC columns 1 and 2
+    @param column1: name of the selected GC-column
+    @param column2: name of the GC-column to use for regression
+    @param retention_index: RI to convert
+    @param model: dictionary containing model information for all GC-columns
+    '''
+    # TODO: unlike the polynomial model, no validity range limits are applied here
+    coeff = model[column1][column2]
+    return coeff[1] * retention_index + coeff[0]
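+
+
+def _example_apply_regression():
+    '''
+    Illustrative sketch only (not called by the tool; the column names and
+    coefficients below are made up). Shows how the nested model dict is indexed
+    and how the polynomial variant checks its validity range: coefficients 0..3
+    are the polynomial terms (constant..cubic) and coefficients 4 and 5 are the
+    min/max RI the model was fitted on.
+    '''
+    model = {'DB-5': {'HP-5': [10.0, 1.02, 1e-05, -1e-09, 800.0, 3000.0]}}
+    inside = _apply_poly_regression('DB-5', 'HP-5', 1500.0, model)  # within 800..3000, returns a float
+    outside = _apply_poly_regression('DB-5', 'HP-5', 500.0, model)  # outside the range, returns False
+    return inside, outside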
+
+
+def main():
+    '''
+    Library Lookup main function
+    '''
+    library_file = sys.argv[1]
+    nist_tabular_filename = sys.argv[2]
+    ctype = sys.argv[3]
+    polar = sys.argv[4]
+    outfile = sys.argv[5]
+    pref = sys.argv[6:-1]
+    regress = sys.argv[-1]
+
+    if regress != 'False':
+        model, method = _read_model(regress)
+    else:
+        model, method = False, None
+
+    lookup_dict = create_lookup_table(library_file, ctype, polar)
+    data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method)
+
+    _save_data(data, outfile)
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/library_lookup.xml	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,68 @@
+<tool id="lookup_library" name="RIQC-Lookup RI for CAS numbers in library" version="1.0.2">
+  <description>Look up or estimate the RI using a library of known RI values per CAS number</description>
+  <command interpreter="python">
+    library_lookup.py 
+    $library_file
+    $input 
+    "$col_type" 
+    "$polarity" 
+    $output
+    #for $ctype in $pref
+      ${ctype.columntype}
+    #end for
+    $regression.model	
+  </command>
+  <inputs>
+    <page>
+      <param format="tabular" name="input" type="data" label="NIST identifications as tabular file" 
+      		 help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />
+      <param name="library_file" type="select" label="CAS x RI Library file" 
+      		 help="Select a library/lookup file containing RI values for CAS numbers on various chromatography columns " 
+      		 dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/>
+      <param name="col_type" type="select" label="Select column type" refresh_on_change="true"
+	         display="radio" dynamic_options='get_column_type(library_file)'
+	         help="" />
+    </page>
+    <page>
+      <param name="polarity" type="select" label="Select polarity" refresh_on_change="true"
+             display="radio" dynamic_options='filter_column(library_file,col_type)'
+	         help="" />
+    </page>
+    <page>
+	  <conditional name="regression">
+		  <param name="regression_select" type="boolean" checked="false" label="Apply regression method" 
+		  		 help="If no data for the selected column is present in the database, selecting this option will try 
+		  		 	   to convert Retention Indices using data from other GC-columns with a regression method. Please
+		  		 	   note that only the first given GC-column above will be used for this, any alternatives will be
+		  		 	   ignored" />
+		  <when value="true">
+		  	<param name="model" format="tabular" type="data" label="Tabular file containing regression model" 
+		  		   help="This file contains the coefficients used to perform the regression from one GC-column
+		  		         to another GC-column"/>     
+		  </when>
+          <when value="false">
+            <param name="model" type="hidden" value="False" />
+          </when>
+	  </conditional>
+      <repeat name="pref" title="Select column name preference">
+		<param name="columntype" type="select" label="Column name" refresh_on_change="true"
+               dynamic_options='filter_column2(library_file, col_type, polarity)'
+	       	   help="Select one or more column names for filtering. The order defines the priority." />
+      </repeat>
+    </page>
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} on ${on_string}" name="output" />
+  </outputs>
+  <code file="match_library.py" />
+  <help>
+Performs a lookup of the RI values by matching CAS numbers from the given NIST identifications file to a library.
+If a direct match is NOT found for the preferred column name, a regression can be done to find
+the theoretical RI value based on known RI values for the CAS number on other column types (see step 4).
+If there is no match for the CAS number on any column type, then the record is not given an RI. 
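+
+A minimal sketch of the underlying command line (the file names and column values below are
+illustrative only; the Galaxy wrapper above fills these in automatically)::
+
+    python library_lookup.py RI_DB_library.txt nist_ids.tabular "Capillary" \
+        "Semi-standard non-polar" output.tabular "DB-5" regression_models.tabular
+
+The last argument is either a tabular file with regression models, or the literal string
+False when no regression should be applied.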
+
+
+
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/match_library.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,120 @@
+'''
+The functions in this module are called from Galaxy to populate lists/checkboxes with selectable items
+'''
+import csv
+import glob
+import os
+
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def get_column_type(library_file):
+    '''
+    Returns a Galaxy formatted list of tuples containing all possibilities for the
+    GC-column types. Used by the library_lookup.xml tool
+    @param library_file: given library file from which the list of GC-column types is extracted
+    '''
+    (data, header) = read_library(library_file)
+
+    if 'columntype' not in header:
+        raise IOError('Missing columns in ' + library_file)
+
+    # Filter data on column type
+    column_type = header.index("columntype")
+    amounts_in_list_dict = count_occurrence([row[column_type] for row in data])
+    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+    return(galaxy_output)
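+
+# Illustrative sketch of the Galaxy option tuples produced above (the values are
+# made up): each entry is (label, value, selected), where the label combines the
+# column type with its occurrence count, e.g.:
+#
+#     [("Capillary(120)", "Capillary", False), ("Packed(3)", "Packed", False)]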
+
+
+def filter_column(library_file, column_type_name):
+    '''
+    Filters the Retention Index database on column type
+    @param library_file: file containing the database
+    @param column_type_name: column type to filter on
+    '''
+    (data, header) = read_library(library_file)
+
+    if ('columntype' not in header or
+        'columnphasetype' not in header):
+        raise IOError('Missing columns in ' + library_file)
+
+    column_type = header.index("columntype")
+    statphase = header.index("columnphasetype")
+
+    # Filter data on column type name
+    statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name]
+    amounts_in_list_dict = count_occurrence(statphase_list)
+    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+    return(sorted(galaxy_output))
+
+
+def filter_column2(library_file, column_type_name, statphase):
+    '''
+    Filters the Retention Index database on column type
+    @param library_file: file containing the database
+    @param column_type_name: column type to filter on
+    @param statphase: stationary phase of the column to filter on
+    '''
+    (data, header) = read_library(library_file)
+
+    if ('columntype' not in header or
+        'columnphasetype' not in header or
+        'columnname' not in header):
+        raise IOError('Missing columns in ' + library_file)
+
+    column_type_column = header.index("columntype")
+    statphase_column = header.index("columnphasetype")
+    column_name_column = header.index("columnname")
+
+    # Filter data on given column type name and stationary phase
+    statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and
+                      line[statphase_column] == statphase]
+    amounts_in_list_dict = count_occurrence(statphase_list)
+    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+    return(sorted(galaxy_output))
+
+
+def read_library(filename):
+    '''
+    Reads a tab-separated library file and returns its contents and a normalized header
+    @param filename: file to read
+    '''
+    data = list(csv.reader(open(filename, 'rU'), delimiter='\t'))
+    header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in data.pop(0)]
+    return(data, header_clean)
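+
+# Note (illustrative): the header normalisation above means that e.g. a column
+# called 'Column.phase.type' comes back as 'columnphasetype', which is the form
+# the index lookups elsewhere in this module rely on.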
+
+
+
+def get_directory_files(dir_name):
+    '''
+    Reads the given directory and returns the list of .txt files found as
+    Galaxy-formatted (name, full path, selected) tuples, so that it can
+    fill a Galaxy drop-down combo box.
+    
+    '''
+    files = glob.glob(dir_name + "/*.txt")
+    if len(files) == 0:
+        raise Exception("Configuration error: no library files found in <galaxy-home-dir>/" + dir_name)
+    else:
+        galaxy_output = [(str(get_file_name_no_ext(file_name)), str(os.path.abspath(file_name)), False) for file_name in files]
+    return(galaxy_output)
+    
+def get_file_name_no_ext(full_name):
+    '''
+    Returns just the file name, without directory path and extension
+    '''
+    simple_name = os.path.basename(full_name)
+    base, ext = os.path.splitext(simple_name)
+    return base
+    
+
+def count_occurrence(data_list):
+    '''
+    Counts occurrences in a list and returns a dict with item:occurrence
+    @param data_list: list to count items from
+    '''
+    return dict((key, data_list.count(key)) for key in set(data_list))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/msclust2.0.1.xml	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,289 @@
+<tool name="MsClust" id="msclust2" version="2.0.1">
+	<description>Extracts fragmentation spectra from aligned data</description>
+	<!-- 
+	   For remote debugging start your listener on port 8000 and use the following as command interpreter:
+	       java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 
+	       
+	       TODO in command below: add conditionals according to options of using or NOT the tolerances/thresholds from previous steps 
+	    -->
+	<command interpreter="java -jar ">
+	    MsClust.jar 
+	   	-peaksFileName $inputPeaks 
+	   	-dataType $dataType
+        -imputationMethod $imputationMethod.type
+        #if $imputationMethod.type == "valueRange"
+        	-rangeUpperLimit $imputationMethod.rangeUpperLimit
+        #end if
+		-plInputFormat "metalign" 
+		-potDensFuncType $potDensFuncType.type 
+		-centerSelectionType $centerSelectionType.type 
+		-clusteringType $clusteringType.type 
+		-neighborhoodWindowSize $potDensFuncType.pdf_neighborhoodWindowSize 
+		-clusterSearchStopCriterium $centerSelectionType.cs_stop_criterion
+		-pearsonDistTreshold $potDensFuncType.pdf_pears_treshold
+		-pearsonTresholdConfidence $potDensFuncType.pdf_pears_conf
+		-pearsonPDReductionThreshold $centerSelectionType.cs_pears_pd_reductionTreshold
+		-pearsonPDReductionSlope $centerSelectionType.cs_pears_pd_reductionSlope
+		-scanDistTol $potDensFuncType.pdf_scan_toler 
+		-scanDistanceConfidence $potDensFuncType.pdf_scan_conf  
+		-centrotypesOut $centrotypesOut 
+		-simOut $simOut
+		-micOut $micOut
+		-mspOut $mspOut 
+		-classOut $classOut
+		-outReport $htmlReportFile
+	    -outReportPicturesPath $htmlReportFile.files_path
+        #if $clusteringType.type == "fuzzyCMeans"
+        	-fcmMembershipWeightingExponent $clusteringType.fcmMembershipWeightingExponent 
+			-fcmStopCriterion $clusteringType.fcmStopCriterion
+			-fcmCorrelationWeight $clusteringType.fcmCorrelationWeight
+			-fcmFinalAssemblyType $clusteringType.finalClusterAssembly.type
+			#if $clusteringType.finalClusterAssembly.type == "membershipBased"
+				-fcmMembershipCutoff $clusteringType.finalClusterAssembly.fcmMembershipCutoff
+			#end if
+        #end if
+		-verbose "false"
+	    #if $advancedSettings.settings == True
+	    	-advancedSettings YES
+	    	-saturationLimit $advancedSettings.saturationLimit
+	    	-sampleSelectionSortType $advancedSettings.sampleSelectionSortType
+	    	-simSelectionAlgorithm $advancedSettings.simSelectionAlgorithm
+	    	-simMassFilter "$advancedSettings.simMassFilter"
+	    	-simMembershipThreshold $advancedSettings.simMembershipThreshold
+	    	-simSaturationThreshold $advancedSettings.simSaturationThreshold
+	    	-simAbsenseThreshold $advancedSettings.simAbsenseThreshold
+	    	-micMembershipThreshold $advancedSettings.micMembershipThreshold
+	    	-peakIntensityCorrectionAlgorithm $advancedSettings.peakIntensityCorrectionAlgorithm
+        #else
+        	-advancedSettings YES
+        	-sampleSelectionSortType SIM_INTENSITY
+        	-peakIntensityCorrectionAlgorithm CORRELATION_BASED
+        #end if
+	    
+	</command>
+	<inputs>
+	<!--  <param name="rankingWeightConfig" type="text" area="true" size="11x70" label="NB - TEST VERSION" 
+value="VERSION BEING TESTED AT THIS MOMENT...NOT READY FOR USE..."/>
+	-->
+	 	<param name="inputPeaks" type="data" format="txt" label="Ion-wise aligned data (e.g. MetAlign output data)" />
+		<param name="dataType" type="select" size="30" label="Data type">
+				<option value="gcms"  selected="true">GC-MS</option>
+				<option value="lcms">LC-MS</option>
+			</param>
+	 	<conditional name="imputationMethod">
+			<param name="type" type="select" size="30" label="Select the approach used for imputing missing values (optional)" help="select how you generated the values to fill in the data gaps">
+				<option value="none" >none</option>
+				<option value="metot" selected="true">MeTot</option>
+				<option value="valueRange">Values range</option>
+			</param>
+			<when value="valueRange">
+				<param name="rangeUpperLimit" type="integer" size="10" value="0" label="Range upper limit" help="values up to this limit will be considered 'generated' values"  />
+			</when>
+		</conditional>		    
+	 	<conditional name="potDensFuncType">
+		    <param name="type" type="select" size="30" label="Select PD function type =====================================================">
+		      <option value="original" selected="true">Original</option>
+		    </param>
+		    <when value="original">
+		      <param name="pdf_neighborhoodWindowSize" type="integer" size="10" value="200" label="Effective Peaks"  />
+		      <param name="pdf_scan_toler" type="float" size="10" value="10" label="Peak Width, in scans"  />
+		      <param name="pdf_scan_conf" type="float" size="10" value="80" label="Peak Width confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain]" />
+		      <param name="pdf_pears_treshold" type="float" size="10" value="0.8" label="Correlation threshold (0.0 - 1.0)" />
+		      <param name="pdf_pears_conf" type="float" size="10" value="98.0" label="Correlation threshold confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain]" />
+		    </when>
+		</conditional>
+		<conditional name="centerSelectionType">
+		    <param name="type" type="select" label="Initial Centers selection type ==================================================" >
+		      <option value="original" selected="true">Original - Subtractive potential reductions with stop criterion and REUSE tolerances (from PD function)</option>
+		    </param>
+		    <when value="original">
+		      <param name="cs_pears_pd_reductionTreshold" type="float" size="10" value="0.8" label="Potential Density reduction (0.0 - 1.0)"  />
+		      <param name="cs_pears_pd_reductionSlope" type="float" size="10" value="0.01" label="Potential Density reduction softness "  />
+		      <param name="cs_stop_criterion" type="float" size="10" value="2" label="Stop Criterion "  />
+			</when>
+		</conditional>
+		<conditional name="clusteringType">
+		    <param name="type" type="select" label="Classify using ===========================================================">
+		      <option value="original" selected="true">Original - Fuzzy clustering, keep original centers and REUSE (scan distance) tolerances</option>
+		      <option value="fuzzyCMeans">(experimental) Fuzzy C-Means - Fuzzy clustering, optimize centers</option>
+		    </param>
+		    <when value="original">
+		    <!-- nothing -->
+			</when>
+		    <when value="originalNewTol">
+		      <param name="clust_scan_toler" type="float" size="10" value="10" label="Peak Width, in scans"  />
+		      <param name="clust_scan_slope" type="float" size="10" value="2" label="Peak Width margin softness"  />
+		    </when>
+		    <when value="fuzzyCMeans">
+				<param name="fcmMembershipWeightingExponent" type="float" size="10" value="2.0" label="Membership Weighting Exponent" help="Influences cluster center repositioning in the iterations 1.1 (exploratory) to around 3.0 (conservative)" />
+				<param name="fcmStopCriterion" type="float" size="10" value="0.05" label="Stop Criterion" help="When convergence is 'reached' (e.g. 0.05 means memberships only changed with 5% in last iteration)" />
+				<param name="fcmCorrelationWeight" type="float" size="10" value="2" label="Correlation weight factor" help="Increase this if you think the correlation is reliable (e.g. you have a high number of samples)" />
+				<conditional name="finalClusterAssembly">
+					<param name="type" type="select" label="Final cluster assembly" >
+				      <option value="original" selected="true">Original - distance based</option>
+				      <option value="membershipBased">Membership based</option>
+				    </param>
+					<when value="membershipBased">
+						<param name="fcmMembershipCutoff" type="select" label="Maximum allowed peak overlap" >
+							<option value="0.05" >~7 clusters</option>
+							<option value="0.10" >~5 clusters</option>
+							<option value="0.20" >~3 clusters</option>
+						</param>
+					</when>
+					<when value="original">
+					    <!-- nothing -->
+					</when>
+				</conditional>
+		    </when>
+		</conditional>
+		
+		<param name="summaryReport" type="boolean" checked="true" label="Generate summary report" help="NB: this will increase the processing time (in some cases up to a few extra minutes)"/>
+     	
+        <conditional name="advancedSettings">
+     		<param name="settings" type="boolean" truevalue="Yes" falsevalue="No" checked="false" label="Advanced settings ========================================================"/>
+     		<when value="Yes">
+     			<param name="saturationLimit" optional="true" type="integer" size="10" label="Saturation limit (optional)" help="fill in if you have saturation problems in your data"  />
+	 			<param name="sampleSelectionSortType"  type="select" label="Sample selection scheme for spectrum peak intensity correction algorithm (optional/experimental)" help="The intensity values to use to select the samples for each cluster/metabolite in which it is most intense/abundant. These samples are used in the peak intensity correction (see parameter below). Use this option to try to avoid samples that have insufficient signal or saturation."  >
+     				<option value="None">None</option>
+     				<!-- in order of best FORWARD scoring when tested on /test/data/report_test_sets/(P2) Relative peak heights in spectra/Input (Test set 1) -->
+     				<option value="SIM_INTENSITY" selected="true">SIM intensities</option>
+		    		<option value="MAX_INTENSITY">Maximum intensities</option>
+     				<option value="CENTROTYPE_INTENSITY">Centrotype peak intensities</option>
+		    		<option value="MIC_INTENSITY">MIC intensities</option>		    		
+     			</param>
+     			<param name="peakIntensityCorrectionAlgorithm"  type="select" label="Spectrum peak intensity correction algorithm (optional/experimental)" help="Whether spectrum peak heights should be adjusted according to their membership to the cluster or to their correlation to the cluster's centrotype ion"  >
+     				<option value="MEMBERSHIP_BASED">Membership based (msclust 1.0 mode)</option>
+		    		<option value="CORRELATION_BASED" selected="true">Correlation based</option>
+     			</param>     			
+     			<param name="simSelectionAlgorithm" type="select" label="SIM selection algorithm (experimental)" help="Set this if you want to deviate from the standard which is: allow shared SIM peaks for GC-MS data, and force unique SIM peaks for LC-MS data">
+     				<option value="" selected="true"></option>
+     				<option value="uniqueSIM">Unique SIM peak</option>
+		    		<option value="sharedSIM">Shared SIM peak</option>
+     			</param>
+     			<param name="simMassFilter" type="text" optional="true" size="30" label="SIM mass exclusion list" help="Comma-separated list of masses NOT to use as SIM peaks. E.g. '73,147,...' " />
+     			<param name="simMembershipThreshold" optional="true" type="float" size="10" label="SIM membership threshold" help="Minimum membership a peak should have to qualify as a SIM candidate. E.g. 0.8 " />
+     			<param name="simSaturationThreshold" optional="true" type="float" size="10" label="SIM saturation threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be saturated. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criterion, mass 0 is reported" />
+     			<param name="simAbsenseThreshold" optional="true" type="float" size="10" label="SIM absence threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be absent. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criterion, mass 0 is reported" />
+     			
+     			<param name="micMembershipThreshold" optional="true" type="float" size="10" label="MIC membership threshold" help="Minimum membership a peak should have to be counted in the MIC sum. E.g. 0.8 " />
+     			
+     		</when>
+     	</conditional>	
+
+     	
+	</inputs>
+	<outputs>
+	  <data name="centrotypesOut" format="msclust.csv" label="${tool.name} on ${on_string} - centrotypes file"/>
+	  <data name="simOut" format="msclust.csv" label="${tool.name} on ${on_string} - SIM file"/>
+	  <data name="micOut" format="msclust.csv" label="${tool.name} on ${on_string} - MIC file"/>
+	   <data name="mspOut" format="msp" label="${tool.name} on ${on_string} - SPECTRA file"/>
+	  <data name="classOut" format="msclust.csv" label="${tool.name} on ${on_string} - Classification file"/>
+	  <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report">
+	 	<!-- If the expression is false, the file is not created -->
+	  	<filter>( summaryReport == True )</filter>
+	  </data>
+	</outputs>
+	<tests>
+	  <!--  find out how to use -->
+	</tests>
+  <help>
+
+<!-- see also http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#hyperlink-targets -->
+  
+.. class:: infomark
+  
+This tool extracts spectra from ion-wise aligned MS(/MS) results. It uses expression profiles and 
+retention times of the putative ions to cluster them. Each cluster is then used to generate 
+one spectrum containing the clustered ions (peaks). 
+
+.. image:: $PATH_TO_IMAGES/msclust_summary.png 
+
+
+-----
+
+**Output**
+
+This tool returns a number of output files and a small report. 
+
+**Parameters index**
+
+
+*Select the approach used for imputing missing values:* only select this if you used a specific method to 
+fill in the data gaps in the input file. One example is replacing zero values by some randomly generated low value.
+If MeTot is chosen, a value is considered generated if it contains a dot '.' followed by some number 
+other than 0 (zero) after the dot; e.g. 123.4 would be treated as generated, whereas 123.0 would not. 
+
+*Effective Peaks:* Neighborhood window size to consider when calculating density. Smaller values increase 
+performance but are less reliable.
+
+*Peak Width, in scans:* width, in scans, of the window within which peaks are considered 'close'. One can see this as the 
+'tolerated variation in scans' for the apex positions of the fragment peaks composing a cluster. 
+Note: if MetAlign was used, this is the variation *after* pre-processing by MetAlign.   
+
+*Peak Width confidence:* The higher the confidence, the stricter the threshold.
+
+*Correlation threshold (0.0 - 1.0):* Tolerance center for the Pearson distance calculation. The higher this value, 
+the higher the correlation between 2 items has to be for them to be considered 'close'. 
+
+*Correlation threshold confidence:* The higher the confidence, the stricter the threshold. `More...`__
+
+*Potential Density reduction (0.0 - 1.0):* Reduction tolerance center for the Pearson distance calculation. 
+The higher this value, the less the low-correlated items get reduced, giving them a chance to form a cluster of their own. 
+
+*Potential Density reduction softness:* Reduction curve slope for the Pearson distance tolerance. Lower 
+values = stricter separation at the value determined in 'Potential Density reduction' above 
+(TODO review this comment). 
+
+*Stop Criterion:* When to stop reducing and looking for new clusters. Lower values = more iterations 
+
+.. __: javascript:window.open('$PATH_TO_IMAGES/confidence_and_slope_params_explain.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')
+
+
+-----
+
+**Output files described below**
+
+-----
+
+*SPECTRA:* this file can be submitted to NIST for identification of the spectra.
+
+`Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation`_  
+
+.. _Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation: javascript:window.open('$PATH_TO_IMAGES/sample_sel_and_peak_height_correction.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')
+
+-----
+
+*MIC:* stands for Measured Ions Count -> it contains, for each cluster, the sum of the ion count 
+values (corrected by their membership) for all MEASURED cluster ions in the given sample.
+
+The MIC for a **cluster i** in **sample s**, where **cluster i** has **n** members is thus: 
+
+sum ( [intensity of member n in **sample s**] x [membership value of member n in **cluster i** ] )
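+
+For example (illustrative numbers only): if **cluster i** has three member ions with
+intensities 120, 80 and 50 in **sample s**, and memberships 0.9, 0.7 and 0.4, then its
+MIC in **sample s** is 120 x 0.9 + 80 x 0.7 + 50 x 0.4 = 184.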
+
+-----
+
+*SIM:* stands for Selective Ion Mode ->  it contains, for each cluster, the intensity values of the 
+most representative member ion peak of this cluster. The most representative member peak is the one with the 
+highest membership x average_intensity. This definition leads to conflicts, as a peak can have a 
+membership in two or more clusters. The assignment of a SIM peak to a cluster depends on 
+the configured data type (LC or GC-MS). NB: this can be overruled in the "advanced settings":
+
+(1) LC-MS SIM: select SIM peak only once and for the centrotype in which this specific mass has its 
+highest membership; for neighboring centrotypes use its "second best SIM", etcetera. In other words,
+if the SIM peak has been identified as the SIM in more than 1 cluster, assign as SIM to the cluster 
+with highest membership. Continue searching for other SIM peaks to assign to the other clusters until 
+all ambiguities are solved.
+
+(2) GC-MS SIM: the SIM peak can be "shared" by multiple clusters. However, the intensity values are corrected
+by the membership value of the peak in the cluster in case the SIM peak is "shared". If the SIM peak is not 
+"shared" then the "raw" intensity values of the SIM peak are recorded in the SIM file. 
+
+`Click here for more details on the SIM output file`_  
+
+.. _Click here for more details on the SIM output file: javascript:window.open('$PATH_TO_IMAGES/sample_SIM.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')
+
+
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilterGCMS_tabular.xml	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,77 @@
+<tool id="rankfilterGCMS_tabular" name="RIQC-RankFilter GC-MS from tabular file" version="1.0.2">
+  <description>Convert Retention Time to Retention Index</description>
+  <command interpreter="python">rankfilter_GCMS/rankfilter.py $input_file</command>
+  <inputs>
+    <param format="tabular" name="sample" type="data" label="Sample File" 
+	       help="Converted PDF file in tabular format" />
+	<!-- question: is this calibration file not column specific as it includes RT info?? -->
+    <param name="calibration"  type="select" label="Calibration File" 
+           help="Calibration file with reference masses (e.g. alkanes) with their RT and RI values"
+    		dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RankFilter_Calibration_Files")'/>
+
+    <param name="analysis_type" type="select" format="text" label="Analysis Type"
+    	   help="Select the type of analysis that has been used to generate the sample file">
+      <option value="NIST">NIST</option>
+      <option value="AMDIS">AMDIS</option>
+    </param>
+    <param name="model" type="select" format="text" label="Select a model to be used "
+    	   help="Both linear and (3rd degree) polynomial models are available ">
+      <option value="linear">Linear</option>
+      <option value="poly">Polynomial</option>
+    </param>
+    <param name="lib_data" type="select" label="Library" 
+	       help="Reference global lookup library file with CAS numbers and respective (previously calculated) RIsvr values" 
+           dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RankFilter_lookup_libraries")'/>	       
+	       
+    <param name="window" type="float" label="Window" value="10.56" />
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name}" name="onefile" />
+  </outputs>
+  <!-- file with implementation of the function get_directory_files() used above  -->
+  <code file="match_library.py" />
+  <configfiles>
+    <configfile name="input_file">
+      sample = ${sample}
+      calibration = ${calibration}
+      lib_data = ${lib_data}
+      window = ${window}
+      analysis_type = ${analysis_type}
+      tabular = True
+      onefile = ${onefile}
+      model = ${model}
+    </configfile>
+  </configfiles>
+  <help>
+Estimates the experimental RI (RIexp) by building an RI(RT) calibration function from the
+given calibration file.
+
+It also determines the predicted RI (RIsvr) by looking up, for each entry of the given input file (Sample File), 
+its respective RIsvr value in the given global lookup library, based on its CAS number 
+(this step is also called the "RankFilter analysis", see reference below; the Sample File may come from either NIST or AMDIS). 
+This yields a prediction of the RI for 
+a compound according to the "RankFilter procedure" (RIsvr). 
+
+Output is a tab separated file in which four columns are added:
+
+	- **Rank** Calculated rank
+	- **RIexp** Experimental Retention Index (RI)
+	- **RIsvr** Calculated RI based on support vector regression (SVR)
+	- **%rel.err** Relative RI error (%rel.error = 100 * (RISVR − RIexp) / RIexp)
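+
+For example (illustrative numbers only): with RIexp = 1200, RIsvr = 1230 and the default
+window of 10.56, %rel.err = 100 * (1230 - 1200) / 1200 = 2.5; since |2.5| does not exceed
+half the window (5.28), such a hit would get Rank 1.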
+
+.. class:: infomark
+
+**Notes**
+
+	- The layout of the Calibration file should include the following columns: 'MW', 'R.T.' and 'RI'.
+	- Selecting 'Polynomial' in the model parameter will calculate a 3rd degree polynomial model that will
+	  be used to convert from RT to RI.
+
+-----
+
+**References**
+
+    - **RankFilter**: Mihaleva et al. (2009) *Automated procedure for candidate compound selection in GC-MS 
+      metabolomics based on prediction of Kovats retention index*. Bioinformatics, 25 (2009), pp. 787–794
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/__init__.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,5 @@
+'''
+Created on Mar 14, 2012
+
+@author: marcelk
+'''
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/pdfread.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,210 @@
+"""
+Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import csv
+
+def getPDF(filename, print_progress):
+    '''
+    Parses NIST PDF file
+    @param filename: PDF file to parse
+    '''
+    NistInput = {}
+    NistInput_missed = {}
+    nist_input = open(filename, 'r').read()
+
+    hitid = []
+    rt = []
+    name = []
+    forward = []
+    cas = []
+    reverse = []
+    prob = []
+    lib_id = []
+    nist_id = []
+    missed_compounds = []
+    rt_missed_compounds = []
+    formula = []
+
+    hit_list = nist_input.split('** Search Report Page 1 of 1 **')
+    hit_list.pop(0)
+    #number_hits = range(10)
+    line_id = 0
+    for line in hit_list:
+        line = line.strip().translate(None, '\r')
+        if line != '':
+            hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')
+
+            spec_id = hits.pop(0).split(' ')[1]
+            j = 0
+            for hh in hits:
+                cell = hh.split(';')
+                if print_progress == True:
+                    print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell
+                line_id += 1
+                if len(cell) == 7:  # the compound has CAS number
+                    if len(cell[1].split(':')) == 2:
+                        forward.append(cell[1].split(':')[1])
+                        # the name itself contains a ':'; join the parts of cell[0] from index 1 to the end
+                        if len(cell[0].split(':')) > 2:
+                            name_tmp = ':'.join(cell[0].split(':')[1:])
+                        else:
+                            name_tmp = cell[0].split(':')[1]
+                        name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
+                        name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
+                        name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace("  ", " "))
+                        if name_tmp:
+                            if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H':
+                                formule = (name_tmp.split(' ')[-1])
+                            else:
+                                formule = ('not_def')
+                        else:
+                            formule = ('not_def')
+                        formula.append(formule.replace("  ", " "))
+                        reverse.append(cell[2].split(':')[1])
+                        prob.append(cell[3].split(' ')[2].replace('%', ''))
+                        cas.append(cell[4].split(':')[1])
+                        lib_id.append(cell[5].split(':')[1])
+                        nist_id.append(cell[6].split(':')[1].replace('.', '').strip())
+                        j = j + 1
+                    else:
+                        missed_compounds.append(hh)
+                        rt_missed_compounds.append(spec_id)
+
+                elif len(cell) >= 6:  # the compound has no CAS number
+                    if len(cell[1].split(':')) == 2:
+
+                        forward.append(cell[1].split(':')[1])
+                        # the name itself contains a ':'; join the parts of cell[0] from index 1 to the end
+                        if len(cell[0].split(':')) > 2:
+                            name_tmp = ':'.join(cell[0].split(':')[1:])
+                        else:
+                            name_tmp = cell[0].split(':')[1]
+                        name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
+                        name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
+                        name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace("  ", " "))  # "  ", " "
+                        name_tmp = name_tmp.strip().split(' ')
+                        if name_tmp:
+                            if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
+                                formule = (name_tmp[-1])
+                            else:
+                                formule = ('not_def')
+                        else:
+                            formule = ('not_def')
+                        formula.append(formule.replace("  ", " "))
+                        reverse.append(cell[2].split(':')[1])
+                        prob.append(cell[3].split(' ')[2].replace('%', ''))
+                        cas.append('undef')
+                        lib_id.append(cell[4].split(':')[1])
+                        nist_id.append(cell[5].split(':')[1].replace('.', '').strip())
+                        j = j + 1
+
+                    else:
+                        missed_compounds.append(hh)
+                        rt_missed_compounds.append(spec_id)
+
+                else:
+                    # Missing columns: stop parsing (NOTE: returns None, which the caller does not handle)
+                    return
+
+            for _ in range(j):
+                hitid.append(str(spec_id.replace("  ", " ")))
+                rt.append(str(float(spec_id.split('-')[3]) / 1e+06))
+
+    NistInput['ID'] = hitid
+    NistInput['R.T.'] = rt
+    NistInput['Name'] = name
+    NistInput['CAS'] = cas
+    NistInput['Formula'] = formula
+    NistInput['Forward'] = forward
+    NistInput['Reverse'] = reverse
+    NistInput['Probability'] = prob
+    NistInput['Library'] = lib_id
+    NistInput['Library ID'] = nist_id
+    NistInput_missed['Missed Compounds'] = missed_compounds
+    NistInput_missed['RT missed Compounds'] = rt_missed_compounds
+
+    return NistInput, NistInput_missed
+
+
+def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
+    '''
+    Converts NIST PDF file to tabular format
+    @param filename: PDF file to parse
+    @param output_file: output file for the hits
+    @param error_file: output file for failed hits
+    '''
+    [HitList, HitList_missed] = getPDF(filename, print_progress)
+    # save HitList as tab-separated file
+    Hitlist_as_text = "\t".join(HitList.keys()) + "\n"
+    Hitlist_array_of_array = ([HitList[row] for row in HitList.keys()])
+    Hitlist_as_text += str("\n".join(["\t".join(e) for e in zip(*Hitlist_array_of_array)]))
+    output_fh = open(output_file, 'wb')
+    output_fh.write(Hitlist_as_text)
+    output_fh.close()
+
+    out_missed_pdf = open(error_file, 'wb')
+    for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
+        out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
+    out_missed_pdf.close()
+
+
+def read_tabular(in_csv):
+    '''
+    Parses a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = {}
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
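+
+# Illustrative sketch (made-up data): read_tabular turns a tab-separated file with
+# header "ID<tab>R.T." and one row "spec1<tab>5.3" into a column-oriented dict:
+#
+#     {'ID': ['spec1'], 'R.T.': ['5.3']}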
+
+
+def read_tabular_old(filename):
+    '''
+    Function to read tabular format (created by convert_pdftotext2tabular)
+    and output a dict with header of columns as key and value is columns of tabular as list
+    @param filename: tabular file to read
+    '''
+    input_fh = open(filename, 'r')
+    colnames = input_fh.readline().strip().split('\t')
+    cells = []
+    for line in input_fh.readlines():
+        cells.append(line.strip().split('\t'))
+    # transform from row-oriented structure to column-oriented structure
+    cells = zip(*cells)
+    # store the list of lists in the form of the final output
+    RankFilterGC_format = {}
+    for colnumber in range(len(colnames)):
+        RankFilterGC_format[colnames[colnumber]] = cells[colnumber]
+    return RankFilterGC_format
+
+
+if __name__ == '__main__':
+    convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/pdftotabular.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,41 @@
+"""
+Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this software except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+"""
+
+import sys
+import pdfread
+from subprocess import call
+
+
+def convert_pdftotext(filename, output_file):
+    '''
+    Converts PDF file to text
+    @param filename: PDF file to parse
+    @param output_file: output text file for the hits    
+    '''
+    
+    try:
+        call(["pdftotext", filename, output_file])
+    except OSError:
+        raise Exception("Error while trying to convert PDF to text (is 'pdftotext' installed and on the PATH?)")
+   
+   
+
+
+if __name__ == '__main__':
+    pdf_as_text = sys.argv[1]+".txt"
+    convert_pdftotext(sys.argv[1], pdf_as_text)
+    pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/rankfilter.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,432 @@
+"""
+Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+#Library functions definition
+#----------Begin-------------
+from numpy import array, linalg, ones
+from numpy.polynomial import polynomial
+import math
+import pdfread
+import sys
+
+
+def calibrate(standards):
+    '''
+    Calculates the RT to RI conversion: RI = a + b*RT
+    @param standards: variable containing RI and RT data
+    '''
+    A = ones((len(standards['R.T.']), 2), dtype=float)
+    A[:, 0] = array(map(float, standards['R.T.']))
+    [coeff, res, r, s] = linalg.lstsq(A, array(map(float, standards['RI'])))
+
+    return coeff
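+
+
+def _example_calibrate():
+    '''
+    Illustrative sketch only (not called by the tool; the standards below are
+    made up). With two points the linear fit RI = coeff[0]*RT + coeff[1] is
+    exact, so this returns approximately [40.0, 600.0].
+    '''
+    standards = {'R.T.': ['5.0', '10.0'], 'RI': ['800', '1000']}
+    return calibrate(standards)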
+
+
+def calibrate_poly(standards):
+    '''
+    Calculates the RT to RI conversion using a polynomial model
+    @param standards: variable containing RI and RT data
+    '''
+    retention_time = array(map(float, standards['R.T.']))
+    retention_index = array(map(float, standards['RI']))
+
+    # Fit a 3rd degree polynomial
+    fit = polynomial.polyfit(retention_time, retention_index, 3)[::-1]
+    return [fit[0], fit[1], fit[2], fit[3]]
+
+
+def calculate_poly(retention_time, poly_cal):
+    '''
+    Converts a given retention time to retention index using the calculated polynomial model
+    @param retention_time: retention_time to convert to retention index
+    @param poly_cal: result from calculating regression
+    '''
+    # Calculates RI based on given retention_time using polynomial function
+    retention_time = array(map(float, retention_time))
+    if len(retention_time) > 1:
+        ri_exp = []
+        for i in retention_time:
+            ri_exp.append(poly_cal[0] * (i ** 3) + poly_cal[1] * (i ** 2) + (i * poly_cal[2]) + poly_cal[3])
+        return ri_exp
+    else:
+        return poly_cal[0] * (retention_time ** 3) + poly_cal[1] * (retention_time ** 2) + (retention_time * poly_cal[2]) + poly_cal[3]
+
+
+def convert_rt(hit_list, coeff):
+    '''
+    Converts a given retention time to retention index using the linear model
+    @param hit_list: list holding the retention time
+    @param coeff: calculated coefficient (slope and intercept) using the linear model
+    '''
+    #Convert RT to RI
+    hit_list['RIexp'] = array(map(float, hit_list['R.T.'])) * coeff[0] + coeff[1]
+    return hit_list
+
+
+def convert_rt_poly(hit_list, poly_cal):
+    '''
+    Calls the actual RT to RI converter and returns the updated list with added RIexp value
+    @param hit_list: result list containing the retention time
+    '''
+    hit_list['RIexp'] = array(map(float, calculate_poly(hit_list['R.T.'], poly_cal)))
+    return hit_list
+
+
+def get_data(libdata, LabelCol):
+    '''
+    Retrieves datacolumns indicated by LabelCol from libdata (generic function)
+    Returns a dict with the requested column names as keys
+    @param libdata: file from which data is loaded
+    @param LabelCol: columns to retrieve
+    '''
+    from numpy import take
+    LibData = open(libdata, 'r').read().split('\n')
+
+    #Get the labels of the columns in the file
+    FirstLine = LibData.pop(0).split('\t')
+
+    FirstLine[-1] = FirstLine[-1].replace('\r', '')
+
+    # Create a temporary variable containing all the data
+    tmp_data = []
+    for ll in LibData:
+        if ll != '':
+            tmp_data.append(ll.split('\t'))
+
+    # Find the indices of the desired data
+    ind = []
+    try:
+        for key in LabelCol:
+            ind.append(FirstLine.index(key))
+    except ValueError:
+        print str(key) + " not found in first line of library (" + str(libdata) + ")"
+        print "the following items are found in the first line of the library:\n" + str(FirstLine)
+        sys.exit(1)
+    # Extract the desired data
+    data = []
+    for x in tmp_data:
+        data.append(take(array(x), ind))
+
+    # library_data = dict(zip(LabelCol,transpose(data)))
+    library_data = dict(zip(LabelCol, map(lambda *x: list(x), *data)))
+    return library_data
+
+
+def rank_hit(hit_list, library_data, window):
+    '''
+    Computes the Rank and % relative error
+    @param hit_list: input data
+    @param library_data: library used for reading the RIsvr data 
+    @param window: error window
+    '''
+    hit_match_ripred = []
+    hit_match_syn = []
+    # Convert 'Name' data to list in order to be indexed
+    # library_data['Name']=list(library_data['Name'])
+
+    for hit_cas, hit_name in zip(hit_list['CAS'], hit_list['Name']):
+        index = 0
+        if hit_cas != 'undef':
+            try:
+                index = library_data['CAS'].index(hit_cas.replace(' ', '').replace('-', ''))
+            except ValueError:
+                try:
+                    index = library_data['Name'].index(hit_name.replace(' ', ''))
+                except ValueError:
+                    # If for any reason the hit is not present in the mainlib
+                    # library, flag this with index = 0 (reported as -999 below)
+                    index = 0
+        else:
+            try:
+                index = library_data['Name'].index(hit_name.replace(' ', ''))
+            except ValueError:
+                # If for any reason the hit is not present in the mainlib
+                # library, flag this with index = 0 (reported as -999 below)
+                index = 0
+        if index != 0:
+            hit_match_ripred.append(float(library_data['RIsvr'][index]))
+            hit_match_syn.append(library_data['Synonyms'][index])
+        else:
+            hit_match_ripred.append(-999)
+            hit_match_syn.append('None')
+    hit_list['RIsvr'] = hit_match_ripred
+    hit_list['Synonyms'] = hit_match_syn
+
+    # Determine the relative difference between the experimental
+    # and the predicted RI
+    ri_err = []
+
+    for ri_exp, ri_qsar in zip(hit_list['RIexp'], hit_list['RIsvr']):
+        if int(ri_qsar) != -999:
+            ri_err.append(float(int(float(ri_qsar)) - int(float(ri_exp))) / int(float(ri_exp)) * 100)
+        else:
+            ri_err.append(-999)
+
+    # Define the rank of the hits
+    hit_rank = []
+
+    for tt in ri_err:
+        if tt == -999:
+            # The name of the hit generated with AMDIS did not match a name
+            # in the mainlib library
+            hit_rank.append(4)
+        else:
+            # Rank 1 - ri_err is within +/- window/2
+            if abs(tt) <= float(window) / 2:
+                hit_rank.append(1)
+            # Rank 2 - window/2 < ri_err <= window 
+            if abs(tt) > float(window) / 2 and abs(tt) <= float(window):
+                hit_rank.append(2)
+            # Rank 3 - ri_err > window
+            if abs(tt) > float(window):
+                hit_rank.append(3)
+    hit_list['Rank'] = hit_rank
+    hit_list['%rel.err'] = ri_err
+    return hit_list
+
+def print_to_file(hit_list, filename, keys_to_print, print_subsets=True):
+    '''
+    Writes output data to files (four output files are generated):
+        filename_ranked - the hits are ranked
+        filename_filter_in - only hits with rank 1 and 2 are retained
+        filename_filter_out - hits with rank 3 are filtered out
+        filename_filter_missed - hits with rank 4 - there was no match with the
+                                 library data
+    @param hit_list: a dictionary with the ranked hits
+    @param filename: the core of the output file names
+    @param keys_to_print: determines the order in which the data are printed
+    @param print_subsets: whether to also write the filter_in/filter_out/missed subset files
+    '''
+    from numpy import take
+
+    out_ranked = open(filename["ranked"], 'w')
+    if (print_subsets):
+        out_in = open(filename["filter_in"], 'w')
+        out_out = open(filename["filter_out"], 'w')
+        out_missed = open(filename["missed"], 'w')
+
+    #Convert RIexp and RIsvr into integer for printing
+    hit_list['RIexp'] = map(int, map(math.ceil, hit_list['RIexp']))
+    hit_list['RIsvr'] = map(int, map(math.ceil, hit_list['RIsvr']))
+
+    #Establish the right order of the data to be printed    
+    tmp_items = hit_list.items()
+    tmp_keys = hit_list.keys()
+    ind = []
+
+    for key in keys_to_print:
+        ind.append(tmp_keys.index(key))
+
+    #Print the labels of the columns
+    line = '\t'.join(take(array(tmp_keys), ind))
+    out_ranked.write('%s\n' % line)
+    if (print_subsets):
+        out_in.write('%s\n' % line)
+        out_out.write('%s\n' % line)
+        out_missed.write('%s\n' % line)
+
+    #Collect the data for each hit in the right order and print them
+    #in the output file
+    i = 0
+    for name in hit_list['Name']:
+        tt = []
+        for x in iter(tmp_items):
+            # trim the value if it is a string:
+            if isinstance(x[1][i], basestring):
+                tt.append(x[1][i].strip())
+            else:
+                tt.append(x[1][i])
+        tmp1 = take(array(tt), ind)
+        line = '\t'.join(tmp1.tolist())
+
+        out_ranked.write('%s\n' % line)
+        if(print_subsets):
+            if hit_list['Rank'][i] == 4:
+                out_missed.write('%s\n' % line)
+            if hit_list['Rank'][i] == 3:
+                out_out.write('%s\n' % line)
+            if hit_list['Rank'][i] == 1 or hit_list['Rank'][i] == 2:
+                out_in.write('%s\n' % line)
+
+        i = i + 1
+
+#---------End--------------
+def main():
+    #Ranking and filtering procedure
+    if len(sys.argv) < 2:
+        print "Usage:"
+        print "python RankFilter_GC-MS.py input \n"
+        print "input is a text file that specifies the names and the location"
+        print "of the files with the sample, compounds for calibration, library"
+        print "data, the core of the name ot the outputs, and the value of the"
+        print "window used for the filtering \n"
+
+        sys.exit(1)
+
+    #------Read the input file------
+    try:
+        ifile = open(sys.argv[1], 'r').read().split('\n')
+    except IOError:
+        print "file", sys.argv[1], "cannot be found"
+        sys.exit(1)
+
+    #Get the file names for the data
+    #labels - the type of input files
+    #filenames - the names of the input files
+    labels = []
+    filenames = []
+
+    for l in ifile:
+        l = l.strip()
+        if l != '':
+            labels.append(l.split('=')[0].replace(' ', '').replace('\r', ''))
+            filenames.append(l.split('=')[1].replace(' ', '').replace('\r', ''))
+
+    InputData = dict(zip(labels, filenames))
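+
+    # Illustrative example of such an input file: 'label = filename' pairs,
+    # one per line (the paths below are placeholders):
+    #   sample = /data/amdis_output.txt
+    #   calibration = /data/calibration_standards.txt
+    #   lib_data = /data/library_ri_predictions.txt
+    #   analysis_type = AMDIS
+    #   model = linear
+    #   window = 10
+    #   output = /data/results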
+
+    #This part checks whether the 'output' option is set. If it is, its value is used
+    #as the prefix for all output file names. If it is not set, each output file has
+    #to be specified beforehand, which comes in handy for pipeline tools such as Galaxy.
+    print_subsets = True
+    NDIS_is_tabular = False
+
+    if 'output' in InputData:
+        output_files = {"ranked":InputData['output'] + "_ranked", \
+        "filter_in":InputData['output'] + "_filter_in", \
+        "filter_out":InputData['output'] + "filter_out", \
+        "missed":InputData['output'] + "_missed", \
+        "missed_parse_pdf":InputData['output'] + "_missed_parse_pdf"}
+    elif 'tabular' in InputData:
+        NDIS_is_tabular = True
+        if(not "onefile" in InputData):
+            output_files = {"ranked":InputData['ranked'], \
+            "filter_in":InputData['filter_in'], \
+            "filter_out":InputData['filter_out'], \
+            "missed":InputData['missed']}
+        else:
+            print_subsets = False
+            output_files = {"ranked":InputData['onefile']}
+    else:
+        output_files = {"ranked":InputData['ranked'], \
+        "filter_in":InputData['filter_in'], \
+        "filter_out":InputData['filter_out'], \
+        "missed":InputData['missed'], \
+        "missed_parse_pdf":InputData['missed_parse_pdf']}
+
+    #------Start with calibration------
+    #Check whether a file with data for the calibration is specified
+    #Specify which data to be read from the file with standard compounds
+    LabelColStand = ['Name', 'R.T.', 'RI']
+
+    if InputData['calibration'] != 'none':
+        #get the coefficients for the RT to RI conversion
+
+        try:
+            ifile = open(InputData['calibration'], 'r')
+        except IOError:
+            print "file", InputData['calibration'], "cannot be found"
+            sys.exit(1)
+
+        standards = get_data(InputData['calibration'], LabelColStand)
+        if InputData['model'] == 'linear':
+            coeff = calibrate(standards)
+        elif InputData['model'] == 'poly':
+            poly_cal = calibrate_poly(standards)
+        else:
+            print "error: model ", InputData['model'], " can not be found. Use 'linear' or 'poly' "
+            sys.exit(1)
+    else:
+        #No file has been specified for the calibration
+        #Use the default coefficients
+        print 'No file has been specified for the calibration'
+        print 'WARNING: the default coefficients will be used'
+        coeff = array([29.4327, 454.5260])
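+        # Judging by the calibration tests, convert_rt applies a first-order
+        # fit, so with these defaults RIexp = 29.4327 * R.T. + 454.5260.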
+
+    if InputData['analysis_type'] == 'AMDIS':
+        try:
+            AmdisOut = open(InputData['sample'], 'r')
+            #Specify which data to extract from the AMDIS output table.
+            #Weighted and Reverse are measures of the match between the experimental
+            #and the library spectra. The experimental spectrum is used as the template
+            #for the calculation of Weighted, whereas for Reverse the template is the
+            #library spectrum.
+            LabelCol = ['CAS', 'Name', 'R.T.', 'Weighted', 'Reverse', 'Purity']
+
+            #Get the data from the AMDIS output file
+            HitList = get_data(InputData['sample'], LabelCol)
+            #Remove '>' from the names
+            HitList['Name'] = [s.replace('>', '') for s in HitList['Name']]
+        except IOError:
+            print "the file", InputData['sample'], "cannot be found"
+            sys.exit(1)
+    if InputData['analysis_type'] == 'NIST':
+        #HitList_missed - a variable of type dictionary containing the hits with the symbol ";"
+        #in the name
+        if not NDIS_is_tabular:
+            print "Warning; NDIS is not tabular format, reading PDF..\n"
+            [HitList, HitList_missed] = pdfread.getPDF(InputData['sample'])
+        else:
+            HitList = pdfread.read_tabular(InputData['sample'])
+
+    #Convert RT to RI
+    if InputData['model'] == 'linear':
+        HitList = convert_rt(HitList, coeff)
+    if InputData['model'] == 'poly':
+        print "Executing convert_rt_poly().."
+        HitList = convert_rt_poly(HitList, poly_cal)
+
+    #------Read the library data with the predicted RI------
+    try:
+        LibData = open(InputData['lib_data'], 'r')
+    except IOError:
+        print "the file", InputData['lib_data'], "cannot be found"
+        sys.exit(1)
+
+    #Specify which data to be extracted from the library data file
+    LabelColLib = ['CAS', 'Name', 'RIsvr', 'Synonyms']
+    LibraryData = get_data(InputData['lib_data'], LabelColLib)
+
+    #------Match the hits with the library data and rank them------
+    if InputData['window'] != '':
+        HitList = rank_hit(HitList, LibraryData, InputData['window'])
+    else:
+        print "No value for the window used for the filtering is specified \n"
+        sys.exit(1)
+
+    #------Print the ranked and filtered hits------
+    #Specify which data to be printed
+    if InputData['analysis_type'] == 'AMDIS':
+        keys_to_print = ['R.T.', 'CAS', 'Name', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Weighted', 'Reverse', 'Synonyms']
+    else:
+        keys_to_print = ['ID', 'R.T.', 'Name', 'CAS', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Forward', 'Reverse', 'Synonyms', 'Library']
+
+    #Skip this error output from reading a pdftotext file when the input file is tabular
+    if InputData['analysis_type'] == 'NIST' and not NDIS_is_tabular:
+        out_missed_pdf = open(output_files['missed_parse_pdf'], 'w')
+        for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
+            out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
+        out_missed_pdf.close()
+
+    print_to_file(HitList, output_files, keys_to_print, print_subsets)
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/test/test_pdfread.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,30 @@
+'''
+Created on Mar 13, 2012
+
+@author: marcelk
+'''
+from GCMS.rankfilter_GCMS import pdfread  # @UnresolvedImport
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+import unittest
+
+
+class Test(unittest.TestCase):
+
+    def setUp(self):
+        self.nist_pdf = resource_filename(__name__, "data/NIST_test_PDF.txt")
+
+    def test_getPDF(self):
+        '''
+        Tests the reading and parsing of a NIST PDF file
+        '''
+        [hitlist, hitlist_missed] = pdfread.getPDF(self.nist_pdf)
+        rows = [hitlist[row] for row in hitlist.keys()]
+        data = [set(row) for row in zip(*rows)]
+        expected_element = set(('12.3', ' Sucrose ', '14', 'undef', ' standards 2009', ' 660', 'not_def',
+                                '18495-0.142537-21284-2.26544e+07-135', '22.6544', ' 714'))
+        self.failUnless(expected_element in data)
+        self.failUnless(len(hitlist_missed) != 0)
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.test_getPDF']
+    unittest.main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/test/test_rankfilter.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,58 @@
+'''
+Created on Mar 13, 2012
+
+@author: marcelk
+'''
+from GCMS.rankfilter_GCMS.rankfilter import get_data, calibrate, calibrate_poly, convert_rt, convert_rt_poly
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+import unittest
+
+
+class Test(unittest.TestCase):
+
+    def setUp(self):
+        self.calibration = resource_filename(__name__, "data/calibration.txt")
+        self.sample = resource_filename(__name__, "data/sample.txt")
+
+    def test_get_data(self):
+        samples = get_data(self.sample, ['CAS', 'Name', 'R.T.', 'Forward', 'Reverse', 'Formula'])
+        self.assertEqual(['C29H50O', 'C29H50O', 'C29H50O', 'C29H50O', 'C29H50O', 'C28H48O', 'C28H48O', 'C28H48O',
+                          'C28H48O', 'C27H44O2', 'C29H50O2', 'C29H50O2', 'C29H50O2', 'C29H50O2', 'C29H50O2',
+                          'C28H48O2', 'C28H48O2', 'C28H48O2', 'C28H48O2', 'C30H50O3', 'C29H50O', 'C29H50O',
+                          'C29H50O', 'C29H50O'], samples['Formula'])
+
+    def test_calibrate(self):
+        standards = get_data(self.calibration, ['Name', 'R.T.', 'RI'])
+        coeff = calibrate(standards)
+        self.assertAlmostEqual(103.19073523551653, coeff[0], 5)
+        self.assertAlmostEqual(277.14374835349395, coeff[1], 5)
+
+    def test_calibrate_poly(self):
+        standards = get_data(self.calibration, ['Name', 'R.T.', 'RI'])
+        poly_cal = calibrate_poly(standards)
+        self.assertAlmostEqual(0.028897105229818407, poly_cal[0], 5)
+        self.assertAlmostEqual(0.704572097468386, poly_cal[1], 5)
+        self.assertAlmostEqual(51.636852478526357, poly_cal[2], 5)
+        self.assertAlmostEqual(704.95499738104672, poly_cal[3], 5)
+
+    def test_convert_rt(self):
+        standards = get_data(self.calibration, ['Name', 'R.T.', 'RI'])
+        coeff = calibrate(standards)
+        convert = convert_rt({'R.T.': [5, 10, 15, 20]}, coeff)
+        self.assertAlmostEqual(793.09742453, convert['RIexp'][0], 5)
+        self.assertAlmostEqual(1309.05110071, convert['RIexp'][1], 5)
+        self.assertAlmostEqual(1825.00477689, convert['RIexp'][2], 5)
+        self.assertAlmostEqual(2340.95845306, convert['RIexp'][3], 5)
+
+    def test_convert_rt_poly(self):
+        standards = get_data(self.calibration, ['Name', 'R.T.', 'RI'])
+        poly_cal = calibrate_poly(standards)
+        convert = convert_rt_poly({'R.T.': [5, 10, 15, 20]}, poly_cal)
+        self.assertAlmostEqual(984.36570036, convert['RIexp'][0], 5)
+        self.assertAlmostEqual(1320.67783714, convert['RIexp'][1], 5)
+        self.assertAlmostEqual(1735.56423664, convert['RIexp'][2], 5)
+        self.assertAlmostEqual(2250.69772778, convert['RIexp'][3], 5)
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_text2tabular.xml	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,14 @@
+<tool id="NDIStext2tabular" name="NIST_UTIL- NIST text to tabular format" version="1.0.2">
+  <description>Convert NIST text to tabular format</description>
+  <command interpreter="python">rankfilter_GCMS/pdftotabular.py $input $output $output_err</command>
+  <inputs>
+    <param format="pdf" name="input" type="data" label="NIST identifications report (PDF)"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} output on ${on_string}"  name="output" />
+    <data format="tabular" label="${tool.name} error log"  name="output_err" />
+  </outputs>
+  <help>
+    This tool converts NIST identification report output (PDF) to a tabular format needed for further use with the RIQC tools.
+  </help>
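+  <!-- Illustrative resolved command (Galaxy substitutes the actual dataset
+       paths for $input, $output and $output_err):
+       python rankfilter_GCMS/pdftotabular.py nist_report.pdf output.tab error_log.tab -->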
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/select_on_rank.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,21 @@
+import csv
+import sys
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+in_file = sys.argv[1]
+out_file = sys.argv[2]
+to_select_list = [str(select.strip()) for select in sys.argv[3].split(',') if (len(select) > 0)]
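+
+# Illustrative invocation (mirrors the command template in select_on_rank.xml;
+# the file names are placeholders):
+#   python select_on_rank.py input.tab output.tab "1,2"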
+
+data = list(csv.reader(open(in_file, 'rb'), delimiter='\t'))
+header = data.pop(0)
+header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in header]
+rank = header_clean.index("rank")
+
+writer = csv.writer(open(out_file, 'wb'), delimiter='\t')
+writer.writerow(header)
+for select in to_select_list:
+    writer.writerows([i for i in data if i[rank] == select])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/select_on_rank.xml	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,15 @@
+<tool id="filter_on_rank" name="RIQC-Filter on rank" version="1.0.2">
+  <description>Filter on the Rank field in the RankFilter output file</description>
+  <command interpreter="python">select_on_rank.py $input $output "$rank"</command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Source file (RankFilter ouptut)"/>
+    <param format="tabular" help="Filter on (keep different values separate with a comma)" value ="1,2" 
+	   name="rank" type="text" label="Select Ranks to keep"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} on ${on_string} selected ${rank}"  name="output" />
+  </outputs> 
+  <help>
+This tool removes all entries with non-selected rank values from the input file (the supported input is a RankFilter output file).
+  </help>
+</tool>
Binary file static/images/confidence_and_slope_params_explain.png has changed
Binary file static/images/msclust_summary.png has changed
Binary file static/images/sample_SIM.png has changed
Binary file static/images/sample_sel_and_peak_height_correction.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/__init__.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,1 @@
+''' BRS GCMS Galaxy Tools Module '''
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/integration_tests.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,268 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+from GCMS import library_lookup, combine_output
+from GCMS.rankfilter_GCMS import rankfilter
+import os.path
+import sys
+import unittest
+import re
+
+
+class IntegrationTest(unittest.TestCase):
+    def test_library_lookup(self):
+        '''
+        Run main for data/NIST_tabular and compare produced files with references determined earlier.
+        '''
+        # Create out folder
+        outdir = "output/" #tempfile.mkdtemp(prefix='test_library_lookup')
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+        outfile_base = os.path.join(outdir, 'produced_library_lookup')
+        outfile_txt = outfile_base + '.txt'
+
+        #Build up arguments and run
+        input_txt = resource_filename(__name__, "data/NIST_tabular.txt")
+        library = resource_filename(__name__, "data/RIDB_subset.txt")
+        regress_model = resource_filename(__name__, "data/ridb_poly_regression.txt")
+        sys.argv = ['test',
+                    library,
+                    input_txt,
+                    'Capillary',
+                    'Semi-standard non-polar',
+                    outfile_txt,
+                    'HP-5',
+                    regress_model]
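+        # Positional arguments (a sketch of library_lookup.main's CLI as used
+        # here): RI library file, NIST tabular input, column type, polarity,
+        # output path, preferred GC column, regression model file.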
+        # Execute main function with arguments provided through sys.argv
+        library_lookup.main()
+        #Compare with the reference file
+        reference_txt = resource_filename(__name__, 'reference/produced_library_lookup.txt')
+
+        #read both the reference file and the actual output file
+        expected = _read_file(reference_txt)
+        actual = _read_file(outfile_txt)
+
+        #convert the read-in files to lists we can compare
+        expected = expected.split()
+        actual = actual.split()
+
+        for exp, act in zip(expected, actual):
+            if re.match('\\d+\\.\\d+', exp):
+                exp = float(exp)
+                act = float(act)
+                self.assertAlmostEqual(exp, act, places=5)
+            else:
+                # compare the values as strings
+                self.failUnlessEqual(exp, act)
+
+
+    def test_combine_output_simple(self):
+        '''
+        Run main for data/NIST_tabular and compare produced files with references determined earlier.
+        '''
+        # Create out folder
+        outdir = "output/" #tempfile.mkdtemp(prefix='test_library_lookup')
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+        outfile_base = os.path.join(outdir, 'produced_combine_output')
+        outfile_single_txt = outfile_base + '_single.txt'
+        outfile_multi_txt = outfile_base + '_multi.txt'
+
+        #Build up arguments and run
+        input_rankfilter = resource_filename(__name__, "data/Rankfilter.txt")
+        input_caslookup = resource_filename(__name__, "data/Caslookup.txt")
+        sys.argv = ['test',
+                    input_rankfilter,
+                    input_caslookup,
+                    outfile_single_txt,
+                    outfile_multi_txt]
+        # Execute main function with arguments provided through sys.argv
+        combine_output.main()
+        #Compare with reference files
+        # reference_single_txt = resource_filename(__name__, 'reference/produced_combine_output_single.txt')
+        # reference_multi_txt = resource_filename(__name__, 'reference/produced_combine_output_multi.txt')
+        # self.failUnlessEqual(_read_file(reference_single_txt), _read_file(outfile_single_txt))
+        # self.failUnlessEqual(_read_file(reference_multi_txt), _read_file(outfile_multi_txt))
+
+        #Clean up
+        #shutil.rmtree(tempdir)
+
+
+        
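+    # Note: the next two methods are prefixed with 'def_' instead of 'test_',
+    # apparently so that unittest does not collect them on their own; they are
+    # invoked from test_combine_output_advanced below.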
+    def def_test_rank_filter_advanced(self):
+        '''
+        Run main of RankFilter
+        '''
+        # Create out folder
+        outdir = "output/integration/"
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+
+        #Build up arguments and run
+        input_txt = resource_filename(__name__, "data/integration/RankFilterInput_conf.txt")
+        sys.argv = ['test', 
+                    input_txt]
+        # Execute main function with arguments provided through sys.argv
+        rankfilter.main()
+        #Compare with reference files
+               
+    def def_test_library_lookup_advanced(self):
+        '''
+        Run main for data/NIST_tabular and compare produced files with references determined earlier.
+        '''
+        # Create out folder
+        outdir = "output/integration/" 
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+        outfile_base = os.path.join(outdir, 'produced_library_lookup_ADVANCED')
+        outfile_txt = outfile_base + '.txt'
+
+        #Build up arguments and run
+        input_txt = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt")
+        library = resource_filename(__name__, "data/integration/Library_RI_DB_capillary_columns-noDuplicates.txt")
+        regress_model = resource_filename(__name__, "data/integration/regression_MODEL_for_columns.txt")
+        sys.argv = ['test',
+                    library,
+                    input_txt,
+                    'Capillary',
+                    'Semi-standard non-polar',
+                    outfile_txt,
+                    'DB-5',
+                    regress_model]
+        # Execute main function with arguments provided through sys.argv
+        library_lookup.main()
+
+
+        
+    def test_combine_output_advanced(self):
+        '''
+        Variant of the test case above, but a bit more complex: some of the centrotypes
+        have different NIST hits, which should give them different RI values. This test
+        runs not only the combine output step but also the two preceding steps, so it
+        ensures the integration also works on the current code of all three tools.
+        '''
+            
+        # Run RankFilter 
+        self.def_test_rank_filter_advanced()
+        
+        # Run library CAS RI lookup
+        self.def_test_library_lookup_advanced()
+        
+        outdir = "output/integration/"    
+        outfile_base = os.path.join(outdir, 'produced_combine_output')
+        outfile_single_txt = outfile_base + '_single.txt'
+        outfile_multi_txt = outfile_base + '_multi.txt'
+
+        #Build up arguments and run
+        input_rankfilter = resource_filename(__name__, "output/integration/produced_rank_filter_out.txt")
+        input_caslookup = resource_filename(__name__, "output/integration/produced_library_lookup_ADVANCED.txt")
+        sys.argv = ['test',
+                    input_rankfilter,
+                    input_caslookup,
+                    outfile_single_txt,
+                    outfile_multi_txt]
+        # Execute main function with arguments provided through sys.argv
+        combine_output.main()
+        #Compare with reference files
+#        reference_single_txt = resource_filename(__name__, 'reference/produced_combine_output_single.txt')
+#        reference_multi_txt = resource_filename(__name__, 'reference/produced_combine_output_multi.txt')
+#        self.failUnlessEqual(_read_file(reference_single_txt), _read_file(outfile_single_txt))
+#        self.failUnlessEqual(_read_file(reference_multi_txt), _read_file(outfile_multi_txt))
+        
+        # Check 1: output single should have one record per centrotype:
+        
+        
+        # Check 2: output single has more records than output multi:
+        combine_result_single_items = combine_output._process_data(outfile_single_txt)
+        combine_result_multi_items = combine_output._process_data(outfile_multi_txt)
+        self.assertGreater(len(combine_result_single_items['Centrotype']),
+                           len(combine_result_multi_items['Centrotype']))
+        
+        
+        # Check 3: library_lookup RI column, centrotype column, ri_svr column are correct:
+        caslookup_items = combine_output._process_data(input_caslookup)
+        rankfilter_items = combine_output._process_data(input_rankfilter)
+        
+        # check that the caslookup RI column is correctly maintained in its original order in
+        # the combined file:
+        ri_caslookup = caslookup_items['RI']
+        ri_combine_single = combine_result_single_items['RI']
+        self.assertListEqual(ri_caslookup, ri_combine_single) 
+        
+        # check the centrotype column's integrity:
+        centrotype_caslookup = caslookup_items['Centrotype']
+        centrotype_combine_single = combine_result_single_items['Centrotype']
+        centrotype_rankfilter = _get_centrotype_rankfilter(rankfilter_items['ID'])
+        self.assertListEqual(centrotype_caslookup, centrotype_combine_single)
+        self.assertListEqual(centrotype_caslookup, centrotype_rankfilter)
+                
+        # integration and integrity checks:
+        file_NIST = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt")
+        file_NIST_items = combine_output._process_data(file_NIST)
+        # check that rank filter output has exactly the same ID items as the original NIST input file:
+        self.assertListEqual(file_NIST_items['ID'], rankfilter_items['ID']) 
+        # check the same for the CAS column:
+        self.assertListEqual(_get_strippedcas(file_NIST_items['CAS']), rankfilter_items['CAS'])
+        # now check the NIST CAS column against the cas lookup results:  
+        cas_NIST = _get_processedcas(file_NIST_items['CAS'])
+        self.assertListEqual(cas_NIST, caslookup_items['CAS'])
+        # now check the CAS of the combined result. If all checks are OK, it means the CAS column's order
+        # and values remained stable throughout all steps: 
+        self.assertListEqual(rankfilter_items['CAS'], combine_result_single_items['CAS']) 
+        
+        # check that the rankfilter RIsvr column is correctly maintained in its original order in
+        # the combined file:
+        risvr_rankfilter = rankfilter_items['RIsvr']
+        risvr_combine_single = combine_result_single_items['RIsvr']
+        self.assertListEqual(risvr_rankfilter, risvr_combine_single) 
+
+
+def _get_centrotype_rankfilter(id_list):
+    '''
+    returns the list of centrotype ids given a list of IDs of the
+    form e.g. 74-1.0-564-1905200-7, where the number before the
+    first "-" is the centrotype id
+    '''
+    result = []
+    for compound_id in id_list:
+        centrotype = compound_id.split('-')[0]
+        result.append(centrotype)
+
+    return result
+
+
+def _get_processedcas(cas_list):
+    '''
+    returns the list of CAS numbers in the form C64175 instead of 64-17-5
+    '''
+    result = []
+    for cas in cas_list:
+        processed_cas = 'C' + str(cas.replace('-', '').strip())
+        result.append(processed_cas)
+
+    return result
+
+def _get_strippedcas(cas_list):
+    '''
+    removes the leading white space from e.g. " 64-17-5"
+    '''
+    result = []
+    for cas in cas_list:
+        result.append(cas.strip())
+
+    return result
+
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename:
+    '''
+    with open(filename) as handle:
+        return handle.read()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_combine_output.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,106 @@
+'''
+Created on Mar 27, 2012
+
+@author: marcelk
+'''
+from GCMS import combine_output
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+import os
+import shutil
+import tempfile
+import unittest
+
+
+class Test(unittest.TestCase):
+    '''
+    Tests for the 'combine_output' Galaxy tool
+    '''
+
+    def setUp(self):
+        self.rf_output = resource_filename(__name__, "data/RankFilter.txt")
+        self.cl_output = resource_filename(__name__, "data/CasLookup.txt")
+
+    def test_process_data(self):
+        '''
+        Tests the processing of the RankFilter and CasLookup files into dictionaries
+        '''
+        rfdata = combine_output._process_data(self.rf_output)
+        cldata = combine_output._process_data(self.cl_output)
+        self.assertEqual(set([' 18457-04-0', ' 55133-95-4', ' 58-08-2', ' 112-34-5']), set(rfdata['CAS']))
+        self.assertEqual(set(['C58082', 'C18457040', 'C55133954', 'C112345']), set(cldata['CAS']))
+
+    def test_add_hit(self):
+        '''
+        Tests the combination of two records from both the RankFilter- and CasLookup-tools
+        '''
+        rfdata = combine_output._process_data(self.rf_output)
+        cldata = combine_output._process_data(self.cl_output)
+        index = 0
+        rf_record = dict(zip(rfdata.keys(), [rfdata[key][index] for key in rfdata.keys()]))
+        cl_record = dict(zip(cldata.keys(), [cldata[key][index] for key in cldata.keys()]))
+
+        hit = combine_output._add_hit(rf_record, cl_record)
+        self.assertEqual(len(hit), 27)
+
+        # Pass empty record, should fail combination
+        self.assertRaises(KeyError, combine_output._add_hit, rf_record, {})
+
+    def test_merge_data(self):
+        '''
+        Tests the merging of the RankFilter and CasLookup data
+        '''
+        rfdata = combine_output._process_data(self.rf_output)
+        cldata = combine_output._process_data(self.cl_output)
+        merged, _ = combine_output._merge_data(rfdata, cldata)
+        centrotypes = _get_centrotypes(merged)
+        self.failUnless(all(centrotype in centrotypes for centrotype in ('2716','12723', '3403', '12710')))
+
+    def test_remove_formula(self):
+        '''
+        Tests the removal of the Formula from the 'Name' field (RankFilter output)
+        '''
+        name = "Caffeine C8H10N4O2"
+        compound_name, compound_formula = combine_output._remove_formula(name)
+        self.assertEqual(compound_name, 'Caffeine')
+        self.assertEqual(compound_formula, 'C8H10N4O2')
+        name = "Ethanol C2H6O"
+        compound_name, compound_formula = combine_output._remove_formula(name)
+        self.assertEqual(compound_name, 'Ethanol')
+        self.assertEqual(compound_formula, 'C2H6O')
+        # No formula to remove
+        name = "Butanoic acid, 4-[(trimethylsilyl)oxy]-, trimethylsilyl ester"
+        compound_name, compound_formula = combine_output._remove_formula(name)
+        self.assertEqual(compound_name, name)
+        self.assertEqual(compound_formula, False)
+
+    def test_save_data(self):
+        '''
+        Tests the creation of the output tabular files (no content testing)
+        '''
+        temp_folder = tempfile.mkdtemp(prefix='gcms_combine_output_')
+        saved_single_data = '{0}/{1}'.format(temp_folder, 'output_single.tsv')
+        saved_multi_data = '{0}/{1}'.format(temp_folder, 'output_multi.tsv')
+        rfdata = combine_output._process_data(self.rf_output)
+        cldata = combine_output._process_data(self.cl_output)
+        merged, nhits = combine_output._merge_data(rfdata, cldata)
+        combine_output._save_data(merged, nhits, saved_single_data, saved_multi_data)
+        self.failUnless(os.path.exists(saved_single_data))
+        self.failUnless(os.path.exists(saved_multi_data))
+        shutil.rmtree(temp_folder)
+
+
+def _get_centrotypes(merged):
+    '''
+    returns centrotype codes found in merged set
+    '''
+    result = []
+    for item in merged:
+        centrotype = item[0][0]
+        result.append(centrotype)
+
+    return result
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_export_to_metexp_tabular.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,85 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+from GCMS import export_to_metexp_tabular
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+    def test_combine_output_simple(self):
+        '''
+        Runs export_to_metexp_tabular.main() on a dummy RankFilter/CasLookup
+        combined file and a dummy MsClust quantification file, and checks that
+        the tool runs to completion; the asserts below are kept (quoted out) as
+        documentation of the intended content checks.
+        '''
+        # Create out folder
+        outdir = "output/metexp/"
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+
+        #Build up arguments and run
+        
+        rankfilter_and_caslookup_combined_file = resource_filename(__name__, "data/dummy1_produced_combine_output_single.txt")
+        msclust_quantification_and_spectra_file = resource_filename(__name__, "data/dummy1_sim.txt")
+        output_csv = resource_filename(__name__, outdir + "metexp_tabular.txt")
+    
+        sys.argv = ['test',
+                    rankfilter_and_caslookup_combined_file,
+                    msclust_quantification_and_spectra_file,
+                    output_csv]
+        # Execute main function with arguments provided through sys.argv
+        export_to_metexp_tabular.main()
+
+        '''
+        # Asserts are based on reading in with process_data and comparing values of 
+        # certain columns
+        
+        # Check 3: library_lookup RI column, centrotype column, ri_svr column are correct:
+        caslookup_items = combine_output._process_data(input_caslookup)
+        rankfilter_items = combine_output._process_data(input_rankfilter)
+        
+        # check that the caslookup RI column is correctly maintained in its original order in
+        # the combined file:
+        ri_caslookup = caslookup_items['RI']
+        ri_combine_single = combine_result_single_items['RI']
+        self.assertListEqual(ri_caslookup, ri_combine_single) 
+        
+        # check the centrotype column's integrity:
+        centrotype_caslookup = caslookup_items['Centrotype']
+        centrotype_combine_single = combine_result_single_items['Centrotype']
+        centrotype_rankfilter = _get_centrotype_rankfilter(rankfilter_items['ID'])
+        self.assertListEqual(centrotype_caslookup, centrotype_combine_single)
+        self.assertListEqual(centrotype_caslookup, centrotype_rankfilter)
+                
+        # integration and integrity checks:
+        file_NIST = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt")
+        file_NIST_items = combine_output._process_data(file_NIST)
+        # check that rank filter output has exactly the same ID items as the original NIST input file:
+        self.assertListEqual(file_NIST_items['ID'], rankfilter_items['ID']) 
+        # check the same for the CAS column:
+        self.assertListEqual(_get_strippedcas(file_NIST_items['CAS']), rankfilter_items['CAS'])
+        # now check the NIST CAS column against the cas lookup results:  
+        cas_NIST = _get_processedcas(file_NIST_items['CAS'])
+        self.assertListEqual(cas_NIST, caslookup_items['CAS'])
+        # now check the CAS of the combined result. If all checks are OK, it means the CAS column's order
+        # and values remained stable throughout all steps: 
+        self.assertListEqual(rankfilter_items['CAS'], combine_result_single_items['CAS']) 
+        
+        # check that the rankfilter RIsvr column is correctly maintained in its original order in
+        # the combined file:
+        risvr_rankfilter = rankfilter_items['RIsvr']
+        risvr_combine_single = combine_result_single_items['RIsvr']
+        self.assertListEqual(risvr_rankfilter, risvr_combine_single) 
+        '''
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename:
+    '''
+    with open(filename) as handle:
+        return handle.read()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_library_lookup.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,180 @@
+'''
+Created on Mar 6, 2012
+
+@author: marcelk
+'''
+from GCMS import library_lookup, match_library
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+import os
+import shutil
+import tempfile
+import unittest
+
+
+class Test(unittest.TestCase):
+    '''
+    Tests the 'library_lookup' Galaxy tool
+    '''
+
+    def setUp(self):
+        self.ri_database = resource_filename(__name__, "data/RIDB_subset.txt")
+        self.nist_output = resource_filename(__name__, "data/NIST_tabular.txt")
+        self.ridb_poly_regress = resource_filename(__name__, "data/ridb_poly_regression.txt")
+        self.ridb_linear_regress = resource_filename(__name__, "data/ridb_linear_regression.txt")
+
+    def test_create_lookup_table(self):
+        '''
+        Tests the 'create_lookup_table' function
+        '''
+        column_type = 'Capillary'
+        polarity = 'Semi-standard non-polar'
+        lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)
+        self.assertFalse(False in [res[4] == 'Capillary' for res in lookup_dict['4177166']])
+        self.assertEqual(['C51276336', '2,6-Dimethyl-octa-1,7-dien-3,6-diol', 'C10H18O2',
+                          '1277', 'Capillary', 'Semi-standard non-polar', 'DB-5MS', '1',
+                          'C51276336_DB-5MS', '', '', ''], lookup_dict['51276336'][1])
+
+    def test_read_model(self):
+        '''
+        Tests reading the regression model data containing the parameters required for converting
+        retention indices between GC-columns
+        '''
+        model, _ = library_lookup._read_model(self.ridb_poly_regress)
+        # Order of values: coefficient 1 through 4, left limit, right limit
+        # Polynomial model
+        self.assertEqual([20.6155874639486, 0.945187096379008, 3.96480787567566e-05, -9.04377237159287e-09,
+                          628.0, 2944.0, 405.0, 0, 0.998685262365514], model['HP-5']['SE-54'])
+        self.assertEqual([-92.3963391356951, 1.26116176393346, -0.000191991657547972, 4.15387371263164e-08,
+                          494.0, 2198.0, 407.0, 0, 0.996665023122993], model['Apiezon L']['Squalane'])
+        # Linear model
+        model, _ = library_lookup._read_model(self.ridb_linear_regress)
+        self.assertEqual([2.81208738561543, 0.99482475526584, 628.0, 2944.0, 405.0, 0, 0.998643883946458],
+                         model['HP-5']['SE-54'])
+        self.assertEqual([19.979922768462, 0.993741869298272, 494.0, 2198.0, 407.0, 0, 0.99636062891041],
+                         model['Apiezon L']['Squalane'])
+
+    def test_apply_regression(self):
+        '''
+        Tests the regression model on some arbitrary retention indices
+        '''
+        poly_model, _ = library_lookup._read_model(self.ridb_poly_regress)
+        linear_model, _ = library_lookup._read_model(self.ridb_linear_regress)
+        retention_indices = [1000, 1010, 1020, 1030, 1040, 1050]
+        converted_poly = []
+        converted_linear = []
+        for ri in retention_indices:
+            converted_poly.append(library_lookup._apply_poly_regression('HP-5', 'DB-5', ri, poly_model))
+            converted_linear.append(library_lookup._apply_linear_regression('HP-5', 'DB-5', ri, linear_model))
+
+        self.assertEqual([1003.0566541860778, 1013.0979459524663, 1023.1358645806529, 1033.170466241159,
+                          1043.2018071045052, 1053.2299433412131], converted_poly)
+        self.assertEqual([1001.8127584915925, 1011.830140783027, 1021.8475230744615, 1031.864905365896,
+                          1041.8822876573306, 1051.899669948765], converted_linear)
+        
+        # Test polynomial limit detection, the following RI falls outside of the possible limits
+        ri = 3400
+        converted_poly = library_lookup._apply_poly_regression('HP-5', 'DB-5', ri, poly_model)
+        self.assertEqual(False, converted_poly)
+
+    def test_preferred_hit(self):
+        ''' Tests the matching of the hits with the preferred column, including regression '''
+        model, method = library_lookup._read_model(self.ridb_poly_regress)
+        column_type = 'Capillary'
+        polarity = 'Semi-standard non-polar'
+        lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)
+        hits = lookup_dict['150867']
+        # No regression, should however consider order of given preference
+        match = library_lookup._preferred(hits, ['SE-52', 'DB-5', 'HP-5'], column_type, polarity, model, method)
+        expected = (['C150867', '(E)-phytol', 'C20H40O', '2110', 'Capillary',
+                    'Semi-standard non-polar', 'SE-52', '', 'C150867_SE-52', '', '', ''], False)
+        self.assertEqual(expected, match)
+
+        # Perform regression by looking for 'OV-101' which isn't there. 'SE-52' has the best regression model
+        # of the available columns
+        match = library_lookup._preferred(hits, ['OV-101'], column_type, polarity, model, method)
+        expected = (['C150867', '(E)-phytol', 'C20H40O', 2158.5769891569125, 'Capillary',
+                     'Semi-standard non-polar', 'SE-52', '', 'C150867_SE-52', '', '', ''], 'SE-52')
+        self.assertEqual(expected, match)
+
+    def test_format_result(self):
+        '''
+        Tests the 'format_result' function
+        '''
+        column_type = 'Capillary'
+        polarity = 'Semi-standard non-polar'
+
+        # Look for DB-5
+        pref_column = ['DB-5']
+        model, method = library_lookup._read_model(self.ridb_poly_regress)
+        lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)
+        data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type,
+                                            polarity, model, method)
+
+        # remove non-hits from set:
+        data = _get_hits_only(data)
+        self.assertEqual(['C544354', 'Ethyl linoleate', 'C20H36O2', '2155', 'Capillary', 'Semi-standard non-polar',
+                           'DB-5', '1', 'C544354_DB-5', '1810', 'None', '', '', '0'], data[20])
+        self.assertEqual(111, len(data))
+
+        # Look for both DB-5 and HP-5
+        pref_column = ['DB-5', 'HP-5']
+        data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type,
+                                            polarity, False, None)
+        # remove non-hits from set:
+        data = _get_hits_only(data)
+        self.assertEqual(['C502614', '.beta.-(E)-Farnesene', 'C15H24', '1508', 'Capillary', 'Semi-standard non-polar',
+                          'DB-5', '1', 'C502614_DB-5', '942', 'None', '1482', '1522', '22'], data[50])
+        self.assertEqual(106, len(data))
+
+
+    def test_save_data(self):
+        '''
+        Tests the creation of the output tabular file
+        '''
+        temp_folder = tempfile.mkdtemp(prefix='gcms_combine_output_')
+        saved_data = '{0}/{1}'.format(temp_folder, 'output.tsv')
+        column_type = 'Capillary'
+        polarity = 'Semi-standard non-polar'
+        pref_column = ['DB-5']
+        lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)
+        data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type, polarity, False, None)
+        library_lookup._save_data(data, saved_data)
+        self.failUnless(os.path.exists(saved_data))
+        shutil.rmtree(temp_folder)
+        
+        
+    def test_match_library_get_lib_files(self):
+        '''
+        Tests the match_library.py functionality
+        '''
+        riqc_libs_dir = resource_filename(__name__, "../repositories")
+        get_library_files_output = match_library.get_directory_files(riqc_libs_dir)
+        self.assertEqual(4, len(get_library_files_output))
+        self.assertEqual("Library_RI_DB_capillary_columns-noDuplicates", get_library_files_output[0][0])
+        #TODO change assert below to assert that the result is a file, so the test can run on other dirs as well:
+        #self.assertEqual("E:\\workspace\\PRIMS-metabolomics\\python-tools\\tools\\GCMS\\test\\data\\riqc_libs\\RI DB library (capillary columns) Dec.2012.txt", get_library_files_output[0][1])
+        #self.assertEqual("RI DB library (capillary columns) Jan.2013", get_library_files_output[1][0])  
+        # a non-existing directory should raise an exception:
+        self.assertRaises(Exception, match_library.get_directory_files, "/blah")
+
+def _get_hits_only(data):
+    '''
+    removes items that have RI == 0.0 and Name == '' (these are dummy lines
+    added just for the output)
+    '''
+    result = []
+    for item in data:
+        if item[1] != '' and item[3] > 0.0:
+            result.append(item)
+
+    return result
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_match_library.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,51 @@
+'''
+Created on Mar 6, 2012
+
+@author: marcelk
+'''
+from GCMS import match_library
+import unittest
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+
+
+class Test(unittest.TestCase):
+    '''
+    Tests the 'match_library' Galaxy tool
+    '''
+    nist_db = resource_filename(__name__, "data/RIDB_subset.txt")
+
+    def test_get_column_type(self):
+        '''
+        Tests the 'get_column_type' function that returns tuples of unique elements
+        for column types in the RI database
+        '''
+        galaxy_output = match_library.get_column_type(self.nist_db)
+        self.assertEqual([('Capillary(9999)', 'Capillary', False)], galaxy_output)
+
+    def test_filter_column(self):
+        '''
+        Tests the 'filter_column' function showing the column phase for all 'Capillary' columns
+        '''
+        galaxy_output = match_library.filter_column(self.nist_db, 'Capillary')
+        self.assertEqual([('Semi-standard non-polar(9999)', 'Semi-standard non-polar', False)], galaxy_output)
+
+    def test_filter_column2(self):
+        '''
+        Tests the 'filter_column' function showing all possibilities for columns having both the
+        'Capillary' and 'Semi-standard non-polar' properties
+        '''
+        galaxy_output = match_library.filter_column2(self.nist_db, 'Capillary', 'Semi-standard non-polar')
+        self.failUnless(('Apiezon M(6)', 'Apiezon M', False) in galaxy_output)
+
+    def test_count_occurrence(self):
+        '''
+        Tests the 'count_occurrence' function
+        '''
+        test_list = [2, 0, 0, 2, 1, 3, 4, 5, 3, 2, 3, 4, 5, 5, 4, 2, 5, 3, 4, 3, 5, 4, 2, 0, 4]
+        counts = match_library.count_occurrence(test_list)
+        self.assertEqual({0: 3, 1: 1, 2: 5, 3: 5, 4: 6, 5: 5}, counts)
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()