Galaxy |

Changeset 0:dffc38727496 (2015-02-07)

Next changeset 1:223d1167de58 (2015-03-19)

Commit message:
initial commit

added:
LICENSE
NOTICE
README.rst
Rscripts/filter-RIDB.R
Rscripts/ridb-regression.R
__init__.py
combine_output.py
combine_output.xml
create_model.xml
datatypes_conf.xml
export_to_metexp_tabular.py
export_to_metexp_tabular.xml
library_lookup.py
library_lookup.xml
match_library.py
metaMS_cmd_annotate.r
metaMS_cmd_pick_and_group.r
metams_lcms_annotate.xml
metams_lcms_pick_and_group.xml
primsfilters.py
query_mass_repos.py
query_mass_repos.xml
query_metexp.py
query_metexp.xml
rankfilterGCMS_tabular.xml
rankfilter_GCMS/__init__.py
rankfilter_GCMS/pdfread.py
rankfilter_GCMS/pdftotabular.py
rankfilter_GCMS/rankfilter.py
rankfilter_text2tabular.xml
select_on_rank.py
select_on_rank.xml
static/images/CAMERA_results.png
static/images/confidence_and_slope_params_explain.png
static/images/diffreport.png
static/images/massEIC.png
static/images/metaMS_annotate.png
static/images/metaMS_pick_align_camera.png
static/images/msclust_summary.png
static/images/sample_SIM.png
static/images/sample_sel_and_peak_height_correction.png
static_resources/elements_and_masses.tab
test/__init__.py
test/integration_tests.py
test/test_combine_output.py
test/test_export_to_metexp_tabular.py
test/test_library_lookup.py
test/test_match_library.py
test/test_query_mass_repos.py
test/test_query_metexp.py
test/test_query_metexp_LARGE.py
tool_dependencies.xml
xcms_differential_analysis.r
xcms_differential_analysis.xml
xcms_get_alignment_eic.r
xcms_get_alignment_eic.xml
xcms_get_mass_eic.r
xcms_get_mass_eic.xml

diff -r 000000000000 -r dffc38727496 LICENSE
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE Sat Feb 07 22:02:00 2015 +0100

[

b'@@ -0,0 +1,202 @@\n+\n+ Apache License\n+ Version 2.0, January 2004\n+ http://www.apache.org/licenses/\n+\n+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n+\n+ 1. Definitions.\n+\n+ "License" shall mean the terms and conditions for use, reproduction,\n+ and distribution as defined by Sections 1 through 9 of this document.\n+\n+ "Licensor" shall mean the copyright owner or entity authorized by\n+ the copyright owner that is granting the License.\n+\n+ "Legal Entity" shall mean the union of the acting entity and all\n+ other entities that control, are controlled by, or are under common\n+ control with that entity. For the purposes of this definition,\n+ "control" means (i) the power, direct or indirect, to cause the\n+ direction or management of such entity, whether by contract or\n+ otherwise, or (ii) ownership of fifty percent (50%) or more of the\n+ outstanding shares, or (iii) beneficial ownership of such entity.\n+\n+ "You" (or "Your") shall mean an individual or Legal Entity\n+ exercising permissions granted by this License.\n+\n+ "Source" form shall mean the preferred form for making modifications,\n+ including but not limited to software source code, documentation\n+ source, and configuration files.\n+\n+ "Object" form shall mean any form resulting from mechanical\n+ transformation or translation of a Source form, including but\n+ not limited to compiled object code, generated documentation,\n+ and conversions to other media types.\n+\n+ "Work" shall mean the work of authorship, whether in Source or\n+ Object form, made available under the License, as indicated by a\n+ copyright notice that is included in or attached to the work\n+ (an example is provided in the Appendix below).\n+\n+ "Derivative Works" shall mean any work, whether in Source or Object\n+ form, that is based on (or derived from) the Work and for which the\n+ editorial revisions, annotations, elaborations, or other modifications\n+ represent, as a whole, an original work of authorship. For the purposes\n+ of this License, Derivative Works shall not include works that remain\n+ separable from, or merely link (or bind by name) to the interfaces of,\n+ the Work and Derivative Works thereof.\n+\n+ "Contribution" shall mean any work of authorship, including\n+ the original version of the Work and any modifications or additions\n+ to that Work or Derivative Works thereof, that is intentionally\n+ submitted to Licensor for inclusion in the Work by the copyright owner\n+ or by an individual or Legal Entity authorized to submit on behalf of\n+ the copyright owner. For the purposes of this definition, "submitted"\n+ means any form of electronic, verbal, or written communication sent\n+ to the Licensor or its representatives, including but not limited to\n+ communication on electronic mailing lists, source code control systems,\n+ and issue tracking systems that are managed by, or on behalf of, the\n+ Licensor for the purpose of discussing and improving the Work, but\n+ excluding communication that is conspicuously marked or otherwise\n+ designated in writing by the copyright owner as "Not a Contribution."\n+\n+ "Contributor" shall mean Licensor and any individual or Legal Entity\n+ on behalf of whom a Contribution has been received by Licensor and\n+ subsequently incorporated within the Work.\n+\n+ 2. Grant of Copyright License. Subject to the terms and conditions of\n+ this License, each Contributor hereby grants to You a perpetual,\n+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n+ copyright license to reproduce, prepare Derivative Works of,\n+ publicly display, publicly perform, sublicense, and distribute the\n+ Work and such Derivative Works in Source or Obj'..b'r shall be under the terms and conditions of\n+ this License, without any additional terms or conditions.\n+ Notwithstanding the above, nothing herein shall supersede or modify\n+ the terms of any separate license agreement you may have executed\n+ with Licensor regarding such Contributions.\n+\n+ 6. Trademarks. This License does not grant permission to use the trade\n+ names, trademarks, service marks, or product names of the Licensor,\n+ except as required for reasonable and customary use in describing the\n+ origin of the Work and reproducing the content of the NOTICE file.\n+\n+ 7. Disclaimer of Warranty. Unless required by applicable law or\n+ agreed to in writing, Licensor provides the Work (and each\n+ Contributor provides its Contributions) on an "AS IS" BASIS,\n+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n+ implied, including, without limitation, any warranties or conditions\n+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n+ PARTICULAR PURPOSE. You are solely responsible for determining the\n+ appropriateness of using or redistributing the Work and assume any\n+ risks associated with Your exercise of permissions under this License.\n+\n+ 8. Limitation of Liability. In no event and under no legal theory,\n+ whether in tort (including negligence), contract, or otherwise,\n+ unless required by applicable law (such as deliberate and grossly\n+ negligent acts) or agreed to in writing, shall any Contributor be\n+ liable to You for damages, including any direct, indirect, special,\n+ incidental, or consequential damages of any character arising as a\n+ result of this License or out of the use or inability to use the\n+ Work (including but not limited to damages for loss of goodwill,\n+ work stoppage, computer failure or malfunction, or any and all\n+ other commercial damages or losses), even if such Contributor\n+ has been advised of the possibility of such damages.\n+\n+ 9. Accepting Warranty or Additional Liability. While redistributing\n+ the Work or Derivative Works thereof, You may choose to offer,\n+ and charge a fee for, acceptance of support, warranty, indemnity,\n+ or other liability obligations and/or rights consistent with this\n+ License. However, in accepting such obligations, You may act only\n+ on Your own behalf and on Your sole responsibility, not on behalf\n+ of any other Contributor, and only if You agree to indemnify,\n+ defend, and hold each Contributor harmless for any liability\n+ incurred by, or claims asserted against, such Contributor by reason\n+ of your accepting any such warranty or additional liability.\n+\n+ END OF TERMS AND CONDITIONS\n+\n+ APPENDIX: How to apply the Apache License to your work.\n+\n+ To apply the Apache License to your work, attach the following\n+ boilerplate notice, with the fields enclosed by brackets "[]"\n+ replaced with your own identifying information. (Don\'t include\n+ the brackets!) The text should be enclosed in the appropriate\n+ comment syntax for the file format. We also recommend that a\n+ file or class name and description of purpose be included on the\n+ same "printed page" as the copyright notice for easier\n+ identification within third-party archives.\n+\n+ Copyright [yyyy] [name of copyright owner]\n+\n+ Licensed under the Apache License, Version 2.0 (the "License");\n+ you may not use this file except in compliance with the License.\n+ You may obtain a copy of the License at\n+\n+ http://www.apache.org/licenses/LICENSE-2.0\n+\n+ Unless required by applicable law or agreed to in writing, software\n+ distributed under the License is distributed on an "AS IS" BASIS,\n+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n+ See the License for the specific language governing permissions and\n+ limitations under the License.\n'

diff -r 000000000000 -r dffc38727496 NOTICE
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/NOTICE Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,14 @@
+PRIMS-metabolomics toolset & Galaxy wrappers
+============================================
+
+Metabolomics module of Plant Research International's Mass Spectrometry (PRIMS) toolsuite.
+This toolset consists of custom tools to enable metabolite identifications and
+Retention Index (RI) based Quality Control (RIQC) for Mass Spectrometry metabolomics data.
+
+Copyright:
+* 2012: NIST_UTIL and RIQC tools: Copyright (c) 2012 Maarten Kooyman and Marcel Kempenaar, NBIC BRS
+* 2013: all tools: Copyright (c) 2013 by Pieter Lukasse, Plant Research International (PRI),
+ Wageningen, The Netherlands. All rights reserved. See the license text below.
+
+Galaxy wrappers and installation are available from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/pieterlukasse/prims_metabolomics
\ No newline at end of file

diff -r 000000000000 -r dffc38727496 README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,103 @@
+PRIMS-metabolomics toolset & Galaxy wrappers
+============================================
+
+Metabolomics module of Plant Research International's Mass Spectrometry (PRIMS) toolsuite.
+This toolset consists of custom tools to enable metabolite identifications and
+Retention Index (RI) based Quality Control (RIQC) for Mass Spectrometry metabolomics data.
+
+Copyright:
+* 2012: NIST_UTIL and RIQC tools: Copyright (c) 2012 Maarten Kooyman and Marcel Kempenaar, NBIC BRS
+* 2013: all tools: Copyright (c) 2013 by Pieter Lukasse, Plant Research International (PRI),
+Wageningen, The Netherlands. All rights reserved. See the license text below.
+
+Galaxy wrappers and installation are available from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/pieterlukasse/prims_metabolomics
+
+History
+=======
+
+============== ======================================================================
+Date            Changes
+-------------- ----------------------------------------------------------------------
+December 2014  * Added MsClust support for parsing XCMS alignment results.
+               * Improved output reports for XCMS wrappers.
+November 2014  * Added XCMS related tool wrappers (for metaMS and diffreport)
+September 2014 * Added new membership cutoff option for final clusters in MsClust
+               * Improved MsClust memory usage for large datasets
+               * Simplified MsClust HTML report
+               * Added option for microminutes based clustering instead of scannr
+                 based in MsClust
+April 2014     * Added interface to ExactMassDB, Pep1000, KEGG, KNApSAcK, Flavonoid
+                 Viewer, LipidMAPS, HMDB, PubChem, by using the service MFSearcher.
+                 This enables users to query multiple public repositories for
+                 elemental compositions from accurate mass values detected by
+                 high-resolution mass spectrometers. NB: see also added
+                 licensing info below.
+March 2014     * Added interface to METEXP data store, including tool to fire
+                 queries in batch mode
+               * Improved quantification output files of MsClust, a.o. sorting
+                 mass list based on intensity (last two columns of quantification
+                 files)
+January 2014   * first release via Tool Shed, combining the RIQC and MsClust in a
+                 single package (this package)
+               * integration with METEXP software (data store for metabolomics
+                 experiments with respective metadata and identifications)
+2013           * hand-over of the NIST_UTIL and RIQC tools from the NBIC team to
+                 Plant Research International
+2012           * development of MsClust 2.0, making it also suitable for Galaxy
+<2011          * development and publication of MsClust 1.0
+============== ======================================================================
+
+Tool Versioning
+===============
+
+PRIMS tools will have versions of the form X.Y.Z. Versions
+differing only after the second decimal should be completely
+compatible with each other. Breaking changes should result in an
+increment of the number before and/or after the first decimal. All
+tools of version less than 1.0.0 should be considered beta.
+
+
+Bug Reports & other questions
+=============================
+
+For the time being issues can be reported via the contact form at:
+http://www.wageningenur.nl/en/Persons/PNJ-Pieter-Lukasse.htm
+
+Developers, Contributions & Collaborations
+==========================================
+
+If you wish to join forces and collaborate on some of the
+tools do not hesitate to contact Pieter Lukasse via the contact form above.
+
+
+License (Apache, Version 2.0)
+=============================
+
+Copyright 2013 Pieter Lukasse, Plant Research International (PRI).
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this software except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+License for third party services
+================================
+MFSearcher service : http://webs2.kazusa.or.jp/mfsearcher/#090
+In the MFSearcher system, the compound data provided by KEGG, Flavonoid Viewer, LIPID MAPS, HMDB and PubChem
+were downloaded for academic purposes. The compound data of KNApSAcK is provided by Prof. Kanaya in
+Nara Institute of Science and Technology (NAIST). The part of these data are utilized to construct the
+specified databases for rapid mass searching in the MFSearcher system after re-calculating the molecular weights.
+Please preserve the contracts of each original databases when utilizing the search results against these
+databases by MFSearcher.
+
+The searching system of MFSearcher, the ExactMassDB database, and the Pep1000 database by Kazusa DNA
+Research Institute is licensed under a Creative Commons Attribution-NonCommercial 3.0 Unported License.
\ No newline at end of file

diff -r 000000000000 -r dffc38727496 Rscripts/filter-RIDB.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Rscripts/filter-RIDB.R Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,56 @@
+##
+#
+# Removes duplicates from a RI-database
+#
+# Usage:
+#       Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt
+#
+##
+
+# Commandline arguments
+args  <- commandArgs(TRUE)
+ridb <- args[1]
+out_file <- args[2]
+
+# Function to check duplicates
+duplicates <- function(dat) {
+     s <- do.call("order", as.data.frame(dat))
+     non.dup <- !duplicated(dat[s, ])
+     orig.ind <- s[non.dup]
+     first.occ <- orig.ind[cumsum(non.dup)]
+     first.occ[non.dup] <- NA
+     first.occ[order(s)]
+}
+
+# Load CSV file
+ridb <- read.csv(ridb,header=TRUE, sep="\t")
+## Filters on: CAS FORMULA Column type Column phase type Column name
+filter_cols <- c(1, 3, 5, 6, 7)
+cat("RIDB dimensions: ")
+print(dim(ridb))
+deleted <- NULL
+cat("Checking for duplicates...")
+dups <- duplicates(ridb[,filter_cols])
+cat("\t[DONE]\nRemoving duplicates...")
+newridb <- ridb
+newridb["min"] <- NA
+newridb["max"] <- NA
+newridb["orig.columns"] <- NA
+for (i in unique(dups)) {
+    if (!is.na(i)) {
+        rows <- which(dups == i)
+        duprows <- ridb[c(i, rows),]
+        # Replace duplicate rows with one row containing the median value
+        new_RI <- median(duprows$RI)
+        newridb$RI[i] <- median(duprows$RI)
+        newridb$min[i] <- min(duprows$RI)
+        newridb$max[i] <- max(duprows$RI)
+        newridb$orig.columns[i] <- paste(rows, collapse=",")
+        deleted <- c(deleted, rows)
+    }
+}
+cat("\t\t[DONE]\nCreating new dataset...")
+out_ridb <- newridb[-deleted,]
+cat("\t\t[DONE]\nWriting new dataset...")
+write.table(out_ridb, na='', file=out_file, quote=T, sep="\t", row.names=F)
+cat("\t\t[DONE]\n")

diff -r 000000000000 -r dffc38727496 Rscripts/ridb-regression.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Rscripts/ridb-regression.R Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,208 @@
+##
+#
+# Performs regression analysis using either 3rd degree polynomial- or linear-method
+#
+##
+
+# Commandline arguments
+args  <- commandArgs(TRUE)
+if (length(args) < 7)
+ stop(cat("Missing arguments, usage:\n\tRscript ridb-regression.R RI-database ",
+             "ouput_file logfile min_residuals range_mod pvalue rsquared method ",
+ "plot(yes/no) plot_archive"))
+
+ridb <- args[1]
+out_file <- args[2]
+logfile <- args[3]
+min_residuals <- as.integer(args[4])
+range_mod <- as.integer(args[5])
+pvalue <- as.double(args[6])
+rsquared <- as.double(args[7])
+method <- args[8]
+plot <- tolower(args[9])
+if (plot == 'true')
+ plot_archive = args[10]
+
+# Do not show warnings etc.
+sink(file='/dev/null')
+
+progress <- c()
+logger <- function(logdata) {
+ ## Logs progress, adds a timestamp for each event
+ #cat(paste(Sys.time(), "\t", logdata, "\n", sep="")) ## DEBUG
+ progress <<- c(progress, paste(Sys.time(), "\t", logdata, sep=""))
+}
+
+logger("Reading Retention Index Database..")
+
+# Read Retention Index Database
+ridb <- read.csv(ridb, header=TRUE, sep="\t")
+logger(paste("\t", nrow(ridb), "records read.."))
+# Get a unique list
+gc_columns <- unique(as.vector(as.matrix(ridb['Column.name'])[,1]))
+cas_numbers <- unique(as.vector(as.matrix(ridb['CAS'])[,1]))
+
+add_poly_fit <- function(fit, gc1_index, gc2_index, range) {
+ pval = anova.lm(fit)$Pr
+ r.squared = summary(fit)$r.squared
+
+ data = rep(NA, 11)
+ # Append results to matrix
+ data[1] = gc_columns[gc1_index] # Column 1
+ data[2] = gc_columns[gc2_index] # Column 2
+ data[3] = coefficients(fit)[1]  # The 4 coefficients
+ data[4] = coefficients(fit)[2]
+ data[5] = coefficients(fit)[3]
+ data[6] = coefficients(fit)[4]
+ data[7] = range[1]              # Left limit
+ data[8] = range[2]              # Right limit
+ data[9] = length(fit$residuals) # Number of datapoints analysed
+ data[10] = pval[1]              # p-value for resulting fitting
+ data[11] = r.squared            # R-squared
+ return(data)
+}
+
+
+add_linear_fit <- function(fit, gc1_index, gc2_index, range) {
+ pval = anova.lm(fit)$Pr
+ r.squared = summary(fit)$r.squared
+
+ data = rep(NA, 7)
+ # Append results to matrix
+ data[1] = gc_columns[gc1_index] # Column 1
+ data[2] = gc_columns[gc2_index] # Column 2
+ data[3] = coefficients(fit)[1]  # The 4 coefficients
+ data[4] = coefficients(fit)[2]
+ data[7] = length(fit$residuals) # Number of datapoints analysed
+ data[8] = pval[1]               # p-value for resulting fitting
+ data[9] = r.squared             # R-squared
+ return(data)
+}
+
+
+add_fit <- function(fit, gc1_index, gc2_index, range, method) {
+ if (method == 'poly')
+ return(add_poly_fit(fit, gc1_index, gc2_index, range))
+ else
+ return(add_linear_fit(fit, gc1_index, gc2_index, range))
+}
+
+
+plot_fit <- function(ri1, ri2, gc1_index, gc2_index, coeff, range, method) {
+    if (method == 'poly')
+        pol <- function(x) coeff[4]*x^3 + coeff[3]*x^2 + coeff[2]*x + coeff[1]
+    else
+        pol <- function(x) coeff[2]*x + coeff[1]
+    pdf(paste('regression_model_',
+              make.names(gc_columns[gc1_index]), '_vs_',
+              make.names(gc_columns[gc2_index]), '.pdf', sep=''))
+    curve(pol, 250:3750, col="red", lwd=2.5, main='Regression Model', xlab=gc_columns[gc1_index],
+          ylab=gc_columns[gc2_index], xlim=c(250, 3750), ylim=c(250, 3750))
+    points(ri1, ri2, lwd=0.4)
+    # Add vertical lines showing left- and right limits when using poly method
+    if (method == 'poly')
+        abline(v=range, col="grey", lwd=1.5)
+    dev.off()
+}
+
+# Initialize output dataframe
+if (method == 'poly') {
+    m <- data.frame(matrix(ncol = 11, nrow = 10))
+} else {
+    m <- data.frame(matrix(ncol = 9, nrow = 10))
+}
+
+
+get_fit <- function(gc1, gc2, method) {
+ if (method == 'poly')
+ return(lm(gc1 ~ poly(gc2, 3, raw=TRUE)))
+ else
+ return(lm(gc1 ~ gc2))
+}
+
+# Permutate
+k <- 1
+logger(paste("Permutating (with ", length(gc_columns), " GC-columns)..", sep=""))
+
+for (i in 1:(length(gc_columns)-1)) {
+ logger(paste("\tCalculating model for ", gc_columns[i], "..", sep=""))
+ breaks <- 0
+ for (j in (i+1):length(gc_columns)) {
+ col1 = ridb[which(ridb['Column.name'][,1] == gc_columns[i]),]
+ col2 = ridb[which(ridb['Column.name'][,1] == gc_columns[j]),]
+
+ # Find CAS numbers for which both columns have data (intersect)
+ cas_intersect = intersect(col1[['CAS']], col2[['CAS']])
+
+ # Skip if number of shared CAS entries is < cutoff
+ if (length(cas_intersect) < min_residuals) {
+ breaks = breaks + 1
+ next
+ }
+ # Gather Retention Indices
+ col1_data = col1[['RI']][match(cas_intersect, col1[['CAS']])]
+ col2_data = col2[['RI']][match(cas_intersect, col2[['CAS']])]
+
+ # Calculate the range within which regression is possible (and move if 'range_mod' != 0)
+ range = c(min(c(min(col1_data), min(col2_data))), max(c(max(col1_data), max(col2_data))))
+ if (range_mod != 0) {
+ # Calculate percentage and add/subtract from range
+ perc = diff(range) / 100
+ perc_cutoff = range_mod * perc
+ range = as.integer(range + c(perc_cutoff, -perc_cutoff))
+ }
+
+ # Calculate model for column1 vs column2 and plot if requested
+ fit = get_fit(col1_data, col2_data, method)
+ m[k,] = add_fit(fit, i, j, range, method)
+
+ if (plot == 'true')
+ plot_fit(col1_data, col2_data, i, j, coefficients(fit), range, method)
+
+ # Calculate model for column2 vs column1 and plot if requested
+ fit = get_fit(col2_data, col1_data, method)
+ m[k + 1,] = add_fit(fit, j, i, range, method)
+
+ if (plot == 'true')
+ plot_fit(col2_data, col1_data, j, i, coefficients(fit), range, method)
+
+ k = k + 2
+ }
+ logger(paste("\t\t", breaks, " comparisons have been skipped due to nr. of datapoints < cutoff", sep=""))
+}
+
+# Filter on pvalue and R-squared
+logger("Filtering on pvalue and R-squared..")
+if (method == 'poly') {
+    pval_index <- which(m[,10] < pvalue)
+    rsquared_index <- which(m[,11] > rsquared)
+} else {
+    pval_index <- which(m[,8] < pvalue)
+    rsquared_index <- which(m[,9] > rsquared)
+}
+logger(paste(nrow(m) - length(pval_index), " models discarded due to pvalue > ", pvalue, sep=""))
+
+logger(paste(nrow(m) - length(rsquared_index), " models discarded due to R-squared < ", rsquared, sep=""))
+
+# Remaining rows
+index = unique(c(pval_index, rsquared_index))
+
+# Reduce dataset
+m = m[index,]
+sink()
+
+# Place plots in the history as a ZIP file
+if (plot == 'true') {
+    logger("Creating archive with model graphics..")
+    system(paste("zip -9 -r models.zip *.pdf > /dev/null", sep=""))
+    system(paste("cp models.zip ", plot_archive, sep=""))
+}
+
+# Save dataframe as tab separated file
+logger("All done, saving data..")
+header = c("Column1", "Column2", "Coefficient1", "Coefficient2", "Coefficient3", "Coefficient4",
+           "LeftLimit", "RightLimit", "Residuals", "pvalue", "Rsquared")
+if (method != 'poly')
+ header = header[c(1:4, 7:11)]
+write(progress, logfile)
+write.table(m, file=out_file, sep="\t", quote=FALSE, col.names=header, row.names=FALSE)

diff -r 000000000000 -r dffc38727496 __init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/__init__.py Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,6 @@
+'''
+Module containing Galaxy tools for the LC or GC/MS pipeline
+Created on Mar , 2014
+
+@author: pieter lukasse
+'''
\ No newline at end of file

diff -r 000000000000 -r dffc38727496 combine_output.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_output.py Sat Feb 07 22:02:00 2015 +0100

[

b'@@ -0,0 +1,253 @@\n+#!/usr/bin/env python\n+# encoding: utf-8\n+\'\'\'\n+Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)\n+\'\'\'\n+\n+import csv\n+import re\n+import sys\n+import math\n+import pprint\n+\n+__author__ = "Marcel Kempenaar"\n+__contact__ = "brs@nbic.nl"\n+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"\n+__license__ = "MIT"\n+\n+def _process_data(in_csv):\n+ \'\'\'\n+ Generic method to parse a tab-separated file returning a dictionary with named columns\n+ @param in_csv: input filename to be parsed\n+ \'\'\'\n+ data = list(csv.reader(open(in_csv, \'rU\'), delimiter=\'\\t\'))\n+ header = data.pop(0)\n+ # Create dictionary with column name as key\n+ output = {}\n+ for index in xrange(len(header)):\n+ output[header[index]] = [row[index] for row in data]\n+ return output\n+\n+\n+def _merge_data(rankfilter, caslookup):\n+ \'\'\'\n+ Merges data from both input dictionaries based on the Centrotype field. This method will\n+ build up a new list containing the merged hits as the items. \n+ @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)\n+ @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)\n+ \'\'\'\n+ # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts:\n+ if (len(rankfilter[\'ID\']) != len(caslookup[\'Centrotype\'])):\n+ raise Exception(\'rankfilter and caslookup files should have the same nr of rows/records \')\n+ \n+ merged = []\n+ processed = {}\n+ for compound_id_idx in xrange(len(rankfilter[\'ID\'])):\n+ compound_id = rankfilter[\'ID\'][compound_id_idx]\n+ if not compound_id in processed :\n+ # keep track of processed items to not repeat them\n+ processed[compound_id] = compound_id\n+ # get centrotype nr\n+ centrotype = compound_id.split(\'-\')[0]\n+ # Get the indices for current compound ID in both data-structures for proper matching\n+ rindex = [index for index, value in enumerate(rankfilter[\'ID\']) if value == compound_id]\n+ cindex = [index for index, value in enumerate(caslookup[\'Centrotype\']) if value == centrotype]\n+ \n+ merged_hits = []\n+ # Combine hits\n+ for hit in xrange(len(rindex)):\n+ # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do \n+ # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its\n+ # corresponding value in the rankfilter or caslookup tables; i.e. \n+ # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute\n+ # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)\n+ rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))\n+ cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))\n+ \n+ merged_hit = _add_hit(rf_record, cl_record)\n+ merged_hits.append(merged_hit)\n+ \n+ merged.append(merged_hits)\n+\n+ return merged, len(rindex)\n+\n+\n+def _add_hit(rankfilter, caslookup):\n+ \'\'\'\n+ Combines single records from both the RankFilter- and CasLookup-tools\n+ @param rankfilter: record (dictionary) of one compound in the RankFilter output\n+ @param caslookup: matching record (dictionary) of one compound in the CasLookup output\n+ \'\'\'\n+ # The ID in the RankFilter output contains the following 5 fields:\n+ rf_id = rankfilter[\'ID\'].split(\'-\')\n+ try:\n+ name, formula = _remove_formula(rankfilter[\'Name\'])\n+ hit = [rf_id[0], # Centrotype\n+ rf_id[1], # cent.Factor\n+ rf_id[2], # '..b'ULA\': \'N/A\',\n+ \'RI\': \'0.0\',\n+ \'Regression.Column.Name\': \'None\',\n+ \'min\': \'0.0\',\n+ \'max\': \'0.0\',\n+ \'nr.duplicates\': \'0\',\n+ \'Column.phase.type\': \'N/A\',\n+ \'Column.name\': \'N/A\'}\n+\n+\n+def _save_data(data, nhits, out_csv_single, out_csv_multi):\n+ \'\'\'\n+ Writes tab-separated data to file\n+ @param data: dictionary containing merged dataset\n+ @param out_csv: output csv file\n+ \'\'\'\n+ # Columns we don\'t repeat:\n+ header_part1 = [\'Centrotype\',\n+ \'cent.Factor\',\n+ \'scan nr.\',\n+ \'R.T. (umin)\',\n+ \'nr. Peaks\',\n+ \'R.T.\']\n+ # These are the headers/columns we repeat in case of \n+ # combining hits in one line (see alternative_headers method below):\n+ header_part2 = [\n+ \'Name\',\n+ \'FORMULA\',\n+ \'Library\',\n+ \'CAS\',\n+ \'Forward\',\n+ \'Reverse\',\n+ \'Avg. (Forward, Reverse)\',\n+ \'RIexp\',\n+ \'RI\',\n+ \'RIsvr\',\n+ \'RIexp - RIsvr\',\n+ \'RI - RIexp\',\n+ \'Regression.Column.Name\',\n+ \'min\',\n+ \'max\',\n+ \'nr.duplicates\',\n+ \'Column.phase.type\',\n+ \'Column.name\',\n+ \'Rank\',\n+ \'%rel.err\',\n+ \'Synonyms\']\n+\n+ # Open output file for writing\n+ outfile_single_handle = open(out_csv_single, \'wb\')\n+ outfile_multi_handle = open(out_csv_multi, \'wb\')\n+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\\t")\n+ output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\\t")\n+\n+ # Write headers\n+ output_single_handle.writerow(header_part1 + header_part2)\n+ output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))\n+ # Combine all hits for each centrotype into one line\n+ line = []\n+ for centrotype_idx in xrange(len(data)):\n+ i = 0\n+ for hit in data[centrotype_idx]:\n+ if i==0:\n+ line.extend(hit)\n+ else:\n+ line.extend(hit[6:])\n+ i = i+1\n+ # small validation (if error, it is a programming error):\n+ if i > nhits:\n+ raise Exception(\'Error: more hits that expected for centrotype_idx \' + centrotype_idx)\n+ output_multi_handle.writerow(line)\n+ line = []\n+\n+ # Write one line for each centrotype\n+ for centrotype_idx in xrange(len(data)):\n+ for hit in data[centrotype_idx]:\n+ output_single_handle.writerow(hit)\n+\n+def alternative_headers(header_part2, nr_alternative_hits):\n+ \'\'\' \n+ This method will iterate over the header names and add the string \'ALT#_\' before each, \n+ where # is the number of the alternative, according to number of alternative hits we want to add\n+ to final csv/tsv\n+ \'\'\'\n+ result = []\n+ for i in xrange(nr_alternative_hits): \n+ for header_name in header_part2:\n+ result.append("ALT" + str(i+1) + "_" + header_name) \n+ return result\n+\n+def main():\n+ \'\'\'\n+ Combine Output main function\n+ It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers" \n+ NB: the caslookup_result_file will typically have fewer lines than\n+ rankfilter_result_file, so the merge has to consider this as well. The final file\n+ should have the same nr of lines as rankfilter_result_file.\n+ \'\'\'\n+ rankfilter_result_file = sys.argv[1]\n+ caslookup_result_file = sys.argv[2]\n+ output_single_csv = sys.argv[3]\n+ output_multi_csv = sys.argv[4]\n+\n+ # Read RankFilter and CasLookup output files\n+ rankfilter = _process_data(rankfilter_result_file)\n+ caslookup = _process_data(caslookup_result_file)\n+ merged, nhits = _merge_data(rankfilter, caslookup)\n+ _save_data(merged, nhits, output_single_csv, output_multi_csv)\n+\n+\n+if __name__ == \'__main__\':\n+ main()\n'

diff -r 000000000000 -r dffc38727496 combine_output.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_output.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,36 @@
+<tool id="combine_output" name="RIQC-Combine RankFilter and CasLookup output" version="1.0.2">
+  <description>Perform a combination of output data from the RankFilter and CasLookup tools</description>
+  <command interpreter="python">
+    combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi
+  </command>
+  <inputs>
+    <param format="tabular" name="rankfilter_in" type="data" label="RIQC-RankFilter output (Estimated RI)"
+     help="Select the output file from the RankFilter tool"/>
+    <param format="tabular" name="caslookup_in" type="data" label="RIQC-Lookup RI for CAS output ('Known' RI)"
+     help="Select the output file from the CasLookup tool"/>
+    
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} (Single) on ${on_string}" name="out_single" />
+    <data format="tabular" label="${tool.name} (Multi) on ${on_string}" name="out_multi" />
+  </outputs>
+  <help>
+Performs a combination of output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files.
+
+The files produced are contain either all hits for a compound on a single line (Single) or on separate lines
+(Multi).
+
+.. class:: infomark
+
+**Notes**
+
+The input data should be produced by the RankFilter and 'Lookup RI for CAS' tools provided on this Galaxy server with the
+original headers kept intact. Processing steps include:
+
+   - Added columns showing the average Forward/Reverse values, RIexp - RIsvr and RI - RIexp values
+   - The ID column of the RankFilter tool output is split into 'Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)'
+     and 'nr. Peaks' fields.
+   - The formula is split off the 'Name' field in the RankFilter output
+
+  </help>
+</tool>

diff -r 000000000000 -r dffc38727496 create_model.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/create_model.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,78 @@
+<tool id="create_poly_model" name="RIQC-Create Regression Model" version="1.0.2">
+  <description>Generate coefficients to enable the regression from one GC-column
+             to another GC-column</description>
+  <command interpreter="Rscript">Rscripts/ridb-regression.R
+               $ridb
+               $out_model
+               $out_log
+               $min_residuals
+               $range_mod
+               $pvalue
+               $rsquared
+               $method
+               $plot
+               #if $plot
+                   $model_graphics
+               #end if
+  </command>
+  <inputs>
+    <param format="tabular" name="ridb" type="select" label="Retention Index (RI) and GC columns Library file"
+           help="Select the RI library file of which all GC columns and their RI values
+                 will be used to create a model"
+       dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/>
+
+    <param name="method" type="select" label="Select regression method"
+           help="Method to use for calculating the model" >
+           <option value="poly" selected="True">Polynomial (3rd degree)</option>
+           <option value="linear">Linear</option>
+    </param>
+    <param name="min_residuals" type="integer" value="10" optional="False"
+           label="Minimum number of residuals" help="The minimum number of residuals
+                 (datapoints) that both columns should have in common when calculating
+                 the model" />
+    <param name="range_mod" type="integer" value="0" optional="False"
+           label="Range modifier" help="Moves the range of the usable RI space by the
+                  given percentage. Set to 0 to use the full range of available data." />
+    <param name="pvalue" type="float" value="0.05" optional="False" min="0" max="1"
+           label="Pvalue to filter on" help="Set the upper limit for the pvalue (calculated)
+                  by performing an ANOVA analysis on the created model). All models with higher
+                  pvalues are discarded." />
+    <param name="rsquared" type="float" value="0.95" optional="False" min="0" max="1"
+           label="R-squared to filter on" help="Set the lower limit for the R-squared,
+                  all models with lower values are discarded." />
+    <param name="plot" type="boolean" label="Create a separate plot for each model"
+           help="This will create a ZIP file in the history containing PDF plots" />
+  </inputs>
+  <code file="match_library.py" />
+  <outputs>
+   <data format="zip" label="Model Graphics of ${on_string}" name="model_graphics" >
+       <filter>(plot)</filter>
+   </data>
+    <data format="tabular" label="Regression logfile of ${on_string}"  name="out_log" />
+    <data format="tabular" label="Regression model of ${on_string}"  name="out_model" />
+  </outputs>
+  <help>
+Calculates regression models for a permutation of all GC columns contained in the selected
+RI database file. The method used for creating the model is either based on a 3rd degree
+polynomial or a standard linear model.
+
+The *Minimum number of residuals* option will only allow regression if the columns it is based
+on has at least that number of datapoints on the same compound.
+
+Filtering is possible by setting an upper limit for the *p-value* and / or a lower limit for
+the *R squared* value. The produced logfile will state how many models have been discarded due
+to this filtering. The output model file also includes the p-value and R squared value for
+each created model.
+
+Graphical output of the models is available by selecting the plot option which shows the
+data points used for the model as well as the fit itself and the range of data that will
+be usable.
+
+.. class:: infomark
+
+**Notes**
+
+The output file produced by this tool is required as input for the CasLookup tool when
+selecting to apply regression when finding hits in the RIDB.
+  </help>
+</tool>

diff -r 000000000000 -r dffc38727496 datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<datatypes>
+  <datatype_files>
+  </datatype_files>
+  <registration display_path="display_applications">
+        <datatype extension="msclust.csv" type="galaxy.datatypes.tabular:Tabular" mimetype="text/csv" display_in_upload="true" subclass="true">
+        </datatype>
+  </registration>
+  <registration display_path="display_applications">
+        <datatype extension="rdata" type="galaxy.datatypes.data:Data" mimetype="application/zip" >
+        </datatype>
+  </registration>
+</datatypes>
\ No newline at end of file

diff -r 000000000000 -r dffc38727496 export_to_metexp_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.py Sat Feb 07 22:02:00 2015 +0100

[

b'@@ -0,0 +1,247 @@\n+#!/usr/bin/env python\n+# encoding: utf-8\n+\'\'\'\n+Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust\n+into a tabular file that can be uploaded to the MetExp database.\n+\n+RankFilter, CasLookup are already combined by combine_output.py so here we will use\n+this result. Furthermore here one of the MsClust\n+quantification files containing the respective spectra details are to be combined as well. \n+\n+Extra calculations performed:\n+- The column MW is also added here and is derived from the column FORMULA found \n+ in RankFilter, CasLookup combined result. \n+ \n+So in total here we merge 2 files and calculate one new column. \n+\'\'\'\n+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611\n+import csv\n+import re\n+import sys\n+from collections import OrderedDict\n+\n+__author__ = "Pieter Lukasse"\n+__contact__ = "pieter.lukasse@wur.nl"\n+__copyright__ = "Copyright, 2013, Plant Research International, WUR"\n+__license__ = "Apache v2"\n+\n+def _process_data(in_csv, delim=\'\\t\'):\n+ \'\'\'\n+ Generic method to parse a tab-separated file returning a dictionary with named columns\n+ @param in_csv: input filename to be parsed\n+ \'\'\'\n+ data = list(csv.reader(open(in_csv, \'rU\'), delimiter=delim))\n+ header = data.pop(0)\n+ # Create dictionary with column name as key\n+ output = OrderedDict()\n+ for index in xrange(len(header)):\n+ output[header[index]] = [row[index] for row in data]\n+ return output\n+\n+ONE_TO_ONE = \'one_to_one\'\n+N_TO_ONE = \'n_to_one\'\n+\n+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):\n+ \'\'\'\n+ Merges data from both input dictionaries based on the link fields. This method will\n+ build up a new list containing the merged hits as the items. \n+ @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)\n+ @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)\n+ \'\'\'\n+ # TODO test for correct input files -> same link_field values should be there \n+ # (test at least number of unique link_field values):\n+ #\n+ # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):\n+ # raise Exception(\'input files should have the same nr of key values \')\n+ \n+ \n+ merged = []\n+ processed = {}\n+ for link_field_set1_idx in xrange(len(set1[link_field_set1])):\n+ link_field_set1_value = set1[link_field_set1][link_field_set1_idx]\n+ if not link_field_set1_value in processed :\n+ # keep track of processed items to not repeat them\n+ processed[link_field_set1_value] = link_field_set1_value\n+\n+ # Get the indices for current link_field_set1_value in both data-structures for proper matching\n+ set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]\n+ set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]\n+ # Validation :\n+ if len(set2index) == 0:\n+ # means that corresponding data could not be found in set2, then throw error\n+ raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + \n+ link_field_set1_value + " only found in first dataset. ")\n+ \n+ merged_hits = []\n+ # Combine hits\n+ for hit in xrange(len(set1index)):\n+ # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do \n+ # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its\n+ # corresponding value in the sets; i.e. \n+ # set1[key] => returns the list/array with size = nrrows, with the values for the attribute\n+ '..b'of reference standard\n+ record.append(\'0\')\n+ record.append(\'\') \n+ \n+ return record\n+\n+\n+def get_molecular_mass(formula):\n+ \'\'\'\n+ Calculates the molecular mass (MM). \n+ E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O\n+ \'\'\'\n+ \n+ # Each element is represented by a capital letter, followed optionally by \n+ # lower case, with one or more digits as for how many elements:\n+ element_pattern = re.compile("([A-Z][a-z]?)(\\d*)")\n+\n+ total_mass = 0\n+ for (element_name, count) in element_pattern.findall(formula):\n+ if count == "":\n+ count = 1\n+ else:\n+ count = int(count)\n+ element_mass = float(elements_and_masses_map[element_name]) # "found: Python\'s built-in float type has double precision " (? check if really correct ?)\n+ total_mass += element_mass * count\n+ \n+ return total_mass\n+ \n+ \n+\n+def _save_data(data, headers, out_csv):\n+ \'\'\'\n+ Writes tab-separated data to file\n+ @param data: dictionary containing merged dataset\n+ @param out_csv: output csv file\n+ \'\'\'\n+\n+ # Open output file for writing\n+ outfile_single_handle = open(out_csv, \'wb\')\n+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\\t")\n+\n+ # Write headers\n+ output_single_handle.writerow(headers)\n+\n+ # Write \n+ for item_idx in xrange(len(data)):\n+ for hit in data[item_idx]:\n+ output_single_handle.writerow(hit)\n+\n+\n+def _get_map_for_elements_and_masses(elements_and_masses):\n+ \'\'\'\n+ This method will read out the column \'Chemical symbol\' and make a map \n+ of this, storing the column \'Relative atomic mass\' as its value\n+ \'\'\'\n+ resultMap = {}\n+ index = 0\n+ for entry in elements_and_masses[\'Chemical symbol\']:\n+ resultMap[entry] = elements_and_masses[\'Relative atomic mass\'][index]\n+ index += 1\n+ \n+ return resultMap\n+\n+\n+def init_elements_and_masses_map():\n+ \'\'\'\n+ Initializes the lookup map containing the elements and their respective masses\n+ \'\'\'\n+ elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))\n+ global elements_and_masses_map\n+ elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)\n+ \n+\n+def main():\n+ \'\'\'\n+ Combine Output main function\n+ \n+ RankFilter, CasLookup are already combined by combine_output.py so here we will use\n+ this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust\n+ quantification files are to be combined with combine_output.py result as well. \n+ \'\'\'\n+ rankfilter_and_caslookup_combined_file = sys.argv[1]\n+ msclust_quantification_and_spectra_file = sys.argv[2]\n+ output_csv = sys.argv[3]\n+ # metadata\n+ metadata = OrderedDict()\n+ metadata[\'organism\'] = sys.argv[4]\n+ metadata[\'tissue\'] = sys.argv[5]\n+ metadata[\'experiment_name\'] = sys.argv[6]\n+ metadata[\'user_name\'] = sys.argv[7]\n+ metadata[\'column_type\'] = sys.argv[8]\n+\n+ # Read RankFilter and CasLookup output files\n+ rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)\n+ msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, \',\')\n+ \n+ # Read elements and masses to use for the MW/MM calculation :\n+ init_elements_and_masses_map()\n+ \n+ merged, nhits = _merge_data(rankfilter_and_caslookup_combined, \'Centrotype\', \n+ msclust_quantification_and_spectra, \'centrotype\', \n+ _compare_records, _merge_records, metadata,\n+ N_TO_ONE)\n+ headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + [\'MM\',\'MW\', \'Level of identification\', \'Location of reference standard\']\n+ _save_data(merged, headers, output_csv)\n+\n+\n+if __name__ == \'__main__\':\n+ main()\n'

diff -r 000000000000 -r dffc38727496 export_to_metexp_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,75 @@
+<tool id="export_to_metexp_tabular"
+    name="METEXP - Tabular file"
+    version="0.2.0">
+  <description>Create tabular file for loading into METabolomics EXPlorer database</description>
+  <command interpreter="python">
+    export_to_metexp_tabular.py
+    $rankfilter_and_caslookup_combi
+    $msclust_quant_file
+    $output_result
+    "$organism"
+    "$tissue"
+    "$experiment_name"
+    "$user_name"
+    "$column_type"
+  </command>
+  <inputs>
+    <param format="tabular" name="rankfilter_and_caslookup_combi" type="data" label="RIQC-Combine RankFilter and CasLookup output"
+     help="Select the (multi) output file from the 'Combine RankFilter and CasLookup' tool"/>
+    <param format="tabular" name="msclust_quant_file" type="data" label="MusClust-quantification file output"
+     help="Select the output file from MsClust (centrotype, mic or sim) which also contain respective spectrum details"/>
+
+
+   <param name="organism" type="text" size="80"
+           label="Organism(s) info"
+           help="Metadata information to accompany the results when stored in MetExp DB." >
+           <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+   <param name="tissue" type="text" size="80"
+           label="Tissue(s) info"
+           help="Metadata information to accompany the results when stored in MetExp DB."  >
+           <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+   <param name="experiment_name" type="text" size="80"
+           label="Experiment name/code"
+           help="Name or code to store the results under. This can help you find the results back in MetExpDB."  >
+           <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+   <param name="user_name" type="text" size="80"
+           label="User name"
+           help="User name or code to store the results under. This can help you find the results back in MetExpDB."  >
+           <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+    <param name="column_type" type="text" size="80"
+           label="Column type"
+           help="Column type to report with the results. This can help you find the results back in MetExpDB."  >
+           <validator type="empty_field" message="A value is required."></validator>
+    </param>
+
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} on ${on_string}" name="output_result" />
+  </outputs>
+  <help>
+.. class:: infomark
+
+Tool to combine output from the tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database.
+
+RankFilter, CasLookup are already combined by 'RIQC-Combine RankFilter and CasLookup' tool so here we will use
+this result.
+
+**Notes**
+
+Extra calculations performed:
+- The columns MM and MW are also added here and are derived from the column FORMULA found in RankFilter, CasLookup combined result.
+
+So in total here we merge 2 files and calculate one new column.
+
+
+  </help>
+</tool>

diff -r 000000000000 -r dffc38727496 library_lookup.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/library_lookup.py Sat Feb 07 22:02:00 2015 +0100

[

b'@@ -0,0 +1,327 @@\n+\'\'\'\n+Logic for searching a Retention Index database file given output from NIST\n+\'\'\'\n+import match_library\n+import re\n+import sys\n+import csv\n+\n+__author__ = "Marcel Kempenaar"\n+__contact__ = "brs@nbic.nl"\n+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"\n+__license__ = "MIT"\n+\n+def create_lookup_table(library_file, column_type_name, statphase):\n+ \'\'\'\n+ Creates a dictionary holding the contents of the library to be searched\n+ @param library_file: library to read\n+ @param column_type_name: the columns type name\n+ @param statphase: the columns stationary phase\n+ \'\'\'\n+ (data, header) = match_library.read_library(library_file)\n+ # Test for presence of required columns\n+ if (\'columntype\' not in header or\n+ \'columnphasetype\' not in header or\n+ \'cas\' not in header):\n+ raise IOError(\'Missing columns in \', library_file)\n+\n+ column_type_column = header.index("columntype")\n+ statphase_column = header.index("columnphasetype")\n+ cas_column = header.index("cas")\n+\n+ filtered_library = [line for line in data if line[column_type_column] == column_type_name\n+ and line[statphase_column] == statphase]\n+ lookup_dict = {}\n+ for element in filtered_library:\n+ # Here the cas_number is set to the numeric part of the cas_column value, so if the \n+ # cas_column value is \'C1433\' then cas_number will be \'1433\'\n+ cas_number = str(re.findall(r\'\\d+\', (element[cas_column]).strip())[0])\n+ try:\n+ lookup_dict[cas_number].append(element)\n+ except KeyError:\n+ lookup_dict[cas_number] = [element]\n+ return lookup_dict\n+\n+\n+def _preferred(hits, pref, ctype, polar, model, method):\n+ \'\'\'\n+ Returns all entries in the lookup_dict that have the same column name, type and polarity\n+ as given by the user, uses regression if selected given the model and method to use. The\n+ regression is applied on the column with the best R-squared value in the model\n+ @param hits: all entries in the lookup_dict for the given CAS number\n+ @param pref: preferred GC-column, can be one or more names\n+ @param ctype: column type (capillary etc.)\n+ @param polar: polarity (polar / non-polar etc.)\n+ @param model: data loaded from file containing regression models\n+ @param method: supported regression method (i.e. poly(nomial) or linear)\n+ \'\'\'\n+ match = []\n+ for column in pref:\n+ for hit in hits:\n+ if hit[4] == ctype and hit[5] == polar and hit[6] == column:\n+ # Create copy of found hit since it will be altered downstream\n+ match.extend(hit)\n+ return match, False\n+\n+ # No hit found for current CAS number, return if not performing regression\n+ if not model:\n+ return False, False\n+\n+ # Perform regression\n+ for column in pref:\n+ if column not in model:\n+ break\n+ # Order regression candidates by R-squared value (last element)\n+ order = sorted(model[column].items(), key=lambda col: col[1][-1])\n+ # Create list of regression candidate column names\n+ regress_columns = list(reversed([column for (column, _) in order]))\n+ # Names of available columns\n+ available = [hit[6] for hit in hits]\n+ \n+ # TODO: combine Rsquared and number of datapoints to get the best regression match\n+ \'\'\'\n+ # Iterate regression columns (in order) and retrieve their models\n+ models = {}\n+ for col in regress_columns:\n+ if col in available:\n+ hit = list(hits[available.index(col)])\n+ if hit[4] == ctype:\n+ # models contains all model data including residuals [-2] and rsquared [-1]\n+ models[pref[0]] = model[pref[0]][hit[6]] \n+ # Get the combined maximum for residuals and rsquared\n+ best_match = models[]\n+ # Apply regression\n+ if me'..b'se:\n+ found_hit[-1] = \'0\'\n+ data.append(found_hit)\n+ found_hit = \'\'\n+ else:\n+ data.append(default_hit(row, casf, compound_id))\n+ else:\n+ data.append(default_hit(row, casf, compound_id))\n+ \n+ casf = \'\'\n+ compound_id = \'\'\n+ found_hit = []\n+ dups = []\n+ return data\n+\n+\n+def _save_data(content, outfile):\n+ \'\'\'\n+ Write to output file\n+ @param content: content to write\n+ @param outfile: file to write to\n+ \'\'\'\n+ # header\n+ header = [\'CAS\',\n+ \'NAME\',\n+ \'FORMULA\',\n+ \'RI\',\n+ \'Column.type\',\n+ \'Column.phase.type\',\n+ \'Column.name\',\n+ \'phase.coding\',\n+ \'CAS_column.Name\',\n+ \'Centrotype\',\n+ \'Regression.Column.Name\',\n+ \'min\',\n+ \'max\',\n+ \'nr.duplicates\']\n+ output_handle = csv.writer(open(outfile, \'wb\'), delimiter="\\t")\n+ output_handle.writerow(header)\n+ for entry in content:\n+ output_handle.writerow(entry)\n+\n+\n+def _read_model(model_file):\n+ \'\'\'\n+ Creates an easy to search dictionary for getting the regression parameters\n+ for each valid combination of GC-columns\n+ @param model_file: filename containing the regression models\n+ \'\'\'\n+ regress = list(csv.reader(open(model_file, \'rU\'), delimiter=\'\\t\'))\n+ if len(regress.pop(0)) > 9:\n+ method = \'poly\'\n+ else:\n+ method = \'linear\'\n+\n+ model = {}\n+ # Create new dictionary for each GC-column\n+ for line in regress:\n+ model[line[0]] = {}\n+\n+ # Add data\n+ for line in regress:\n+ if method == \'poly\':\n+ model[line[0]][line[1]] = [float(col) for col in line[2:11]]\n+ else: # linear\n+ model[line[0]][line[1]] = [float(col) for col in line[2:9]]\n+\n+ return model, method\n+\n+\n+def _apply_poly_regression(column1, column2, retention_index, model):\n+ \'\'\'\n+ Calculates a new retention index (RI) value using a given 3rd-degree polynomial\n+ model based on data from GC columns 1 and 2\n+ @param column1: name of the selected GC-column\n+ @param column2: name of the GC-column to use for regression\n+ @param retention_index: RI to convert\n+ @param model: dictionary containing model information for all GC-columns\n+ \'\'\'\n+ coeff = model[column1][column2]\n+ # If the retention index to convert is within range of the data the model is based on, perform regression\n+ if coeff[4] < retention_index < coeff[5]:\n+ return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) + \n+ (retention_index * coeff[1]) + coeff[0])\n+ else:\n+ return False\n+\n+\n+def _apply_linear_regression(column1, column2, retention_index, model):\n+ \'\'\'\n+ Calculates a new retention index (RI) value using a given linear model based on data\n+ from GC columns 1 and 2\n+ @param column1: name of the selected GC-column\n+ @param column2: name of the GC-column to use for regression\n+ @param retention_index: RI to convert\n+ @param model: dictionary containing model information for all GC-columns\n+ \'\'\'\n+ # TODO: No use of limits\n+ coeff = model[column1][column2]\n+ return coeff[1] * retention_index + coeff[0]\n+\n+\n+def main():\n+ \'\'\'\n+ Library Lookup main function\n+ \'\'\'\n+ library_file = sys.argv[1]\n+ nist_tabular_filename = sys.argv[2]\n+ ctype = sys.argv[3]\n+ polar = sys.argv[4]\n+ outfile = sys.argv[5]\n+ pref = sys.argv[6:-1]\n+ regress = sys.argv[-1]\n+\n+ if regress != \'False\':\n+ model, method = _read_model(regress)\n+ else:\n+ model, method = False, None\n+\n+ lookup_dict = create_lookup_table(library_file, ctype, polar)\n+ data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method)\n+\n+ _save_data(data, outfile)\n+\n+\n+if __name__ == "__main__":\n+ main()\n'

diff -r 000000000000 -r dffc38727496 library_lookup.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/library_lookup.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,75 @@
+<tool id="lookup_library" name="RIQC-Lookup RI for CAS numbers in library" version="1.0.2">
+  <description>Lookup or estimate the RI using a "known RI values" CAS numbers library</description>
+  <command interpreter="python">
+    library_lookup.py
+    $library_file
+    $input
+    "$col_type"
+    "$polarity"
+    $output
+    #for $ctype in $pref
+      ${ctype.columntype}
+    #end for
+    $regression.model
+  </command>
+  <inputs>
+  
+    <page>
+      <param format="tabular" name="input" type="data" label="NIST identifications as tabular file"
+       help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />
+      <param name="library_file" type="select" label="CAS x RI Library file"
+       help="Select a library/lookup file containing RI values for CAS numbers on various chromatography columns "
+       dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/>
+    </page>
+    <page>
+      <param name="col_type" type="select" label="Select column type" refresh_on_change="true"
+          display="radio" dynamic_options='get_column_type(library_file)'
+          help="" />
+ </page>
+    <page>
+      <param name="polarity" type="select" label="Select polarity" refresh_on_change="true"
+             display="radio" dynamic_options='filter_column(library_file,col_type)'
+          help="" />
+    </page>
+    <page>
+   <conditional name="regression">
+   <param name="regression_select" type="boolean" checked="false" label="Apply regression method"
+    help="If no data for the selected column is present in the database, selecting this option will try
+       to convert Retention Indices using data from other GC-columns with a regression method. Please
+       note that only the first given GC-column above will be used for this, any alternatives will be
+       ignored" />
+   <when value="true">
+    <param name="model" format="tabular" type="data" label="Tabular file containing regression model"
+       help="This file contains the coefficients used to perform the regression from one GC-column
+             to another GC-column"/>
+   </when>
+          <when value="false">
+            <param name="model" type="hidden" value="False" />
+          </when>
+   </conditional>
+      <repeat name="pref" title="Select column name preference">
+ <param name="columntype" type="select" label="Column name" refresh_on_change="true"
+               dynamic_options='filter_column2(library_file, col_type, polarity)'
+            help="Select one or more column names for filtering. The order defines the priority." />
+      </repeat>
+    </page>
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} on" name="output" />
+</outputs>
+<code file="match_library.py" />
+  <help>
+Performs a lookup of the RI values by matching CAS numbers from the given NIST identifications file to a library.
+If a direct match is NOT found for the preferred column name, a regression can be done to find
+the theoretical RI value based on known RI values for the CAS number on other column types (see step 4).
+If there is no match for the CAS number on any column type, then the record is not given a RI.
+
+
+
+  </help>
+
+</tool>

diff -r 000000000000 -r dffc38727496 match_library.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/match_library.py Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,133 @@
+'''
+Containing functions are called from Galaxy to populate lists/checkboxes with selectable items
+'''
+import csv
+import glob
+import os
+
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def get_column_type(library_file):
+    '''
+    Returns a Galaxy formatted list of tuples containing all possibilities for the
+    GC-column types. Used by the library_lookup.xml tool
+    @param library_file: given library file from which the list of GC-column types is extracted
+    '''
+    if library_file == "":
+        galaxy_output = [("", "", False)]
+    else:
+        (data, header) = read_library(library_file)
+
+        if 'columntype' not in header:
+            raise IOError('Missing columns in ', library_file)
+
+        # Filter data on column type
+        column_type = header.index("columntype")
+        amounts_in_list_dict = count_occurrence([row[column_type] for row in data])
+        galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+
+    return(galaxy_output)
+
+
+def filter_column(library_file, column_type_name):
+    '''
+    Filters the Retention Index database on column type
+    @param library_file: file containing the database
+    @param column_type_name: column type to filter on
+    '''
+    if library_file == "":
+        galaxy_output = [("", "", False)]
+    else:
+        (data, header) = read_library(library_file)
+
+        if ('columntype' not in header or
+            'columnphasetype' not in header):
+            raise IOError('Missing columns in ', library_file)
+
+        column_type = header.index("columntype")
+        statphase = header.index("columnphasetype")
+
+        # Filter data on colunn type name
+        statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name]
+        amounts_in_list_dict = count_occurrence(statphase_list)
+        galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+
+    return(sorted(galaxy_output))
+
+
+def filter_column2(library_file, column_type_name, statphase):
+    '''
+    Filters the Retention Index database on column type
+    @param library_file: file containing the database
+    @param column_type_name: column type to filter on
+    @param statphase: stationary phase of the column to filter on
+    '''
+    if library_file == "":
+        galaxy_output = [("", "", False)]
+    else:
+        (data, header) = read_library(library_file)
+
+        if ('columntype' not in header or
+            'columnphasetype' not in header or
+            'columnname' not in header):
+            raise IOError('Missing columns in ', library_file)
+
+        column_type_column = header.index("columntype")
+        statphase_column = header.index("columnphasetype")
+        column_name_column = header.index("columnname")
+
+        # Filter data on given column type name and stationary phase
+        statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and
+                          line[statphase_column] == statphase]
+        amounts_in_list_dict = count_occurrence(statphase_list)
+        galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+
+    return(sorted(galaxy_output))
+
+
+def read_library(filename):
+    '''
+    Reads a CSV file and returns its contents and a normalized header
+    @param filename: file to read
+    '''
+    data = list(csv.reader(open(filename, 'rU'), delimiter='\t'))
+    header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in data.pop(0)]
+    return(data, header_clean)
+
+
+
+def get_directory_files(dir_name):
+    '''
+    Reads the directory and
+    returns the list of .txt files found as a dictionary
+    with file name and full path so that it can
+    fill a Galaxy drop-down combo box.
+
+    '''
+    files = glob.glob(dir_name + "/*.*")
+    if len(files) == 0:
+        # Configuration error: no library files found in <galaxy-home-dir>/" + dir_name :
+        galaxy_output = [("Configuration error: expected file not found in <galaxy-home-dir>/" + dir_name, "", False)]
+    else:
+        galaxy_output = [(str(get_file_name_no_ext(file_name)), str(os.path.abspath(file_name)), False) for file_name in files]
+    return(galaxy_output)
+
+def get_file_name_no_ext(full_name):
+    '''
+    returns just the last part of the name
+    '''
+    simple_name = os.path.basename(full_name)
+    base, ext = os.path.splitext(simple_name)
+    return base
+
+
+def count_occurrence(data_list):
+    '''
+    Counts occurrences in a list and returns a dict with item:occurrence
+    @param data_list: list to count items from
+    '''
+    return dict((key, data_list.count(key)) for key in set(data_list))

diff -r 000000000000 -r dffc38727496 metaMS_cmd_annotate.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS_cmd_annotate.r Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,86 @@
+## read args:
+args <- commandArgs(TRUE)
+## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData"
+args.constructedDB <- args[1]
+## data file in xset format:
+args.xsetData <- args[2]
+## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings"
+args.settings <- args[3]
+
+## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt"
+args.outAnnotationTable <- args[4]
+
+args.mass_error_function <- args[5]
+if (args.mass_error_function == "0")
+ args.mass_error_function <- NULL
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+
+if (length(args) == 8)
+{
+ args.outLogFile <- args[8]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
+
+cat("\nSettings used===============:\n")
+cat(readChar(args.settings, 1e5))
+
+
+tryCatch(
+        {
+         library(metaMS)
+
+ ## load the constructed DB :
+ tempEnv <- new.env()
+ testDB <- load(args.constructedDB, envir=tempEnv)
+ xsetData <- readRDS(args.xsetData)
+
+ ## load settings "script" into "customMetaMSsettings"
+ source(args.settings, local=tempEnv)
+ message(paste(" loaded : ", args.settings))
+
+ # Just to highlight: if you want to use more than one
+ # trigger runLC:
+ LC <- runLC(xset=xsetData, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, errf=args.mass_error_function, nSlaves=20, returnXset = TRUE)
+
+ # write out runLC annotation results:
+ write.table(LC$PeakTable, args.outAnnotationTable, sep="\t", row.names=FALSE)
+
+ # the used constructed DB (write to log):
+ cat("\nConstructed DB info===============:\n")
+ str(tempEnv[[testDB[1]]]$Info)
+ cat("\nConstructed DB table===============:\n")
+ if (length(args) == 8)
+ {
+ write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE)
+ write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE)
+ }
+
+ message("\nGenerating report.........")
+ # report
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+ html <- "<html><body><h1>Summary of annotation results:</h1>"
+ nrTotalFeatures <- nrow(LC$PeakTable)
+ nrAnnotatedFeatures <- nrow(LC$Annotation$annotation.table)
+ html <- paste(html,"<p>Total nr of features: ", nrTotalFeatures,"</p>", sep="")
+ html <- paste(html,"<p>Total nr of annotated features: ", nrAnnotatedFeatures,"</p>", sep="")
+
+ html <- paste(html,"</body><html>")
+ message("finished generating report")
+ write(html,file=args.htmlReportFile)
+ # unlink(args.htmlReportFile)
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+        error=function(cond) {
+            sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+            message("\nERROR: ===========\n")
+            print(cond)
+        }
+    )

diff -r 000000000000 -r dffc38727496 metaMS_cmd_pick_and_group.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS_cmd_pick_and_group.r Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,92 @@
+## read args:
+args <- commandArgs(TRUE)
+## data files, e.g. "E:/Rworkspace/metaMS/data/data.zip" (with e.g. .CDF files) and unzip output dir, e.g. "E:/"
+args.dataZip <- args[1]
+args.zipExtrDir <- sub("\\.","_",paste(args[1],"dir", sep=""))
+dir.create(file.path(args.zipExtrDir), showWarnings = FALSE, recursive = TRUE)
+## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings"
+args.settings <- args[2]
+
+## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt"
+args.outPeakTable <- args[3]
+args.xsetOut <- args[4]
+
+# polarity as explicit parameter:
+args.runLC_polarity <- args[5]
+
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+
+
+if (length(args) == 8)
+{
+ args.outLogFile <- args[8]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
+
+cat("\nSettings used===============:\n")
+cat(readChar(args.settings, 1e5))
+
+
+tryCatch(
+        {
+         library(metaMS)
+
+ ## load the data files from a zip file
+ files <- unzip(args.dataZip, exdir=args.zipExtrDir)
+
+ ## load settings "script" into "customMetaMSsettings"
+ tempEnv <- new.env()
+ source(args.settings, local=tempEnv)
+ message(paste(" loaded : ", args.settings))
+ allSettings <- tempEnv[["customMetaMSsettings"]]
+
+ # trigger runLC:
+ LC <- runLC(files, settings = allSettings, polarity=args.runLC_polarity, nSlaves=20, returnXset = TRUE)
+
+ # write out runLC annotation results:
+ write.table(LC$PeakTable, args.outPeakTable, sep="\t", row.names=FALSE)
+
+ # save xset as rdata:
+ xsAnnotatePreparedData <- LC$xset
+ saveRDS(xsAnnotatePreparedData, file=args.xsetOut)
+
+ message("\nGenerating report.........")
+ # report
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+ html <- "<html><body><h1>Info on alignment quality </h1>"
+ # TODO add (nr and mass error) and group size
+
+ message("\nPlotting figures... ")
+ figureName <- paste(args.htmlReportFile.files_path, "/figure_retcor.png", sep="")
+ html <- paste(html,"<img src='figure_retcor.png' /><br/>", sep="")
+ png( figureName, type="cairo", width=1100,height=600 )
+ retcor(LC$xset@xcmsSet, method="peakgroups", plottype = "mdevden")
+ html <- paste(html,"<a>*NB: retention time correction plot based on 'peakgroups' option with default settings. This is not the plot matching the exact settings used in the run,
+ but just intended to give a rough estimate of the retention time shifts present in the data. A more accurate plot will be available once
+                                    this option is added in metaMS API. </a><br/>", sep="")
+ devname = dev.off()
+
+
+ gt <- groups(LC$xset@xcmsSet)
+ groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3)
+
+ html <- paste(html,"</body><html>")
+ message("finished generating report")
+ write(html,file=args.htmlReportFile)
+ # unlink(args.htmlReportFile)
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+        error=function(cond) {
+            sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+            message("\nERROR: ===========\n")
+            print(cond)
+        }
+    )

diff -r 000000000000 -r dffc38727496 metams_lcms_annotate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metams_lcms_annotate.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,121 @@
+<tool id="metams_lcms_annotate" name="METAMS-LC/MS Annotate"  version="0.0.4">
+ <description> Runs metaMS process for LC/MS feature annotation</description>
+ <requirements>
+ <requirement type="package" version="3.1.1">R_bioc_metams</requirement>
+ </requirements>
+ <command interpreter="Rscript">
+ metaMS_cmd_annotate.r
+     $constructed_db
+     $xsetData
+     $customMetaMSsettings
+     $outputFile
+     #if $mzTol.mzTolType == "fixed"
+ 0
+ #else
+            "$mzTol.mass_error_function"
+ #end if
+     $htmlReportFile
+     $htmlReportFile.files_path
+     $outputLog
+ </command>
+<inputs>
+ <param name="constructed_db" type="select" label="Constructed DB" help="Reference annotation database generated from matching measurements of a mixture of chemical standards
+ against a manually validated reference table which contains the key analytical information for each standard."
+       dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/metaMS")'/>
+
+ <param name="xsetData" type="data" format="rdata" label="xcmsSet data file (xset RDATA)" help="E.g. output data file resulting from METAMS 'feature picking, aligning and grouping' run"/>
+
+ <param name="protocolName" type="text" size="30" label="protocolName" value="e.g. Synapt.QTOF.RP"
+ help="Choose a name to give for the specific settings in the parameters below"/>
+
+ <param name="rtdiff" type="float" size="10" value="1.5" label="rtdiff" help="(Annotation) Allowed rt difference (in minutes)"/>
+
+ <conditional name="mzTol">
+ <param name="mzTolType" type="select" size="30" label="(Annotation) m/z tolerance type">
+ <option value="fixed" selected="true">Fixed tolerance</option>
+ <option value="adaptive" >Adaptive tolerance</option>
+ </param>
+ <when value="fixed">
+ <param name="mzdiff" type="float" size="10" value="0.005" label="mzdiff" help="(Annotation) Fixed mass tolerance" />
+ </when>
+ <when value="adaptive">
+ <param name="ppm" type="float" size="10" value="5.0" label="ppm" help="(Annotation) Tolerance in ppm" />
+ <param name="mass_error_function" type="text" area="true" size="3x70" label="(Annotation) Mass error function"/>
+ </when>
+ </conditional>
+
+ <param name="rtval" type="float" size="10" value="0.1" label="(max)rtval" help="(Validation) Group items are clustered once more with hierarchical clustering ('complete' method)
+           based on their rt distances. Here one can specify the rt threshold for removing the items that have too diverging rt (the ones with rt difference
+           larger than rtval). " />
+ <param name="minfeat" type="integer" size="10" value="2" label="minfeat"
+            help="(Validation) Threshold for the minimum number of features a
+            cluster/group should have (after rtval filtering above). Other clusters/groups are filtered out." />
+
+</inputs>
+<configfiles>
+
+<configfile name="customMetaMSsettings">## start comment
+ ## metaMS process settings
+ customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}",
+                            chrom = "LC")
+metaSetting(customMetaMSsettings, "match2DB") <- list(
+            rtdiff = ${rtdiff},
+            rtval = ${rtval},
+ #if $mzTol.mzTolType == "fixed"
+ mzdiff = ${mzTol.mzdiff},
+ #else
+            ppm = ${mzTol.ppm},
+ #end if
+            minfeat = ${minfeat})</configfile>
+
+</configfiles>
+
+<outputs>
+ <data name="outputFile" format="tabular" label="${tool.name} on ${on_string} - metaMS annotated file (TSV)"/>
+ <data name="outputLog" format="txt" label="${tool.name} on ${on_string} - metaMS LOG" hidden="True"/>
+ <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - metaMS report (HTML)"/>
+</outputs>
+<tests>
+ <test>
+ </test>
+</tests>
+<code file="match_library.py" /> 
+<help>
+
+.. class:: infomark
+
+Runs metaMS process for LC/MS feature annotation based on matching to an existing 'standards' DB.
+The figure below shows the main parts of this metaMS process.
+
+.. image:: $PATH_TO_IMAGES/metaMS_annotate.png
+
+
+.. class:: infomark
+
+The implemented annotation strategy can be broken down in the following steps:
+
+1. *Feature wise Annotation:* Each feature detected by runLC is matched against the database. If
+the mass error function is provided, the appropriate m/z tolerance is calculated, otherwise a fixed
+tolerance is used (mzdiff). The retention time tolerance is fixed and should be selected on the
+bases of the characteristics of each chromatographic method (rtdiff). Multiple annotations - i.e.
+features which are associated to more than one compound - are possible. This outcome does not
+indicate a problem per se, but is an inherent drawback of co-elution.
+
+2. *Annotation Validation:* The annotated features are organized in 'pseudospectra' collecting all
+the experimental features which are assigned to a specific compound. A specific annotation is
+confirmed only if more than minfeat features which differ in retention time less than rtval are
+present in a pseudospectrum. As a general rule rtval should be narrower than rtdiff. The
+latter, indeed, accounts for shifts in retention time between the injection of the standards and the
+metabolomics experiment under investigation. This time can be rather long, considering that the
+standards are not commonly re-analyzed each time. On the other hand, rtval represents the shift
+between the ions of the same compound within the same batch of injections and therefore it has
+only to account for the smaller shifts occurring during peak picking and alignment.
+
+
+  </help>
+  <citations>
+        <citation type="doi">10.1016/j.jchromb.2014.02.051</citation> 
+   </citations>
+</tool>
\ No newline at end of file

diff -r 000000000000 -r dffc38727496 metams_lcms_pick_and_group.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metams_lcms_pick_and_group.xml Sat Feb 07 22:02:00 2015 +0100

b'@@ -0,0 +1,301 @@\n+<tool id="metams_lcms_pick_and_group" name="METAMS-LC/MS Pick, Align and Group" version="0.0.4">\r\n+\t<description> Runs metaMS process for LC/MS feature picking, aligning and grouping</description>\r\n+\t<requirements>\r\n+\t\t<requirement type="package" version="3.1.1">R_bioc_metams</requirement>\r\n+\t</requirements>\t\r\n+\t<command interpreter="Rscript">\r\n+\t\tmetaMS_cmd_pick_and_group.r \r\n+\t $data_files\r\n+\t $customMetaMSsettings\r\n+\t $outputFile \r\n+\t $xsetOut\r\n+\t $polarity\r\n+\t $htmlReportFile\r\n+\t $htmlReportFile.files_path\r\n+\t $outputLog\r\n+\t</command>\r\n+<inputs>\r\n+\t<param name="data_files" type="data" format="prims.fileset.zip" label="Data files (.zip file with CDF, mzML or mzXML files)" help=".zip file containing the CDF, mzML or mzXML files of the new measurements"/>\r\n+\t\r\n+\t<param name="protocolName" type="text" size="30" label="protocolName" value="e.g. Synapt.QTOF.RP" \r\n+\t\thelp="Choose a name to give for the specific settings in the parameters below"/>\r\n+\t\r\n+\t<param name="polarity" type="select" size="30" label="polarity" \r\n+\t\thelp="Which polarity mode was used for measuring of the ms sample">\r\n+\t\t<option value="positive" selected="true">positive</option>\r\n+\t\t<option value="negative" >negative</option>\r\n+\t</param>\r\n+\t\r\n+\t\r\n+\t\r\n+\t<conditional name="peakPicking">\r\n+\t\t<param name="method" type="select" size="30" label="PEAK PICKING method ====================================================="\r\n+\t\thelp="matchedFilter=Feature detection in the chromatographic time domain ; centWave=Feature detection for high resolution LC/MS data">\r\n+\t\t\t<option value="matchedFilter" selected="true">matchedFilter</option>\r\n+\t\t\t<option value="centWave" >centWave</option>\r\n+\t\t</param>\r\n+\t\t<when value="matchedFilter">\t\r\n+\t\t\t<param name="fwhm" type="integer" size="10" value="20" label="fwhm" \r\n+\t\t\thelp="full width at half maximum of matched filtration gaussian model peak. Only used to calculate the actual sigma" />\r\n+\t\t\t<param name="sigma_denom" type="float" size="10" value="2.3548" label="sigma_denominator" \r\n+\t\t\thelp="denominator for standard deviation (width) of matched filtration model peak (e.g. sigma = fwhm/2.3548)" />\r\n+\t\t\t<param name="max" type="integer" size="10" value="50" label="max" \r\n+\t\t\thelp="maximum number of peaks per extracted ion chromatogram" />\r\n+\t\t\t<param name="snthresh" type="integer" size="10" value="4" label="snthresh" \r\n+\t\t\thelp="signal to noise ratio cutoff" />\r\n+\t\t\t<param name="step" type="float" size="10" value="0.05" label="step" \r\n+\t\t\thelp="step size to use for profile generation"/>\r\n+\t\t\t<param name="steps" type="integer" size="10" value="2" label="steps" \r\n+\t\t\thelp="number of steps to merge prior to filtration"/>\r\n+\t\t\t<param name="mzdiff" type="float" size="10" value="0.8" label="mzdiff" \r\n+\t\t\thelp="minimum difference in m/z for peaks with overlapping retention times"/>\r\n+\t\t</when>\r\n+\t\t<when value="centWave">\r\n+\t\t\t<param name="ppm" type="integer" size="10" value="25" label="ppm" \r\n+\t\t\thelp="maxmial tolerated m/z deviation in consecutive scans, in ppm" />\r\n+\t\t\t<param name="peakwidth" type="text" size="10" value="20,50" label="peakwidth" \r\n+\t\t\thelp="Chromatographic peak width, given as range (min,max) in seconds" />\r\n+\t\t\t<param name="snthresh" type="integer" size="10" value="10" label="snthresh" \r\n+\t\t\thelp="signal to noise ratio cutoff" />\t\t\t\r\n+\t\t\t<param name="prefilter" type="text" size="10" value="3,100" label="prefilter=c(k,I)" \r\n+\t\t\t\thelp="Prefilter step for the first phase. Mass traces are only retained if \r\n+\t\t\t\tthey contain at least k peaks with intensity > = I" />\t\t\t\r\n+\t\t\t<param name="mzCenterFun" type="select" size="30" label="mzCenterFun" \r\n+\t\t\t\thelp="Function to calculate t'..b'akPicking.peakwidth}),\r\n+\t\t\t\t\t\t\t\tsnthresh = ${peakPicking.snthresh},\r\n+\t\t\t\t\t\t\t\tprefilter = c(${peakPicking.prefilter}),\r\n+\t\t\t\t\t\t\t\tmzCenterFun = "${peakPicking.mzCenterFun}",\r\n+\t\t\t\t\t\t\t\tintegrate = ${peakPicking.integrate},\r\n+\t\t\t\t\t\t\t\tmzdiff = ${peakPicking.mzdiff},\r\n+\t\t\t\t\t\t\t\tfitgauss = ${peakPicking.fitgauss},\r\n+\t\t\t\t\t\t\t\tnoise = ${peakPicking.noise}),\r\n+\t\t\t\t\t\t\t#end if\r\n+ Alignment = list(\r\n+ min.class.fraction = ${min_class_fraction},\r\n+ min.class.size = ${min_class_size},\r\n+ mzwid = ${mzwid},\r\n+ bws = c(${bws}),\r\n+ retcormethod = "${retcor.retcormethod}",\r\n+\t\t\t\t\t\t\t#if $retcor.retcormethod == "peakgroups"\r\n+ \t\tsmooth = "${retcor.smooth}",\r\n+ \tmissingratio = ${retcor.missingratio},\r\n+ \textraratio = ${retcor.extraratio},\r\n+ \tretcorfamily = "${retcor.retcorfamily}", \r\n+\t\t\t\t\t\t\t#else\r\n+\t\t\t\t\t\t\t\t##repeating the method as workaround/ backwards compatibility (can remove this one after fix from metaMS):\r\n+\t\t\t\t\t\t\t\tmethod = "${retcor.retcormethod}", \r\n+\t\t\t\t\t\t\t\tprofStep = ${retcor.profStep},\r\n+\t\t\t\t\t\t\t#end if\t\t\t\t\t\t\t\t \r\n+ fillPeaks = ${fillPeaks}),\r\n+ CAMERA = list(\r\n+ perfwhm = ${perfwhm},\r\n+ cor_eic_th = ${cor_eic_th},\r\n+ ppm= ${ppm},\r\n+ graphMethod= "${groupCorr_graphMethod}",\r\n+ pval= ${groupCorr_pval},\r\n+ calcCiS= ${groupCorr_calcCiS},\r\n+ calcIso= ${groupCorr_calcIso},\r\n+ calcCaS= ${groupCorr_calcCaS},\r\n+ maxcharge= ${findIsotopes_maxcharge},\r\n+ maxiso= ${findIsotopes_maxiso},\r\n+ minfrac= ${findIsotopes_minfrac},\r\n+ multiplier= ${findAdducts_multiplier}\r\n+ ))</configfile>\r\n+\r\n+</configfiles>\r\n+\r\n+<outputs>\r\n+\t<data name="outputFile" format="tabular" label="${tool.name} on ${on_string} - peaks table (TSV)"/>\r\n+\t<data name="outputLog" format="txt" label="${tool.name} on ${on_string} - LOG" hidden="True"/>\r\n+\t<data name="xsetOut" format="rdata" label="${tool.name} on ${on_string} - xcmsSet (RDATA)"/>\r\n+\t<data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - report (HTML)"/>\r\n+</outputs>\r\n+<tests>\r\n+\t<test>\r\n+\t</test>\r\n+</tests>\r\n+<help>\r\n+\r\n+.. class:: infomark\r\n+ \r\n+Runs metaMS process for LC/MS feature feature picking, aligning and grouping. \r\n+This part of the metaMS process makes use of the XCMS and CAMERA tools and algorithms.\r\n+CAMERA is used for automatic deconvolution/annotation of LC/ESI-MS data.\r\n+The figure below shows the main parts of the metaMS process wrapped by this tool. \r\n+\r\n+.. image:: $PATH_TO_IMAGES/metaMS_pick_align_camera.png \r\n+\r\n+\r\n+From CAMERA documentation: \r\n+\r\n+.. image:: $PATH_TO_IMAGES/CAMERA_results.png \r\n+\r\n+**References**\r\n+\r\n+If you use this Galaxy tool in work leading to a scientific publication please\r\n+cite the following papers:\r\n+\r\n+Wehrens, R.; Weingart, G.; Mattivi, F. (2014). \r\n+metaMS: an open-source pipeline for GC-MS-based untargeted metabolomics. \r\n+Journal of chromatography B: biomedical sciences and applications, 996 (1): 109-116. \r\n+doi: 10.1016/j.jchromb.2014.02.051 \r\n+handle: http://hdl.handle.net/10449/24012\r\n+\r\n+Wrapper by Pieter Lukasse.\r\n+\r\n+\r\n+ </help>\r\n+ <citations>\r\n+ <citation type="doi">10.1016/j.jchromb.2014.02.051</citation> \r\n+ </citations>\r\n+</tool>\n\\ No newline at end of file\n'

diff -r 000000000000 -r dffc38727496 primsfilters.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/primsfilters.py Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,44 @@
+import logging
+log = logging.getLogger( __name__ )
+
+
+def restrict_prims_metabolomics( context, tool ):
+    """
+    This tool filter will hide prims_metabolomics tools for non-metabolomics users.
+    This can be enabled by adding the following to the
+     ``app:main`` section of ``universe_wsgi.ini``::
+
+        tool_filters = primsfilters:restrict_prims_metabolomics
+
+    and by adding this file to the folder:
+
+        <galaxy-dist>/lib/galaxy/tools/filters
+
+    This is optional and can be used in case some control is desired on whom
+    gets to see the prims_metabolomics tools. When not using this file and the
+    settings mentioned above, all prims_metabolomics tools will be visible to
+    all users.
+    """
+    # for debugging: import pydevd;pydevd.settrace("L0136815.wurnet.nl")
+    user = context.trans.user
+    metabolomics_tools = [ "msclust2", "combine_output", "create_poly_model", "lookup_library",
+                          "NDIStext2tabular", "rankfilterGCMS_tabular", "filter_on_rank",
+                          "export_to_metexp_tabular", "query_metexp" ]
+    found_match = False
+    # iterate over the tool (partial)ids and look for a match (this is compatible with tool shed given ids):
+    for partial_id in metabolomics_tools:
+        if tool.id.find("/"+ partial_id + "/") >= 0:
+            found_match = True
+            break
+    # the second part of this if is compatible with the ids when NOT using tool shed:
+    if found_match or tool.id in metabolomics_tools:
+        # logging.warn( 'FILTER MATCHED: %s' %(tool.name))
+
+        for user_role in user.roles:
+            if user_role.role.name == "PRIMS_METABOLOMICS":
+                return True
+        # not found to have the role, return false:
+        return False
+    else:
+        # return true for any other tool
+        return True
\ No newline at end of file

diff -r 000000000000 -r dffc38727496 query_mass_repos.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_mass_repos.py Sat Feb 07 22:02:00 2015 +0100

[

b'@@ -0,0 +1,289 @@\n+#!/usr/bin/env python\n+# encoding: utf-8\n+\'\'\'\n+Module to query a set of accurate mass values detected by high-resolution mass spectrometers\n+against various repositories/services such as METabolomics EXPlorer database or the \n+MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).\n+\n+It will take the input file and for each record it will query the \n+molecular mass in the selected repository/service. If one or more compounds are found \n+then extra information regarding these compounds is added to the output file.\n+\n+The output file is thus the input file enriched with information about \n+related items found in the selected repository/service. \n+\n+The service should implement the following interface: \n+\n+http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data)\n+\n+The output should be tab separated and should contain the following columns (in this order)\n+db-name molecular-formula dbe formula-weight id description\n+\n+\n+\'\'\'\n+import csv\n+import sys\n+import fileinput\n+import urllib2\n+import time\n+from collections import OrderedDict\n+\n+__author__ = "Pieter Lukasse"\n+__contact__ = "pieter.lukasse@wur.nl"\n+__copyright__ = "Copyright, 2014, Plant Research International, WUR"\n+__license__ = "Apache v2"\n+\n+def _process_file(in_xsv, delim=\'\\t\'):\n+ \'\'\'\n+ Generic method to parse a tab-separated file returning a dictionary with named columns\n+ @param in_csv: input filename to be parsed\n+ \'\'\'\n+ data = list(csv.reader(open(in_xsv, \'rU\'), delimiter=delim))\n+ return _process_data(data)\n+ \n+def _process_data(data):\n+ \n+ header = data.pop(0)\n+ # Create dictionary with column name as key\n+ output = OrderedDict()\n+ for index in xrange(len(header)):\n+ output[header[index]] = [row[index] for row in data]\n+ return output\n+\n+\n+def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):\n+ \n+ \'\'\'\n+ This method will iterate over the record in the input_data and\n+ will enrich them with the related information found (if any) in the \n+ chosen repository/service\n+ \n+ # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies\n+ \'\'\'\n+ merged = []\n+ \n+ for i in xrange(len(input_data[input_data.keys()[0]])):\n+ # Get the record in same dictionary format as input_data, but containing\n+ # a value at each column instead of a list of all values of all records:\n+ input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))\n+ \n+ # read the molecular mass :\n+ molecular_mass = input_data_record[molecular_mass_col]\n+ \n+ \n+ # search for related records in repository/service:\n+ data_found = None\n+ if molecular_mass != "": \n+ molecular_mass = float(molecular_mass)\n+ \n+ # 1- search for data around this MM:\n+ query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"\n+ \n+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")\n+ data_type_found = "MM"\n+ \n+ \n+ if data_found == None:\n+ # If still nothing found, just add empty columns\n+ extra_cols = [\'\', \'\',\'\',\'\',\'\',\'\']\n+ else:\n+ # Add info found:\n+ extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)\n+ \n+ # Take all data and merge it into a "flat"/simple array of values:\n+ field_values_list = _merge_data(input_data_record, extra_cols)\n+ \n+ merged.append(field_values_list)\n+\n+ # return the merged/enriched records:\n+ return merged\n+\n+\n+def'..b'_rows[1].strip() == \'\': \n+ # means there is only the header row...so no hits:\n+ return None\n+ \n+ for data_row in data_rows:\n+ if not data_row.strip() == \'\':\n+ row_as_list = _str_to_list(data_row, delimiter=\'\\t\')\n+ result.append(row_as_list)\n+ \n+ # return result processed into a dict:\n+ return _process_data(result)\n+ \n+ except urllib2.HTTPError, e:\n+ raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)\n+ except urllib2.URLError, e:\n+ raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")\n+\n+def _str_to_list(data_row, delimiter=\'\\t\'): \n+ result = []\n+ for column in data_row.split(delimiter):\n+ result.append(column)\n+ return result\n+ \n+ \n+# alternative: ? \n+# s = requests.Session()\n+# s.verify = False\n+# #s.auth = (token01, token02)\n+# resp = s.get(url, params={\'name\': \'anonymous\'}, stream=True)\n+# content = resp.content\n+# # transform to dictionary:\n+ \n+ \n+ \n+ \n+def _merge_data(input_data_record, extra_cols):\n+ \'\'\'\n+ Adds the extra information to the existing data record and returns\n+ the combined new record.\n+ \'\'\'\n+ record = []\n+ for column in input_data_record:\n+ record.append(input_data_record[column])\n+ \n+ \n+ # add extra columns\n+ for column in extra_cols:\n+ record.append(column) \n+ \n+ return record \n+ \n+\n+def _save_data(data_rows, headers, out_csv):\n+ \'\'\'\n+ Writes tab-separated data to file\n+ @param data_rows: dictionary containing merged/enriched dataset\n+ @param out_csv: output csv file\n+ \'\'\'\n+\n+ # Open output file for writing\n+ outfile_single_handle = open(out_csv, \'wb\')\n+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\\t")\n+\n+ # Write headers\n+ output_single_handle.writerow(headers)\n+\n+ # Write one line for each row\n+ for data_row in data_rows:\n+ output_single_handle.writerow(data_row)\n+\n+def _get_repository_URL(repository_file):\n+ \'\'\'\n+ Read out and return the URL stored in the given file.\n+ \'\'\'\n+ file_input = fileinput.input(repository_file)\n+ try:\n+ for line in file_input:\n+ if line[0] != \'#\':\n+ # just return the first line that is not a comment line:\n+ return line\n+ finally:\n+ file_input.close()\n+ \n+\n+def main():\n+ \'\'\'\n+ Query main function\n+ \n+ The input file can be any tabular file, as long as it contains a column for the molecular mass.\n+ This column is then used to query against the chosen repository/service Database. \n+ \'\'\'\n+ seconds_start = int(round(time.time()))\n+ \n+ input_file = sys.argv[1]\n+ molecular_mass_col = sys.argv[2]\n+ repository_file = sys.argv[3]\n+ error_margin = float(sys.argv[4])\n+ margin_unit = sys.argv[5]\n+ output_result = sys.argv[6]\n+\n+ # Parse repository_file to find the URL to the service:\n+ repository_dblink = _get_repository_URL(repository_file)\n+ \n+ # Parse tabular input file into dictionary/array:\n+ input_data = _process_file(input_file)\n+ \n+ # Query data against repository :\n+ enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)\n+ headers = input_data.keys() + [\'SEARCH hits for \',\'SEARCH hits: db-names\', \'SEARCH hits: molecular-formulas \',\n+ \'SEARCH hits: ids\',\'SEARCH hits: descriptions\', \'Link to SEARCH hits\'] #TODO - add min and max formula weigth columns\n+\n+ _save_data(enriched_data, headers, output_result)\n+ \n+ seconds_end = int(round(time.time()))\n+ print "Took " + str(seconds_end - seconds_start) + " seconds"\n+ \n+ \n+\n+if __name__ == \'__main__\':\n+ main()\n'

diff -r 000000000000 -r dffc38727496 query_mass_repos.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_mass_repos.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,106 @@
+<tool id="query_mass_repos"
+    name="METEXP - Find elemental composition formulas based on mass values "
+    version="0.1.0">
+  <description>Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers</description>
+  <command interpreter="python">
+    query_mass_repos.py
+    $input_file
+    "$molecular_mass_col"
+    "$repository_file"
+    $error_margin
+    $margin_unit
+    $output_result
+  </command>
+  <inputs>
+
+   <param name="input_file" format="tabular" type="data"
+        label="Input file"
+     help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/>
+
+   <param name="molecular_mass_col" type="text" size="50"
+           label="Molecular mass column name"
+           value="MM"
+           help="Name of the column containing the molecular mass information (in the given input file)" />
+
+   <param name="repository_file" type="select" label="Repository/service to query"
+       help="Select the repository/service which should be queried"
+       dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_MassSearch_Services")'/>
+
+   <param name="error_margin" type="float" size="10"
+           label="Error marging"
+           value="0.01"
+           help="Mass difference allowed when searching in the repositories for a mass match." />
+
+   <param name="margin_unit" type="select" label="Margin unit">
+    <option value="ms" selected="True">ms</option>
+     <option value="ppm">ppm</option>
+   </param>
+   
+
+  </inputs>
+  <outputs>
+    <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" />
+  </outputs>
+  <code file="match_library.py" /> 
+  <help>
+.. class:: infomark
+
+This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher
+for elemental compositions from accurate mass values detected by high-resolution mass spectrometers.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected repository. If one or more compounds are found in the
+repository then extra information regarding (mass based)matching elemental composition formulas is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected repository.
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass.
+
+**Services that can be queried**
+
+================= =========================================================================
+Database          Description
+----------------- -------------------------------------------------------------------------
+PRI-MetExp        LC-MS and GC-MS data from experiments from the metabolomics group at
+                  Plant Research International. NB: restricted access to employees with
+                  access key.
+ExactMassDB       A database of possible elemental compositions consits of C: 100,
+                  H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the Senior and
+                  the Lewis valence rules.
+                  (via /mfsearcher/exmassdb/)
+ExactMassDB-HR2   HR2, which is one of the fastest tools for calculation of elemental
+                  compositions, filters some elemental compositions according to
+                  the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2
+                  database returns the same result as does HR2 with the same atom kind
+                  and number condition as that used in construction of the ExactMassDB.
+                  (via /mfsearcher/exmassdb-hr2/)
+Pep1000           A database of possible linear polypeptides that are
+                  constructed with 20 kinds of amino acids and having molecular
+                  weights smaller than 1000.
+                  (via /mfsearcher/pep1000/)
+KEGG              Re-calculated compound data from KEGG. Weekly updated.
+                  (via /mfsearcher/kegg/)
+KNApSAcK          Re-calculated compound data from KNApSAcK.
+                  (via /mfsearcher/knapsack/)
+Flavonoid Viewer  Re-calculated compound data from Flavonoid Viewer .
+                  (via /mfsearcher/flavonoidviewer/
+LipidMAPS         Re-calculated compound data from LIPID MAPS.
+                  (via /mfsearcher/lipidmaps/)
+HMDB              Re-calculated compound data from Human Metabolome Database (HMDB)
+                  Version 3.5.
+                  (via /mfsearcher/hmdb/)
+PubChem           Re-calculated compound data from PubChem. Monthly updated.
+                  (via /mfsearcher/pubchem/)
+================= =========================================================================
+
+Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher
+
+  </help>
+</tool>

diff -r 000000000000 -r dffc38727496 query_metexp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_metexp.py Sat Feb 07 22:02:00 2015 +0100

[

b'@@ -0,0 +1,282 @@\n+#!/usr/bin/env python\n+# encoding: utf-8\n+\'\'\'\n+Module to query a set of identifications against the METabolomics EXPlorer database.\n+\n+It will take the input file and for each record it will query the \n+molecular mass in the selected MetExp DB. If one or more compounds are found in the\n+MetExp DB then extra information regarding these compounds is added to the output file.\n+\n+The output file is thus the input file enriched with information about \n+related items found in the selected MetExp DB. \n+\'\'\'\n+import csv\n+import sys\n+import fileinput\n+import urllib2\n+import time\n+from collections import OrderedDict\n+\n+__author__ = "Pieter Lukasse"\n+__contact__ = "pieter.lukasse@wur.nl"\n+__copyright__ = "Copyright, 2014, Plant Research International, WUR"\n+__license__ = "Apache v2"\n+\n+def _process_file(in_xsv, delim=\'\\t\'):\n+ \'\'\'\n+ Generic method to parse a tab-separated file returning a dictionary with named columns\n+ @param in_csv: input filename to be parsed\n+ \'\'\'\n+ data = list(csv.reader(open(in_xsv, \'rU\'), delimiter=delim))\n+ return _process_data(data)\n+ \n+def _process_data(data):\n+ \n+ header = data.pop(0)\n+ # Create dictionary with column name as key\n+ output = OrderedDict()\n+ for index in xrange(len(header)):\n+ output[header[index]] = [row[index] for row in data]\n+ return output\n+\n+\n+def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):\n+ \'\'\'\n+ This method will iterate over the record in the input_data and\n+ will enrich them with the related information found (if any) in the \n+ MetExp Database.\n+ \n+ # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies\n+ \'\'\'\n+ merged = []\n+ \n+ for i in xrange(len(input_data[input_data.keys()[0]])):\n+ # Get the record in same dictionary format as input_data, but containing\n+ # a value at each column instead of a list of all values of all records:\n+ input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))\n+ \n+ # read the molecular mass and formula:\n+ cas_id = input_data_record[casid_col]\n+ formula = input_data_record[formula_col]\n+ molecular_mass = input_data_record[molecular_mass_col]\n+ \n+ # search for related records in MetExp:\n+ data_found = None\n+ if cas_id != "undef": \n+ # 1- search for other experiments where this CAS id has been found:\n+ query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method\n+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")\n+ data_type_found = "CAS"\n+ if data_found == None:\n+ # 2- search for other experiments where this FORMULA has been found:\n+ query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method\n+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")\n+ data_type_found = "FORMULA"\n+ if data_found == None:\n+ # 3- search for other experiments where this MM has been found:\n+ query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method \n+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")\n+ data_type_found = "MM"\n+ \n+ if data_found == None:\n+ # If still nothing found, just add empty columns\n+ extra_cols = [\'\', \'\',\'\',\'\',\'\',\'\',\'\',\'\']\n+ else:\n+ # Add info found:\n+ extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)\n+ \n+ # Take all data and merge it into a "flat"/simple array of values:\n+ field_values_'..b'ne\n+ \n+ for data_row in data_rows:\n+ if not data_row.strip() == \'\':\n+ row_as_list = _str_to_list(data_row, delimiter=\'\\t\')\n+ result.append(row_as_list)\n+ \n+ # return result processed into a dict:\n+ return _process_data(result)\n+ \n+ except urllib2.HTTPError, e:\n+ raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)\n+ except urllib2.URLError, e:\n+ raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")\n+\n+def _str_to_list(data_row, delimiter=\'\\t\'): \n+ result = []\n+ for column in data_row.split(delimiter):\n+ result.append(column)\n+ return result\n+ \n+ \n+# alternative: ? \n+# s = requests.Session()\n+# s.verify = False\n+# #s.auth = (token01, token02)\n+# resp = s.get(url, params={\'name\': \'anonymous\'}, stream=True)\n+# content = resp.content\n+# # transform to dictionary:\n+ \n+ \n+ \n+ \n+def _merge_data(input_data_record, extra_cols):\n+ \'\'\'\n+ Adds the extra information to the existing data record and returns\n+ the combined new record.\n+ \'\'\'\n+ record = []\n+ for column in input_data_record:\n+ record.append(input_data_record[column])\n+ \n+ \n+ # add extra columns\n+ for column in extra_cols:\n+ record.append(column) \n+ \n+ return record \n+ \n+\n+def _save_data(data_rows, headers, out_csv):\n+ \'\'\'\n+ Writes tab-separated data to file\n+ @param data_rows: dictionary containing merged/enriched dataset\n+ @param out_csv: output csv file\n+ \'\'\'\n+\n+ # Open output file for writing\n+ outfile_single_handle = open(out_csv, \'wb\')\n+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\\t")\n+\n+ # Write headers\n+ output_single_handle.writerow(headers)\n+\n+ # Write one line for each row\n+ for data_row in data_rows:\n+ output_single_handle.writerow(data_row)\n+\n+def _get_metexp_URL(metexp_dblink_file):\n+ \'\'\'\n+ Read out and return the URL stored in the given file.\n+ \'\'\'\n+ file_input = fileinput.input(metexp_dblink_file)\n+ try:\n+ for line in file_input:\n+ if line[0] != \'#\':\n+ # just return the first line that is not a comment line:\n+ return line\n+ finally:\n+ file_input.close()\n+ \n+\n+def main():\n+ \'\'\'\n+ MetExp Query main function\n+ \n+ The input file can be any tabular file, as long as it contains a column for the molecular mass\n+ and one for the formula of the respective identification. These two columns are then\n+ used to query against MetExp Database. \n+ \'\'\'\n+ seconds_start = int(round(time.time()))\n+ \n+ input_file = sys.argv[1]\n+ casid_col = sys.argv[2]\n+ formula_col = sys.argv[3]\n+ molecular_mass_col = sys.argv[4]\n+ metexp_dblink_file = sys.argv[5]\n+ separation_method = sys.argv[6]\n+ output_result = sys.argv[7]\n+\n+ # Parse metexp_dblink_file to find the URL to the MetExp service:\n+ metexp_dblink = _get_metexp_URL(metexp_dblink_file)\n+ \n+ # Parse tabular input file into dictionary/array:\n+ input_data = _process_file(input_file)\n+ \n+ # Query data against MetExp DB :\n+ enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)\n+ headers = input_data.keys() + [\'METEXP hits for \',\'METEXP hits: organisms\', \'METEXP hits: tissues\',\n+ \'METEXP hits: experiments\',\'METEXP hits: user names\',\'METEXP hits: column types\', \'METEXP hits: CAS nrs\', \'Link to METEXP hits\']\n+ \n+ _save_data(enriched_data, headers, output_result)\n+ \n+ seconds_end = int(round(time.time()))\n+ print "Took " + str(seconds_end - seconds_start) + " seconds"\n+ \n+ \n+\n+if __name__ == \'__main__\':\n+ main()\n'

diff -r 000000000000 -r dffc38727496 query_metexp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_metexp.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,69 @@
+<tool id="query_metexp"
+    name="METEXP - Query Database "
+    version="0.1.0">
+  <description>Query a set of identifications against the METabolomics EXPeriments database</description>
+  <command interpreter="python">
+    query_metexp.py
+    $input_file
+    "$casid_col"
+    "$formula_col"
+    "$molecular_mass_col"
+    "$metexp_dblink_file"
+    $separation_method
+    $output_result
+  </command>
+  <inputs>
+
+   <param name="input_file" format="tabular" type="data"
+        label="Input file"
+     help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/>
+
+  <param name="separation_method" type="select" label="Data type to query">
+   <option value="GC" selected="True">GC</option>
+    <option value="LC">LC</option>
+  </param>
+
+
+   <param name="casid_col" type="text" size="50"
+           label="CAS ID column name"
+           value="CAS"
+           help="Name of the column containing the CAS code information (in the given input file)" />
+   <param name="formula_col" type="text" size="50"
+           label="Formula ID column name"
+           value="FORMULA"
+           help="Name of the column containing the formula information (in the given input file)" />
+   <param name="molecular_mass_col" type="text" size="50"
+           label="Molecular mass column name"
+           value="MM"
+           help="Name of the column containing the molecular mass information (in the given input file)" />
+
+   <param name="metexp_dblink_file" type="select" label="MetExp DB to query"
+       help="Select the MetExp Database/backend which should be queried"
+       dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_Databases")'/>
+
+
+  </inputs>
+  <outputs>
+    <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" />
+  </outputs>
+  <code file="match_library.py" /> 
+  <help>
+.. class:: infomark
+
+This tool will Query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected MetExp DB.
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass
+and one for the formula of the respective identification.
+
+
+  </help>
+</tool>

diff -r 000000000000 -r dffc38727496 rankfilterGCMS_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilterGCMS_tabular.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,80 @@
+<tool id="rankfilterGCMS_tabular" name="RIQC-RankFilter GC-MS from tabular file" version="1.0.2">
+  <description>Convert Retention Time to Retention Index</description>
+  <command interpreter="python">rankfilter_GCMS/rankfilter.py $input_file</command>
+  <inputs>
+    <param format="tabular" name="sample" type="data" label="Sample File"
+        help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />
+ 
+    
+    <param name="calibration" format="any" type="data" label="Calibration File"
+           help="Calibration file containing reference masses (e.g. alkanes) with their respective RT and RI values"/>
+
+    <param name="analysis_type" type="select" format="text" label="Analysis Type"
+        help="Select the type of analysis that has been used to generate the sample file">
+      <option value="NIST">NIST</option>
+      <option value="AMDIS">AMDIS</option>
+    </param>
+    <param name="model" type="select" format="text" label="Select a model to be used "
+        help="Both linear and (3rd degree) polynomial models are available ">
+      <option value="linear">Linear</option>
+      <option value="poly">Polynomial</option>
+    </param>
+    <param name="lib_data" type="select" label="Library"
+        help="Reference global lookup library file with CAS numbers and respective (previously calculated) RIsvr values"
+           dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RankFilter_lookup_libraries")'/>
+
+    <param name="window" type="float" label="Window" value="10.56" />
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name}" name="onefile" />
+  </outputs>
+  
+  <code file="match_library.py" />
+  <configfiles>
+    <configfile name="input_file">
+      sample = ${sample}
+      calibration = ${calibration}
+      lib_data = ${lib_data}
+      window = ${window}
+      analysis_type = ${analysis_type}
+      tabular = True
+      onefile = ${onefile}
+      model = ${model}
+    </configfile>
+  </configfiles>
+  <help>
+Basically estimates the experimental RI (RIexp) by building a RI(RT) function based on the
+given calibration file.
+
+It also determines the estimated RI (RIsvr) by looking up for each entry of the given input file (Sample File),
+based on its CAS number, its respective RIsvr value in the given global lookup library
+(this step is also called the "RankFilter analysis" -see reference below; Sample File may be either from NIST or AMDIS).
+This generates an prediction of the RI for
+a compound according to the "RankFilter procedure" (RIsvr).
+
+Output is a tab separated file in which four columns are added:
+
+ - **Rank** Calculated rank
+ - **RIexp** Experimental Retention Index (RI)
+ - **RIsvr** Calculated RI based on support vector regression (SVR)
+ - **%rel.err** Relative RI error (%rel.error = 100 * (RISVR − RIexp) / RIexp)
+
+.. class:: infomark
+
+**Notes**
+
+ - The layout of the Calibration file should include the following columns: 'MW', 'R.T.' and 'RI'.
+ - Selecting 'Polynomial' in the model parameter will calculate a 3rd degree polynomial model that will
+   be used to convert from XXXX to YYYY.
+
+-----
+
+**References**
+
+    - **RankFilter**: Mihaleva et. al. (2009) *Automated procedure for candidate compound selection in GC-MS
+      metabolomics based on prediction of Kovats retention index*. Bioinformatics, 25 (2009), pp. 787–794
+  </help>
+</tool>

diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/__init__.py Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,5 @@
+'''
+Created on Mar 14, 2012
+
+@author: marcelk
+'''

diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/pdfread.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/pdfread.py Sat Feb 07 22:02:00 2015 +0100

[

b'@@ -0,0 +1,214 @@\n+"""\n+Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University \n+\n+Permission is hereby granted, free of charge, to any person obtaining a copy\n+of this software and associated documentation files (the "Software"), to deal\n+in the Software without restriction, including without limitation the rights\n+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n+copies of the Software, and to permit persons to whom the Software is\n+furnished to do so, subject to the following conditions:\n+\n+The above copyright notice and this permission notice shall be included in\n+all copies or substantial portions of the Software.\n+\n+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n+THE SOFTWARE.\n+"""\n+\n+import sys\n+import csv\n+\n+def getPDF(filename, print_progress):\n+ \'\'\'\n+ Parses NIST PDF file\n+ @param filename: PDF file to parse\n+ \'\'\'\n+ NistInput = {}\n+ NistInput_missed = {}\n+ nist_input = open(filename, \'r\').read()\n+\n+ hitid = []\n+ rt = []\n+ name = []\n+ forward = []\n+ cas = []\n+ reverse = []\n+ prob = []\n+ lib_id = []\n+ nist_id = []\n+ missed_compounds = []\n+ id_missed_compounds = []\n+ formula = []\n+\n+ hit_list = nist_input.split(\'** Search Report Page 1 of 1 **\')\n+ hit_list.pop(0)\n+ #number_hits = range(10)\n+ line_id = 0\n+ for line in hit_list:\n+ line = line.strip().translate(None, \'\\r\')\n+ if line != \'\':\n+ hits = line.replace(\'\\n\', \' \').replace(\'\\x0c\', \'\').replace(\'^L\', \'\').split(\'Hit\') #solution? : if we wouldn\'t replace the \\n by \' \' but by some special sign, then reading formula would be simpler! \n+ #strange....code seems fine actually...debug! See test/data/download.pdf \n+ # strange thing is that it looks like the new line does not end up in the text file, eventhough it looks like there is a new line in the pdf...perhaps a bug in the pdf2text command in linux?\n+ spec_id = hits.pop(0).split(\' \')[1]\n+ j = 0\n+ for hh in hits:\n+ cell = hh.split(\';\')\n+ if print_progress == True:\n+ print \'Processing line: \', line_id, \' with length: \', len(cell), \':\\n\\t\', cell\n+ line_id += 1\n+ if len(cell) == 7: # the compound has CAS number\n+ if len(cell[1].split(\':\')) == 2:\n+ forward.append((cell[1].split(\':\')[1]).strip())\n+ # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end\n+ if len(cell[0].split(\':\')) > 2:\n+ name_tmp = \':\'.join(cell[0].split(\':\')[1:])\n+ else:\n+ name_tmp = cell[0].split(\':\')[1]\n+ \n+ name.append(name_tmp.replace(" ", " ").strip())\n+ name_tmp = name_tmp.strip().split(\' \')\n+ if name_tmp:\n+ # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:\n+ if name_tmp[-1][0] == \'C\' or name_tmp[-1][0] == \'F\' or name_tmp[-1][0] == \'H\':\n+ formule = (name_tmp[-1])\n+ else:\n+ formule = (\'not_def\')\n+ else:\n+ '..b'nd(formule.replace(" ", " "))\n+ reverse.append((cell[2].split(\':\')[1]).strip())\n+ prob.append(cell[3].split(\' \')[2].replace(\'%\', \'\'))\n+ cas.append(\'undef\')\n+ lib_id.append((cell[4].split(\':\')[1]).strip())\n+ nist_id.append(cell[5].split(\':\')[1].replace(\'.\', \'\').strip())\n+ j = j + 1\n+\n+ else:\n+ missed_compounds.append(hh)\n+ id_missed_compounds.append(spec_id)\n+\n+ else: # Missing columns, report and quit\n+ missed_compounds.append(hh)\n+ id_missed_compounds.append(spec_id)\n+\n+ for _ in range(j):\n+ hitid.append(str(spec_id.replace(" ", " ")))\n+ #NB: this is the RT as found in the "id" generated by e.g. msclust, so NOT the RT of the library hit:\n+ rt.append(str(float(spec_id.split(\'-\')[3]) / 1e+06))\n+\n+ NistInput[\'ID\'] = hitid\n+ NistInput[\'R.T.\'] = rt\n+ NistInput[\'Name\'] = name\n+ NistInput[\'CAS\'] = cas\n+ NistInput[\'Formula\'] = formula\n+ NistInput[\'Forward\'] = forward\n+ NistInput[\'Reverse\'] = reverse\n+ NistInput[\'Probability\'] = prob\n+ NistInput[\'Library\'] = lib_id\n+ NistInput[\'Library ID\'] = nist_id\n+ NistInput_missed[\'Missed Compounds\'] = missed_compounds\n+ NistInput_missed[\'ID missed Compounds\'] = id_missed_compounds\n+\n+ return NistInput, NistInput_missed\n+\n+\n+def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):\n+ \'\'\'\n+ Converts NIST PDF file to tabular format\n+ @param filename: PDF file to parse\n+ @param output_file: output file for the hits\n+ @param error_file: output file for failed hits\n+ \'\'\'\n+ [HitList, HitList_missed] = getPDF(filename, print_progress)\n+ # save Hitlist as tab seperate file\n+ Hitlist_as_text = "\\t".join(HitList.keys()) + "\\n"\n+ Hitlist_array_of_array = ([HitList[row] for row in HitList.keys()])\n+ Hitlist_as_text += str("\\n".join(["\\t".join(e) for e in zip(*Hitlist_array_of_array)]))\n+ output_fh = open(output_file, \'wb\')\n+ output_fh.write(Hitlist_as_text)\n+ output_fh.close()\n+\n+ out_missed_pdf = open(error_file, \'wb\')\n+ for x, y in zip(HitList_missed[\'Missed Compounds\'], HitList_missed[\'ID missed Compounds\']):\n+ out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\\n")\n+ out_missed_pdf.write(\'%s\\n\' % \'\\t\'.join([y, x]))\n+ out_missed_pdf.close()\n+\n+\n+def read_tabular(in_csv):\n+ \'\'\'\n+ Parses a tab-separated file returning a dictionary with named columns\n+ @param in_csv: input filename to be parsed\n+ \'\'\'\n+ data = list(csv.reader(open(in_csv, \'rU\'), delimiter=\'\\t\'))\n+ header = data.pop(0)\n+ # Create dictionary with column name as key\n+ output = {}\n+ for index in xrange(len(header)):\n+ output[header[index]] = [row[index] for row in data]\n+ return output\n+\n+\n+def read_tabular_old(filename):\n+ \'\'\'\n+ Function to read tabular format (created by convert_pdftotext2tabular)\n+ and output a dict with header of columns as key and value is columns of tabular as list\n+ @param filename: tabular file to read\n+ \'\'\'\n+ input_fh = None\n+ try:\n+ input_fh = open(filename, \'r\')\n+ except IOError, error:\n+ raise error\n+ colnames = input_fh.readline().strip().split(\'\\t\')\n+ cells = []\n+ for line in input_fh.readlines():\n+ cells.append(line.strip().split(\'\\t\'))\n+ #transform from row oriented structure to column oriented structure\n+ cells = zip(*cells)\n+ #store the list of list in form of final output\n+ RankFilterGC_format = {}\n+ for colnumber in range(len(colnames)):\n+ RankFilterGC_format[colnames[colnumber]] = cells[colnumber]\n+ return RankFilterGC_format\n+\n+\n+if __name__ == \'__main__\':\n+ convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)\n'

diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/pdftotabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/pdftotabular.py Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,44 @@
+"""
+Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this software except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+"""
+
+import sys
+import pdfread
+from subprocess import call
+
+
+def convert_pdftotext(filename, output_file):
+    '''
+    Converts PDF file to text
+    @param filename: PDF file to parse
+    @param output_file: output text file for the hits
+    '''
+
+    # "-layout" option in pdftotext call below: Maintain (as best as possible) the original physical layout of the text. The
+    #                                           default is to 'undo' physical layout (columns, hyphenation, etc.) and output
+    #                                           the text in reading order.
+    try:
+        call(["pdftotext", "-layout", filename, output_file])
+    except:
+        raise Exception("Error while trying to convert PDF to text")
+
+
+
+
+if __name__ == '__main__':
+    pdf_as_text = sys.argv[1]+".txt"
+    convert_pdftotext(sys.argv[1], pdf_as_text)
+    pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False)

diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/rankfilter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/rankfilter.py Sat Feb 07 22:02:00 2015 +0100

[

b'@@ -0,0 +1,432 @@\n+"""\n+Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University\n+\n+Permission is hereby granted, free of charge, to any person obtaining a copy\n+of this software and associated documentation files (the "Software"), to deal\n+in the Software without restriction, including without limitation the rights\n+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n+copies of the Software, and to permit persons to whom the Software is\n+furnished to do so, subject to the following conditions:\n+\n+The above copyright notice and this permission notice shall be included in\n+all copies or substantial portions of the Software.\n+\n+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n+THE SOFTWARE.\n+"""\n+\n+#Library functions definition\n+#----------Begin-------------\n+from numpy import array, linalg, ones\n+from numpy.polynomial import polynomial\n+import math\n+import pdfread\n+import sys\n+\n+\n+def calibrate(standards):\n+ \'\'\'\n+ Calculates the RT to RI conversion: RI = a + b*RT\n+ @param standards: variable containing RI and RT data\n+ \'\'\'\n+ A = ones((len(standards[\'R.T.\']), 2), dtype=float)\n+ A[:, 0] = array(map(float, standards[\'R.T.\']))\n+ [coeff, res, r, s] = linalg.lstsq(A, array(map(float, standards[\'RI\'])))\n+\n+ return coeff\n+\n+\n+def calibrate_poly(standards):\n+ \'\'\'\n+ Calculates the RT to RI conversion using a polynomial model\n+ @param standards: variable containing RI and RT data\n+ \'\'\'\n+ retention_time = array(map(float, standards[\'R.T.\']))\n+ retention_index = array(map(float, standards[\'RI\']))\n+\n+ # Fit a 3rd degree polynomial\n+ fit = polynomial.polyfit(retention_time, retention_index, 3)[::-1]\n+ return [fit[0], fit[1], fit[2], fit[3]]\n+\n+\n+def calculate_poly(retention_time, poly_cal):\n+ \'\'\'\n+ Converts a given retention time to retention index using the calculated polynomial model\n+ @param retention_time: retention_time to convert to retention index\n+ @param poly_cal: result from calculating regression\n+ \'\'\'\n+ # Calculates RI based on given retention_time using polynomial function\n+ retention_time = array(map(float, retention_time))\n+ if len(retention_time) > 1:\n+ ri_exp = []\n+ for i in retention_time:\n+ ri_exp.append(poly_cal[0] * (i ** 3) + poly_cal[1] * (i ** 2) + (i * poly_cal[2]) + poly_cal[3])\n+ return ri_exp\n+ else:\n+ return poly_cal[0] * (retention_time ** 3) + poly_cal[1] * (retention_time ** 2) + (retention_time * poly_cal[2]) + poly_cal[3]\n+\n+\n+def convert_rt(hit_list, coeff):\n+ \'\'\'\n+ Converts a given retention time to retention index using the linear model\n+ @param hit_list: list holding the retention time\n+ @param coeff: calculated coefficient (slope and intercept) using the linear model\n+ \'\'\'\n+ #Convert RT to RI\n+ hit_list[\'RIexp\'] = array(map(float, hit_list[\'R.T.\'])) * coeff[0] + coeff[1]\n+ return hit_list\n+\n+\n+def convert_rt_poly(hit_list, poly_cal):\n+ \'\'\'\n+ Calls the actual RT to RI converter and returns the updated list with added RIexp value\n+ @param hit_list: result list containing the retention time\n+ \'\'\'\n+ hit_list[\'RIexp\'] = array(map(float, calculate_poly(hit_list[\'R.T.\'], poly_cal)))\n+ return hit_list\n+\n+\n+def get_data(libdata, LabelCol):\n+ \'\'\'\n+ Retrieves datacolumns indicated by LabelCol from libdata (generic function)\n+ Returns a dict with the requested column names as keys\n+ @param libdata: file from which data is loaded\n+ @param LabelCol: columns to retrieve\n+ \'\'\'\n+ from numpy import take\n+ LibData = op'..b'be found"\n+ sys.exit(1)\n+\n+ standards = get_data(InputData[\'calibration\'], LabelColStand)\n+ if InputData[\'model\'] == \'linear\':\n+ coeff = calibrate(standards)\n+ elif InputData[\'model\'] == \'poly\':\n+ poly_cal = calibrate_poly(standards)\n+ else:\n+ print "error: model ", InputData[\'model\'], " can not be found. Use \'linear\' or \'poly\' "\n+ sys.exit(1)\n+ else:\n+ #No file has been specified for the calibration\n+ #Use the default coefficients\n+ print \'No file has been specified for the calibration\'\n+ print \'WARNING: the default coefficients will be used\'\n+ coeff = array([29.4327, 454.5260])\n+\n+ if InputData[\'analysis_type\'] == \'AMDIS\':\n+ try:\n+ AmdisOut = open(InputData[\'sample\'], \'r\')\n+ print("open ok")\n+ #Specify which data to be extracted from the AMDIS output table\n+ #Weighted and Reverse are measure of matching between the experimental\n+ #and the library spectra. The experimental spectrum is used as template\n+ #for the calculation of Weighted, whereas for Reverse the template is the\n+ #library spectrum\n+ LabelCol = [\'CAS\', \'Name\', \'R.T.\', \'Weighted\', \'Reverse\', \'Purity\']\n+\n+ #Get the data from the AMDIS output file\n+ HitList = get_data(InputData[\'sample\'], LabelCol)\n+ #Remove \'>\' from the names\n+ HitList[\'Name\'] = [s.replace(\'>\', \'\') for s in HitList[\'Name\']]\n+ except:\n+ print "the file", InputData[\'sample\'], "can not be found"\n+ sys.exit(1)\n+ if InputData[\'analysis_type\'] == \'NIST\':\n+ #HitList_missed - a variable of type dictionary containing the hits with the symbol ";"\n+ #in the name\n+ if not NDIS_is_tabular:\n+ print "Warning; NDIS is not tabular format, reading PDF..\\n"\n+ [HitList, HitList_missed] = pdfread.getPDF(InputData[\'sample\'])\n+ else:\n+ HitList = pdfread.read_tabular(InputData[\'sample\'])\n+\n+ #Convert RT to RI\n+ if InputData[\'model\'] == \'linear\':\n+ HitList = convert_rt(HitList, coeff)\n+ if InputData[\'model\'] == \'poly\':\n+ print "Executing convert_rt_poly().."\n+ HitList = convert_rt_poly(HitList, poly_cal)\n+\n+ #------Read the library data with the predicted RI------\n+ try:\n+ LibData = open(InputData[\'lib_data\'], \'r\')\n+ except:\n+ print "the file", InputData[\'lib_data\'], "can not be found"\n+ sys.exit(1)\n+\n+ #Specify which data to be extracted from the library data file\n+ LabelColLib = [\'CAS\', \'Name\', \'RIsvr\', \'Synonyms\']\n+ LibraryData = get_data(InputData[\'lib_data\'], LabelColLib)\n+\n+ #------Match the hits with the library data and rank them------\n+ if InputData[\'window\'] != \'\':\n+ HitList = rank_hit(HitList, LibraryData, InputData[\'window\'])\n+ else:\n+ print "No value for the window used for the filtering is specified \\n"\n+ sys.exit(1)\n+\n+ #------Print the ranked and filtered hits------\n+ #Specify which data to be printed\n+ if InputData[\'analysis_type\'] == \'AMDIS\':\n+ keys_to_print = [\'R.T.\', \'CAS\', \'Name\', \'Rank\', \'RIexp\', \'RIsvr\', \'%rel.err\', \'Weighted\', \'Reverse\', \'Synonyms\']\n+ else:\n+ keys_to_print = [\'ID\', \'R.T.\', \'Name\', \'CAS\', \'Rank\', \'RIexp\', \'RIsvr\', \'%rel.err\', \'Forward\', \'Reverse\', \'Synonyms\', \'Library\']\n+\n+ #skip this error output from reading a pdftotext file when file is tabular \n+ if InputData[\'analysis_type\'] == \'NIST\' and not NDIS_is_tabular:\n+ out_missed_pdf = open(output_files[\'missed_parse_pdf\'], \'w\')\n+ for x, y in zip(HitList_missed[\'Missed Compounds\'], HitList_missed[\'RT missed Compounds\']):\n+ out_missed_pdf.write(\'%s\\n\' % \'\\t\'.join([y, x]))\n+ out_missed_pdf.close()\n+\n+ print_to_file(HitList, output_files, keys_to_print, print_subsets)\n+\n+if __name__ == \'__main__\':\n+ main()\n'

diff -r 000000000000 -r dffc38727496 rankfilter_text2tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_text2tabular.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,14 @@
+<tool id="NDIStext2tabular" name="NIST_UTIL- NIST text to tabular format" version="1.0.2">
+  <description>Convert NIST text to tabular format</description>
+  <command interpreter="python">rankfilter_GCMS/pdftotabular.py $input $output $output_err</command>
+  <inputs>
+    <param format="pdf" name="input" type="data" label="NIST identifications report (PDF)"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} output on ${on_string}"  name="output" />
+    <data format="tabular" label="${tool.name} error log"  name="output_err" />
+  </outputs>
+  <help>
+    This tool converts NIST identification report output (PDF) to a tabular format needed for further use with the RIQC tools.
+  </help>
+</tool>

diff -r 000000000000 -r dffc38727496 select_on_rank.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/select_on_rank.py Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,21 @@
+import csv
+import sys
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+in_file = sys.argv[1]
+out_file = sys.argv[2]
+to_select_list = [str(select.strip()) for select in sys.argv[3].split(',') if (len(select) > 0)]
+
+data = list(csv.reader(open(in_file, 'rb'), delimiter='\t'))
+header = data.pop(0)
+header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in header]
+rank = header_clean.index("rank")
+
+writer = csv.writer(open(out_file, 'wb'), delimiter='\t')
+writer.writerow(header)
+for select in to_select_list:
+ writer.writerows([i for i in data if i[rank] == select])

diff -r 000000000000 -r dffc38727496 select_on_rank.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/select_on_rank.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,15 @@
+<tool id="filter_on_rank" name="RIQC-Filter on rank" version="1.0.2">
+  <description>Filter on the Rank field in the RankFilter output file</description>
+  <command interpreter="python">select_on_rank.py $input $output "$rank"</command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Source file (RankFilter ouptut)"/>
+    <param format="tabular" help="Filter on (keep different values separate with a comma)" value ="1,2"
+    name="rank" type="text" label="Select Ranks to keep"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" label="${tool.name} on ${on_string} selected ${rank}"  name="output" />
+  </outputs>
+  <help>
+This tool removes all entries with non selected rank values from the input file (supported input file is a RankFilter output file).
+  </help>
+</tool>

diff -r 000000000000 -r dffc38727496 static/images/CAMERA_results.png

Binary file static/images/CAMERA_results.png has changed

diff -r 000000000000 -r dffc38727496 static/images/confidence_and_slope_params_explain.png

Binary file static/images/confidence_and_slope_params_explain.png has changed

diff -r 000000000000 -r dffc38727496 static/images/diffreport.png

Binary file static/images/diffreport.png has changed

diff -r 000000000000 -r dffc38727496 static/images/massEIC.png

Binary file static/images/massEIC.png has changed

diff -r 000000000000 -r dffc38727496 static/images/metaMS_annotate.png

Binary file static/images/metaMS_annotate.png has changed

diff -r 000000000000 -r dffc38727496 static/images/metaMS_pick_align_camera.png

Binary file static/images/metaMS_pick_align_camera.png has changed

diff -r 000000000000 -r dffc38727496 static/images/msclust_summary.png

Binary file static/images/msclust_summary.png has changed

diff -r 000000000000 -r dffc38727496 static/images/sample_SIM.png

Binary file static/images/sample_SIM.png has changed

diff -r 000000000000 -r dffc38727496 static/images/sample_sel_and_peak_height_correction.png

Binary file static/images/sample_sel_and_peak_height_correction.png has changed

diff -r 000000000000 -r dffc38727496 static_resources/elements_and_masses.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/static_resources/elements_and_masses.tab Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,104 @@
+Name Atomic number Chemical symbol Relative atomic mass
+Hydrogen 1 H 1.01
+Helium 2 He 4
+Lithium 3 Li 6.94
+Beryllium 4 Be 9.01
+Boron 5 B 10.81
+Carbon 6 C 12.01
+Nitrogen 7 N 14.01
+Oxygen 8 O 16
+Fluorine 9 F 19
+Neon 10 Ne 20.18
+Sodium 11 Na 22.99
+Magnesium 12 Mg 24.31
+Aluminum 13 Al 26.98
+Silicon 14 Si 28.09
+Phosphorus 15 P 30.98
+Sulfur 16 S 32.06
+Chlorine 17 Cl 35.45
+Argon 18 Ar 39.95
+Potassium 19 K 39.1
+Calcium 20 Ca 40.08
+Scandium 21 Sc 44.96
+Titanium 22 Ti 47.9
+Vanadium 23 V 50.94
+Chromium 24 Cr 52
+Manganese 25 Mn 54.94
+Iron 26 Fe 55.85
+Cobalt 27 Co 58.93
+Nickel 28 Ni 58.71
+Copper 29 Cu 63.54
+Zinc 30 Zn 65.37
+Gallium 31 Ga 69.72
+Germanium 32 Ge 72.59
+Arsenic 33 As 74.99
+Selenium 34 Se 78.96
+Bromine 35 Br 79.91
+Krypton 36 Kr 83.8
+Rubidium 37 Rb 85.47
+Strontium 38 Sr 87.62
+Yttrium 39 Y 88.91
+Zirconium 40 Zr 91.22
+Niobium 41 Nb 92.91
+Molybdenum 42 Mo 95.94
+Technetium 43 Tc 96.91
+Ruthenium 44 Ru 101.07
+Rhodium 45 Rh 102.9
+Palladium 46 Pd 106.4
+Silver 47 Ag 107.87
+Cadmium 48 Cd 112.4
+Indium 49 In 114.82
+Tin 50 Sn 118.69
+Antimony 51 Sb 121.75
+Tellurium 52 Te 127.6
+Iodine 53 I 126.9
+Xenon 54 Xe 131.3
+Cesium 55 Cs 132.9
+Barium 56 Ba 137.34
+Lanthanum 57 La 138.91
+Cerium 58 Ce 140.12
+Praseodymium 59 Pr 140.91
+Neodymium 60 Nd 144.24
+Promethium 61 Pm 144.91
+Samarium 62 Sm 150.35
+Europium 63 Eu 151.96
+Gadolinium 64 Gd 157.25
+Terbium 65 Tb 158.92
+Dysprosium 66 Dy 162.5
+Holmium 67 Ho 164.93
+Erbium 68 Er 167.26
+Thulium 69 Tm 168.93
+Ytterbium 70 Yb 173.04
+Lutetium 71 Lu 174.97
+Hafnium 72 Hf 178.49
+Tantalum 73 Ta 180.95
+Wolfram 74 W 183.85
+Rhenium 75 Re 186.2
+Osmium 76 Os 190.2
+Iridium 77 Ir 192.22
+Platinum 78 Pt 195.09
+Gold 79 Au 196.97
+Mercury 80 Hg 200.59
+Thallium 81 Tl 204.37
+Lead 82 Pb 207.19
+Bismuth 83 Bi 208.98
+Polonium 84 Po 208.98
+Astatine 85 At 209.99
+Radon 86 Rn 222.02
+Francium 87 Fr 223.02
+Radium 88 Ra 226
+Actinium 89 Ac 227.03
+Thorium 90 Th 232.04
+Protactinium 91 Pa 231.04
+Uranium 92 U 238.03
+Neptunium 93 Np 237
+Plutonium 94 Pu 242
+Americium 95 Am 243.06
+Curium 96 Cm 247.07
+Berkelium 97 Bk 247.07
+Californium 98 Cf 251.08
+Einsteinium 99 Es 254.09
+Fermium 100 Fm 257.1
+Mendelevium 101 Md 257.1
+Nobelium 102 No 255.09
+Lawrencium 103 Lr 256.1

diff -r 000000000000 -r dffc38727496 test/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/__init__.py Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,1 @@
+''' unit tests '''

diff -r 000000000000 -r dffc38727496 test/integration_tests.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/integration_tests.py Sat Feb 07 22:02:00 2015 +0100

[

b'@@ -0,0 +1,268 @@\n+\'\'\'Integration tests for the GCMS project\'\'\'\n+\n+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611\n+from GCMS import library_lookup, combine_output\n+from GCMS.rankfilter_GCMS import rankfilter\n+import os.path\n+import sys\n+import unittest\n+import re\n+\n+\n+class IntegrationTest(unittest.TestCase):\n+ def test_library_lookup(self):\n+ \'\'\'\n+ Run main for data/NIST_tabular and compare produced files with references determined earlier.\n+ \'\'\'\n+ # Create out folder\n+ outdir = "output/" #tempfile.mkdtemp(prefix=\'test_library_lookup\')\n+ if not os.path.exists(outdir):\n+ os.makedirs(outdir)\n+ outfile_base = os.path.join(outdir, \'produced_library_lookup\')\n+ outfile_txt = outfile_base + \'.txt\'\n+\n+ #Build up arguments and run\n+ input_txt = resource_filename(__name__, "data/NIST_tabular.txt")\n+ library = resource_filename(__name__, "data/RIDB_subset.txt")\n+ regress_model = resource_filename(__name__, "data/ridb_poly_regression.txt")\n+ sys.argv = [\'test\',\n+ library,\n+ input_txt,\n+ \'Capillary\',\n+ \'Semi-standard non-polar\',\n+ outfile_txt,\n+ \'HP-5\',\n+ regress_model]\n+ # Execute main function with arguments provided through sys.argv\n+ library_lookup.main()\n+ #Compare with reference files\n+ reference_txt = resource_filename(__name__, \'reference/produced_library_lookup.txt\')\n+ \n+ #read both the reference file and actual output files\n+ expected = _read_file(reference_txt)\n+ actual = _read_file(outfile_txt)\n+ \n+ #convert the read in files to lists we can compare\n+ expected = expected.split()\n+ actual = actual.split()\n+\n+ for exp, act in zip(expected, actual):\n+ if re.match(\'\\\\d+\\\\.\\\\d+\', exp):\n+ exp = float(exp)\n+ act = float(act)\n+ self.assertAlmostEqual(exp, act, places=5)\n+ else:\n+ # compare values\n+ self.failUnlessEqual(expected, actual)\n+\n+\n+ def test_combine_output_simple(self):\n+ \'\'\'\n+ Run main for data/NIST_tabular and compare produced files with references determined earlier.\n+ \'\'\'\n+ # Create out folder\n+ outdir = "output/" #tempfile.mkdtemp(prefix=\'test_library_lookup\')\n+ if not os.path.exists(outdir):\n+ os.makedirs(outdir)\n+ outfile_base = os.path.join(outdir, \'produced_combine_output\')\n+ outfile_single_txt = outfile_base + \'_single.txt\'\n+ outfile_multi_txt = outfile_base + \'_multi.txt\'\n+\n+ #Build up arguments and run\n+ input_rankfilter = resource_filename(__name__, "data/Rankfilter.txt")\n+ input_caslookup = resource_filename(__name__, "data/Caslookup.txt")\n+ sys.argv = [\'test\',\n+ input_rankfilter,\n+ input_caslookup,\n+ outfile_single_txt,\n+ outfile_multi_txt]\n+ # Execute main function with arguments provided through sys.argv\n+ combine_output.main()\n+ #Compare with reference files\n+ # reference_single_txt = resource_filename(__name__, \'reference/produced_combine_output_single.txt\')\n+ # reference_multi_txt = resource_filename(__name__, \'reference/produced_combine_output_multi.txt\')\n+ # self.failUnlessEqual(_read_file(reference_single_txt), _read_file(outfile_single_txt))\n+ # self.failUnlessEqual(_read_file(reference_multi_txt), _read_file(outfile_multi_txt))\n+\n+ #Clean up\n+ #shutil.rmtree(tempdir)\n+\n+\n+ \n+ def def_test_rank_filter_advanced(self):\n+ \'\'\'\n+ Run main of RankFilter\n+ \'\'\'\n+ # Create out folder\n+ outdir = "output/integration/"\n+ if not os.path.exists(outdir):\n+ '..b' combine_result_single_items = combine_output._process_data(outfile_single_txt)\n+ combine_result_multi_items = combine_output._process_data(outfile_multi_txt)\n+ self.assertGreater(len(combine_result_single_items[\'Centrotype\']), \n+ len(combine_result_multi_items[\'Centrotype\']))\n+ \n+ \n+ # Check 3: library_lookup RI column, centrotype column, ri_svr column are correct:\n+ caslookup_items = combine_output._process_data(input_caslookup)\n+ rankfilter_items = combine_output._process_data(input_rankfilter)\n+ \n+ # check that the caslookup RI column is correctly maintained in its original order in\n+ # the combined file:\n+ ri_caslookup = caslookup_items[\'RI\']\n+ ri_combine_single = combine_result_single_items[\'RI\']\n+ self.assertListEqual(ri_caslookup, ri_combine_single) \n+ \n+ # check the centrotype column\'s integrity:\n+ centrotype_caslookup = caslookup_items[\'Centrotype\']\n+ centrotype_combine_single = combine_result_single_items[\'Centrotype\']\n+ centrotype_rankfilter = _get_centrotype_rankfilter(rankfilter_items[\'ID\'])\n+ self.assertListEqual(centrotype_caslookup, centrotype_combine_single)\n+ self.assertListEqual(centrotype_caslookup, centrotype_rankfilter)\n+ \n+ # integration and integrity checks:\n+ file_NIST = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt")\n+ file_NIST_items = combine_output._process_data(file_NIST)\n+ # check that rank filter output has exactly the same ID items as the original NIST input file:\n+ self.assertListEqual(file_NIST_items[\'ID\'], rankfilter_items[\'ID\']) \n+ # check the same for the CAS column:\n+ self.assertListEqual(_get_strippedcas(file_NIST_items[\'CAS\']), rankfilter_items[\'CAS\'])\n+ # now check the NIST CAS column against the cas lookup results: \n+ cas_NIST = _get_processedcas(file_NIST_items[\'CAS\'])\n+ self.assertListEqual(cas_NIST, caslookup_items[\'CAS\'])\n+ # now check the CAS of the combined result. If all checks are OK, it means the CAS column\'s order\n+ # and values remained stable throughout all steps: \n+ self.assertListEqual(rankfilter_items[\'CAS\'], combine_result_single_items[\'CAS\']) \n+ \n+ # check that the rankfilter RIsvr column is correctly maintained in its original order in\n+ # the combined file:\n+ risvr_rankfilter = rankfilter_items[\'RIsvr\']\n+ risvr_combine_single = combine_result_single_items[\'RIsvr\']\n+ self.assertListEqual(risvr_rankfilter, risvr_combine_single) \n+\n+ \n+ \n+\n+def _get_centrotype_rankfilter(id_list):\n+ \'\'\'\n+ returns the list of centrotype ids given a list of ID in the\n+ form e.g. 74-1.0-564-1905200-7, where the numbers before the \n+ first "-" are the centrotype id\n+ \'\'\'\n+ result = []\n+ for compound_id_idx in xrange(len(id_list)):\n+ compound_id = id_list[compound_id_idx]\n+ centrotype = compound_id.split(\'-\')[0]\n+ result.append(centrotype) \n+\n+ return result\n+\n+\n+def _get_processedcas(cas_list):\n+ \'\'\'\n+ returns the list cas numbers in the form C64175 instead of 64-17-5\n+ \'\'\'\n+ result = []\n+ for cas_id_idx in xrange(len(cas_list)):\n+ cas = cas_list[cas_id_idx]\n+ processed_cas = \'C\' + str(cas.replace(\'-\', \'\').strip())\n+ result.append(processed_cas) \n+\n+ return result\n+\n+def _get_strippedcas(cas_list):\n+ \'\'\'\n+ removes the leading white space from e.g. " 64-17-5"\n+ \'\'\'\n+ result = []\n+ for cas_id_idx in xrange(len(cas_list)):\n+ cas = cas_list[cas_id_idx]\n+ processed_cas = cas.strip()\n+ result.append(processed_cas) \n+\n+ return result\n+\n+\n+def _read_file(filename):\n+ \'\'\'\n+ Helper method to quickly read a file\n+ @param filename:\n+ \'\'\'\n+ with open(filename) as handle:\n+ return handle.read()\n'

diff -r 000000000000 -r dffc38727496 test/test_combine_output.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_combine_output.py Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,106 @@
+'''
+Created on Mar 27, 2012
+
+@author: marcelk
+'''
+from GCMS import combine_output
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+import os
+import shutil
+import tempfile
+import unittest
+
+
+class Test(unittest.TestCase):
+    '''
+    Tests for the 'combine_output' Galaxy tool
+    '''
+
+    def setUp(self):
+        self.rf_output = resource_filename(__name__, "data/RankFilter.txt")
+        self.cl_output = resource_filename(__name__, "data/CasLookup.txt")
+
+    def test_process_data(self):
+        '''
+        Tests the processing of the RankFilter and CasLookup files into dictionaries
+        '''
+        rfdata = combine_output._process_data(self.rf_output)
+        cldata = combine_output._process_data(self.cl_output)
+        self.assertEqual(set([' 18457-04-0', ' 55133-95-4', ' 58-08-2', ' 112-34-5']), set(rfdata['CAS']))
+        self.assertEqual(set(['C58082', 'C18457040', 'C55133954', 'C112345']), set(cldata['CAS']))
+
+    def test_add_hit(self):
+        '''
+        Tests the combination of two records from both the RankFilter- and CasLookup-tools
+        '''
+        rfdata = combine_output._process_data(self.rf_output)
+        cldata = combine_output._process_data(self.cl_output)
+        index = 0
+        rf_record = dict(zip(rfdata.keys(), [rfdata[key][index] for key in rfdata.keys()]))
+        cl_record = dict(zip(cldata.keys(), [cldata[key][index] for key in cldata.keys()]))
+
+        hit = combine_output._add_hit(rf_record, cl_record)
+        self.assertEqual(len(hit), 27)
+
+        # Pass empty record, should fail combination
+        self.assertRaises(KeyError, combine_output._add_hit, rf_record, {})
+
+    def test_merge_data(self):
+        '''
+        Tests the merging of the RankFilter and CasLookup data
+        '''
+        rfdata = combine_output._process_data(self.rf_output)
+        cldata = combine_output._process_data(self.cl_output)
+        merged, _ = combine_output._merge_data(rfdata, cldata)
+        centrotypes = _get_centrotypes(merged)
+        self.failUnless(all(centrotype in centrotypes for centrotype in ('2716','12723', '3403', '12710')))
+
+def _get_centrotypes(merged):
+    '''
+    returns centrotype codes found in merged set
+    '''
+    result = []
+    for item_idx in xrange(len(merged)):
+        item = merged[item_idx]
+        centrotype = item[0][0]
+        result.append(centrotype)
+
+    return result
+
+    def test_remove_formula(self):
+        '''
+        Tests the removal of the Formula from the 'Name' field (RankFilter output)
+        '''
+        name = "Caffeine C8H10N4O2"
+        compound_name, compound_formula = combine_output._remove_formula(name)
+        self.assertEqual(compound_name, 'Caffeine')
+        self.assertEqual(compound_formula, 'C8H10N4O2')
+        name = "Ethanol C2H6O"
+        compound_name, compound_formula = combine_output._remove_formula(name)
+        self.assertEqual(compound_name, 'Ethanol')
+        self.assertEqual(compound_formula, 'C2H6O')
+        # No formula to remove
+        name = "Butanoic acid, 4-[(trimethylsilyl)oxy]-, trimethylsilyl ester"
+        compound_name, compound_formula = combine_output._remove_formula(name)
+        self.assertEqual(compound_name, name)
+        self.assertEqual(compound_formula, False)
+
+    def test_save_data(self):
+        '''
+        Tests the creation of the output tabular files (no content testing)
+        '''
+        temp_folder = tempfile.mkdtemp(prefix='gcms_combine_output_')
+        saved_single_data = '{0}/{1}'.format(temp_folder, 'output_single.tsv')
+        saved_multi_data = '{0}/{1}'.format(temp_folder, 'output_multi.tsv')
+        rfdata = combine_output._process_data(self.rf_output)
+        cldata = combine_output._process_data(self.cl_output)
+        merged, nhits = combine_output._merge_data(rfdata, cldata)
+        combine_output._save_data(merged, nhits, saved_single_data, saved_multi_data)
+        self.failUnless(os.path.exists(saved_single_data))
+        self.failUnless(os.path.exists(saved_multi_data))
+        shutil.rmtree(temp_folder)
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()

diff -r 000000000000 -r dffc38727496 test/test_export_to_metexp_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_export_to_metexp_tabular.py Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,112 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+from GCMS import export_to_metexp_tabular
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+    def test_MM_calculations(self):
+        '''
+        test the implemented method for MM calculations for
+        given chemical formulas
+        '''
+        export_to_metexp_tabular.init_elements_and_masses_map()
+
+        formula = "C8H18O3"
+        # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+        result = export_to_metexp_tabular.get_molecular_mass(formula)
+        self.assertEqual(162.26, result)
+
+        formula = "CH2O3Fe2Ni"
+        # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+        result = export_to_metexp_tabular.get_molecular_mass(formula)
+        self.assertAlmostEqual(232.44, result, 2)
+
+
+
+
+
+    def test_combine_output_simple(self):
+        '''
+        comment me
+        '''
+        # Create out folder
+        outdir = "output/metexp/"
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+
+        #Build up arguments and run
+
+        rankfilter_and_caslookup_combined_file = resource_filename(__name__, "data/dummy1_produced_combine_output_single.txt")
+        msclust_quantification_and_spectra_file = resource_filename(__name__, "data/dummy1_sim.txt")
+        output_csv = resource_filename(__name__, outdir + "metexp_tabular.txt")
+
+        sys.argv = ['test',
+                    rankfilter_and_caslookup_combined_file,
+                    msclust_quantification_and_spectra_file,
+                    output_csv,
+                    'tomato',
+                    'leafs',
+                    'test experiment',
+                    'pieter',
+                    'DB5 column']
+
+        # Execute main function with arguments provided through sys.argv
+        export_to_metexp_tabular.main()
+
+        '''
+        # Asserts are based on reading in with process_data and comparing values of
+        # certain columns
+
+        # Check 3: library_lookup RI column, centrotype column, ri_svr column are correct:
+        caslookup_items = combine_output._process_data(input_caslookup)
+        rankfilter_items = combine_output._process_data(input_rankfilter)
+
+        # check that the caslookup RI column is correctly maintained in its original order in
+        # the combined file:
+        ri_caslookup = caslookup_items['RI']
+        ri_combine_single = combine_result_single_items['RI']
+        self.assertListEqual(ri_caslookup, ri_combine_single)
+
+        # check the centrotype column's integrity:
+        centrotype_caslookup = caslookup_items['Centrotype']
+        centrotype_combine_single = combine_result_single_items['Centrotype']
+        centrotype_rankfilter = _get_centrotype_rankfilter(rankfilter_items['ID'])
+        self.assertListEqual(centrotype_caslookup, centrotype_combine_single)
+        self.assertListEqual(centrotype_caslookup, centrotype_rankfilter)
+
+        # integration and integrity checks:
+        file_NIST = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt")
+        file_NIST_items = combine_output._process_data(file_NIST)
+        # check that rank filter output has exactly the same ID items as the original NIST input file:
+        self.assertListEqual(file_NIST_items['ID'], rankfilter_items['ID'])
+        # check the same for the CAS column:
+        self.assertListEqual(_get_strippedcas(file_NIST_items['CAS']), rankfilter_items['CAS'])
+        # now check the NIST CAS column against the cas lookup results:
+        cas_NIST = _get_processedcas(file_NIST_items['CAS'])
+        self.assertListEqual(cas_NIST, caslookup_items['CAS'])
+        # now check the CAS of the combined result. If all checks are OK, it means the CAS column's order
+        # and values remained stable throughout all steps:
+        self.assertListEqual(rankfilter_items['CAS'], combine_result_single_items['CAS'])
+
+        # check that the rankfilter RIsvr column is correctly maintained in its original order in
+        # the combined file:
+        risvr_rankfilter = rankfilter_items['RIsvr']
+        risvr_combine_single = combine_result_single_items['RIsvr']
+        self.assertListEqual(risvr_rankfilter, risvr_combine_single)
+        '''
+
+
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename:
+    '''
+    with open(filename) as handle:
+        return handle.read()

diff -r 000000000000 -r dffc38727496 test/test_library_lookup.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_library_lookup.py Sat Feb 07 22:02:00 2015 +0100

[

b'@@ -0,0 +1,180 @@\n+\'\'\'\n+Created on Mar 6, 2012\n+\n+@author: marcelk\n+\'\'\'\n+from GCMS import library_lookup, match_library\n+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611\n+import os\n+import shutil\n+import tempfile\n+import unittest\n+\n+\n+class Test(unittest.TestCase):\n+ \'\'\'\n+ Tests the \'library_lookup\' Galaxy tool\n+ \'\'\'\n+\n+ def setUp(self):\n+ self.ri_database = resource_filename(__name__, "data/RIDB_subset.txt")\n+ self.nist_output = resource_filename(__name__, "data/NIST_tabular.txt")\n+ self.ridb_poly_regress = resource_filename(__name__, "data/ridb_poly_regression.txt")\n+ self.ridb_linear_regress = resource_filename(__name__, "data/ridb_linear_regression.txt")\n+\n+ def test_create_lookup_table(self):\n+ \'\'\'\n+ Tests the \'create_lookup_table\' function\n+ \'\'\'\n+ column_type = \'Capillary\'\n+ polarity = \'Semi-standard non-polar\'\n+ lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)\n+ self.assertFalse(False in [res[4] == \'Capillary\' for res in lookup_dict[\'4177166\']])\n+ self.assertEqual([\'C51276336\', \'2,6-Dimethyl-octa-1,7-dien-3,6-diol\', \'C10H18O2\',\n+ \'1277\', \'Capillary\', \'Semi-standard non-polar\', \'DB-5MS\', \'1\',\n+ \'C51276336_DB-5MS\', \'\', \'\', \'\'], lookup_dict[\'51276336\'][1])\n+\n+ def test_read_model(self):\n+ \'\'\'\n+ Tests reading the regression model data containing the parameters required for converting\n+ retention indices between GC-columns\n+ \'\'\'\n+ model, _ = library_lookup._read_model(self.ridb_poly_regress)\n+ # Order of values: coefficient 1 through 4, left limit, right limit\n+ # Polynomial model\n+ self.assertEqual([20.6155874639486, 0.945187096379008, 3.96480787567566e-05, -9.04377237159287e-09,\n+ 628.0, 2944.0, 405.0, 0, 0.998685262365514], model[\'HP-5\'][\'SE-54\'])\n+ self.assertEqual([-92.3963391356951, 1.26116176393346, -0.000191991657547972, 4.15387371263164e-08,\n+ 494.0, 2198.0, 407.0, 0, 0.996665023122993], model[\'Apiezon L\'][\'Squalane\'])\n+ # Linear model\n+ model, _ = library_lookup._read_model(self.ridb_linear_regress)\n+ self.assertEqual([2.81208738561543, 0.99482475526584, 628.0, 2944.0, 405.0, 0, 0.998643883946458],\n+ model[\'HP-5\'][\'SE-54\'])\n+ self.assertEqual([19.979922768462, 0.993741869298272, 494.0, 2198.0, 407.0, 0, 0.99636062891041],\n+ model[\'Apiezon L\'][\'Squalane\'])\n+\n+ def test_apply_regression(self):\n+ \'\'\'\n+ Tests the regression model on some arbitrary retention indices\n+ \'\'\'\n+ poly_model, _ = library_lookup._read_model(self.ridb_poly_regress)\n+ linear_model, _ = library_lookup._read_model(self.ridb_linear_regress)\n+ retention_indices = [1000, 1010, 1020, 1030, 1040, 1050]\n+ converted_poly = []\n+ converted_linear = []\n+ for ri in retention_indices:\n+ converted_poly.append(library_lookup._apply_poly_regression(\'HP-5\', \'DB-5\', ri, poly_model))\n+ converted_linear.append(library_lookup._apply_linear_regression(\'HP-5\', \'DB-5\', ri, linear_model))\n+\n+ self.assertEqual([1003.0566541860778, 1013.0979459524663, 1023.1358645806529, 1033.170466241159,\n+ 1043.2018071045052, 1053.2299433412131], converted_poly)\n+ self.assertEqual([1001.8127584915925, 1011.830140783027, 1021.8475230744615, 1031.864905365896,\n+ 1041.8822876573306, 1051.899669948765], converted_linear)\n+ \n+ # Test polynomial limit detection, the following RI falls outside of the possible limits\n+ ri = 3400\n+ converted_poly = library_lookup._apply_poly_regression(\'HP-5\', \'DB-5\', ri, poly_model)\n+ self.assertEqual(False, converted_poly)\n+\n+ def test_preferred_hit(self):\n+ '..b' \'Semi-standard non-polar\', \'SE-52\', \'\', \'C150867_SE-52\', \'\', \'\', \'\'], \'SE-52\')\n+ self.assertEqual(expected, match)\n+\n+ def test_format_result(self):\n+ \'\'\'\n+ Tests the \'format_result\' function\n+ \'\'\'\n+ column_type = \'Capillary\'\n+ polarity = \'Semi-standard non-polar\'\n+\n+ # Look for DB-5\n+ pref_column = [\'DB-5\']\n+ model, method = library_lookup._read_model(self.ridb_poly_regress)\n+ lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)\n+ data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type,\n+ polarity, model, method)#False, None)\n+\n+ # remove non-hits from set:\n+ data = _get_hits_only(data)\n+ self.assertEqual([\'C544354\', \'Ethyl linoleate\', \'C20H36O2\', \'2155\', \'Capillary\', \'Semi-standard non-polar\',\n+ \'DB-5\', \'1\', \'C544354_DB-5\', \'1810\', \'None\', \'\', \'\', \'0\'], data[20])\n+ self.assertEqual(111, len(data))\n+\n+ # Look for both DB-5 and HP-5\n+ pref_column = [\'DB-5\', \'HP-5\']\n+ data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type,\n+ polarity, False, None)\n+ # remove non-hits from set:\n+ data = _get_hits_only(data)\n+ self.assertEqual([\'C502614\', \'.beta.-(E)-Farnesene\', \'C15H24\', \'1508\', \'Capillary\', \'Semi-standard non-polar\',\n+ \'DB-5\', \'1\', \'C502614_DB-5\', \'942\', \'None\', \'1482\', \'1522\', \'22\'], data[50])\n+ self.assertEqual(106, len(data))\n+\n+\n+ def test_save_data(self):\n+ \'\'\'\n+ Tests the creation of the output tabular file\n+ \'\'\'\n+ temp_folder = tempfile.mkdtemp(prefix=\'gcms_combine_output_\')\n+ saved_data = \'{0}/{1}\'.format(temp_folder, \'output.tsv\')\n+ column_type = \'Capillary\'\n+ polarity = \'Semi-standard non-polar\'\n+ pref_column = [\'DB-5\']\n+ lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)\n+ data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type, polarity, False, None)\n+ library_lookup._save_data(data, saved_data)\n+ self.failUnless(os.path.exists(saved_data))\n+ shutil.rmtree(temp_folder)\n+ \n+ \n+ def test_match_library_get_lib_files(self):\n+ \'\'\'\n+ Tests the match_library.py functionality\n+ \'\'\'\n+ riqc_libs_dir = resource_filename(__name__, "../repositories/PRIMS-metabolomics/RI_DB_libraries")\n+ get_library_files_output = match_library.get_directory_files(riqc_libs_dir)\n+ self.assertEqual(2, len(get_library_files_output))\n+ self.assertEqual("Library_RI_DB_capillary_columns-noDuplicates", get_library_files_output[0][0])\n+ #TODO change assert below to assert that the result is a file, so the test can run on other dirs as well:\n+ #self.assertEqual("E:\\\\workspace\\\\PRIMS-metabolomics\\\\python-tools\\\\tools\\\\GCMS\\\\test\\\\data\\\\riqc_libs\\\\RI DB library (capillary columns) Dec.2012.txt", get_library_files_output[0][1])\n+ #self.assertEqual("RI DB library (capillary columns) Jan.2013", get_library_files_output[1][0]) \n+ try:\n+ get_library_files_output = match_library.get_directory_files("/blah")\n+ # should not come here\n+ self.assertTrue(False)\n+ except:\n+ # should come here\n+ self.assertTrue(True)\n+\n+def _get_hits_only(data):\n+ \'\'\'\n+ removes items that have RI == 0.0 and Name == \'\' (these are dummy lines just for the output\n+ \'\'\'\n+ result = []\n+ for item_idx in xrange(len(data)):\n+ item = data[item_idx]\n+ if item[1] != \'\' and item[3] > 0.0 :\n+ result.append(item) \n+\n+ return result \n+\n+\n+if __name__ == "__main__":\n+ #import sys;sys.argv = [\'\', \'Test.testName\']\n+ unittest.main()\n'

diff -r 000000000000 -r dffc38727496 test/test_match_library.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_match_library.py Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,51 @@
+'''
+Created on Mar 6, 2012
+
+@author: marcelk
+'''
+from GCMS import match_library
+import unittest
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+
+
+class Test(unittest.TestCase):
+    '''
+    Tests the 'match_library' Galaxy tool
+    '''
+    nist_db = resource_filename(__name__, "data/RIDB_subset.txt")
+
+    def test_get_column_type(self):
+        '''
+        Tests the 'get_column_type' function that returns tuples of unique elements
+        for column types in the RI database
+        '''
+        galaxy_output = match_library.get_column_type(self.nist_db)
+        self.assertEqual([('Capillary(9999)', 'Capillary', False)], galaxy_output)
+
+    def test_filter_column(self):
+        '''
+        Tests the 'filter_column' function showing the column phase for all 'Capillary' columns
+        '''
+        galaxy_output = match_library.filter_column(self.nist_db, 'Capillary')
+        self.assertEqual([('Semi-standard non-polar(9999)', 'Semi-standard non-polar', False)], galaxy_output)
+
+    def test_filter_column2(self):
+        '''
+        Tests the 'filter_column' function showing all possibilities for columns having both the
+        'Capillary' and 'Semi-standard non-polar' properties
+        '''
+        galaxy_output = match_library.filter_column2(self.nist_db, 'Capillary', 'Semi-standard non-polar')
+        self.failUnless(('Apiezon M(6)', 'Apiezon M', False) in galaxy_output)
+
+    def test_count_occurrence(self):
+        '''
+        Tests the 'count_occurrence' function
+        '''
+        test_list = [2, 0, 0, 2, 1, 3, 4, 5, 3, 2, 3, 4, 5, 5, 4, 2, 5, 3, 4, 3, 5, 4, 2, 0, 4]
+        counts = match_library.count_occurrence(test_list)
+        self.assertEqual({0: 3, 1: 1, 2: 5, 3: 5, 4: 6, 5: 5}, counts)
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()

diff -r 000000000000 -r dffc38727496 test/test_query_mass_repos.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_mass_repos.py Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,62 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+from MS import query_mass_repos
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+
+
+    def test_simple(self):
+        '''
+        Simple initial test
+        '''
+        # Create out folder
+        outdir = "output/query_mass_repos/"
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+
+        #Build up arguments and run
+
+        #     input_file = sys.argv[1]
+        #     molecular_mass_col = sys.argv[2]
+        #     repository_file = sys.argv[3]
+        #     mass_tolerance = float(sys.argv[4])
+        #     output_result = sys.argv[5]
+
+        input_file = resource_filename(__name__, "data/service_query_tabular.txt")
+
+        molecular_mass_col = "mass (Da)"
+        dblink_file = resource_filename(__name__, "data/MFSearcher ExactMassDB service.txt")
+        output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt")
+
+
+
+
+        sys.argv = ['test',
+                    input_file,
+                    molecular_mass_col,
+                    dblink_file,
+                    '0.001',
+                    'ms',
+                    output_result]
+
+        # Execute main function with arguments provided through sys.argv
+        query_mass_repos.main()
+
+
+
+
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename:
+    '''
+    with open(filename) as handle:
+        return handle.read()

diff -r 000000000000 -r dffc38727496 test/test_query_metexp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_metexp.py Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,83 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+from GCMS import query_metexp
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+#    copied from test_export_to_metexp_tabular.py
+#    def test_MM_calculations(self):
+#         '''
+#         test the implemented method for MM calculations for
+#         given chemical formulas
+#         '''
+#         export_to_metexp_tabular.init_elements_and_masses_map()
+#
+#         formula = "C8H18O3"
+#         # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertEqual(162.26, result)
+#
+#         formula = "CH2O3Fe2Ni"
+#         # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertAlmostEqual(232.44, result, 2)
+#
+#
+#
+
+
+    def test_simple(self):
+        '''
+        Simple initial test
+        '''
+        # Create out folder
+        outdir = "output/metexp_query/"
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+
+        #Build up arguments and run
+
+        #         input_file = sys.argv[1]
+        #         molecular_mass_col = sys.argv[2]
+        #         formula_col = sys.argv[3]
+        #         metexp_dblink_file = sys.argv[4]
+        #         output_result = sys.argv[5]
+
+        input_file = resource_filename(__name__, "data/metexp_query_tabular.txt")
+        casid_col = "CAS"
+        formula_col = "FORMULA"
+        molecular_mass_col = "MM"
+        metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt")
+        output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt")
+
+        sys.argv = ['test',
+                    input_file,
+                    casid_col,
+                    formula_col,
+                    molecular_mass_col,
+                    metexp_dblink_file,
+                    'GC',
+                    output_result]
+
+        # Execute main function with arguments provided through sys.argv
+        query_metexp.main()
+
+        # TODO - asserts  (base them on DB being filled with test data form metexp unit test for upload method)
+        # PA
+
+
+
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename:
+    '''
+    with open(filename) as handle:
+        return handle.read()

diff -r 000000000000 -r dffc38727496 test/test_query_metexp_LARGE.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_metexp_LARGE.py Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,79 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+from GCMS import query_metexp
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+#     def test_MM_calculations(self):
+#         '''
+#         test the implemented method for MM calculations for
+#         given chemical formulas
+#         '''
+#         export_to_metexp_tabular.init_elements_and_masses_map()
+#
+#         formula = "C8H18O3"
+#         # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertEqual(162.26, result)
+#
+#         formula = "CH2O3Fe2Ni"
+#         # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+#         result = export_to_metexp_tabular.get_molecular_mass(formula)
+#         self.assertAlmostEqual(232.44, result, 2)
+#
+#
+#
+
+
+    def test_large(self):
+        '''
+        Simple test, but on larger set, last test executed in 28s
+        '''
+        # Create out folder
+        outdir = "output/metexp_query/"
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+
+        #Build up arguments and run
+
+        #         input_file = sys.argv[1]
+        #         molecular_mass_col = sys.argv[2]
+        #         formula_col = sys.argv[3]
+        #         metexp_dblink_file = sys.argv[4]
+        #         output_result = sys.argv[5]
+
+        input_file = resource_filename(__name__, "data/metexp_query_tabular_large.txt")
+        casid_col = "CAS"
+        formula_col = "FORMULA"
+        molecular_mass_col = "MM"
+        metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt")
+        output_result = resource_filename(__name__, outdir + "metexp_query_results_added_LARGE.txt")
+
+        sys.argv = ['test',
+                    input_file,
+                    casid_col,
+                    formula_col,
+                    molecular_mass_col,
+                    metexp_dblink_file,
+                    'GC',
+                    output_result]
+
+        # Execute main function with arguments provided through sys.argv
+        query_metexp.main()
+
+
+
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename:
+    '''
+    with open(filename) as handle:
+        return handle.read()

diff -r 000000000000 -r dffc38727496 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<tool_dependency>
+
+   <package name="R_bioc_metams" version="3.1.1">
+ <repository changeset_revision="4b30bdaf4dbd" name="prims_metabolomics_r_dependencies" owner="pieterlukasse" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" />
+ </package>
+ <readme>
+ This dependency:
+ Ensures R 3.1.1 installation is triggered (via dependency).
+ Ensures Bioconductor 3.0 and package metaMS, multtest and snow are installed.
+      </readme>
+</tool_dependency>
\ No newline at end of file

diff -r 000000000000 -r dffc38727496 xcms_differential_analysis.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_differential_analysis.r Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,85 @@
+## read args:
+args <- commandArgs(TRUE)
+#cat("args <- \"\"\n")
+## a xcms xset saved as .RData
+args.xsetData <- args[1]
+#cat(paste("args.xsetData <- \"", args[1], "\"\n", sep=""))
+
+args.class1 <- args[2]
+args.class2 <- args[3]
+#cat(paste("args.class1 <- \"", args[2], "\"\n", sep=""))
+#cat(paste("args.class2 <- \"", args[3], "\"\n", sep=""))
+
+args.topcount <- strtoi(args[4])
+#cat(paste("args.topcount <- ", args[4], "\n", sep=""))
+
+args.outTable <- args[5]
+
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+#cat(paste("args.htmlReportFile <- \"", args[6], "\"\n", sep=""))
+#cat(paste("args.htmlReportFile.files_path <- \"", args[7], "\"\n", sep=""))
+
+
+if (length(args) == 8)
+{
+ args.outLogFile <- args[8]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
+
+tryCatch(
+        {
+         library(metaMS)
+         library(xcms)
+         #library("R2HTML")
+
+ # load the xset data :
+ xsetData <- readRDS(args.xsetData)
+ # if here to support both scenarios:
+ if ("xcmsSet" %in% slotNames(xsetData) )
+ {
+ xsetData <- xsetData@xcmsSet
+ }
+
+
+ # info: levels(xcmsSet@phenoData$class) also gives access to the class names
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+ # set cairo as default for png plots:
+ png = function (...) grDevices::png(...,type='cairo')
+ # run diffreport
+ reporttab <- diffreport(xsetData, args.class1, args.class2, paste(args.htmlReportFile.files_path,"/fig", sep=""), args.topcount, metlin = 0.15, h=480, w=640)
+
+ # write out tsv table:
+ write.table(reporttab, args.outTable, sep="\t", row.names=FALSE)
+
+ message("\nGenerating report.........")
+
+ cat("<html><body><h1>Differential analysis report</h1>", file= args.htmlReportFile)
+ #HTML(reporttab[1:args.topcount,], file= args.htmlReportFile)
+ figuresPath <- paste(args.htmlReportFile.files_path, "/fig_eic", sep="")
+ message(figuresPath)
+ listOfFiles <- list.files(path = figuresPath)
+ for (i in 1:length(listOfFiles))
+ {
+ figureName <- listOfFiles[i]
+ # maybe we still need to copy the figures to the args.htmlReportFile.files_path
+ cat(paste("<img src='fig_eic/", figureName,"' />", sep=""), file= args.htmlReportFile, append=TRUE)
+ cat(paste("<img src='fig_box/", figureName,"' />", sep=""), file= args.htmlReportFile, append=TRUE)
+ }
+
+ message("finished generating report")
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+        error=function(cond) {
+            sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+            message("\nERROR: ===========\n")
+            print(cond)
+        }
+    )
\ No newline at end of file

diff -r 000000000000 -r dffc38727496 xcms_differential_analysis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_differential_analysis.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,55 @@
+<tool id="xcms_differential_analysis" name="XCMS Differential Analsysis"  version="0.0.4">
+ <description> Runs xcms diffreport function for differential Analsysis</description>
+ <requirements>
+ <requirement type="package" version="3.1.1">R_bioc_metams</requirement>
+ </requirements>
+ <command interpreter="Rscript">
+ xcms_differential_analysis.r
+     $xsetData
+ "$class1"
+ "$class2"
+ $topcount
+ $outTable
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outLogFile
+ </command>
+<inputs>
+
+ <param name="xsetData" type="data" format="rdata" label="xset xcms data file" help="E.g. output data file resulting from METAMS run"/>
+
+
+ <param name="class1" type="text" size="30" label="Class1 name" value="" help="Name of first class for the comparison"/>
+ <param name="class2" type="text" size="30" label="Class2 name" value="" help="Name of second class for the comparison"/>
+
+ <param name="topcount" type="integer" size="10" value="10" label="Number of items to return" help="Top X differential items. E.g. if 10, it will return top 10 differential items." />
+
+</inputs>
+<outputs>
+ <data name="outTable" format="tabular" label="${tool.name} on ${on_string} - Top differential items (TSV)"/>
+ <data name="outLogFile" format="txt" label="${tool.name} on ${on_string} - log (LOG)" hidden="True"/>
+ <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - differential report (HTML)"/>
+</outputs>
+<tests>
+ <test>
+ </test>
+</tests>
+<help>
+
+.. class:: infomark
+
+Runs xcms diffreport for showing the most significant differences between two sets/classes of samples. This tool also creates extracted ion chromatograms (EICs) for
+the most significant differences. The figure below shows an example of such an EIC.
+
+.. image:: $PATH_TO_IMAGES/diffreport.png
+
+
+
+
+  </help>
+  <citations>
+        <citation type="doi">10.1021/ac051437y</citation> 
+   </citations>
+</tool>
\ No newline at end of file

diff -r 000000000000 -r dffc38727496 xcms_get_alignment_eic.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_get_alignment_eic.r Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,153 @@
+# shows all alignment results in a rt region
+## read args:
+args <- commandArgs(TRUE)
+# xset data:
+args.xsetData <- args[1]
+
+args.rtStart  <- strtoi(args[2])
+args.rtEnd <- strtoi(args[3])
+
+# limit max diff to 600 and minNrSamples to at least 2 :
+if (args.rtEnd - args.rtStart > 600)
+ stop("The RT window should be <= 600")
+
+args.minNrSamples <- strtoi(args[4]) #min nr samples
+if (args.minNrSamples < 2)
+ stop("Set 'Minimum number of samples' to 2 or higher")
+
+
+args.sampleNames <- strsplit(args[5], ",")[[1]]
+# trim leading and trailing spaces:
+args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
+
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+
+
+if (length(args) == 8)
+{
+ args.outLogFile <- args[8]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
+
+
+
+tryCatch(
+        {
+         library(metaMS)
+
+ # load the xset data :
+ xsetData <- readRDS(args.xsetData)
+ # if here to support both scenarios:
+ if ("xcmsSet" %in% slotNames(xsetData) )
+ {
+ xsetData <- xsetData@xcmsSet
+ }
+
+ # report
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+ message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
+
+ write(paste("<html><body><h1>Extracted Ion Chromatograms (EIC) of alignments with peaks in ",args.minNrSamples, " or more samples</h1>"),
+       file=args.htmlReportFile)
+
+ gt <- groups(xsetData)
+ message("\nParsed groups... ")
+ groupidx1 <- which(gt[,"rtmed"] > args.rtStart & gt[,"rtmed"] < args.rtEnd & gt[,"npeaks"] >= args.minNrSamples)  # this should be only on samples selected
+
+ if (length(groupidx1) > 0)
+ {
+ message("\nGetting EIC... ")
+ eiccor <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames)
+ #eicraw <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames, rt = "raw")
+
+ #sampleNamesIdx <- which(sampnames(LC$xset@xcmsSet) %in% args.sampleNames, arr.ind = TRUE)
+ #or (from bioconductor code for getEIC):  NB: this is assuming sample indexes used in data are ordered after the order of sample names!!
+ sampNames <- sampnames(xsetData)
+ sampleNamesIdx <- match( args.sampleNames, sampNames)
+ message(paste("Samples: ", sampleNamesIdx))
+
+ #TODO html <- paste(html, "<table><tbody>")
+ message(paste("\nPlotting figures... "))
+
+ #get the mz list (interestingly, this [,"mz"] is relatively slow):
+ peakMzList <- xsetData@peaks[,"mz"]
+ peakSampleList <- xsetData@peaks[,"sample"]
+ #signal to noise list:
+ peakSnList <- xsetData@peaks[,"sn"]
+
+ message(paste("Total nr of peaks: ", length(peakMzList)))
+
+ for (i in 1:length(groupidx1))
+ {
+ groupMembers <- xsetData@groupidx[[groupidx1[i]]]
+
+ groupMzList <- ""
+ groupSampleList <- ""
+ finalGroupSize <- 0
+
+ for (j in 1:length(groupMembers))
+ {
+ #get only the peaks from the selected samples:
+ memberSample <- peakSampleList[groupMembers[j]]
+ memberSn <- peakSnList[groupMembers[j]]
+ if (!is.na(memberSn) && memberSample %in% sampleNamesIdx)
+ {
+ message(paste("Group: ", groupidx1[i], " / Member sample: ", memberSample))
+ memberMz <- peakMzList[groupMembers[j]]
+
+
+ if (finalGroupSize == 0)
+ {
+ groupMzList <- memberMz
+ groupSampleList <- sampNames[memberSample]
+ } else {
+ groupMzList <- paste(groupMzList,",",memberMz, sep="")
+ groupSampleList <- paste(groupSampleList,",",sampNames[memberSample], sep="")
+ }
+
+ finalGroupSize <- finalGroupSize +1
+ }
+ }
+ # here we do the real check on group size and only the groups that have enough
+ # members with signal to noise > 0 will be plotted here:
+ if (finalGroupSize >= args.minNrSamples)
+ {
+ message(paste("Plotting figure ",i, " of max ", length(groupidx1)," figures... "))
+
+ figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+ write(paste("<img src='", "figure", i,".png' />", sep="") ,
+        file=args.htmlReportFile, append=TRUE)
+
+ png( figureName, type="cairo", width=800 )
+ plot(eiccor, xsetData, groupidx = i)
+ devname = dev.off()
+
+ write(paste("<p>Alignment id: [", groupidx1[i], "]. m/z list of peaks in alignment with signal/noise <> NA:<br/>", groupMzList,"</p>", sep="") ,
+        file=args.htmlReportFile, append=TRUE)
+        write(paste("<p>m/z values found in the following samples respectively: <br/>", groupSampleList,"</p>", sep="") ,
+        file=args.htmlReportFile, append=TRUE)
+ }
+ }
+
+ }
+
+ write("</body><html>",
+        file=args.htmlReportFile, append=TRUE)
+ message("finished generating report")
+ # unlink(args.htmlReportFile)
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+        error=function(cond) {
+            sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+            message("\nERROR: ===========\n")
+            print(cond)
+        }
+    )

diff -r 000000000000 -r dffc38727496 xcms_get_alignment_eic.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_get_alignment_eic.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,73 @@
+<tool id="xcms_get_alignment_eic" name="XCMS Get Alignment EICs"  version="0.0.4">
+ <description> Extracts alignment EICs from feature alignment data</description>
+ <requirements>
+ <requirement type="package" version="3.1.1">R_bioc_metams</requirement>
+ </requirements>
+ <command interpreter="Rscript">
+ xcms_get_alignment_eic.r
+     $xsetData
+ $rtStart
+ $rtEnd
+ $minNrSamples
+ "$sampleNames"
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outLogFile
+ </command>
+<inputs>
+
+ <param name="xsetData" type="data" format="rdata" label="xset xcms data file" help="E.g. output data file resulting from METAMS run"/>
+
+
+ <param name="rtStart" type="integer" value="" size="10" label="RT start" help="Start of Retention Time region to plot" />
+ <param name="rtEnd" type="integer" value="" size="10"  label="RT end" help="End of Retention Time region to plot" />
+
+ <param name="minNrSamples" type="integer" size="10" value="10" label="Minimum number of samples in which aligned feature should be found" help="Can also read this as: Minimum
+ number of features in alignment. E.g. if set to 10, it means the alignment should consist of at least 10 peaks from 10 different samples aligned together." />
+
+ <param name="sampleNames" type="text" area="true" size="10x70" label="List of sample names"
+ value="sampleName1,sampleName2,etc"
+ help="Comma or line-separated list of sample names. Optional field where you can specify the subset of samples
+ to use for the EIC plots. NB: if your dataset has many samples, specifying a subset here can significantly speed up the processing time." >
+ <sanitizer>
+ 
+ <valid/>
+ <mapping initial="none">
+      <add source="
" target=","/>
+      <add source="" target=""/>
+      <add source=""" target=""/>
+ </mapping>
+ </sanitizer>
+ </param>
+
+
+</inputs>
+<outputs>
+ <data name="outLogFile" format="txt" label="${tool.name} on ${on_string} - log (LOG)" hidden="True"/>
+ <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - EIC(s) report (HTML)"/>
+</outputs>
+<tests>
+ <test>
+ </test>
+</tests>
+<help>
+
+.. class:: infomark
+
+This tool finds the alignments in the specified RT window and extracts alignment EICs from feature alignment data using XCMS's getEIC method.
+It produces a HTML report showing the EIC plots and the mass list of each alignment. The figure below shows an example of such an EIC plot, showing also the difference between
+two classes, with extra alignment information beneath it.
+
+.. image:: $PATH_TO_IMAGES/diffreport.png
+
+Alignment id: 1709. m/z list of peaks in alignment:
+614.002922098482,613.998019830021,614.000382307519,613.998229980469,613.998229980469
+
+
+  </help>
+  <citations>
+        <citation type="doi">10.1021/ac051437y</citation> 
+   </citations>
+</tool>
\ No newline at end of file

diff -r 000000000000 -r dffc38727496 xcms_get_mass_eic.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_get_mass_eic.r Sat Feb 07 22:02:00 2015 +0100

[

@@ -0,0 +1,162 @@
+## read args:
+args <- commandArgs(TRUE)
+# xset data:
+args.xsetData <- args[1]
+
+args.rtStart  <- strtoi(args[2])
+args.rtEnd <- strtoi(args[3])
+
+args.mzStart <- as.double(args[4])
+args.mzEnd <- as.double(args[5])
+# there are 2 options: specify a mz range or a mz list:
+if (args.mzStart < 0)
+{
+ args.mzList <- as.double(strsplit(args[6], ",")[[1]])
+ cat(typeof(as.double(strsplit(args[6], ",")[[1]])))
+ args.mzTolPpm <- as.double(args[7])
+ # calculate mzends based on ppm tol:
+ mzListEnd <- c()
+ mzListStart <- c()
+ for (i in 1:length(args.mzList))
+ {
+ mzEnd <- args.mzList[i] + args.mzList[i]*args.mzTolPpm/1000000.0
+ mzStart <- args.mzList[i] - args.mzList[i]*args.mzTolPpm/1000000.0
+ mzListEnd <- c(mzListEnd, mzEnd)
+ mzListStart <- c(mzListStart, mzStart)
+ }
+ str(mzListStart)
+ str(mzListEnd)
+} else {
+ mzListEnd <- c(args.mzEnd)
+ mzListStart <- c(args.mzStart)
+}
+
+args.sampleNames <- strsplit(args[8], ",")[[1]]
+# trim leading and trailing spaces:
+args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
+
+args.combineSamples <- args[9]
+args.rtPlotMode <- args[10]
+
+## report files
+args.htmlReportFile <- args[11]
+args.htmlReportFile.files_path <- args[12]
+
+
+if (length(args) == 13)
+{
+ args.outLogFile <- args[13]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
+
+# TODO - add option to do masses in same plot (if given in same line oid) or in separate plots
+# TODO2 - let it run in parallel
+
+tryCatch(
+        {
+         library(metaMS)
+
+ # load the xset data :
+ xsetData <- readRDS(args.xsetData)
+ # if here to support both scenarios:
+ if ("xcmsSet" %in% slotNames(xsetData) )
+ {
+ xsetData <- xsetData@xcmsSet
+ }
+
+ # report
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+ message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
+
+ html <- "<html><body><h1>Extracted Ion Chromatograms (EIC) matching criteria</h1>"
+
+ if (args.combineSamples == "No")
+ {
+ if (length(args.sampleNames) > 1 && length(mzListStart) > 1 && length(args.sampleNames) != length(mzListStart))
+ stop(paste("The number of sample names should match the number of m/z values in the list. Found ", length(mzListStart),
+           " masses while ",  length(args.sampleNames), " sample names were given."))
+
+    iterSize <- length(args.sampleNames)
+ # these can be set to 1 or 0 just as a trick to iterate OR not over the items. If the respective list is of length 1, only the first item should be used
+ fixSampleIdx <- 1
+ fixMzListIdx <- 1
+ if (length(args.sampleNames) == 1)
+ {
+ fixSampleIdx <- 0
+ iterSize <- length(mzListStart)
+ }
+ if (length(mzListStart) == 1)
+ {
+ fixMzListIdx <- 0
+ }
+ lineColors <- rainbow(iterSize)
+ for (i in 0:(iterSize-1))
+ {
+ message("\nGetting EIC... ")
+ eiccor <- getEIC(xsetData,
+ mzrange=matrix(c(mzListStart[i*fixMzListIdx+1],mzListEnd[i*fixMzListIdx+1]),nrow=1,ncol=2,byrow=TRUE),
+ rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE),
+ sampleidx=c(args.sampleNames[i*fixSampleIdx+1]), rt=args.rtPlotMode)
+
+ message("\nPlotting figures... ")
+ figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+ html <- paste(html,"<img src='", "figure", i,".png' /><br/>", sep="")
+ png( figureName, type="cairo", width=1100,height=250 )
+ #plot(eiccor, col=lineColors[i+1])
+ # black is better in this case:
+ plot(eiccor)
+ legend('topright', # places a legend at the appropriate place
+ legend=c(args.sampleNames[i*fixSampleIdx+1]), # puts text in the legend
+ lty=c(1,1), # gives the legend appropriate symbols (lines)
+ lwd=c(2.5,2.5))
+
+ devname = dev.off()
+ }
+
+ } else {
+ for (i in 1:length(mzListStart))
+ {
+ message("\nGetting EIC... ")
+ eiccor <- getEIC(xsetData,
+ mzrange=matrix(c(mzListStart[i],mzListEnd[i]),nrow=1,ncol=2,byrow=TRUE),
+ rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE),
+ sampleidx=args.sampleNames, rt = args.rtPlotMode)
+
+ #set size, set option (plot per sample, plot per mass)
+
+ message("\nPlotting figures... ")
+ figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+ html <- paste(html,"<img src='", "figure", i,".png' />", sep="")
+ png( figureName, type="cairo", width=1100,height=450 )
+ lineColors <- rainbow(length(args.sampleNames))
+ plot(eiccor, col=lineColors)
+ legend('topright', # places a legend at the appropriate place
+   legend=args.sampleNames, # puts text in the legend
+   lty=c(1,1), # gives the legend appropriate symbols (lines)
+   lwd=c(2.5,2.5),
+           col=lineColors)
+ devname = dev.off()
+ }
+ }
+ if (args.rtPlotMode == "corrected")
+ {
+ html <- paste(html,"<p>*rt values are corrected ones</p></body><html>")
+ }
+ html <- paste(html,"</body><html>")
+ message("finished generating report")
+ write(html,file=args.htmlReportFile)
+ # unlink(args.htmlReportFile)
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+        error=function(cond) {
+            sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+            message("\nERROR: ===========\n")
+            print(cond)
+        }
+    )

diff -r 000000000000 -r dffc38727496 xcms_get_mass_eic.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_get_mass_eic.xml Sat Feb 07 22:02:00 2015 +0100

@@ -0,0 +1,117 @@
+<tool id="xcms_get_mass_eic" name="XCMS Get EICs"  version="0.0.4">
+ <description> Extracts EICs for a given list of masses</description>
+ <requirements>
+ <requirement type="package" version="3.1.1">R_bioc_metams</requirement>
+ </requirements>
+ <command interpreter="Rscript">
+ xcms_get_mass_eic.r
+     $xsetData
+ $rtStart
+ $rtEnd
+ #if $massParameters.massParametersType == "window"
+ $massParameters.mzStart
+ $massParameters.mzEnd
+ -1
+ "."
+ #else
+ -1
+ -1
+ "$massParameters.mzList"
+ $massParameters.mzTolPpm
+ #end if
+ "$sampleNames"
+ $combineSamples
+ $rtPlotMode
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outLogFile
+ </command>
+<inputs>
+
+ <param name="xsetData" type="data" format="rdata" label="xset xcms data file" help="E.g. output data file resulting from METAMS run"/>
+
+
+ <param name="rtStart" type="integer" value="" size="10" label="RT start" help="Start of Retention Time region to plot" />
+ <param name="rtEnd" type="integer" value="" size="10"  label="RT end" help="End of Retention Time region to plot" />
+
+ <conditional name="massParameters">
+ <param name="massParametersType" type="select" size="50" label="Give masses as" >
+ <option value="list" selected="true">m/z list</option>
+ <option value="window" >m/z window</option>
+ </param>
+ <when value="list">
+ <param name="mzList" type="text" area="true" size="7x70" label="m/z list"
+ help="Comma or line-separated list of m/z values for which to plot an EIC. One EIC will be plotted for each m/z given here.">
+ <sanitizer>
+ 
+ <valid/>
+ <mapping initial="none">
+      <add source="
" target=","/>
+      <add source="" target=""/>
+      <add source=""" target=""/>
+ </mapping>
+ </sanitizer>
+ </param>
+ <param name="mzTolPpm" type="integer" size="10" value="5" label="m/z tolerance (ppm)"  />
+ </when>
+ <when value="window">
+ <param name="mzStart" type="float" value="" size="10" label="m/z start" help="Start of m/z window" />
+ <param name="mzEnd" type="float" value="" size="10"  label="m/z end" help="End of m/z window" />
+    </when>
+ </conditional>
+
+
+ <param name="sampleNames" type="text" area="true" size="10x70" label="List of sample names"
+ value="sampleName1,sampleName2,etc"
+ help="Comma or line-separated list of sample names. Here you can specify the subset of samples
+ to use for the EIC plots. NB: if your dataset has many samples, specifying a subset here can significantly speed up the processing time." >
+ <sanitizer>
+ 
+ <valid/>
+ <mapping initial="none">
+      <add source="
" target=","/>
+      <add source="" target=""/>
+      <add source=""" target=""/>
+ </mapping>
+ </sanitizer>
+ </param>
+
+ <param name="combineSamples" type="select" size="50" label="Combine samples in EIC"
+        help="Combining samples means plot contains the combined EIC of a m/z in the different samples. When NOT combining, each plot
+        only contains the EIC of the m/z in the respectively given sample (the sample name from the sample list in the same position as the
+        m/z in the m/z list.). Tip: use Yes for visualizing EIC of grouped masses (MsClust or CAMERA groups), use No for visualizing EICs of the same mass in
+        the different samples.">
+ <option value="No" selected="true">No</option>
+ <option value="Yes" >Yes</option>
+ </param>
+ <param name="rtPlotMode" type="select" size="50" label="RT plot mode"
+        help="Select whether EIC should be on original or raw Retention Time">
+ <option value="raw" selected="true">Raw</option>
+ <option value="corrected" >Corrected</option>
+ </param>
+</inputs>
+<outputs>
+ <data name="outLogFile" format="txt" label="${tool.name} on ${on_string} - log (LOG)" hidden="True"/>
+ <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - EIC(s) report (HTML)"/>
+</outputs>
+<tests>
+ <test>
+ </test>
+</tests>
+<help>
+
+.. class:: infomark
+
+This tool will plot EICs for a given list of masses (or a mass window) using XCMS's getEIC method.
+It produces a HTML report showing the EIC plots, one for each given mass. The figure below shows an example of such an EIC plot.
+
+.. image:: $PATH_TO_IMAGES/massEIC.png
+
+
+  </help>
+  <citations>
+        <citation type="doi">10.1021/ac051437y</citation> 
+   </citations>
+</tool>
\ No newline at end of file