Next changeset: 1:e467a6c83d67 (2014-01-16)

Commit message:
Initial commit to toolshed

Added:
LICENSE, MsClust.jar, NOTICE, README.rst, Rscripts/filter-RIDB.R, Rscripts/ridb-regression.R, __init__.py, combine_output.py, combine_output.xml, create_model.xml, datatypes_conf.xml, export_to_metexp_tabular.py, library_lookup.py, library_lookup.xml, match_library.py, msclust2.0.1.xml, rankfilterGCMS_tabular.xml, rankfilter_GCMS/__init__.py, rankfilter_GCMS/pdfread.py, rankfilter_GCMS/pdftotabular.py, rankfilter_GCMS/rankfilter.py, rankfilter_GCMS/test/__init__.py, rankfilter_GCMS/test/test_pdfread.py, rankfilter_GCMS/test/test_rankfilter.py, rankfilter_text2tabular.xml, select_on_rank.py, select_on_rank.xml, static/images/confidence_and_slope_params_explain.png, static/images/msclust_summary.png, static/images/sample_SIM.png, static/images/sample_sel_and_peak_height_correction.png, test/__init__.py, test/integration_tests.py, test/test_combine_output.py, test/test_export_to_metexp_tabular.py, test/test_library_lookup.py, test/test_match_library.py
diff -r 000000000000 -r 9d5f4f5f764b LICENSE
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
[... standard Apache License, Version 2.0 text (202 lines in total); diff truncated in this changeset view ...]

diff -r 000000000000 -r 9d5f4f5f764b MsClust.jar
Binary file MsClust.jar has changed

diff -r 000000000000 -r 9d5f4f5f764b NOTICE
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/NOTICE  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,14 @@
+PRIMS-metabolomics toolset & Galaxy wrappers
+============================================
+
+Metabolomics module of Plant Research International's Mass Spectrometry (PRIMS) toolsuite.
+This toolset consists of custom tools to enable metabolite identifications and
+Retention Index (RI) based Quality Control (RIQC) for Mass Spectrometry metabolomics data.
+
+Copyright:
+* 2012: NIST_UTIL and RIQC tools: Copyright (c) 2012 Maarten Kooyman and Marcel Kempenaar, NBIC BRS
+* 2013: all tools: Copyright (c) 2013 by Pieter Lukasse, Plant Research International (PRI),
+  Wageningen, The Netherlands. All rights reserved. See the license text below.
+
+Galaxy wrappers and installation are available from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/pieterlukasse/prims_metabolomics
\ No newline at end of file

diff -r 000000000000 -r 9d5f4f5f764b README.rst
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,71 @@
+PRIMS-metabolomics toolset & Galaxy wrappers
+============================================
+
+Metabolomics module of Plant Research International's Mass Spectrometry (PRIMS) toolsuite.
+This toolset consists of custom tools to enable metabolite identifications and
+Retention Index (RI) based Quality Control (RIQC) for Mass Spectrometry metabolomics data.
+
+Copyright:
+* 2012: NIST_UTIL and RIQC tools: Copyright (c) 2012 Maarten Kooyman and Marcel Kempenaar, NBIC BRS
+* 2013: all tools: Copyright (c) 2013 by Pieter Lukasse, Plant Research International (PRI),
+  Wageningen, The Netherlands. All rights reserved. See the license text below.
+
+Galaxy wrappers and installation are available from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/pieterlukasse/prims_metabolomics
+
+History
+=======
+
+============== ======================================================================
+Date           Changes
+-------------- ----------------------------------------------------------------------
+January 2014   * first release via Tool Shed, combining the RIQC and MsClust in a
+                 single package (this package)
+               * integration with METEXP software (data store for metabolomics
+                 experiments with respective metadata and identifications)
+2013           * hand-over of the NIST_UTIL and RIQC tools from the NBIC team to
+                 Plant Research International
+2012           * development of MsClust 2.0, making it also suitable for Galaxy
+<2011          * development and publication of MsClust 1.0
+============== ======================================================================
+
+Tool Versioning
+===============
+
+PRIMS tools will have versions of the form X.Y.Z. Versions
+differing only after the second decimal should be completely
+compatible with each other. Breaking changes should result in an
+increment of the number before and/or after the first decimal. All
+tools of version less than 1.0.0 should be considered beta.
+
+
+Bug Reports & other questions
+=============================
+
+For the time being issues can be reported via the contact form at:
+http://www.wageningenur.nl/en/Persons/PNJ-Pieter-Lukasse.htm
+
+Developers, Contributions & Collaborations
+==========================================
+
+If you wish to join forces and collaborate on some of the
+tools do not hesitate to contact Pieter Lukasse via the contact form above.
+
+
+License (Apache, Version 2.0)
+=============================
+
+Copyright 2013 Pieter Lukasse, Plant Research International (PRI).
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this software except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
\ No newline at end of file

diff -r 000000000000 -r 9d5f4f5f764b Rscripts/filter-RIDB.R
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/Rscripts/filter-RIDB.R  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,56 @@
+##
+#
+# Removes duplicates from a RI-database
+#
+# Usage:
+# Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt
+#
+##
+
+# Commandline arguments
+args <- commandArgs(TRUE)
+ridb <- args[1]
+out_file <- args[2]
+
+# Function to check duplicates
+duplicates <- function(dat) {
+    s <- do.call("order", as.data.frame(dat))
+    non.dup <- !duplicated(dat[s, ])
+    orig.ind <- s[non.dup]
+    first.occ <- orig.ind[cumsum(non.dup)]
+    first.occ[non.dup] <- NA
+    first.occ[order(s)]
+}
+
+# Load CSV file
+ridb <- read.csv(ridb, header=TRUE, sep="\t")
+## Filters on: CAS, FORMULA, Column type, Column phase type, Column name
+filter_cols <- c(1, 3, 5, 6, 7)
+cat("RIDB dimensions: ")
+print(dim(ridb))
+deleted <- NULL
+cat("Checking for duplicates...")
+dups <- duplicates(ridb[, filter_cols])
+cat("\t[DONE]\nRemoving duplicates...")
+newridb <- ridb
+newridb["min"] <- NA
+newridb["max"] <- NA
+newridb["orig.columns"] <- NA
+for (i in unique(dups)) {
+    if (!is.na(i)) {
+        rows <- which(dups == i)
+        duprows <- ridb[c(i, rows), ]
+        # Replace duplicate rows with one row containing the median value
+        new_RI <- median(duprows$RI)
+        newridb$RI[i] <- new_RI
+        newridb$min[i] <- min(duprows$RI)
+        newridb$max[i] <- max(duprows$RI)
+        newridb$orig.columns[i] <- paste(rows, collapse=",")
+        deleted <- c(deleted, rows)
+    }
+}
+cat("\t\t[DONE]\nCreating new dataset...")
+out_ridb <- newridb[-deleted, ]
+cat("\t\t[DONE]\nWriting new dataset...")
+write.table(out_ridb, na='', file=out_file, quote=T, sep="\t", row.names=F)
+cat("\t\t[DONE]\n")
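The R script above collapses duplicate RIDB entries into a single row carrying the median RI, plus min/max bookkeeping. A minimal Python 3 sketch of the same idea (not part of this changeset; the input path and the 'CAS'/'RI' column names are illustrative)::

    import csv
    from collections import defaultdict
    from statistics import median

    def collapse_duplicates(rows, key_fields):
        """Group rows on key_fields; merge each group into one row with the median RI."""
        groups = defaultdict(list)
        for row in rows:
            groups[tuple(row[f] for f in key_fields)].append(row)
        merged = []
        for group in groups.values():
            ris = [float(row['RI']) for row in group]
            first = dict(group[0])            # keep the first occurrence as the merged row
            first['RI'] = median(ris)
            first['min'], first['max'] = min(ris), max(ris)
            merged.append(first)
        return merged

    # Hypothetical input file with at least 'CAS' and 'RI' columns:
    with open('retention_db.txt') as handle:
        rows = list(csv.DictReader(handle, delimiter='\t'))
    print(len(collapse_duplicates(rows, ['CAS'])))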

diff -r 000000000000 -r 9d5f4f5f764b Rscripts/ridb-regression.R
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/Rscripts/ridb-regression.R  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,208 @@
+##
+#
+# Performs regression analysis using either 3rd degree polynomial- or linear-method
+#
+##
+
+# Commandline arguments
+args <- commandArgs(TRUE)
+if (length(args) < 7)
+    stop(cat("Missing arguments, usage:\n\tRscript ridb-regression.R RI-database ",
+             "output_file logfile min_residuals range_mod pvalue rsquared method ",
+             "plot(yes/no) plot_archive"))
+
+ridb <- args[1]
+out_file <- args[2]
+logfile <- args[3]
+min_residuals <- as.integer(args[4])
+range_mod <- as.integer(args[5])
+pvalue <- as.double(args[6])
+rsquared <- as.double(args[7])
+method <- args[8]
+plot <- tolower(args[9])
+if (plot == 'true')
+    plot_archive = args[10]
+
+# Do not show warnings etc.
+sink(file='/dev/null')
+
+progress <- c()
+logger <- function(logdata) {
+    ## Logs progress, adds a timestamp for each event
+    #cat(paste(Sys.time(), "\t", logdata, "\n", sep="")) ## DEBUG
+    progress <<- c(progress, paste(Sys.time(), "\t", logdata, sep=""))
+}
+
+logger("Reading Retention Index Database..")
+
+# Read Retention Index Database
+ridb <- read.csv(ridb, header=TRUE, sep="\t")
+logger(paste("\t", nrow(ridb), "records read.."))
+# Get a unique list
+gc_columns <- unique(as.vector(as.matrix(ridb['Column.name'])[,1]))
+cas_numbers <- unique(as.vector(as.matrix(ridb['CAS'])[,1]))
+
+add_poly_fit <- function(fit, gc1_index, gc2_index, range) {
+    pval = anova.lm(fit)$Pr
+    r.squared = summary(fit)$r.squared
+
+    data = rep(NA, 11)
+    # Append results to matrix
+    data[1] = gc_columns[gc1_index]   # Column 1
+    data[2] = gc_columns[gc2_index]   # Column 2
+    data[3] = coefficients(fit)[1]    # The 4 coefficients
+    data[4] = coefficients(fit)[2]
+    data[5] = coefficients(fit)[3]
+    data[6] = coefficients(fit)[4]
+    data[7] = range[1]                # Left limit
+    data[8] = range[2]                # Right limit
+    data[9] = length(fit$residuals)   # Number of datapoints analysed
+    data[10] = pval[1]                # p-value for resulting fitting
+    data[11] = r.squared              # R-squared
+    return(data)
+}
+
+
+add_linear_fit <- function(fit, gc1_index, gc2_index, range) {
+    pval = anova.lm(fit)$Pr
+    r.squared = summary(fit)$r.squared
+
+    data = rep(NA, 9)
+    # Append results to matrix
+    data[1] = gc_columns[gc1_index]   # Column 1
+    data[2] = gc_columns[gc2_index]   # Column 2
+    data[3] = coefficients(fit)[1]    # The 2 coefficients
+    data[4] = coefficients(fit)[2]
+    data[7] = length(fit$residuals)   # Number of datapoints analysed
+    data[8] = pval[1]                 # p-value for resulting fitting
+    data[9] = r.squared               # R-squared
+    return(data)
+}
+
+
+add_fit <- function(fit, gc1_index, gc2_index, range, method) {
+    if (method == 'poly')
+        return(add_poly_fit(fit, gc1_index, gc2_index, range))
+    else
+        return(add_linear_fit(fit, gc1_index, gc2_index, range))
+}
+
+
+plot_fit <- function(ri1, ri2, gc1_index, gc2_index, coeff, range, method) {
+    if (method == 'poly')
+        pol <- function(x) coeff[4]*x^3 + coeff[3]*x^2 + coeff[2]*x + coeff[1]
+    else
+        pol <- function(x) coeff[2]*x + coeff[1]
+    pdf(paste('regression_model_',
+              make.names(gc_columns[gc1_index]), '_vs_',
+              make.names(gc_columns[gc2_index]), '.pdf', sep=''))
+    curve(pol, 250:3750, col="red", lwd=2.5, main='Regression Model', xlab=gc_columns[gc1_index],
+          ylab=gc_columns[gc2_index], xlim=c(250, 3750), ylim=c(250, 3750))
+    points(ri1, ri2, lwd=0.4)
+    # Add vertical lines showing left- and right limits when using poly method
+    if (method == 'poly')
+        abline(v=range, col="grey", lwd=1.5)
+    dev.off()
+}
+
+# Initialize output dataframe
+if (method == 'poly') {
+    m <- data.frame(matrix(ncol = 11, nrow = 10))
+} else {
+    m <- data.frame(matrix(ncol = 9, nrow = 10))
+}
+
+
+get_fit <- function(gc1, gc2, method) {
+    if (method == 'poly')
+        return(lm(gc1 ~ poly(gc2, 3, raw=TRUE)))
+    else
+        return(lm(gc1 ~ gc2))
+}
+
+# Permutate
+k <- 1
+logger(paste("Permutating (with ", length(gc_columns), " GC-columns)..", sep=""))
+
+for (i in 1:(length(gc_columns)-1)) {
+    logger(paste("\tCalculating model for ", gc_columns[i], "..", sep=""))
+    breaks <- 0
+    for (j in (i+1):length(gc_columns)) {
+        col1 = ridb[which(ridb['Column.name'][,1] == gc_columns[i]),]
+        col2 = ridb[which(ridb['Column.name'][,1] == gc_columns[j]),]
+
+        # Find CAS numbers for which both columns have data (intersect)
+        cas_intersect = intersect(col1[['CAS']], col2[['CAS']])
+
+        # Skip if number of shared CAS entries is < cutoff
+        if (length(cas_intersect) < min_residuals) {
+            breaks = breaks + 1
+            next
+        }
+        # Gather Retention Indices
+        col1_data = col1[['RI']][match(cas_intersect, col1[['CAS']])]
+        col2_data = col2[['RI']][match(cas_intersect, col2[['CAS']])]
+
+        # Calculate the range within which regression is possible (and move if 'range_mod' != 0)
+        range = c(min(c(min(col1_data), min(col2_data))), max(c(max(col1_data), max(col2_data))))
+        if (range_mod != 0) {
+            # Calculate percentage and add/subtract from range
+            perc = diff(range) / 100
+            perc_cutoff = range_mod * perc
+            range = as.integer(range + c(perc_cutoff, -perc_cutoff))
+        }
+
+        # Calculate model for column1 vs column2 and plot if requested
+        fit = get_fit(col1_data, col2_data, method)
+        m[k,] = add_fit(fit, i, j, range, method)
+
+        if (plot == 'true')
+            plot_fit(col1_data, col2_data, i, j, coefficients(fit), range, method)
+
+        # Calculate model for column2 vs column1 and plot if requested
+        fit = get_fit(col2_data, col1_data, method)
+        m[k + 1,] = add_fit(fit, j, i, range, method)
+
+        if (plot == 'true')
+            plot_fit(col2_data, col1_data, j, i, coefficients(fit), range, method)
+
+        k = k + 2
+    }
+    logger(paste("\t\t", breaks, " comparisons have been skipped due to nr. of datapoints < cutoff", sep=""))
+}
+
+# Filter on pvalue and R-squared
+logger("Filtering on pvalue and R-squared..")
+if (method == 'poly') {
+    pval_index <- which(m[,10] < pvalue)
+    rsquared_index <- which(m[,11] > rsquared)
+} else {
+    pval_index <- which(m[,8] < pvalue)
+    rsquared_index <- which(m[,9] > rsquared)
+}
+logger(paste(nrow(m) - length(pval_index), " models discarded due to pvalue > ", pvalue, sep=""))
+logger(paste(nrow(m) - length(rsquared_index), " models discarded due to R-squared < ", rsquared, sep=""))
+
+# Remaining rows
+index = unique(c(pval_index, rsquared_index))
+
+# Reduce dataset
+m = m[index,]
+sink()
+
+# Place plots in the history as a ZIP file
+if (plot == 'true') {
+    logger("Creating archive with model graphics..")
+    system(paste("zip -9 -r models.zip *.pdf > /dev/null", sep=""))
+    system(paste("cp models.zip ", plot_archive, sep=""))
+}
+
+# Save dataframe as tab separated file
+logger("All done, saving data..")
+header = c("Column1", "Column2", "Coefficient1", "Coefficient2", "Coefficient3", "Coefficient4",
+           "LeftLimit", "RightLimit", "Residuals", "pvalue", "Rsquared")
+if (method != 'poly')
+    header = header[c(1:4, 7:11)]
+write(progress, logfile)
+write.table(m, file=out_file, sep="\t", quote=FALSE, col.names=header, row.names=FALSE)
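The `range_mod` handling above shrinks the usable RI interval by a percentage of its span on each side before a model is stored. A small Python 3 worked example of that arithmetic (values invented; not part of this changeset)::

    ri_range = (500.0, 3000.0)   # shared min/max RI of two GC columns
    range_mod = 10               # shrink each side by 10% of the span
    perc_cutoff = range_mod * (ri_range[1] - ri_range[0]) / 100   # 250.0
    adjusted = (ri_range[0] + perc_cutoff, ri_range[1] - perc_cutoff)
    print(adjusted)              # (750.0, 2750.0)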

diff -r 000000000000 -r 9d5f4f5f764b __init__.py
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/__init__.py  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,6 @@
+'''
+Module containing Galaxy tools for the GC/MS pipeline
+Created on Mar 6, 2012
+
+@author: marcelk
+'''

diff -r 000000000000 -r 9d5f4f5f764b combine_output.py
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_output.py  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,230 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
+'''
+
+import csv
+import re
+import sys
+import math
+import pprint
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def _process_data(in_csv):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = {}
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+
+def _merge_data(rankfilter, caslookup):
+    '''
+    Merges data from both input dictionaries based on the Centrotype field. This method will
+    build up a new list containing the merged hits as the items.
+    @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
+    @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
+    '''
+    # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lengths:
+    if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
+        raise Exception('rankfilter and caslookup files should have the same nr of rows/records ')
+
+    merged = []
+    processed = {}
+    for compound_id_idx in xrange(len(rankfilter['ID'])):
+        compound_id = rankfilter['ID'][compound_id_idx]
+        if not compound_id in processed:
+            # keep track of processed items to not repeat them
+            processed[compound_id] = compound_id
+            # get centrotype nr
+            centrotype = compound_id.split('-')[0]
+            # Get the indices for current compound ID in both data-structures for proper matching
+            rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
+            cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
+
+            merged_hits = []
+            # Combine hits
+            for hit in xrange(len(rindex)):
+                # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
+                # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
+                # corresponding value in the rankfilter or caslookup tables; i.e.
+                # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
+                # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+                rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
+                cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
+
+                merged_hit = _add_hit(rf_record, cl_record)
+                merged_hits.append(merged_hit)
+
+            merged.append(merged_hits)
+
+    return merged, len(rindex)
+
+
+def _add_hit(rankfilter, caslookup):
+    '''
+    Combines single records from both the RankFilter- and CasLookup-tools
+    @param rankfilter: record (dictionary) of one compound in the RankFilter output
+    @param caslookup: matching record (dictionary) of one compound in the CasLookup output
+    '''
+    # The ID in the RankFilter output contains the following 5 fields:
+    rf_id = rankfilter['ID'].split('-')
+    try:
+        name, formula = _remove_formula(rankfilter['Name'])
+        hit = [rf_id[0], # Centrotype
+               rf_id[1], # cent.Factor
+               rf_id[2], #
+
[... diff truncated in this changeset view ...]
+
+               rankfilter['%rel.err'],
+               rankfilter['Synonyms']]
+    except KeyError as error:
+        print "Problem reading in data from input file(s):\n",
+        print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n"
+        print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n"
+        raise error
+
+    return hit
+
+
+def _remove_formula(name):
+    '''
+    The RankFilter Name field often contains the Formula as well, this function removes it from the Name
+    @param name: complete name of the compound from the RankFilter output
+    '''
+    name = name.split()
+    poss_formula = name[-1]
+    match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
+    if match:
+        return ' '.join(name[:-1]), poss_formula
+    else:
+        return ' '.join(name), False
+
+
+def _get_default_caslookup():
+    '''
+    The Cas Lookup tool might not have found all compounds in the library searched,
+    this default dict will be used to combine with the Rank Filter output
+    '''
+    return {'FORMULA': 'N/A',
+            'RI': '0.0',
+            'Regression.Column.Name': 'None',
+            'min': '0.0',
+            'max': '0.0',
+            'nr.duplicates': '0',
+            'Column.phase.type': 'N/A',
+            'Column.name': 'N/A'}
+
+
+def _save_data(data, nhits, out_csv_single, out_csv_multi):
+    '''
+    Writes tab-separated data to file
+    @param data: dictionary containing merged dataset
+    @param out_csv: output csv file
+    '''
+    header = ['Centrotype',
+              'cent.Factor',
+              'scan nr.',
+              'R.T. (umin)',
+              'nr. Peaks',
+              'R.T.',
+              'Name',
+              'FORMULA',
+              'Library',
+              'CAS',
+              'Forward',
+              'Reverse',
+              'Avg. (Forward, Reverse)',
+              'RIexp',
+              'RI',
+              'RIsvr',
+              'RIexp - RIsvr',
+              'RI - RIexp',
+              'Regression.Column.Name',
+              'min',
+              'max',
+              'nr.duplicates',
+              'Column.phase.type',
+              'Column.name',
+              'Rank',
+              '%rel.err',
+              'Synonyms']
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv_single, 'wb')
+    outfile_multi_handle = open(out_csv_multi, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+    output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(header)
+    output_multi_handle.writerow(header * nhits)
+    # Combine all hits for each centrotype into one line
+    line = []
+    for centrotype_idx in xrange(len(data)):
+        for hit in data[centrotype_idx]:
+            line.extend(hit)
+        output_multi_handle.writerow(line)
+        line = []
+
+    # Write one line for each hit
+    for centrotype_idx in xrange(len(data)):
+        for hit in data[centrotype_idx]:
+            output_single_handle.writerow(hit)
+
+
+def main():
+    '''
+    Combine Output main function
+    It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers"
+    NB: the caslookup_result_file will typically have fewer lines than
+    rankfilter_result_file, so the merge has to consider this as well. The final file
+    should have the same nr of lines as rankfilter_result_file.
+    '''
+    rankfilter_result_file = sys.argv[1]
+    caslookup_result_file = sys.argv[2]
+    output_single_csv = sys.argv[3]
+    output_multi_csv = sys.argv[4]
+
+    # Read RankFilter and CasLookup output files
+    rankfilter = _process_data(rankfilter_result_file)
+    caslookup = _process_data(caslookup_result_file)
+    merged, nhits = _merge_data(rankfilter, caslookup)
+    _save_data(merged, nhits, output_single_csv, output_multi_csv)
+
+
+if __name__ == '__main__':
+    main()
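combine_output.py joins the two tables on the centrotype number, which is the first of the five '-'-separated fields in a RankFilter ID. A Python 3 sketch of that keying (IDs and values are invented; not part of this changeset)::

    # A RankFilter ID like '123-0.9-456-12.3-5' starts with the centrotype number,
    # which is the join key into the CasLookup 'Centrotype' column.
    rankfilter_ids = ['123-0.9-456-12.3-5', '123-0.9-456-12.3-5', '124-0.8-500-13.1-4']
    caslookup_centrotypes = ['123', '124']

    def centrotype(compound_id):
        return compound_id.split('-')[0]

    # Index CasLookup rows by centrotype for matching:
    index = {}
    for row, key in enumerate(caslookup_centrotypes):
        index.setdefault(key, []).append(row)

    for compound_id in rankfilter_ids:
        print(compound_id, '->', 'caslookup rows', index.get(centrotype(compound_id), []))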

diff -r 000000000000 -r 9d5f4f5f764b combine_output.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_output.xml  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,35 @@
+<tool id="combine_output" name="RIQC-Combine RankFilter and CasLookup output" version="1.0.2">
+    <description>Perform a combination of output data from the RankFilter and CasLookup tools</description>
+    <command interpreter="python">
+        combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi
+    </command>
+    <inputs>
+        <param format="tabular" name="caslookup_in" type="data" label="RIQC-Lookup RI for CAS output"
+               help="Select the output file from the CasLookup tool"/>
+        <param format="tabular" name="rankfilter_in" type="data" label="RIQC-RankFilter output"
+               help="Select the output file from the RankFilter tool"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" label="${tool.name} (Single) on ${on_string}" name="out_single" />
+        <data format="tabular" label="${tool.name} (Multi) on ${on_string}" name="out_multi" />
+    </outputs>
+    <help>
+Performs a combination of output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files.
+
+The files produced contain either one hit per line (Single) or all hits for a compound
+combined on a single line (Multi).
+
+.. class:: infomark
+
+**Notes**
+
+The input data should be produced by the RankFilter and 'Lookup RI for CAS' tools provided on this Galaxy server with the
+original headers kept intact. Processing steps include:
+
+    - Added columns showing the average Forward/Reverse values, RIexp - RIsvr and RI - RIexp values
+    - The ID column of the RankFilter tool output is split into 'Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)'
+      and 'nr. Peaks' fields.
+    - The formula is split off the 'Name' field in the RankFilter output
+
+    </help>
+</tool>
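The 'Single' versus 'Multi' layouts described in the help text can be pictured with a toy structure (Python 3, invented rows; not part of this changeset): the single file holds one hit per row, while the multi file concatenates all hits of a centrotype onto one row::

    hits = {'123': [['123', 'compoundA'], ['123', 'compoundB']]}   # two hits, one centrotype
    single = [hit for merged in hits.values() for hit in merged]   # one hit per row
    multi = [sum(merged, []) for merged in hits.values()]          # all hits on one row
    print(single)   # [['123', 'compoundA'], ['123', 'compoundB']]
    print(multi)    # [['123', 'compoundA', '123', 'compoundB']]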

diff -r 000000000000 -r 9d5f4f5f764b create_model.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/create_model.xml  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,78 @@
+<tool id="create_poly_model" name="RIQC-Create Regression Model" version="1.0.2">
+    <description>Generate coefficients to enable the regression from one GC-column
+    to another GC-column</description>
+    <command interpreter="Rscript">Rscripts/ridb-regression.R
+        $ridb
+        $out_model
+        $out_log
+        $min_residuals
+        $range_mod
+        $pvalue
+        $rsquared
+        $method
+        $plot
+        #if $plot
+            $model_graphics
+        #end if
+    </command>
+    <inputs>
+        <param format="tabular" name="ridb" type="select" label="Retention Index (RI) and GC columns Library file"
+               help="Select the RI library file of which all GC columns and their RI values
+                     will be used to create a model"
+               dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/>
+
+        <param name="method" type="select" label="Select regression method"
+               help="Method to use for calculating the model" >
+            <option value="poly" selected="True">Polynomial (3rd degree)</option>
+            <option value="linear">Linear</option>
+        </param>
+        <param name="min_residuals" type="integer" value="10" optional="False"
+               label="Minimum number of residuals" help="The minimum number of residuals
+               (datapoints) that both columns should have in common when calculating
+               the model" />
+        <param name="range_mod" type="integer" value="0" optional="False"
+               label="Range modifier" help="Moves the range of the usable RI space by the
+               given percentage. Set to 0 to use the full range of available data." />
+        <param name="pvalue" type="float" value="0.05" optional="False" min="0" max="1"
+               label="Pvalue to filter on" help="Set the upper limit for the pvalue (calculated
+               by performing an ANOVA analysis on the created model). All models with higher
+               pvalues are discarded." />
+        <param name="rsquared" type="float" value="0.95" optional="False" min="0" max="1"
+               label="R-squared to filter on" help="Set the lower limit for the R-squared,
+               all models with lower values are discarded." />
+        <param name="plot" type="boolean" label="Create a separate plot for each model"
+               help="This will create a ZIP file in the history containing PDF plots" />
+    </inputs>
+    <code file="match_library.py" />
+    <outputs>
+        <data format="zip" label="Model Graphics of ${on_string}" name="model_graphics" >
+            <filter>(plot)</filter>
+        </data>
+        <data format="tabular" label="Regression logfile of ${on_string}" name="out_log" />
+        <data format="tabular" label="Regression model of ${on_string}" name="out_model" />
+    </outputs>
+    <help>
+Calculates regression models for a permutation of all GC columns contained in the selected
+RI database file. The method used for creating the model is either based on a 3rd degree
+polynomial or a standard linear model.
+
+The *Minimum number of residuals* option will only allow regression if the columns it is based
+on have at least that number of datapoints on the same compounds.
+
+Filtering is possible by setting an upper limit for the *p-value* and / or a lower limit for
+the *R squared* value. The produced logfile will state how many models have been discarded due
+to this filtering. The output model file also includes the p-value and R squared value for
+each created model.
+
+Graphical output of the models is available by selecting the plot option, which shows the
+data points used for the model as well as the fit itself and the range of data that will
+be usable.
+
+.. class:: infomark
+
+**Notes**
+
+The output file produced by this tool is required as input for the CasLookup tool when
+selecting to apply regression when finding hits in the RIDB.
+    </help>
+</tool>
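A compact Python 3 illustration of the p-value/R-squared filtering the help text describes, using the default thresholds above (the model rows are invented; not part of this changeset)::

    models = [
        {'pvalue': 0.01, 'rsquared': 0.98},   # kept
        {'pvalue': 0.20, 'rsquared': 0.99},   # discarded: pvalue > 0.05
        {'pvalue': 0.01, 'rsquared': 0.80},   # discarded: rsquared < 0.95
    ]
    kept = [m for m in models if m['pvalue'] < 0.05 and m['rsquared'] > 0.95]
    print(len(kept))   # 1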

diff -r 000000000000 -r 9d5f4f5f764b datatypes_conf.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<datatypes>
+    <datatype_files>
+    </datatype_files>
+    <registration display_path="display_applications">
+        <!-- type for the pdf -->
+        <datatype extension="pdf" type="galaxy.datatypes.data:Data" mimetype="application/octet-stream"
+                  display_in_upload="true" subclass="true"/>
+        <datatype extension="msclust.csv" type="galaxy.datatypes.tabular:Tabular" mimetype="text/csv" display_in_upload="true" subclass="true">
+        </datatype>
+    </registration>
+</datatypes>
\ No newline at end of file

diff -r 000000000000 -r 9d5f4f5f764b export_to_metexp_tabular.py
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.py  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the MetExp database.
+
+RankFilter and CasLookup are already combined by combine_output.py, so here we use
+that result. Furthermore, the MsClust spectra file (.MSP) and one of the MsClust
+quantification files are combined with the combine_output.py result as well.
+
+Extra calculations performed:
+- The column MW is also added here and is derived from the column FORMULA found
+  in the combine_output.py result.
+
+So in total we merge 3 files here and calculate one new column.
+'''
+
+import csv
+import sys
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2013, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_data(in_csv, delim='\t'):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim))
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+ONE_TO_ONE = 'one_to_one'
+N_TO_ONE = 'n_to_one'
+
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE):
+    '''
+    Merges data from both input dictionaries based on the link fields. This method will
+    build up a new list containing the merged hits as the items.
+    @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
+    @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
+    '''
+    # TODO: test for correct input files -> same link_field values should be there
+    # (test at least number of unique link_field values):
+    #
+    # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
+    #     raise Exception('input files should have the same nr of key values ')
+
+    merged = []
+    processed = {}
+    for link_field_set1_idx in xrange(len(set1[link_field_set1])):
+        link_field_set1_value = set1[link_field_set1][link_field_set1_idx]
+        if not link_field_set1_value in processed:
+            # keep track of processed items to not repeat them
+            processed[link_field_set1_value] = link_field_set1_value
+
+            # Get the indices for current link_field_set1_value in both data-structures for proper matching
+            set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
+            set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value) == True]
+
+            merged_hits = []
+            # Combine hits
+            for hit in xrange(len(set1index)):
+                # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
+                # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
+                # corresponding value in the rankfilter or caslookup tables; i.e.
+                # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
+                # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+                # It just ensures the entry is made available as a plain named array for easy access.
+                rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
+                if relation_type == ONE_TO_ONE:
+                    cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()]))
+                else:
+                    # is N to 1:
+                    cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
+
+                merged_hit = merge_function(rf_record, cl_record)
+                merged_hits.append(merged_hit)
+
+            merged.append(merged_hits)
+
+    return merged, len(set1index)
+
+
+def _compare_records(key1, key2):
+    '''
+    In this case the compare method is really simple as both keys are expected to contain
+    the same value when records are the same
+    '''
+    if key1 == key2:
+        return True
+    else:
+        return False
+
+
+def _merge_records(rank_caslookup_combi, msclust_quant_record):
+    '''
+    Combines single records from both the RankFilter+CasLookup combi file and from the MsClust file
+    @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
+    @param msclust_quant_record: msclust quantification + spectrum record
+    '''
+    i = 0
+    record = []
+    for column in rank_caslookup_combi:
+        record.append(rank_caslookup_combi[column])
+        i += 1
+
+    for column in msclust_quant_record:
+        record.append(msclust_quant_record[column])
+        i += 1
+
+    return record
+
+
+def _save_data(data, headers, nhits, out_csv):
+    '''
+    Writes tab-separated data to file
+    @param data: dictionary containing merged dataset
+    @param out_csv: output csv file
+    '''
+    # Open output file for writing
+    outfile_single_handle = open(out_csv, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(headers)
+
+    # Write one line for each centrotype
+    for centrotype_idx in xrange(len(data)):
+        for hit in data[centrotype_idx]:
+            output_single_handle.writerow(hit)
+
+
+def main():
+    '''
+    Combine Output main function
+
+    RankFilter and CasLookup are already combined by combine_output.py, so here we use
+    that result. Furthermore, the MsClust spectra file (.MSP) and one of the MsClust
+    quantification files are combined with the combine_output.py result as well.
+    '''
+    rankfilter_and_caslookup_combined_file = sys.argv[1]
+    msclust_quantification_and_spectra_file = sys.argv[2]
+    output_csv = sys.argv[3]
+
+    # Read RankFilter and CasLookup output files
+    rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
+    msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
+    merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
+                                msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE)
+    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys()
+    _save_data(merged, headers, nhits, output_csv)
+
+
+if __name__ == '__main__':
+    main()
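The generic `_merge_data` above pairs every row of set1 with the matching row(s) of set2 via the supplied compare function; with `N_TO_ONE`, several set1 rows map to a single set2 row. A Python 3 mini-run of that pairing (field names and data invented; not part of this changeset)::

    set1 = {'Centrotype': ['1', '1', '2'], 'Name': ['a', 'b', 'c']}
    set2 = {'centrotype': ['1', '2'], 'Quant': ['10', '20']}
    for idx, key in enumerate(set1['Centrotype']):
        match = set2['centrotype'].index(key)   # N set1 rows -> 1 set2 row
        print(set1['Name'][idx], set2['Quant'][match])
    # prints: a 10 / b 10 / c 20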

diff -r 000000000000 -r 9d5f4f5f764b library_lookup.py
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/library_lookup.py  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,327 @@
+'''
+Logic for searching a Retention Index database file given output from NIST
+'''
+import match_library
+import re
+import sys
+import csv
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def create_lookup_table(library_file, column_type_name, statphase):
+    '''
+    Creates a dictionary holding the contents of the library to be searched
+    @param library_file: library to read
+    @param column_type_name: the columns type name
+    @param statphase: the columns stationary phase
+    '''
+    (data, header) = match_library.read_library(library_file)
+    # Test for presence of required columns
+    if ('columntype' not in header or
+        'columnphasetype' not in header or
+        'cas' not in header):
+        raise IOError('Missing columns in ', library_file)
+
+    column_type_column = header.index("columntype")
+    statphase_column = header.index("columnphasetype")
+    cas_column = header.index("cas")
+
+    filtered_library = [line for line in data if line[column_type_column] == column_type_name
+                        and line[statphase_column] == statphase]
+    lookup_dict = {}
+    for element in filtered_library:
+        # Here the cas_number is set to the numeric part of the cas_column value, so if the
+        # cas_column value is 'C1433' then cas_number will be '1433'
+        cas_number = str(re.findall(r'\d+', (element[cas_column]).strip())[0])
+        try:
+            lookup_dict[cas_number].append(element)
+        except KeyError:
+            lookup_dict[cas_number] = [element]
+    return lookup_dict
+
+
+def _preferred(hits, pref, ctype, polar, model, method):
+    '''
+    Returns all entries in the lookup_dict that have the same column name, type and polarity
+    as given by the user, uses regression if selected given the model and method to use. The
+    regression is applied on the column with the best R-squared value in the model
+    @param hits: all entries in the lookup_dict for the given CAS number
+    @param pref: preferred GC-column, can be one or more names
+    @param ctype: column type (capillary etc.)
+    @param polar: polarity (polar / non-polar etc.)
+    @param model: data loaded from file containing regression models
+    @param method: supported regression method (i.e. poly(nomial) or linear)
+    '''
+    match = []
+    for column in pref:
+        for hit in hits:
+            if hit[4] == ctype and hit[5] == polar and hit[6] == column:
+                # Create copy of found hit since it will be altered downstream
+                match.extend(hit)
+                return match, False
+
+    # No hit found for current CAS number, return if not performing regression
+    if not model:
+        return False, False
+
+    # Perform regression
+    for column in pref:
+        if column not in model:
+            break
+        # Order regression candidates by R-squared value (last element)
+        order = sorted(model[column].items(), key=lambda col: col[1][-1])
+        # Create list of regression candidate column names
+        regress_columns = list(reversed([column for (column, _) in order]))
+        # Names of available columns
+        available = [hit[6] for hit in hits]
+
+        # TODO: combine Rsquared and number of datapoints to get the best regression match
+        '''
+        # Iterate regression columns (in order) and retrieve their models
+        models = {}
+        for col in regress_columns:
+            if col in available:
+                hit = list(hits[available.index(col)])
+                if hit[4] == ctype:
+                    # models contains all model data including residuals [-2] and rsquared [-1]
+                    models[pref[0]] = model[pref[0]][hit[6]]
+        # Get the combined maximum for residuals and rsquared
+        best_match = models[]
+        # Apply regression
+
[... diff truncated in this changeset view; it resumes mid-way through the result-formatting code ...]
+
+                found_hit[-1] = '0'
+            data.append(found_hit)
+            found_hit = ''
+        else:
+            data.append(default_hit(row, casf, compound_id))
+    else:
+        data.append(default_hit(row, casf, compound_id))
+
+    casf = ''
+    compound_id = ''
+    found_hit = []
+    dups = []
+    return data
+
+
+def _save_data(content, outfile):
+    '''
+    Write to output file
+    @param content: content to write
+    @param outfile: file to write to
+    '''
+    # header
+    header = ['CAS',
+              'NAME',
+              'FORMULA',
+              'RI',
+              'Column.type',
+              'Column.phase.type',
+              'Column.name',
+              'phase.coding',
+              'CAS_column.Name',
+              'Centrotype',
+              'Regression.Column.Name',
+              'min',
+              'max',
+              'nr.duplicates']
+    output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t")
+    output_handle.writerow(header)
+    for entry in content:
+        output_handle.writerow(entry)
+
+
+def _read_model(model_file):
+    '''
+    Creates an easy to search dictionary for getting the regression parameters
+    for each valid combination of GC-columns
+    @param model_file: filename containing the regression models
+    '''
+    regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t'))
+    if len(regress.pop(0)) > 9:
+        method = 'poly'
+    else:
+        method = 'linear'
+
+    model = {}
+    # Create new dictionary for each GC-column
+    for line in regress:
+        model[line[0]] = {}
+
+    # Add data
+    for line in regress:
+        if method == 'poly':
+            model[line[0]][line[1]] = [float(col) for col in line[2:11]]
+        else: # linear
+            model[line[0]][line[1]] = [float(col) for col in line[2:9]]
+
+    return model, method
+
+
+def _apply_poly_regression(column1, column2, retention_index, model):
+    '''
+    Calculates a new retention index (RI) value using a given 3rd-degree polynomial
+    model based on data from GC columns 1 and 2
+    @param column1: name of the selected GC-column
+    @param column2: name of the GC-column to use for regression
+    @param retention_index: RI to convert
+    @param model: dictionary containing model information for all GC-columns
+    '''
+    coeff = model[column1][column2]
+    # If the retention index to convert is within range of the data the model is based on, perform regression
+    if coeff[4] < retention_index < coeff[5]:
+        return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) +
+                (retention_index * coeff[1]) + coeff[0])
+    else:
+        return False
+
+
+def _apply_linear_regression(column1, column2, retention_index, model):
+    '''
+    Calculates a new retention index (RI) value using a given linear model based on data
+    from GC columns 1 and 2
+    @param column1: name of the selected GC-column
+    @param column2: name of the GC-column to use for regression
+    @param retention_index: RI to convert
+    @param model: dictionary containing model information for all GC-columns
+    '''
+    # TODO: No use of limits
+    coeff = model[column1][column2]
+    return coeff[1] * retention_index + coeff[0]
+
+
+def main():
+    '''
+    Library Lookup main function
+    '''
+    library_file = sys.argv[1]
+    nist_tabular_filename = sys.argv[2]
+    ctype = sys.argv[3]
+    polar = sys.argv[4]
+    outfile = sys.argv[5]
+    pref = sys.argv[6:-1]
+    regress = sys.argv[-1]
+
+    if regress != 'False':
+        model, method = _read_model(regress)
+    else:
+        model, method = False, None
+
+    lookup_dict = create_lookup_table(library_file, ctype, polar)
+    data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method)
+
+    _save_data(data, outfile)
+
+
+if __name__ == "__main__":
+    main()
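`_apply_poly_regression` above evaluates the stored cubic only when the RI falls inside the fitted range (coefficients at indices 0-3, range limits at indices 4-5). A Python 3 sketch with invented coefficients (not part of this changeset)::

    coeff = [12.0, 1.05, 1e-5, -1e-9, 500.0, 3500.0]   # a, b, c, d, left limit, right limit
    ri = 1200.0
    if coeff[4] < ri < coeff[5]:
        converted = coeff[3] * ri**3 + coeff[2] * ri**2 + coeff[1] * ri + coeff[0]
        print(round(converted, 1))   # 1284.7
    else:
        print('RI outside the range the model was fitted on')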

diff -r 000000000000 -r 9d5f4f5f764b library_lookup.xml
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/library_lookup.xml  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,68 @@
+<tool id="lookup_library" name="RIQC-Lookup RI for CAS numbers in library" version="1.0.2">
+    <description>Lookup or estimate the RI using a "known RI values" CAS numbers library</description>
+    <command interpreter="python">
+        library_lookup.py
+        $library_file
+        $input
+        "$col_type"
+        "$polarity"
+        $output
+        #for $ctype in $pref
+            ${ctype.columntype}
+        #end for
+        $regression.model
+    </command>
+    <inputs>
+        <page>
+            <param format="tabular" name="input" type="data" label="NIST identifications as tabular file"
+                   help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" />
+            <param name="library_file" type="select" label="CAS x RI Library file"
+                   help="Select a library/lookup file containing RI values for CAS numbers on various chromatography columns"
+                   dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RI_DB_libraries")'/>
+            <param name="col_type" type="select" label="Select column type" refresh_on_change="true"
+                   display="radio" dynamic_options='get_column_type(library_file)'
+                   help="" />
+        </page>
+        <page>
+            <param name="polarity" type="select" label="Select polarity" refresh_on_change="true"
+                   display="radio" dynamic_options='filter_column(library_file,col_type)'
+                   help="" />
+        </page>
+        <page>
+            <conditional name="regression">
+                <param name="regression_select" type="boolean" checked="false" label="Apply regression method"
+                       help="If no data for the selected column is present in the database, selecting this option will try
+                             to convert Retention Indices using data from other GC-columns with a regression method. Please
+                             note that only the first given GC-column above will be used for this, any alternatives will be
+                             ignored" />
+                <when value="true">
+                    <param name="model" format="tabular" type="data" label="Tabular file containing regression model"
+                           help="This file contains the coefficients used to perform the regression from one GC-column
+                                 to another GC-column"/>
+                </when>
+                <when value="false">
+                    <param name="model" type="hidden" value="False" />
+                </when>
+            </conditional>
+            <repeat name="pref" title="Select column name preference">
+                <param name="columntype" type="select" label="Column name" refresh_on_change="true"
+                       dynamic_options='filter_column2(library_file, col_type, polarity)'
+                       help="Select one or more column names for filtering. The order defines the priority." />
+            </repeat>
+        </page>
+    </inputs>
+    <outputs>
+        <data format="tabular" label="${tool.name} on" name="output" />
+    </outputs>
+    <code file="match_library.py" />
+    <help>
+Performs a lookup of the RI values by matching CAS numbers from the given NIST identifications file to a library.
+If a direct match is NOT found for the preferred column name, a regression can be done to find
+the theoretical RI value based on known RI values for the CAS number on other column types (see step 4).
+If there is no match for the CAS number on any column type, then the record is not given a RI.
+    </help>
+</tool>

diff -r 000000000000 -r 9d5f4f5f764b match_library.py
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/match_library.py  Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,120 @@
+'''
+Contains functions that are called from Galaxy to populate lists/checkboxes with selectable items
+'''
+import csv
+import glob
+import os
+
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def get_column_type(library_file):
+    '''
+    Returns a Galaxy formatted list of tuples containing all possibilities for the
+    GC-column types. Used by the library_lookup.xml tool
+    @param library_file: given library file from which the list of GC-column types is extracted
+    '''
+    (data, header) = read_library(library_file)
+
+    if 'columntype' not in header:
+        raise IOError('Missing columns in ', library_file)
+
+    # Filter data on column type
+    column_type = header.index("columntype")
+    amounts_in_list_dict = count_occurrence([row[column_type] for row in data])
+    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+    return(galaxy_output)
+
+
+def filter_column(library_file, column_type_name):
+    '''
+    Filters the Retention Index database on column type
+    @param library_file: file containing the database
+    @param column_type_name: column type to filter on
+    '''
+    (data, header) = read_library(library_file)
+
+    if ('columntype' not in header or
+        'columnphasetype' not in header):
+        raise IOError('Missing columns in ', library_file)
+
+    column_type = header.index("columntype")
+    statphase = header.index("columnphasetype")
+
+    # Filter data on column type name
+    statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name]
+    amounts_in_list_dict = count_occurrence(statphase_list)
+    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+    return(sorted(galaxy_output))
+
+
+def filter_column2(library_file, column_type_name, statphase):
+    '''
+    Filters the Retention Index database on column type and stationary phase
+    @param library_file: file containing the database
+    @param column_type_name: column type to filter on
+    @param statphase: stationary phase of the column to filter on
+    '''
+    (data, header) = read_library(library_file)
+
+    if ('columntype' not in header or
+        'columnphasetype' not in header or
+        'columnname' not in header):
+        raise IOError('Missing columns in ', library_file)
+
+    column_type_column = header.index("columntype")
+    statphase_column = header.index("columnphasetype")
+    column_name_column = header.index("columnname")
+
+    # Filter data on given column type name and stationary phase
+    statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and
+                      line[statphase_column] == statphase]
+    amounts_in_list_dict = count_occurrence(statphase_list)
+    galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+    return(sorted(galaxy_output))
+
+
+def read_library(filename):
+    '''
+    Reads a CSV file and returns its contents and a normalized header
+    @param filename: file to read
+    '''
+    data = list(csv.reader(open(filename, 'rU'), delimiter='\t'))
+    header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in data.pop(0)]
+    return(data, header_clean)
+
+
+def get_directory_files(dir_name):
+    '''
+    Reads the directory and returns the list of .txt files found as a dictionary
+    with file name and full path so that it can fill a Galaxy drop-down combo box.
+    '''
+    files = glob.glob(dir_name + "/*.txt")
+    if len(files) == 0:
+        raise Exception("Configuration error: no library files found in <galaxy-home-dir>/" + dir_name)
+    else:
+        galaxy_output = [(str(get_file_name_no_ext(file_name)), str(os.path.abspath(file_name)), False) for file_name in files]
+        return(galaxy_output)
+
+def get_file_name_no_ext(full_name):
+    '''
+    Returns just the last part of the name
+    '''
+    simple_name = os.path.basename(full_name)
+    base, ext = os.path.splitext(simple_name)
+    return base
+
+
+def count_occurrence(data_list):
+    '''
+    Counts occurrences in a list and returns a dict with item:occurrence
+    @param data_list: list to count items from
+    '''
+    return dict((key, data_list.count(key)) for key in set(data_list))
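All of the `get_*`/`filter_*` helpers above return lists of `(display label, value, selected)` tuples, which is the shape Galaxy's `dynamic_options` mechanism consumes. A Python 3 sketch of building such a list from occurrence counts (counts invented; not part of this changeset)::

    occurrences = {'capillary': 12, 'packed': 3}   # e.g. output of count_occurrence()
    galaxy_output = [("%s(%s)" % (name, count), name, False)
                     for name, count in occurrences.items()]
    print(sorted(galaxy_output))
    # [('capillary(12)', 'capillary', False), ('packed(3)', 'packed', False)]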

diff -r 000000000000 -r 9d5f4f5f764b msclust2.0.1.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/msclust2.0.1.xml	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,289 @@
+<tool name="MsClust" id="msclust2" version="2.0.1">
+    <description>Extracts fragmentation spectra from aligned data</description>
+    <!--
+        For remote debugging start your listener on port 8000 and use the following as command interpreter:
+        java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000
+        //////////////////////////
+
+        TODO in command below: add conditionals according to options of using or NOT the tolerances/thresholds from previous steps
+    -->
+    <command interpreter="java -jar ">
+        MsClust.jar
+        -peaksFileName $inputPeaks
+        -dataType $dataType
+        -imputationMethod $imputationMethod.type
+        #if $imputationMethod.type == "valueRange"
+            -rangeUpperLimit $imputationMethod.rangeUpperLimit
+        #end if
+        -plInputFormat "metalign"
+        -potDensFuncType $potDensFuncType.type
+        -centerSelectionType $centerSelectionType.type
+        -clusteringType $clusteringType.type
+        -neighborhoodWindowSize $potDensFuncType.pdf_neighborhoodWindowSize
+        -clusterSearchStopCriterium $centerSelectionType.cs_stop_criterion
+        -pearsonDistTreshold $potDensFuncType.pdf_pears_treshold
+        -pearsonTresholdConfidence $potDensFuncType.pdf_pears_conf
+        -pearsonPDReductionThreshold $centerSelectionType.cs_pears_pd_reductionTreshold
+        -pearsonPDReductionSlope $centerSelectionType.cs_pears_pd_reductionSlope
+        -scanDistTol $potDensFuncType.pdf_scan_toler
+        -scanDistanceConfidence $potDensFuncType.pdf_scan_conf
+        -centrotypesOut $centrotypesOut
+        -simOut $simOut
+        -micOut $micOut
+        -mspOut $mspOut
+        -classOut $classOut
+        -outReport $htmlReportFile
+        -outReportPicturesPath $htmlReportFile.files_path
+        #if $clusteringType.type == "fuzzyCMeans"
+            -fcmMembershipWeightingExponent $clusteringType.fcmMembershipWeightingExponent
+            -fcmStopCriterion $clusteringType.fcmStopCriterion
+            -fcmCorrelationWeight $clusteringType.fcmCorrelationWeight
+            -fcmFinalAssemblyType $clusteringType.finalClusterAssembly.type
+            #if $clusteringType.finalClusterAssembly.type == "membershipBased"
+                -fcmMembershipCutoff $clusteringType.finalClusterAssembly.fcmMembershipCutoff
+            #end if
+        #end if
+        -verbose "false"
+        #if $advancedSettings.settings == True
+            -advancedSettings YES
+            -saturationLimit $advancedSettings.saturationLimit
+            -sampleSelectionSortType $advancedSettings.sampleSelectionSortType
+            -simSelectionAlgorithm $advancedSettings.simSelectionAlgorithm
+            -simMassFilter "$advancedSettings.simMassFilter"
+            -simMembershipThreshold $advancedSettings.simMembershipThreshold
+            -simSaturationThreshold $advancedSettings.simSaturationThreshold
+            -simAbsenseThreshold $advancedSettings.simAbsenseThreshold
+            -micMembershipThreshold $advancedSettings.micMembershipThreshold
+            -peakIntensityCorrectionAlgorithm $advancedSettings.peakIntensityCorrectionAlgorithm
+        #else
+            -advancedSettings YES
+            -sampleSelectionSortType SIM_INTENSITY
+            -peakIntensityCorrectionAlgorithm CORRELATION_BASED
+        #end if
+    </command>
+    <inputs>
+        <!-- <param name="rankingWeightConfig" type="text" area="true" size="11x70" label="NB - TEST VERSION"
+             value="VERSION BEING TESTED AT THIS MOMENT...NOT READY FOR USE..."/>
+        -->
+        <param name="inputPeaks" type="data" format="txt" label="Ion-wise aligned data (e.g. MetAlign output data)" />
+        <param name="dataType" type="select" size="30" label="Data type">
+            <option value="gcms" selected="true">GC-MS</option>
+            <option value="lcms">LC-MS</option>
+        </param>
+        <conditional name="imputationMethod">
+            <param name="type" type="select" size="30" label="Select the approach used for imputing missing values (optional)" help="select how you generated the values to fill in the data gaps">
+                <option value="none"[...]
+
+[...]n was used, this is the variation *after* pre-processing by MetAlign.
+
+*Peak Width confidence:* The higher the confidence, the stricter the threshold.
+
+*Correlation threshold (0.0 - 1.0):* Tolerance center for Pearson distance calculation. The higher this value,
+the higher the correlation between 2 items has to be for them to be considered 'close'.
+
+*Correlation threshold confidence:* The higher the confidence, the stricter the threshold. `More...`__
+
+*Potential Density reduction (0.0 - 1.0):* Reduction tolerance center for Pearson distance calculation.
+The higher this value, the less the low correlated items get reduced, getting a chance to form a cluster of their own.
+
+*Potential Density reduction softness:* Reduction curve slope for Pearson distance tolerance. Lower
+values = stricter separation at the value determined in 'Potential Density reduction' above
+(TODO review this comment).
+
+*Stop Criterion:* When to stop reducing and looking for new clusters. Lower values = more iterations.
+
+.. __: javascript:window.open('$PATH_TO_IMAGES/confidence_and_slope_params_explain.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')
+
+
+-----
+
+**Output files described below**
+
+-----
+
+*SPECTRA:* this file can be submitted to NIST for identification of the spectra.
+
+`Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation`_
+
+.. _Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation: javascript:window.open('$PATH_TO_IMAGES/sample_sel_and_peak_height_correction.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')
+
+-----
+
+*MIC:* stands for Measured Ions Count -> it contains, for each cluster, the sum of the ion count
+values (corrected by their membership) for all MEASURED cluster ions in the given sample.
+
+The MIC for a **cluster i** in **sample s**, where **cluster i** has **n** members, is thus:
+
+sum ( [intensity of member n in **sample s**] x [membership value of member n in **cluster i**] )
+
+-----
+
+*SIM:* stands for Selective Ion Mode -> it contains, for each cluster, the intensity values of the
+most representative member ion peak of this cluster. The most representative member peak is the one with the
+highest membership*average_intensity. This definition leads to conflicts, as a peak can have a
+membership in two or more clusters. The assignment of a SIM peak to a cluster depends on
+the configured data type (LC or GC-MS). NB: this can be overruled in the "advanced settings":
+
+(1) LC-MS SIM: select the SIM peak only once and for the centrotype in which this specific mass has its
+highest membership; for neighboring centrotypes use its "second best SIM", etcetera. In other words,
+if the SIM peak has been identified as the SIM in more than 1 cluster, assign it as SIM to the cluster
+with the highest membership. Continue searching for other SIM peaks to assign to the other clusters until
+all ambiguities are solved.
+
+(2) GC-MS SIM: the SIM peak can be "shared" by multiple clusters. However, the intensity values are corrected
+by the membership value of the peak in the cluster in case the SIM peak is "shared". If the SIM peak is not
+"shared" then the "raw" intensity values of the SIM peak are recorded in the SIM file.
+
+`Click here for more details on the SIM output file`_
+
+.. _Click here for more details on the SIM output file: javascript:window.open('$PATH_TO_IMAGES/sample_SIM.png','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')
+
+
+    </help>
+</tool>
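
The MIC definition quoted in the help text above is a membership-weighted sum per cluster and sample. A minimal sketch of that formula; the function name and the intensity/membership values are invented for illustration and are not MsClust code:

    # Illustration of: MIC(cluster i, sample s) =
    #   sum over members m of [ intensity(m, s) * membership(m, i) ]
    def measured_ions_count(member_intensities, memberships):
        '''member_intensities: intensity of each member ion in one sample;
        memberships: membership of each member ion in the cluster (same order).'''
        return sum(intensity * membership
                   for intensity, membership in zip(member_intensities, memberships))

    print(measured_ions_count([1000.0, 250.0, 80.0], [1.0, 0.8, 0.3]))  # -> 1224.0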

diff -r 000000000000 -r 9d5f4f5f764b rankfilterGCMS_tabular.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilterGCMS_tabular.xml	Thu Jan 16 13:10:00 2014 +0100
| @@ -0,0 +1,77 @@ +<tool id="rankfilterGCMS_tabular" name="RIQC-RankFilter GC-MS from tabular file" version="1.0.2"> + <description>Convert Retention Time to Retention Index</description> + <command interpreter="python">rankfilter_GCMS/rankfilter.py $input_file</command> + <inputs> + <param format="tabular" name="sample" type="data" label="Sample File" + help="Converted PDF file in tabular format" /> + <!-- question: is this calibration file not column specific as it includes RT info?? --> + <param name="calibration" type="select" label="Calibration File" + help="Calibration file with reference masses (e.g. alkanes) with their RT and RI values" + dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RankFilter_Calibration_Files")'/> + + <param name="analysis_type" type="select" format="text" label="Analysis Type" + help="Select the type of analysis that has been used to generate the sample file"> + <option value="NIST">NIST</option> + <option value="AMDIS">AMDIS</option> + </param> + <param name="model" type="select" format="text" label="Select a model to be used " + help="Both linear and (3rd degree) polynomial models are available "> + <option value="linear">Linear</option> + <option value="poly">Polynomial</option> + </param> + <param name="lib_data" type="select" label="Library" + help="Reference global lookup library file with CAS numbers and respective (previously calculated) RIsvr values" + dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/RankFilter_lookup_libraries")'/> + + <param name="window" type="float" label="Window" value="10.56" /> + </inputs> + <outputs> + <data format="tabular" label="${tool.name}" name="onefile" /> + </outputs> + <!-- file with implementation of the function get_directory_files() used above --> + <code file="match_library.py" /> + <configfiles> + <configfile name="input_file"> + sample = ${sample} + calibration = ${calibration} + lib_data = ${lib_data} + window = ${window} + analysis_type = ${analysis_type} + tabular = True + onefile = ${onefile} + model = ${model} + </configfile> + </configfiles> + <help> +Basically estimates the experimental RI (RIexp) by building a RI(RT) function based on the +given calibration file. + +It also determines the estimated RI (RIsvr) by looking up for each entry of the given input file (Sample File), +based on its CAS number, its respective RIsvr value in the given global lookup library +(this step is also called the "RankFilter analysis" -see reference below; Sample File may be either from NIST or AMDIS). +This generates an prediction of the RI for +a compound according to the "RankFilter procedure" (RIsvr). + +Output is a tab separated file in which four columns are added: + + - **Rank** Calculated rank + - **RIexp** Experimental Retention Index (RI) + - **RIsvr** Calculated RI based on support vector regression (SVR) + - **%rel.err** Relative RI error (%rel.error = 100 * (RISVR − RIexp) / RIexp) + +.. class:: infomark + +**Notes** + + - The layout of the Calibration file should include the following columns: 'MW', 'R.T.' and 'RI'. + - Selecting 'Polynomial' in the model parameter will calculate a 3rd degree polynomial model that will + be used to convert from XXXX to YYYY. + +----- + +**References** + + - **RankFilter**: Mihaleva et. al. (2009) *Automated procedure for candidate compound selection in GC-MS + metabolomics based on prediction of Kovats retention index*. Bioinformatics, 25 (2009), pp. 787–794 + </help> +</tool> |

diff -r 000000000000 -r 9d5f4f5f764b rankfilter_GCMS/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/__init__.py	Thu Jan 16 13:10:00 2014 +0100
| @@ -0,0 +1,5 @@ +''' +Created on Mar 14, 2012 + +@author: marcelk +''' |

diff -r 000000000000 -r 9d5f4f5f764b rankfilter_GCMS/pdfread.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/pdfread.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,210 @@
+"""
+Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import csv
+
+def getPDF(filename, print_progress=False):
+    '''
+    Parses NIST PDF file
+    @param filename: PDF file to parse
+    @param print_progress: whether to print progress information per parsed line
+    '''
+    NistInput = {}
+    NistInput_missed = {}
+    nist_input = open(filename, 'r').read()
+
+    hitid = []
+    rt = []
+    name = []
+    forward = []
+    cas = []
+    reverse = []
+    prob = []
+    lib_id = []
+    nist_id = []
+    missed_compounds = []
+    rt_missed_compounds = []
+    formula = []
+
+    hit_list = nist_input.split('** Search Report Page 1 of 1 **')
+    hit_list.pop(0)
+    #number_hits = range(10)
+    line_id = 0
+    for line in hit_list:
+        line = line.strip().translate(None, '\r')
+        if line != '':
+            hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit')
+
+            spec_id = hits.pop(0).split(' ')[1]
+            j = 0
+            for hh in hits:
+                cell = hh.split(';')
+                if print_progress == True:
+                    print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell
+                line_id += 1
+                if len(cell) == 7:  # the compound has a CAS number
+                    if len(cell[1].split(':')) == 2:
+                        forward.append(cell[1].split(':')[1])
+                        # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
+                        if len(cell[0].split(':')) > 2:
+                            name_tmp = ':'.join(cell[0].split(':')[1:])
+                        else:
+                            name_tmp = cell[0].split(':')[1]
+                        name_tmp = name_tmp.replace('lC', 'l C').replace(']C', '] C').replace('sC', 's C').replace('9C', '9 C').replace('.C', '. C')
+                        name_tmp = name_tmp.replace(')C', ') C').replace('eC', 'e C').replace('yC', 'y C').replace('oC', 'o C').replace('-C', '- C').replace('dC', 'd C').replace('rC', 'r C')
+                        name.append((' '.join(name_tmp.split(' ')[0:len(name_tmp) - 1])).replace("  ", " "))
+                        if name_tmp:
+                            if name_tmp.split(' ')[-1][0] == 'C' or name_tmp.split(' ')[-1][0] == 'F' or name_tmp.split(' ')[-1][0] == 'H':
+                                formule = (name_tmp.split(' ')[-1])
+                            else:
+                                formule = ('not_def')
+                        else:
+                            formule = ('not_def')
+                        formula.append(formule.replace("  ", " "))
+                        reverse.append(cell[2].split(':')[1])
+                        prob.append(cell[3].split(' ')[2].replace('%', ''))
+                        cas.append(cell[4].split(':')[1])
+                        lib_id.append(cell[5].split(':')[1])
+[...]me_tmp[-1][0] == 'H':
+                            formule = (name_tmp[-1])
+                        else:
+                            formule = ('not_def')
+                    else:
+                        formule = ('not_def')
+                    formula.append(formule.replace("  ", " "))
+                    reverse.append(cell[2].split(':')[1])
+                    prob.append(cell[3].split(' ')[2].replace('%', ''))
+                    cas.append('undef')
+                    lib_id.append(cell[4].split(':')[1])
+                    nist_id.append(cell[5].split(':')[1].replace('.', '').strip())
+                    j = j + 1
+
+                else:
+                    missed_compounds.append(hh)
+                    rt_missed_compounds.append(spec_id)
+
+        else:  # Missing columns, report and quit
+
+            return
+
+        for _ in range(j):
+            hitid.append(str(spec_id.replace("  ", " ")))
+            rt.append(str(float(spec_id.split('-')[3]) / 1e+06))
+
+    NistInput['ID'] = hitid
+    NistInput['R.T.'] = rt
+    NistInput['Name'] = name
+    NistInput['CAS'] = cas
+    NistInput['Formula'] = formula
+    NistInput['Forward'] = forward
+    NistInput['Reverse'] = reverse
+    NistInput['Probability'] = prob
+    NistInput['Library'] = lib_id
+    NistInput['Library ID'] = nist_id
+    NistInput_missed['Missed Compounds'] = missed_compounds
+    NistInput_missed['RT missed Compounds'] = rt_missed_compounds
+
+    return NistInput, NistInput_missed
+
+
+def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
+    '''
+    Converts NIST PDF file to tabular format
+    @param filename: PDF file to parse
+    @param output_file: output file for the hits
+    @param error_file: output file for failed hits
+    '''
+    [HitList, HitList_missed] = getPDF(filename, print_progress)
+    # save Hitlist as tab separated file
+    Hitlist_as_text = "\t".join(HitList.keys()) + "\n"
+    Hitlist_array_of_array = ([HitList[row] for row in HitList.keys()])
+    Hitlist_as_text += str("\n".join(["\t".join(e) for e in zip(*Hitlist_array_of_array)]))
+    output_fh = open(output_file, 'wb')
+    output_fh.write(Hitlist_as_text)
+    output_fh.close()
+
+    out_missed_pdf = open(error_file, 'wb')
+    for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
+        out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
+    out_missed_pdf.close()
+
+
+def read_tabular(in_csv):
+    '''
+    Parses a tab-separated file, returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = {}
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+
+def read_tabular_old(filename):
+    '''
+    Function to read tabular format (created by convert_pdftotext2tabular)
+    and output a dict with the column headers as keys and the columns as list values
+    @param filename: tabular file to read
+    '''
+    input_fh = None
+    try:
+        input_fh = open(filename, 'r')
+    except IOError, error:
+        raise error
+    colnames = input_fh.readline().strip().split('\t')
+    cells = []
+    for line in input_fh.readlines():
+        cells.append(line.strip().split('\t'))
+    #transform from row oriented structure to column oriented structure
+    cells = zip(*cells)
+    #store the list of lists in form of final output
+    RankFilterGC_format = {}
+    for colnumber in range(len(colnames)):
+        RankFilterGC_format[colnames[colnumber]] = cells[colnumber]
+    return RankFilterGC_format
+
+
+if __name__ == '__main__':
+    convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)
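
read_tabular() above pivots the row-oriented TSV body into a column-oriented dict, one sequence per header name. The core idea in a standalone sketch; the table contents are invented for illustration:

    # Pivot rows into columns with zip(*rows); read_tabular() builds lists,
    # the plain dict(zip(...)) shortcut below yields tuples instead.
    header = ['R.T.', 'Name']
    rows = [['10.5', 'Sucrose'], ['12.3', 'Caffeine']]   # made-up table body
    columns = dict(zip(header, zip(*rows)))
    print(columns['Name'])  # -> ('Sucrose', 'Caffeine')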

diff -r 000000000000 -r 9d5f4f5f764b rankfilter_GCMS/pdftotabular.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/pdftotabular.py	Thu Jan 16 13:10:00 2014 +0100
| @@ -0,0 +1,41 @@ +""" +Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this software except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +""" + +import sys +import pdfread +from subprocess import call + + +def convert_pdftotext(filename, output_file): + ''' + Converts PDF file to text + @param filename: PDF file to parse + @param output_file: output text file for the hits + ''' + + try: + call(["pdftotext", filename, output_file]) + except: + raise Exception("Error while trying to convert PDF to text") + + + + +if __name__ == '__main__': + pdf_as_text = sys.argv[1]+".txt" + convert_pdftotext(sys.argv[1], pdf_as_text) + pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False) |

diff -r 000000000000 -r 9d5f4f5f764b rankfilter_GCMS/rankfilter.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/rankfilter.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,432 @@
+"""
+Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+#Library functions definition
+#----------Begin-------------
+from numpy import array, linalg, ones
+from numpy.polynomial import polynomial
+import math
+import pdfread
+import sys
+
+
+def calibrate(standards):
+    '''
+    Calculates the RT to RI conversion: RI = a + b*RT
+    @param standards: variable containing RI and RT data
+    '''
+    A = ones((len(standards['R.T.']), 2), dtype=float)
+    A[:, 0] = array(map(float, standards['R.T.']))
+    [coeff, res, r, s] = linalg.lstsq(A, array(map(float, standards['RI'])))
+
+    return coeff
+
+
+def calibrate_poly(standards):
+    '''
+    Calculates the RT to RI conversion using a polynomial model
+    @param standards: variable containing RI and RT data
+    '''
+    retention_time = array(map(float, standards['R.T.']))
+    retention_index = array(map(float, standards['RI']))
+
+    # Fit a 3rd degree polynomial
+    fit = polynomial.polyfit(retention_time, retention_index, 3)[::-1]
+    return [fit[0], fit[1], fit[2], fit[3]]
+
+
+def calculate_poly(retention_time, poly_cal):
+    '''
+    Converts a given retention time to retention index using the calculated polynomial model
+    @param retention_time: retention_time to convert to retention index
+    @param poly_cal: result from calculating regression
+    '''
+    # Calculates RI based on given retention_time using polynomial function
+    retention_time = array(map(float, retention_time))
+    if len(retention_time) > 1:
+        ri_exp = []
+        for i in retention_time:
+            ri_exp.append(poly_cal[0] * (i ** 3) + poly_cal[1] * (i ** 2) + (i * poly_cal[2]) + poly_cal[3])
+        return ri_exp
+    else:
+        return poly_cal[0] * (retention_time ** 3) + poly_cal[1] * (retention_time ** 2) + (retention_time * poly_cal[2]) + poly_cal[3]
+
+
+def convert_rt(hit_list, coeff):
+    '''
+    Converts a given retention time to retention index using the linear model
+    @param hit_list: list holding the retention time
+    @param coeff: calculated coefficients (slope and intercept) using the linear model
+    '''
+    #Convert RT to RI
+    hit_list['RIexp'] = array(map(float, hit_list['R.T.'])) * coeff[0] + coeff[1]
+    return hit_list
+
+
+def convert_rt_poly(hit_list, poly_cal):
+    '''
+    Calls the actual RT to RI converter and returns the updated list with added RIexp value
+    @param hit_list: result list containing the retention time
+    '''
+    hit_list['RIexp'] = array(map(float, calculate_poly(hit_list['R.T.'], poly_cal)))
+    return hit_list
+
+
+def get_data(libdata, LabelCol):
+    '''
+    Retrieves data columns indicated by LabelCol from libdata (generic function)
+    Returns a dict with the requested column names as keys
+    @param libdata: file from which data is loaded
+    @param LabelCol: columns to retrieve
+    '''
+    from numpy import take
+    LibData = op[...]
+
+[...]be found"
+            sys.exit(1)
+
+        standards = get_data(InputData['calibration'], LabelColStand)
+        if InputData['model'] == 'linear':
+            coeff = calibrate(standards)
+        elif InputData['model'] == 'poly':
+            poly_cal = calibrate_poly(standards)
+        else:
+            print "error: model ", InputData['model'], " can not be found. Use 'linear' or 'poly' "
+            sys.exit(1)
+    else:
+        #No file has been specified for the calibration
+        #Use the default coefficients
+        print 'No file has been specified for the calibration'
+        print 'WARNING: the default coefficients will be used'
+        coeff = array([29.4327, 454.5260])
+
+    if InputData['analysis_type'] == 'AMDIS':
+        try:
+            AmdisOut = open(InputData['sample'], 'r')
+            print("open ok")
+            #Specify which data to be extracted from the AMDIS output table
+            #Weighted and Reverse are measures of matching between the experimental
+            #and the library spectra. The experimental spectrum is used as template
+            #for the calculation of Weighted, whereas for Reverse the template is the
+            #library spectrum
+            LabelCol = ['CAS', 'Name', 'R.T.', 'Weighted', 'Reverse', 'Purity']
+
+            #Get the data from the AMDIS output file
+            HitList = get_data(InputData['sample'], LabelCol)
+            #Remove '>' from the names
+            HitList['Name'] = [s.replace('>', '') for s in HitList['Name']]
+        except:
+            print "the file", InputData['sample'], "can not be found"
+            sys.exit(1)
+    if InputData['analysis_type'] == 'NIST':
+        #HitList_missed - a variable of type dictionary containing the hits with the symbol ";"
+        #in the name
+        if not NDIS_is_tabular:
+            print "Warning; NDIS is not tabular format, reading PDF..\n"
+            [HitList, HitList_missed] = pdfread.getPDF(InputData['sample'])
+        else:
+            HitList = pdfread.read_tabular(InputData['sample'])
+
+    #Convert RT to RI
+    if InputData['model'] == 'linear':
+        HitList = convert_rt(HitList, coeff)
+    if InputData['model'] == 'poly':
+        print "Executing convert_rt_poly().."
+        HitList = convert_rt_poly(HitList, poly_cal)
+
+    #------Read the library data with the predicted RI------
+    try:
+        LibData = open(InputData['lib_data'], 'r')
+    except:
+        print "the file", InputData['lib_data'], "can not be found"
+        sys.exit(1)
+
+    #Specify which data to be extracted from the library data file
+    LabelColLib = ['CAS', 'Name', 'RIsvr', 'Synonyms']
+    LibraryData = get_data(InputData['lib_data'], LabelColLib)
+
+    #------Match the hits with the library data and rank them------
+    if InputData['window'] != '':
+        HitList = rank_hit(HitList, LibraryData, InputData['window'])
+    else:
+        print "No value for the window used for the filtering is specified \n"
+        sys.exit(1)
+
+    #------Print the ranked and filtered hits------
+    #Specify which data to be printed
+    if InputData['analysis_type'] == 'AMDIS':
+        keys_to_print = ['R.T.', 'CAS', 'Name', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Weighted', 'Reverse', 'Synonyms']
+    else:
+        keys_to_print = ['ID', 'R.T.', 'Name', 'CAS', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Forward', 'Reverse', 'Synonyms', 'Library']
+
+    #skip this error output from reading a pdftotext file when file is tabular
+    if InputData['analysis_type'] == 'NIST' and not NDIS_is_tabular:
+        out_missed_pdf = open(output_files['missed_parse_pdf'], 'w')
+        for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
+            out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
+        out_missed_pdf.close()
+
+    print_to_file(HitList, output_files, keys_to_print, print_subsets)
+
+if __name__ == '__main__':
+    main()
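
The calibrate() function above fits RI = b*RT + a by least squares; the same line can be recovered with numpy.polyfit. A minimal standalone sketch, with invented alkane standards rather than the calibration files shipped with this tool:

    # Linear RT -> RI calibration sketch; polyfit(deg=1) returns [slope, intercept],
    # matching the lstsq formulation in calibrate().
    import numpy as np

    rt = np.array([5.0, 10.0, 15.0, 20.0])          # retention times (min), invented
    ri = np.array([800.0, 1300.0, 1800.0, 2300.0])  # known retention indices, invented

    slope, intercept = np.polyfit(rt, ri, 1)
    print(slope * 12.5 + intercept)  # RIexp for an RT of 12.5 min -> 1550.0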

diff -r 000000000000 -r 9d5f4f5f764b rankfilter_GCMS/test/test_pdfread.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/test/test_pdfread.py	Thu Jan 16 13:10:00 2014 +0100
| @@ -0,0 +1,30 @@ +''' +Created on Mar 13, 2012 + +@author: marcelk +''' +from GCMS.rankfilter_GCMS import pdfread # @UnresolvedImport +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +import unittest + + +class Test(unittest.TestCase): + + def setUp(self): + self.nist_pdf = resource_filename(__name__, "data/NIST_test_PDF.txt") + + def test_getPDF(self): + ''' + Tests the reading and parsing of a NIST PDF file + ''' + [hitlist, hitlist_missed] = pdfread.getPDF(self.nist_pdf) + rows = [hitlist[row] for row in hitlist.keys()] + data = [set(row) for row in zip(*rows)] + expected_element = set(('12.3', ' Sucrose ', '14', 'undef', ' standards 2009', ' 660', 'not_def', + '18495-0.142537-21284-2.26544e+07-135', '22.6544', ' 714')) + self.failUnless(expected_element in data) + self.failUnless(len(hitlist_missed) != 0) + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.test_getPDF'] + unittest.main() |

diff -r 000000000000 -r 9d5f4f5f764b rankfilter_GCMS/test/test_rankfilter.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/test/test_rankfilter.py	Thu Jan 16 13:10:00 2014 +0100
| @@ -0,0 +1,58 @@ +''' +Created on Mar 13, 2012 + +@author: marcelk +''' +from GCMS.rankfilter_GCMS.rankfilter import get_data, calibrate, calibrate_poly, convert_rt, convert_rt_poly +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +import unittest + + +class Test(unittest.TestCase): + + def setUp(self): + self.calibration = resource_filename(__name__, "data/calibration.txt") + self.sample = resource_filename(__name__, "data/sample.txt") + + def test_get_data(self): + samples = get_data(self.sample, ['CAS', 'Name', 'R.T.', 'Forward', 'Reverse', 'Formula']) + self.assertEqual(['C29H50O', 'C29H50O', 'C29H50O', 'C29H50O', 'C29H50O', 'C28H48O', 'C28H48O', 'C28H48O', + 'C28H48O', 'C27H44O2', 'C29H50O2', 'C29H50O2', 'C29H50O2', 'C29H50O2', 'C29H50O2', + 'C28H48O2', 'C28H48O2', 'C28H48O2', 'C28H48O2', 'C30H50O3', 'C29H50O', 'C29H50O', + 'C29H50O', 'C29H50O'], samples['Formula']) + + def test_calibrate(self): + standards = get_data(self.calibration, ['Name', 'R.T.', 'RI']) + coeff = calibrate(standards) + self.assertAlmostEqual(103.19073523551653, coeff[0], 5) + self.assertAlmostEqual(277.14374835349395, coeff[1], 5) + + def test_calibrate_poly(self): + standards = get_data(self.calibration, ['Name', 'R.T.', 'RI']) + poly_cal = calibrate_poly(standards) + self.assertAlmostEqual(0.028897105229818407, poly_cal[0], 5) + self.assertAlmostEqual(0.704572097468386, poly_cal[1], 5) + self.assertAlmostEqual(51.636852478526357, poly_cal[2], 5) + self.assertAlmostEqual(704.95499738104672, poly_cal[3], 5) + + def test_convert_rt(self): + standards = get_data(self.calibration, ['Name', 'R.T.', 'RI']) + coeff = calibrate(standards) + convert = convert_rt({'R.T.': [5, 10, 15, 20]}, coeff) + self.assertAlmostEqual(793.09742453, convert['RIexp'][0], 5) + self.assertAlmostEqual(1309.05110071, convert['RIexp'][1], 5) + self.assertAlmostEqual(1825.00477689, convert['RIexp'][2], 5) + self.assertAlmostEqual(2340.95845306, convert['RIexp'][3], 5) + + def test_convert_rt_poly(self): + standards = get_data(self.calibration, ['Name', 'R.T.', 'RI']) + poly_cal = calibrate_poly(standards) + convert = convert_rt_poly({'R.T.': [5, 10, 15, 20]}, poly_cal) + self.assertAlmostEqual(984.36570036, convert['RIexp'][0], 5) + self.assertAlmostEqual(1320.67783714, convert['RIexp'][1], 5) + self.assertAlmostEqual(1735.56423664, convert['RIexp'][2], 5) + self.assertAlmostEqual(2250.69772778, convert['RIexp'][3], 5) + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() |

diff -r 000000000000 -r 9d5f4f5f764b rankfilter_text2tabular.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_text2tabular.xml	Thu Jan 16 13:10:00 2014 +0100
| @@ -0,0 +1,14 @@ +<tool id="NDIStext2tabular" name="NIST_UTIL- NIST text to tabular format" version="1.0.2"> + <description>Convert NIST text to tabular format</description> + <command interpreter="python">rankfilter_GCMS/pdftotabular.py $input $output $output_err</command> + <inputs> + <param format="pdf" name="input" type="data" label="NIST identifications report (PDF)"/> + </inputs> + <outputs> + <data format="tabular" label="${tool.name} output on ${on_string}" name="output" /> + <data format="tabular" label="${tool.name} error log" name="output_err" /> + </outputs> + <help> + This tool converts NIST identification report output (PDF) to a tabular format needed for further use with the RIQC tools. + </help> +</tool> |

diff -r 000000000000 -r 9d5f4f5f764b select_on_rank.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/select_on_rank.py	Thu Jan 16 13:10:00 2014 +0100
| @@ -0,0 +1,21 @@ +import csv +import sys + +__author__ = "Marcel Kempenaar" +__contact__ = "brs@nbic.nl" +__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" +__license__ = "MIT" + +in_file = sys.argv[1] +out_file = sys.argv[2] +to_select_list = [str(select.strip()) for select in sys.argv[3].split(',') if (len(select) > 0)] + +data = list(csv.reader(open(in_file, 'rb'), delimiter='\t')) +header = data.pop(0) +header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in header] +rank = header_clean.index("rank") + +writer = csv.writer(open(out_file, 'wb'), delimiter='\t') +writer.writerow(header) +for select in to_select_list: + writer.writerows([i for i in data if i[rank] == select]) |

diff -r 000000000000 -r 9d5f4f5f764b select_on_rank.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/select_on_rank.xml	Thu Jan 16 13:10:00 2014 +0100
| @@ -0,0 +1,15 @@ +<tool id="filter_on_rank" name="RIQC-Filter on rank" version="1.0.2"> + <description>Filter on the Rank field in the RankFilter output file</description> + <command interpreter="python">select_on_rank.py $input $output "$rank"</command> + <inputs> + <param format="tabular" name="input" type="data" label="Source file (RankFilter ouptut)"/> + <param format="tabular" help="Filter on (keep different values separate with a comma)" value ="1,2" + name="rank" type="text" label="Select Ranks to keep"/> + </inputs> + <outputs> + <data format="tabular" label="${tool.name} on ${on_string} selected ${rank}" name="output" /> + </outputs> + <help> +This tool removes all entries with non selected rank values from the input file (supported input file is a RankFilter output file). + </help> +</tool> |

diff -r 000000000000 -r 9d5f4f5f764b static/images/confidence_and_slope_params_explain.png
Binary file static/images/confidence_and_slope_params_explain.png has changed

diff -r 000000000000 -r 9d5f4f5f764b static/images/msclust_summary.png
Binary file static/images/msclust_summary.png has changed

diff -r 000000000000 -r 9d5f4f5f764b static/images/sample_SIM.png
Binary file static/images/sample_SIM.png has changed

diff -r 000000000000 -r 9d5f4f5f764b static/images/sample_sel_and_peak_height_correction.png
Binary file static/images/sample_sel_and_peak_height_correction.png has changed

diff -r 000000000000 -r 9d5f4f5f764b test/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/__init__.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,1 @@
+''' BRS GCMS Galaxy Tools Module '''

diff -r 000000000000 -r 9d5f4f5f764b test/integration_tests.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/integration_tests.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,268 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+from GCMS import library_lookup, combine_output
+from GCMS.rankfilter_GCMS import rankfilter
+import os.path
+import sys
+import unittest
+import re
+
+
+class IntegrationTest(unittest.TestCase):
+    def test_library_lookup(self):
+        '''
+        Run main for data/NIST_tabular and compare produced files with references determined earlier.
+        '''
+        # Create out folder
+        outdir = "output/"  # tempfile.mkdtemp(prefix='test_library_lookup')
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+        outfile_base = os.path.join(outdir, 'produced_library_lookup')
+        outfile_txt = outfile_base + '.txt'
+
+        # Build up arguments and run
+        input_txt = resource_filename(__name__, "data/NIST_tabular.txt")
+        library = resource_filename(__name__, "data/RIDB_subset.txt")
+        regress_model = resource_filename(__name__, "data/ridb_poly_regression.txt")
+        sys.argv = ['test',
+                    library,
+                    input_txt,
+                    'Capillary',
+                    'Semi-standard non-polar',
+                    outfile_txt,
+                    'HP-5',
+                    regress_model]
+        # Execute main function with arguments provided through sys.argv
+        library_lookup.main()
+        # Compare with reference files
+        reference_txt = resource_filename(__name__, 'reference/produced_library_lookup.txt')
+
+        # read both the reference file and actual output files
+        expected = _read_file(reference_txt)
+        actual = _read_file(outfile_txt)
+
+        # convert the read in files to lists we can compare
+        expected = expected.split()
+        actual = actual.split()
+
+        for exp, act in zip(expected, actual):
+            if re.match('\\d+\\.\\d+', exp):
+                exp = float(exp)
+                act = float(act)
+                self.assertAlmostEqual(exp, act, places=5)
+            else:
+                # compare values
+                self.failUnlessEqual(expected, actual)
+
+    def test_combine_output_simple(self):
+        '''
+        Run main for data/NIST_tabular and compare produced files with references determined earlier.
+        '''
+        # Create out folder
+        outdir = "output/"  # tempfile.mkdtemp(prefix='test_library_lookup')
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+        outfile_base = os.path.join(outdir, 'produced_combine_output')
+        outfile_single_txt = outfile_base + '_single.txt'
+        outfile_multi_txt = outfile_base + '_multi.txt'
+
+        # Build up arguments and run
+        input_rankfilter = resource_filename(__name__, "data/Rankfilter.txt")
+        input_caslookup = resource_filename(__name__, "data/Caslookup.txt")
+        sys.argv = ['test',
+                    input_rankfilter,
+                    input_caslookup,
+                    outfile_single_txt,
+                    outfile_multi_txt]
+        # Execute main function with arguments provided through sys.argv
+        combine_output.main()
+        # Compare with reference files
+        # reference_single_txt = resource_filename(__name__, 'reference/produced_combine_output_single.txt')
+        # reference_multi_txt = resource_filename(__name__, 'reference/produced_combine_output_multi.txt')
+        # self.failUnlessEqual(_read_file(reference_single_txt), _read_file(outfile_single_txt))
+        # self.failUnlessEqual(_read_file(reference_multi_txt), _read_file(outfile_multi_txt))
+
+        # Clean up
+        # shutil.rmtree(tempdir)
+
+    def def_test_rank_filter_advanced(self):
+        '''
+        Run main of RankFilter
+        '''
+        # Create out folder
+        outdir = "output/integration/"
+        if not os.path.exists(outdir):
+            [...]
+
+        combine_result_single_items = combine_output._process_data(outfile_single_txt)
+        combine_result_multi_items = combine_output._process_data(outfile_multi_txt)
+        self.assertGreater(len(combine_result_single_items['Centrotype']),
+                           len(combine_result_multi_items['Centrotype']))
+
+        # Check 3: library_lookup RI column, centrotype column, ri_svr column are correct:
+        caslookup_items = combine_output._process_data(input_caslookup)
+        rankfilter_items = combine_output._process_data(input_rankfilter)
+
+        # check that the caslookup RI column is correctly maintained in its original order in
+        # the combined file:
+        ri_caslookup = caslookup_items['RI']
+        ri_combine_single = combine_result_single_items['RI']
+        self.assertListEqual(ri_caslookup, ri_combine_single)
+
+        # check the centrotype column's integrity:
+        centrotype_caslookup = caslookup_items['Centrotype']
+        centrotype_combine_single = combine_result_single_items['Centrotype']
+        centrotype_rankfilter = _get_centrotype_rankfilter(rankfilter_items['ID'])
+        self.assertListEqual(centrotype_caslookup, centrotype_combine_single)
+        self.assertListEqual(centrotype_caslookup, centrotype_rankfilter)
+
+        # integration and integrity checks:
+        file_NIST = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt")
+        file_NIST_items = combine_output._process_data(file_NIST)
+        # check that rank filter output has exactly the same ID items as the original NIST input file:
+        self.assertListEqual(file_NIST_items['ID'], rankfilter_items['ID'])
+        # check the same for the CAS column:
+        self.assertListEqual(_get_strippedcas(file_NIST_items['CAS']), rankfilter_items['CAS'])
+        # now check the NIST CAS column against the cas lookup results:
+        cas_NIST = _get_processedcas(file_NIST_items['CAS'])
+        self.assertListEqual(cas_NIST, caslookup_items['CAS'])
+        # now check the CAS of the combined result. If all checks are OK, it means the CAS column's order
+        # and values remained stable throughout all steps:
+        self.assertListEqual(rankfilter_items['CAS'], combine_result_single_items['CAS'])
+
+        # check that the rankfilter RIsvr column is correctly maintained in its original order in
+        # the combined file:
+        risvr_rankfilter = rankfilter_items['RIsvr']
+        risvr_combine_single = combine_result_single_items['RIsvr']
+        self.assertListEqual(risvr_rankfilter, risvr_combine_single)
+
+
+def _get_centrotype_rankfilter(id_list):
+    '''
+    returns the list of centrotype ids given a list of IDs in the
+    form e.g. 74-1.0-564-1905200-7, where the numbers before the
+    first "-" are the centrotype id
+    '''
+    result = []
+    for compound_id_idx in xrange(len(id_list)):
+        compound_id = id_list[compound_id_idx]
+        centrotype = compound_id.split('-')[0]
+        result.append(centrotype)
+
+    return result
+
+
+def _get_processedcas(cas_list):
+    '''
+    returns the list of cas numbers in the form C64175 instead of 64-17-5
+    '''
+    result = []
+    for cas_id_idx in xrange(len(cas_list)):
+        cas = cas_list[cas_id_idx]
+        processed_cas = 'C' + str(cas.replace('-', '').strip())
+        result.append(processed_cas)
+
+    return result
+
+
+def _get_strippedcas(cas_list):
+    '''
+    removes the leading white space from e.g. " 64-17-5"
+    '''
+    result = []
+    for cas_id_idx in xrange(len(cas_list)):
+        cas = cas_list[cas_id_idx]
+        processed_cas = cas.strip()
+        result.append(processed_cas)
+
+    return result
+
+
+def _read_file(filename):
+    '''
+    Helper method to quickly read a file
+    @param filename: file to read
+    '''
+    with open(filename) as handle:
+        return handle.read()

diff -r 000000000000 -r 9d5f4f5f764b test/test_combine_output.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_combine_output.py	Thu Jan 16 13:10:00 2014 +0100
| @@ -0,0 +1,106 @@ +''' +Created on Mar 27, 2012 + +@author: marcelk +''' +from GCMS import combine_output +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +import os +import shutil +import tempfile +import unittest + + +class Test(unittest.TestCase): + ''' + Tests for the 'combine_output' Galaxy tool + ''' + + def setUp(self): + self.rf_output = resource_filename(__name__, "data/RankFilter.txt") + self.cl_output = resource_filename(__name__, "data/CasLookup.txt") + + def test_process_data(self): + ''' + Tests the processing of the RankFilter and CasLookup files into dictionaries + ''' + rfdata = combine_output._process_data(self.rf_output) + cldata = combine_output._process_data(self.cl_output) + self.assertEqual(set([' 18457-04-0', ' 55133-95-4', ' 58-08-2', ' 112-34-5']), set(rfdata['CAS'])) + self.assertEqual(set(['C58082', 'C18457040', 'C55133954', 'C112345']), set(cldata['CAS'])) + + def test_add_hit(self): + ''' + Tests the combination of two records from both the RankFilter- and CasLookup-tools + ''' + rfdata = combine_output._process_data(self.rf_output) + cldata = combine_output._process_data(self.cl_output) + index = 0 + rf_record = dict(zip(rfdata.keys(), [rfdata[key][index] for key in rfdata.keys()])) + cl_record = dict(zip(cldata.keys(), [cldata[key][index] for key in cldata.keys()])) + + hit = combine_output._add_hit(rf_record, cl_record) + self.assertEqual(len(hit), 27) + + # Pass empty record, should fail combination + self.assertRaises(KeyError, combine_output._add_hit, rf_record, {}) + + def test_merge_data(self): + ''' + Tests the merging of the RankFilter and CasLookup data + ''' + rfdata = combine_output._process_data(self.rf_output) + cldata = combine_output._process_data(self.cl_output) + merged, _ = combine_output._merge_data(rfdata, cldata) + centrotypes = _get_centrotypes(merged) + self.failUnless(all(centrotype in centrotypes for centrotype in ('2716','12723', '3403', '12710'))) + +def _get_centrotypes(merged): + ''' + returns centrotype codes found in merged set + ''' + result = [] + for item_idx in xrange(len(merged)): + item = merged[item_idx] + centrotype = item[0][0] + result.append(centrotype) + + return result + + def test_remove_formula(self): + ''' + Tests the removal of the Formula from the 'Name' field (RankFilter output) + ''' + name = "Caffeine C8H10N4O2" + compound_name, compound_formula = combine_output._remove_formula(name) + self.assertEqual(compound_name, 'Caffeine') + self.assertEqual(compound_formula, 'C8H10N4O2') + name = "Ethanol C2H6O" + compound_name, compound_formula = combine_output._remove_formula(name) + self.assertEqual(compound_name, 'Ethanol') + self.assertEqual(compound_formula, 'C2H6O') + # No formula to remove + name = "Butanoic acid, 4-[(trimethylsilyl)oxy]-, trimethylsilyl ester" + compound_name, compound_formula = combine_output._remove_formula(name) + self.assertEqual(compound_name, name) + self.assertEqual(compound_formula, False) + + def test_save_data(self): + ''' + Tests the creation of the output tabular files (no content testing) + ''' + temp_folder = tempfile.mkdtemp(prefix='gcms_combine_output_') + saved_single_data = '{0}/{1}'.format(temp_folder, 'output_single.tsv') + saved_multi_data = '{0}/{1}'.format(temp_folder, 'output_multi.tsv') + rfdata = combine_output._process_data(self.rf_output) + cldata = combine_output._process_data(self.cl_output) + merged, nhits = combine_output._merge_data(rfdata, cldata) + combine_output._save_data(merged, nhits, saved_single_data, 
saved_multi_data) + self.failUnless(os.path.exists(saved_single_data)) + self.failUnless(os.path.exists(saved_multi_data)) + shutil.rmtree(temp_folder) + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() |

diff -r 000000000000 -r 9d5f4f5f764b test/test_export_to_metexp_tabular.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_export_to_metexp_tabular.py	Thu Jan 16 13:10:00 2014 +0100
| @@ -0,0 +1,85 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import export_to_metexp_tabular +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + + def test_combine_output_simple(self): + ''' + comment me + ''' + # Create out folder + outdir = "output/metexp/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + rankfilter_and_caslookup_combined_file = resource_filename(__name__, "data/dummy1_produced_combine_output_single.txt") + msclust_quantification_and_spectra_file = resource_filename(__name__, "data/dummy1_sim.txt") + output_csv = resource_filename(__name__, outdir + "metexp_tabular.txt") + + sys.argv = ['test', + rankfilter_and_caslookup_combined_file, + msclust_quantification_and_spectra_file, + output_csv] + # Execute main function with arguments provided through sys.argv + export_to_metexp_tabular.main() + + ''' + # Asserts are based on reading in with process_data and comparing values of + # certain columns + + # Check 3: library_lookup RI column, centrotype column, ri_svr column are correct: + caslookup_items = combine_output._process_data(input_caslookup) + rankfilter_items = combine_output._process_data(input_rankfilter) + + # check that the caslookup RI column is correctly maintained in its original order in + # the combined file: + ri_caslookup = caslookup_items['RI'] + ri_combine_single = combine_result_single_items['RI'] + self.assertListEqual(ri_caslookup, ri_combine_single) + + # check the centrotype column's integrity: + centrotype_caslookup = caslookup_items['Centrotype'] + centrotype_combine_single = combine_result_single_items['Centrotype'] + centrotype_rankfilter = _get_centrotype_rankfilter(rankfilter_items['ID']) + self.assertListEqual(centrotype_caslookup, centrotype_combine_single) + self.assertListEqual(centrotype_caslookup, centrotype_rankfilter) + + # integration and integrity checks: + file_NIST = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt") + file_NIST_items = combine_output._process_data(file_NIST) + # check that rank filter output has exactly the same ID items as the original NIST input file: + self.assertListEqual(file_NIST_items['ID'], rankfilter_items['ID']) + # check the same for the CAS column: + self.assertListEqual(_get_strippedcas(file_NIST_items['CAS']), rankfilter_items['CAS']) + # now check the NIST CAS column against the cas lookup results: + cas_NIST = _get_processedcas(file_NIST_items['CAS']) + self.assertListEqual(cas_NIST, caslookup_items['CAS']) + # now check the CAS of the combined result. If all checks are OK, it means the CAS column's order + # and values remained stable throughout all steps: + self.assertListEqual(rankfilter_items['CAS'], combine_result_single_items['CAS']) + + # check that the rankfilter RIsvr column is correctly maintained in its original order in + # the combined file: + risvr_rankfilter = rankfilter_items['RIsvr'] + risvr_combine_single = combine_result_single_items['RIsvr'] + self.assertListEqual(risvr_rankfilter, risvr_combine_single) + ''' + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read() |

diff -r 000000000000 -r 9d5f4f5f764b test/test_library_lookup.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_library_lookup.py	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,180 @@
+'''
+Created on Mar 6, 2012
+
+@author: marcelk
+'''
+from GCMS import library_lookup, match_library
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+import os
+import shutil
+import tempfile
+import unittest
+
+
+class Test(unittest.TestCase):
+    '''
+    Tests the 'library_lookup' Galaxy tool
+    '''
+
+    def setUp(self):
+        self.ri_database = resource_filename(__name__, "data/RIDB_subset.txt")
+        self.nist_output = resource_filename(__name__, "data/NIST_tabular.txt")
+        self.ridb_poly_regress = resource_filename(__name__, "data/ridb_poly_regression.txt")
+        self.ridb_linear_regress = resource_filename(__name__, "data/ridb_linear_regression.txt")
+
+    def test_create_lookup_table(self):
+        '''
+        Tests the 'create_lookup_table' function
+        '''
+        column_type = 'Capillary'
+        polarity = 'Semi-standard non-polar'
+        lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)
+        self.assertFalse(False in [res[4] == 'Capillary' for res in lookup_dict['4177166']])
+        self.assertEqual(['C51276336', '2,6-Dimethyl-octa-1,7-dien-3,6-diol', 'C10H18O2',
+                          '1277', 'Capillary', 'Semi-standard non-polar', 'DB-5MS', '1',
+                          'C51276336_DB-5MS', '', '', ''], lookup_dict['51276336'][1])
+
+    def test_read_model(self):
+        '''
+        Tests reading the regression model data containing the parameters required for converting
+        retention indices between GC-columns
+        '''
+        model, _ = library_lookup._read_model(self.ridb_poly_regress)
+        # Order of values: coefficient 1 through 4, left limit, right limit
+        # Polynomial model
+        self.assertEqual([20.6155874639486, 0.945187096379008, 3.96480787567566e-05, -9.04377237159287e-09,
+                          628.0, 2944.0, 405.0, 0, 0.998685262365514], model['HP-5']['SE-54'])
+        self.assertEqual([-92.3963391356951, 1.26116176393346, -0.000191991657547972, 4.15387371263164e-08,
+                          494.0, 2198.0, 407.0, 0, 0.996665023122993], model['Apiezon L']['Squalane'])
+        # Linear model
+        model, _ = library_lookup._read_model(self.ridb_linear_regress)
+        self.assertEqual([2.81208738561543, 0.99482475526584, 628.0, 2944.0, 405.0, 0, 0.998643883946458],
+                         model['HP-5']['SE-54'])
+        self.assertEqual([19.979922768462, 0.993741869298272, 494.0, 2198.0, 407.0, 0, 0.99636062891041],
+                         model['Apiezon L']['Squalane'])
+
+    def test_apply_regression(self):
+        '''
+        Tests the regression model on some arbitrary retention indices
+        '''
+        poly_model, _ = library_lookup._read_model(self.ridb_poly_regress)
+        linear_model, _ = library_lookup._read_model(self.ridb_linear_regress)
+        retention_indices = [1000, 1010, 1020, 1030, 1040, 1050]
+        converted_poly = []
+        converted_linear = []
+        for ri in retention_indices:
+            converted_poly.append(library_lookup._apply_poly_regression('HP-5', 'DB-5', ri, poly_model))
+            converted_linear.append(library_lookup._apply_linear_regression('HP-5', 'DB-5', ri, linear_model))
+
+        self.assertEqual([1003.0566541860778, 1013.0979459524663, 1023.1358645806529, 1033.170466241159,
+                          1043.2018071045052, 1053.2299433412131], converted_poly)
+        self.assertEqual([1001.8127584915925, 1011.830140783027, 1021.8475230744615, 1031.864905365896,
+                          1041.8822876573306, 1051.899669948765], converted_linear)
+
+        # Test polynomial limit detection, the following RI falls outside of the possible limits
+        ri = 3400
+        converted_poly = library_lookup._apply_poly_regression('HP-5', 'DB-5', ri, poly_model)
+        self.assertEqual(False, converted_poly)
+
+    def test_preferred_hit(self):
+        [...]9125, 'Capillary',
+                    'Semi-standard non-polar', 'SE-52', '', 'C150867_SE-52', '', '', ''], 'SE-52')
+        self.assertEqual(expected, match)
+
+    def test_format_result(self):
+        '''
+        Tests the 'format_result' function
+        '''
+        column_type = 'Capillary'
+        polarity = 'Semi-standard non-polar'
+
+        # Look for DB-5
+        pref_column = ['DB-5']
+        model, method = library_lookup._read_model(self.ridb_poly_regress)
+        lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)
+        data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type,
+                                            polarity, model, method)  # False, None)
+
+        # remove non-hits from set:
+        data = _get_hits_only(data)
+        self.assertEqual(['C544354', 'Ethyl linoleate', 'C20H36O2', '2155', 'Capillary', 'Semi-standard non-polar',
+                          'DB-5', '1', 'C544354_DB-5', '1810', 'None', '', '', '0'], data[20])
+        self.assertEqual(111, len(data))
+
+        # Look for both DB-5 and HP-5
+        pref_column = ['DB-5', 'HP-5']
+        data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type,
+                                            polarity, False, None)
+        # remove non-hits from set:
+        data = _get_hits_only(data)
+        self.assertEqual(['C502614', '.beta.-(E)-Farnesene', 'C15H24', '1508', 'Capillary', 'Semi-standard non-polar',
+                          'DB-5', '1', 'C502614_DB-5', '942', 'None', '1482', '1522', '22'], data[50])
+        self.assertEqual(106, len(data))
+
+    def test_save_data(self):
+        '''
+        Tests the creation of the output tabular file
+        '''
+        temp_folder = tempfile.mkdtemp(prefix='gcms_combine_output_')
+        saved_data = '{0}/{1}'.format(temp_folder, 'output.tsv')
+        column_type = 'Capillary'
+        polarity = 'Semi-standard non-polar'
+        pref_column = ['DB-5']
+        lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)
+        data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type, polarity, False, None)
+        library_lookup._save_data(data, saved_data)
+        self.failUnless(os.path.exists(saved_data))
+        shutil.rmtree(temp_folder)
+
+    def test_match_library_get_lib_files(self):
+        '''
+        Tests the match_library.py functionality
+        '''
+        riqc_libs_dir = resource_filename(__name__, "../repositories")
+        get_library_files_output = match_library.get_directory_files(riqc_libs_dir)
+        self.assertEqual(4, len(get_library_files_output))
+        self.assertEqual("Library_RI_DB_capillary_columns-noDuplicates", get_library_files_output[0][0])
+        #TODO change assert below to assert that the result is a file, so the test can run on other dirs as well:
+        #self.assertEqual("E:\\workspace\\PRIMS-metabolomics\\python-tools\\tools\\GCMS\\test\\data\\riqc_libs\\RI DB library (capillary columns) Dec.2012.txt", get_library_files_output[0][1])
+        #self.assertEqual("RI DB library (capillary columns) Jan.2013", get_library_files_output[1][0])
+        try:
+            get_library_files_output = match_library.get_directory_files("/blah")
+            # should not come here
+            self.assertTrue(False)
+        except:
+            # should come here
+            self.assertTrue(True)
+
+
+def _get_hits_only(data):
+    '''
+    removes items that have RI == 0.0 and Name == '' (these are dummy lines just for the output)
+    '''
+    result = []
+    for item_idx in xrange(len(data)):
+        item = data[item_idx]
+        if item[1] != '' and item[3] > 0.0:
+            result.append(item)
+
+    return result
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()

diff -r 000000000000 -r 9d5f4f5f764b test/test_match_library.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_match_library.py	Thu Jan 16 13:10:00 2014 +0100
| @@ -0,0 +1,51 @@ +''' +Created on Mar 6, 2012 + +@author: marcelk +''' +from GCMS import match_library +import unittest +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 + + +class Test(unittest.TestCase): + ''' + Tests the 'match_library' Galaxy tool + ''' + nist_db = resource_filename(__name__, "data/RIDB_subset.txt") + + def test_get_column_type(self): + ''' + Tests the 'get_column_type' function that returns tuples of unique elements + for column types in the RI database + ''' + galaxy_output = match_library.get_column_type(self.nist_db) + self.assertEqual([('Capillary(9999)', 'Capillary', False)], galaxy_output) + + def test_filter_column(self): + ''' + Tests the 'filter_column' function showing the column phase for all 'Capillary' columns + ''' + galaxy_output = match_library.filter_column(self.nist_db, 'Capillary') + self.assertEqual([('Semi-standard non-polar(9999)', 'Semi-standard non-polar', False)], galaxy_output) + + def test_filter_column2(self): + ''' + Tests the 'filter_column' function showing all possibilities for columns having both the + 'Capillary' and 'Semi-standard non-polar' properties + ''' + galaxy_output = match_library.filter_column2(self.nist_db, 'Capillary', 'Semi-standard non-polar') + self.failUnless(('Apiezon M(6)', 'Apiezon M', False) in galaxy_output) + + def test_count_occurrence(self): + ''' + Tests the 'count_occurrence' function + ''' + test_list = [2, 0, 0, 2, 1, 3, 4, 5, 3, 2, 3, 4, 5, 5, 4, 2, 5, 3, 4, 3, 5, 4, 2, 0, 4] + counts = match_library.count_occurrence(test_list) + self.assertEqual({0: 3, 1: 1, 2: 5, 3: 5, 4: 6, 5: 5}, counts) + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() |