Mercurial > repos > galaxyp > mt2mq
diff MT2MQ.R @ 2:9c8e7137d331 draft
"planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
author | galaxyp |
---|---|
date | Wed, 12 Aug 2020 17:36:53 -0400 |
parents | e50ec3a9a3f9 |
children |
line wrap: on
line diff
--- a/MT2MQ.R Fri Jun 26 11:15:17 2020 -0400 +++ b/MT2MQ.R Wed Aug 12 17:36:53 2020 -0400 @@ -2,10 +2,10 @@ # Load libraries suppressPackageStartupMessages(library(tidyverse)) -#default_locale() +suppressPackageStartupMessages(library(taxize)) # Set parameters from arguments -args = commandArgs(trailingOnly = TRUE) +args <- commandArgs(trailingOnly = TRUE) data <- args[1] # data: full path to file or directory: # - if in functional or f-t mode, should be a tsv file of HUMAnN2 gene families, after regrouping and renaming to GO, joining samples, and renormalizing to CPM. @@ -18,49 +18,67 @@ ontology <- unlist(strsplit(args[3], split = ",")) # ontology: only for function or f-t mode. A string of the GO namespace(s) to include, separated by commas. # ex: to include all: "molecular_function,biological_process,cellular_component" -outfile <- args[4] - # outfile: full path with pathname and extension for output + +int_file <- args[4] + # int_file: full path and file name and extension to write intensity file + +func_file <- args[5] + # func_file: full path and file name and extension to write func file + +tax_file <- args[6] + # tax_file: full path and file name and extension to write tax file + # Functional mode -if (mode == "f"){ - out <- read.delim(file=data, header=TRUE, sep='\t') %>% - filter(!grepl(".+g__.+",X..Gene.Family)) %>% - separate(col=X..Gene.Family, into=c("id", "Extra"), sep=": ", fill="left") %>% - separate(col=Extra, into = c("namespace", "name"), sep = " ", fill="left", extra="merge") %>% - mutate(namespace = if_else(namespace == "[MF]", true = "molecular_function", false = if_else(namespace == "[BP]", true = "biological_process", false = "cellular_component"))) %>% - filter(namespace %in% ontology) %>% +if (mode == "f") { + int <- read.delim(file = data, header = TRUE, sep = "\t") %>% + filter(!grepl(".+g__.+", X..Gene.Family)) %>% + separate(col = X..Gene.Family, into = c("id", "Extra"), sep = ": ", fill = "left") %>% + separate(col = Extra, into = c("namespace", "name"), sep = " ", fill = "left", extra = "merge") %>% + mutate(namespace = if_else(namespace == "[MF]", true = "molecular_function", false = if_else(namespace == "[BP]", true = "biological_process", false = "cellular_component"))) %>% + filter(namespace %in% ontology) %>% select(id, name, namespace, 4:ncol(.)) + func <- int %>% + select(id) %>% + mutate(gos = id) + write.table(x = int, file = int_file, quote = FALSE, sep = "\t", row.names = FALSE) + write.table(x = func, file = func_file, quote = FALSE, sep = "\t", row.names = FALSE) } # Taxonomic mode -if (mode == "t"){ +if (mode == "t") { files <- dir(path = data) - out <- tibble(filename = files) %>% - mutate(file_contents= map(filename, ~read.delim(file=file.path(data, .), header=TRUE, sep = "\t"))) %>% - unnest(cols = c(file_contents)) %>% - rename(sample = filename) %>% - separate(col = sample, into = c("sample",NA), sep=".tsv") %>% - pivot_wider(names_from = sample, values_from = abundance) %>% - mutate(rank = "genus") %>% - rename(name = genus) %>% - mutate(id = row_number(name)) %>% # filler for taxon id but should eventually find a way to get id from ncbi database + int <- tibble(filename = files) %>% + mutate(file_contents = map(filename, ~read.delim(file = file.path(data, .), header = TRUE, sep = "\t"))) %>% + unnest(cols = c(file_contents)) %>% + rename(sample = filename) %>% + separate(col = sample, into = c("sample", NA), sep = ".tsv") %>% + pivot_wider(names_from = sample, values_from = abundance) %>% + mutate(rank = "genus") %>% + rename(name = genus) %>% + mutate(name = as.character(name)) %>% + mutate(id = get_uid(name, key = NULL, messages = FALSE)) %>% select(id, name, rank, 2:ncol(.)) + tax <- int %>% + select(id) %>% + mutate(tax = id) + write.table(x = int, file = int_file, quote = FALSE, sep = "\t", row.names = FALSE) + write.table(x = tax, file = tax_file, quote = FALSE, sep = "\t", row.names = FALSE) } # Function-taxonomy mode -if (mode == "ft"){ - out <- read.delim(file=data, header=TRUE, sep='\t') %>% - filter(grepl(".+g__.+",X..Gene.Family)) %>% - separate(col=X..Gene.Family, into=c("id", "Extra"), sep=": ", fill="left") %>% - separate(col=Extra, into = c("namespace", "name"), sep = " ", fill="left", extra="merge") %>% - separate(col = name, into = c("name", "taxa"), sep="\\|", extra = "merge") %>% - separate(col = taxa, into = c("Extra", "genus", "species"), sep = "__") %>% select(-"Extra") %>% - mutate_if(is.character, str_replace_all, pattern = "\\.s", replacement = "") %>% - mutate_at(c("species"), str_replace_all, pattern = "_", replacement = " ") %>% - mutate(namespace = if_else(namespace == "[MF]", true = "molecular_function", false = if_else(namespace == "[BP]", true = "biological_process", false = "cellular_component"))) %>% - filter(namespace %in% ontology) %>% +if (mode == "ft") { + ft <- read.delim(file = data, header = TRUE, sep = "\t") %>% + filter(grepl(".+g__.+", X..Gene.Family)) %>% + separate(col = X..Gene.Family, into = c("id", "Extra"), sep = ": ", fill = "left") %>% + separate(col = Extra, into = c("namespace", "name"), sep = " ", fill = "left", extra = "merge") %>% + separate(col = name, into = c("name", "taxa"), sep = "\\|", extra = "merge") %>% + separate(col = taxa, into = c("Extra", "genus", "species"), sep = "__") %>% + select(-"Extra") %>% + mutate_if(is.character, str_replace_all, pattern = "\\.s", replacement = "") %>% + mutate_at(c("species"), str_replace_all, pattern = "_", replacement = " ") %>% + mutate(namespace = if_else(namespace == "[MF]", true = "molecular_function", false = if_else(namespace == "[BP]", true = "biological_process", false = "cellular_component"))) %>% + filter(namespace %in% ontology) %>% select(id, name, namespace, 4:ncol(.)) + write.table(x = ft, file = int_file, quote = FALSE, sep = "\t", row.names = FALSE) } - -# Write file -write.table(x = out, file = outfile, quote = FALSE, sep = "\t", row.names = FALSE)