Mercurial > repos > lnguyen > proteore_goprofiles
changeset 0:6eeb2fb0c4bd draft default tip
planemo upload
author | lnguyen |
---|---|
date | Sat, 16 Sep 2017 09:17:07 -0400 |
parents | |
children | |
files | goprofiles.R goprofiles.xml test-data/UnipIDs.txt test-data/goprofile.BP.pdf test-data/goprofile.CC.pdf test-data/goprofile.MF.pdf |
diffstat | 6 files changed, 437 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/goprofiles.R Sat Sep 16 09:17:07 2017 -0400 @@ -0,0 +1,263 @@ +# Load necessary libraries +library("org.Hs.eg.db", quietly=TRUE) +library("goProfiles", quietly=TRUE) + +# Read file and return file content as data.frame? +readfile = function(filename, header) { + if (header == "true") { + # Read only the first two lines of the files as data (without headers): + headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE) + #print("header") + #print(headers) + # Create the headers names with the two (or more) first rows, sappy allows to make operations over the columns (in this case paste) - read more about sapply here : + #headers_names <- sapply(headers, paste, collapse = "_") + #print(headers_names) + #Read the data of the files (skipping the first 2 rows): + file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE) + #print(file[1,]) + #And assign the headers of step two to the data: + names(file) <- headers + } + else { + file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE) + } + return(file) +} + +#filename = "/Users/LinCun/Documents/ProteoRE/usecase1/Check/HPA.Selection.134.txt" +#test = readfile(filename) +#str(test) +#str(test$Gene.names) +getprofile = function(ids, id_type, level) { + + # Check if level is number + if (! as.numeric(level) %% 1 == 0) { + stop("Please enter an integer for level") + } + else { + level = as.numeric(level) + } + #genes = as.vector(file[,ncol]) + + # Extract Gene Entrez ID + if (id_type == "Entrez") { + id = select(org.Hs.eg.db, ids, "ENTREZID", multiVals = "first") + genes_ids = id$ENTREZID[which( ! is.na(id$ENTREZID))] + } + else { + genes_ids = c() + id = select(org.Hs.eg.db, ids, "ENTREZID", "UNIPROT", multiVals = "first") + #print(id[[1]][1]) + genes_ids = id$ENTREZID[which( ! is.na(id$ENTREZID))] + # IDs that have NA ENTREZID + NAs = id$UNIPROT[which(is.na(id$ENTREZID))] + print("IDs unable to convert to ENTREZID: ") + print(NAs) + } + #print(genes_ids) + # Convert Protein IDs into entrez ids + + # for (i in 1:length(id$UNIPROT)) { + # print(i) + # if (is.na(id[[2]][i])) { + # print(id[[2]][i]) + # } + # } + # a = id[which(id$ENTREZID == "NA"),] + # print(a) + # print(a$UNIPROT) + #print(id[[1]][which(is.na(id$ENTREZID))]) + #print(genes_ids) + # for (gene in genes) { + # #id = as.character(mget(gene, org.Hs.egALIAS2EG, ifnotfound = NA)) + # id = select(org.Hs.eg.db, genes, "ENTREZID", "UNIPROT") + # print(id) + # genes_ids = append(genes_ids, id$ENTREZID) + # } + #print(genes_ids) + + # Create basic profiles + profile.CC = basicProfile(genes_ids, onto='CC', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T) + profile.BP = basicProfile(genes_ids, onto='BP', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T) + profile.MF = basicProfile(genes_ids, onto='MF', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T) + profile.ALL = basicProfile(genes_ids, onto='ANY', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T) + + # Print profile + # printProfiles(profile) + + return(c(profile.CC, profile.MF, profile.BP, profile.ALL)) +} + +# Plot profiles to PNG +plotPNG = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) { + if (!is.null(profile.CC)) { + png("profile.CC.png") + plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title) + dev.off() + } + if (!is.null(profile.BP)) { + png("profile.BP.png") + plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title) + dev.off() + } + if (!is.null(profile.MF)) { + png("profile.MF.png") + plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title) + dev.off() + } + if (!is.null(profile.ALL)) { + png("profile.ALL.png") + plotProfiles(profile.ALL, percentage=per, multiplePlots=T, aTitle=title) + dev.off() + } +} + +# Plot profiles to JPEG +plotJPEG = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) { + if (!is.null(profile.CC)) { + jpeg("profile.CC.jpeg") + plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title) + dev.off() + } + if (!is.null(profile.BP)) { + jpeg("profile.BP.jpeg") + plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title) + dev.off() + } + if (!is.null(profile.MF)) { + jpeg("profile.MF.jpeg") + plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title) + dev.off() + } + if (!is.null(profile.ALL)) { + jpeg("profile.ALL.jpeg") + plotProfiles(profile.ALL, percentage=per, multiplePlots=FALSE, aTitle=title) + dev.off() + } +} + +# Plot profiles to PDF +plotPDF = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) { + if (!is.null(profile.CC)) { + pdf("profile.CC.pdf") + plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title) + dev.off() + } + if (!is.null(profile.BP)) { + pdf("profile.BP.pdf") + plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title) + dev.off() + } + if (!is.null(profile.MF)) { + pdf("profile.MF.pdf") + plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title) + dev.off() + } + if (!is.null(profile.ALL)) { + #print("all") + pdf("profile.ALL.pdf") + plotProfiles(profile.ALL, percentage=per, multiplePlots=FALSE, aTitle=title) + dev.off() + } +} + +goprofiles = function() { + args = commandArgs(trailingOnly = TRUE) + #print(args) + # arguments: filename.R inputfile ncol "CC,MF,BP,ALL" "PNG,JPEG,PDF" level "TRUE"(percentage) "Title" + if (length(args) != 8) { + stop("Not enough/Too many arguments", call. = FALSE) + } + else { + input_type = args[2] + if (input_type == "text") { + input = strsplit(args[1], "\\s+")[[1]] + } + else if (input_type == "file") { + filename = strsplit(args[1], ",")[[1]][1] + ncol = strsplit(args[1], ",")[[1]][2] + # Check ncol + if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { + stop("Please enter an integer for level") + } + else { + ncol = as.numeric(gsub("c", "", ncol)) + } + header = strsplit(args[1], ",")[[1]][3] + # Get file content + file = readfile(filename, header) + # Extract Protein IDs list + input = c() + for (row in as.character(file[,ncol])) { + input = c(input, strsplit(row, ";")[[1]][1]) + } + } + id_type = args[3] + ontoopt = strsplit(args[4], ",")[[1]] + #print(ontoopt) + #plotopt = strsplit(args[3], ",") + plotopt = args[5] + level = args[6] + per = as.logical(args[7]) + title = args[8] + + profiles = getprofile(input, id_type, level) + profile.CC = profiles[1] + #print(profile.CC) + profile.MF = profiles[2] + #print(profile.MF) + profile.BP = profiles[3] + #print(profile.BP) + profile.ALL = profiles[-3:-1] + #print(profile.ALL) + #c(profile.ALL, profile.CC, profile.MF, profile.BP) + if ("CC" %in% ontoopt) { + if (grepl("PNG", plotopt)) { + plotPNG(profile.CC=profile.CC, per=per, title=title) + } + if (grepl("JPEG", plotopt)) { + plotJPEG(profile.CC = profile.CC, per=per, title=title) + } + if (grepl("PDF", plotopt)) { + plotPDF(profile.CC = profile.CC, per=per, title=title) + } + } + if ("MF" %in% ontoopt) { + if (grepl("PNG", plotopt)) { + plotPNG(profile.MF = profile.MF, per=per, title=title) + } + if (grepl("JPEG", plotopt)) { + plotJPEG(profile.MF = profile.MF, per=per, title=title) + } + if (grepl("PDF", plotopt)) { + plotPDF(profile.MF = profile.MF, per=per, title=title) + } + } + if ("BP" %in% ontoopt) { + if (grepl("PNG", plotopt)) { + plotPNG(profile.BP = profile.BP, per=per, title=title) + } + if (grepl("JPEG", plotopt)) { + plotJPEG(profile.BP = profile.BP, per=per, title=title) + } + if (grepl("PDF", plotopt)) { + plotPDF(profile.BP = profile.BP, per=per, title=title) + } + } + + #if (grepl("PNG", plotopt)) { + # plotPNG(profile.ALL = profile.ALL, per=per, title=title) + #} + #if (grepl("JPEG", plotopt)) { + # plotJPEG(profile.ALL = profile.ALL, per=per, title=title) + #} + #if (grepl("PDF", plotopt)) { + # plotPDF(profile.ALL = profile.ALL, per=per, title=title) + #} + } + +} + +goprofiles() + +#Rscript go.R ../proteinGroups_Maud.txt "1" "CC" "PDF" 2 "TRUE" "Title"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/goprofiles.xml Sat Sep 16 09:17:07 2017 -0400 @@ -0,0 +1,149 @@ +<tool id="goProfiles" name="goProfilesP" version="0.1.0"> + <description> Identify enriched biological themes, GO terms from your protein list. + </description> + <requirements> + <!--requirement type="package" version="3.3.0">bioconductor-org.hs.eg.db</requirement> + <requirement type="package" version="1.38.0">goprofiles</requirement--> + </requirements> + <stdio> + <exit_code range="1:" /> + </stdio> + <command><![CDATA[ + Rscript $__tool_directory__/goprofiles.R + #if $input.ids == "text" + "$input.text" "text" + #else + "$input.file,$input.ncol,$input.header" "file" + #end if + + $input.id_type + + $onto_opt + + $opt.plot_opt + + $level + + $per + + "$title" + + ]]></command> + <inputs> + <conditional name="input" > + <param name="ids" type="select" label="Provide your Entrez Gene or UniProt identifiers" help="Copy/paste or ID list from a file (e.g. table)" > + <option value="text">Copy/paste your identifiers</option> + <option value="file">Input file containing your identifiers</option> + </param> + <when value="text" > + <param name="text" type="text" label="Copy/paste your identifiers" help='IDs must be separated by spaces into the form field, for example: P31946 P62258' > + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + <mapping initial="none"> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> + </param> + <param name="id_type" type="select" label="Please select the type of your IDs list" > + <option value="Entrez">Entrez Gene ID</option> + <option value="UniProt">UniProt protein ID</option> + </param> + </when> + <when value="file" > + <param name="file" type="data" format="txt,tabular" label="Choose a file that contains your list of IDs" help="" /> + <param name="ncol" type="text" label="The column number of IDs to use" help='For example, fill in "c1" if it is the first column, "c2" if it is the second column and so on' /> + <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your input file contain header?" /> + <param name="id_type" type="select" label="Please select the type of your IDs list" > + <option value="Entrez">Entrez Gene ID</option> + <option value="UniProt">UniProt protein ID</option> + </param> + </when> + + </conditional> + <param type="select" name="onto_opt" label="Please select GO terms category" multiple="True" display="checkboxes" > + <option value="CC">Cellular Component (CC)</option> + <option value="MF">Molecular Function (MF)</option> + <option value="BP">Biological Process (BP)</option> + </param> + <param type="select" name="level" label="Level of the ontology at which the profile has to be built (the higher this number, the deeper the GO level)" > + <option value="1">1</option> + <option value="2" selected="True">2</option> + <option value="3">3</option> + </param> + <param type="boolean" name="per" label="Plot absolute or relative frequencies (not summing to 100)" truevalue="TRUE" falsevalue="FALSE" /> + <param type="text" name="title" label="Enter title of your figure" /> + <section name="opt" title="Choose graphical output (bar plots) format: png, jpeg, pdf" expanded="False" help="By default, PDF is chosen as output format"> + <param type="select" name="plot_opt" label="Choose plot output extension" multiple="True" display="checkboxes" > + <option value="PNG">PNG</option> + <option value="JPEG">JPEG</option> + <option value="PDF" selected="True">PDF</option> + </param> + </section> + </inputs> + <outputs> + <collection type="list" label="GO Profile diagram outputs" name="output" > + <discover_datasets pattern="(?P<designation>.+\.png)" ext="png" /> + <discover_datasets pattern="(?P<designation>.+\.jpeg)" ext="jpg" /> + <discover_datasets pattern="(?P<designation>.+\.pdf)" ext="pdf" /> + </collection> + </outputs> + <tests> + <test> + <conditional name="input"> + <param name="ids" value="file" /> + <param name="file" value="UnipIDs.txt" /> + <param name="ncol" value="c1" /> + <param name="header" value="false" /> + </conditional> + <param name="onto_opt" value="CC,MF,BP" /> + <param name="level" value="2" /> + <param name="per" value="true" /> + <param name="title" value="Test" /> + <section name="opt" > + <param name="plot_opt" value="PDF" /> + </section> + <output_collection name="output" type="list" > + <element name="goprofile.BP.pdf" file="goprofile.BP.pdf" ftype="pdf" /> + <element name="goprofile.MF.pdf" file="goprofile.MF.pdf" ftype="pdf" /> + <element name="goprofile.CC.pdf" file="goprofile.CC.pdf" ftype="pdf" /> + </output_collection> + </test> + </tests> + <help><![CDATA[ +This tool, based on the goProfiles R package, performs statistical analysis of functional profiles. It is based on GO ontology and considers either a gene set ('Entrez’ Identifiers) or a protein set (Uniprot ID) as input. + +You can choose one or more GO categories: + +* Biological Process (BP) +* Cellular Component (CC) +* Molecular Function (MF) + +Functional profile at a given GO level is obtained by counting the number of identifiers having a hit in each category of this level (2 by default). Results are displayed as bar plots (with absolute or relative frequencies) and can be exported in pdf, png and jpeg formats. + +For more details about GoProfiles, please read: Salicrú et al. Comparison of lists of genes based on functional profiles. BMC Bioinformatics. 2011;12:401.(https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-401) + +If your type of identifiers is not supported (i.e. different form Uniprot and Entrez), please use the **ID Converter** component in the ProteoRE section to convert your list of IDs first. + +----- + +.. class:: infomark + +**Authors** + +Sanchez A, Ocana J and Salicru M (2016). goProfiles: goProfiles: an R package for the statistical analysis of functional profiles. R package version 1.38.0. + +.. class:: infomark + +**Galaxy integration** + +T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR +Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit,Migale Bioinformatics platform, + +Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool. + + ]]></help> + <citations> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/UnipIDs.txt Sat Sep 16 09:17:07 2017 -0400 @@ -0,0 +1,25 @@ +P04637 +P08246 +P63244 +P10275 +P00533 +Q14524 +P05067 +P35555 +P35222 +O95273 +P00451 +P38398 +Q05086 +Q12802 +P68871 +P04585 +Q96EB6 +Q9NYL2 +P31749 +P01137 +Q5S007 +Q08379 +P02649 +P35498 +P12931