Repository 'h_clust'
hg clone

Changeset 0:dc678d2c1976 (2018-01-18)
Commit message:
planemo upload commit a2411926bebc2ca3bb31215899a9f18a67e59556
diff -r 000000000000 -r dc678d2c1976 LICENSE
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE Thu Jan 18 07:56:33 2018 -0500
diff -r 000000000000 -r dc678d2c1976 h_clust.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/h_clust.R Thu Jan 18 07:56:33 2018 -0500
b'@@ -0,0 +1,391 @@\n+# R Script producing a hierarchical clustering\n+# Input : a file containing a table with numeric values\n+#\t  except for the first column containing sample names\n+#\t  and the first line containing variable names\n+#\t  separator expected is <TAB>\n+#\n+# Clustering method :\n+#\t  euclidean, correlation, ...\n+#\n+# Ouptut : a file containing the image of the clustering\n+#-----------------------------------------------------------------\n+# Authors : sophie.lamarre(at)\n+#\t    ignacio.gonzalez(at)\n+#\t    luc.jouneau(at)\n+#\t    valentin.marcon(at)\n+# Version : 0.9\n+# Date    : 13/3/2017\n+\n+# The function -------------------------------------\n+#---------------------------------------------------\n+h_clust <- function(input_file,\n+                      group_member_file = NULL,\n+                      output_file = "out/myplot",\n+                      log_file = "log/H_Clust.html",\n+                      format_image_out = "png",\n+                      distance_method = "euclidean",\n+                      agglomeration_method = "ward",\n+                      column_clustering = TRUE,\n+                      select = NULL,\n+                      plot_title = "",\n+                      xlab = "",\n+                      ylab = "Height",\n+                      width = 7,\n+                      height = 7,\n+                      ppi = 300,\n+\t\t      na_encoding="NA"\n+) {\n+\n+  # This function allows to generate hierarchical cluster analysis on a table  according to differents parameters.\n+  # It needs a dataset : the table of data and optionally a group_member data to set colored labels.\n+  # It generates a clustering tree graphic from hierarchical clustering.\n+  #\n+  # Parameters :\n+  # - input_file : input_file name\n+  # - group_member_file : input sample/tag group_member_file name\n+  # - output_file : output_file name\n+  # - log_file : log file name\n+  # - format_image_out : graphic format of the output_file. 
This must be one of "png", "jpeg", "tiff", "pdf"\n+  # - distance_method : the distance measure to be used. This must be one of "euclidean", "correlation", "maximum", "manhattan", "canberra", "binary" or "minkowski"\n+  # - agglomeration_method : the agglomeration_method to be used. This should be one of "ward", "single", "complete", "average", "mcquitty", "median" or "centroid"\n+  # - column_clustering : if TRUE clustering is performed on the columns\n+  # - select : number of top variables to use for clustering, selected by highest row variance. If NULL all the variables are selected\n+  # - plot_title : an overall title for the plot\n+  # - xlab : a title for the x axis\n+  # - ylab : a title for the y axis\n+  # - width : the width of the graphics region in inches\n+  # - height : the height of the graphics region in inches\n+  # - ppi : the nominal resolution in ppi\n+  # - na_encoding : label used to indicate missing values\n+  \n+  library(RColorBrewer)\n+  \n+  #---------------------------------------------------\n+  # Auxiliary function\n+  #---------------------------------------------------\n+  insert.blank = function(x) {paste(strsplit(x, "@$\xc2\xa7", fixed = TRUE)[[1]], collapse = " ")}\n+  \n+  #---------------------------------------------------\n+  # Titles \n+  #---------------------------------------------------\n+  plot_title = insert.blank(plot_title)\n+  xlab = insert.blank(xlab)\n+  ylab = insert.blank(ylab)\n+  \n+  #---------------------------------------------------\n+  # Read and verify data\n+  #---------------------------------------------------\n+  #1\xb0) Checks valid for all modules\n+  if (column_clustering) {\n+\tvariable_in_line=1\n+  \tcolumn_use="individual"\n+  \tline_use="variable"\n+  } else {\n+\tvariable_in_line=0\n+  \tline_use="individual"\n+  \tcolumn_use="variable"\n+  }\n+\t\n+  log_error=function(message="") {\n+  \t\tcat("<HTML><HEAD><TITLE>Hierarchical clustering 
report</TITLE></HEAD><BODY>\\n",file=log_file,append=F,sep="")\n+  \t\tcat("&#9888 An error occurred while trying to read your ta'..b'#########################################################\n+  # Treatment successfull\n+  ##########################################################\n+  cat("<HTML><HEAD><TITLE>Hierarchical clustering report</TITLE></HEAD><BODY>\\n",file=log_file,append=F,sep="")\n+  cat("&#10003; Your clustering process is successfull !<BR>",file=log_file,append=T,sep="")\n+  cat("</BODY></HTML>\\n",file=log_file,append=T,sep="")\n+   \n+  q(save="no",status=0)\n+\n+} # end of function\n+\n+#### Test clustering ####\n+#LJO : 13/3/2017\n+#setwd("H:/INRA/cati/groupe stats/Galaxy/hclust")\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon1")\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon2",column_clustering=FALSE,\n+#         xlab="Competitors",ylab="Distance")\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon3",column_clustering=FALSE,\n+#         select=5)\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon4",column_clustering=FALSE,\n+#         distance_method="correlation",agglomeration_method="average",\n+#         format_image_out="tiff",ppi=100,width=3,height=3\n+#)\n+##Group : competitors\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon5",column_clustering=FALSE,\n+#         xlab="Competitors",ylab="Distance",group_member_file="in/competitors_groups - 1 column.txt")\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon6",column_clustering=FALSE,\n+#         xlab="Competitors",ylab="Distance",group_member_file="in/competitors_groups - 2 columns.txt")\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon7",column_clustering=FALSE,\n+#         
xlab="Competitors",ylab="Distance",group_member_file="in/competitors_groups - 1 column with header.txt")\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon8",column_clustering=FALSE,\n+#         xlab="Competitors",ylab="Distance",group_member_file="in/competitors_groups - 2 columns with header.txt")\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon8",column_clustering=FALSE,\n+#         xlab="Competitors",ylab="Distance",group_member_file="in/competitors_groups - 2 columns - with error.txt")\n+#\n+##Group : competitions\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon9",column_clustering=TRUE,\n+#         xlab="Competitions",ylab="Distance",group_member_file="in/competitions_groups - 1 column.txt")\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon10",column_clustering=TRUE,\n+#         xlab="Competitions",ylab="Distance",group_member_file="in/competitions_groups - 2 columns.txt")\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon11",column_clustering=TRUE,\n+#         xlab="Competitions",ylab="Distance",group_member_file="in/competitions_groups - 1 column with header.txt")\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon12",column_clustering=TRUE,\n+#         xlab="Competitions",ylab="Distance",group_member_file="in/competitions_groups - 2 columns with header.txt")\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon13",column_clustering=TRUE,\n+#         xlab="Competitions",ylab="Distance",group_member_file="in/competitions_groups - 2 columns - with error.txt")\n+#\n+##Missing values\n+#h_clust(input_file="in/decathlon - with NA.txt",plot_title="declathlon",output_file="out/decathlon14",column_clustering=TRUE,\n+#         
xlab="Competitions",ylab="Distance",na_encoding="missing_value")\n+#\n+##Top 5 competitions\n+#h_clust(input_file="in/decathlon.txt",plot_title="declathlon",output_file="out/decathlon15",column_clustering=TRUE,\n+#         xlab="Competitions",ylab="Distance",select=5)\n'
diff -r 000000000000 -r dc678d2c1976 h_clust.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/h_clust.xml Thu Jan 18 07:56:33 2018 -0500
b'@@ -0,0 +1,280 @@\n+<!--# Copyright (C) 2017 INRA\n+# This program is free software: you can redistribute it and/or modify\n+# it under the terms of the GNU General Public License as published by\n+# the Free Software Foundation, either version 3 of the License, or\n+# (at your option) any later version.\n+#\n+# This program is distributed in the hope that it will be useful,\n+# but WITHOUT ANY WARRANTY; without even the implied warranty of\n+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n+# GNU General Public License for more details.\n+# \n+# You should have received a copy of the GNU General Public License\n+# along with this program.  If not, see\n+#-->\n+\n+<tool id="h_clust" name="Hierarchical clustering" version="1.0.0">\n+    <description>Generate hierarchical cluster analysis on a numeric data table</description>\n+    <requirements>\n+        <requirement type="package">R</requirement>\n+        <requirement type="package">r-rcolorbrewer</requirement>\n+        <requirement type="package">r-batch</requirement>\n+    </requirements>\n+    <stdio>\n+        <!-- Anything other than zero is an error -->\n+        <exit_code range="1:" level="fatal"/>\n+        <exit_code range=":-1" level="fatal"/>\n+    </stdio>\n+    <command interpreter="Rscript"><![CDATA[\n+        h_clust_galaxy.R\n+        input_file \'${input_file}\'\n+        #if $conditional.selector == "YES":\n+          group_member_file \'${conditional.group_member_file}\'\n+        #else\n+          group_member_file \'${conditional.selector}\'\n+        #end if\n+        output_file \'${output_file}\'\n+        log_file \'${log_file}\'\n+        distance_method \'${distance_method}\'\n+        agglomeration_method \'${agglomeration_method}\'\n+        column_clustering \'${sample_clustering}\'\n+        select \'${select}\'\n+        plot_title \'${plot_title}\'\n+        xlab \'${xlab}\'\n+        ylab \'${ylab}\'\n+        #if $moreoption.more == "YES":\n+          width 
\'${moreoption.width}\'\n+          height \'${moreoption.height}\'\n+          ppi \'${moreoption.ppi}\'\n+          format_image_out \'${moreoption.format_image_out}\'\n+        #end if\n+        #if $moreoption.more == "NO":\n+          width "7"\n+          height "7"\n+          ppi "300"\n+          format_image_out "png"\n+        #end if\n+        na_encoding \'${NA_code}\'\n+    ]]></command>\n+    <inputs>\n+        <param format="csv,tabular" name="input_file" type="data" label="Input file" help="Data file on which clustering will be performed" />\n+        <conditional name="conditional">\n+            <param name="selector" type="select" label="Do you have an input variable/individual group member file ?">\n+                <option value="NO">No</option>\n+                <option value="YES">Yes</option>\n+            </param>\n+            <when value="YES">\n+                <param format="csv, txt" name="group_member_file" type="data" label="Input variable/individual group member file" />\n+            </when>\n+            <when value="NO" />\n+        </conditional>\n+        <param name="distance_method" type="select" label="The distance measure to be used (one choice mandatory)">\n+            <option value="euclidean">euclidean</option>\n+            <option value="correlation">correlation</option>\n+            <option value="maximum">maximum</option>\n+            <validator type="empty_field" message="Please choose, at least, one distance measure to be used." />\n+        </param>\n+        <param name="agglomeration_method" type="select" label="The agglomeration method to be used (one choice mandatory)">\n+            <option value="ward">ward</option>            <option value="single">single</option><option value="complete">complete</option><option value="average">average</option><validator type="empty_field" message="Please choose, at least, one agglomeration method to be used." 
/></param>\n+        <param name="sample_clustering" type="select" label="Clustering is performed on:" help="(default columns)">\n+            <opti'..b'ice mandatory)\n+        | This must be one of "euclidean", "correlation" or "maximum". The distance measure mostly used is "euclidean" and "correlation". \n+        |\n+\n+The agglomeration method to be used (one choice mandatory)\n+        | This should be one of "ward", "single", "complete" or "average". The distance measure mostly used is "ward".\n+        |\n+\n+Clustering is performed on the columns\n+        | if YES clustering is performed on the columns. if NO clustering is performed on the lines.\n+        |\n+\n+Number of top elements to use for clustering\n+        | Number of columns (or lines) to take into account for the clustering. The top elements will be selected by variance (top 10 elements with the biggest variance). If NULL all the elements will be used to perform clustering. For NGS or microarray data, this allows to perform a clustering on the top n most variables genes.\n+        |\n+\n+Label used for Missing values\n+        | Missing value coding characters\n+        |\n+\n+An overall title for the plot\n+        | enter a title for the plot\n+        |\n+\n+A title for the x axis\n+        | enter a title for the x axis\n+        |\n+\n+A title for the y axis\n+        | enter a title for the y axis\n+        |\n+\n+The width of the graphics region in inches\n+        | enter a number\n+        |\n+\n+The height of the graphics region in inches\n+        | enter a number\n+        |\n+\n+The nominal resolution in ppi\n+        | enter a number (a higher number means a high resolution which can take times to open)\n+        |\n+\n+Graphic format of the output file\n+        | choose a format between "png", "jpeg", "tiff" and "pdf"\n+        |\n+\n+------------\n+Output files\n+------------\n+\n+Hierarchical_clustering_report\n+        |\n+\n+Hierarchical_clustering_log\n+        
|\n+\n+------\n+\n+**Authors** Luc Jouneau (, Sarah Maman ( and Valentin Marcon (\n+\n+Contact : \n+\n+E-learning available : Not yet.\n+\n+.. class:: infomark\n+\n+-------------\n+Please cite :\n+-------------\n+\n+- (Depending on the help provided you can cite us in acknowledgements, references or both.)\n+    \n+Acknowledgements\n+        | We wish to thank SIGENAE group and the statistical CATI BIOS4Biol group : Ignacio Gonzalez, Sophie Lamarre, Sarah Maman, Luc Jouneau, Christophe Klopp\n+        | Re-packaging was provided by Valentin Marcon (INRA, Migale platform, as part of the IFB project \'Galaxy For Life Science\' (\n+        | \n+\n+References\n+        | SIGENAE []\n+        |\n+ \n+    ]]></help>\n+    <citations>\n+        <citation type="bibtex">\n+        @article {Love002832,\n+\tauthor = {Love, Michael I and Huber, Wolfgang and Anders, Simon},\n+\ttitle = {Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2},\n+\tyear = {2014},\n+\tdoi = {10.1101/002832},\n+\tpublisher = {Cold Spring Harbor Laboratory},\n+\tabstract = {In comparative high-throughput sequencing assays, a fundamental task is the analysis of count data, such as read counts per gene in RNA-seq data, for evidence of systematic changes across experimental conditions. Small replicate numbers, discreteness, large dynamic range and the presence of outliers require a suitable statistical approach. We present DESeq2, a method for differential analysis of count data. DESeq2 uses shrinkage estimation for dispersions and fold changes to improve stability and interpretability of the estimates. This enables a more quantitative analysis focused on the strength rather than the mere presence of differential expression and facilitates downstream tasks such as gene ranking and visualization. DESeq2 is available as an R/Bioconductor package.},\n+\tURL = {},\n+\teprint = {},\n+\tjournal = {bioRxiv}}\n+        </citation>\n+    </citations>\n+</tool>\n'
diff -r 000000000000 -r dc678d2c1976 h_clust_galaxy.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/h_clust_galaxy.R Thu Jan 18 07:56:33 2018 -0500
@@ -0,0 +1,79 @@
+#!/usr/local/bioinfo/bin/Rscript --vanilla --slave --no-site-file
+# R Script producing a hierarchical clustering
+# Input : a file containing a table with numeric values
+#         except for the first column containing sample names
+#         and the first line containing variable names
+#         separator expected is <TAB>
+# Clustering method :
+#         euclidean, correlation, ...
+# Output : a file containing the image of the clustering
+# Authors : sophie.lamarre(at)
+#           ignacio.gonzalez(at)
+#           luc.jouneau(at)
+#           valentin.marcon(at)
+# Version : 0.9
+# Date    : 06/09/2017
+## Options
+strAsFacL <- options()$stringsAsFactors
+options(stringsAsFactors = FALSE)
+## Libraries loading
+# For parseCommandArgs function
+# R script call
+source_local <- function(fname)
+ argv <- commandArgs(trailingOnly = FALSE)
+ base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))
+ source(paste(base_dir, fname, sep="/"))
+#Import the different functions used for the hierarchical clustering
+## Reading parameters
+argLs <- parseCommandArgs(evaluate=FALSE)
+if (group_member_file=="NO"){
+    group_member_file<-NULL
+if (select=="NULL"){
+    select<-NULL
+if (column_clustering=="TRUE"){
+    column_clustering<-TRUE
+} else {
+    column_clustering<-FALSE
+     group_member_file=group_member_file,
+     output_file=argLs[["output_file"]],
+     log_file=argLs[["log_file"]],
+     format_image_out=argLs[["format_image_out"]],
+     distance_method=argLs[["distance_method"]],
+     agglomeration_method=argLs[["agglomeration_method"]],
+     column_clustering=column_clustering,
+     select=select,
+     plot_title=argLs[["plot_title"]],
+     xlab=argLs[["xlab"]],
+     ylab=argLs[["ylab"]],
+     width=argLs[["width"]],
+     height=argLs[["height"]],
+     ppi=argLs[["ppi"]],
+     na_encoding=argLs[["NA_code"]])
diff -r 000000000000 -r dc678d2c1976 test-data/decathlon.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/decathlon.tsv Thu Jan 18 07:56:33 2018 -0500
@@ -0,0 +1,42 @@
+"name" "100m" "Long.jump" "Shot.put" "High.jump" "400m" "110m.hurdle" "Discus" "Pole.vault" "Javeline" "1500m"
+"SEBRLE" 11.04 7.58 14.83 2.07 49.81 14.69 43.75 5.02 63.19 291.7
+"CLAY" 10.76 7.4 14.26 1.86 49.37 14.05 50.72 4.92 60.15 301.5
+"KARPOV" 11.02 7.3 14.77 2.04 48.37 14.09 48.95 4.92 50.31 300.2
+"BERNARD" 11.02 7.23 14.25 1.92 48.93 14.99 40.87 5.32 62.77 280.1
+"YURKOV" 11.34 7.09 15.19 2.1 50.42 15.31 46.26 4.72 63.44 276.4
+"WARNERS" 11.11 7.6 14.31 1.98 48.68 14.23 41.1 4.92 51.77 278.1
+"ZSIVOCZKY" 11.13 7.3 13.48 2.01 48.62 14.17 45.67 4.42 55.37 268
+"McMULLEN" 10.83 7.31 13.76 2.13 49.91 14.38 44.41 4.42 56.37 285.1
+"MARTINEAU" 11.64 6.81 14.57 1.95 50.14 14.93 47.6 4.92 52.33 262.1
+"HERNU" 11.37 7.56 14.41 1.86 51.1 15.06 44.99 4.82 57.19 285.1
+"BARRAS" 11.33 6.97 14.09 1.95 49.48 14.48 42.1 4.72 55.4 282
+"NOOL" 11.33 7.27 12.68 1.98 49.2 15.29 37.92 4.62 57.44 266.6
+"BOURGUIGNON" 11.36 6.8 13.46 1.86 51.16 15.67 40.49 5.02 54.68 291.7
+"Sebrle" 10.85 7.84 16.36 2.12 48.36 14.05 48.72 5 70.52 280.01
+"Clay" 10.44 7.96 15.23 2.06 49.19 14.13 50.11 4.9 69.71 282
+"Karpov" 10.5 7.81 15.93 2.09 46.81 13.97 51.65 4.6 55.54 278.11
+"Macey" 10.89 7.47 15.73 2.15 48.97 14.56 48.34 4.4 58.46 265.42
+"Warners" 10.62 7.74 14.48 1.97 47.97 14.01 43.73 4.9 55.39 278.05
+"Zsivoczky" 10.91 7.14 15.31 2.12 49.4 14.95 45.62 4.7 63.45 269.54
+"Hernu" 10.97 7.19 14.65 2.03 48.73 14.25 44.72 4.8 57.76 264.35
+"Nool" 10.8 7.53 14.26 1.88 48.81 14.8 42.05 5.4 61.33 276.33
+"Bernard" 10.69 7.48 14.8 2.12 49.13 14.17 44.75 4.4 55.27 276.31
+"Schwarzl" 10.98 7.49 14.01 1.94 49.76 14.25 42.43 5.1 56.32 273.56
+"Pogorelov" 10.95 7.31 15.1 2.06 50.79 14.21 44.6 5 53.45 287.63
+"Schoenbeck" 10.9 7.3 14.77 1.88 50.3 14.34 44.41 5 60.89 278.82
+"Barras" 11.14 6.99 14.91 1.94 49.41 14.37 44.83 4.6 64.55 267.09
+"Smith" 10.85 6.81 15.24 1.91 49.27 14.01 49.02 4.2 61.52 272.74
+"Averyanov" 10.55 7.34 14.44 1.94 49.72 14.39 39.88 4.8 54.51 271.02
+"Ojaniemi" 10.68 7.5 14.97 1.94 49.12 15.01 40.35 4.6 59.26 275.71
+"Smirnov" 10.89 7.07 13.88 1.94 49.11 14.77 42.47 4.7 60.88 263.31
+"Qi" 11.06 7.34 13.55 1.97 49.65 14.78 45.13 4.5 60.79 272.63
+"Drews" 10.87 7.38 13.07 1.88 48.51 14.01 40.11 5 51.53 274.21
+"Parkhomenko" 11.14 6.61 15.69 2.03 51.04 14.88 41.9 4.8 65.82 277.94
+"Terek" 10.92 6.94 15.15 1.94 49.56 15.12 45.62 5.3 50.62 290.36
+"Gomez" 11.08 7.26 14.57 1.85 48.61 14.41 40.95 4.4 60.71 269.7
+"Turi" 11.08 6.91 13.62 2.03 51.67 14.26 39.83 4.8 59.34 290.01
+"Lorenzo" 11.1 7.03 13.22 1.85 49.34 15.38 40.22 4.5 58.36 263.08
+"Karlivans" 11.33 7.26 13.3 1.97 50.54 14.98 43.34 4.5 52.92 278.67
+"Korkizoglou" 10.86 7.07 14.81 1.94 51.16 14.96 46.07 4.7 53.05 317
+"Uldal" 11.23 6.99 13.53 1.85 50.95 15.09 43.01 4.5 60 281.7
+"Casarsa" 11.36 6.68 14.92 1.94 53.2 15.39 48.66 4.4 58.62 296.12
diff -r 000000000000 -r dc678d2c1976 test-data/log_file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/log_file Thu Jan 18 07:56:33 2018 -0500
@@ -0,0 +1,2 @@
+<HTML><HEAD><TITLE>Hierarchical clustering report</TITLE></HEAD><BODY>
+&#10003; Your clustering process is successfull !<BR></BODY></HTML>
diff -r 000000000000 -r dc678d2c1976 test-data/output_file
