Mercurial > repos > devteam > dwt_var_perclass
changeset 1:781e68074f84 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/dwt_var_perclass commit f929353ffb0623f2218d7dec459c7da62f3b0d24"
author | devteam |
---|---|
date | Mon, 06 Jul 2020 20:34:10 -0400 |
parents | cb422b6f49d2 |
children | |
files | execute_dwt_var_perClass.R execute_dwt_var_perClass.pl execute_dwt_var_perClass.xml test-data/in.tsv test-data/out.pdf |
diffstat | 5 files changed, 258 insertions(+), 331 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/execute_dwt_var_perClass.R Mon Jul 06 20:34:10 2020 -0400 @@ -0,0 +1,212 @@ +###################################################################### +## plot power spectra, i.e. wavelet variance by class +## add code to create null bands by permuting the original data series +## get class of maximum significant variance per feature +## generate plots and table matrix of variance including p-values +###################################################################### +library("wavethresh"); +library("waveslim"); + +options(echo = FALSE) + +## normalize data +norm <- function(data) { + v <- (data - mean(data)) / sd(data); + if (sum(is.na(v)) >= 1) { + v <- data; + } + return(v); +} + +dwt_var_permut_get_max <- function(data, names, outfile, filter = 4, bc = "symmetric", method = "kendall", wf = "haar", boundary = "reflection") { + max_var <- NULL; + matrix <- NULL; + title <- NULL; + final_pvalue <- NULL; + short_levels <- NULL; + scale <- NULL; + + print(names); + + par(mfcol = c(length(names), length(names)), mar = c(0, 0, 0, 0), oma = c(4, 3, 3, 2), xaxt = "s", cex = 1, las = 1); + + short_levels <- wavethresh::wd(data[, 1], filter.number = filter, bc = bc)$nlevels; + + title <- c("motif"); + for (i in seq_len(short_levels)) { + title <- c(title, paste(i, "var", sep = "_"), paste(i, "pval", sep = "_"), paste(i, "test", sep = "_")); + } + print(title); + + ## normalize the raw data + data <- apply(data, 2, norm); + + for (i in seq_len(length(names))) { + for (j in seq_len(length(names))) { + temp <- NULL; + results <- NULL; + wave1_dwt <- NULL; + out <- NULL; + + out <- vector(length = length(title)); + temp <- vector(length = short_levels); + + if (i != j) { + plot(temp, type = "n", axes = FALSE, xlab = NA, ylab = NA); + box(col = "grey"); + grid(ny = 0, nx = NULL); + } else { + + wave1_dwt <- waveslim::dwt(data[, i], wf = wf, short_levels, boundary = boundary); + + temp_row <- (short_levels + 1) * -1; + temp_col <- 1; + temp <- waveslim::wave.variance(wave1_dwt)[temp_row, temp_col]; + + ##permutations code : + feature1 <- NULL; + null <- NULL; + var_25 <- NULL; + var_975 <- NULL; + med <- NULL; + + feature1 <- data[, i]; + for (k in seq_len(1000)) { + nk_1 <- NULL; + null_levels <- NULL; + var <- NULL; + null_wave1 <- NULL; + + nk_1 <- sample(feature1, length(feature1), replace = FALSE); + null_levels <- wavethresh::wd(nk_1, filter.number = filter, bc = bc)$nlevels; + var <- vector(length = length(null_levels)); + null_wave1 <- waveslim::dwt(nk_1, wf = wf, short_levels, boundary = boundary); + var <- waveslim::wave.variance(null_wave1)[-8, 1]; + null <- rbind(null, var); + } + null <- apply(null, 2, sort, na.last = TRUE); + var_25 <- null[25, ]; + var_975 <- null[975, ]; + med <- (apply(null, 2, median, na.rm = TRUE)); + + ## plot + results <- cbind(temp, var_25, var_975); + matplot(results, type = "b", pch = "*", lty = 1, col = c(1, 2, 2), axes = F); + + ## get pvalues by comparison to null distribution + out <- (names[i]); + for (m in seq_len(length(temp))) { + print(paste("scale", m, sep = " ")); + print(paste("var", temp[m], sep = " ")); + print(paste("med", med[m], sep = " ")); + pv <- NULL; + tail <- NULL; + out <- c(out, format(temp[m], digits = 3)); + if (temp[m] >= med[m]) { + ## R tail test + print("R"); + tail <- "R"; + pv <- (length(which(null[, m] >= temp[m]))) / (length(na.exclude(null[, m]))); + + } else { + ## L tail test + print("L"); + tail <- "L"; + pv <- (length(which(null[, m] <= temp[m]))) / (length(na.exclude(null[, m]))); + } + out <- c(out, pv); + print(pv); + out <- c(out, tail); + ## get variances outside null bands by comparing temp to null + ### temp stores variance for each scale, and null stores permuted variances for null bands + if (temp[m] <= var_975[m]) { + temp[m] <- NA; + } + } + final_pvalue <- rbind(final_pvalue, out); + matrix <- rbind(matrix, temp) + } + ## labels + if (i == 1) { + mtext(names[j], side = 2, line = 0.5, las = 3, cex = 0.25); + } + if (j == 1) { + mtext(names[i], side = 3, line = 0.5, cex = 0.25); + } + if (j == length(names)) { + axis(1, at = (1:short_levels), las = 3, cex.axis = 0.5); + } + } + } + colnames(final_pvalue) <- title; + + ## get maximum variance larger than expectation by comparison to null bands + varnames <- vector(); + for (i in seq_len(length(names))) { + name1 <- paste(names[i], "var", sep = "_") + varnames <- c(varnames, name1) + } + rownames(matrix) <- varnames; + colnames(matrix) <- (1:short_levels); + max_var <- names; + scale <- vector(length = length(names)); + for (x in seq_len(nrow(matrix))) { + if (length(which.max(matrix[x, ])) == 0) { + scale[x] <- NA; + } else{ + scale[x] <- colnames(matrix)[which.max(matrix[x, ])]; + } + } + max_var <- cbind(max_var, scale); + write.table(max_var, file = outfile, sep = "\t", quote = FALSE, row.names = FALSE, append = TRUE); + return(final_pvalue); +} + +## execute +## read in data +args <- commandArgs(trailingOnly = TRUE) + +data_test <- NULL; +data_test <- read.delim(args[1]); + +count <- ncol(data_test) +print(paste("The number of columns in the input file is: ", count)); + +# check if the number of motifs is not a multiple of 12, and round up is so +if (count %% 12 != 0) { + print("the number of motifs is not a multiple of 12") + count2 <- ceiling(count / 12); +}else{ + print("the number of motifs is a multiple of 12") + count2 <- count / 12 +} +print(paste("There will be", count2, "subfiles")) + +pdf(file = args[4], width = 11, height = 8); + +## loop to read and execute on all count2 subfiles +final <- NULL; +for (x in seq_len(count2)) { + sub <- NULL; + sub_names <- NULL; + a <- NULL; + b <- NULL; + + a <- ((x - 1) * 12 + 1); + b <- x * 12; + + if (x < count2) { + sub <- data_test[, +c(a:b)]; + sub_names <- colnames(data_test)[a:b]; + final <- rbind(final, dwt_var_permut_get_max(sub, sub_names, args[2])); + } + else{ + sub <- data_test[, +c(a:ncol(data_test))]; + sub_names <- colnames(data_test)[a:ncol(data_test)]; + final <- rbind(final, dwt_var_permut_get_max(sub, sub_names, args[2])); + } +} + +dev.off(); + +write.table(final, file = args[3], sep = "\t", quote = FALSE, row.names = FALSE);
--- a/execute_dwt_var_perClass.pl Mon Jan 27 09:26:11 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,320 +0,0 @@ -#!/usr/bin/perl -w - -use warnings; -use IO::Handle; -use POSIX qw(floor ceil); - -# example: perl execute_dwt_var_perClass.pl hg18_NCNR_10bp_3flanks_deletionHotspot_data_del.txt deletionHotspot 3flanks del - -$usage = "execute_dwt_var_perClass.pl [TABULAR.in] [TABULAR.out] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 4; - -#get the input arguments -my $inputFile = $ARGV[0]; -my $firstOutputFile = $ARGV[1]; -my $secondOutputFile = $ARGV[2]; -my $thirdOutputFile = $ARGV[3]; - -open (INPUT, "<", $inputFile) || die("Could not open file $inputFile \n"); -open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); -open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n"); -open (OUTPUT3, ">", $thirdOutputFile) || die("Could not open file $thirdOutputFile \n"); -open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -#save all error messages into the error file $errorFile using the error file handle ERROR -STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); - -# choosing meaningful names for the output files -$max_dwt = $firstOutputFile; -$pvalue = $secondOutputFile; -$pdf = $thirdOutputFile; - -# count the number of columns in the input file -while($buffer = <INPUT>){ - #if ($buffer =~ m/interval/){ - chomp($buffer); - $buffer =~ s/^#\s*//; - @contrl = split(/\t/, $buffer); - last; - #} -} -print "The number of columns in the input file is: " . (@contrl) . "\n"; -print "\n"; - -# count the number of motifs in the input file -$count = 0; -for ($i = 0; $i < @contrl; $i++){ - $count++; - print "# $contrl[$i]\n"; -} -print "The number of motifs in the input file is: $count \n"; - -# check if the number of motifs is not a multiple of 12, and round up is so -$count2 = ($count/12); -if ($count2 =~ m/(\D)/){ - print "the number of motifs is not a multiple of 12 \n"; - $count2 = ceil($count2); -} -else { - print "the number of motifs is a multiple of 12 \n"; -} -print "There will be $count2 subfiles\n\n"; - -# split infile into subfiles only 12 motif per file for R plotting -for ($x = 1; $x <= $count2; $x++){ - $a = (($x - 1) * 12 + 1); - $b = $x * 12; - - if ($x < $count2){ - print "# data.short $x <- data_test[, +c($a:$b)]; \n"; - } - else{ - print "# data.short $x <- data_test[, +c($a:ncol(data_test)]; \n"; - } -} - -print "\n"; -print "There are 4 output files: \n"; -print "The first output file is a pdf file\n"; -print "The second output file is a max_dwt file\n"; -print "The third output file is a pvalues file\n"; -print "The fourth output file is a test_final_pvalues file\n"; - -# write R script -$r_script = "get_dwt_varPermut_getMax.r"; -print "The R file name is: $r_script \n"; - -open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; - -print Rcmd " - ###################################################################### - # plot power spectra, i.e. wavelet variance by class - # add code to create null bands by permuting the original data series - # get class of maximum significant variance per feature - # generate plots and table matrix of variance including p-values - ###################################################################### - library(\"Rwave\"); - library(\"wavethresh\"); - library(\"waveslim\"); - - options(echo = FALSE) - - # normalize data - norm <- function(data){ - v <- (data-mean(data))/sd(data); - if(sum(is.na(v)) >= 1){ - v<-data; - } - return(v); - } - - dwt_var_permut_getMax <- function(data, names, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { - max_var = NULL; - matrix = NULL; - title = NULL; - final_pvalue = NULL; - short.levels = NULL; - scale = NULL; - - print(names); - - par(mfcol = c(length(names), length(names)), mar = c(0, 0, 0, 0), oma = c(4, 3, 3, 2), xaxt = \"s\", cex = 1, las = 1); - - short.levels <- wd(data[, 1], filter.number = filter, bc = bc)\$nlevels; - - title <- c(\"motif\"); - for (i in 1:short.levels){ - title <- c(title, paste(i, \"var\", sep = \"_\"), paste(i, \"pval\", sep = \"_\"), paste(i, \"test\", sep = \"_\")); - } - print(title); - - # normalize the raw data - data<-apply(data,2,norm); - - for(i in 1:length(names)){ - for(j in 1:length(names)){ - temp = NULL; - results = NULL; - wave1.dwt = NULL; - out = NULL; - - out <- vector(length = length(title)); - temp <- vector(length = short.levels); - - if(i < j) { - plot(temp, type = \"n\", axes = FALSE, xlab = NA, ylab = NA); - box(col = \"grey\"); - grid(ny = 0, nx = NULL); - } else { - if (i > j){ - plot(temp, type = \"n\", axes = FALSE, xlab = NA, ylab = NA); - box(col = \"grey\"); - grid(ny = 0, nx = NULL); - } else { - - wave1.dwt <- dwt(data[, i], wf = wf, short.levels, boundary = boundary); - - temp_row = (short.levels + 1 ) * -1; - temp_col = 1; - temp <- wave.variance(wave1.dwt)[temp_row, temp_col]; - - #permutations code : - feature1 = NULL; - null = NULL; - var_25 = NULL; - var_975 = NULL; - med = NULL; - - feature1 = data[, i]; - for (k in 1:1000) { - nk_1 = NULL; - null.levels = NULL; - var = NULL; - null_wave1 = NULL; - - nk_1 = sample(feature1, length(feature1), replace = FALSE); - null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; - var <- vector(length = length(null.levels)); - null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary); - var<- wave.variance(null_wave1)[-8, 1]; - null= rbind(null, var); - } - null <- apply(null, 2, sort, na.last = TRUE); - var_25 <- null[25, ]; - var_975 <- null[975, ]; - med <- (apply(null, 2, median, na.rm = TRUE)); - - # plot - results <- cbind(temp, var_25, var_975); - matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), axes = F); - - # get pvalues by comparison to null distribution - out <- (names[i]); - for (m in 1:length(temp)){ - print(paste(\"scale\", m, sep = \" \")); - print(paste(\"var\", temp[m], sep = \" \")); - print(paste(\"med\", med[m], sep = \" \")); - pv = tail = NULL; - out <- c(out, format(temp[m], digits = 3)); - if (temp[m] >= med[m]){ - # R tail test - print(\"R\"); - tail <- \"R\"; - pv <- (length(which(null[, m] >= temp[m])))/(length(na.exclude(null[, m]))); - - } else { - if (temp[m] < med[m]){ - # L tail test - print(\"L\"); - tail <- \"L\"; - pv <- (length(which(null[, m] <= temp[m])))/(length(na.exclude(null[, m]))); - } - } - out <- c(out, pv); - print(pv); - out <- c(out, tail); - } - final_pvalue <-rbind(final_pvalue, out); - - - # get variances outside null bands by comparing temp to null - ## temp stores variance for each scale, and null stores permuted variances for null bands - for (n in 1:length(temp)){ - if (temp[n] <= var_975[n]){ - temp[n] <- NA; - } else { - temp[n] <- temp[n]; - } - } - matrix <- rbind(matrix, temp) - } - } - # labels - if (i == 1){ - mtext(names[j], side = 2, line = 0.5, las = 3, cex = 0.25); - } - if (j == 1){ - mtext(names[i], side = 3, line = 0.5, cex = 0.25); - } - if (j == length(names)){ - axis(1, at = (1:short.levels), las = 3, cex.axis = 0.5); - } - } - } - colnames(final_pvalue) <- title; - #write.table(final_pvalue, file = \"test_final_pvalue.txt\", sep = \"\\t\", quote = FALSE, row.names = FALSE, append = TRUE); - - # get maximum variance larger than expectation by comparison to null bands - varnames <- vector(); - for(i in 1:length(names)){ - name1 = paste(names[i], \"var\", sep = \"_\") - varnames <- c(varnames, name1) - } - rownames(matrix) <- varnames; - colnames(matrix) <- (1:short.levels); - max_var <- names; - scale <- vector(length = length(names)); - for (x in 1:nrow(matrix)){ - if (length(which.max(matrix[x, ])) == 0){ - scale[x] <- NA; - } - else{ - scale[x] <- colnames(matrix)[which.max(matrix[x, ])]; - } - } - max_var <- cbind(max_var, scale); - write.table(max_var, file = \"$max_dwt\", sep = \"\\t\", quote = FALSE, row.names = FALSE, append = TRUE); - return(final_pvalue); - }\n"; - -print Rcmd " - # execute - # read in data - - data_test = NULL; - data_test <- read.delim(\"$inputFile\"); - - pdf(file = \"$pdf\", width = 11, height = 8); - - # loop to read and execute on all $count2 subfiles - final = NULL; - for (x in 1:$count2){ - sub = NULL; - sub_names = NULL; - a = NULL; - b = NULL; - - a = ((x - 1) * 12 + 1); - b = x * 12; - - if (x < $count2){ - sub <- data_test[, +c(a:b)]; - sub_names <- colnames(data_test)[a:b]; - final <- rbind(final, dwt_var_permut_getMax(sub, sub_names)); - } - else{ - sub <- data_test[, +c(a:ncol(data_test))]; - sub_names <- colnames(data_test)[a:ncol(data_test)]; - final <- rbind(final, dwt_var_permut_getMax(sub, sub_names)); - - } - } - - dev.off(); - - write.table(final, file = \"$pvalue\", sep = \"\\t\", quote = FALSE, row.names = FALSE); - - #eof\n"; - -close Rcmd; - -system("echo \"wavelet ANOVA started on \`hostname\` at \`date\`\"\n"); -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out"); -system("echo \"wavelet ANOVA ended on \`hostname\` at \`date\`\"\n"); - -#close the input and output and error files -close(ERROR); -close(OUTPUT3); -close(OUTPUT2); -close(OUTPUT1); -close(INPUT); \ No newline at end of file
--- a/execute_dwt_var_perClass.xml Mon Jan 27 09:26:11 2014 -0500 +++ b/execute_dwt_var_perClass.xml Mon Jul 06 20:34:10 2020 -0400 @@ -1,20 +1,38 @@ -<tool id="compute_p-values_max_variances_feature_occurrences_in_one_dataset_using_discrete_wavelet_transfom" name="Compute P-values and Max Variances for Feature Occurrences" version="1.0.0"> +<tool id="compute_p-values_max_variances_feature_occurrences_in_one_dataset_using_discrete_wavelet_transfom" name="Compute P-values and Max Variances for Feature Occurrences" version="1.0.1"> <description>in one dataset using Discrete Wavelet Transfoms</description> - - <command interpreter="perl"> - execute_dwt_var_perClass.pl $inputFile $outputFile1 $outputFile2 $outputFile3 + <requirements> + <requirement type="package" version="1.7.5">r-waveslim</requirement> + <requirement type="package" version="4.6.8">r-wavethresh</requirement> + </requirements> + <command detect_errors="exit_code"> + Rscript --vanilla '$__tool_directory__/execute_dwt_var_perClass.R' + '$inputFile' + '$outputFile1' + '$outputFile2' + '$outputFile3' </command> - <inputs> - <param format="tabular" name="inputFile" type="data" label="Select the input file"/> + <param format="tabular" name="inputFile" type="data" label="Select the input file"/> </inputs> - <outputs> - <data format="tabular" name="outputFile1"/> - <data format="tabular" name="outputFile2"/> - <data format="pdf" name="outputFile3"/> + <data format="tabular" name="outputFile1" label="${tool.name} on ${on_string}: scales"/> + <data format="tabular" name="outputFile2" label="${tool.name} on ${on_string}: statistics"/> + <data format="pdf" name="outputFile3" label="${tool.name} on ${on_string}: pdf"/> </outputs> - + <tests> + <test> + <param ftype="tabular" name="inputFile" value="in.tsv"/> + <output name="outputFile1" ftype="tabular"> + <assert_contents><has_line_matching expression="^max_var\tscale.*"/></assert_contents> + <assert_contents><has_line_matching expression="^translinTarget.*" /></assert_contents> + </output> + <output name="outputFile2" ftype="tabular"> + <assert_contents><has_line_matching expression="^motif\t1_var.*"/></assert_contents> + <assert_contents><has_line_matching expression="^translinTarget.*" /></assert_contents> + </output> + <output name="outputFile3" ftype="pdf" file="out.pdf" compare="sim_size"/> + </test> + </tests> <help> .. class:: infomark
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/in.tsv Mon Jul 06 20:34:10 2020 -0400 @@ -0,0 +1,17 @@ +deletionHoptspot insertionHoptspot dnaPolPauseFrameshift indelHotspot topoisomeraseCleavageSite translinTarget vDjRecombinationSignal x-likeSite +226 403 416 221 1165 832 749 1056 +236 444 380 241 1223 746 782 1207 +242 496 391 195 1116 643 770 1219 +243 429 364 191 1118 694 783 1223 +244 410 371 236 1063 692 805 1233 +230 386 370 217 1087 657 787 1215 +275 404 402 214 1044 697 831 1188 +265 443 365 231 1086 694 782 1184 +255 390 354 246 1114 642 773 1176 +281 384 406 232 1102 719 787 1191 +263 459 369 251 1135 643 810 1215 +280 433 400 251 1159 701 777 1151 +278 385 382 231 1147 697 707 1161 +248 393 389 211 1162 723 759 1183 +251 403 385 246 1114 752 776 1153 +239 383 347 227 1172 759 789 1141