Mercurial > repos > devteam > dwt_cor_avb_all
changeset 0:0f2eda4ea8dc draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 27 Jan 2014 09:26:52 -0500 |
parents | |
children | 8564f6927b87 |
files | execute_dwt_cor_aVb_all.pl execute_dwt_cor_aVb_all.xml |
diffstat | 2 files changed, 346 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/execute_dwt_cor_aVb_all.pl Mon Jan 27 09:26:52 2014 -0500 @@ -0,0 +1,223 @@ +#!/usr/bin/perl -w + +use warnings; +use IO::Handle; + +$usage = "execute_dwt_cor_aVb_all.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out] \n"; +die $usage unless @ARGV == 4; + +#get the input arguments +my $firstInputFile = $ARGV[0]; +my $secondInputFile = $ARGV[1]; +my $firstOutputFile = $ARGV[2]; +my $secondOutputFile = $ARGV[3]; + +open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \n"); +open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \n"); +open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); +open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n"); +open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); + +#save all error messages into the error file $errorFile using the error file handle ERROR +STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); + +print "There are two input data files: \n"; +print "The input data file is: $firstInputFile \n"; +print "The control data file is: $secondInputFile \n"; + +# IvC test +$test = "cor_aVb_all"; + +# construct an R script to implement the IvC test +print "\n"; + +$r_script = "get_dwt_cor_aVa_test.r"; +print "$r_script \n"; + + +# R script +open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; +print Rcmd " + ################################################################################# + # code to do all correlation tests of form: motif(a) vs. motif(b) + # add code to create null bands by permuting the original data series + # generate plots and table matrix of correlation coefficients including p-values + ################################################################################# + library(\"Rwave\"); + library(\"wavethresh\"); + library(\"waveslim\"); + + options(echo = FALSE) + + # normalize data + norm <- function(data){ + v <- (data - mean(data))/sd(data); + if(sum(is.na(v)) >= 1){ + v <- data; + } + return(v); + } + + dwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { + print(test); + print(pdf); + print(table); + + pdf(file = pdf); + final_pvalue = NULL; + title = NULL; + + short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\$nlevels; + title <- c(\"motif1\", \"motif2\"); + for (i in 1:short.levels){ + title <- c(title, paste(i, \"cor\", sep = \"_\"), paste(i, \"pval\", sep = \"_\")); + } + print(title); + + # normalize the raw data + data.short <- apply(data.short, 2, norm); + data.long <- apply(data.long, 2, norm); + + # loop to compare a vs b + for(i in 1:length(names.short)){ + for(j in 1:length(names.long)){ + if(i >= j){ + next; + } + else { + # Kendall Tau + # DWT wavelet correlation function + # include significance to compare + wave1.dwt = wave2.dwt = NULL; + tau.dwt = NULL; + out = NULL; + + print(names.short[i]); + print(names.long[j]); + + # need exit if not comparing motif(a) vs motif(a) + if (names.short[i] == names.long[j]){ + stop(paste(\"motif\", names.short[i], \"is the same as\", names.long[j], sep = \" \")); + } + else { + wave1.dwt <- dwt(data.short[, i], wf = wf, short.levels, boundary = boundary); + wave2.dwt <- dwt(data.long[, j], wf = wf, short.levels, boundary = boundary); + tau.dwt <-vector(length = short.levels) + + # perform cor test on wavelet coefficients per scale + for(level in 1:short.levels){ + w1_level = w2_level = NULL; + w1_level <- (wave1.dwt[[level]]); + w2_level <- (wave2.dwt[[level]]); + tau.dwt[level] <- cor.test(w1_level, w2_level, method = method)\$estimate; + } + + # CI bands by permutation of time series + feature1 = feature2 = NULL; + feature1 = data.short[, i]; + feature2 = data.long[, j]; + null = results = med = NULL; + cor_25 = cor_975 = NULL; + + for (k in 1:1000) { + nk_1 = nk_2 = NULL; + null.levels = NULL; + cor = NULL; + null_wave1 = null_wave2 = NULL; + + nk_1 <- sample(feature1, length(feature1), replace = FALSE); + nk_2 <- sample(feature2, length(feature2), replace = FALSE); + null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; + cor <- vector(length = null.levels); + null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary); + null_wave2 <- dwt(nk_2, wf = wf, short.levels, boundary = boundary); + + for(level in 1:null.levels){ + null_level1 = null_level2 = NULL; + null_level1 <- (null_wave1[[level]]); + null_level2 <- (null_wave2[[level]]); + cor[level] <- cor.test(null_level1, null_level2, method = method)\$estimate; + } + null = rbind(null, cor); + } + + null <- apply(null, 2, sort, na.last = TRUE); + cor_25 <- null[25, ]; + cor_975 <- null[975, ]; + med <- (apply(null, 2, median, na.rm = TRUE)); + + # plot + results <- cbind(tau.dwt, cor_25, cor_975); + matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), ylim = c(-1, 1), xlab = \"Wavelet Scale\", ylab = \"Wavelet Correlation Kendall's Tau\", main = (paste(test, names.short[i], \"vs.\", names.long[j], sep = \" \")), cex.main = 0.75); + abline(h = 0); + + # get pvalues by comparison to null distribution + ### modify pval calculation for error type II of T test #### + out <- c(names.short[i],names.long[j]); + for (m in 1:length(tau.dwt)){ + print(m); + print(tau.dwt[m]); + out <- c(out, format(tau.dwt[m], digits = 3)); + pv = NULL; + if(is.na(tau.dwt[m])){ + pv <- \"NA\"; + } + else{ + if (tau.dwt[m] >= med[m]){ + # R tail test + pv <- (length(which(null[, m] >= tau.dwt[m])))/(length(na.exclude(null[, m]))); + } + else{ + if (tau.dwt[m] < med[m]){ + # L tail test + pv <- (length(which(null[, m] <= tau.dwt[m])))/(length(na.exclude(null[, m]))); + } + } + } + out <- c(out, pv); + print(pv); + } + final_pvalue <-rbind(final_pvalue, out); + print(out); + } + } + } + } + colnames(final_pvalue) <- title; + write.table(final_pvalue, file = table, sep = \"\\t\", quote = FALSE, row.names = FALSE) + dev.off(); + }\n"; + +print Rcmd " + # execute + # read in data + + inputData1 = inputData2 = NULL; + inputData.short1 = inputData.short2 = NULL; + inputDataNames.short1 = inputDataNames.short2 = NULL; + + inputData1 <- read.delim(\"$firstInputFile\"); + inputData.short1 <- inputData1[, +c(1:ncol(inputData1))]; + inputDataNames.short1 <- colnames(inputData.short1); + + inputData2 <- read.delim(\"$secondInputFile\"); + inputData.short2 <- inputData2[, +c(1:ncol(inputData2))]; + inputDataNames.short2 <- colnames(inputData.short2); + + # cor test for motif(a) in inputData1 vs motif(b) in inputData2 + dwt_cor(inputData.short1, inputDataNames.short1, inputData.short2, inputDataNames.short2, test = \"$test\", pdf = \"$secondOutputFile\", table = \"$firstOutputFile\"); + print (\"done with the correlation test\"); + + #eof\n"; +close Rcmd; + +system("echo \"wavelet IvC test started on \`hostname\` at \`date\`\"\n"); +system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\n"); +system("echo \"wavelet IvC test ended on \`hostname\` at \`date\`\"\n"); + +#close the input and output and error files +close(ERROR); +close(OUTPUT2); +close(OUTPUT1); +close(INPUT2); +close(INPUT1);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/execute_dwt_cor_aVb_all.xml Mon Jan 27 09:26:52 2014 -0500 @@ -0,0 +1,123 @@ +<tool id="compute_p-values_correlation_coefficients_featureA_featureB_occurrences_between_two_datasets_using_discrete_wavelet_transfom" name="Compute P-values and Correlation Coefficients for Occurrences of Two Set of Features" version="1.0.0"> + <description>between two datasets using Discrete Wavelet Transfoms</description> + + <command interpreter="perl"> + execute_dwt_cor_aVb_all.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2 + </command> + + <inputs> + <param format="tabular" name="inputFile1" type="data" label="Select the first input file"/> + <param format="tabular" name="inputFile2" type="data" label="Select the second input file"/> + </inputs> + + <outputs> + <data format="tabular" name="outputFile1"/> + <data format="pdf" name="outputFile2"/> + </outputs> + + <help> + +.. class:: infomark + +**What it does** + +This program generates plots and computes table matrix of coefficient correlations and p-values at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another using multiscale wavelet analysis technique. + +The program assumes that the user has two sets of DNA sequences, S1 and S1, each of which consists of one or more sequences of equal length. Each sequence in each set is divided into the same number of multiple intervals n such that n = 2^k, where k is a positive integer and k >= 1. Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales. + +The program has two input files obtained as follows: + +For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S1, and builds two tabular files representing the count results in each interval of S1 and S1. These are the input files of the program. + +The program gives two output files: + +- The first output file is a TABULAR format file representing the coefficient correlations and p-values for each feature at each scale. +- The second output file is a PDF file consisting of as many figures as the number of features, such that each figure represents the values of the coefficient correlations for that feature at every scale. + +----- + +.. class:: warningmark + +**Note** + +In order to obtain empirical p-values, a random perumtation test is implemented by the program, which results in the fact that the program gives slightly different results each time it is run on the same input file. + +----- + +**Example** + +Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file:: + + deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget + 82 162 158 79 459 + 111 196 154 75 459 + 98 178 160 79 475 + 113 201 170 113 436 + 113 173 147 95 446 + 107 150 155 84 436 + 106 166 175 96 448 + 113 176 135 106 514 + 113 170 152 87 450 + 95 152 167 93 467 + 91 171 169 118 426 + 84 139 160 100 459 + 92 154 164 104 440 + 100 145 154 98 472 + 91 161 152 71 461 + 117 164 139 97 463 + +And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file:: + + deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget + 269 366 330 238 1129 + 239 328 327 283 1188 + 254 351 358 297 1151 + 262 371 355 256 1107 + 254 361 352 234 1192 + 265 354 367 240 1182 + 255 359 333 235 1217 + 271 389 387 272 1241 + 240 305 341 249 1159 + 272 351 337 257 1169 + 275 351 337 233 1158 + 305 331 361 253 1172 + 277 341 343 253 1113 + 266 362 355 267 1162 + 235 326 329 241 1230 + 254 335 360 251 1172 + + +We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output: + +The first output file:: + + motif1 motif2 1_cor 1_pval 2_cor 2_pval 3_cor 3_pval 4_cor 4_pval + + deletionHoptspot insertionHoptspot -0.1 0.346 -0.214 0.338 1 0.127 1 0.467 + deletionHoptspot dnaPolPauseFrameshift 0.167 0.267 -0.214 0.334 1 0.122 1 0.511 + deletionHoptspot topoisomeraseCleavageSite 0.167 0.277 0.143 0.412 -0.667 0.243 1 0.521 + deletionHoptspot translinTarget 0 0.505 0.0714 0.441 1 0.124 1 0.518 + insertionHoptspot dnaPolPauseFrameshift -0.202 0.238 0.143 0.379 -1 0.122 1 0.517 + insertionHoptspot topoisomeraseCleavageSite -0.0336 0.457 0.214 0.29 0.667 0.252 1 0.503 + insertionHoptspot translinTarget 0.0672 0.389 0.429 0.186 -1 0.119 1 0.506 + dnaPolPauseFrameshift topoisomeraseCleavageSite -0.353 0.101 0.357 0.228 0 0.612 -1 0.49 + dnaPolPauseFrameshift translinTarget -0.151 0.303 -0.571 0.09 -0.333 0.37 -1 1 + topoisomeraseCleavageSite translinTarget -0.37 0.077 -0.222 0.297 0.667 0.234 -1 0.471 + +The second output file: + +.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_1.png +.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_2.png +.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_3.png +.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_4.png +.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_5.png +.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_6.png +.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_7.png +.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_8.png +.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_9.png +.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_10.png + + + </help> + +</tool>