changeset 0:0f2eda4ea8dc draft

Imported from capsule None
author devteam
date Mon, 27 Jan 2014 09:26:52 -0500
parents
children 8564f6927b87
files execute_dwt_cor_aVb_all.pl execute_dwt_cor_aVb_all.xml
diffstat 2 files changed, 346 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/execute_dwt_cor_aVb_all.pl	Mon Jan 27 09:26:52 2014 -0500
@@ -0,0 +1,223 @@
+#!/usr/bin/perl -w
+
+use warnings;
+use IO::Handle;
+
+$usage = "execute_dwt_cor_aVb_all.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out]  \n";
+die $usage unless @ARGV == 4;
+
+#get the input arguments
+my $firstInputFile = $ARGV[0];
+my $secondInputFile = $ARGV[1];
+my $firstOutputFile = $ARGV[2];
+my $secondOutputFile = $ARGV[3];
+
+open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \n");
+open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \n");
+open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n");
+open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n");
+open (ERROR,  ">", "error.txt")  or die ("Could not open file error.txt \n");
+
+#save all error messages into the error file $errorFile using the error file handle ERROR
+STDERR -> fdopen( \*ERROR,  "w" ) or die ("Could not direct errors to the error file error.txt \n");
+
+print "There are two input data files: \n";
+print "The input data file is: $firstInputFile \n";
+print "The control data file is: $secondInputFile \n";
+
+# IvC test
+$test = "cor_aVb_all";
+
+# construct an R script to implement the IvC test
+print "\n";
+
+$r_script = "get_dwt_cor_aVa_test.r"; 
+print "$r_script \n";
+
+
+# R script
+open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n";
+print Rcmd "
+	#################################################################################
+	# code to do all correlation tests of form: motif(a) vs. motif(b)
+	# add code to create null bands by permuting the original data series
+	# generate plots and table matrix of correlation coefficients including p-values
+	#################################################################################
+	library(\"Rwave\");
+	library(\"wavethresh\");
+	library(\"waveslim\");
+	
+	options(echo = FALSE)
+	
+	# normalize data
+	norm <- function(data){
+		v <- (data - mean(data))/sd(data);
+		if(sum(is.na(v)) >= 1){
+			v <- data;
+		}
+		return(v);
+	}
+	
+	dwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") {
+		print(test);
+		print(pdf);
+		print(table);
+		
+		pdf(file = pdf);
+		final_pvalue = NULL;
+		title = NULL;
+		
+		short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\$nlevels;
+		title <- c(\"motif1\", \"motif2\");
+		for (i in 1:short.levels){
+			title <- c(title, paste(i, \"cor\", sep = \"_\"), paste(i, \"pval\", sep = \"_\"));
+		}
+		print(title);
+	
+		# normalize the raw data
+		data.short <- apply(data.short, 2, norm);
+		data.long <- apply(data.long, 2, norm);
+	
+		# loop to compare a vs b
+		for(i in 1:length(names.short)){
+			for(j in 1:length(names.long)){
+				if(i >= j){
+					next;
+				} 
+				else { 
+					# Kendall Tau
+					# DWT wavelet correlation function
+					# include significance to compare
+					wave1.dwt = wave2.dwt = NULL;
+					tau.dwt = NULL;
+					out = NULL;
+	
+					print(names.short[i]);
+					print(names.long[j]);
+					
+					# need exit if not comparing motif(a) vs motif(a)
+					if (names.short[i] == names.long[j]){
+						stop(paste(\"motif\", names.short[i], \"is the same as\", names.long[j], sep = \" \"));
+					}
+					else {
+						wave1.dwt <- dwt(data.short[, i], wf = wf, short.levels, boundary = boundary);
+						wave2.dwt <- dwt(data.long[, j], wf = wf, short.levels, boundary = boundary);
+						tau.dwt <-vector(length = short.levels)
+				   
+						# perform cor test on wavelet coefficients per scale 
+						for(level in 1:short.levels){
+							w1_level = w2_level = NULL;
+							w1_level <- (wave1.dwt[[level]]);
+							w2_level <- (wave2.dwt[[level]]);
+							tau.dwt[level] <- cor.test(w1_level, w2_level, method = method)\$estimate;
+						}
+						
+						# CI bands by permutation of time series
+						feature1 = feature2 = NULL;
+						feature1 = data.short[, i];
+						feature2 = data.long[, j];
+						null = results = med = NULL; 
+						cor_25 = cor_975 = NULL;
+						
+						for (k in 1:1000) {
+							nk_1 = nk_2 = NULL;
+							null.levels = NULL;
+							cor = NULL;
+							null_wave1 = null_wave2 = NULL;
+							
+							nk_1 <- sample(feature1, length(feature1), replace = FALSE);
+							nk_2 <- sample(feature2, length(feature2), replace = FALSE);
+							null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels;
+							cor <- vector(length = null.levels);
+							null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary);
+							null_wave2 <- dwt(nk_2, wf = wf, short.levels, boundary = boundary);
+
+							for(level in 1:null.levels){
+								null_level1 = null_level2 = NULL;
+								null_level1 <- (null_wave1[[level]]);
+								null_level2 <- (null_wave2[[level]]);
+								cor[level] <- cor.test(null_level1, null_level2, method = method)\$estimate;
+							}
+							null = rbind(null, cor);
+						}
+							
+						null <- apply(null, 2, sort, na.last = TRUE);
+						cor_25 <- null[25, ];
+						cor_975 <- null[975, ];
+						med <- (apply(null, 2, median, na.rm = TRUE));
+
+						# plot
+						results <- cbind(tau.dwt, cor_25, cor_975);
+						matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), ylim = c(-1, 1), xlab = \"Wavelet Scale\", ylab = \"Wavelet Correlation Kendall's Tau\", main = (paste(test, names.short[i], \"vs.\", names.long[j], sep = \" \")), cex.main = 0.75);
+						abline(h = 0);
+	
+						# get pvalues by comparison to null distribution
+						### modify pval calculation for error type II of T test ####
+						out <- c(names.short[i],names.long[j]);
+						for (m in 1:length(tau.dwt)){
+							print(m);
+							print(tau.dwt[m]);
+							out <- c(out, format(tau.dwt[m], digits = 3));	
+							pv = NULL;
+							if(is.na(tau.dwt[m])){
+								pv <- \"NA\"; 
+							} 
+							else{
+								if (tau.dwt[m] >= med[m]){
+									# R tail test
+									pv <- (length(which(null[, m] >= tau.dwt[m])))/(length(na.exclude(null[, m])));
+								}
+								else{
+									if (tau.dwt[m] < med[m]){
+										# L tail test
+										pv <- (length(which(null[, m] <= tau.dwt[m])))/(length(na.exclude(null[, m])));
+									}
+								}
+							}
+							out <- c(out, pv);
+							print(pv);
+						}
+						final_pvalue <-rbind(final_pvalue, out);
+						print(out);
+					}
+				}
+			}
+		}
+		colnames(final_pvalue) <- title;
+		write.table(final_pvalue, file = table, sep = \"\\t\", quote = FALSE, row.names = FALSE)
+		dev.off();
+	}\n";
+	
+print Rcmd "
+	# execute
+	# read in data 
+
+	inputData1 = inputData2 = NULL;
+	inputData.short1 = inputData.short2 = NULL;
+	inputDataNames.short1 = inputDataNames.short2 = NULL;
+	
+	inputData1 <- read.delim(\"$firstInputFile\");
+	inputData.short1 <- inputData1[, +c(1:ncol(inputData1))];
+	inputDataNames.short1 <- colnames(inputData.short1);
+		
+	inputData2 <- read.delim(\"$secondInputFile\");
+	inputData.short2 <- inputData2[, +c(1:ncol(inputData2))];
+	inputDataNames.short2 <- colnames(inputData.short2);
+	
+	# cor test for motif(a) in inputData1 vs motif(b) in inputData2
+	dwt_cor(inputData.short1, inputDataNames.short1, inputData.short2, inputDataNames.short2, test = \"$test\", pdf = \"$secondOutputFile\", table = \"$firstOutputFile\");
+	print (\"done with the correlation test\");
+
+	#eof\n";
+close Rcmd;
+
+system("echo \"wavelet IvC test started on \`hostname\` at \`date\`\"\n");
+system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\n");
+system("echo \"wavelet IvC test ended on \`hostname\` at \`date\`\"\n");
+
+#close the input and output and error files
+close(ERROR);
+close(OUTPUT2);
+close(OUTPUT1);
+close(INPUT2);
+close(INPUT1);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/execute_dwt_cor_aVb_all.xml	Mon Jan 27 09:26:52 2014 -0500
@@ -0,0 +1,123 @@
+<tool id="compute_p-values_correlation_coefficients_featureA_featureB_occurrences_between_two_datasets_using_discrete_wavelet_transfom" name="Compute P-values and Correlation Coefficients for Occurrences of Two Set of Features" version="1.0.0">
+  <description>between two datasets using Discrete Wavelet Transfoms</description>
+  
+  <command interpreter="perl">
+  	execute_dwt_cor_aVb_all.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2
+  </command>
+
+  <inputs>
+  	<param format="tabular" name="inputFile1" type="data" label="Select the first input file"/>	
+  	<param format="tabular" name="inputFile2" type="data" label="Select the second input file"/>
+  </inputs>
+  
+  <outputs>
+    <data format="tabular" name="outputFile1"/> 
+    <data format="pdf" name="outputFile2"/>
+  </outputs>
+  	
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This program generates plots and computes table matrix of coefficient correlations and p-values at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another using multiscale wavelet analysis technique. 
+
+The program assumes that the user has two sets of DNA sequences, S1 and S1, each of which consists of one or more sequences of equal length. Each sequence in each set is divided into the same number of multiple intervals n such that n = 2^k, where k is a positive integer and  k >= 1. Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales.
+
+The program has two input files obtained as follows:
+
+For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S1, and builds two tabular files representing the count results in each interval of S1 and S1. These are the input files of the program. 
+
+The program gives two output files:
+
+- The first output file is a TABULAR format file representing the coefficient correlations and p-values for each feature at each scale.
+- The second output file is a PDF file consisting of as many figures as the number of features, such that each figure represents the values of the coefficient correlations for that feature at every scale.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+In order to obtain empirical p-values, a random perumtation test is implemented by the program, which results in the fact that the program gives slightly different results each time it is run on the same input file. 
+
+-----
+
+**Example**
+
+Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file::
+
+	deletionHoptspot	insertionHoptspot	dnaPolPauseFrameshift	topoisomeraseCleavageSite	translinTarget	
+		82			162			158			79				459
+		111			196			154			75				459
+		98			178			160			79				475
+		113			201			170			113				436
+		113			173			147			95				446
+		107			150			155			84				436
+		106			166			175			96				448
+		113			176			135			106				514
+		113			170			152			87				450
+		95			152			167			93				467
+		91			171			169			118				426
+		84			139			160			100				459
+		92			154			164			104				440
+		100			145			154			98				472
+		91			161			152			71				461
+		117			164			139			97				463
+
+And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file::
+
+	deletionHoptspot	insertionHoptspot	dnaPolPauseFrameshift	topoisomeraseCleavageSite	translinTarget
+		269			366			330			238				1129
+		239			328			327			283				1188
+		254			351			358			297				1151
+		262			371			355			256				1107
+		254			361			352			234				1192
+		265			354			367			240				1182
+		255			359			333			235				1217
+		271			389			387			272				1241
+		240			305			341			249				1159
+		272			351			337			257				1169
+		275			351			337			233				1158
+		305			331			361			253				1172
+		277			341			343			253				1113
+		266			362			355			267				1162
+		235			326			329			241				1230
+		254			335			360			251				1172
+
+  
+We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output:
+
+The first output file::
+
+	motif1				motif2				1_cor		1_pval		2_cor		2_pval		3_cor		3_pval		4_cor		4_pval
+	
+	deletionHoptspot		insertionHoptspot		-0.1		0.346		-0.214		0.338		1		0.127		1		0.467
+	deletionHoptspot		dnaPolPauseFrameshift		0.167		0.267		-0.214		0.334		1		0.122		1		0.511
+	deletionHoptspot		topoisomeraseCleavageSite	0.167		0.277		0.143		0.412		-0.667		0.243		1		0.521
+	deletionHoptspot		translinTarget			0		0.505		0.0714		0.441		1		0.124		1		0.518
+	insertionHoptspot		dnaPolPauseFrameshift		-0.202		0.238		0.143		0.379		-1		0.122		1		0.517
+	insertionHoptspot		topoisomeraseCleavageSite	-0.0336		0.457		0.214		0.29		0.667		0.252		1		0.503
+	insertionHoptspot		translinTarget			0.0672		0.389		0.429		0.186		-1		0.119		1		0.506
+	dnaPolPauseFrameshift		topoisomeraseCleavageSite	-0.353		0.101		0.357		0.228		0		0.612		-1		0.49
+	dnaPolPauseFrameshift		translinTarget			-0.151		0.303		-0.571		0.09		-0.333		0.37		-1		1
+	topoisomeraseCleavageSite	translinTarget			-0.37		0.077		-0.222		0.297		0.667		0.234		-1		0.471
+
+The second output file:
+
+.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_1.png
+.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_2.png
+.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_3.png
+.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_4.png
+.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_5.png
+.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_6.png
+.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_7.png
+.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_8.png
+.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_9.png
+.. image:: ${static_path}/operation_icons/dwt_cor_aVb_all_10.png
+
+
+  </help>  
+  
+</tool>