Next changeset 1:cb8bac9d0d37 (2017-09-07) |
Commit message:
planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty |
added:
CpGoe.pl CpGoe.xml Functions/Kernel_function_form.R Functions/density_pm.R KDEanalysis.r KDEanalysis.xml LICENSE readme.rst test-data/10_std_unif.txt test-data/9_seqs.fa tool_dependencies.xml |
b |
diff -r 000000000000 -r 1535ffddeff4 CpGoe.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CpGoe.pl Thu Sep 07 08:51:57 2017 -0400 |
[ |
b'@@ -0,0 +1,368 @@\n+#! /usr/bin/perl\n+\n+####################################\n+#\n+# CpGoe.pl -f fasta-file -o output_file -m minlength\n+#\n+# reads concatanated fasta files, writes name, length, CpGs and GpCs, CpGo/e ratios and other quantities into TAB file for each sequence\n+#####################################\n+\n+use diagnostics;\n+use strict;\n+use Carp;\n+use FileHandle;\n+use File::Path;\n+use File::Basename;\n+use Data::Dumper;\n+use Getopt::Std;\n+\n+my $VERSION = "1.0";\n+$Getopt::Std::STANDARD_HELP_VERSION = 1;\n+\n+\n+# Called when --help set as flag\n+sub HELP_MESSAGE\n+{\n+ print "Description: CpGoe processes a FASTA file and outputs the CpGo/e ratios and - if specified - further quantities\\n\\n" .\n+ "Usage: CpGoe.pl [OPTION] -f FASTA_FILE \\n\\n" .\n+ "Options:\\n" .\n+ " -f FASTA_FILE Name of FASTA file containing the input sequences. REQUIRED.\\n" .\n+ " -o OUT_FILE name of output file containing the results\\n" .\n+ " -m MIN_LEN minimum length of sequences, shorter sequences are discarded\\n" .\n+\t\t" -c CONTEXT\t\t Context to calculate the ratio (CpA, CpC, CpG, CpT) default CpG.\\n" .\n+ " -a ALGORITHM Algorithm used to calculate CpGo/e ratio. Default: 1\\n" .\n+\t\t" 1 - (CpG / (C * G)) * (L^2 / L-1)\\n" .\n+\t\t" 2 - (CpG / (C * G)) * (L^2 / L)\\n" .\n+\t\t" 3 - (CpG / L) / (G + C)^2\\n" .\n+\t\t" 4 - CpG / ( (C + G)/2 )^2\\n" .\n+ " -d detailed output, providing other quantities additional to the CpGo/e ratios\\n" .\n+ " -h output header line\\n".\n+ " -v verbose messages\\n" ;\n+ exit 0;\n+}\n+\n+\n+\n+# Called when --version set as flag\n+sub VERSION_MESSAGE\n+{\n+ print "CpGoe $VERSION\\n";\n+}\n+\n+\n+\n+# Command line parsing\n+# ... read argument\n+my %opts;\n+getopts(\'f:o:m:c:a:dvh\', \\%opts);\n+#if ($#ARGV != 0) {\n+# print STDERR "Exactly one argument has to be provided, the name of the input FASTA file.\\n" .\n+# "Moreover, the options must be listed first, then the name of the input FASTA file.\\n";\n+# exit 1;\n+#}\n+my $fasta_fname;\n+if (exists($opts{\'f\'})) {\n+ $fasta_fname = $opts{\'f\'};\n+} else {\n+ HELP_MESSAGE\n+}\n+\n+# ... read options\n+my $out_fname;\n+my $has_file_output;\n+if (exists($opts{\'o\'})) {\n+ $out_fname = $opts{\'o\'};\n+ $has_file_output = 1;\n+}\n+else {\n+ $has_file_output = 0;\n+}\n+\n+my $min_len;\n+if (exists($opts{\'m\'})) {\n+ $min_len = $opts{\'m\'};\n+}\n+else {\n+ $min_len = 1;\n+}\n+\n+my $algo = 1;\n+if (exists($opts{\'a\'})) {\n+ $algo = $opts{\'a\'};\n+}\n+\n+my $context = \'CpG\';\n+if (exists($opts{\'c\'})) {\n+ $context = $opts{\'c\'};\n+}\n+\n+my $is_verbose = exists($opts{\'v\'});\n+my $has_header = exists($opts{\'h\'});\n+my $is_detailed = exists($opts{\'d\'});\n+\n+\n+\n+# read input file and split into fasta sequences on the fly\n+# ... check whether input FASTA file exists\n+my $FASTA;\n+if (-e $fasta_fname) {\n+ if (-f $fasta_fname) {\n+ my $res = open($FASTA, $fasta_fname);\n+ if (!$res) {\n+ print STDERR "could not open $fasta_fname\\n";\n+ exit 1;\n+ }\n+ }\n+ else {\n+ print STDERR "$fasta_fname is not a file\\n";\n+ exit 1;\n+ }\n+}\n+else {\n+ print STDERR "cannot open file $fasta_fname\\n";\n+ exit 1;\n+}\n+\n+\n+# ... determine which linebreak is used (linux / windows / mac)\n+my $found_n = 0;\n+my $found_r = 0;\n+my $found_rn = 0;\n+while ( defined( my $ch = getc($FASTA) ) ) {\n+ if ($ch eq "\\n") {\n+ if ($found_r) {\n+ $found_rn = 1;\n+ $found_r = 0;\n+ } else {\n+ $found_n = 1;\n+ }\n+ last;\n+ } elsif ($ch eq "\\r") {\n+ $found_r = 1;\n+ } else {\n+ if ($found_r) {\n+ last;\n+ }\n+ }\n+}\n+close($FASTA);\n+if ($found_r + $found_n + $found_rn != 1) {\n+ print STDERR "something went wrong determining the linebreaks used in $fasta_fname\\n";\n+}\n+\n+\n+# ... read in sequences\n+my $old_linebreak = $/;\n+if ($found_n) {\n+ $/ = "\\n";\n+} elsif ($found_r) {\n+ $/ = "\\r";\n+} else {\n+ $/ = "\\r\\n";\n+}\n+my $re'..b'$i] eq "") {\n+ if ($num < $MAX) {\n+ $str .= "Sequence " . $names[$i] . " in FASTA file $fasta_fname is empty\\n";\n+ }\n+ $err = 1;\n+ ++$num;\n+ }\n+}\n+if ($err) {\n+ print STDERR "$str";\n+ if ($num > $MAX) {\n+ print STDERR "$num empty sequences in total in FASTA file $fasta_fname \\n";\n+ }\n+ exit 1;\n+}\n+\n+\n+# ... ... check for illegal characters in sequences\n+for (my $i = 0; $i <= $#names; ++$i) {\n+ my $str = $seqs[$i];\n+ $str =~ s/[abcdghkmnrstuvwy]//gi;\n+ if (length($str) > 0) {\n+ $str = join \'\', sort { $a cmp $b } split(//, $str);\n+ print STDERR "Sequence " . $names[$i] . " of FASTA file " . $fasta_fname . " contains illegal characters: ";\n+ for (my $j = 0; $j <= length($str); ++$j) {\n+ my $out = ($j == 0);\n+ my $curr = substr($str, $j, 1);\n+ if (!$out) {\n+ my $prev = substr($str, $j - 1, 1) ;\n+ $out = ($curr ne $prev);\n+ }\n+ if ($out) {\n+ print STDERR $curr;\n+ }\n+ }\n+ print STDERR "\\n";\n+ exit 1;\n+ }\n+}\n+\n+\n+# ... output\n+if ($is_verbose) {\n+ print $#names + 1 . " sequence(s) read.\\n";\n+}\n+\n+\n+\n+# output quantities\n+# ... open output file\n+my $OUT;\n+if ($has_file_output) {\n+ if ((-e $out_fname) && !(-f $out_fname)) {\n+ print STDERR "$out_fname exists and is not a file\\n";\n+ exit 1;\n+ }\n+ if (!open($OUT, ">$out_fname")) {\n+ print STDERR "cannot create file $out_fname\\n";\n+ exit 1;\n+ }\n+} else {\n+ $OUT = *STDOUT;\n+}\n+\n+# ... print header\n+if ($has_header) {\n+ print $OUT "#name\\tlength\\tCpGs\\tGpCs\\tCs\\tGs\\tNs\\tCpG o\\/e\\n";\n+}\n+\n+\n+\n+# ... for each sequence calculate CpGo/e ratios and related quantities:\n+# - length of the sequence\n+# - CpGs present in the sequence\n+# - GpCs present in the sequence\n+# - Cs the number of C present in the sequence\n+# - Gs the number of G present in the sequence\n+# - CpG o/e ratio of the sequence\n+my $num_short = 0; # number of sequences which are too short\n+for my $i (0 .. $#names) {\n+ my @ar = ();\n+ @ar = split(\'\\|\', $names[$i]);\n+ my $seqname = $ar[1];\n+ my $num_N = () = ( $seqs[$i] =~ m/N/gi );\n+ my $len = length($seqs[$i]);\n+ my $l = $len - $num_N;\n+ if ($l >= $min_len) {\n+\t my ($num_G, $num_CG);\n+\tif ($context eq \'CpG\') {\n+\t\t$num_G = () = ( $seqs[$i] =~ m/G/gi );\n+\t\t$num_CG = () = ( $seqs[$i] =~ m/CG/gi );\n+\t} elsif ($context eq \'CpA\') {\n+\t\t$num_G = () = ( $seqs[$i] =~ m/A/gi );\n+\t\t$num_CG = () = ( $seqs[$i] =~ m/CA/gi );\n+\t} elsif ($context eq \'CpC\') {\n+\t\t$num_G = () = ( $seqs[$i] =~ m/C/gi );\n+\t\t$num_CG = () = ( $seqs[$i] =~ m/CC/gi );\n+\t} elsif ($context eq \'CpT\') {\n+\t\t$num_G = () = ( $seqs[$i] =~ m/T/gi );\n+\t\t$num_CG = () = ( $seqs[$i] =~ m/CT/gi );\n+\t} else {\n+\t\t$num_G = 0;\n+\t\t$num_CG = 0;\n+\t}\n+ my $num_C = () = ( $seqs[$i] =~ m/C/gi );\n+\tmy $num_TG = () = ( $seqs[$i] =~ m/TG/gi );\n+ my $CpGoe;\n+ if ( ($num_G == 0) || ($num_C == 0) || ($l == 1) || ($num_CG == 0) ) {\n+ $CpGoe = 0;\n+ }\n+ else {\n+\t if ($algo == 1) {\n+ my $x = $num_CG / ($num_C * $num_G);\n+ my $y = $l**2 / ($l - 1);\n+ $CpGoe = $x * $y;\n+ } elsif ($algo == 2) {\n+\t\t# cf.Gardiner-Garden and Frommer\n+\t\t$CpGoe = ($num_CG/($num_C * $num_G))*$l;\n+ } elsif ($algo == 3) {\n+\t\t# cf. Zeng and Yi\n+\t\t$CpGoe = ($num_CG / $l)/(($num_C + $num_G)/$l)**2;\n+\t } elsif ($algo == 4) {\n+\t\t# cf. Saxonov, Berg and Brutlag\n+\t\t$CpGoe = $num_CG / (($num_C + $num_G)/2)**2;\n+\t }\n+\t}\n+ print $OUT $names[$i] . "\\t";\n+ if ($is_detailed) {\n+ if ($algo == 3) {\n+\t\tprint $OUT $len . "\\t" . $num_CG . "\\t" . $num_TG . "\\t" .$num_C. "\\t" .$num_G. "\\t" .$num_N. "\\t";\n+\t } else {\n+\t\tprint $OUT $len . "\\t" . $num_CG . "\\t" . $num_C . "\\t" .$num_G. "\\t" .$num_N. "\\t";\n+\t }\n+ }\n+ print $OUT $CpGoe . "\\n";\n+ } else {\n+ ++$num_short;\n+ }\n+}\n+if ($is_verbose) {\n+ print $num_short . " sequence(s) discarded due to short length.\\n";\n+}\n+\n+if ($has_file_output) {\n+ my $res = close($OUT);\n+ if (!$res) {\n+ print STDERR "could not close $out_fname\\n";\n+ exit 1;\n+ }\n+}\n+\n' |
b |
diff -r 000000000000 -r 1535ffddeff4 CpGoe.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CpGoe.xml Thu Sep 07 08:51:57 2017 -0400 |
[ |
@@ -0,0 +1,57 @@ +<tool id="CpGoe" name="Calculate CpNo/e" version="1.0.0"> + <requirements> + <requirement type="package" version="5.8.0">perl</requirement> + </requirements> + <stdio> + <exit_code range="1:" /> + </stdio> + <command><![CDATA[ + perl ${__tool_directory__}/CpGoe.pl -f $fastafile -a $algo -c $context -o $outfile -m $minlength + ]]></command> + <inputs> + <param name="fastafile" type="data" format="fasta" /> + <param name="context" type="select" label="Select context to calculate the CpNo/e ratio" > + <option value="CpG" selected="selected">CpG</option> + <option value="CpA">CpA</option> + <option value="CpC">CpC</option> + <option value="CpT">CpT</option> + </param> + <param name="algo" type="select" label="Select algorithm to calculate the CpGo/e ratio" > + <option value="1">(CpG / (C * G)) * (L^2 / L-1)</option> + <option value="2">Gardiner-Garden and Frommer (CpG / (G + C))*L</option> + <option value="3">Zeng and Yi (CpG / L) / ((C + G)/L)^2</option> + <option value="4">Saxonov, Berg and Brutlag CpG / ((C + G)/2)^2</option> + </param> + <param name="minlength" type="integer" value="200" help="Minimum length of the sequence to calculate CpGo/e ratio" /> + </inputs> + <outputs> + <data name="outfile" format="tabular" /> + </outputs> + <tests> + <test> + <param name="fastafile" value="9_seqs.fa"/> + <output name="outfile1" file="9_seqs_cpgoe.csv"/> + </test> + </tests> + <help><![CDATA[ + CpGoe.pl -f fasta-file -a algo -c context -o output_file -m minlength + + Reads multi fasta files, writes name, length, CpGs and GpCs, CpGo/e ratios and other quantities into TAB file for each sequence. + + The available contexts (-c) are CpG, CpA, CpC, CpT. Default CpG. + + The algorithms (-a) available for -a are the following:: + + 1 => (CpG / (C * G)) * (L^2 / L-1) + 2 => (CpG / (C * G)) * L + 3 => (CpG / L) / ((C + G) / L)^2 + 4 => (CpG / (C + G)/2)^2 + + Where L represents the length of the sequence, CpG represents the count of CG dinucleotide, C and G represent the count for the respective bases and TG represents the number of TG dinucleotides. + ]]></help> + <citations> + <citation> + <citation type="doi">10.1101/180463</citation> + </citation> + </citations> +</tool> |
b |
diff -r 000000000000 -r 1535ffddeff4 Functions/Kernel_function_form.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Functions/Kernel_function_form.R Thu Sep 07 08:51:57 2017 -0400 |
[ |
b'@@ -0,0 +1,311 @@\n+##############################################\r\n+# KDE function for single observation sequence #\r\n+##############################################\r\n+\r\n+# load packages\r\n+require(moments, quietly = TRUE)\r\n+require(doParallel, quietly = TRUE)\r\n+\r\n+\r\n+# get names for the columns of the table characterizing the peaks\r\n+col.names.peaks <- function()\r\n+{\r\n+ v <- c("Number of modes", "Number of modes (5% excluded)",\r\n+ "Number of modes (10% excluded)", "Skewness",\r\n+ "Mode skewness", "Nonparametric skewness", "Q50 skewness",\r\n+ "Absolute Q50 mode skewness", "Absolute Q80 mode skewness",\r\n+ "Peak 1", "Probability Mass 1",\r\n+ "Peak 2", "Probability Mass 2",\r\n+ "Peak 3", "Probability Mass 3",\r\n+ "Peak 4", "Probability Mass 4",\r\n+ "Peak 5", "Probability Mass 5",\r\n+ "Peak 6", "Probability Mass 6",\r\n+ "Peak 7", "Probability Mass 7",\r\n+ "Peak 8", "Probability Mass 8",\r\n+ "Peak 9", "Probability Mass 9",\r\n+ "Peak 10", "Probability Mass 10", \r\n+ "Warning close modes",\r\n+ "Number close modes", \r\n+ "Modes (close modes excluded)",\r\n+ "SD", "IQR 80", "IQR 90",\r\n+ "Total number of sequences")\r\n+ return(v)\r\n+}\r\n+\r\n+\r\n+# get names for the columns of the table containing the boostrap results\r\n+col.names.bs <- function()\r\n+{\r\n+ v <- c("Number of modes (NM)", \r\n+ "% of samples with same NM",\r\n+ "% of samples with more NM",\r\n+ "% of samples with less NM",\r\n+ "no. of samples with same NM",\r\n+ "% BS samples excluded by prob. mass crit.",\r\n+ "Warning CI")\r\n+ return(v)\r\n+}\r\n+\r\n+\r\n+# plot KDE for one set up CpGo/e ratios\r\n+# obs: CpGo/e ratios\r\n+# bs.cis: Is bootstrap done?\r\n+# t.name: Title of the plot\r\n+# t.sub: Text that is added below the title\r\n+# t.legend: Is a legend printed?\r\n+plot.KDE <- function(obs, t.name, bs.cis = FALSE, bstrp.reps = 1500, conf.lev = 0.95, t.sub = NULL, t.legend = TRUE, min.dist = 0.2, mode.mass = 0.01, band.width = 1.06)\r\n+{ \r\n+ # determine directory where functions are located\r\n+ cmdArgs <- commandArgs(trailingOnly = FALSE)\r\n+ str <- "--file="\r\n+ match <- grep(str, cmdArgs)\r\n+ if (length(match) == 0) { \r\n+ FCTN.DIR <- "../Galaxy/Functions"\r\n+ } else {\r\n+ path <- normalizePath( sub(str, "", cmdArgs[match]) )\r\n+ FCTN.DIR <- file.path(dirname(path), "Functions")\r\n+ }\r\n+ \r\n+ # part 1: initialize parameters etc\r\n+ # ---------------------------------\r\n+ \r\n+ # table with names and number of peaks\r\n+ v <- col.names.peaks()\r\n+ tab1.m <- data.frame(matrix(NA, nrow = 1, ncol = length(v)))\r\n+ names(tab1.m) <- col.names.peaks()\r\n+ # parameters to set in any case\r\n+ num.points <- 10 ^ 4 # number of points where to estimate the density\r\n+ p.bw <- "nrd" # algorithm for the bandwidth selection, "nrd" for Scott\'s bandwith\r\n+ use.seed <- TRUE\r\n+ threshold.modes <- mode.mass \r\n+ threshold.bs.ci <- 0.2 # only changes of +- threshold.bs.ci% in prob. mass is allowed for entering the CI calculation\r\n+ # bootstrap with optional parameters\r\n+ if (bs.cis) {\r\n+ ncpus = max(1, detectCores()) # number of processsors\r\n+ cl <- makeCluster(ncpus)\r\n+ registerDoParallel(cl)\r\n+ # table for the bootstrap\r\n+ tab2.m <- data.frame(matrix(NA, nrow = 1, ncol = 7))\r\n+ names(tab2.m) <- col.names.bs()\r\n+ }\r\n+\r\n+\r\n+ # part 2: estimate the KD and calculate probability masses\r\n+ # --------------------------------------------------------\r\n+ \r\n+ source( file.path(FCTN.DIR, "density_pm.R") )\r\n+ estimate <- density_pm(obs, num.points, p.bw = band.width * sd(obs) / length(obs)^0.2, threshold.modes = threshold.modes)\r\n+ ker <- estimate$ker # kernel density\r\n+ p <- estimate$peaks\r\n+ v <- estimate$valleys\r\n+ pm <- estimate$pm\r\n+ \r\n+ # select for each peak a line type\r\n+ num.pm <- length(p) # number of peaks\r\n+ p5 <- estimate$p5\r\n+ p10 <- estimate$p10\r\n+ lty = rep(1, num.pm) #pm > 0.10\r\n+ if (length(p10) > 0) { #p'..b'= t.sym[[3]]))\r\n+ } else {\r\n+ md.labs <- substitute(paste("Mode with PM ", sym3, " 0.1", sep = ""), list(sym3 = t.sym[[3]]))\r\n+ if (thr >= 0.05) {\r\n+ md.labs <- c( md.labs, substitute(paste("Mode with ", thr, " ", sym1, " PM ", sym2, " 0.1", sep = ""), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]])) ) \r\n+ } else {\r\n+ md.labs <- c( md.labs, substitute(paste("Mode with 0.05 ", sym1, " PM ", sym2, " 0.1", sep = ""), list(sym1 = t.sym[[1]], sym2 = t.sym[[2]])),\r\n+ substitute(paste("Mode with ", thr, sym1, " PM ", sym2, " 0.05"), list(thr = thr, sym1 = t.sym[[1]], sym2 = t.sym[[2]]))) \r\n+ }\r\n+ }\r\n+ legend("topright", \r\n+ c(expression("Estimated density"), md.labs),\r\n+ lty = c(1, 1, 2, 4), lwd = c(2, 3, 3, 3),\r\n+ col = c("red", "blue", "blue", "blue"), bg = "white") \r\n+ }\r\n+ \r\n+ \r\n+ # part 5: return results\r\n+ # ----------------------\r\n+ \r\n+ # part 5 a): results table 1\r\n+ # tbd (maybe): introduce maximum number of modes (10 right now)\r\n+ tab1.m[1, "Number of modes"] <- num.pm\r\n+ tab1.m[1, 2] = num.pm - length(estimate$p5)\r\n+ tab1.m[1, 3] = num.pm - length(estimate$p10)\r\n+ for(j in 1 : 10){\r\n+ if(num.pm < j){\r\n+ tab1.m[1, j * 2 + 8] = c(" ")\r\n+ tab1.m[1, j * 2 + 9] = c(" ")\r\n+ } else{\r\n+ tab1.m[1, j * 2 + 8] = ker$x[p][j]\r\n+ tab1.m[1, j * 2 + 9] = pm[j]\r\n+ }\r\n+ }\r\n+ \r\n+ # fill table 1 with descriptives\r\n+ tab1.m[1, "Skewness"] <- skewness(obs)\r\n+ mode <- ker$x[ which.max(ker$y) ]\r\n+ tab1.m[1, "Mode skewness"] <- (mean(obs) - mode) / sd(obs)\r\n+ tab1.m[1, "Nonparametric skewness"] <- (mean(obs) - median(obs)) / sd(obs)\r\n+ q <- quantile(obs, c(0.25, 0.5, 0.75))\r\n+ tab1.m[1, "Q50 skewness"] <- (q[3] + q[1] - 2 * q[2]) / (q[3] - q[1])\r\n+ tab1.m[1, "Absolute Q50 mode skewness"] <- (q[3] + q[1]) / 2 - mode\r\n+ q <- quantile(obs, c(0.1, 0.5, 0.9))\r\n+ tab1.m[1, "Absolute Q80 mode skewness"] <- (q[3] + q[1]) / 2 - mode\r\n+ tab1.m[1, "SD"] <- sd(obs)\r\n+ tab1.m[1, "IQR 80"] <- diff(quantile(obs, c(0.1, 0.9)))\r\n+ tab1.m[1, "IQR 90"] <- diff(quantile(obs, c(0.05, 0.95)))\r\n+ tab1.m[1, "Total number of sequences"] = length(obs)\r\n+ \r\n+ # check if any peak is closer than a given threshold to any other\r\n+ num.close.modes <- sum(diff(ker$x[p]) < min.dist)\r\n+ if ( any(diff(ker$x[p]) < min.dist) && (num.pm > 1) ) {\r\n+ tab1.m[1, "Warning close modes"] <- "Modes too close"\r\n+ tab1.m[1, "Number close modes"] <- num.close.modes\r\n+ tab1.m[1, "Modes (close modes excluded)"] <- num.pm - num.close.modes\r\n+ } else {\r\n+ tab1.m[1, "Modes (close modes excluded)"] <- num.pm \r\n+ }\r\n+ \r\n+ # part 5 b): results table 2 \r\n+ if (bs.cis == TRUE) {\r\n+ ker <- lapply( estimateB, "[[", "ker")\r\n+ peaks <- lapply( estimateB, "[[", "peaks")\r\n+ num.peaks <- c()\r\n+ for (i in 1:length(peaks)) {\r\n+ curr.peaks <- ker[[i]]$x[ peaks[[i]] ]\r\n+ num.cl <- sum(diff(curr.peaks) < min.dist) \r\n+ num.peaks <- c(num.peaks, length(curr.peaks) - num.cl)\r\n+ }\r\n+ \r\n+ # fill table 2 with stats on number of modes in bs samples \r\n+ num <- num.pm - num.close.modes\r\n+ tab2.m[1, "Number of modes (NM)"] <- num\r\n+ tab2.m[1, "% of samples with same NM"] <- 100 * sum(num.peaks == num) / bstrp.reps # equal\r\n+ tab2.m[1, "% of samples with more NM"] <- 100 * sum(num.peaks > num) / bstrp.reps # more\r\n+ tab2.m[1, "% of samples with less NM"] <- 100 * sum(num.peaks < num) / bstrp.reps # less\r\n+ if (num.pm > 1) {\r\n+ tab2.m[1, "Warning CI"] <- "CI\'s may be unreliable"\r\n+ }\r\n+ \r\n+ tab2.m[1, "no. of samples with same NM"] <- sum(num.peaks == num)\r\n+ tab2.m[1, "% BS samples excluded by prob. mass crit."] <- (1 - sum(mass.ok) / sum(num.peaks.ok)) * 100\r\n+ }\r\n+\r\n+ # return the results\r\n+ if (bs.cis){\r\n+ return(list(tab.des = tab1.m, tab.bs = tab2.m, valleys = vals))\r\n+ } else {\r\n+ return(list(tab.des = tab1.m, valleys = vals))\r\n+ }\r\n+}\r\n+\r\n+\r\n+\r\n' |
b |
diff -r 000000000000 -r 1535ffddeff4 Functions/density_pm.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Functions/density_pm.R Thu Sep 07 08:51:57 2017 -0400 |
[ |
@@ -0,0 +1,141 @@ +### Auxiliary functions ### + +# Finds the peak +peaks <- function(x,partial=TRUE){ + if (partial){ #includes the first and the last element + which(diff(c(TRUE,diff(x)>=0,FALSE))<0) + }else { + which(diff(diff(x)>=0)<0)+1 + } +} + + +#Function that finds the valleys +valleys <- function(x,partial=TRUE){ + if (partial){ #includes the first and the last element + which(diff(c(FALSE,diff(x)>0,TRUE))>0) + }else { + which(diff(diff(x)>0)>0)+1 + } +} + + +#Function that calculates the probability masses +#ker: kernel density +#v: valleys +probability_mass <- function(ker,v){ + require(sfsmisc, quietly = TRUE) + ker$y[which(ker$x<0)] = 0 + pm = c() + for(j in 1:(length(v)-1)){ + pm[j] = integrate.xy(ker$x,ker$y,ker$x[v[j]],ker$x[v[j+1]], use.spline = FALSE) + } + pm = pm/sum(pm) + return(pm) +} + + +#Function that tests if pm < value +#pm: probability masses +test_pm <- function(pm,value){ + p = c() + num_pm = length(pm) + for(j in 1:num_pm){ + if(pm[j]<value){ + p = c(p,j) + } + } + return(p) +} + + + +### Main functions ### + +# Estimate the kernel density and calculate the probability masses +# obs : data set +# num.points : number of points for the estimation of the kernel density +density_pm <- function(obs, num.points, p.bw = p.bw, threshold.modes = threshold.modes){ + + # fit model + ker = density(obs, bw = p.bw, n = num.points) + + # find peaks + p = peaks(ker$y) + + # find valleys + v = valleys(ker$y) + + # probability masses + pm = probability_mass(ker,v) + num_pm = length(pm) + + # number of pm < threshold.modes + p.del = test_pm(pm,threshold.modes) + + # delete modes with probability masses < threshold.modes + for(j in 1:num_pm){ + if(j %in% p.del){ + p[j] = NA + v[j+1] = NA + } + } + p = p[!is.na(p)] + v = v[!is.na(v)] + + # probability masses (without the ones < threshold.modes) + pm = probability_mass(ker,v) + num_pm = length(pm) + + # number of pm<0.05 + p5 = test_pm(pm,0.05) + + # number of pm<0.10 + p10 = test_pm(pm,0.10) + + estimate = list(ker=ker,peaks=p,valleys=v,pm=pm,p5=p5,p10=p10) + + return(estimate) +} + + + +# Estimate the kernel density and calculate the probability masses, and do bootstrap +# obs : data set +# num.points : number of points for the estimation of the kernel density +density_pm_boot <- function(obs, num.points, p.bw = p.bw, threshold.modes = threshold.modes){ + + # fit model + ker = density(obs,bw = p.bw, n = num.points) + + # find peaks + p = peaks(ker$y) + + # find valleys + v = valleys(ker$y) + + # probability masses + pm = probability_mass(ker, v) + num_pm = length(pm) + + # number of pm < threshold.modes + p.del = test_pm(pm,threshold.modes) + + # delete modes with probability masses < threshold.modes + for(j in 1:num_pm){ + if(j %in% p.del){ + p[j] = 0 + v[j+1] = 0 + } + } + p = p[! p == 0] + v = v[! v == 0] + + # probability masses (without the ones < threshold.modes) + pm = probability_mass(ker,v) + num_pm = length(pm) + + estimate = list(ker=ker,peaks=p,valleys=v,pm=pm) + + return(estimate) +} |
b |
diff -r 000000000000 -r 1535ffddeff4 KDEanalysis.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/KDEanalysis.r Thu Sep 07 08:51:57 2017 -0400 |
[ |
b'@@ -0,0 +1,504 @@\n+# Carry out analysis of CpGo/E data for Galaxy module\n+# Ingo Bulla\n+# 27 Jan 16\n+\n+# load packages\n+pckg <- c("methods", "optparse")\n+for (p in pckg) {\n+ if (!(p %in% rownames(installed.packages()))) {\n+ stop( paste("R package", p , "is not installed"), call. = FALSE)\n+ }\n+}\n+require(methods, quietly = TRUE)\n+require(optparse, quietly = TRUE)\n+\n+# determine directory where functions are located\n+cmdArgs <- commandArgs(trailingOnly = FALSE)\n+str <- "--file="\n+match <- grep(str, cmdArgs)\n+if (length(match) == 0) {\n+ stop("notos.r not set up to be called from R console")\n+}\n+path <- normalizePath( sub(str, "", cmdArgs[match]) )\n+FCTN.DIR <- file.path(dirname(path), "Functions")\n+\n+source( file.path( FCTN.DIR, "Kernel_function_form.R") )\n+\n+\n+MAX.CPGOE <- 10 # maximum value for CpGo/e ratios\n+\n+\n+# process outliers and return quantities characterizing the distribution\n+# obs: CpGo/e ratios\n+proc.outliers <- function(obs, frac.outl) {\n+ ret <- list()\n+\n+ # remove all zeros from sample\n+ no.obs.raw <- length(obs)\n+ ret[["prop.zero"]] <- sum(obs == 0) / no.obs.raw\n+ obs <- obs[obs != 0]\n+ if (length(obs) < 3) {\n+ ret[["valid"]] <- FALSE\n+ return(ret)\n+ }\n+ ret[["obs.nz"]] <- obs\n+\n+ # replace very large values by a maximum value\n+ obs <- sapply(obs, function(x) min(x, MAX.CPGOE))\n+\n+ # defining variables\n+ # ... mean, median and standard deviation\n+ ret[["mu.obs"]] <- mu.obs <- mean(obs)\n+ ret[["me.obs"]] <- me.obs <- median(obs)\n+ sd.obs <- sd(obs)\n+ iqr.obs <- IQR(obs)\n+\n+ # ... uppper and lower limits, based on mean +- k * sd, med. +- k * iqr, k = 2, ..., 4\n+ ul.mu <- mu.obs + (2 : 5) * sd.obs\n+ ll.mu <- mu.obs - (2 : 5) * sd.obs\n+ ul.me <- quantile(obs, 0.75) + (2 : 5) * iqr.obs\n+ ll.me <- quantile(obs, 0.25) - (2 : 5) * iqr.obs\n+ names(ul.mu) <- names(ll.mu) <- 2 : 5\n+ names(ul.me) <- names(ll.me) <- 2 : 5\n+ ret[["ul.mu"]] <- ul.mu\n+ ret[["ll.mu"]] <- ll.mu\n+ ret[["ul.me"]] <- ul.me\n+ ret[["ll.me"]] <- ll.me\n+\n+ # summary statistics and data output\n+ # ... calculate proportion of data excluded when using different ranges\n+ ret[["prop2"]] <- prop2 <- length(obs[obs < ll.me["2"] | ul.me["2"] < obs]) / no.obs.raw\n+ ret[["prop3"]] <- prop3 <- length(obs[obs < ll.me["3"] | ul.me["3"] < obs]) / no.obs.raw\n+ ret[["prop4"]] <- prop4 <- length(obs[obs < ll.me["4"] | ul.me["4"] < obs]) / no.obs.raw\n+ ret[["prop5"]] <- prop5 <- length(obs[obs < ll.me["5"] | ul.me["5"] < obs]) / no.obs.raw\n+ # ... choose k in Q1 / Q3 +- k * IQR such that no more than 1% of the data are excluded\n+ v <- c(prop2, prop3, prop4, prop5) < frac.outl\n+\n+ if (any(v)) {\n+ excl.crit <- min(which(v))\n+ ret[["obs.cl"]] <- obs[!(obs < ll.me[excl.crit] | ul.me[excl.crit] < obs)]\n+ ret[["used"]] <- paste(2 : 5, "iqr", sep = "")[excl.crit]\n+ } else {\n+ ret[["obs.cl"]] <- obs[!(obs < ll.me[4] | ul.me[4] < obs)]\n+ ret[["used"]] <- "limited to 5 * iqr"\n+ }\n+ ret[["valid"]] <- TRUE\n+ return(ret)\n+}\n+\n+\n+# Read CpGo/e ratios from file\n+# warn: issue warning if necessary\n+read.CpGoe <- function(fname, warn) {\n+\t# read input file line by line, split by whitespaces, assign last substring to CpGo/e ratios\n+\t# ... remove comments and trailing whitespaces\n+\tprint(fname)\n+\tv <- read.table(fname, fill = TRUE, col.names = c("seq", "val"))\n+\tobs <- v$val\n+\n+\tobs <- obs[!is.na(obs)]\n+\treturn(obs)\n+}\n+\n+\n+# process command line arguments\n+# expected arguments:\n+# - names of the species (each as a separate argument)\n+# - names of CpGo/e files of the species (each as a separate argument)\n+# ... parse arguments\n+option_list <- list(make_option(c("-o", "--frac-outl"), type = "double", default = 0.01,\n+ help = "maximum fraction of CpGo/e ratios excluded as outliers [default %default]"),\n+ make_option(c("-d", "--min-dist"), type = "double", default = 0.2,\n+ help = "minimum distance between modes, modes that are closer are joine'..b'"mu.obs"]]\n+ me.obs <- l[["me.obs"]]\n+ ul.mu <- l[["ul.mu"]]\n+ ll.mu <- l[["ll.mu"]]\n+ ul.me <- l[["ul.me"]]\n+ ll.me <- l[["ll.me"]]\n+ tab.des[i, "prop.out.2iqr"] <- l[["prop2"]]\n+ tab.des[i, "prop.out.3iqr"] <- l[["prop3"]]\n+ tab.des[i, "prop.out.4iqr"] <- l[["prop4"]]\n+ tab.des[i, "prop.out.5iqr"] <- l[["prop5"]]\n+ obs.cl <- l[["obs.cl"]]\n+ obs.nz <- l[["obs.nz"]]\n+ tab.des[i, "used"] <- l[["used"]]\n+ tab.des[i, "no.obs.raw"] <- length(obs.org)\n+ tab.des[i, "no.obs.nozero"] <- length(obs.nz)\n+ tab.des[i, "no.obs.clean"] <- length(obs.cl)\n+ usedindex <- substr(l[["used"]],1,1)\n+ # Histograms\n+ # ... histogram 1: original data with zeros\n+ t.breaks <- seq(0, max(obs.org) + 1, by = 0.03)\n+ t.xlim <- c(0, ul.me["5"] + 0.1)\n+ hist(obs.org, breaks = t.breaks, xlim = t.xlim, xlab = "CpG o/e", main = "",\n+ sub = "Original data", prob = TRUE,\n+\t col = grey(0.9), border = grey(0.6))\n+ mtext(paste(spec.names[i]), side = 3, adj = 0)\n+\n+\n+ # ... histogram 3: median / iqr based\n+ t.lty <- rep(3, 4)\n+ t.lty[usedindex] <- 1\n+\n+ hist(obs.nz, breaks = t.breaks, xlim = t.xlim, xlab = "CpG o/e", main = "",\n+ sub = "Data without zeros, Q1/3 +- k*IQR, k=2,...,5", prob = TRUE,\n+\t col = grey(0.9), border = grey(0.6))\n+ abline(v = me.obs, col = \'blue\', lwd = 2)\n+ abline(v = c(ll.me, ul.me), col = "red", lty = rep(t.lty, 2))\n+\n+ # ... histogram 4: cleaned data\n+ hist(obs.cl, breaks = t.breaks, xlim = t.xlim, xlab = "CpG o/e", main = "",\n+ sub = "Cleaned data", prob = TRUE,\n+\t col = grey(0.9), border = grey(0.6))\n+ abline(v = me.obs, col = \'blue\', lwd = 2)\n+ abline(v = c(ll.me[usedindex], ul.me[usedindex]), col = "red")\n+}\n+invisible(dev.off())\n+\n+# output cutoff quantities\n+write.table(tab.des, file = cutoff.fname, sep = "\\t", col.names=NA)\n+\n+# plot KDE and output quantities characterizing the peaks and the bootstrap results\n+# ... table with quantities characterizing the peaks\n+v <- col.names.peaks()\n+tab1.m <- data.frame(matrix(NA, nrow = num.spec, ncol = length(v)))\n+names(tab1.m) <- col.names.peaks()\n+rownames(tab1.m) <- spec.names\n+\n+# ... table for the bootstrap\n+tab2.m <- data.frame(matrix(NA, nrow = num.spec, ncol = 7))\n+names(tab2.m) <- col.names.bs()\n+rownames(tab2.m) <- spec.names\n+\n+# summary table\n+sum1.m <- data.frame(matrix(NA, nrow = num.spec, ncol = 13)) \n+names(sum1.m) <- c("Modes", "Skewness", "Variance", "Modes too close", "Peak1", "Peak2", "Peak3", "Peak4", "Peak5", "Peak6", "Peak7", "Peak8", "Peak9")\n+rownames(sum1.m) <- spec.names\n+\n+# ... plotting\n+t.height <- 6\n+t.width <- 20\n+pdf(kde.fname, height = t.height,width = t.width, paper = "special")\n+for (i in 1:num.spec) {\n+ # read in GcGo/e ratios\n+ obs <- read.CpGoe(cpgoe.fnames[i], FALSE)\n+ l <- proc.outliers(obs, frac.outl)\n+ obs.cl <- l[["obs.cl"]]\n+\n+ # check number of values\n+ fname <- cpgoe.fnames[i]\n+ if (length(obs.cl) < 3) {\n+ stop( paste("Too few values in", fname, "(less than 3) after removal of outliers and zeros"), call. = FALSE )\n+ }\n+ if (!supp.warn.few & length(obs.cl) < 250) {\n+ warning( paste(fname, " contains only few values (", length(obs.cl), ") after removal of outliers and zeros, which may lead to unreliable results", sep = ""), call. = FALSE )\n+ }\n+\n+ # plotting\n+ l <- plot.KDE(obs.cl, t.name = spec.names[i], bs.cis = use.bstrp, bstrp.reps = bstrp.reps, conf.lev = conf.lev,\n+ min.dist = min.dist, mode.mass = mode.mass, band.width = band.width)\n+ tab1.m[i, ] <- l$tab.des\n+ sum1.m[i, ] <- l$tab.des[c(1, 4, 33, 30, 10+(2*0:8))]\n+ if (use.bstrp) {\n+ tab2.m[i, ] <- l$tab.bs\n+ }\n+ valleys = l$valleys\n+}\n+invisible(dev.off())\n+#sessionInfo()\n+\n+# ... output quantities in tables\n+write.table(sum1.m, file = summ.fname, sep = "\\t", col.names = NA)\n+write.table(tab1.m, file = peak.fname, sep = "\\t", col.names=NA)\n+write.table(valleys, file = valleys.fname, sep = "\\t", col.names=NA)\n+if (use.bstrp) {\n+ write.table(tab2.m, file = bstrp.fname, sep = "\\t", col.names=NA)\n+}\n' |
b |
diff -r 000000000000 -r 1535ffddeff4 KDEanalysis.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/KDEanalysis.xml Thu Sep 07 08:51:57 2017 -0400 |
[ |
@@ -0,0 +1,80 @@ +<tool id="notos" name="Analyse CpGo/e ratios" version="1.0.0"> + <requirements> + <requirement type="package" version="3.2.1">R</requirement> + <requirement type="package" version="1.0.0">notos</requirement> + </requirements> + <stdio> + <exit_code range="1:" /> + </stdio> + <command><![CDATA[ + Rscript ${__tool_directory__}/KDEanalysis.r + -o $fracOutl + -d $minDist + -c $confLevel + -m $modeMass + -b $bandWidth + #if $bootstrap == "TRUE" + -B + #end if + -r $BSrep + #if $nofew == "TRUE" + -f + #end if + "$species" + $cpgoe + ]]></command> + <inputs> + <param name="species" size="30" type="text" value="my_species" label="Species name" /> + <param name="cpgoe" type="data" format="tabular" /> + <param name="fracOutl" type="float" value="0.01" help="maximum fraction of CpGo/e ratios excluded as outliers" label= "max fraction of CpGo/e ratios excluded as outliers" /> + <param name="minDist" type="float" value="0.2" help="minimum distance between modes. Modes that are closer are joined" label= "min distance between modes to join" /> + <param name="confLevel" type="float" value="0.95" help="level of the confidence intervals of the mode positions" label= "level of the confidence intervals of the mode positions" /> + <param name="modeMass" type="float" value="0.05" help="minimum probability mass of a mode " label= "minimum probability mass of a mode" /> + <param name="bandWidth" type="float" value="1.06" help="bandwidth constant for kernels " label= "bandwidth constant for kernels" /> + <param name="bootstrap" type="boolean" truevalue="TRUE" falsevalue="FALSE" help="calculate confidence intervals of mode positions using bootstrap" label= "bootstrap confidence intervals?" /> + <param name="BSrep" type="integer" value="1500" help="number of bootstrap repetitions " label= "number of bootstrap repetitions " /> + <param name="nofew" type="boolean" truevalue="TRUE" falsevalue="FALSE" help="Do not print warning message when few datapoint are given." label= "Supress few datapoint warning" /> + </inputs> + <outputs> + <data name="summary" format="tabular" from_work_dir="summary.csv" label="Summary on ${on_string}" /> + <data name="outlierHist" format="pdf" from_work_dir="outliers_hist.pdf" label="Outlier Histogram on ${on_string}"/> + <data name="cutoff" format="tabular" from_work_dir="outliers_cutoff.csv" label="Outlier cutoff on ${on_string}" /> + <data name="peaks" format="tabular" from_work_dir="modes_basic_stats.csv" label="Peaks on ${on_string}" /> + <data name="kde" format="pdf" from_work_dir="KDE.pdf" label="KDE analysis on ${on_string}" /> + </outputs> + <tests> + <test> + <param name="cpgoe" value="10_std_unif.txt"/> + <output name="cutoff1" file="outliers_cutoff.csv"/> + <!-- + <output name="outlierHist" file="outliers_hist.pdf"/> + <output name="kde" file="KDE.pdf"/> + --> + </test> + </tests> + <help><![CDATA[ +Model the distribution of CpG o/e ratios using Kernel Density Estimation. + +Parameters:: + + "-o", "--frac-outl" maximum fraction of CpGo/e ratios excluded as outliers [default 0.01] + "-d", "--min-dist" minimum distance between modes, modes that are closer are joined [default 0.2] + "-c", "--conf-level" level of the confidence intervals of the mode positions [default 0.95] + "-m", "--mode-mass" minimum probability mass of a mode [default 0.05] + "-b", "--band-width" bandwidth constant for kernels [default 1.06] + "-B", "--bootstrap" calculate confidence intervals of mode positions using bootstrap. + "-r", "--bootstrap-reps" number of bootstrap repetitions [default 1500] + "-p", "--peak-file" name of the output file describing the peaks of the KDE [default peaks.csv] + "-s", "--bootstrap-file" Name of the output file with bootstrap values [default "bootstrap.csv"] + "-H", "--outlier-hist-file" Outliers histogram file [default outliers_hist.pdf] + "-C", "--cutoff-file" Outliers cutoff file [default outliers_cutoff.csv] + "-k", "--kde-file" Kernel density estimation graph [default KDE.pdf] + + + ]]></help> + <citations> + <citation> + <citation type="doi">10.1101/180463</citation> + </citation> + </citations> +</tool> |
b |
diff -r 000000000000 -r 1535ffddeff4 LICENSE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/LICENSE Thu Sep 07 08:51:57 2017 -0400 |
b |
b'@@ -0,0 +1,674 @@\n+ GNU GENERAL PUBLIC LICENSE\n+ Version 3, 29 June 2007\n+\n+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>\n+ Everyone is permitted to copy and distribute verbatim copies\n+ of this license document, but changing it is not allowed.\n+\n+ Preamble\n+\n+ The GNU General Public License is a free, copyleft license for\n+software and other kinds of works.\n+\n+ The licenses for most software and other practical works are designed\n+to take away your freedom to share and change the works. By contrast,\n+the GNU General Public License is intended to guarantee your freedom to\n+share and change all versions of a program--to make sure it remains free\n+software for all its users. We, the Free Software Foundation, use the\n+GNU General Public License for most of our software; it applies also to\n+any other work released this way by its authors. You can apply it to\n+your programs, too.\n+\n+ When we speak of free software, we are referring to freedom, not\n+price. Our General Public Licenses are designed to make sure that you\n+have the freedom to distribute copies of free software (and charge for\n+them if you wish), that you receive source code or can get it if you\n+want it, that you can change the software or use pieces of it in new\n+free programs, and that you know you can do these things.\n+\n+ To protect your rights, we need to prevent others from denying you\n+these rights or asking you to surrender the rights. Therefore, you have\n+certain responsibilities if you distribute copies of the software, or if\n+you modify it: responsibilities to respect the freedom of others.\n+\n+ For example, if you distribute copies of such a program, whether\n+gratis or for a fee, you must pass on to the recipients the same\n+freedoms that you received. You must make sure that they, too, receive\n+or can get the source code. And you must show them these terms so they\n+know their rights.\n+\n+ Developers that use the GNU GPL protect your rights with two steps:\n+(1) assert copyright on the software, and (2) offer you this License\n+giving you legal permission to copy, distribute and/or modify it.\n+\n+ For the developers\' and authors\' protection, the GPL clearly explains\n+that there is no warranty for this free software. For both users\' and\n+authors\' sake, the GPL requires that modified versions be marked as\n+changed, so that their problems will not be attributed erroneously to\n+authors of previous versions.\n+\n+ Some devices are designed to deny users access to install or run\n+modified versions of the software inside them, although the manufacturer\n+can do so. This is fundamentally incompatible with the aim of\n+protecting users\' freedom to change the software. The systematic\n+pattern of such abuse occurs in the area of products for individuals to\n+use, which is precisely where it is most unacceptable. Therefore, we\n+have designed this version of the GPL to prohibit the practice for those\n+products. If such problems arise substantially in other domains, we\n+stand ready to extend this provision to those domains in future versions\n+of the GPL, as needed to protect the freedom of users.\n+\n+ Finally, every program is threatened constantly by software patents.\n+States should not allow patents to restrict development and use of\n+software on general-purpose computers, but in those that do, we wish to\n+avoid the special danger that patents applied to a free program could\n+make it effectively proprietary. To prevent this, the GPL assures that\n+patents cannot be used to render the program non-free.\n+\n+ The precise terms and conditions for copying, distribution and\n+modification follow.\n+\n+ TERMS AND CONDITIONS\n+\n+ 0. Definitions.\n+\n+ "This License" refers to version 3 of the GNU General Public License.\n+\n+ "Copyright" also means copyright-like laws that apply to other kinds of\n+works, such as semiconductor masks.\n+\n+ "The Program" refers to a'..b'THE PROGRAM\n+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF\n+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.\n+\n+ 16. Limitation of Liability.\n+\n+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING\n+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS\n+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY\n+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE\n+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF\n+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD\n+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),\n+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF\n+SUCH DAMAGES.\n+\n+ 17. Interpretation of Sections 15 and 16.\n+\n+ If the disclaimer of warranty and limitation of liability provided\n+above cannot be given local legal effect according to their terms,\n+reviewing courts shall apply local law that most closely approximates\n+an absolute waiver of all civil liability in connection with the\n+Program, unless a warranty or assumption of liability accompanies a\n+copy of the Program in return for a fee.\n+\n+ END OF TERMS AND CONDITIONS\n+\n+ How to Apply These Terms to Your New Programs\n+\n+ If you develop a new program, and you want it to be of the greatest\n+possible use to the public, the best way to achieve this is to make it\n+free software which everyone can redistribute and change under these terms.\n+\n+ To do so, attach the following notices to the program. It is safest\n+to attach them to the start of each source file to most effectively\n+state the exclusion of warranty; and each file should have at least\n+the "copyright" line and a pointer to where the full notice is found.\n+\n+ <one line to give the program\'s name and a brief idea of what it does.>\n+ Copyright (C) <year> <name of author>\n+\n+ This program is free software: you can redistribute it and/or modify\n+ it under the terms of the GNU General Public License as published by\n+ the Free Software Foundation, either version 3 of the License, or\n+ (at your option) any later version.\n+\n+ This program is distributed in the hope that it will be useful,\n+ but WITHOUT ANY WARRANTY; without even the implied warranty of\n+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+ GNU General Public License for more details.\n+\n+ You should have received a copy of the GNU General Public License\n+ along with this program. If not, see <http://www.gnu.org/licenses/>.\n+\n+Also add information on how to contact you by electronic and paper mail.\n+\n+ If the program does terminal interaction, make it output a short\n+notice like this when it starts in an interactive mode:\n+\n+ <program> Copyright (C) <year> <name of author>\n+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w\'.\n+ This is free software, and you are welcome to redistribute it\n+ under certain conditions; type `show c\' for details.\n+\n+The hypothetical commands `show w\' and `show c\' should show the appropriate\n+parts of the General Public License. Of course, your program\'s commands\n+might be different; for a GUI interface, you would use an "about box".\n+\n+ You should also get your employer (if you work as a programmer) or school,\n+if any, to sign a "copyright disclaimer" for the program, if necessary.\n+For more information on this, and how to apply and follow the GNU GPL, see\n+<http://www.gnu.org/licenses/>.\n+\n+ The GNU General Public License does not permit incorporating your program\n+into proprietary programs. If your program is a subroutine library, you\n+may consider it more useful to permit linking proprietary applications with\n+the library. If this is what you want to do, use the GNU Lesser General\n+Public License instead of this License. But first, please read\n+<http://www.gnu.org/philosophy/why-not-lgpl.html>.\n' |
b |
diff -r 000000000000 -r 1535ffddeff4 readme.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.rst Thu Sep 07 08:51:57 2017 -0400 |
[ |
b'@@ -0,0 +1,173 @@\n+Notos\n+=====\n+\n+Notos is a suite that calculates CpN o/e ratios (e.g., the commonly used CpG o/e ratios) for a set of nucleotide sequences and uses Kernel Density Estimation (KDE) to model the obtained distribution.\n+\n+It consists of two programs, CpGoe.pl is used to calculate the CpN o/e ratios and KDEanalysis.r estimates the model. \n+In the following, these two programs are described briefly.\n+\n+CpGoe.pl\n+--------\n+\n+\n+This program will calculate CpN o/e ratios on nucleotide multifasta files.\n+For each sequence that is found in the file it will output the sequence name followed by the CpN o/e ratio, where N can be any of the nucletides A, C, G or T, into a TAB separated file.\n+\n+An example call would be:\n+\n+ perl CpGoe.pl -f input_species.fasta -a 1 -c CpG -o input_species_cpgoe.csv -m 200\n+\t\n+\n+The available contexts (-c) are CpG, CpA, CpC, CpT. Default is CpG. \n+\n+The available algorithms (-a) for calculating the CpNo/e ratio are the following (here shown for CpG o/e)::\n+\n+ 1 => (CpG / (C * G)) * (L^2 / L-1)\n+ 2 => (CpG / (C * G)) * L\n+ 3 => (CpG / L) / ((C + G) / L)^2\n+ 4 => (CpG / (C + G)/2)^2\n+\t\t\n+Here L denotes the length of the sequence, CpG represents the count of CG dinucleotide, C and G represent the count for the respective bases.\n+\n+KDEanalysis.r\n+-------------\n+\n+This program carries out two steps.\n+First, the data preparation step, mainly to remove data artifacts.\n+Secondly, the mode detection step, which is baesd on a KDE modelling approach.\n+\n+Example basic usage on command line:\n+\n+ Rscript ~/src/github/notos/KDEanalysis.r "Input species" input_species_cpgoe.csv\n+\n+\t\n+In the above case "Input species" will be used to name the graphs that are generated as well as an identifier for each sample.\n+It has to be surrounded by " if the name of the species contains spaces.\n+The input of KDEanalysis.r is of the same format as the output of CpGoe.pl.\n+\n+Any of the following parameters can be used\n+\n++--------+---------------------+-----------------------------------------------------------------------------------------+\n+| Option | Long option | Description |\n++--------+---------------------+-----------------------------------------------------------------------------------------+\n+| -o | --frac-outl | maximum fraction of CpGo/e ratios excluded as outliers [default 0.01] |\n++--------+---------------------+-----------------------------------------------------------------------------------------+\n+| -d | --min-dist | minimum distance between modes, modes that are closer are joined [default 0.2] |\n++--------+---------------------+-----------------------------------------------------------------------------------------+\n+| -c | --conf-level | level of the confidence intervals of the mode positions [default 0.95] |\n++--------+---------------------+-----------------------------------------------------------------------------------------+\n+| -m | --mode-mass | minimum probability mass of a mode [default 0.05] |\n++--------+---------------------+-----------------------------------------------------------------------------------------+\n+| -b | --band-width | bandwidth constant for kernels [default 1.06] |\n++--------+---------------------+-----------------------------------------------------------------------------------------+\n+| -B | --bootstrap | calculate confidence intervals of mode positions using bootstrap. |\n++--------+---------------------+-----------------------------------------------------------------------------------------+\n+| -r | --bootstrap-reps | number of bootstrap repetitions [default 1500] |\n++--------+---------------------+----'..b'etween the 90 % and 10 % quantile |\n++-----------------------------------+------------------------------------------------------------------------------------+\n+| IQR 90 | 90% distance between the 95 % and 5 % quantile |\n++-----------------------------------+------------------------------------------------------------------------------------+\n+| Total number of sequences | total number of sequences / CpG o/e ratios used for this analysis step |\n++-----------------------------------+------------------------------------------------------------------------------------+\n+\n+3. modes_bootstrap.csv. The columns of this optional file resulting from the bootstrap procedure contains:\n+\n++-------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n+| Column | description |\n++-------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n+| Name | name of the file analyzed |\n++-------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n+| Number of modes (NM) | number of modes detected for the original sample |\n++-------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n+| % of samples with same NM | proportion of bootstrap samples with the same number of modes (0 - 100) |\n++-------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n+| % of samples with more NM | proportion of bootstrap samples a higher number of modes (0 - 100) |\n++-------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n+| % of samples with less NM | proportion of bootstrap samples a lower number of modes (0 - 100) |\n++-------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n+| no. of samples with same NM | number of bootstrap samples with the same number of modes |\n++-------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n+| % BS samples excluded by prob.~mass crit. | proportion of bootstrap samples excluded due to strong deviations from the probability masses determined for the original sample (0 - 100) |\n++-------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+\n' |
b |
diff -r 000000000000 -r 1535ffddeff4 test-data/10_std_unif.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/10_std_unif.txt Thu Sep 07 08:51:57 2017 -0400 |
[ |
@@ -0,0 +1,11 @@ +# 10 values that are uniformly distributed on [0,1] +0.16603589 +0.77443410 +0.98018264 +0.05113997 +0.29865607 +0.45602460 +0.75859942 +0.44160652 +0.04546894 +0.05510023 |
b |
diff -r 000000000000 -r 1535ffddeff4 test-data/9_seqs.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/9_seqs.fa Thu Sep 07 08:51:57 2017 -0400 |
b |
@@ -0,0 +1,62 @@ +>seq1 +aatttatagctcttaaaaaaaagtataaccctacctaccctggacgtacgatagagggatttagagacc +tttcttccacgttagaacgacgacgtaaggacgaaagggaggtttctttttcgccttacgccgaggcgt +aaagggtggggtattttcacgcgttcgtggtacggtagaaatagggtcacatcttccacgttagaagtag +aagtagaagtaggtcgtctacacgcggaagtctgcctatttattcctcttacctagacgtagacccgcgt +ctacttgttctcgggtgtagatgtactatcccgcgaacggtcgtaggcctaactagacggtacctatcctat +>seq2 +cgcttcggagaactagacgtggaaaacgagcggcggtagaaacgtaggaataggtgtaccgtcgtaccctc +tggacgacctaggtctgtctatgtattctatctcctccggtagacgtagaaaaacgaatactcggtctatcc +tcgggcagacatagacgcgtccacctccgaatctaaacagacgggtatacgataaatctacgtcgtcggaga +cccgacgaacgaccatggaggtccgcgtagaatcgcgtcgatgtacccgtcgcggcggatccccgccggagg +agtggtcacggttggtacttcctcgacgcgatctatcgagcgggtaaacaactacgaaggaaaaaaaagcg +agcggtattccctatatgggttttaactatttatataggtataaaaacgtgacgcgtaacggtcagaaa +>seq3 +aatacgacatcgatcgtagataggattagttacacgccttgagggtaggcgtacgaaggacaag +gatatcgtcctcgagtagtttgccgttctttaccttaagactgtctctcgccgaaagatcagacgttctctt +acacatttcacccacttggacggttatgtctagactcttcacacattccgtgtcgaccagggtagatccgat +aatatgactgcgtctcgtcgcttgagttatacgcgggacaagcgccgcggttccgtacgtcgtttcggttac +gttgacgcgactgcatttgatatctacggacacttcacacgtgtcgagtagggcgagcgcctctgtacggtt +catagacacgggattcgtatctctccaaggagcgtcgctggacagtcggacgtagacgtggcgtcctttgac +>seq4 +gtgttcggatatcgccttaacggtaggacatcccgtaaatccgaacgtgacggagagtcctcctccgacgat +cgtgtggtaagtacttccgatcacgaaaccggttagaaacgattcgtcgttttcgtcgtactcgtattcctc +gctaagatcgtcgggctgctccgtacccgtcgcgccatcgtcttcgttttcgtcttctgtatacgcggattc +aatagtcgtacgcagaccctgacacggcgtgtcgcttccggtagtcctctttgtaattttggccacgacgtc +tattcccatgtatcggacatcttcgccttgtctgcagatgcccttcgtcgcgagggtcgcggccaggcacgc +gaataatacacacaggcgtttcatgatagagttcttctatacgcccac +>seq5 +gttttattataattcgacttttttttaagaatttaactagaagaaaatccgtaatctatgaaacgagtcta +catcgtttgggtgagggaaagagacggaggcgtgtaccccaatggtattccacgaagtcgtcgacctcctcg +ggatgattactgtcgctgtactcgatatctacgttcacgtcggacaggcaccctcctatcacggtaatcgtt +tccgaatgagataggacgccgaccccgtcgctcgtcctcggtatagtacgcacggcgtctccctcgacgcac +cgtatctctatattaagcgtacacattttggtcgtattctggtacctatccatcccggtgaagaatcctccc +gcgccctcgctgccgctcgttccgtagtattccgtgtggaagacgggatcacaatccgtgtgatttagcgta +>seq6 +accgagaactccgacgttttaaccacttcgttgggtccggcggtcgtcgtacaagacgtgtcgtttacggga +tataggttgaattccacgctgatgtagttaaacgacgaggtacacgtctccgtggaggatacggcgtcggaa +tacgtgtaccgaggacattttgtgcagagcacgtcgcccgtacgcgtatgtccggagacgccataccccgcg +ggacacttcgttttgggagcgcatatcctacacccctcctgtcctttcaacagacaatagttccccgcagaa +cagtcgcagactctatcgcgggttttatcacacgattgagactcggataggtggcctgtgcaccgccctcga +caacttacgcacgcgggagcgtggttcgtactcgccgtaaaggtttcgttcttgcacggagaacataccgtg +>seq7 +tcggacccgggtccgcataacctagaggcgtacgacccgggaggacaggaggtacaacacagtccgtccttt +tcgtagtcgttccctctacattttcctcgatccgcgccatacggggcaccgcccccgtatacgcacgcgacg +tacgcgagtagtagcgttaaacgaaacatgatacatataccgtagttatttttatattttacacgaagag +aaaaagacgtagaaggtgccaattagagatgccaattagagagataaaaaccgaattagagaggagcc +aattaggagccgaattagagaggagccaattaggagccgaattagagaggag +ccaattagtcgtcacgacgttcctgagagcaaaggcgttgtaaacgaatattaatcacggacctgagatgcg +>seq8 +ggacctcgtagtcggtgatcaatctcatgaacaactcgatcactctcaccgaggtggggataggatcgtcgt +acccccagtactcggcgacccgcgccaataagccaatgatggcgcaggcctcggtgatgtaactgtcgtgac +gcaacatagctcgaatgagagattcggaactgtcgacgtgttcgtacgagcatcggacgagccaacgcacgt +tgtcacgaaacacgatctccgcgtcccttttgaaagactcgaacacggcaaacaagcgacccgccgattccc +tatccctgaacggggtagccgaccatctgagatagtcctgaaccgccagaatcacgctgtcgtccgcgccga +attcgttcctacgcacatagaaagtcatggtcgcaggaatgaaaggaaacgttcgagaatggctcgaacacc +>seq9 +gttaagtagtgtgttattttttcatttattctaaagtctccatcatgtcgaagaatcccttcttcaacggat +aggacgagggttgtttagtgttgttattacactcgtagc +gtttaaacgggttcggtatcttcatgaaatcgaacatgccgtttaattcgttcgcgtttaagggttccttgg +gaatggggggacaatcggccgggacaacgacccggtcgtcgtccttcttgggatcgggttcgcaca +cgggattgagttccccgaacatgaaagacacgagtcggttgatgggatacatcagcagcgtatagacgctgt +acacgaacagattgcatagtctcgccaggtaaaatacgcacaacatacatagtcgcgcgatgctatttacca |
b |
diff -r 000000000000 -r 1535ffddeff4 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Sep 07 08:51:57 2017 -0400 |
b |
@@ -0,0 +1,51 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="R" version="3.2.1"> + <repository changeset_revision="d9f7d84125b7" name="package_r_3_2_1" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="libxml2" version="2.9.1"> + <repository changeset_revision="45b16a3ab504" name="package_libxml2_2_9_1" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="notos" version="1.0.0"> + <install version="1.0"> + <actions> + <action type="setup_r_environment"> + <repository changeset_revision="d9f7d84125b7" name="package_r_3_2_1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"> + <package name="R" version="3.2.1" /> + </repository> + <!-- libxml2 needs to be sourced after R because R also was compiled against a different version of libxml2 --> + <repository changeset_revision="45b16a3ab504" name="package_libxml2_2_9_1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"> + <package name="libxml2" version="2.9.1" /> + </repository> + <package sha256sum="f920baa2a0ef7082155c8b666851af0c77534af8b2ca0cd067e7d56fdf3ec501"> + https://github.com/cche/KDEAnalysis_deps/blob/master/getopt_1.20.0.tar.gz?raw=true + </package> + <package sha256sum="bca93c8646b731758f1cc888ee6c25e8c1ecf2364d7f556489bd879413d20abd"> + https://github.com/cche/KDEAnalysis_deps/blob/master/optparse_1.3.2.tar.gz?raw=true + </package> + <package sha256sum="ae4ea23385776eb0c06c992a3da6b0256a6c84558c1061034c5a1fbdd43d05b8"> + https://github.com/cche/KDEAnalysis_deps/blob/master/iterators_1.0.8.tar.gz?raw=true + </package> + <package sha256sum="1ef03f770f726a62e3753f2402eb26b226245958fa99d570d003fc9e47d35881"> + https://github.com/cche/KDEAnalysis_deps/blob/master/foreach_1.4.3.tar.gz?raw=true + </package> + <package sha256sum="70024b6950025cc027022ee409f382e5ad3680c0a25bcd404bfc16418be8add5"> + https://github.com/cche/KDEAnalysis_deps/blob/master/doParallel_1.0.10.tar.gz?raw=true + </package> + <package sha256sum="2a3b81e60dafdd092d2bdd3513d7038855ca7d113dc71df1229f7518382a3e39"> + https://github.com/cche/KDEAnalysis_deps/blob/master/moments_0.14.tar.gz?raw=true + </package> + <package sha256sum="7f430cf3ebb95bac806fbf093fb1e2112deba47416a93be8d5d1064b76bc0015"> + https://github.com/cche/KDEAnalysis_deps/blob/master/sfsmisc_1.1-0.tar.gz?raw=true + </package> + </action> + <action type="set_environment"> + <environment_variable action="append_to" name="R_LIBS">$INSTALL_DIR</environment_variable> + </action> + </actions> + </install> + <readme> + ***CHANGE THIS***** + </readme> + </package> +</tool_dependency> |