diff Dotplot_Release/dotplot.bash @ 3:bc752a05f16d draft

Uploaded
author bornea
date Tue, 15 Mar 2016 15:25:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Dotplot_Release/dotplot.bash	Tue Mar 15 15:25:15 2016 -0400
@@ -0,0 +1,252 @@
+#!/bin/bash
+#SCRIPT=$(readlink -e $0)
+#SCRIPTPATH=`dirname $SCRIPT`
+pushd `dirname $0` > /dev/null
+SCRIPTPATH=`pwd`
+popd > /dev/null
+
+usage() { printf "Usage: $0 
+[-f <saint_file_name.txt>]
+[-i <0 for SaintExpress format, 1 for other>]
+[-c <clustering to perform. Options: b (biclustering), h (hierarchical), n (none, requires input text files for bait and prey ordering; see options -b and -p)>]
+[-n <clustering type to be performed if option -c is set to \"h\">]
+[-d <distance metric to use if option -c is set to \"h\">]
+[-b <list of bait proteins in display order (see option -c n)>]
+[-p <list of prey proteins in display order (see option -c n). Set this to \"all\" if you want to include all preys and cluster them>]
+[-s <primary FDR cutoff [0-1, recommended=0.01]>]
+[-t <secondary FDR cutoff [must be less than the primary, recommended=0.025]>
+[-x <spectral count minimum. Only preys with >= this will be used]>
+[-m <maximum spectral count>]
+[-N <normalization, 0 for no (default), 1 for yes, 2 for normalization based on significant preys counts (prey FDR <= option -t)>]
+[-C <FDR cutoff for normalization if using option -N 2 (deafult is -t)>]\n"
+1>&2; exit 1; }
+
+N=0
+n="ward"
+d="canberra"
+x=0
+i=0
+while getopts ":f:i:s:t:x:m:c:n:d:b:p:N:C:" o; do
+    case "${o}" in
+        f)
+            f=${OPTARG}
+            ;;
+        i)
+	    i=${OPTARG}
+            ;;
+        s)
+            s=${OPTARG}
+            ;;
+	t)
+            t=${OPTARG}
+            ;;
+        x)
+	    x=${OPTARG}
+            ;;
+	m)
+            m=${OPTARG}
+            ;;
+	c)
+            c=${OPTARG}
+	    ;;
+	n)
+	    n=${OPTARG}
+	    ;;
+	d)
+	    d=${OPTARG}
+	    ;;
+	b)
+            b=${OPTARG}
+	    ;;
+	p)
+	    p=${OPTARG}
+	    ;;
+	N)
+	    N=${OPTARG}
+	    ;;
+	C)
+	    C=${OPTARG}
+	    ;;
+        *)
+            usage
+            ;;
+    esac
+done
+shift $((OPTIND-1))
+
+filename=${f%%.*}
+echo "Saint input file = ${f}"
+echo "Primary FDR cutoff = ${s}"
+echo "Secondary FDR cutoff for dotplot = ${t}"
+echo "Minimum spectral count for significant preys = ${x}"
+echo "Maximum spectral count for dot plot = ${m}"
+
+if [ -z "${f}" ] || [ -z "${s}" ] || [ -z "${t}" ] || [ -z "${m}" ] || [ -z "${c}" ]; then
+    usage
+fi
+
+if [ "${i}" == 1 ]; then
+	$SCRIPTPATH/SaintConvert.pl -i ${f}
+	f="mockSaintExpress.txt"
+fi
+
+if [ "${x}" -ge "${m}" ]; then
+	echo "spectral count minimum (${x}) cannot be greater than or equal to the maximum (${m})"
+	exit 1;
+elif [ "${x}" -lt 0 ]; then
+	echo "spectral count minimum (${x}) cannot be less than 0. Setting to 0 and continuing"
+	x=0
+fi
+
+###Check for normalization
+
+if [ "${N}" == 1 ]; then
+	printf "\nNormalization is being performed\n"
+	$SCRIPTPATH/Normalization.R ${f}
+	f="norm_saint.txt"
+elif [ "${N}" == 2 ]; then
+	printf "\nNormalization is being performed\n"
+	if [ -z "${C}" ]; then
+		C=${t}
+	fi
+	$SCRIPTPATH/Normalization_sigpreys.R ${f} ${C}
+	f="norm_saint.txt"
+fi
+
+
+###Check for clustering etc
+
+if [ "${c}" == "h" ] && [ -z "${n}" ]; then
+	printf "\nHierarchial clustering was selected (-c = h), but no clustering method (-n) was chosen.\n"
+	printf "The input parameter -n must be set to one of \"average\", \"centroid\", \"complete\", \"mcquitty\",\n"
+	printf "\"median\", \"single\" or \"ward\". \"ward\" will be selected as default.\n\n"
+	n="ward"
+elif [ "${c}" == "h" ] && [ -n "${n}" ]; then
+	if [ "${n}" == "average" ] || [ "${n}" == "centroid" ] || [ "${n}" == "complete" ] || [ "${n}" == "mcquitty" ] || [ "${n}" == "median" ] || [  "${n}" == "single" ] || [ "${n}" == "ward" ]; then
+		printf "\nHierarchical clustering (method = ${n}) will be performed\n\n"
+	else
+		printf "\n${n} is not a valid Hierarchical clustering method.\n"
+		printf "Choose one of \"average\", \"centroid\", \"complete\", \"mcquitty\", \"median\", \"single\" or \"ward\"\n\n"
+		exit 1
+	fi
+fi
+
+p_c=0
+if [ "${c}" == "h" ] && [ -z "${d}" ]; then
+	printf "\nHierarchial clustering was selected (-c = h), but no distance metric (-d) was chosen.\n"
+	printf "The input parameter -d must be set to one of  \"binary\", \"canberra\", \"euclidean\",\n"
+	printf "\"manhattan\", \"maximum\" or \"minkowski\". \"canberra\" will be selected as default.\n\n"
+	d="canberra"
+elif [ "${c}" == "h" ] && [ -n "${d}" ]; then
+	if [ "${d}" == "binary" ] || [ "${d}" == "canberra" ] || [ "${d}" == "euclidean" ] || [ "${d}" == "manhattan" ] || [ "${d}" == "maximum" ] || [  "${d}" == "minkowski" ]; then
+		printf "\nHierarchical clustering (distance metric = ${d}) will be performed\n\n"
+	else
+		printf "\n${d} is not a valid Hierarchical clustering distance metric.\n"
+		printf "Choose one of  \"binary\", \"canberra\", \"euclidean\", \"manhattan\", \"maximum\" or \"minkowski\"\n\n"
+		exit 1
+	fi
+fi
+
+if [ "${c}" == "n" ] && [ -z "${b}" ]; then
+	printf "\n\"No Clustering\" option was selected (-c = n), but no bait list was included (option -b).\n"
+	printf "Bait list must be in .txt formart.\n\n"
+	exit 1
+elif [ "${c}" == "n" ] && [ -z "${p}" ]; then
+	printf "\n\"No Clustering\" option was selected (-c = n), but no prey list was included (option -p).\n"
+	printf "Prey list must be in .txt formart.\n\n"
+	exit 1
+elif [ "${c}" == "n" ] && [ "${p}" == "all" ]; then
+	printf "\n\"No Clustering\" option was selected (-c = n) for baits, but preys will still be clustered.\n"
+	printf "using \"ward\" and \"canberra\" as defaults or options as supplied on command line.\n\n"
+	p="empty"
+	p_c=1
+	n="ward"
+	d="canberra"
+fi
+
+
+###Check number of baits
+
+bait_n=$(perl $SCRIPTPATH/BaitCheck.pl -i ${f})
+echo "Number of baits = "$bait_n
+printf "\n\n"
+
+if [ "${c}" == "b" ] && [ $bait_n == 2 ]; then
+	printf "\nWarning only 2 baits are present. Biclustering will not performed.\n"
+	printf "Hierarchical clustering (method = ward) will be performed instead.\n\n"
+	c="h"
+	n="ward"
+fi
+
+
+###Generate plots
+
+if [ "${c}" == "b" ]; then
+	printf "\nBiclustering will be performed\n\n"
+	$SCRIPTPATH/Step1_data_reformating.R ${f} ${s} ${filename}
+	$SCRIPTPATH/Step2_data_filtering.R ${filename}_matrix.txt ${x} ${filename}
+	GSL_RNG_SEED=123  $SCRIPTPATH/Step3_nestedcluster ${filename}.dat $SCRIPTPATH/biclust_param.txt
+	$SCRIPTPATH/Step4_biclustering.R ${filename}.dat
+
+	$SCRIPTPATH/SOFD.pl -i ${f} -s ${s} -x ${x}
+	$SCRIPTPATH/R_dotPlot.R ${s} ${t} ${m}
+	mkdir Output_${filename}
+	mkdir Output_${filename}/TempData_${filename}
+	mv bait_lists Output_${filename}/TempData_${filename}
+	mv Clusters Output_${filename}/TempData_${filename}
+	mv MCMCparameters Output_${filename}/TempData_${filename}
+	mv NestedClusters Output_${filename}/TempData_${filename}
+	mv NestedMu Output_${filename}/TempData_${filename}
+	mv NestedSigma2 Output_${filename}/TempData_${filename}
+	mv OPTclusters Output_${filename}/TempData_${filename}
+	mv ${filename}_matrix.txt Output_${filename}/TempData_${filename}
+	mv ${filename}.dat Output_${filename}/TempData_${filename}
+	mv SC_data.txt Output_${filename}/TempData_${filename}
+	mv FDR_data.txt Output_${filename}/TempData_${filename}
+	mv clustered_matrix.txt Output_${filename}/TempData_${filename}
+	mv singletons.txt Output_${filename}/TempData_${filename}
+	mv bait2bait_matrix.txt Output_${filename}/TempData_${filename}
+	mv baitClusters Output_${filename}/TempData_${filename}
+	mv clusteredData Output_${filename}/TempData_${filename}
+	mv dotplot.pdf Output_${filename}
+	mv bait2bait.pdf Output_${filename} 
+	mv estimated.pdf Output_${filename} 
+	mv stats.pdf Output_${filename}
+	cp $SCRIPTPATH/legend.pdf Output_${filename}
+elif [ "${c}" == "h" ]; then
+
+	$SCRIPTPATH/SOFD.pl -i ${f} -s ${s} -x ${x}
+	$SCRIPTPATH/R_dotPlot_hc.R ${s} ${t} ${m} ${n} ${d} $SCRIPTPATH
+
+	mkdir Output_${filename}
+	mkdir Output_${filename}/TempData_${filename}
+	mv dotplot.pdf Output_${filename}
+	mv heatmap_borders.pdf Output_${filename}
+	mv heatmap_no_borders.pdf Output_${filename}
+	mv bait2bait.pdf Output_${filename}
+	mv SC_data.txt Output_${filename}/TempData_${filename}
+	mv FDR_data.txt Output_${filename}/TempData_${filename}
+	cp $SCRIPTPATH/legend.pdf Output_${filename}
+elif [ "${c}" == "n" ]; then
+	
+	$SCRIPTPATH/SOFD.pl -i ${f} -s ${s} -x ${x}
+	echo "$SCRIPTPATH/R_dotPlot_nc.R ${s} ${t} ${m} ${b} $p_c ${p} ${n} ${d} $SCRIPTPATH"
+	$SCRIPTPATH/R_dotPlot_nc.R ${s} ${t} ${m} ${b} $p_c ${p} ${n} ${d} $SCRIPTPATH
+
+	mkdir Output_${filename}
+	mkdir Output_${filename}/TempData_${filename}
+	mv dotplot.pdf Output_${filename}
+	mv heatmap_borders.pdf Output_${filename}
+	mv heatmap_no_borders.pdf Output_${filename}
+	mv SC_data.txt Output_${filename}/TempData_${filename}
+	mv FDR_data.txt Output_${filename}/TempData_${filename}
+	cp $SCRIPTPATH/legend.pdf Output_${filename}
+else
+	printf -- "-c must be one of [b, h, n]:  b (biclustering), h (hierarchical), n (none, requires input text files for bait and prey ordering>\n"
+	exit 1;
+fi
+
+if [ "${N}" == "1" ] || [ "${N}" == "2" ]; then
+	mv norm_saint.txt Output_${filename}/TempData_${filename}
+fi
+