view Dotplot_Release/dotplot.bash @ 15:a6c081fcc0a9 draft

Uploaded
author bornea
date Wed, 16 Mar 2016 12:10:35 -0400
parents bc752a05f16d
children
line wrap: on
line source

#!/bin/bash
#SCRIPT=$(readlink -e $0)
#SCRIPTPATH=`dirname $SCRIPT`
# Absolute path of the directory that contains this script; the helper
# Perl/R programs run further down are addressed relative to it.
# The cd happens inside a command substitution, so the caller's working
# directory is not changed. Quoting keeps paths with spaces intact. If the
# directory cannot be entered, fall back to the current directory.
SCRIPTPATH=$(cd -- "$(dirname -- "$0")" > /dev/null && pwd) || SCRIPTPATH=$(pwd)

# Print the command-line synopsis on stderr and abort the script.
# Globals:   $0 (read) - name the script was invoked as
# Arguments: none
# Outputs:   the option summary, written to stderr
# Returns:   does not return; exits with status 1
usage() {
    # The text is supplied as printf arguments (one per output line) rather
    # than as the format string, so a '%' or backslash in $0 is printed
    # literally. The redirection is attached to the printf command itself so
    # the message really goes to stderr.
    printf '%s\n' \
        "Usage: $0 " \
        '[-f <saint_file_name.txt>]' \
        '[-i <0 for SaintExpress format, 1 for other>]' \
        '[-c <clustering to perform. Options: b (biclustering), h (hierarchical), n (none, requires input text files for bait and prey ordering; see options -b and -p)>]' \
        '[-n <clustering type to be performed if option -c is set to "h">]' \
        '[-d <distance metric to use if option -c is set to "h">]' \
        '[-b <list of bait proteins in display order (see option -c n)>]' \
        '[-p <list of prey proteins in display order (see option -c n). Set this to "all" if you want to include all preys and cluster them>]' \
        '[-s <primary FDR cutoff [0-1, recommended=0.01]>]' \
        '[-t <secondary FDR cutoff [must be less than the primary, recommended=0.025]>' \
        '[-x <spectral count minimum. Only preys with >= this will be used]>' \
        '[-m <maximum spectral count>]' \
        '[-N <normalization, 0 for no (default), 1 for yes, 2 for normalization based on significant preys counts (prey FDR <= option -t)>]' \
        '[-C <FDR cutoff for normalization if using option -N 2 (deafult is -t)>]' >&2
    exit 1
}

# Defaults for the optional settings (-i, -x, -N, -n, -d); values given on
# the command line replace them in the loop below.
i=0
x=0
N=0
n="ward"
d="canberra"

# Every option takes an argument, and each one is stored in the shell
# variable that has the same name as its option letter. The leading ':' in
# the option string puts getopts in silent mode, so unknown options and
# missing arguments both land in the catch-all arm.
while getopts ":f:i:s:t:x:m:c:n:d:b:p:N:C:" opt; do
    case "${opt}" in
        [fistxmcndbpNC])
            printf -v "${opt}" '%s' "${OPTARG}"
            ;;
        *)
            usage
            ;;
    esac
done

# Drop the parsed options, leaving only positional arguments.
shift "$((OPTIND - 1))"

# Prefix used for intermediate files and for the Output_<prefix> directory:
# the input file name without leading directories and without everything
# from its first '.' onwards (so "my.saint.txt" gives "my"). Directories are
# stripped first so that inputs such as "./saint.txt" or "runs.v2/saint.txt"
# give "saint" instead of an empty or path-like prefix.
filename=${f##*/}
filename=${filename%%.*}

echo "Saint input file = ${f}"
echo "Primary FDR cutoff = ${s}"
echo "Secondary FDR cutoff for dotplot = ${t}"
echo "Minimum spectral count for significant preys = ${x}"
echo "Maximum spectral count for dot plot = ${m}"

# -f, -s, -t, -m and -c have no defaults; show the usage text if any of them
# is missing or empty.
if [ -z "${f}" ] || [ -z "${s}" ] || [ -z "${t}" ] || [ -z "${m}" ] || [ -z "${c}" ]; then
    usage
fi

# -i 1 means the input is not in SaintExpress format: run the converter on it
# and continue with mockSaintExpress.txt (presumably the file the converter
# writes - confirm against SaintConvert.pl). Quoting keeps script paths and
# input names containing spaces as single arguments.
if [ "${i}" == 1 ]; then
	"$SCRIPTPATH/SaintConvert.pl" -i "${f}"
	f="mockSaintExpress.txt"
fi

# Sanity-check the spectral count window given by -x (minimum) and -m
# (maximum). A minimum at or above the maximum is fatal.
if [ "${x}" -ge "${m}" ]; then
	printf '%s\n' "spectral count minimum (${x}) cannot be greater than or equal to the maximum (${m})"
	exit 1
fi

# A negative minimum is not fatal: report it and clamp it to 0.
if [ "${x}" -lt 0 ]; then
	printf '%s\n' "spectral count minimum (${x}) cannot be less than 0. Setting to 0 and continuing"
	x=0
fi

###Check for normalization

# -N 1: run Normalization.R on the current input file.
# -N 2: run Normalization_sigpreys.R with an FDR cutoff C, which falls back
#       to the secondary cutoff (-t) when -C was not given.
# In both cases processing continues with norm_saint.txt afterwards.
# All paths and arguments are quoted so values with spaces stay intact.
if [ "${N}" == 1 ]; then
	printf "\nNormalization is being performed\n"
	"$SCRIPTPATH/Normalization.R" "${f}"
	f="norm_saint.txt"
elif [ "${N}" == 2 ]; then
	printf "\nNormalization is being performed\n"
	C=${C:-${t}}
	"$SCRIPTPATH/Normalization_sigpreys.R" "${f}" "${C}"
	f="norm_saint.txt"
fi


###Check for clustering etc

# Validate the hierarchical clustering method (-n). Only relevant for -c h:
#   empty          -> explain the choices and fall back to "ward"
#   known method   -> announce it
#   anything else  -> report it and exit with status 1
# The user-supplied value is passed to printf as an argument, never as part
# of the format string, so a '%' in it is printed literally.
if [ "${c}" == "h" ]; then
	case "${n}" in
		"")
			printf "\nHierarchial clustering was selected (-c = h), but no clustering method (-n) was chosen.\n"
			printf "The input parameter -n must be set to one of \"average\", \"centroid\", \"complete\", \"mcquitty\",\n"
			printf "\"median\", \"single\" or \"ward\". \"ward\" will be selected as default.\n\n"
			n="ward"
			;;
		average|centroid|complete|mcquitty|median|single|ward)
			printf "\nHierarchical clustering (method = %s) will be performed\n\n" "${n}"
			;;
		*)
			printf "\n%s is not a valid Hierarchical clustering method.\n" "${n}"
			printf "Choose one of \"average\", \"centroid\", \"complete\", \"mcquitty\", \"median\", \"single\" or \"ward\"\n\n"
			exit 1
			;;
	esac
fi

# p_c is handed to R_dotPlot_nc.R later on; it starts at 0 and is switched to
# 1 only when "-c n -p all" asks for the preys to be clustered.
p_c=0

# Validate the distance metric (-d). Only relevant for -c h:
#   empty          -> explain the choices and fall back to "canberra"
#   known metric   -> announce it
#   anything else  -> report it and exit with status 1
# The user-supplied value is passed to printf as an argument, never as part
# of the format string, so a '%' in it is printed literally.
if [ "${c}" == "h" ]; then
	case "${d}" in
		"")
			printf "\nHierarchial clustering was selected (-c = h), but no distance metric (-d) was chosen.\n"
			printf "The input parameter -d must be set to one of  \"binary\", \"canberra\", \"euclidean\",\n"
			printf "\"manhattan\", \"maximum\" or \"minkowski\". \"canberra\" will be selected as default.\n\n"
			d="canberra"
			;;
		binary|canberra|euclidean|manhattan|maximum|minkowski)
			printf "\nHierarchical clustering (distance metric = %s) will be performed\n\n" "${d}"
			;;
		*)
			printf "\n%s is not a valid Hierarchical clustering distance metric.\n" "${d}"
			printf "Choose one of  \"binary\", \"canberra\", \"euclidean\", \"manhattan\", \"maximum\" or \"minkowski\"\n\n"
			exit 1
			;;
	esac
fi

# Checks for -c n (no bait clustering): a bait list (-b) and a prey list (-p)
# are both required. "-p all" is special: the preys are still clustered, so
# p is replaced by the placeholder "empty" and p_c is set to 1.
if [ "${c}" == "n" ]; then
	if [ -z "${b}" ]; then
		printf "\n\"No Clustering\" option was selected (-c = n), but no bait list was included (option -b).\n"
		printf "Bait list must be in .txt formart.\n\n"
		exit 1
	elif [ -z "${p}" ]; then
		printf "\n\"No Clustering\" option was selected (-c = n), but no prey list was included (option -p).\n"
		printf "Prey list must be in .txt formart.\n\n"
		exit 1
	elif [ "${p}" == "all" ]; then
		printf "\n\"No Clustering\" option was selected (-c = n) for baits, but preys will still be clustered.\n"
		printf "using \"ward\" and \"canberra\" as defaults or options as supplied on command line.\n\n"
		p="empty"
		p_c=1
		# Honour -n/-d given on the command line, as the message above
		# promises; "ward"/"canberra" are used only when they are empty.
		# NOTE(review): the values are not validated in this mode.
		n=${n:-ward}
		d=${d:-canberra}
	fi
fi


###Check number of baits

# BaitCheck.pl prints the number of baits found in the input file.
# Quoting keeps paths with spaces intact and keeps the comparison below
# well-formed even when the script printed nothing.
bait_n=$(perl "$SCRIPTPATH/BaitCheck.pl" -i "${f}")
echo "Number of baits = ${bait_n}"
printf "\n\n"

# With exactly two baits, biclustering is replaced by hierarchical
# clustering using the "ward" method.
if [ "${c}" == "b" ] && [ "${bait_n}" == 2 ]; then
	printf "\nWarning only 2 baits are present. Biclustering will not performed.\n"
	printf "Hierarchical clustering (method = ward) will be performed instead.\n\n"
	c="h"
	n="ward"
fi


###Generate plots

# Move each named file or directory into a destination directory, one mv per
# item so that a missing item does not stop the remaining ones.
# Arguments: $1 - destination directory; $2.. - items to move
move_into() {
	local dest=$1 item
	shift
	for item in "$@"; do
		mv -- "${item}" "${dest}"
	done
}

# Final PDFs go to out_dir, intermediate files to temp_dir.
out_dir="Output_${filename}"
temp_dir="${out_dir}/TempData_${filename}"

# Run the pipeline that matches the clustering mode, then collect its files.
# Every expansion is quoted so paths and values containing spaces survive,
# and "mkdir -p" lets the script be re-run into an existing output directory.
case "${c}" in
	b)
		printf "\nBiclustering will be performed\n\n"
		"$SCRIPTPATH/Step1_data_reformating.R" "${f}" "${s}" "${filename}"
		"$SCRIPTPATH/Step2_data_filtering.R" "${filename}_matrix.txt" "${x}" "${filename}"
		# Fixed seed for the nested clustering step.
		GSL_RNG_SEED=123 "$SCRIPTPATH/Step3_nestedcluster" "${filename}.dat" "$SCRIPTPATH/biclust_param.txt"
		"$SCRIPTPATH/Step4_biclustering.R" "${filename}.dat"

		"$SCRIPTPATH/SOFD.pl" -i "${f}" -s "${s}" -x "${x}"
		"$SCRIPTPATH/R_dotPlot.R" "${s}" "${t}" "${m}"

		mkdir -p -- "${temp_dir}"
		move_into "${temp_dir}" \
			bait_lists Clusters MCMCparameters NestedClusters NestedMu \
			NestedSigma2 OPTclusters "${filename}_matrix.txt" "${filename}.dat" \
			SC_data.txt FDR_data.txt clustered_matrix.txt singletons.txt \
			bait2bait_matrix.txt baitClusters clusteredData
		move_into "${out_dir}" dotplot.pdf bait2bait.pdf estimated.pdf stats.pdf
		cp -- "$SCRIPTPATH/legend.pdf" "${out_dir}"
		;;
	h)
		"$SCRIPTPATH/SOFD.pl" -i "${f}" -s "${s}" -x "${x}"
		"$SCRIPTPATH/R_dotPlot_hc.R" "${s}" "${t}" "${m}" "${n}" "${d}" "$SCRIPTPATH"

		mkdir -p -- "${temp_dir}"
		move_into "${out_dir}" dotplot.pdf heatmap_borders.pdf heatmap_no_borders.pdf bait2bait.pdf
		move_into "${temp_dir}" SC_data.txt FDR_data.txt
		cp -- "$SCRIPTPATH/legend.pdf" "${out_dir}"
		;;
	n)
		"$SCRIPTPATH/SOFD.pl" -i "${f}" -s "${s}" -x "${x}"
		# Show the plotting command before running it.
		echo "$SCRIPTPATH/R_dotPlot_nc.R ${s} ${t} ${m} ${b} $p_c ${p} ${n} ${d} $SCRIPTPATH"
		"$SCRIPTPATH/R_dotPlot_nc.R" "${s}" "${t}" "${m}" "${b}" "$p_c" "${p}" "${n}" "${d}" "$SCRIPTPATH"

		mkdir -p -- "${temp_dir}"
		move_into "${out_dir}" dotplot.pdf heatmap_borders.pdf heatmap_no_borders.pdf
		move_into "${temp_dir}" SC_data.txt FDR_data.txt
		cp -- "$SCRIPTPATH/legend.pdf" "${out_dir}"
		;;
	*)
		printf -- "-c must be one of [b, h, n]:  b (biclustering), h (hierarchical), n (none, requires input text files for bait and prey ordering>\n"
		exit 1
		;;
esac

# When normalization ran (-N 1 or -N 2), keep the normalized input file with
# the other intermediate files. The destination is quoted so an output prefix
# containing spaces is treated as a single path.
if [ "${N}" == "1" ] || [ "${N}" == "2" ]; then
	mv -- norm_saint.txt "Output_${filename}/TempData_${filename}"
fi