diff LINKYX_identify_Y_linked_SNPs.sh @ 0:05c27700e5ca

initial commit
author biomonika <biomonika@psu.edu>
date Thu, 04 Sep 2014 18:24:19 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/LINKYX_identify_Y_linked_SNPs.sh	Thu Sep 04 18:24:19 2014 -0400
@@ -0,0 +1,114 @@
+#!/bin/bash
+
+#LINKYX_PATH set during toolshed installation
+
+results=$1 #X-linked candidates written here
+
+reference=${2}
+mother_male_bam=${3}
+father_male_bam=${4}
+daughter_male_bam=${5}
+son_male_bam=${6}
+
+results_fasta=${7}
+
+mkdir -p Y references; dir="Y/";
+grep ">" $reference | sed 's/>//g' >references/male_contig_names
+
+file_exists () 
+	{
+	for ARG in "$@";
+	do
+		if [ ! -f ${ARG} ]; then
+		echo "File $ARG not found"; 
+		fi
+	done;
+	}
+
+#index bam files
+	
+index_bam_file () 
+	{
+	for ARG in "$@";
+		do
+		bam_file_index=${ARG}".bai"
+		if [ -e $bam_file_index ];
+		then	
+			echo "Index already exists. Not being created again.";
+		else
+			samtools index ${ARG};
+		fi
+	    done;
+	}
+
+#get statistics about mapped and unmapped reads for every contig
+
+idxstats_bam_file () 
+	{
+		#get statistics about mapped and unmapped reads for every contig
+		echo "Idxstats" $1 $2;
+		stat_male_name=statistics/stat_male_${2};
+		samtools idxstats $1 >$stat_male_name #statistics for bam files mapping on male reference
+
+		#with MAPQ>0
+		samtools view -bq 1 $1 >statistics/high_score_male_${2}.bam
+		samtools index statistics/high_score_male_${2}.bam
+
+		stat_male_high_name=statistics/stat_male_${2}_high
+		samtools idxstats statistics/high_score_male_${2}.bam >$stat_male_high_name #statistics for bam files mapping on male reference with MAPQ>0
+	}
+
+echo "Checking existence of input bam files."
+file_exists $mother_male_bam $father_male_bam $son_male_bam $daughter_male_bam
+index_bam_file $mother_male_bam $father_male_bam $son_male_bam $daughter_male_bam
+
+#retrieve statistics
+mkdir -p statistics;
+idxstats_bam_file $mother_male_bam mother
+idxstats_bam_file $father_male_bam father
+idxstats_bam_file $daughter_male_bam daughter
+idxstats_bam_file $son_male_bam son
+
+#read statistics
+reads_number_mother=`awk '{read_number+=($3+$4)} END {print read_number}' statistics/stat_male_mother`;
+reads_number_father=`awk '{read_number+=($3+$4)} END {print read_number}' statistics/stat_male_father`;
+reads_number_daughter=`awk '{read_number+=($3+$4)} END {print read_number}' statistics/stat_male_daughter`;
+reads_number_son=`awk '{read_number+=($3+$4)} END {print read_number}' statistics/stat_male_son`;
+
+echo "Reads numbers: " $reads_number_mother $reads_number_father $reads_number_daughter $reads_number_son
+
+#filter contigs, which are candidate Y genes - very divergent from their X counterparts
+perl ${LINKYX_PATH}/scripts/filter_diverged_Y_contigs.pl $reads_number_mother $reads_number_father $reads_number_daughter $reads_number_son $reference ${LINKYX_PATH} >Y_results.txt;
+
+echo "Y_results: " >>$results
+
+final_contigs=`cat Y_list | wc -l`
+#have we found any genes?
+
+if [ $final_contigs -gt 0 ]; then
+	#we found SOME genes
+	#retrieve sequences in fasta
+	perl ${LINKYX_PATH}/scripts/get_sequences_based_on_ids.pl $reference Y_list >Y_results_sequences.fasta;
+
+	get_results_in_bam_file () 
+	{
+	#candidates full_bam_file candidates_bam_file
+	cat Y_list | xargs samtools view -F 4 -b $1 > $2;
+	samtools index $2
+	}
+
+	get_results_in_bam_file $mother_male_bam ${8} #argument for output bam file mother
+	get_results_in_bam_file $father_male_bam ${9} #argument for output bam file father
+	get_results_in_bam_file $daughter_male_bam ${10} #argument for output bam file daughter
+	get_results_in_bam_file $son_male_bam ${11} #argument for output bam file son
+	wait;
+else
+		#we found NO genes
+		echo "No sequences found." >Y_results.txt;
+		echo "No sequences found." >Y_results_sequences.fasta;
+	fi
+
+#output results to Galaxy
+	cat Y_results.txt >>$results
+	cat Y_results_sequences.fasta >>$results_fasta
+	ls -lah */* >>$results