annotate LINKYX_identify_Y_linked_SNPs.sh @ 2:0d8315be76b7

Uploaded
author biomonika
date Tue, 17 Feb 2015 21:59:51 -0500
parents 05c27700e5ca
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
1 #!/bin/bash
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
2
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
3 #LINKYX_PATH set during toolshed installation
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
4
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
5 results=$1 #X-linked candidates written here
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
6
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
7 reference=${2}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
8 mother_male_bam=${3}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
9 father_male_bam=${4}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
10 daughter_male_bam=${5}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
11 son_male_bam=${6}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
12
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
13 results_fasta=${7}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
14
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
15 mkdir -p Y references; dir="Y/";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
16 grep ">" $reference | sed 's/>//g' >references/male_contig_names
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
17
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
18 file_exists ()
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
19 {
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
20 for ARG in "$@";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
21 do
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
22 if [ ! -f ${ARG} ]; then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
23 echo "File $ARG not found";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
24 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
25 done;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
26 }
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
27
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
28 #index bam files
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
29
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
30 index_bam_file ()
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
31 {
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
32 for ARG in "$@";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
33 do
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
34 bam_file_index=${ARG}".bai"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
35 if [ -e $bam_file_index ];
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
36 then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
37 echo "Index already exists. Not being created again.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
38 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
39 samtools index ${ARG};
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
40 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
41 done;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
42 }
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
43
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
44 #get statistics about mapped and unmapped reads for every contig
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
45
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
46 idxstats_bam_file ()
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
47 {
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
48 #get statistics about mapped and unmapped reads for every contig
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
49 echo "Idxstats" $1 $2;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
50 stat_male_name=statistics/stat_male_${2};
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
51 samtools idxstats $1 >$stat_male_name #statistics for bam files mapping on male reference
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
52
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
53 #with MAPQ>0
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
54 samtools view -bq 1 $1 >statistics/high_score_male_${2}.bam
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
55 samtools index statistics/high_score_male_${2}.bam
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
56
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
57 stat_male_high_name=statistics/stat_male_${2}_high
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
58 samtools idxstats statistics/high_score_male_${2}.bam >$stat_male_high_name #statistics for bam files mapping on male reference with MAPQ>0
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
59 }
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
60
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
61 echo "Checking existence of input bam files."
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
62 file_exists $mother_male_bam $father_male_bam $son_male_bam $daughter_male_bam
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
63 index_bam_file $mother_male_bam $father_male_bam $son_male_bam $daughter_male_bam
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
64
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
65 #retrieve statistics
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
66 mkdir -p statistics;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
67 idxstats_bam_file $mother_male_bam mother
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
68 idxstats_bam_file $father_male_bam father
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
69 idxstats_bam_file $daughter_male_bam daughter
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
70 idxstats_bam_file $son_male_bam son
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
71
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
72 #read statistics
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
73 reads_number_mother=`awk '{read_number+=($3+$4)} END {print read_number}' statistics/stat_male_mother`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
74 reads_number_father=`awk '{read_number+=($3+$4)} END {print read_number}' statistics/stat_male_father`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
75 reads_number_daughter=`awk '{read_number+=($3+$4)} END {print read_number}' statistics/stat_male_daughter`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
76 reads_number_son=`awk '{read_number+=($3+$4)} END {print read_number}' statistics/stat_male_son`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
77
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
78 echo "Reads numbers: " $reads_number_mother $reads_number_father $reads_number_daughter $reads_number_son
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
79
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
80 #filter contigs, which are candidate Y genes - very divergent from their X counterparts
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
81 perl ${LINKYX_PATH}/scripts/filter_diverged_Y_contigs.pl $reads_number_mother $reads_number_father $reads_number_daughter $reads_number_son $reference ${LINKYX_PATH} >Y_results.txt;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
82
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
83 echo "Y_results: " >>$results
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
84
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
85 final_contigs=`cat Y_list | wc -l`
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
86 #have we found any genes?
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
87
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
88 if [ $final_contigs -gt 0 ]; then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
89 #we found SOME genes
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
90 #retrieve sequences in fasta
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
91 perl ${LINKYX_PATH}/scripts/get_sequences_based_on_ids.pl $reference Y_list >Y_results_sequences.fasta;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
92
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
93 get_results_in_bam_file ()
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
94 {
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
95 #candidates full_bam_file candidates_bam_file
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
96 cat Y_list | xargs samtools view -F 4 -b $1 > $2;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
97 samtools index $2
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
98 }
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
99
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
100 get_results_in_bam_file $mother_male_bam ${8} #argument for output bam file mother
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
101 get_results_in_bam_file $father_male_bam ${9} #argument for output bam file father
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
102 get_results_in_bam_file $daughter_male_bam ${10} #argument for output bam file daughter
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
103 get_results_in_bam_file $son_male_bam ${11} #argument for output bam file son
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
104 wait;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
105 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
106 #we found NO genes
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
107 echo "No sequences found." >Y_results.txt;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
108 echo "No sequences found." >Y_results_sequences.fasta;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
109 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
110
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
111 #output results to Galaxy
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
112 cat Y_results.txt >>$results
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
113 cat Y_results_sequences.fasta >>$results_fasta
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
114 ls -lah */* >>$results