annotate LINKYX_identify_X_linked_SNPs.sh @ 7:90098d9cb691 default tip

workflows updated
author biomonika <biomonika@psu.edu>
date Fri, 20 Feb 2015 18:34:34 -0500
parents 05c27700e5ca
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
1 #!/bin/bash
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
2
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
3 #LINKYX_PATH set during toolshed installation
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
4
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
5 results=$1 #X-linked candidates written here
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
6
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
7 reference=${2}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
8 mother_bam=${3}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
9 mother_vcf=${4}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
10 father_bam=${5}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
11 father_vcf=${6}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
12 daughter_bam=${7}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
13 daughter_vcf=${8}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
14 son_bam=${9}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
15 son_vcf=${10}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
16
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
17 results_fasta=${11}
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
18
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
19 file_exists ()
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
20 {
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
21 for ARG in "$@";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
22 do
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
23 if [ ! -f ${ARG} ]; then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
24 echo "File $ARG not found";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
25 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
26 done;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
27 }
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
28
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
29 #index bam files
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
30
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
31 index_bam_file ()
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
32 {
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
33 for ARG in "$@";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
34 do
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
35 bam_file_index=${ARG}".bai"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
36 if [ -e $bam_file_index ];
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
37 then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
38 echo "Index already exists. Not being created again.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
39 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
40 samtools index ${ARG} &
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
41 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
42 done;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
43 }
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
44
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
45 idxstats_bam_file ()
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
46 {
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
47 #get statistics about mapped and unmapped reads for every contig
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
48 echo "Idxstats" $1 $2;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
49 stat_name=statistics/stat_${2};
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
50 samtools idxstats $1 >$stat_name #statistics for bam files mapping on male reference
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
51
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
52 #with MAPQ>0
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
53 samtools view -bq 1 $1 >statistics/high_score_${2}.bam
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
54 samtools index statistics/high_score_${2}.bam
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
55
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
56 stat_high_name=statistics/stat_${2}_high
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
57 samtools idxstats statistics/high_score_${2}.bam >$stat_high_name #statistics for bam files mapping on male reference with MAPQ>0
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
58 }
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
59
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
60
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
61 file_exists $testing $results $reference $mother_bam $mother_vcf $father_bam $father_vcf $daughter_bam $daughter_vcf $son_bam $son_vcf #check if all input files are present
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
62 index_bam_file $mother_bam $father_bam $daughter_bam $son_bam #index bam files
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
63
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
64 #retrieve statistics
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
65 mkdir -p statistics;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
66 idxstats_bam_file $mother_bam mother
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
67 idxstats_bam_file $father_bam father
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
68 idxstats_bam_file $daughter_bam daughter
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
69 idxstats_bam_file $son_bam son
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
70
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
71 newline () {
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
72 echo -e "\n---";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
73 }
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
74
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
75 #variables
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
76
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
77 dir="A/";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
78 echo -e "A variant"; newline; mkdir -p A references bam;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
79
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
80 grep ">" $reference | sed 's/>//g' >references/reference_contig_names
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
81 mother_contigs=`cat references/reference_contig_names | wc -l`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
82
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
83 #DAUGHTER PREPARE FILES
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
84
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
85 grep -E '#|AF1=0.' $daughter_vcf | cut -f1 | uniq >${dir}daughter_on_mother_one_allele_heterozygous_ids.vcf
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
86 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/daughter_on_mother_one_allele_heterozygous_ids.vcf"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if exists $a{$res}' $daughter_vcf | sort | uniq | sed '/#/d' >${dir}daughter_on_mother_heterozygous.vcf
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
87
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
88 #be aware that if there are more snps, one with allele frequency '0.' is enough for the whole contig to be in contig_pos_ref_heterozygous
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
89 sed '/#/d' ${dir}daughter_on_mother_heterozygous.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_daughter_heterozygous
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
90 cut -f 1 ${dir}contig_pos_ref_alt_daughter_heterozygous | sort | uniq >${dir}contigs_where_daughter_heterozygous
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
91
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
92 sed '/#/d' $daughter_vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_daughter
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
93
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
94 #FATHER PREPARE FILES
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
95
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
96 sed '/#/d' $father_vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_father
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
97
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
98 #SON PREPARE FILES
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
99
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
100 sed '/#/d' $son_vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_son
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
101
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
102
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
103 echo -e "Step 0 of 3.\nMother must be homozygote.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
104 sed '/#/d' $mother_vcf | cut -f1 | sort | uniq >${dir}contigs_where_mother_differs_from_mother;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
105 differing=`cat ${dir}contigs_where_mother_differs_from_mother | wc -l`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
106
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
107 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_mother_differs_from_mother"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' references/reference_contig_names | sort | uniq >${dir}contigs_where_mother_same_as_mother_unchecked
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
108
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
109 #check whether mother really does not contain any father variants
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
110 wrong_hits=0;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
111 cat ${dir}contigs_where_mother_same_as_mother_unchecked | (while read line; do
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
112 contig_check=`bash ${LINKYX_PATH}/scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_father $mother_bam A`
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
113 if [[ $contig_check -gt 0 ]]; then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
114 echo "contig: " ${line} "contig_check:" $contig_check >>kontrola;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
115 echo "DID NOT PASS "${line};
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
116 wrong_hits=`expr $wrong_hits + 1`
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
117 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
118 #echo "0K";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
119 echo ${line} >>${dir}contigs_where_mother_same_as_mother
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
120 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
121 done;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
122 echo "There is this number of wrong variants: "$wrong_hits;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
123 )
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
124
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
125 same=`cat ${dir}contigs_where_mother_same_as_mother | wc -l`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
126 echo -e "Number of contigs, where mother differs: $differing.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
127 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
128
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
129 newline;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
130
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
131 echo -e "Step 1 of 3.\nSons must be same as mother.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
132 #First we have to get rid of the intersection with father, possible Y variants do not matter in the calculation
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
133 comm -12 ${dir}contig_pos_ref_alt_son ${dir}contig_pos_ref_alt_father | sort | uniq >${dir}possible_Yvariants
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
134 cut -f 1 ${dir}possible_Yvariants | sort | uniq >${dir}contigs_with_possible_Yvariants
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
135
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
136 #if these possible_Yvariants are in the daughter, they are not Y variants
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
137 comm -12 ${dir}contig_pos_ref_alt_daughter ${dir}possible_Yvariants | sort | uniq >${dir}NOT_Yvariants
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
138 cut -f 1 ${dir}NOT_Yvariants | sort | uniq >${dir}contigs_with_NOT_Yvariants
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
139
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
140 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_with_NOT_Yvariants"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_with_possible_Yvariants | sort | uniq >${dir}contigs_with_Yvariants
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
141
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
142 #contig_pos_ref_alt_Yvariants_confirmed; we deplet just those contigs that contain Y variants and then decide if sons are same as mother
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
143
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
144 touch ${dir}contig_pos_ref_alt_Yvariants_confirmed_pro #todo - needs to be created?
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
145 cat ${dir}contigs_with_Yvariants | (while read line; do
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
146 grep ${line} ${dir}contig_pos_ref_alt_father | sort >>${dir}contig_pos_ref_alt_Yvariants_confirmed_pro
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
147 done;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
148 )
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
149 cat ${dir}contig_pos_ref_alt_Yvariants_confirmed_pro | sort >${dir}contig_pos_ref_alt_Yvariants_confirmed;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
150
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
151 comm -23 ${dir}contig_pos_ref_alt_son ${dir}contig_pos_ref_alt_Yvariants_confirmed | cut -f 1 | sort | uniq >${dir}contigs_where_sons_variation_not_caused_by_father
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
152 #rm ${dir}contig_pos_ref_alt_Yvariants_confirmed_pro;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
153
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
154 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_sons_variation_not_caused_by_father"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' references/reference_contig_names | sort | uniq >${dir}contigs_where_sons_same_as_mother
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
155
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
156 same=`cat ${dir}contigs_where_sons_same_as_mother | wc -l`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
157 echo -e "Number of contigs, where sons differ: $differing.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
158 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
159
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
160
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
161 newline;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
162 echo -e "Step 2 of 3.\nFather must differ from mother.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
163 sed '/#/d' $father_vcf | cut -f1 | sort | uniq >${dir}/contigs_where_father_differs_from_mother;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
164 differing=`cat ${dir}contigs_where_father_differs_from_mother | wc -l`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
165 echo -e "NUMBER of contigs AFTER just this filter: $differing/$mother_contigs";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
166
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
167 newline;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
168
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
169 echo -e "Step 3 of 3.\nDaughter must have mother and father alleles.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
170 #we need daughter to have alleles from both parents (AF stands for 'allele frequency')
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
171
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
172 #unique lines in first file mean that those are not subsets of father variants - otherwise they would be in common
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
173 comm -23 ${dir}contig_pos_ref_alt_daughter_heterozygous ${dir}contig_pos_ref_alt_father | cut -f 1 | sort | uniq >${dir}daughter_NOT_subset_of_father
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
174
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
175 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/daughter_NOT_subset_of_father"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_where_daughter_heterozygous | sort | uniq >${dir}/contigs_where_daughter_from_parents
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
176
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
177
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
178 same=`cat ${dir}contigs_where_daughter_from_parents | wc -l`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
179 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
180
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
181 newline;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
182
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
183 echo -e "Step 4 of 3.\nDaughters and sons can not have any intersection.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
184 comm -12 ${dir}contig_pos_ref_alt_daughter ${dir}contig_pos_ref_alt_son >${dir}sons_and_daughters_similiar
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
185 cut -f 1 ${dir}sons_and_daughters_similiar | sort | uniq >${dir}contigs_where_sons_and_daughters_similiar
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
186
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
187 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_sons_and_daughters_similiar"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_where_daughter_heterozygous | sort | uniq >${dir}contigs_where_daughters_and_sons_different_pro
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
188
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
189 #check whether they are really different (small amount of variants is also wrong)
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
190
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
191 echo -e "daughter contains son variants?"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
192 #check whether daughter really does not contain any of son variants
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
193 wrong_hits=0;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
194 cat ${dir}contigs_where_daughters_and_sons_different_pro | (while read line; do
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
195 contig_check=`bash ${LINKYX_PATH}/scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_daughter $son_bam A`
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
196 if [[ $contig_check -gt 0 ]]; then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
197 echo "DID NOT PASS "${line};
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
198 echo "daughter contains son variants DID NOT PASS "${line} >>kontrola
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
199 wrong_hits=`expr $wrong_hits + 1`
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
200 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
201 #echo "0K";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
202 echo ${line} >>${dir}contigs_where_daughters_and_sons_different_un1
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
203 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
204 done;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
205 echo "There is this number of wrong variants: "$wrong_hits;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
206 )
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
207
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
208 echo -e "son contains daughter variants?"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
209 #check whether son really does not contain any of daughter variants
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
210 wrong_hits=0;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
211 cat ${dir}contigs_where_daughters_and_sons_different_pro | (while read line; do
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
212 contig_check=`bash ${LINKYX_PATH}/scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_son $daughter_bam A`
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
213 if [[ $contig_check -gt 0 ]]; then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
214 echo "DID NOT PASS "${line};
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
215 echo "son contains daughter variants DID NOT PASS "${line} >>kontrola
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
216 wrong_hits=`expr $wrong_hits + 1`
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
217 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
218 #echo "0K";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
219 echo ${line} >>${dir}contigs_where_daughters_and_sons_different_un2
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
220 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
221 done;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
222 echo "There is this number of wrong variants: "$wrong_hits;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
223 )
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
224
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
225 comm -12 ${dir}contigs_where_daughters_and_sons_different_un1 ${dir}contigs_where_daughters_and_sons_different_un2 | sort >${dir}contigs_where_daughters_and_sons_different_un
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
226
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
227 cat ${dir}contigs_where_daughters_and_sons_different_un | sort | uniq >${dir}contigs_where_daughters_and_sons_different;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
228
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
229 #rm ${dir}contigs_where_daughters_and_sons_different_un1 ${dir}contigs_where_daughters_and_sons_different_un2 ${dir}contigs_where_daughters_and_sons_different_un;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
230
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
231 same=`cat ${dir}contigs_where_daughters_and_sons_different | wc -l`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
232 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
233
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
234 echo -e "Calculating intersection of candidate contigs from steps 0-3.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
235
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
236 comm -12 ${dir}contigs_where_mother_same_as_mother ${dir}contigs_where_sons_same_as_mother | sort >${dir}first_couple;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
237 comm -12 ${dir}contigs_where_father_differs_from_mother ${dir}contigs_where_daughter_from_parents | sort >${dir}second_couple;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
238 comm -12 ${dir}first_couple ${dir}second_couple | sort >${dir}set1
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
239 comm -12 ${dir}contigs_where_daughters_and_sons_different ${dir}set1 | sort >${dir}set2;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
240
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
241 cat ${dir}set2 >>results; #todo remove
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
242 echo -e "Number of contigs after intersection: " `cat ${dir}set2 | wc -l`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
243
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
244 echo -e "Step -1 of 3.\nCoverage must be sufficient.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
245 touch ${dir}contigs_with_sufficient_coverage;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
246
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
247 cat ${dir}set2 | (while read line; do
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
248
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
249 coverage_check1=`bash ${LINKYX_PATH}/scripts/get_average_contig_coverage.sh 0.75 ${line} $mother_bam $reference 2>/dev/null &`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
250 coverage_check2=`bash ${LINKYX_PATH}/scripts/get_average_contig_coverage.sh 0.75 ${line} $father_bam $reference 2>/dev/null &`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
251 coverage_check3=`bash ${LINKYX_PATH}/scripts/get_average_contig_coverage.sh 0.75 ${line} $son_bam $reference 2>/dev/null &`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
252 coverage_check4=`bash ${LINKYX_PATH}/scripts/get_average_contig_coverage.sh 0.75 ${line} $daughter_bam $reference 2>/dev/null &`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
253
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
254 #echo "coverage check: " $coverage_check1 $coverage_check2 $coverage_check3 $coverage_check4
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
255
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
256 coverage_check=$(($coverage_check1 + $coverage_check2 + $coverage_check3 + $coverage_check4))
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
257 if [[ $coverage_check -gt 0 ]]; then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
258 echo "LOW COVERAGE "${line};
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
259 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
260 echo "COVERAGE 0K";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
261 echo ${line} >>${dir}contigs_with_sufficient_coverage
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
262 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
263 done;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
264 )
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
265
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
266 cat ${dir}contigs_with_sufficient_coverage | sort >X_results.txt;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
267
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
268 #have we found any genes?
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
269 final_contigs=`cat X_results.txt | wc -l`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
270 echo -e "$final_contigs contigs found fulfilling criteria.";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
271
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
272 if [ $final_contigs -gt 0 ]; then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
273 #we found SOME genes
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
274
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
275 #retrieve sequences in fasta
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
276 perl ${LINKYX_PATH}/scripts/get_sequences_based_on_ids.pl $reference X_results.txt >X_results_sequences.fasta;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
277 #sort sequences
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
278 perl ${LINKYX_PATH}/scripts/sorting.pl A/ $reference ${LINKYX_PATH} >>$results
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
279
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
280 get_results_in_bam_file ()
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
281 {
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
282 #candidates full_bam_file candidates_bam_file
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
283 cat X_results.txt | xargs samtools view -F 4 -b $1 > $2;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
284 samtools index $2
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
285 }
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
286
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
287 get_results_in_bam_file $mother_bam ${12} #argument for output bam file mother
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
288 get_results_in_bam_file $father_bam ${13} #argument for output bam file father
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
289 get_results_in_bam_file $daughter_bam ${14} #argument for output bam file daughter
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
290 get_results_in_bam_file $son_bam ${15} #argument for output bam file son
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
291 wait;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
292
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
293 #send BAM files with corresponding contigs
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
294 #mutt -s "LINKYX: Results of computation for X-linked genes" $email_address_of_user -a bam/mother.bam -a bam/mother.bam.bai -a bam/father.bam -a bam/father.bam.bai -a bam/daughter.bam -a bam/daughter.bam.bai -a bam/son.bam -a bam/son.bam.bai < messages_to_user/message_results_sentBamX;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
295
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
296 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
297 #we found NO genes
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
298 echo "No sequences found." >X_results.txt;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
299 echo "No sequences found." >X_results_sequences.fasta;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
300 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
301
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
302 newline;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
303 echo -e "Writing output to X_results.txt";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
304 echo -e "Total number of found X-linked genes: $final_contigs";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
305
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
306 #TEST OUTPUT
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
307 cat X_results.txt >>$results
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
308 cat X_results_sequences.fasta >>$results_fasta
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
309 ls -lah */* >>$results
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
310
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
311