annotate scripts/to_get_vcf_files.sh @ 0:05c27700e5ca

initial commit
author biomonika <biomonika@psu.edu>
date Thu, 04 Sep 2014 18:24:19 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
1 #!/bin/bash
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
2 set -e;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
3
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
4 #read config variables
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
5 echo "Reading config...." >&2
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
6 #echo `ls -l`
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
7 computer_name=`hostname`;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
8 if [ $computer_name == "misa" ]; then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
9 #local_computer
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
10 source config_devel;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
11 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
12 #metacentrum machine
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
13 source config_meta;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
14 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
15
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
16 #files needed: reference.fasta $reads_1 $reads_2
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
17
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
18 reference=$1;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
19 reads_1="input/$2"; echo "reads_1: " $reads_1;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
20 reads_2="input/$3"; echo "reads_2: " $reads_2;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
21 name=$4;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
22 fragments=$5;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
23
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
24 echo "reference:" $reference "reads 1:" $reads_1 "reads 2:" $reads_2 "name:" $name;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
25
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
26 #INDEXING FILES
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
27
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
28 if [ -e "references/reference.dict" ];
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
29 then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
30 echo "Picard reference exists. Not being created again."
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
31 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
32 echo "seconds of run: $SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
33 echo "reference:"$reference;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
34 java -jar $CreateSequenceDictionary R=$reference O=references/reference.dict >mlogfile 2>>errlogfile
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
35 echo "0/9 INDEXING FILES successful"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
36 echo "seconds of run: $SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
37 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
38
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
39 if [ -e "references/reference.fai" ];
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
40 then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
41 echo "Samtools reference exists. Not being created again."
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
42 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
43 echo "seconds of run: $SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
44 echo "reference:"$reference;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
45 samtools faidx $reference 2>>errlogfile;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
46 echo "0/9 INDEXING FILES successful"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
47 echo "seconds of run: $SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
48 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
49
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
50 #1 MAPPING WITH BWA
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
51 bwa index $reference >>mlogfile 2>>errlogfile
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
52
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
53 #differs based on the fact wheter reads are single or paired-end
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
54 bwa aln -t 2 $reference $reads_1 > bam/aln_sa1_$name.sai & 2>>errlogfile
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
55 wait;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
56
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
57 if [[ $fragments =~ (single) ]]; then
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
58 #single-end reads
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
59 bwa samse $reference bam/aln_sa1_$name.sai $reads_1 > bam/aln_$name.sam 2>>errlogfile
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
60 echo "bwa samse $reference bam/aln_sa1_$name.sai $reads_1 > bam/aln_$name.sam 2>>errlogfile";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
61 wait;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
62 else
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
63 #pair-end reads
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
64 bwa aln -t 2 $reference $reads_2 > bam/aln_sa2_$name.sai & 2>>errlogfile
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
65 echo "bwa aln -t 2 $reference $reads_2 > bam/aln_sa2_$name.sai & 2>>errlogfile";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
66 wait;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
67 bwa sampe $reference bam/aln_sa1_$name.sai bam/aln_sa2_$name.sai $reads_1 $reads_2 > bam/aln_$name.sam 2>>errlogfile
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
68 echo "bwa sampe $reference bam/aln_sa1_$name.sai bam/aln_sa2_$name.sai $reads_1 $reads_2 > bam/aln_$name.sam 2>>errlogfile";
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
69 fi
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
70
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
71 wait;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
72 rm -f bam/aln_sa1_$name.sai bam/aln_sa2_$name.sai
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
73
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
74 echo "1/7 MAPPING WITH BWA successful"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
75 echo "seconds of run: $SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
76
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
77 #2 CLEANING FILES - adjust MAPQ scores
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
78 echo "$SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
79 java -jar $CleanSam I=bam/aln_$name.sam O=bam/aln_cleaned_$name.sam 2>>/dev/null
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
80 echo "2/7 CLEANING FILES - adjust MAPQ scores successful"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
81 echo "seconds of run: $SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
82 wait
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
83
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
84 #3 CONVERTING TO BAM FILE
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
85 samtools view -bS bam/aln_cleaned_$name.sam > bam/aln_cleaned_$name.bam 2>>errlogfile
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
86 echo "3/7 CONVERTING TO BAM FILE successful"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
87 echo "seconds of run: $SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
88 wait
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
89
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
90 #4 SORTING BAM FILE
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
91 samtools sort bam/aln_cleaned_$name.bam bam/aln_cleaned_sorted_$name >mlogfile 2>>errlogfile
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
92 echo "4/7 SORTING BAM FILE successful"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
93 echo "seconds of run: $SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
94 wait
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
95 #rm aln.bam aln_cleaned.bam
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
96
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
97 #5 REMOVING DUPLICATES
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
98 java -jar $MarkDuplicates INPUT=bam/aln_cleaned_sorted_$name.bam OUTPUT=bam/aln_cleaned_sorted_deduplicated_$name.bam METRICS_FILE=picard_info.txt REMOVE_DUPLICATES=true ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 2>>/dev/null
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
99 echo "5/7 REMOVING DUPLICATES successful"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
100 echo "seconds of run: $SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
101 wait;
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
102
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
103 #6 SNP AND VARIANT CALLING
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
104 samtools mpileup -uf $reference bam/aln_cleaned_sorted_deduplicated_$name.bam | $bcftools view -p 0.85 -cgv - | $vcfutils varFilter >vcf/aln_0.85_$name.vcf 2>>errlogfile
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
105 echo "6/7 SNP AND VARIANT CALLING successful"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
106 echo "seconds of run: $SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
107
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
108 #7 REMOVING FILES
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
109 rm -f bam/aln_$name.sam bam/aln_cleaned_$name.sam bam/aln_cleaned_$name.bam bam/aln_cleaned_sorted_$name.bam >>mlogfile 2>>errlogfile
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
110 echo "7/7 REMOVING FILES successful"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
111 echo "seconds of run: $SECONDS"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
112 echo "DONE"
05c27700e5ca initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
113