0
|
1 #!/bin/bash
|
|
2 ##
|
|
3 ##
|
|
4 ## Authors: Vista Sohrab & Dilay Hazal Ayhan
|
|
5 ## Date: January 15, 2021
|
|
6 ## Description: TEfinder uses discordant reads to detect novel transposable element insertion events in short read paired-end sample sequencing data.
|
|
7 ## Software dependencies include bedtools 2.28.0 or later, samtools 1.3 or later, picard 2.0.1 or later
|
|
8 ## Required inputs include sample alignment file (.bam|.sam), reference genome FASTA (.fa), reference TE annotation in GFF/GTF or GFF3 (.gff|.gtf), and TEs of interest (.txt)
|
|
9 ##
|
|
10 ## University of Massachusetts Amherst
|
|
11 ##
|
|
12 ##
|
|
13 ##
|
|
14 ##
|
|
15
|
|
16 set -e
|
|
17
|
|
18 margs=4
|
|
19
|
|
20 # Functions
|
|
21 function example {
|
|
22 echo -e "example: TEfinder -alignment sample.bam -fa reference.fa -gtf TEs.gtf -te List_of_TEs.txt"
|
|
23 }
|
|
24
|
|
25 function help {
|
|
26 echo -e "REQUIRED:"
|
|
27 echo -e " -alignment, --alignmentFile STR sample reads aligned to reference genome (BAM/SAM file)"
|
|
28 echo -e " -fa, --FastaFile STR reference genome FASTA index (FA file)"
|
|
29 echo -e " -gtf, --TransposonsInGenome STR reference genome TE annotation (GFF2/GTF file)"
|
|
30 echo -e " -te, --TransposonsToSearch STR TE names (single column text file)\n"
|
|
31 echo -e "OPTIONAL:"
|
|
32 echo -e " -bamo, --DiscordantReads STR BAM output\n"
|
|
33 echo -e " -bedo, --bTEinsertions STR TEinsertions BED output\n"
|
|
34 echo -e " -gtfo, --gTEinsertions STR TEinsertions GTF output\n"
|
|
35 echo -e " -fis, --FragmentInsertSize INT short-read sequencing fragment insert size [400]"
|
|
36 echo -e " -picard, --pathToPicardjar STR path to picard tools .jar file [picard.jar]"
|
|
37 echo -e " -md, --MaxDistanceForMerge INT maximum distance between reads for bedtools merge [150]"
|
|
38 echo -e " -k, --MaxTSDLength INT maximum TE target site duplication (TSD) length [20]"
|
|
39 echo -e " -maxHeapMem, --MaxHeapMemory INT java maximum heap memory allocation for picard in Mb [2000]"
|
|
40 echo -e " -workingdir, --WorkingDirectory STR working directory name [TEfinder_<Date>]"
|
|
41 echo -e " -out, --OutputFormat STR output format as GTF [BED]"
|
|
42 echo -e " -outname, --OutputName STR output name prefix added to file names [null]"
|
|
43 echo -e " -threads, --Threads INT number of threads for samtools multi-threading [1]"
|
|
44 echo -e " -intermed --IntermediateFiles STR keep intermediate files created by pipeline [no]"
|
|
45 echo -e " -h, --help prints help\n"
|
|
46 example
|
|
47 }
|
|
48
|
|
49 # check if mandatory args are empty
|
|
50 function margs_check {
|
|
51 if [ $# -lt $margs ]; then
|
|
52 echo -e "One or more required parameters are missing."
|
|
53 example
|
|
54 exit 1 # error
|
|
55 fi
|
|
56 }
|
|
57
|
|
58 # main workflow
|
|
59 #### : comment out
|
|
60 function pipeline() {
|
|
61 mkdir ${workingdir}/${line}
|
|
62 currdir=${workingdir}/${line}
|
|
63 echo -e $(date) " Transposon analysis for "${line}" has started\n"
|
|
64
|
|
65 grep -P '[^(\w|\d|\-|\_|\#|\.)]'${line}'[^(\w|\d|\-|\_|\#|\.)]' $gtf > ${currdir}/${line}_TE.gff
|
|
66 echo -e $(date) " Individual TE GFF has been created for "${line}"\n" ####
|
|
67
|
|
68 bedtools intersect -abam ${workingdir}/${outname}Alignments.bam -b ${currdir}/${line}_TE.gff -wa > ${currdir}/${line}_MappedReadsToTE.bam
|
|
69 echo -e $(date) " Mapped reads to TE via bedtools intersect has been completed for "${line}"\n" ####
|
|
70 samtools view -@ $threads ${currdir}/${line}_MappedReadsToTE.bam | \
|
|
71 awk -v Ins=`expr $fis \* 10` '{if (($7 != "=") || ($9 > Ins) || ($9 < -Ins)) print $1}' > ${currdir}/${line}_ReadID.txt
|
|
72 echo -e $(date) " Identifying discordant read IDs has been completed for "${line}"\n" ####
|
|
73
|
|
74 # if discordant readID file exists, then continue with remainder of TE analysis
|
|
75 if [[ -s ${currdir}/${line}_ReadID.txt ]]
|
|
76 then
|
|
77 #java $maxHeapMem -jar $picard FilterSamReads I=${workingdir}/${outname}Alignments.bam O=${currdir}/${line}_DiscordantPairs.bam READ_LIST_FILE=${currdir}/${line}_ReadID.txt FILTER=includeReadList WRITE_READS_FILES=false
|
|
78 $picard $maxHeapMem FilterSamReads -I ${workingdir}/${outname}Alignments.bam -O ${currdir}/${line}_DiscordantPairs.bam -READ_LIST_FILE ${currdir}/${line}_ReadID.txt -FILTER includeReadList -WRITE_READS_FILES false &>/dev/null
|
|
79 echo -e $(date) " Filtering original alignment based on discordant reads IDs is complete for "${line}"\n" ####
|
|
80
|
|
81 bedtools merge -d $md -S + -c 1 -o count -i ${currdir}/${line}_DiscordantPairs.bam | \
|
|
82 awk '{if ($4 > 3) print $0}' > ${currdir}/${line}_plusCluster.bed
|
|
83 echo -e $(date) " Primary reads from the + strand have been merged if read count greater than 3 for "${line}"\n" ####
|
|
84
|
|
85 bedtools merge -d $md -S - -c 1 -o count -i ${currdir}/${line}_DiscordantPairs.bam | \
|
|
86 awk '{if ($4 > 3) print $0}' > ${currdir}/${line}_minusCluster.bed
|
|
87 echo -e $(date) " Primary reads from the - strand have been merged if read count greater than 3 for "${line}"\n" ####
|
|
88
|
|
89 # filtering edges piped into bedtools merge (keeping read counts greater than 3 in the line above)
|
|
90 ## find the closest minus strand to the plus strand in the cluster
|
|
91 ## filter by the distance between the plus and minus clusters - only retain pairs if reads are 0-100 bases away
|
|
92 ## if plus strand start is less than minus strand start and plus strand end is less than minus strand end then in proper orientation
|
|
93 bedtools closest -d -g ${workingdir}/reference.fa.fai -t first -a ${currdir}/${line}_plusCluster.bed -b ${currdir}/${line}_minusCluster.bed | \
|
|
94 awk -v TSD=$k '{if ($9 <= TSD && $9 >= 0) print $0}' | \
|
|
95 awk '{if ($2 < $6 && $3 < $7) print $0}' > ${currdir}/${line}_plusminus.bed
|
|
96 echo -e $(date) " Filtration of clusters in proper orientation using bedtools closest has been completed for "${line}"\n" ####
|
|
97
|
|
98 # if plus strand end is greater than minus strand start, then report the pair
|
|
99 awk '{if ($3 > $6) print $1"\t"$6"\t"$3"\t"$0}' ${currdir}/${line}_plusminus.bed > ${currdir}/${line}_plusminus_1.bed
|
|
100 echo -e $(date) " Overlapping reads TE insertions reported for "${line}"\n" ####
|
|
101
|
|
102 #if plus strand end is less than or equal to minus strand start and the region in between is less than a user-defined value k, report the pair
|
|
103 awk -v TSD=$k '{if ($3 <= $6 && $6 - $3 < TSD) print $1"\t"$3 - 1"\t"$6 + 1"\t"$0}' ${currdir}/${line}_plusminus.bed > \
|
|
104 ${currdir}/${line}_plusminus_2.bed
|
|
105 echo -e $(date) " Non-overlapping reads TE insertions reported for "${line}"\n" ####
|
|
106
|
|
107 #combine reported TE insertions
|
|
108 cat ${currdir}/${line}_plusminus_1.bed ${currdir}/${line}_plusminus_2.bed | \
|
|
109 awk -v TEname=$line '{$0=TEname"\t"$0}1' | sort -k 1 | sort -k 2 > \
|
|
110 ${currdir}/${line}_insertionRegion.txt
|
|
111
|
|
112
|
|
113 cat ${currdir}/${line}_insertionRegion.txt >> ${workingdir}/insertions.txt
|
|
114 echo -e $(date) " TE insertions for "${line}" have been reported.\n" ####
|
|
115
|
|
116
|
|
117 echo -e $(date) " Transposon named "${line}" is processed.\n"
|
|
118 else
|
|
119 echo -e $(date) " Transposon named "${line}" is processed. No discordant reads found.\n"
|
|
120 rm -r ${currdir}
|
|
121 fi
|
|
122 }
|
|
123
|
|
124 # functions end
|
|
125
|
|
126 # get arguments
|
|
127 fa=
|
|
128 alignment=
|
|
129 gtf=
|
|
130 te=
|
|
131 out=
|
|
132 intermed=
|
|
133 bamo="DiscordantReads.bam"
|
|
134 bedo="TEinsertions.bed"
|
|
135 gtfo="TEinsertions.gtf"
|
|
136 outname=""
|
|
137 fis=400
|
|
138 picard="picard"
|
|
139 maxHeapMem=-Xmx2000m
|
|
140 md=150
|
|
141 k=20
|
|
142 d=$(date +%Y%m%d%H%M%S)
|
|
143 # workingdir=TEfinder_${d}
|
|
144 workingdir="TEfinder"
|
|
145 threads=1
|
|
146
|
|
147 while [ "$1" != "" ];
|
|
148 do
|
|
149 case $1 in
|
|
150 -fa | --FastaFile )
|
|
151 shift
|
|
152 fa=$1 ;;
|
|
153 -alignment | --alignmentFile )
|
|
154 shift
|
|
155 alignment=$1 ;;
|
|
156 -gtf | --TransposonsInGenome )
|
|
157 shift
|
|
158 gtf=$1 ;;
|
|
159 -te | --TransposonsToSearch )
|
|
160 shift
|
|
161 te=$1 ;;
|
|
162 -fis | --FragmentInsertSize )
|
|
163 shift
|
|
164 fis=$1 ;;
|
|
165 -picard | --pathToPicardjar )
|
|
166 shift
|
|
167 picard=$1 ;;
|
|
168 -md | --MaxDistanceForMerge )
|
|
169 shift
|
|
170 md=$1 ;;
|
|
171 -k | --MaxTSDLength )
|
|
172 shift
|
|
173 k=$1 ;;
|
|
174 -bamo | --DiscordantReads )
|
|
175 shift
|
|
176 bamo=$1 ;;
|
|
177 -bedo | --bTEinsertions )
|
|
178 shift
|
|
179 bedo=$1 ;;
|
|
180 -gtfo | --gTEinsertions )
|
|
181 shift
|
|
182 gtfo=$1 ;;
|
|
183 -maxHeapMem | --MaxHeapMemory )
|
|
184 shift
|
|
185 maxHeapMem="-Xmx"$1"m" ;;
|
|
186 -workingdir | --WorkingDirectory )
|
|
187 shift
|
|
188 workingdir=$1 ;;
|
|
189 -out | --OutputFormat )
|
|
190 shift
|
|
191 out=$1 ;;
|
|
192 -outname | --OutputName )
|
|
193 shift
|
|
194 outname=$1 ;;
|
|
195 -threads | --Threads )
|
|
196 shift
|
|
197 threads=$1 ;;
|
|
198 -intermed | --IntermediateFiles )
|
|
199 shift
|
|
200 intermed=$1 ;;
|
|
201 -h | --help )
|
|
202 help
|
|
203 exit;;
|
|
204 *) # error
|
|
205 echo "TEfinder: illegal option $1"
|
|
206 example
|
|
207 exit 1 ;;
|
|
208 esac
|
|
209 shift
|
|
210 done
|
|
211 margs_check $fa $alignment $gtf $te
|
|
212
|
|
213 # main
|
|
214
|
|
215 mkdir ${workingdir}
|
|
216
|
|
217 # remove empty lines from user provided TE list if present
|
|
218 sed '/^$/d' $te > ${workingdir}"/userTE_noEmptyLines.txt"
|
|
219
|
|
220 # create output file
|
|
221 printf "%s\t" "track name=TEfinder" "type=bedDetail" "description=FR:forward read, RR:reverse read, InsRegion:insertion region start and end positions, FILTER:comma separated filters" > ${workingdir}/${outname}TEinsertions.bed
|
|
222 printf "\n" >> ${workingdir}/${outname}TEinsertions.bed
|
|
223
|
|
224 # create fasta index (fai) file
|
|
225 cp $fa ${workingdir}/reference.fa
|
|
226 samtools faidx ${workingdir}/reference.fa
|
|
227
|
|
228 # sort alignment input
|
|
229 samtools sort -@ $threads -o ${workingdir}/alignmentInput.sorted.bam ${alignment}
|
|
230 echo -e $(date) " Alignment file sorted successfully.\n"
|
|
231
|
|
232 # remove secondary and supplementary alignments from sorted bam
|
|
233 samtools view -F 2304 -@ $threads -o ${workingdir}/${outname}Alignments.bam ${workingdir}/alignmentInput.sorted.bam
|
|
234 echo -e $(date) " Alignments are filtered - secondary and supplementary alignments have been removed. \n"
|
|
235
|
|
236 # run pipeline for each TE
|
|
237 while IFS="" read -r line || [ -n "$line" ]
|
|
238 do
|
|
239 pipeline &
|
|
240 done < ${workingdir}/userTE_noEmptyLines.txt
|
|
241 wait
|
|
242 echo -e $(date) " All transposons are processed. Finalizing...\n"
|
|
243
|
|
244 # combine discordant bam files
|
|
245 samtools merge -@ $threads -r ${workingdir}/${outname}DiscordantReads.bam ${workingdir}/*/*_DiscordantPairs.bam
|
|
246 echo -e $(date) " BAM Output: Discordant pair alignment file is now available.\n"
|
|
247 # Sorting by position
|
|
248 samtools sort -@ $threads ${workingdir}/${outname}DiscordantReads.bam | samtools view -h -o ${workingdir}/${outname}DiscordantReads.sam
|
|
249 grep -v '^@PG' ${workingdir}/${outname}DiscordantReads.sam > ${workingdir}/${outname}DiscordantReadsNoPG.sam
|
|
250 rm ${workingdir}/${outname}DiscordantReads.sam
|
|
251 samtools view -hb -x "PG" --no-PG --remove-flags "PG" -O BAM ${workingdir}/${outname}DiscordantReadsNoPG.sam -o ${bamo}
|
|
252 rm ${workingdir}/${outname}DiscordantReadsNoPG.sam
|
|
253
|
|
254 # update output BED file with TEfinder results: organize the starting file
|
|
255 awk '{print $2"\t"$3"\t"$4"\t"$1"\t"$8+$12"\t.\tFR="$8";RR="$12";InsRegion="$6"-"$11";FILTER="}' ${workingdir}/insertions.txt > ${workingdir}/TEinsertions_putative.bed
|
|
256 # find the entries in repeat regions for filtering
|
|
257 bedtools intersect -wa -u -a ${workingdir}/TEinsertions_putative.bed -b $gtf > ${workingdir}/TEinsertions_putative_inrepeat.bed
|
|
258 # filtering process
|
|
259 while IFS="" read -r line || [ -n "$line" ]
|
|
260 do
|
|
261 #located in repeat region
|
|
262 if (grep -Fxq "$line" "${workingdir}/TEinsertions_putative_inrepeat.bed")
|
|
263 then
|
|
264 line=$line"in_repeat,"
|
|
265 fi
|
|
266
|
|
267 #weak evidence
|
|
268 readc=$(echo $line | awk '{print $5}')
|
|
269 if (( $readc < 10 ))
|
|
270 then
|
|
271 line=$line"weak_evidence,"
|
|
272 fi
|
|
273
|
|
274 #strand-biased
|
|
275 FR=$(echo $line | grep -o 'FR=[[:digit:]]*' | cut -f2 -d'=')
|
|
276 RR=$(echo $line | grep -o 'RR=[[:digit:]]*' | cut -f2 -d'=')
|
|
277 var1=$(echo 'e(l('$FR')*1.25)' | bc -l)
|
|
278 var2=$(echo 'e(l('$FR')*0.8)' | bc -l)
|
|
279
|
|
280 if [ $(echo "$RR > $var1" | bc) -eq 1 ] || [ $(echo "$RR < $var2" | bc) -eq 1 ]
|
|
281 then
|
|
282 line=$line"strand_bias,"
|
|
283 fi
|
|
284
|
|
285 #pass
|
|
286 lastchar=${line: -1}
|
|
287 if [ $lastchar == "," ]
|
|
288 then
|
|
289 line=${line::${#line}-1}
|
|
290 else
|
|
291 line=$line"PASS"
|
|
292 fi
|
|
293
|
|
294 #write to final output
|
|
295 printf "%s\n" "$line" >> ${workingdir}/${outname}TEinsertions.bed
|
|
296
|
|
297 done < ${workingdir}/TEinsertions_putative.bed
|
|
298 wait
|
|
299 echo -e $(date) " BED Output: TEfinder output BED file is now available.\n"
|
|
300 # Sorting
|
|
301 # cp ${workingdir}/${outname}TEinsertions.bed ${outo}
|
|
302 bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.bed > ${bedo}
|
|
303 # cat ${bedo}
|
|
304
|
|
305 # gtf option - create output GTF files with TEfinder results
|
|
306 if [ ! -z "$out" ]
|
|
307 then
|
|
308 awk 'FNR > 1 {print $1"\tTEfinder\tTIP\t"$2 + 1"\t"$3"\t"$5"\t.\t.\tte_name \""$4"\"; tags \""$7"\""}' ${workingdir}/${outname}TEinsertions.bed > ${workingdir}/${outname}TEinsertions.gtf
|
|
309 bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.gtf > ${gtfo}
|
|
310 # awk 'FNR > 1 {print $1"\tTEfinder\tTIP\t"$2 + 1"\t"$3"\t"$5"\t.\t.\tte_name \""$4"\"; tags \""$7"\""}' ${bedo} > ${gtfo}
|
|
311 # Sorting
|
|
312 # cp ${workingdir}/${outname}TEinsertions.gtf ${gtfo}
|
|
313 echo -e "\n\n"
|
|
314 # cat ${gtfo}
|
|
315 # bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.gtf > ${gtfo}
|
|
316 echo -e $(date) " GTF Output: TEfinder output GTF file is now available.\n"
|
|
317 fi
|
|
318
|
|
319 # clean working directory
|
|
320 if [ -z "$intermed" ]
|
|
321 then
|
|
322 rm ${workingdir}/TEinsertions_putative.bed ${workingdir}/TEinsertions_putative_inrepeat.bed ${workingdir}/reference.fa ${workingdir}/reference.fa.fai \
|
|
323 ${workingdir}/alignmentInput.sorted.bam ${workingdir}/insertions.txt ${workingdir}/${outname}Alignments.bam ${workingdir}/userTE_noEmptyLines.txt
|
|
324 rm -r ${workingdir}/*/
|
|
325 fi
|
|
326
|
|
327 if [ `wc -l <${workingdir}/${outname}TEinsertions.bed` -le "1" ]
|
|
328 then
|
|
329 echo -e $(date) " Error: TEfinder run unsuccessful."
|
|
330 else
|
|
331 echo -e $(date) " TE insertion output files have been created. TEfinder completed successfully."
|
|
332 fi
|