diff TEfinder.sh @ 0:b81a83c743d3 draft default tip

Uploaded
author namhsuya
date Tue, 09 Aug 2022 06:58:49 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TEfinder.sh	Tue Aug 09 06:58:49 2022 +0000
@@ -0,0 +1,332 @@
+#!/bin/bash
+##
+##
+## Authors: Vista Sohrab & Dilay Hazal Ayhan
+## Date: January 15, 2021
+## Description: TEfinder uses discordant reads to detect novel transposable element insertion events in short read paired-end sample sequencing data. 
+##              Software dependencies include bedtools 2.28.0 or later, samtools 1.3 or later, picard 2.0.1 or later
+##              Required inputs include sample alignment file (.bam|.sam), reference genome FASTA (.fa), reference TE annotation in GFF/GTF or GFF3 (.gff|.gtf), and TEs of interest (.txt)
+##
+## University of Massachusetts Amherst
+##
+##
+##
+##
+
+set -e
+
+margs=4
+
+# Functions
+function example {
+     echo -e "example: TEfinder -alignment sample.bam -fa reference.fa -gtf TEs.gtf -te List_of_TEs.txt"
+}
+
+function help {
+     echo -e "REQUIRED:"
+     echo -e "  -alignment,  --alignmentFile       STR  sample reads aligned to reference genome (BAM/SAM file)"
+     echo -e "  -fa,         --FastaFile           STR  reference genome FASTA index (FA file)"
+     echo -e "  -gtf,        --TransposonsInGenome STR  reference genome TE annotation (GFF2/GTF file)"
+     echo -e "  -te,         --TransposonsToSearch STR  TE names (single column text file)\n"
+     echo -e "OPTIONAL:"
+     echo -e "  -bamo,       --DiscordantReads     STR  BAM output\n"
+     echo -e "  -bedo,       --bTEinsertions       STR  TEinsertions BED output\n"
+     echo -e "  -gtfo,       --gTEinsertions       STR  TEinsertions GTF output\n"
+     echo -e "  -fis,        --FragmentInsertSize  INT  short-read sequencing fragment insert size [400]"
+     echo -e "  -picard,     --pathToPicardjar     STR  path to picard tools .jar file [picard.jar]"
+     echo -e "  -md,         --MaxDistanceForMerge INT  maximum distance between reads for bedtools merge [150]"
+     echo -e "  -k,          --MaxTSDLength        INT  maximum TE target site duplication (TSD) length [20]"
+     echo -e "  -maxHeapMem, --MaxHeapMemory       INT  java maximum heap memory allocation for picard in Mb [2000]" 
+     echo -e "  -workingdir, --WorkingDirectory    STR  working directory name [TEfinder_<Date>]"
+     echo -e "  -out,        --OutputFormat        STR  output format as GTF [BED]"
+     echo -e "  -outname,    --OutputName          STR  output name prefix added to file names [null]"
+     echo -e "  -threads,    --Threads             INT  number of threads for samtools multi-threading [1]"
+     echo -e "  -intermed    --IntermediateFiles   STR  keep intermediate files created by pipeline [no]"  
+     echo -e "  -h,          --help                     prints help\n"
+     example
+}
+
+# check if mandatory args are empty
+function margs_check {
+     if [ $# -lt $margs ]; then
+          echo -e "One or more required parameters are missing."
+          example
+          exit 1 # error
+     fi
+}
+
+# main workflow
+ #### : comment out
+function pipeline() {
+     mkdir ${workingdir}/${line}
+     currdir=${workingdir}/${line}
+     echo -e $(date) " Transposon analysis for "${line}" has started\n"
+
+     grep -P '[^(\w|\d|\-|\_|\#|\.)]'${line}'[^(\w|\d|\-|\_|\#|\.)]' $gtf > ${currdir}/${line}_TE.gff
+     echo -e $(date) " Individual TE GFF has been created for "${line}"\n" ####
+     
+     bedtools intersect -abam ${workingdir}/${outname}Alignments.bam -b ${currdir}/${line}_TE.gff -wa > ${currdir}/${line}_MappedReadsToTE.bam
+     echo -e $(date) " Mapped reads to TE via bedtools intersect has been completed for "${line}"\n" ####
+     samtools view -@ $threads ${currdir}/${line}_MappedReadsToTE.bam | \
+          awk -v Ins=`expr $fis \* 10` '{if (($7 != "=") || ($9 > Ins) || ($9 < -Ins)) print $1}' > ${currdir}/${line}_ReadID.txt
+     echo -e $(date) " Identifying discordant read IDs has been completed for "${line}"\n" ####
+     
+	 # if discordant readID file exists, then continue with remainder of TE analysis
+	 if  [[ -s  ${currdir}/${line}_ReadID.txt ]]
+	 then
+		  #java $maxHeapMem -jar $picard FilterSamReads I=${workingdir}/${outname}Alignments.bam O=${currdir}/${line}_DiscordantPairs.bam READ_LIST_FILE=${currdir}/${line}_ReadID.txt FILTER=includeReadList WRITE_READS_FILES=false
+          $picard $maxHeapMem FilterSamReads -I ${workingdir}/${outname}Alignments.bam -O ${currdir}/${line}_DiscordantPairs.bam -READ_LIST_FILE ${currdir}/${line}_ReadID.txt -FILTER includeReadList -WRITE_READS_FILES false &>/dev/null
+          echo -e $(date) " Filtering original alignment based on discordant reads IDs is complete for "${line}"\n"  ####
+          
+          bedtools merge -d $md -S + -c 1 -o count -i ${currdir}/${line}_DiscordantPairs.bam | \
+               awk '{if ($4 > 3) print $0}' > ${currdir}/${line}_plusCluster.bed 
+          echo -e $(date) " Primary reads from the + strand have been merged if read count greater than 3 for "${line}"\n" ####
+          
+          bedtools merge -d $md -S - -c 1 -o count -i ${currdir}/${line}_DiscordantPairs.bam | \
+               awk '{if ($4 > 3) print $0}' > ${currdir}/${line}_minusCluster.bed
+          echo -e $(date) " Primary reads from the - strand have been merged if read count greater than 3 for "${line}"\n" ####
+          
+          # filtering edges piped into bedtools merge (keeping read counts greater than 3 in the line above)
+          ## find the closest minus strand to the plus strand in the cluster
+          ## filter by the distance between the plus and minus clusters - only retain pairs if reads are 0-100 bases away
+          ## if plus strand start is less than minus strand start and plus strand end is less than minus strand end then in proper orientation 
+          bedtools closest -d -g ${workingdir}/reference.fa.fai -t first -a ${currdir}/${line}_plusCluster.bed -b ${currdir}/${line}_minusCluster.bed | \
+               awk -v TSD=$k '{if ($9 <= TSD && $9 >= 0) print $0}' | \
+               awk '{if ($2 < $6 && $3 < $7) print $0}' > ${currdir}/${line}_plusminus.bed
+          echo -e $(date) " Filtration of clusters in proper orientation using bedtools closest has been completed for "${line}"\n" ####
+          
+          # if plus strand end is greater than minus strand start, then report the pair 
+          awk '{if ($3 > $6) print $1"\t"$6"\t"$3"\t"$0}' ${currdir}/${line}_plusminus.bed > ${currdir}/${line}_plusminus_1.bed 
+          echo -e $(date) " Overlapping reads TE insertions reported for "${line}"\n" ####
+          
+          #if plus  strand end is less than or equal to minus strand start and the region in between is less than a user-defined value k, report the pair
+          awk -v TSD=$k '{if ($3 <= $6 && $6 - $3 < TSD) print $1"\t"$3 - 1"\t"$6 + 1"\t"$0}' ${currdir}/${line}_plusminus.bed  > \
+               ${currdir}/${line}_plusminus_2.bed
+          echo -e $(date) " Non-overlapping reads TE insertions reported for "${line}"\n" ####
+          
+          #combine reported TE insertions
+          cat ${currdir}/${line}_plusminus_1.bed ${currdir}/${line}_plusminus_2.bed | \
+               awk -v TEname=$line '{$0=TEname"\t"$0}1' | sort -k 1 | sort -k 2 > \
+               ${currdir}/${line}_insertionRegion.txt
+          
+          
+          cat ${currdir}/${line}_insertionRegion.txt >> ${workingdir}/insertions.txt
+          echo -e $(date) " TE insertions for "${line}" have been reported.\n" ####
+          
+          
+          echo -e $(date) " Transposon named "${line}" is processed.\n"
+	 else
+	      echo -e $(date) " Transposon named "${line}" is processed. No discordant reads found.\n"
+	      rm -r ${currdir}
+	 fi
+}
+
+# functions end
+
+# get arguments
+fa=
+alignment=
+gtf=
+te=
+out=
+intermed=
+bamo="DiscordantReads.bam"
+bedo="TEinsertions.bed"
+gtfo="TEinsertions.gtf"
+outname=""
+fis=400
+picard="picard"
+maxHeapMem=-Xmx2000m
+md=150
+k=20
+d=$(date +%Y%m%d%H%M%S)
+# workingdir=TEfinder_${d}
+workingdir="TEfinder"
+threads=1
+ 
+while [ "$1" != "" ];
+do
+     case $1 in
+     -fa | --FastaFile )
+          shift
+          fa=$1 ;;
+     -alignment | --alignmentFile )
+	      shift
+          alignment=$1 ;;
+     -gtf | --TransposonsInGenome )
+	      shift
+          gtf=$1 ;;
+     -te | --TransposonsToSearch )
+	      shift
+          te=$1  ;;
+     -fis | --FragmentInsertSize )
+	      shift
+          fis=$1 ;;
+     -picard | --pathToPicardjar )
+	      shift
+          picard=$1 ;;
+     -md | --MaxDistanceForMerge )
+	      shift
+          md=$1  ;;
+     -k | --MaxTSDLength )
+	      shift
+          k=$1  ;;
+     -bamo | --DiscordantReads )
+	      shift
+          bamo=$1 ;;
+     -bedo | --bTEinsertions )
+	      shift
+          bedo=$1 ;;
+     -gtfo | --gTEinsertions )
+           shift
+          gtfo=$1 ;;
+     -maxHeapMem | --MaxHeapMemory )
+	      shift
+          maxHeapMem="-Xmx"$1"m" ;;
+     -workingdir | --WorkingDirectory )
+	      shift
+          workingdir=$1 ;;
+     -out | --OutputFormat )
+	      shift
+	   out=$1  ;;
+     -outname | --OutputName )
+              shift
+           outname=$1 ;;
+     -threads | --Threads )
+              shift
+           threads=$1 ;;
+     -intermed | --IntermediateFiles )
+              shift
+           intermed=$1 ;;
+     -h   | --help )
+	      help
+          exit;;
+     *) # error
+          echo "TEfinder: illegal option $1"
+          example
+          exit 1 ;;
+     esac
+     shift
+done
+margs_check $fa $alignment $gtf $te
+
+# main
+
+mkdir ${workingdir}
+
+# remove empty lines from user provided TE list if present
+sed '/^$/d' $te > ${workingdir}"/userTE_noEmptyLines.txt"
+
+# create output file
+printf "%s\t"  "track name=TEfinder" "type=bedDetail" "description=FR:forward read, RR:reverse read, InsRegion:insertion region start and end positions, FILTER:comma separated filters" > ${workingdir}/${outname}TEinsertions.bed
+printf "\n" >> ${workingdir}/${outname}TEinsertions.bed
+
+# create fasta index (fai) file
+cp $fa ${workingdir}/reference.fa
+samtools faidx ${workingdir}/reference.fa
+
+# sort alignment input
+samtools sort -@ $threads -o ${workingdir}/alignmentInput.sorted.bam ${alignment}
+echo -e $(date) " Alignment file sorted successfully.\n"
+
+# remove secondary and supplementary alignments from sorted bam
+samtools view -F 2304 -@ $threads -o ${workingdir}/${outname}Alignments.bam ${workingdir}/alignmentInput.sorted.bam
+echo -e $(date) " Alignments are filtered - secondary and supplementary alignments have been removed. \n"
+
+# run pipeline for each TE
+while IFS="" read -r line || [ -n "$line" ]
+do
+     pipeline &
+done < ${workingdir}/userTE_noEmptyLines.txt
+wait
+echo -e $(date) " All transposons are processed. Finalizing...\n"
+
+# combine discordant bam files
+samtools merge -@ $threads -r ${workingdir}/${outname}DiscordantReads.bam ${workingdir}/*/*_DiscordantPairs.bam
+echo -e $(date) " BAM Output: Discordant pair alignment file is now available.\n"
+# Sorting by position 
+samtools sort -@ $threads ${workingdir}/${outname}DiscordantReads.bam | samtools view -h -o ${workingdir}/${outname}DiscordantReads.sam
+grep -v '^@PG' ${workingdir}/${outname}DiscordantReads.sam > ${workingdir}/${outname}DiscordantReadsNoPG.sam
+rm ${workingdir}/${outname}DiscordantReads.sam
+samtools view -hb -x "PG" --no-PG --remove-flags "PG" -O BAM ${workingdir}/${outname}DiscordantReadsNoPG.sam -o ${bamo}
+rm ${workingdir}/${outname}DiscordantReadsNoPG.sam
+
+# update output BED file with TEfinder results: organize the starting file
+awk '{print $2"\t"$3"\t"$4"\t"$1"\t"$8+$12"\t.\tFR="$8";RR="$12";InsRegion="$6"-"$11";FILTER="}' ${workingdir}/insertions.txt > ${workingdir}/TEinsertions_putative.bed
+# find the entries in repeat regions for filtering
+bedtools intersect -wa -u -a ${workingdir}/TEinsertions_putative.bed -b $gtf > ${workingdir}/TEinsertions_putative_inrepeat.bed
+# filtering process
+while IFS="" read -r line || [ -n "$line" ]
+do
+	#located in repeat region
+	if (grep -Fxq "$line" "${workingdir}/TEinsertions_putative_inrepeat.bed")
+	then
+        line=$line"in_repeat,"
+	fi
+	
+	#weak evidence
+	readc=$(echo $line | awk '{print $5}')
+	if (( $readc < 10 ))
+	then
+	line=$line"weak_evidence,"
+	fi
+	
+	#strand-biased	
+	FR=$(echo $line | grep -o 'FR=[[:digit:]]*' | cut -f2 -d'=')
+	RR=$(echo $line | grep -o 'RR=[[:digit:]]*' | cut -f2 -d'=')
+	var1=$(echo 'e(l('$FR')*1.25)' | bc -l)
+	var2=$(echo 'e(l('$FR')*0.8)' | bc -l)
+	
+	if [ $(echo "$RR > $var1" | bc) -eq 1 ] || [ $(echo "$RR < $var2" | bc) -eq 1 ]
+	then
+	line=$line"strand_bias,"
+	fi
+	
+	#pass
+	lastchar=${line: -1}
+	if [ $lastchar == "," ]
+	then
+	line=${line::${#line}-1}
+	else
+	line=$line"PASS"
+	fi
+	
+	#write to final output
+	printf "%s\n" "$line" >> ${workingdir}/${outname}TEinsertions.bed
+     
+done < ${workingdir}/TEinsertions_putative.bed
+wait
+echo -e $(date) " BED Output: TEfinder output BED file is now available.\n"
+# Sorting
+# cp ${workingdir}/${outname}TEinsertions.bed ${outo}
+bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.bed > ${bedo}
+# cat ${bedo}
+
+# gtf option - create output GTF files with TEfinder results
+if [ ! -z "$out" ]
+then
+  awk 'FNR > 1 {print $1"\tTEfinder\tTIP\t"$2 + 1"\t"$3"\t"$5"\t.\t.\tte_name \""$4"\"; tags \""$7"\""}' ${workingdir}/${outname}TEinsertions.bed > ${workingdir}/${outname}TEinsertions.gtf
+  bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.gtf > ${gtfo}
+  # awk 'FNR > 1 {print $1"\tTEfinder\tTIP\t"$2 + 1"\t"$3"\t"$5"\t.\t.\tte_name \""$4"\"; tags \""$7"\""}' ${bedo} > ${gtfo}
+  # Sorting
+  # cp ${workingdir}/${outname}TEinsertions.gtf ${gtfo}
+  echo -e "\n\n"
+  # cat ${gtfo}
+  # bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.gtf > ${gtfo}
+  echo -e $(date) " GTF Output: TEfinder output GTF file is now available.\n"
+fi
+
+# clean working directory
+if [ -z "$intermed" ]
+then
+ rm ${workingdir}/TEinsertions_putative.bed ${workingdir}/TEinsertions_putative_inrepeat.bed ${workingdir}/reference.fa ${workingdir}/reference.fa.fai \
+ ${workingdir}/alignmentInput.sorted.bam ${workingdir}/insertions.txt ${workingdir}/${outname}Alignments.bam ${workingdir}/userTE_noEmptyLines.txt 
+ rm -r ${workingdir}/*/
+fi
+
+if [ `wc -l <${workingdir}/${outname}TEinsertions.bed` -le "1" ]
+then
+ echo -e $(date) " Error: TEfinder run unsuccessful."
+else
+ echo -e $(date) " TE insertion output files have been created. TEfinder completed successfully."
+fi