Mercurial > repos > namhsuya > te_finder
diff TEfinder @ 0:b81a83c743d3 draft default tip
Uploaded
author | namhsuya |
---|---|
date | Tue, 09 Aug 2022 06:58:49 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEfinder Tue Aug 09 06:58:49 2022 +0000 @@ -0,0 +1,332 @@ +#!/bin/bash +## +## +## Authors: Vista Sohrab & Dilay Hazal Ayhan +## Date: January 15, 2021 +## Description: TEfinder uses discordant reads to detect novel transposable element insertion events in short read paired-end sample sequencing data. +## Software dependencies include bedtools 2.28.0 or later, samtools 1.3 or later, picard 2.0.1 or later +## Required inputs include sample alignment file (.bam|.sam), reference genome FASTA (.fa), reference TE annotation in GFF/GTF or GFF3 (.gff|.gtf), and TEs of interest (.txt) +## +## University of Massachusetts Amherst +## +## +## +## + +set -e + +margs=4 + +# Functions +function example { + echo -e "example: TEfinder -alignment sample.bam -fa reference.fa -gtf TEs.gtf -te List_of_TEs.txt" +} + +function help { + echo -e "REQUIRED:" + echo -e " -alignment, --alignmentFile STR sample reads aligned to reference genome (BAM/SAM file)" + echo -e " -fa, --FastaFile STR reference genome FASTA index (FA file)" + echo -e " -gtf, --TransposonsInGenome STR reference genome TE annotation (GFF2/GTF file)" + echo -e " -te, --TransposonsToSearch STR TE names (single column text file)\n" + echo -e "OPTIONAL:" + echo -e " -bamo, --DiscordantReads STR BAM output\n" + echo -e " -bedo, --bTEinsertions STR TEinsertions BED output\n" + echo -e " -gtfo, --gTEinsertions STR TEinsertions GTF output\n" + echo -e " -fis, --FragmentInsertSize INT short-read sequencing fragment insert size [400]" + echo -e " -picard, --pathToPicardjar STR path to picard tools .jar file [picard.jar]" + echo -e " -md, --MaxDistanceForMerge INT maximum distance between reads for bedtools merge [150]" + echo -e " -k, --MaxTSDLength INT maximum TE target site duplication (TSD) length [20]" + echo -e " -maxHeapMem, --MaxHeapMemory INT java maximum heap memory allocation for picard in Mb [2000]" + echo -e " -workingdir, --WorkingDirectory STR working directory name [TEfinder_<Date>]" + echo -e " -out, --OutputFormat STR output format as GTF [BED]" + echo -e " -outname, --OutputName STR output name prefix added to file names [null]" + echo -e " -threads, --Threads INT number of threads for samtools multi-threading [1]" + echo -e " -intermed --IntermediateFiles STR keep intermediate files created by pipeline [no]" + echo -e " -h, --help prints help\n" + example +} + +# check if mandatory args are empty +function margs_check { + if [ $# -lt $margs ]; then + echo -e "One or more required parameters are missing." + example + exit 1 # error + fi +} + +# main workflow + #### : comment out +function pipeline() { + mkdir ${workingdir}/${line} + currdir=${workingdir}/${line} + echo -e $(date) " Transposon analysis for "${line}" has started\n" + + grep -P '[^(\w|\d|\-|\_|\#|\.)]'${line}'[^(\w|\d|\-|\_|\#|\.)]' $gtf > ${currdir}/${line}_TE.gff + echo -e $(date) " Individual TE GFF has been created for "${line}"\n" #### + + bedtools intersect -abam ${workingdir}/${outname}Alignments.bam -b ${currdir}/${line}_TE.gff -wa > ${currdir}/${line}_MappedReadsToTE.bam + echo -e $(date) " Mapped reads to TE via bedtools intersect has been completed for "${line}"\n" #### + samtools view -@ $threads ${currdir}/${line}_MappedReadsToTE.bam | \ + awk -v Ins=`expr $fis \* 10` '{if (($7 != "=") || ($9 > Ins) || ($9 < -Ins)) print $1}' > ${currdir}/${line}_ReadID.txt + echo -e $(date) " Identifying discordant read IDs has been completed for "${line}"\n" #### + + # if discordant readID file exists, then continue with remainder of TE analysis + if [[ -s ${currdir}/${line}_ReadID.txt ]] + then + #java $maxHeapMem -jar $picard FilterSamReads I=${workingdir}/${outname}Alignments.bam O=${currdir}/${line}_DiscordantPairs.bam READ_LIST_FILE=${currdir}/${line}_ReadID.txt FILTER=includeReadList WRITE_READS_FILES=false + $picard $maxHeapMem FilterSamReads -I ${workingdir}/${outname}Alignments.bam -O ${currdir}/${line}_DiscordantPairs.bam -READ_LIST_FILE ${currdir}/${line}_ReadID.txt -FILTER includeReadList -WRITE_READS_FILES false &>/dev/null + echo -e $(date) " Filtering original alignment based on discordant reads IDs is complete for "${line}"\n" #### + + bedtools merge -d $md -S + -c 1 -o count -i ${currdir}/${line}_DiscordantPairs.bam | \ + awk '{if ($4 > 3) print $0}' > ${currdir}/${line}_plusCluster.bed + echo -e $(date) " Primary reads from the + strand have been merged if read count greater than 3 for "${line}"\n" #### + + bedtools merge -d $md -S - -c 1 -o count -i ${currdir}/${line}_DiscordantPairs.bam | \ + awk '{if ($4 > 3) print $0}' > ${currdir}/${line}_minusCluster.bed + echo -e $(date) " Primary reads from the - strand have been merged if read count greater than 3 for "${line}"\n" #### + + # filtering edges piped into bedtools merge (keeping read counts greater than 3 in the line above) + ## find the closest minus strand to the plus strand in the cluster + ## filter by the distance between the plus and minus clusters - only retain pairs if reads are 0-100 bases away + ## if plus strand start is less than minus strand start and plus strand end is less than minus strand end then in proper orientation + bedtools closest -d -g ${workingdir}/reference.fa.fai -t first -a ${currdir}/${line}_plusCluster.bed -b ${currdir}/${line}_minusCluster.bed | \ + awk -v TSD=$k '{if ($9 <= TSD && $9 >= 0) print $0}' | \ + awk '{if ($2 < $6 && $3 < $7) print $0}' > ${currdir}/${line}_plusminus.bed + echo -e $(date) " Filtration of clusters in proper orientation using bedtools closest has been completed for "${line}"\n" #### + + # if plus strand end is greater than minus strand start, then report the pair + awk '{if ($3 > $6) print $1"\t"$6"\t"$3"\t"$0}' ${currdir}/${line}_plusminus.bed > ${currdir}/${line}_plusminus_1.bed + echo -e $(date) " Overlapping reads TE insertions reported for "${line}"\n" #### + + #if plus strand end is less than or equal to minus strand start and the region in between is less than a user-defined value k, report the pair + awk -v TSD=$k '{if ($3 <= $6 && $6 - $3 < TSD) print $1"\t"$3 - 1"\t"$6 + 1"\t"$0}' ${currdir}/${line}_plusminus.bed > \ + ${currdir}/${line}_plusminus_2.bed + echo -e $(date) " Non-overlapping reads TE insertions reported for "${line}"\n" #### + + #combine reported TE insertions + cat ${currdir}/${line}_plusminus_1.bed ${currdir}/${line}_plusminus_2.bed | \ + awk -v TEname=$line '{$0=TEname"\t"$0}1' | sort -k 1 | sort -k 2 > \ + ${currdir}/${line}_insertionRegion.txt + + + cat ${currdir}/${line}_insertionRegion.txt >> ${workingdir}/insertions.txt + echo -e $(date) " TE insertions for "${line}" have been reported.\n" #### + + + echo -e $(date) " Transposon named "${line}" is processed.\n" + else + echo -e $(date) " Transposon named "${line}" is processed. No discordant reads found.\n" + rm -r ${currdir} + fi +} + +# functions end + +# get arguments +fa= +alignment= +gtf= +te= +out= +intermed= +bamo="DiscordantReads.bam" +bedo="TEinsertions.bed" +gtfo="TEinsertions.gtf" +outname="" +fis=400 +picard="picard" +maxHeapMem=-Xmx2000m +md=150 +k=20 +d=$(date +%Y%m%d%H%M%S) +# workingdir=TEfinder_${d} +workingdir="TEfinder" +threads=1 + +while [ "$1" != "" ]; +do + case $1 in + -fa | --FastaFile ) + shift + fa=$1 ;; + -alignment | --alignmentFile ) + shift + alignment=$1 ;; + -gtf | --TransposonsInGenome ) + shift + gtf=$1 ;; + -te | --TransposonsToSearch ) + shift + te=$1 ;; + -fis | --FragmentInsertSize ) + shift + fis=$1 ;; + -picard | --pathToPicardjar ) + shift + picard=$1 ;; + -md | --MaxDistanceForMerge ) + shift + md=$1 ;; + -k | --MaxTSDLength ) + shift + k=$1 ;; + -bamo | --DiscordantReads ) + shift + bamo=$1 ;; + -bedo | --bTEinsertions ) + shift + bedo=$1 ;; + -gtfo | --gTEinsertions ) + shift + gtfo=$1 ;; + -maxHeapMem | --MaxHeapMemory ) + shift + maxHeapMem="-Xmx"$1"m" ;; + -workingdir | --WorkingDirectory ) + shift + workingdir=$1 ;; + -out | --OutputFormat ) + shift + out=$1 ;; + -outname | --OutputName ) + shift + outname=$1 ;; + -threads | --Threads ) + shift + threads=$1 ;; + -intermed | --IntermediateFiles ) + shift + intermed=$1 ;; + -h | --help ) + help + exit;; + *) # error + echo "TEfinder: illegal option $1" + example + exit 1 ;; + esac + shift +done +margs_check $fa $alignment $gtf $te + +# main + +mkdir ${workingdir} + +# remove empty lines from user provided TE list if present +sed '/^$/d' $te > ${workingdir}"/userTE_noEmptyLines.txt" + +# create output file +printf "%s\t" "track name=TEfinder" "type=bedDetail" "description=FR:forward read, RR:reverse read, InsRegion:insertion region start and end positions, FILTER:comma separated filters" > ${workingdir}/${outname}TEinsertions.bed +printf "\n" >> ${workingdir}/${outname}TEinsertions.bed + +# create fasta index (fai) file +cp $fa ${workingdir}/reference.fa +samtools faidx ${workingdir}/reference.fa + +# sort alignment input +samtools sort -@ $threads -o ${workingdir}/alignmentInput.sorted.bam ${alignment} +echo -e $(date) " Alignment file sorted successfully.\n" + +# remove secondary and supplementary alignments from sorted bam +samtools view -F 2304 -@ $threads -o ${workingdir}/${outname}Alignments.bam ${workingdir}/alignmentInput.sorted.bam +echo -e $(date) " Alignments are filtered - secondary and supplementary alignments have been removed. \n" + +# run pipeline for each TE +while IFS="" read -r line || [ -n "$line" ] +do + pipeline & +done < ${workingdir}/userTE_noEmptyLines.txt +wait +echo -e $(date) " All transposons are processed. Finalizing...\n" + +# combine discordant bam files +samtools merge -@ $threads -r ${workingdir}/${outname}DiscordantReads.bam ${workingdir}/*/*_DiscordantPairs.bam +echo -e $(date) " BAM Output: Discordant pair alignment file is now available.\n" +# Sorting by position +samtools sort -@ $threads ${workingdir}/${outname}DiscordantReads.bam | samtools view -h -o ${workingdir}/${outname}DiscordantReads.sam +grep -v '^@PG' ${workingdir}/${outname}DiscordantReads.sam > ${workingdir}/${outname}DiscordantReadsNoPG.sam +rm ${workingdir}/${outname}DiscordantReads.sam +samtools view -hb -x "PG" --no-PG --remove-flags "PG" -O BAM ${workingdir}/${outname}DiscordantReadsNoPG.sam -o ${bamo} +rm ${workingdir}/${outname}DiscordantReadsNoPG.sam + +# update output BED file with TEfinder results: organize the starting file +awk '{print $2"\t"$3"\t"$4"\t"$1"\t"$8+$12"\t.\tFR="$8";RR="$12";InsRegion="$6"-"$11";FILTER="}' ${workingdir}/insertions.txt > ${workingdir}/TEinsertions_putative.bed +# find the entries in repeat regions for filtering +bedtools intersect -wa -u -a ${workingdir}/TEinsertions_putative.bed -b $gtf > ${workingdir}/TEinsertions_putative_inrepeat.bed +# filtering process +while IFS="" read -r line || [ -n "$line" ] +do + #located in repeat region + if (grep -Fxq "$line" "${workingdir}/TEinsertions_putative_inrepeat.bed") + then + line=$line"in_repeat," + fi + + #weak evidence + readc=$(echo $line | awk '{print $5}') + if (( $readc < 10 )) + then + line=$line"weak_evidence," + fi + + #strand-biased + FR=$(echo $line | grep -o 'FR=[[:digit:]]*' | cut -f2 -d'=') + RR=$(echo $line | grep -o 'RR=[[:digit:]]*' | cut -f2 -d'=') + var1=$(echo 'e(l('$FR')*1.25)' | bc -l) + var2=$(echo 'e(l('$FR')*0.8)' | bc -l) + + if [ $(echo "$RR > $var1" | bc) -eq 1 ] || [ $(echo "$RR < $var2" | bc) -eq 1 ] + then + line=$line"strand_bias," + fi + + #pass + lastchar=${line: -1} + if [ $lastchar == "," ] + then + line=${line::${#line}-1} + else + line=$line"PASS" + fi + + #write to final output + printf "%s\n" "$line" >> ${workingdir}/${outname}TEinsertions.bed + +done < ${workingdir}/TEinsertions_putative.bed +wait +echo -e $(date) " BED Output: TEfinder output BED file is now available.\n" +# Sorting +# cp ${workingdir}/${outname}TEinsertions.bed ${outo} +bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.bed > ${bedo} +# cat ${bedo} + +# gtf option - create output GTF files with TEfinder results +if [ ! -z "$out" ] +then + awk 'FNR > 1 {print $1"\tTEfinder\tTIP\t"$2 + 1"\t"$3"\t"$5"\t.\t.\tte_name \""$4"\"; tags \""$7"\""}' ${workingdir}/${outname}TEinsertions.bed > ${workingdir}/${outname}TEinsertions.gtf + bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.gtf > ${gtfo} + # awk 'FNR > 1 {print $1"\tTEfinder\tTIP\t"$2 + 1"\t"$3"\t"$5"\t.\t.\tte_name \""$4"\"; tags \""$7"\""}' ${bedo} > ${gtfo} + # Sorting + # cp ${workingdir}/${outname}TEinsertions.gtf ${gtfo} + echo -e "\n\n" + # cat ${gtfo} + # bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.gtf > ${gtfo} + echo -e $(date) " GTF Output: TEfinder output GTF file is now available.\n" +fi + +# clean working directory +if [ -z "$intermed" ] +then + rm ${workingdir}/TEinsertions_putative.bed ${workingdir}/TEinsertions_putative_inrepeat.bed ${workingdir}/reference.fa ${workingdir}/reference.fa.fai \ + ${workingdir}/alignmentInput.sorted.bam ${workingdir}/insertions.txt ${workingdir}/${outname}Alignments.bam ${workingdir}/userTE_noEmptyLines.txt + rm -r ${workingdir}/*/ +fi + +if [ `wc -l <${workingdir}/${outname}TEinsertions.bed` -le "1" ] +then + echo -e $(date) " Error: TEfinder run unsuccessful." +else + echo -e $(date) " TE insertion output files have been created. TEfinder completed successfully." +fi