Mercurial > repos > namhsuya > te_finder
view TEfinder @ 0:b81a83c743d3 draft default tip
Uploaded
author | namhsuya |
---|---|
date | Tue, 09 Aug 2022 06:58:49 +0000 |
parents | |
children |
line wrap: on
line source
#!/bin/bash ## ## ## Authors: Vista Sohrab & Dilay Hazal Ayhan ## Date: January 15, 2021 ## Description: TEfinder uses discordant reads to detect novel transposable element insertion events in short read paired-end sample sequencing data. ## Software dependencies include bedtools 2.28.0 or later, samtools 1.3 or later, picard 2.0.1 or later ## Required inputs include sample alignment file (.bam|.sam), reference genome FASTA (.fa), reference TE annotation in GFF/GTF or GFF3 (.gff|.gtf), and TEs of interest (.txt) ## ## University of Massachusetts Amherst ## ## ## ## set -e margs=4 # Functions function example { echo -e "example: TEfinder -alignment sample.bam -fa reference.fa -gtf TEs.gtf -te List_of_TEs.txt" } function help { echo -e "REQUIRED:" echo -e " -alignment, --alignmentFile STR sample reads aligned to reference genome (BAM/SAM file)" echo -e " -fa, --FastaFile STR reference genome FASTA index (FA file)" echo -e " -gtf, --TransposonsInGenome STR reference genome TE annotation (GFF2/GTF file)" echo -e " -te, --TransposonsToSearch STR TE names (single column text file)\n" echo -e "OPTIONAL:" echo -e " -bamo, --DiscordantReads STR BAM output\n" echo -e " -bedo, --bTEinsertions STR TEinsertions BED output\n" echo -e " -gtfo, --gTEinsertions STR TEinsertions GTF output\n" echo -e " -fis, --FragmentInsertSize INT short-read sequencing fragment insert size [400]" echo -e " -picard, --pathToPicardjar STR path to picard tools .jar file [picard.jar]" echo -e " -md, --MaxDistanceForMerge INT maximum distance between reads for bedtools merge [150]" echo -e " -k, --MaxTSDLength INT maximum TE target site duplication (TSD) length [20]" echo -e " -maxHeapMem, --MaxHeapMemory INT java maximum heap memory allocation for picard in Mb [2000]" echo -e " -workingdir, --WorkingDirectory STR working directory name [TEfinder_<Date>]" echo -e " -out, --OutputFormat STR output format as GTF [BED]" echo -e " -outname, --OutputName STR output name prefix added to file names [null]" echo -e " -threads, --Threads INT number of threads for samtools multi-threading [1]" echo -e " -intermed --IntermediateFiles STR keep intermediate files created by pipeline [no]" echo -e " -h, --help prints help\n" example } # check if mandatory args are empty function margs_check { if [ $# -lt $margs ]; then echo -e "One or more required parameters are missing." example exit 1 # error fi } # main workflow #### : comment out function pipeline() { mkdir ${workingdir}/${line} currdir=${workingdir}/${line} echo -e $(date) " Transposon analysis for "${line}" has started\n" grep -P '[^(\w|\d|\-|\_|\#|\.)]'${line}'[^(\w|\d|\-|\_|\#|\.)]' $gtf > ${currdir}/${line}_TE.gff echo -e $(date) " Individual TE GFF has been created for "${line}"\n" #### bedtools intersect -abam ${workingdir}/${outname}Alignments.bam -b ${currdir}/${line}_TE.gff -wa > ${currdir}/${line}_MappedReadsToTE.bam echo -e $(date) " Mapped reads to TE via bedtools intersect has been completed for "${line}"\n" #### samtools view -@ $threads ${currdir}/${line}_MappedReadsToTE.bam | \ awk -v Ins=`expr $fis \* 10` '{if (($7 != "=") || ($9 > Ins) || ($9 < -Ins)) print $1}' > ${currdir}/${line}_ReadID.txt echo -e $(date) " Identifying discordant read IDs has been completed for "${line}"\n" #### # if discordant readID file exists, then continue with remainder of TE analysis if [[ -s ${currdir}/${line}_ReadID.txt ]] then #java $maxHeapMem -jar $picard FilterSamReads I=${workingdir}/${outname}Alignments.bam O=${currdir}/${line}_DiscordantPairs.bam READ_LIST_FILE=${currdir}/${line}_ReadID.txt FILTER=includeReadList WRITE_READS_FILES=false $picard $maxHeapMem FilterSamReads -I ${workingdir}/${outname}Alignments.bam -O ${currdir}/${line}_DiscordantPairs.bam -READ_LIST_FILE ${currdir}/${line}_ReadID.txt -FILTER includeReadList -WRITE_READS_FILES false &>/dev/null echo -e $(date) " Filtering original alignment based on discordant reads IDs is complete for "${line}"\n" #### bedtools merge -d $md -S + -c 1 -o count -i ${currdir}/${line}_DiscordantPairs.bam | \ awk '{if ($4 > 3) print $0}' > ${currdir}/${line}_plusCluster.bed echo -e $(date) " Primary reads from the + strand have been merged if read count greater than 3 for "${line}"\n" #### bedtools merge -d $md -S - -c 1 -o count -i ${currdir}/${line}_DiscordantPairs.bam | \ awk '{if ($4 > 3) print $0}' > ${currdir}/${line}_minusCluster.bed echo -e $(date) " Primary reads from the - strand have been merged if read count greater than 3 for "${line}"\n" #### # filtering edges piped into bedtools merge (keeping read counts greater than 3 in the line above) ## find the closest minus strand to the plus strand in the cluster ## filter by the distance between the plus and minus clusters - only retain pairs if reads are 0-100 bases away ## if plus strand start is less than minus strand start and plus strand end is less than minus strand end then in proper orientation bedtools closest -d -g ${workingdir}/reference.fa.fai -t first -a ${currdir}/${line}_plusCluster.bed -b ${currdir}/${line}_minusCluster.bed | \ awk -v TSD=$k '{if ($9 <= TSD && $9 >= 0) print $0}' | \ awk '{if ($2 < $6 && $3 < $7) print $0}' > ${currdir}/${line}_plusminus.bed echo -e $(date) " Filtration of clusters in proper orientation using bedtools closest has been completed for "${line}"\n" #### # if plus strand end is greater than minus strand start, then report the pair awk '{if ($3 > $6) print $1"\t"$6"\t"$3"\t"$0}' ${currdir}/${line}_plusminus.bed > ${currdir}/${line}_plusminus_1.bed echo -e $(date) " Overlapping reads TE insertions reported for "${line}"\n" #### #if plus strand end is less than or equal to minus strand start and the region in between is less than a user-defined value k, report the pair awk -v TSD=$k '{if ($3 <= $6 && $6 - $3 < TSD) print $1"\t"$3 - 1"\t"$6 + 1"\t"$0}' ${currdir}/${line}_plusminus.bed > \ ${currdir}/${line}_plusminus_2.bed echo -e $(date) " Non-overlapping reads TE insertions reported for "${line}"\n" #### #combine reported TE insertions cat ${currdir}/${line}_plusminus_1.bed ${currdir}/${line}_plusminus_2.bed | \ awk -v TEname=$line '{$0=TEname"\t"$0}1' | sort -k 1 | sort -k 2 > \ ${currdir}/${line}_insertionRegion.txt cat ${currdir}/${line}_insertionRegion.txt >> ${workingdir}/insertions.txt echo -e $(date) " TE insertions for "${line}" have been reported.\n" #### echo -e $(date) " Transposon named "${line}" is processed.\n" else echo -e $(date) " Transposon named "${line}" is processed. No discordant reads found.\n" rm -r ${currdir} fi } # functions end # get arguments fa= alignment= gtf= te= out= intermed= bamo="DiscordantReads.bam" bedo="TEinsertions.bed" gtfo="TEinsertions.gtf" outname="" fis=400 picard="picard" maxHeapMem=-Xmx2000m md=150 k=20 d=$(date +%Y%m%d%H%M%S) # workingdir=TEfinder_${d} workingdir="TEfinder" threads=1 while [ "$1" != "" ]; do case $1 in -fa | --FastaFile ) shift fa=$1 ;; -alignment | --alignmentFile ) shift alignment=$1 ;; -gtf | --TransposonsInGenome ) shift gtf=$1 ;; -te | --TransposonsToSearch ) shift te=$1 ;; -fis | --FragmentInsertSize ) shift fis=$1 ;; -picard | --pathToPicardjar ) shift picard=$1 ;; -md | --MaxDistanceForMerge ) shift md=$1 ;; -k | --MaxTSDLength ) shift k=$1 ;; -bamo | --DiscordantReads ) shift bamo=$1 ;; -bedo | --bTEinsertions ) shift bedo=$1 ;; -gtfo | --gTEinsertions ) shift gtfo=$1 ;; -maxHeapMem | --MaxHeapMemory ) shift maxHeapMem="-Xmx"$1"m" ;; -workingdir | --WorkingDirectory ) shift workingdir=$1 ;; -out | --OutputFormat ) shift out=$1 ;; -outname | --OutputName ) shift outname=$1 ;; -threads | --Threads ) shift threads=$1 ;; -intermed | --IntermediateFiles ) shift intermed=$1 ;; -h | --help ) help exit;; *) # error echo "TEfinder: illegal option $1" example exit 1 ;; esac shift done margs_check $fa $alignment $gtf $te # main mkdir ${workingdir} # remove empty lines from user provided TE list if present sed '/^$/d' $te > ${workingdir}"/userTE_noEmptyLines.txt" # create output file printf "%s\t" "track name=TEfinder" "type=bedDetail" "description=FR:forward read, RR:reverse read, InsRegion:insertion region start and end positions, FILTER:comma separated filters" > ${workingdir}/${outname}TEinsertions.bed printf "\n" >> ${workingdir}/${outname}TEinsertions.bed # create fasta index (fai) file cp $fa ${workingdir}/reference.fa samtools faidx ${workingdir}/reference.fa # sort alignment input samtools sort -@ $threads -o ${workingdir}/alignmentInput.sorted.bam ${alignment} echo -e $(date) " Alignment file sorted successfully.\n" # remove secondary and supplementary alignments from sorted bam samtools view -F 2304 -@ $threads -o ${workingdir}/${outname}Alignments.bam ${workingdir}/alignmentInput.sorted.bam echo -e $(date) " Alignments are filtered - secondary and supplementary alignments have been removed. \n" # run pipeline for each TE while IFS="" read -r line || [ -n "$line" ] do pipeline & done < ${workingdir}/userTE_noEmptyLines.txt wait echo -e $(date) " All transposons are processed. Finalizing...\n" # combine discordant bam files samtools merge -@ $threads -r ${workingdir}/${outname}DiscordantReads.bam ${workingdir}/*/*_DiscordantPairs.bam echo -e $(date) " BAM Output: Discordant pair alignment file is now available.\n" # Sorting by position samtools sort -@ $threads ${workingdir}/${outname}DiscordantReads.bam | samtools view -h -o ${workingdir}/${outname}DiscordantReads.sam grep -v '^@PG' ${workingdir}/${outname}DiscordantReads.sam > ${workingdir}/${outname}DiscordantReadsNoPG.sam rm ${workingdir}/${outname}DiscordantReads.sam samtools view -hb -x "PG" --no-PG --remove-flags "PG" -O BAM ${workingdir}/${outname}DiscordantReadsNoPG.sam -o ${bamo} rm ${workingdir}/${outname}DiscordantReadsNoPG.sam # update output BED file with TEfinder results: organize the starting file awk '{print $2"\t"$3"\t"$4"\t"$1"\t"$8+$12"\t.\tFR="$8";RR="$12";InsRegion="$6"-"$11";FILTER="}' ${workingdir}/insertions.txt > ${workingdir}/TEinsertions_putative.bed # find the entries in repeat regions for filtering bedtools intersect -wa -u -a ${workingdir}/TEinsertions_putative.bed -b $gtf > ${workingdir}/TEinsertions_putative_inrepeat.bed # filtering process while IFS="" read -r line || [ -n "$line" ] do #located in repeat region if (grep -Fxq "$line" "${workingdir}/TEinsertions_putative_inrepeat.bed") then line=$line"in_repeat," fi #weak evidence readc=$(echo $line | awk '{print $5}') if (( $readc < 10 )) then line=$line"weak_evidence," fi #strand-biased FR=$(echo $line | grep -o 'FR=[[:digit:]]*' | cut -f2 -d'=') RR=$(echo $line | grep -o 'RR=[[:digit:]]*' | cut -f2 -d'=') var1=$(echo 'e(l('$FR')*1.25)' | bc -l) var2=$(echo 'e(l('$FR')*0.8)' | bc -l) if [ $(echo "$RR > $var1" | bc) -eq 1 ] || [ $(echo "$RR < $var2" | bc) -eq 1 ] then line=$line"strand_bias," fi #pass lastchar=${line: -1} if [ $lastchar == "," ] then line=${line::${#line}-1} else line=$line"PASS" fi #write to final output printf "%s\n" "$line" >> ${workingdir}/${outname}TEinsertions.bed done < ${workingdir}/TEinsertions_putative.bed wait echo -e $(date) " BED Output: TEfinder output BED file is now available.\n" # Sorting # cp ${workingdir}/${outname}TEinsertions.bed ${outo} bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.bed > ${bedo} # cat ${bedo} # gtf option - create output GTF files with TEfinder results if [ ! -z "$out" ] then awk 'FNR > 1 {print $1"\tTEfinder\tTIP\t"$2 + 1"\t"$3"\t"$5"\t.\t.\tte_name \""$4"\"; tags \""$7"\""}' ${workingdir}/${outname}TEinsertions.bed > ${workingdir}/${outname}TEinsertions.gtf bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.gtf > ${gtfo} # awk 'FNR > 1 {print $1"\tTEfinder\tTIP\t"$2 + 1"\t"$3"\t"$5"\t.\t.\tte_name \""$4"\"; tags \""$7"\""}' ${bedo} > ${gtfo} # Sorting # cp ${workingdir}/${outname}TEinsertions.gtf ${gtfo} echo -e "\n\n" # cat ${gtfo} # bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.gtf > ${gtfo} echo -e $(date) " GTF Output: TEfinder output GTF file is now available.\n" fi # clean working directory if [ -z "$intermed" ] then rm ${workingdir}/TEinsertions_putative.bed ${workingdir}/TEinsertions_putative_inrepeat.bed ${workingdir}/reference.fa ${workingdir}/reference.fa.fai \ ${workingdir}/alignmentInput.sorted.bam ${workingdir}/insertions.txt ${workingdir}/${outname}Alignments.bam ${workingdir}/userTE_noEmptyLines.txt rm -r ${workingdir}/*/ fi if [ `wc -l <${workingdir}/${outname}TEinsertions.bed` -le "1" ] then echo -e $(date) " Error: TEfinder run unsuccessful." else echo -e $(date) " TE insertion output files have been created. TEfinder completed successfully." fi