Mercurial > repos > xuef > vdm_plot
changeset 4:5a1317a26120 draft default tip
Deleted selected files
author | xuef |
---|---|
date | Mon, 16 Nov 2020 19:49:11 +0000 |
parents | 403a83d4b888 |
children | |
files | my_VDM_tool.R my_VDM_tool.xml |
diffstat | 2 files changed, 0 insertions(+), 790 deletions(-) [+] |
line wrap: on
line diff
--- a/my_VDM_tool.R Fri Nov 06 16:39:21 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,592 +0,0 @@ -#!/usr/bin/env Rscript -# rm(list=ls()) -# myfunction(inf="nor22.vcf",itype="C.elegans",qual=400,thrup=1.0,thrlow=0.0,allr="AB",snp=TRUE,lsp=0.4,pcol="black",lcol="red",xstand=TRUE,bsize=1000000,bnorm=FALSE,exclf=NULL,exclthr=0.0,exclcol="green",parn="parsed.txt",outn="output.txt",pdfn="plot.pdf") - -#Rscript my_VDM_tool.R --inf "nor22.vcf" --itype "C.elegans" --qual 400 --thrup 1.0 --thrlow 0.0 --allr "AB" --snp TRUE --lsp 0.4 --pcol "black" --lcol "red" --xstand TRUE --bsize 1000000 --bnorm FALSE --exclf NULL --exclthr 0.0 --exclcol "green" --parn "parsed.txt" --outn "output.txt" --pdfn "plot.pdf" - -library("getopt") -#input from trailing line arguments -args <- commandArgs(trailingOnly = TRUE) -#read the options from input commandArgs -option_specification = matrix(c( - 'inf','i01',1,'character', - 'itype','i02',2,'character', - 'qual','i03',2,'double', - 'thrup','i04',2,'double', - 'thrlow','i05',2,'double', - 'allr','i06',2,'character', - 'snp','i07',2,'logical', - 'lsp','i08',2,'double', - 'pcol','i09',2,'character', - 'lcol','i10',2,'character', - 'xstand','i11',2,'logical', - 'bsize','i12',2,'integer', - 'bnorm','i13',2,'logical', - 'exclf','i14',2,'character', - 'exclthr','i15',2,'double', - 'exclcol','i16',2,'character', - 'parn','o1',2,'character', - 'outn','o2',2,'character', - 'pdfn','o3',2,'character' -), byrow=TRUE, ncol=4) - -# #parse options # bnorm=FALSE,exclf=NULL,exclthr=0.0,exclcol="green",parn="parsed.txt",outn="output.txt",pdfn="plot.pdf") -options = getopt(option_specification) -# -# #FOR DEBUGGING -# options<-NULL -# ###INPUT FILE -# setwd("D:/Dropbox/_galaxy/") -# options$inf<-"nor22.vcf" -# ###BASE OPTIONS -# #interval type -# options$itype<-"C.elegans" -# #quality filter -# options$qual<-200 -# #for scaling 0-1 = upper threshold for what is considered homozygous -# options$thrup<-1 -# #for scaling 0-1 = lower threshold for what is considered homozygous -# options$thrlow<-0 -# #type of allelic ratio (AB/ratio) -# options$allr<-"AB" -# #include complex variants -# options$snp<-FALSE -# ###ADDITIONAL VARIANT EXCLUSION OPTIONS -# #files with variants to exclude -# options$exclf<-NULL -# options$exclf<-c("nor22chunk1.txt","nor22chunk5.txt") -# #for variants to exclude, bottom threshold for which to be used, i.e. 0=ALL, 1=HOM only, 0.8=near HOM) -# options$exclthr<-0 -# #additional colour option for pre-subtraction line -# options$exclcol<-"green" -# ###PLOT OPTIONS -# #loess span -# options$lsp<-0.4 -# #point colour -# options$pcol<-"black" -# #loess plot colour -# options$lcol<-"red" -# #standardize x-axis interval (e.g. 1Mb interval) -# options$xstand<-TRUE -# #bin size for barplot -# options$bsize<-1000000 -# #normalization for barplot -# options$bnorm<-FALSE -# ###OUTPUT OPTIONS -# #custom files names (may not work for Galaxy) -# options$outn<-paste(gsub("vcf","",options$inf),"_output_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".txt",sep="") -# options$parn<-paste(gsub("vcf","",options$inf),"_parsed_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".txt",sep="") -# options$pdfn<-paste(gsub("vcf","",options$inf),"_plot_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".pdf",sep="") -# #fixed file names (will work in Galaxy) -# options$outn<-"vcf_output.txt" -# options$parn<-"vcf_parsedinput.txt" -# options$pdfn<-"vdm_mapping_plot.pdf" - - -myfunction<-function(inf,itype,qual,thrup,thrlow,allr,snp,lsp,pcol,lcol,xstand,bsize,bnorm,exclf,exclthr,exclcol,parn,outn,pdfn){ - - #PARAMETERS - # filename<-options$inf - # interval_type<-options$itype - # read_qual<-options$qual - # threshold_upper<-options$thrup - # threshold_lower<-options$thrlow - # allele_ratio<-options$allr - # snp_only<-options$snp - # loess_span<-options$lsp - # plot_color<-options$pcol - # loess_color<-options$lcol - # #transparency for selected colour (to see plot points underneath) - # xaxis_standard<-options$xstand - # bin_size<-options$bsize - # bfreq_norm<-options$bnorm - # exclusion_list<-options$exclf - # excl_threshold<-options$exclthr - # excl_loess_color<-options$exclcol - # vcfoutput_filename<-options$outn - # vcfparsed_filename<-options$parn - # pdf_filename<-options$pdfn - - # plot_color<-rgb(c(col2rgb(plot_color)[1]),c(col2rgb(plot_color)[2]),c(col2rgb(plot_color)[3]),max=255,alpha=150) - # loess_color<-rgb(c(col2rgb(loess_color)[1]),c(col2rgb(loess_color)[2]),c(col2rgb(loess_color)[3]),max=255,alpha=150) - # excl_loess_color<-rgb(c(col2rgb(excl_loess_color)[1]),c(col2rgb(excl_loess_color)[2]),c(col2rgb(excl_loess_color)[3]),max=255,alpha=150) - - filename<-inf - interval_type<-itype - read_qual<-as.numeric(qual) - threshold_upper<-as.numeric(thrup) - threshold_lower<-as.numeric(thrlow) - allele_ratio<-allr - snp_only<-snp - loess_span<-as.numeric(lsp) - plot_color<-pcol - loess_color<-lcol - xaxis_standard<-xstand - bin_size<-as.numeric(bsize) - bfreq_norm<-bnorm - exclusion_list<-exclf - excl_threshold<-as.numeric(exclthr) - excl_loess_color<-exclcol - vcfoutput_filename<-outn - vcfparsed_filename<-parn - pdf_filename<-pdfn - - #transparency for selected colour (to see plot points underneath) - plot_color<-rgb(c(col2rgb(plot_color)[1]),c(col2rgb(plot_color)[2]),c(col2rgb(plot_color)[3]),max=255,alpha=150) - loess_color<-rgb(c(col2rgb(loess_color)[1]),c(col2rgb(loess_color)[2]),c(col2rgb(loess_color)[3]),max=255,alpha=150) - excl_loess_color<-rgb(c(col2rgb(excl_loess_color)[1]),c(col2rgb(excl_loess_color)[2]),c(col2rgb(excl_loess_color)[3]),max=255,alpha=150) - - ###FIXED PARAMETERS - #linkage scatter plot yaxis max value=1 - sp_yaxis<-1 - #chromosome intervals in Mb rather than custom - interval_unit<-1000000 - - -###################### -###READ IN VCF FILE -#extract column names -vcf_readin<-readLines(filename) -#find header line, i.e. last line to begin with # -for(l in 1:length(vcf_readin)){ - vcf_readinl<-vcf_readin[l] - if(substr(vcf_readinl,1,1)=="#"){next} - else if(substr(vcf_readinl,1,1)!="#"){rowline<-l-1;break} -} -vcf_header<-vcf_readin[rowline] -#e.g. CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\trgSM" -vcf_header<-gsub("#","",vcf_header) -vcf_colnames<-unlist(strsplit(vcf_header,"\t")) - -#extract data (hashed vcf header skipped with read.table) -vcf_rtable<-read.table(filename,sep="\t",stringsAsFactors=FALSE) -names(vcf_rtable)<-vcf_colnames - -###################### -###PREPARE DATA - -vcfinfo_dat<-NULL -vcfinfo_pdat<-NULL -multiallele_counter<-0 -diviserror_counter<-0 -for(i in c(1:nrow(vcf_rtable))){ - vcf_line<-vcf_rtable[i,] - #to speed up runtime- quality filter here - if(vcf_line$QUAL>=read_qual){ - #remove chrom or chr prefix from chromosome value - if(grepl("chrom",vcf_line$CHROM,ignore.case=TRUE)==TRUE){ - vcf_line$CHROM<-gsub("chrom","",vcf_line$CHROM,ignore.case=TRUE) - }else if(grepl("chr",vcf_line$CHROM,ignore.case=TRUE)==TRUE){ - vcf_line$CHROM<-gsub("chr","",vcf_line$CHROM,ignore.case=TRUE) - } -#PARSE INFO - vcfinfo_split<-strsplit(vcf_line$INFO,split=";") - vcfinfo_coln<-gsub("=.*","",unlist(vcfinfo_split)) - vcfinfo_cold<-gsub(".*=","",unlist(vcfinfo_split)) - vcfinfo_ldat<-data.frame(t(vcfinfo_cold),stringsAsFactors=FALSE) - names(vcfinfo_ldat)<-vcfinfo_coln - - #skip if commas in values to avoid returning errors - if(grepl(",",vcfinfo_ldat$AO)==TRUE){ - multiallele_counter<-multiallele_counter+1 - next - } - #skip divide by zero errors (under "ratio" setting for ratio calculation) - if(as.numeric(vcfinfo_ldat$AO)+as.numeric(vcfinfo_ldat$RO)=="0"){ - diviserror_counter<-diviserror_counter+1 - next - } - - #specific accounting for nonstandard categories - #LOF columns only present for loss-of-function variants + assign NA values to all other variants - if(("LOF" %in% names(vcfinfo_ldat))==TRUE){ - LOF<-vcfinfo_ldat$LOF - vcfinfo_ldat<-vcfinfo_ldat[,!names(vcfinfo_ldat) %in% "LOF"] - vcfinfo_ldat<-cbind(vcfinfo_ldat,LOF) - }else{ - LOF<-"NA" - vcfinfo_ldat<-cbind(vcfinfo_ldat,LOF) - } - #NMD columns only present for nonsense-mediated-decay variants + assign NA values to all other variants - if(("NMD" %in% names(vcfinfo_ldat))==TRUE){ - NMD<-vcfinfo_ldat$NMD - vcfinfo_ldat<-vcfinfo_ldat[,!names(vcfinfo_ldat) %in% "NMD"] - vcfinfo_ldat<-cbind(vcfinfo_ldat,NMD) - }else{ - NMD<-"NA" - vcfinfo_ldat<-cbind(vcfinfo_ldat,NMD) - } - - #general accounting for nonstandard categories - - -#PARSE ANNOTATION - ann_rparsed<-unlist(strsplit(vcfinfo_ldat$ANN[1],split="\\|"))[1:20] - ann_rparsed[ann_rparsed==""]<-"novalue" - ann_parsed<-data.frame(t(ann_rparsed),stringsAsFactors=FALSE) - names(ann_parsed)<-paste("ANN",c(1:dim(ann_parsed)[2]),sep="") - #remove duplicate redundant INFO column (fully parsed) - vcf_line<-vcf_line[,names(vcf_line)!="INFO"] - - #dataset keeping unparsed annotation (full) - vcfinfo_pldat<-cbind(vcf_line,vcfinfo_ldat) - vcfinfo_pdat<-rbind(vcfinfo_pdat,vcfinfo_pldat) - - #dataset keeping parsed annotation (partial parsed-for relevant) - vcfinfo_ldat<-vcfinfo_ldat[,names(vcfinfo_ldat)!="ANN"] - #append copy of original data to parsed data - vcfinfo_lldat<-cbind(vcf_line,vcfinfo_ldat,ann_parsed) - vcfinfo_dat<-rbind(vcfinfo_dat,vcfinfo_lldat) - } -} -print(paste("rows with multiple alleles skipped: ",multiallele_counter,sep="")) -# print(paste("rows with AO+RO=0 (not multiple alleles) skipped: ",diviserror_counter,sep="")) - -#ENSURE CORRECT DATATYPES -#convert dataframe columns of factor type to character type -vcfinfo_dat<-data.frame(lapply(vcfinfo_dat,as.character),stringsAsFactors=FALSE) -#convert to numeric if column is numeric and not string -for(n in c(1:dim(vcfinfo_dat)[2])){ - #suppress warnings when columns with strings encountered (not converted) - suppressWarnings( - colnum_index<-!is.na(as.numeric(vcfinfo_dat[,n])) - ) - if(all(colnum_index)==TRUE){ - vcfinfo_dat[,n]<-as.numeric(vcfinfo_dat[,n]) - } -} - -###################### -#RATIO CALCULATION -#ratio calculation from AO and RO -RATIO<-c(vcfinfo_dat$AO/(vcfinfo_dat$AO+vcfinfo_dat$RO)) -#add adj_AB for AB=0->AO=1 conversion -adj_AB<-replace(vcfinfo_dat$AB,vcfinfo_dat$AB==0,1) -vcfinfo_dat<-cbind(vcfinfo_dat,RATIO,adj_AB) -vcfinfo_dat<-vcfinfo_dat[with(vcfinfo_dat,order(CHROM,POS)),] - -#CONSIDER ONLY SNP VARIANTS -if(snp_only==TRUE){ - vcfinfo_dat<-subset(vcfinfo_dat,CIGAR=="1X") -} - -###################### -#SUBTRACT VARIANTS FROM EXCLUSION LIST -# if(length(exclusion_list)>0){ -# if(exclusion_list!="FALSE"){ -if(exclusion_list!="FALSE"){ - #keep copy of pre-subtraction data for later plotting - vcfinfo_origdat<-vcfinfo_dat - - #identifiers for exclusion list based on CHROM/POS/REF/ALT - index1<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,sep="_") - index2<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,vcfinfo_dat$REF,sep="_") - index3<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,vcfinfo_dat$REF,vcfinfo_dat$ALT,sep="_") - vcfinfo_dat<-cbind(vcfinfo_dat,index1,index2,index3) - - print(paste("before subtraction: ",nrow(vcfinfo_dat),sep="")) - #loop and subtract through exclusion lists (if multiple files) - for(exclusion_ind in exclusion_list){ - exclin<-read.table(exclusion_ind,header=TRUE) - - #THRESHOLD FILTER ON EXCLUSION LIST VARIANTS - if(allele_ratio=="AB"){ - exclin<-subset(exclin,adj_AB>=excl_threshold) - } - if(allele_ratio=="ratio"){ - exclin<-subset(exclin,ratio>=excl_threshold) - } - - #identifiers for vcf data based on CHROM/POS/REF/ALT - index1<-paste(exclin$CHROM,exclin$POS,sep="_") - index2<-paste(exclin$CHROM,exclin$POS,exclin$REF,sep="_") - index3<-paste(exclin$CHROM,exclin$POS,exclin$REF,exclin$ALT,sep="_") - exclin<-cbind(exclin,index1,index2,index3) - #exclude based on CHROM+POS+REF+ALT - vcfinfo_dat<-subset(vcfinfo_dat,!(index3 %in% exclin$index3)) - } - print(paste("after subtraction: ",nrow(vcfinfo_dat),sep="")) -} - -###################### -#WRITE TO OUTPUT -#select relevant columns 2 variant type; 4 gene; !5 wormbase ID; 8 type change; 10 nucleotide change; 11 amino acid change; 16 warning message -vcfinfo_simp<-subset(vcfinfo_dat,select=c("CHROM","POS","QUAL","DP","REF","ALT","AB","AO","RO","RATIO","adj_AB","ANN2","ANN4","ANN8","ANN10","ANN11","ANN16")) -names(vcfinfo_simp)<-c("CHROM","POS","QUAL","DP","REF","ALT","AB","AO","RO","RATIO","adj_AB","VARTYPE","GENE","TYPE","NTCHANGE","PRCHANGE","WARNINGS") -vcfsimp_dat<-vcfinfo_simp[with(vcfinfo_simp,order(CHROM,POS)),] - -#write table with quality filtered variants for VDM plotting and relevant columns -try( - write.table(vcfsimp_dat,vcfoutput_filename,sep="\t",quote=FALSE,row.names=FALSE) - ,silent=TRUE) -#write table with all unfiltered variants and all columns including parsed INFO -try( - write.table(vcfinfo_pdat,vcfparsed_filename,sep="\t",quote=FALSE,row.names=FALSE) - ,silent=TRUE) - - -###################### -###CHROMOSOME (INTERVAL) ARRANGEMENT -#define chromosome and chromosome size in Mb -if(interval_type == 'C.elegans'){ - chrom_n<-c('I','II','III','IV','V','X') - chrom_mb<-c(16,16,14,18,21,18) - interval_frame<-data.frame(chrom_n,chrom_mb) -} else if(interval_type == 'Zebrafish'){ - chrom_n<-c('1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25') - chrom_mb<-c(61,61,64,63,76,60,78,57,59,47,47,51,55,54,48,59,54,50,51,56,45,43,47,44,39) - interval_frame<-data.frame(chrom_n,chrom_mb) -} else if(interval_type == 'Brachypodium'){ - chrom_n<-c('1','2','3','4','5') - chrom_mb<-c(75,60,60,50,30) - interval_frame<-data.frame(chrom_n,chrom_mb) -} else if(interval_type == 'Arabidopsis'){ - chrom_n<-c('1','2','3','4','5') - chrom_mb<-c(31, 20,24,19,27 ) - interval_frame<-data.frame(chrom_n,chrom_mb) -} else{ - #user interval file- no headers, with chromosome in column 1 (format CHR# or CHROM#) and size in Mb (rounded up) in column 2 - # user_interval_type<-read.table(user_interval_file) - user_interval_type<-read.table(interval_type) - if(grepl("chrom",user_interval_type[1,1],ignore.case=TRUE)==TRUE){ - user_interval_type[,1]<-gsub("chrom","",user_interval_type[,1],ignore.case=TRUE) - }else if(grep("chr",user_interval_type[1,1],ignore.case=TRUE)==TRUE){ - user_interval_type[,1]<-gsub("chr","",user_interval_type[,1],ignore.case=TRUE) - } - chrom_n<-user_interval_type[,1] - chrom_mb<-user_interval_type[,2] - interval_frame<-data.frame(chrom_n,chrom_mb) -} -names(interval_frame)<-c("CHROM","INTERVAL") - - -###################### -###PLOTTING -#VDM SCATTER PLOT -#save to pdf -pdf(file=pdf_filename,width=9,height=8) -#par(mfrow=c(2,3)) -for(chromind in interval_frame$CHROM){ - #subset by data by chromosome for plotting - intervalind<-interval_frame$INTERVAL[interval_frame$CHROM==chromind] - chr_dat<-subset(vcfsimp_dat,CHROM==chromind,silent=TRUE) - - #same subsetting by chromosome for pre-subtraction data - # if(length(exclusion_list)>0){ - if(exclusion_list!="FALSE"){ - chr_origdat<-subset(vcfinfo_origdat,CHROM==chromind,silent=TRUE) - } - - #define x-axis upper limit - if(xaxis_standard==TRUE){ - #for standardized x-axis (max x-axis chromosome length) - scupper_xaxis<-max(interval_frame$INTERVAL) - scupper_xval<-scupper_xaxis*interval_unit - } else if(xaxis_standard==FALSE){ - scupper_xaxis<-intervalind - scupper_xval<-intervalind*interval_unit - } - - if(allele_ratio=="AB"){ - plot(chr_dat$POS,chr_dat$adj_AB,cex=0.60,xlim=c(0,scupper_xval),ylim=c(0,sp_yaxis),main=paste("Chr",chromind," Variant Discovery Mapping",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Ratio of Variant Reads/Total Reads [AB]',pch=10, col=plot_color,xaxt='n') - try(lines(loess.smooth(chr_dat$POS,chr_dat$adj_AB,span=loess_span),lwd=5,col=loess_color)) - #plot loess curve for data without subtraction of exclusion variants - # if(length(exclusion_list)>0){ - if(exclusion_list!="FALSE"){ - try(lines(loess.smooth(chr_origdat$POS,chr_origdat$adj_AB,span=loess_span),lty="longdash",lwd=4,col=excl_loess_color)) - } - - axis(1,at=seq(0,scupper_xval,by=interval_unit),labels=c(0:scupper_xaxis)) - abline(h=seq(0,sp_yaxis,by=0.1),v=c(1:scupper_xaxis)*interval_unit,col="gray") - } else if(allele_ratio=="ratio"){ - plot(chr_dat$POS,chr_dat$RATIO,cex=0.60,xlim=c(0,scupper_xval),ylim=c(0,sp_yaxis),main=paste("Chr",chromind," Variant Discovery Mapping",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Ratio of Variant Reads/Total Reads [ratio]',pch=10, col=plot_color,xaxt='n') - try(lines(loess.smooth(chr_dat$POS,chr_dat$RATIO,span=loess_span),lwd=5,col=loess_color)) - #plot loess curve for data without subtraction of exclusion variants - # if(length(exclusion_list)>0){ - if(exclusion_list!="FALSE"){ - try(lines(loess.smooth(chr_origdat$POS,chr_origdat$adj_AB,span=loess_span),lty="longdash",lwd=4,col=excl_loess_color)) - } - axis(1,at=seq(0,scupper_xval,by=interval_unit),labels=c(0:scupper_xaxis)) - abline(h=seq(0,sp_yaxis,by=0.1),v=c(1:scupper_xaxis)*interval_unit,col="gray") - } -} - -###################### -#graph barplots -location_index<-NULL -meanSNP_dat<-NULL -# prepare table of counts and calculations -for(chromind in interval_frame$CHROM){ - #for standardized x-axis - if(xaxis_standard==TRUE){ - intervalind<-max(interval_frame$INTERVAL)*1000000/bin_size - } else if(xaxis_standard==FALSE){ - intervalind<-interval_frame$INTERVAL[interval_frame$CHROM==chromind]*1000000/bin_size - } - #start intervals with **1 and end with **0 - interval_begin<-c(((0:(intervalind-1))*bin_size)+1) - interval_end<-c((1:intervalind)*bin_size) - - #define x-axis upper limit - if(xaxis_standard==TRUE){ - upper_xaxis<-max(interval_frame$INTERVAL) - } else if(xaxis_standard==FALSE){ - upper_xaxis<-interval_frame$INTERVAL[interval_frame$CHROM==chromind] - } - #prepare columns - snp_counter<-0 - purealt_counter<-0 - pureref_counter<-0 - het_counter<-0 - chr_mean<-0 - normed_freq<-0 - ratio<-0 - - interval_index<-data.frame(chromind,interval_begin,interval_end,snp_counter,purealt_counter,pureref_counter,het_counter,chr_mean,normed_freq) - chr_dat<-subset(vcfinfo_dat,CHROM==chromind) - #ratio calculation - ratio<-chr_dat$AO/(chr_dat$AO+chr_dat$RO) - - #if counter based on adj_AB or ratio - if(allele_ratio=="ratio"){ - chr_purealtdat<-subset(chr_dat,ratio>=threshold_upper)#;chr_purealtdat - chr_purerefdat<-subset(chr_dat,ratio<=threshold_lower)#;chr_purerefdat - chr_hetdat<-subset(chr_dat,ratio>threshold_lower & ratio<threshold_upper)#;chr_hetdat - } else if(allele_ratio=="AB"){ - chr_purealtdat<-subset(chr_dat,adj_AB>=threshold_upper)#;chr_purealtdat - chr_purerefdat<-subset(chr_dat,adj_AB<=threshold_lower)#;chr_purerefdat - chr_hetdat<-subset(chr_dat,adj_AB>threshold_lower & adj_AB<threshold_upper)#;chr_hetdat - } - #if chromosome with data, count number of snps within each bin (positions rounded up to nearest bin), else skip to next chromosome - if(dim(chr_dat)[1]>0){ - for(i in 1:dim(chr_dat)[1]){ - chr_datind<-chr_dat[i,] - #round up to nearest bin-size interval - chr_datind_upper<-ceiling(chr_datind$POS/bin_size)*bin_size - interval_coln<-NULL;interval_rown<-NULL - #identify row and and counter column to increment - interval_coln<-which(names(interval_index)=="snp_counter") - interval_rown<-match(chr_datind_upper,interval_index$interval_end) - interval_index[interval_rown,interval_coln]<-c(interval_index$snp_counter[interval_rown]+1) - } - }else{ - next - } - ##if chromosome with pure AO, count number of snps with each bin (positions rounded up to nearest bin) - if(dim(chr_purealtdat)[1]>0){ - for(i in 1:dim(chr_purealtdat)[1]){ - chr_purealtind<-chr_purealtdat[i,] - chr_purealtind_upper<-ceiling(chr_purealtind$POS/bin_size)*bin_size - interval_coln<-NULL;interval_rown<-NULL - interval_coln<-which(names(interval_index)=="purealt_counter") - interval_rown<-match(chr_purealtind_upper,interval_index$interval_end) - interval_index[interval_rown,interval_coln]<-c(interval_index$purealt_counter[interval_rown]+1) - } - } - #if chromosome with pure RO, count number of snps with each bin (positions rounded up to nearest bin) - if(dim(chr_purerefdat)[1]>0){ - for(i in 1:dim(chr_purerefdat)[1]){ - chr_purerefind<-chr_purerefdat[i,] - chr_purerefind_upper<-ceiling(chr_purerefind$POS/bin_size)*bin_size - interval_coln<-NULL;interval_rown<-NULL - interval_coln<-which(names(interval_index)=="pureref_counter") - interval_rown<-match(chr_purerefind_upper,interval_index$interval_end) - interval_index[interval_rown,interval_coln]<-c(interval_index$pureref_counter[interval_rown]+1) - } - } - #if chromosome with hets, count number of snps with each bin (positions rounded up to nearest bin) - if(dim(chr_hetdat)[1]>0){ - for(i in 1:dim(chr_hetdat)[1]){ - chr_hetind<-chr_hetdat[i,] - chr_hetind_upper<-ceiling(chr_hetind$POS/bin_size)*bin_size - interval_coln<-NULL;interval_rown<-NULL - interval_coln<-which(names(interval_index)=="het_counter") - interval_rown<-match(chr_hetind_upper,interval_index$interval_end) - interval_index[interval_rown,interval_coln]<-c(interval_index$het_counter[interval_rown]+1) - } - } - #irrespective of standardized x-axis, mean should be calculated from actual interval range of chromosome - chr_mean<-sum(interval_index$purealt_counter)/(interval_frame$INTERVAL[interval_frame$CHROM==chromind]*1000000/bin_size) - interval_index$chr_mean<-chr_mean - meanSNP_dat<-rbind(meanSNP_dat,data.frame(chromind,chr_mean)) - #normalization treatment for if SNPs are AO=0, AO=total SNPs, or AO and RO in bin - for(i in 1:dim(interval_index)[1]){ - chr_intind<-interval_index[i,] - #hom definition based on specified upper and lower thresholds - if(chr_intind$purealt_counter<=threshold_lower){ - interval_coln<-NULL - interval_coln<-which(names(interval_index)=="normed_freq") - interval_index[i,interval_coln]=0 - } else if (chr_intind$purealt_counter==chr_intind$snp_counter){ - interval_coln<-NULL - interval_coln<-which(names(interval_index)=="normed_freq") - interval_index[i,interval_coln]=(chr_intind$purealt_counter)^2/chr_mean - } else { - interval_coln<-NULL - interval_coln<-which(names(interval_index)=="normed_freq") - interval_index[i,interval_coln]=chr_mean*(chr_intind$purealt_counter)^2/(chr_intind$snp_counter-chr_intind$purealt_counter) - } - } - location_index<-rbind(location_index,interval_index) -} - -for(chromind in interval_frame$CHROM){ - interval_index<-location_index[location_index$chromind==chromind,] - #assign 0 values to avoid empty datatable error - if(dim(interval_index)[1]==0){ - interval_index[1,]<-rep(0,dim(interval_index)[2]) - } - # } - #set up x_axis - if(xaxis_standard==TRUE){ - #for standardized x-axis (max x-axis chromosome length) - bpupper_xaxis<-max(interval_frame$INTERVAL) - bpupper_xval<-bpupper_xaxis*interval_unit - }else if(xaxis_standard==FALSE){ - bpupper_xaxis<-intervalind - bpupper_xval<-intervalind*interval_unit - } - #set up y_axis range for barplots - if(bfreq_norm==TRUE){ - bp_yaxis<-5*ceiling(max(location_index$normed_freq)/5) - #assign non-0 value to yaxis to avoid error - if(bp_yaxis==0){ - bp_yaxis<-10 - } - if(xaxis_standard==TRUE){ - bplot<-barplot(interval_index$normed_freq,space=0,ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Normalised Frequency') - }else if(xaxis_standard==FALSE){ - bplot<-barplot(interval_index$normed_freq,space=0,xlim=c(0,bpupper_xaxis),ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Normalised Frequency') - } - - }else if(bfreq_norm==FALSE){ - bp_yaxis<-5*ceiling(max(location_index$purealt_counter)/5) - #assign non-0 value to yaxis to avoid error - if(bp_yaxis==0){ - bp_yaxis<-10 - } - if(xaxis_standard==TRUE){ - bplot<-barplot(interval_index$purealt_counter,space=0,ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Frequency') - }else if(xaxis_standard==FALSE){ - bplot<-barplot(interval_index$purealt_counter,space=0,xlim=c(0,bpupper_xaxis),ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Frequency') - } - } - bp_xaxis1<-as.numeric(bplot) - bp_xaxis2<-c(bp_xaxis1,tail(bp_xaxis1,1)+bp_xaxis1[2]-bp_xaxis1[1]) - bp_xaxis<-bp_xaxis2-bp_xaxis1[1] - - axis(1,at=bp_xaxis,labels=seq(0,bpupper_xaxis,by=c(bin_size/1000000))) - - } - dev.off() -} - - -# myfunction(options$inf,options$itype,options$qual,options$thrup,options$thrlow,options$allr,options$snp,options$lsp,options$pcol,options$lcol,options$xstand, -# options$bsize,options$bnorm,options$exclf,options$exclthr,options$exclcol,options$parn,options$outn,options$pdfn) - - -myfunction(inf=options$inf,itype=options$itype,qual=options$qual,thrup=options$thrup,thrlow=options$thrlow, - allr=options$allr,snp=options$snp,lsp=options$lsp,pcol=options$pcol,lcol=options$lcol,xstand=options$xstand,bsize=options$bsize, - bnorm=options$bnorm,exclf=options$exclf,exclthr=options$exclthr,exclcol=options$exclcol,parn=options$parn,outn=options$outn,pdfn=options$pdfn) -
--- a/my_VDM_tool.xml Fri Nov 06 16:39:21 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,198 +0,0 @@ - <tool id="my_VDM_tool" name="VDM_tool" version="1.0.0"> -<!--A simple description of the tool that will appear in the tool panel in Galaxy.--> -<description>Map a mutation using in silico bulk segregant linkage analysis of pooled recombinant lines generated through backcrossing.</description> -<!-- Handles exit codes in Galaxy. --> -<stdio> - <exit_code range="1:"/> -</stdio> -<requirements> - <requirement type="package" version="3.2.1">R</requirement> - <requirement type="package" version="1.2.0">getopt</requirement> -</requirements> - -<command> -Rscript /home/fxue/galaxy/tools/my_VDM_tool/my_VDM_tool.R - --inf "$inf" - #if $species.species_select=="Celegans" - --itype "$species.ce" - #else if $species.species_select=="Zebrafish" - --itype "$species.ze" - #else if $species.species_select=="Brachypodium" - --itype "$species.br" - #else if $species.species_select=="Arabidopsis" - --itype "$species.ar" - #else if $species.species_select=="other" - --itype "$species.ot" - #end if - - --qual $qual - --thrup $thrup - --thrlow $thrlow - - #if $allfreq.allfreq_select=="AB" - --allr "$allfreq.ab" - #else if $allfreq.allfreq_select=="ratio" - --allr "$allfreq.ratio" - #end if - - #if $only_snp.only_snp_select=="TRUE" - --snp "$only_snp.true" - #else if $only_snp.only_snp_select=="FALSE" - --snp "$only_snp.false" - #end if - - --lsp $lsp - --pcol "$pcol" - --lcol "$lcol" - - #if $xaxis.xaxis_select=="TRUE" - --xstand $xaxis.true - #else if $xaxis.xaxis_select=="FALSE" - --xstand $xaxis.false - #end if - - --bsize $bsize - - #if $binnorm.binnorm_select=="TRUE" - --bnorm $binnorm.true - #else if $binnorm.binnorm_select=="FALSE" - --bnorm $binnorm.false - #end if - - #if $exclfiles.exclfiles_select=="FALSE" - --exclf $exclfiles.false - #else if $exclfiles.exclfiles_select=="TRUE" - --exclf $exclfiles.true - #end if - - - --exclthr $exclthr - --exclcol "$exclcol" - - --parn "$parn" - --outn "$outn" - --pdfn "$pdfn" - -</command> -<inputs> -<param type="data" name="inf" format="vcf" label="fastq file"/> - -<conditional name="species"> - <param name="species_select" type="select" label="Select the species"> - <option value="Celegans">C. elegans</option> - <option value="Zebrafish">Zebrafish</option> - <option value="Brachypodium">Brachypodium</option> - <option value="Arabidopsis">Arabidopsis</option> - <option value="other">other</option> - </param> - <when value="Celegans"> - <param name="ce" type="hidden" value="C.elegans" label="The C. elegans chromosome numbers and lengths (in Mb)" help="C.elegans help"/> - </when> - <when value="Zebrafish"> - <param name="ze" type="hidden" value="Zebrafish" label="The Zebrafish chromosome numbers and lengths (in Mb)" help="Zebrafish help"/> - </when> - <when value="Brachypodium"> - <param name="br" type="hidden" value="Brachypodium" label="The Brachypodium chromosome numbers and lengths (in Mb)" help="Brachypodium help"/> - </when> - <when value="Arabidopsis"> - <param name="ar" type="hidden" value="Arabidopsis" label="The Arabidopsis chromosome numbers and lengths (in Mb)" help="Arabidopsis help"/> - </when> - <when value="other"> - <param name="ot" type="data" format="tabular" label="Select file with chromosome numbers and lengths (in Mb) from your history" help="Table consisting of chromosome number in column 1 and length (in Mb) in column 2 (e.g. 'CHRI 16' or 'CHR1 16') with no column header names, tab-delimitation, and no quotation marks in a .txt file"/> - </when> -</conditional> - -<param type="float" name="qual" value="200" label="Filter by quality" help="Filter results based on quality value"/> -<param type="float" name="thrup" value="1" label="upper threshold for homozygosity" help="Allele frequency values greater than or equal to this will be considered as homozygous ALT for barplots of frequency homozygous variants along chromosomes"/> -<param type="float" name="thrlow" value="0" label="lower threshold for homozygosity" help="Allele frequency values less than or equal to this will be considered as homozygous for barplots of frequency homozygous REF variants along chromosomes"/> - - -<conditional name="allfreq"> - <param name="allfreq_select" type="select" label="Select the source for allele frequency"> - <option value="AB">AB</option> - <option value="ratio">AO/(AO+RO)</option> - </param> - <when value="AB"> - <param name="ab" type="hidden" value="AB" label="Use AB field (from Freebayes) as the value for allele frequency" help=" "/> - </when> - <when value="ratio"> - <param name="ratio" type="hidden" value="ratio" label="Use AO/(AO+RO) calculation (from Freebayes) as the value for allele frequency" help=" "/> - </when> - </conditional> - -<conditional name="only_snp"> - <param name="only_snp_select" type="select" label="Select type of variants to use for plotting"> - <option value="TRUE">SNPs</option> - <option value="FALSE">all</option> - </param> - <when value="TRUE"> - <param name="true" type="hidden" value="TRUE" label="Use only SNP variants" help=" "/> - </when> - <when value="FALSE"> - <param name="false" type="hidden" value="FALSE" label="Use all types of variants" help=" "/> - </when> - </conditional> - -<param type="float" name="lsp" value="0.4" label="Loess span" help="Parameter that controls the smoothing of the Loess curve"/> -<param type="text" name="pcol" value="black" label="Colour of scatterplot points" help="See below for list of supported colors"/> -<param type="text" name="lcol" value="red" label="Colour of Loess curve" help="See below for list of supported colors"/> - - -<conditional name="xaxis"> - <param name="xaxis_select" type="select" label="Spacing of the x-axis in plots"> - <option value="TRUE">True</option> - <option value="FALSE">False</option> - </param> - <when value="TRUE"> - <param name="true" type="hidden" value="TRUE" label="Uniform spacing of the x-axis based on Mb" help="Scale of x-axis (in Mb) is fixed for the scatter plots and frequency plots across all chromosomes"/> - </when> - <when value="FALSE"> - <param name="false" type="hidden" value="FALSE" label="Variable spacing of the x-axis based on chromosome lengths" help="Scale of x-axis (in Mb) is dependent on chromosome length for the scatter plots and frequency plots for all chromosomes"/> - </when> - </conditional> - -<param type="integer" name="bsize" value="1000000" label="bin size" help="Size of the bins (in bp) for barplot of frequency of homozygous variants along chromosomes"/> - -<conditional name="binnorm"> - <param name="binnorm_select" type="select" label="Normalisation of y-axis in frequency barplots"> - <option value="TRUE">True</option> - <option value="FALSE">False</option> - </param> - <when value="TRUE"> - <param name="true" type="hidden" value="TRUE" label="Normalised y-axis frequency values based on formula" help="Normalisation formula as in cloudmap paper"/> - </when> - <when value="FALSE"> - <param name="false" type="hidden" value="FALSE" label="Original frequency y-axis values" help=" "/> - </when> - </conditional> - -<conditional name="exclfiles"> - <param name="exclfiles_select" type="select" label="Additional exclusion of variants by subtraction"> - <option value="FALSE">No</option> - <option value="TRUE">Yes</option> - </param> - <when value="FALSE"> - <param name="false" type="hidden" value="FALSE" label="No additional variant subtraction" help=""/> - </when> - <when value="TRUE"> - <param name="true" type="data" format="tabular" label="Select variant lists to subtract from your history" help="Requires CHR POS DEPTH REF ALT columns- recommend directly using the output table generated by this tool or refer to it for desired format"/> - - <param type="float" name="exclthr" value="0" label="Filter based on allelic ratio values" help="For filtering variant subtraction lists, only variants above this threshold value will be used for subtraction (e.g. 0 means all variants and 1 means only homozygous variants"/> - <param type="text" name="exclcol" value="green" label="Colour of original loess curve (before additional variant subtraction)" help="See below for list of supported colors"/> - </when> -</conditional> - -</inputs> - -<outputs> -<data name="parn" format="txt"/> -<data name="outn" format="txt"/> -<data name="pdfn" format="pdf"/> -</outputs> - -<tests> -</tests> - -<help> -</help> -</tool> \ No newline at end of file