3
|
1 #!/usr/bin/env Rscript
|
|
2 # rm(list=ls())
|
|
3 # myfunction(inf="nor22.vcf",itype="C.elegans",qual=400,thrup=1.0,thrlow=0.0,allr="AB",snp=TRUE,lsp=0.4,pcol="black",lcol="red",xstand=TRUE,bsize=1000000,bnorm=FALSE,exclf=NULL,exclthr=0.0,exclcol="green",parn="parsed.txt",outn="output.txt",pdfn="plot.pdf")
|
|
4
|
|
5 #Rscript my_VDM_tool.R --inf "nor22.vcf" --itype "C.elegans" --qual 400 --thrup 1.0 --thrlow 0.0 --allr "AB" --snp TRUE --lsp 0.4 --pcol "black" --lcol "red" --xstand TRUE --bsize 1000000 --bnorm FALSE --exclf NULL --exclthr 0.0 --exclcol "green" --parn "parsed.txt" --outn "output.txt" --pdfn "plot.pdf"
|
|
6
|
|
7 library("getopt")
|
|
8 #input from trailing line arguments
|
|
9 args <- commandArgs(trailingOnly = TRUE)
|
|
10 #read the options from input commandArgs
|
|
11 option_specification = matrix(c(
|
|
12 'inf','i01',1,'character',
|
|
13 'itype','i02',2,'character',
|
|
14 'qual','i03',2,'double',
|
|
15 'thrup','i04',2,'double',
|
|
16 'thrlow','i05',2,'double',
|
|
17 'allr','i06',2,'character',
|
|
18 'snp','i07',2,'logical',
|
|
19 'lsp','i08',2,'double',
|
|
20 'pcol','i09',2,'character',
|
|
21 'lcol','i10',2,'character',
|
|
22 'xstand','i11',2,'logical',
|
|
23 'bsize','i12',2,'integer',
|
|
24 'bnorm','i13',2,'logical',
|
|
25 'exclf','i14',2,'character',
|
|
26 'exclthr','i15',2,'double',
|
|
27 'exclcol','i16',2,'character',
|
|
28 'parn','o1',2,'character',
|
|
29 'outn','o2',2,'character',
|
|
30 'pdfn','o3',2,'character'
|
|
31 ), byrow=TRUE, ncol=4)
|
|
32
|
|
33 # #parse options # bnorm=FALSE,exclf=NULL,exclthr=0.0,exclcol="green",parn="parsed.txt",outn="output.txt",pdfn="plot.pdf")
|
|
34 options = getopt(option_specification)
|
|
35 #
|
|
36 # #FOR DEBUGGING
|
|
37 # options<-NULL
|
|
38 # ###INPUT FILE
|
|
39 # setwd("D:/Dropbox/_galaxy/")
|
|
40 # options$inf<-"nor22.vcf"
|
|
41 # ###BASE OPTIONS
|
|
42 # #interval type
|
|
43 # options$itype<-"C.elegans"
|
|
44 # #quality filter
|
|
45 # options$qual<-200
|
|
46 # #for scaling 0-1 = upper threshold for what is considered homozygous
|
|
47 # options$thrup<-1
|
|
48 # #for scaling 0-1 = lower threshold for what is considered homozygous
|
|
49 # options$thrlow<-0
|
|
50 # #type of allelic ratio (AB/ratio)
|
|
51 # options$allr<-"AB"
|
|
52 # #include complex variants
|
|
53 # options$snp<-FALSE
|
|
54 # ###ADDITIONAL VARIANT EXCLUSION OPTIONS
|
|
55 # #files with variants to exclude
|
|
56 # options$exclf<-NULL
|
|
57 # options$exclf<-c("nor22chunk1.txt","nor22chunk5.txt")
|
|
58 # #for variants to exclude, bottom threshold for which to be used, i.e. 0=ALL, 1=HOM only, 0.8=near HOM)
|
|
59 # options$exclthr<-0
|
|
60 # #additional colour option for pre-subtraction line
|
|
61 # options$exclcol<-"green"
|
|
62 # ###PLOT OPTIONS
|
|
63 # #loess span
|
|
64 # options$lsp<-0.4
|
|
65 # #point colour
|
|
66 # options$pcol<-"black"
|
|
67 # #loess plot colour
|
|
68 # options$lcol<-"red"
|
|
69 # #standardize x-axis interval (e.g. 1Mb interval)
|
|
70 # options$xstand<-TRUE
|
|
71 # #bin size for barplot
|
|
72 # options$bsize<-1000000
|
|
73 # #normalization for barplot
|
|
74 # options$bnorm<-FALSE
|
|
75 # ###OUTPUT OPTIONS
|
|
76 # #custom files names (may not work for Galaxy)
|
|
77 # options$outn<-paste(gsub("vcf","",options$inf),"_output_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".txt",sep="")
|
|
78 # options$parn<-paste(gsub("vcf","",options$inf),"_parsed_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".txt",sep="")
|
|
79 # options$pdfn<-paste(gsub("vcf","",options$inf),"_plot_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".pdf",sep="")
|
|
80 # #fixed file names (will work in Galaxy)
|
|
81 # options$outn<-"vcf_output.txt"
|
|
82 # options$parn<-"vcf_parsedinput.txt"
|
|
83 # options$pdfn<-"vdm_mapping_plot.pdf"
|
|
84
|
|
85
|
|
86 myfunction<-function(inf,itype,qual,thrup,thrlow,allr,snp,lsp,pcol,lcol,xstand,bsize,bnorm,exclf,exclthr,exclcol,parn,outn,pdfn){
|
|
87
|
|
88 #PARAMETERS
|
|
89 # filename<-options$inf
|
|
90 # interval_type<-options$itype
|
|
91 # read_qual<-options$qual
|
|
92 # threshold_upper<-options$thrup
|
|
93 # threshold_lower<-options$thrlow
|
|
94 # allele_ratio<-options$allr
|
|
95 # snp_only<-options$snp
|
|
96 # loess_span<-options$lsp
|
|
97 # plot_color<-options$pcol
|
|
98 # loess_color<-options$lcol
|
|
99 # #transparency for selected colour (to see plot points underneath)
|
|
100 # xaxis_standard<-options$xstand
|
|
101 # bin_size<-options$bsize
|
|
102 # bfreq_norm<-options$bnorm
|
|
103 # exclusion_list<-options$exclf
|
|
104 # excl_threshold<-options$exclthr
|
|
105 # excl_loess_color<-options$exclcol
|
|
106 # vcfoutput_filename<-options$outn
|
|
107 # vcfparsed_filename<-options$parn
|
|
108 # pdf_filename<-options$pdfn
|
|
109
|
|
110 # plot_color<-rgb(c(col2rgb(plot_color)[1]),c(col2rgb(plot_color)[2]),c(col2rgb(plot_color)[3]),max=255,alpha=150)
|
|
111 # loess_color<-rgb(c(col2rgb(loess_color)[1]),c(col2rgb(loess_color)[2]),c(col2rgb(loess_color)[3]),max=255,alpha=150)
|
|
112 # excl_loess_color<-rgb(c(col2rgb(excl_loess_color)[1]),c(col2rgb(excl_loess_color)[2]),c(col2rgb(excl_loess_color)[3]),max=255,alpha=150)
|
|
113
|
|
114 filename<-inf
|
|
115 interval_type<-itype
|
|
116 read_qual<-as.numeric(qual)
|
|
117 threshold_upper<-as.numeric(thrup)
|
|
118 threshold_lower<-as.numeric(thrlow)
|
|
119 allele_ratio<-allr
|
|
120 snp_only<-snp
|
|
121 loess_span<-as.numeric(lsp)
|
|
122 plot_color<-pcol
|
|
123 loess_color<-lcol
|
|
124 xaxis_standard<-xstand
|
|
125 bin_size<-as.numeric(bsize)
|
|
126 bfreq_norm<-bnorm
|
|
127 exclusion_list<-exclf
|
|
128 excl_threshold<-as.numeric(exclthr)
|
|
129 excl_loess_color<-exclcol
|
|
130 vcfoutput_filename<-outn
|
|
131 vcfparsed_filename<-parn
|
|
132 pdf_filename<-pdfn
|
|
133
|
|
134 #transparency for selected colour (to see plot points underneath)
|
|
135 plot_color<-rgb(c(col2rgb(plot_color)[1]),c(col2rgb(plot_color)[2]),c(col2rgb(plot_color)[3]),max=255,alpha=150)
|
|
136 loess_color<-rgb(c(col2rgb(loess_color)[1]),c(col2rgb(loess_color)[2]),c(col2rgb(loess_color)[3]),max=255,alpha=150)
|
|
137 excl_loess_color<-rgb(c(col2rgb(excl_loess_color)[1]),c(col2rgb(excl_loess_color)[2]),c(col2rgb(excl_loess_color)[3]),max=255,alpha=150)
|
|
138
|
|
139 ###FIXED PARAMETERS
|
|
140 #linkage scatter plot yaxis max value=1
|
|
141 sp_yaxis<-1
|
|
142 #chromosome intervals in Mb rather than custom
|
|
143 interval_unit<-1000000
|
|
144
|
|
145
|
|
146 ######################
|
|
147 ###READ IN VCF FILE
|
|
148 #extract column names
|
|
149 vcf_readin<-readLines(filename)
|
|
150 #find header line, i.e. last line to begin with #
|
|
151 for(l in 1:length(vcf_readin)){
|
|
152 vcf_readinl<-vcf_readin[l]
|
|
153 if(substr(vcf_readinl,1,1)=="#"){next}
|
|
154 else if(substr(vcf_readinl,1,1)!="#"){rowline<-l-1;break}
|
|
155 }
|
|
156 vcf_header<-vcf_readin[rowline]
|
|
157 #e.g. CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\trgSM"
|
|
158 vcf_header<-gsub("#","",vcf_header)
|
|
159 vcf_colnames<-unlist(strsplit(vcf_header,"\t"))
|
|
160
|
|
161 #extract data (hashed vcf header skipped with read.table)
|
|
162 vcf_rtable<-read.table(filename,sep="\t",stringsAsFactors=FALSE)
|
|
163 names(vcf_rtable)<-vcf_colnames
|
|
164
|
|
165 ######################
|
|
166 ###PREPARE DATA
|
|
167
|
|
168 vcfinfo_dat<-NULL
|
|
169 vcfinfo_pdat<-NULL
|
|
170 multiallele_counter<-0
|
|
171 diviserror_counter<-0
|
|
172 for(i in c(1:nrow(vcf_rtable))){
|
|
173 vcf_line<-vcf_rtable[i,]
|
|
174 #to speed up runtime- quality filter here
|
|
175 if(vcf_line$QUAL>=read_qual){
|
|
176 #remove chrom or chr prefix from chromosome value
|
|
177 if(grepl("chrom",vcf_line$CHROM,ignore.case=TRUE)==TRUE){
|
|
178 vcf_line$CHROM<-gsub("chrom","",vcf_line$CHROM,ignore.case=TRUE)
|
|
179 }else if(grepl("chr",vcf_line$CHROM,ignore.case=TRUE)==TRUE){
|
|
180 vcf_line$CHROM<-gsub("chr","",vcf_line$CHROM,ignore.case=TRUE)
|
|
181 }
|
|
182 #PARSE INFO
|
|
183 vcfinfo_split<-strsplit(vcf_line$INFO,split=";")
|
|
184 vcfinfo_coln<-gsub("=.*","",unlist(vcfinfo_split))
|
|
185 vcfinfo_cold<-gsub(".*=","",unlist(vcfinfo_split))
|
|
186 vcfinfo_ldat<-data.frame(t(vcfinfo_cold),stringsAsFactors=FALSE)
|
|
187 names(vcfinfo_ldat)<-vcfinfo_coln
|
|
188
|
|
189 #skip if commas in values to avoid returning errors
|
|
190 if(grepl(",",vcfinfo_ldat$AO)==TRUE){
|
|
191 multiallele_counter<-multiallele_counter+1
|
|
192 next
|
|
193 }
|
|
194 #skip divide by zero errors (under "ratio" setting for ratio calculation)
|
|
195 if(as.numeric(vcfinfo_ldat$AO)+as.numeric(vcfinfo_ldat$RO)=="0"){
|
|
196 diviserror_counter<-diviserror_counter+1
|
|
197 next
|
|
198 }
|
|
199
|
|
200 #specific accounting for nonstandard categories
|
|
201 #LOF columns only present for loss-of-function variants + assign NA values to all other variants
|
|
202 if(("LOF" %in% names(vcfinfo_ldat))==TRUE){
|
|
203 LOF<-vcfinfo_ldat$LOF
|
|
204 vcfinfo_ldat<-vcfinfo_ldat[,!names(vcfinfo_ldat) %in% "LOF"]
|
|
205 vcfinfo_ldat<-cbind(vcfinfo_ldat,LOF)
|
|
206 }else{
|
|
207 LOF<-"NA"
|
|
208 vcfinfo_ldat<-cbind(vcfinfo_ldat,LOF)
|
|
209 }
|
|
210 #NMD columns only present for nonsense-mediated-decay variants + assign NA values to all other variants
|
|
211 if(("NMD" %in% names(vcfinfo_ldat))==TRUE){
|
|
212 NMD<-vcfinfo_ldat$NMD
|
|
213 vcfinfo_ldat<-vcfinfo_ldat[,!names(vcfinfo_ldat) %in% "NMD"]
|
|
214 vcfinfo_ldat<-cbind(vcfinfo_ldat,NMD)
|
|
215 }else{
|
|
216 NMD<-"NA"
|
|
217 vcfinfo_ldat<-cbind(vcfinfo_ldat,NMD)
|
|
218 }
|
|
219
|
|
220 #general accounting for nonstandard categories
|
|
221
|
|
222
|
|
223 #PARSE ANNOTATION
|
|
224 ann_rparsed<-unlist(strsplit(vcfinfo_ldat$ANN[1],split="\\|"))[1:20]
|
|
225 ann_rparsed[ann_rparsed==""]<-"novalue"
|
|
226 ann_parsed<-data.frame(t(ann_rparsed),stringsAsFactors=FALSE)
|
|
227 names(ann_parsed)<-paste("ANN",c(1:dim(ann_parsed)[2]),sep="")
|
|
228 #remove duplicate redundant INFO column (fully parsed)
|
|
229 vcf_line<-vcf_line[,names(vcf_line)!="INFO"]
|
|
230
|
|
231 #dataset keeping unparsed annotation (full)
|
|
232 vcfinfo_pldat<-cbind(vcf_line,vcfinfo_ldat)
|
|
233 vcfinfo_pdat<-rbind(vcfinfo_pdat,vcfinfo_pldat)
|
|
234
|
|
235 #dataset keeping parsed annotation (partial parsed-for relevant)
|
|
236 vcfinfo_ldat<-vcfinfo_ldat[,names(vcfinfo_ldat)!="ANN"]
|
|
237 #append copy of original data to parsed data
|
|
238 vcfinfo_lldat<-cbind(vcf_line,vcfinfo_ldat,ann_parsed)
|
|
239 vcfinfo_dat<-rbind(vcfinfo_dat,vcfinfo_lldat)
|
|
240 }
|
|
241 }
|
|
242 print(paste("rows with multiple alleles skipped: ",multiallele_counter,sep=""))
|
|
243 # print(paste("rows with AO+RO=0 (not multiple alleles) skipped: ",diviserror_counter,sep=""))
|
|
244
|
|
245 #ENSURE CORRECT DATATYPES
|
|
246 #convert dataframe columns of factor type to character type
|
|
247 vcfinfo_dat<-data.frame(lapply(vcfinfo_dat,as.character),stringsAsFactors=FALSE)
|
|
248 #convert to numeric if column is numeric and not string
|
|
249 for(n in c(1:dim(vcfinfo_dat)[2])){
|
|
250 #suppress warnings when columns with strings encountered (not converted)
|
|
251 suppressWarnings(
|
|
252 colnum_index<-!is.na(as.numeric(vcfinfo_dat[,n]))
|
|
253 )
|
|
254 if(all(colnum_index)==TRUE){
|
|
255 vcfinfo_dat[,n]<-as.numeric(vcfinfo_dat[,n])
|
|
256 }
|
|
257 }
|
|
258
|
|
259 ######################
|
|
260 #RATIO CALCULATION
|
|
261 #ratio calculation from AO and RO
|
|
262 RATIO<-c(vcfinfo_dat$AO/(vcfinfo_dat$AO+vcfinfo_dat$RO))
|
|
263 #add adj_AB for AB=0->AO=1 conversion
|
|
264 adj_AB<-replace(vcfinfo_dat$AB,vcfinfo_dat$AB==0,1)
|
|
265 vcfinfo_dat<-cbind(vcfinfo_dat,RATIO,adj_AB)
|
|
266 vcfinfo_dat<-vcfinfo_dat[with(vcfinfo_dat,order(CHROM,POS)),]
|
|
267
|
|
268 #CONSIDER ONLY SNP VARIANTS
|
|
269 if(snp_only==TRUE){
|
|
270 vcfinfo_dat<-subset(vcfinfo_dat,CIGAR=="1X")
|
|
271 }
|
|
272
|
|
273 ######################
|
|
274 #SUBTRACT VARIANTS FROM EXCLUSION LIST
|
|
275 # if(length(exclusion_list)>0){
|
|
276 # if(exclusion_list!="FALSE"){
|
|
277 if(exclusion_list!="FALSE"){
|
|
278 #keep copy of pre-subtraction data for later plotting
|
|
279 vcfinfo_origdat<-vcfinfo_dat
|
|
280
|
|
281 #identifiers for exclusion list based on CHROM/POS/REF/ALT
|
|
282 index1<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,sep="_")
|
|
283 index2<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,vcfinfo_dat$REF,sep="_")
|
|
284 index3<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,vcfinfo_dat$REF,vcfinfo_dat$ALT,sep="_")
|
|
285 vcfinfo_dat<-cbind(vcfinfo_dat,index1,index2,index3)
|
|
286
|
|
287 print(paste("before subtraction: ",nrow(vcfinfo_dat),sep=""))
|
|
288 #loop and subtract through exclusion lists (if multiple files)
|
|
289 for(exclusion_ind in exclusion_list){
|
|
290 exclin<-read.table(exclusion_ind,header=TRUE)
|
|
291
|
|
292 #THRESHOLD FILTER ON EXCLUSION LIST VARIANTS
|
|
293 if(allele_ratio=="AB"){
|
|
294 exclin<-subset(exclin,adj_AB>=excl_threshold)
|
|
295 }
|
|
296 if(allele_ratio=="ratio"){
|
|
297 exclin<-subset(exclin,ratio>=excl_threshold)
|
|
298 }
|
|
299
|
|
300 #identifiers for vcf data based on CHROM/POS/REF/ALT
|
|
301 index1<-paste(exclin$CHROM,exclin$POS,sep="_")
|
|
302 index2<-paste(exclin$CHROM,exclin$POS,exclin$REF,sep="_")
|
|
303 index3<-paste(exclin$CHROM,exclin$POS,exclin$REF,exclin$ALT,sep="_")
|
|
304 exclin<-cbind(exclin,index1,index2,index3)
|
|
305 #exclude based on CHROM+POS+REF+ALT
|
|
306 vcfinfo_dat<-subset(vcfinfo_dat,!(index3 %in% exclin$index3))
|
|
307 }
|
|
308 print(paste("after subtraction: ",nrow(vcfinfo_dat),sep=""))
|
|
309 }
|
|
310
|
|
311 ######################
|
|
312 #WRITE TO OUTPUT
|
|
313 #select relevant columns 2 variant type; 4 gene; !5 wormbase ID; 8 type change; 10 nucleotide change; 11 amino acid change; 16 warning message
|
|
314 vcfinfo_simp<-subset(vcfinfo_dat,select=c("CHROM","POS","QUAL","DP","REF","ALT","AB","AO","RO","RATIO","adj_AB","ANN2","ANN4","ANN8","ANN10","ANN11","ANN16"))
|
|
315 names(vcfinfo_simp)<-c("CHROM","POS","QUAL","DP","REF","ALT","AB","AO","RO","RATIO","adj_AB","VARTYPE","GENE","TYPE","NTCHANGE","PRCHANGE","WARNINGS")
|
|
316 vcfsimp_dat<-vcfinfo_simp[with(vcfinfo_simp,order(CHROM,POS)),]
|
|
317
|
|
318 #write table with quality filtered variants for VDM plotting and relevant columns
|
|
319 try(
|
|
320 write.table(vcfsimp_dat,vcfoutput_filename,sep="\t",quote=FALSE,row.names=FALSE)
|
|
321 ,silent=TRUE)
|
|
322 #write table with all unfiltered variants and all columns including parsed INFO
|
|
323 try(
|
|
324 write.table(vcfinfo_pdat,vcfparsed_filename,sep="\t",quote=FALSE,row.names=FALSE)
|
|
325 ,silent=TRUE)
|
|
326
|
|
327
|
|
328 ######################
|
|
329 ###CHROMOSOME (INTERVAL) ARRANGEMENT
|
|
330 #define chromosome and chromosome size in Mb
|
|
331 if(interval_type == 'C.elegans'){
|
|
332 chrom_n<-c('I','II','III','IV','V','X')
|
|
333 chrom_mb<-c(16,16,14,18,21,18)
|
|
334 interval_frame<-data.frame(chrom_n,chrom_mb)
|
|
335 } else if(interval_type == 'Zebrafish'){
|
|
336 chrom_n<-c('1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25')
|
|
337 chrom_mb<-c(61,61,64,63,76,60,78,57,59,47,47,51,55,54,48,59,54,50,51,56,45,43,47,44,39)
|
|
338 interval_frame<-data.frame(chrom_n,chrom_mb)
|
|
339 } else if(interval_type == 'Brachypodium'){
|
|
340 chrom_n<-c('1','2','3','4','5')
|
|
341 chrom_mb<-c(75,60,60,50,30)
|
|
342 interval_frame<-data.frame(chrom_n,chrom_mb)
|
|
343 } else if(interval_type == 'Arabidopsis'){
|
|
344 chrom_n<-c('1','2','3','4','5')
|
|
345 chrom_mb<-c(31, 20,24,19,27 )
|
|
346 interval_frame<-data.frame(chrom_n,chrom_mb)
|
|
347 } else{
|
|
348 #user interval file- no headers, with chromosome in column 1 (format CHR# or CHROM#) and size in Mb (rounded up) in column 2
|
|
349 # user_interval_type<-read.table(user_interval_file)
|
|
350 user_interval_type<-read.table(interval_type)
|
|
351 if(grepl("chrom",user_interval_type[1,1],ignore.case=TRUE)==TRUE){
|
|
352 user_interval_type[,1]<-gsub("chrom","",user_interval_type[,1],ignore.case=TRUE)
|
|
353 }else if(grep("chr",user_interval_type[1,1],ignore.case=TRUE)==TRUE){
|
|
354 user_interval_type[,1]<-gsub("chr","",user_interval_type[,1],ignore.case=TRUE)
|
|
355 }
|
|
356 chrom_n<-user_interval_type[,1]
|
|
357 chrom_mb<-user_interval_type[,2]
|
|
358 interval_frame<-data.frame(chrom_n,chrom_mb)
|
|
359 }
|
|
360 names(interval_frame)<-c("CHROM","INTERVAL")
|
|
361
|
|
362
|
|
363 ######################
|
|
364 ###PLOTTING
|
|
365 #VDM SCATTER PLOT
|
|
366 #save to pdf
|
|
367 pdf(file=pdf_filename,width=9,height=8)
|
|
368 #par(mfrow=c(2,3))
|
|
369 for(chromind in interval_frame$CHROM){
|
|
370 #subset by data by chromosome for plotting
|
|
371 intervalind<-interval_frame$INTERVAL[interval_frame$CHROM==chromind]
|
|
372 chr_dat<-subset(vcfsimp_dat,CHROM==chromind,silent=TRUE)
|
|
373
|
|
374 #same subsetting by chromosome for pre-subtraction data
|
|
375 # if(length(exclusion_list)>0){
|
|
376 if(exclusion_list!="FALSE"){
|
|
377 chr_origdat<-subset(vcfinfo_origdat,CHROM==chromind,silent=TRUE)
|
|
378 }
|
|
379
|
|
380 #define x-axis upper limit
|
|
381 if(xaxis_standard==TRUE){
|
|
382 #for standardized x-axis (max x-axis chromosome length)
|
|
383 scupper_xaxis<-max(interval_frame$INTERVAL)
|
|
384 scupper_xval<-scupper_xaxis*interval_unit
|
|
385 } else if(xaxis_standard==FALSE){
|
|
386 scupper_xaxis<-intervalind
|
|
387 scupper_xval<-intervalind*interval_unit
|
|
388 }
|
|
389
|
|
390 if(allele_ratio=="AB"){
|
|
391 plot(chr_dat$POS,chr_dat$adj_AB,cex=0.60,xlim=c(0,scupper_xval),ylim=c(0,sp_yaxis),main=paste("Chr",chromind," Variant Discovery Mapping",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Ratio of Variant Reads/Total Reads [AB]',pch=10, col=plot_color,xaxt='n')
|
|
392 try(lines(loess.smooth(chr_dat$POS,chr_dat$adj_AB,span=loess_span),lwd=5,col=loess_color))
|
|
393 #plot loess curve for data without subtraction of exclusion variants
|
|
394 # if(length(exclusion_list)>0){
|
|
395 if(exclusion_list!="FALSE"){
|
|
396 try(lines(loess.smooth(chr_origdat$POS,chr_origdat$adj_AB,span=loess_span),lty="longdash",lwd=4,col=excl_loess_color))
|
|
397 }
|
|
398
|
|
399 axis(1,at=seq(0,scupper_xval,by=interval_unit),labels=c(0:scupper_xaxis))
|
|
400 abline(h=seq(0,sp_yaxis,by=0.1),v=c(1:scupper_xaxis)*interval_unit,col="gray")
|
|
401 } else if(allele_ratio=="ratio"){
|
|
402 plot(chr_dat$POS,chr_dat$RATIO,cex=0.60,xlim=c(0,scupper_xval),ylim=c(0,sp_yaxis),main=paste("Chr",chromind," Variant Discovery Mapping",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Ratio of Variant Reads/Total Reads [ratio]',pch=10, col=plot_color,xaxt='n')
|
|
403 try(lines(loess.smooth(chr_dat$POS,chr_dat$RATIO,span=loess_span),lwd=5,col=loess_color))
|
|
404 #plot loess curve for data without subtraction of exclusion variants
|
|
405 # if(length(exclusion_list)>0){
|
|
406 if(exclusion_list!="FALSE"){
|
|
407 try(lines(loess.smooth(chr_origdat$POS,chr_origdat$adj_AB,span=loess_span),lty="longdash",lwd=4,col=excl_loess_color))
|
|
408 }
|
|
409 axis(1,at=seq(0,scupper_xval,by=interval_unit),labels=c(0:scupper_xaxis))
|
|
410 abline(h=seq(0,sp_yaxis,by=0.1),v=c(1:scupper_xaxis)*interval_unit,col="gray")
|
|
411 }
|
|
412 }
|
|
413
|
|
414 ######################
|
|
415 #graph barplots
|
|
416 location_index<-NULL
|
|
417 meanSNP_dat<-NULL
|
|
418 # prepare table of counts and calculations
|
|
419 for(chromind in interval_frame$CHROM){
|
|
420 #for standardized x-axis
|
|
421 if(xaxis_standard==TRUE){
|
|
422 intervalind<-max(interval_frame$INTERVAL)*1000000/bin_size
|
|
423 } else if(xaxis_standard==FALSE){
|
|
424 intervalind<-interval_frame$INTERVAL[interval_frame$CHROM==chromind]*1000000/bin_size
|
|
425 }
|
|
426 #start intervals with **1 and end with **0
|
|
427 interval_begin<-c(((0:(intervalind-1))*bin_size)+1)
|
|
428 interval_end<-c((1:intervalind)*bin_size)
|
|
429
|
|
430 #define x-axis upper limit
|
|
431 if(xaxis_standard==TRUE){
|
|
432 upper_xaxis<-max(interval_frame$INTERVAL)
|
|
433 } else if(xaxis_standard==FALSE){
|
|
434 upper_xaxis<-interval_frame$INTERVAL[interval_frame$CHROM==chromind]
|
|
435 }
|
|
436 #prepare columns
|
|
437 snp_counter<-0
|
|
438 purealt_counter<-0
|
|
439 pureref_counter<-0
|
|
440 het_counter<-0
|
|
441 chr_mean<-0
|
|
442 normed_freq<-0
|
|
443 ratio<-0
|
|
444
|
|
445 interval_index<-data.frame(chromind,interval_begin,interval_end,snp_counter,purealt_counter,pureref_counter,het_counter,chr_mean,normed_freq)
|
|
446 chr_dat<-subset(vcfinfo_dat,CHROM==chromind)
|
|
447 #ratio calculation
|
|
448 ratio<-chr_dat$AO/(chr_dat$AO+chr_dat$RO)
|
|
449
|
|
450 #if counter based on adj_AB or ratio
|
|
451 if(allele_ratio=="ratio"){
|
|
452 chr_purealtdat<-subset(chr_dat,ratio>=threshold_upper)#;chr_purealtdat
|
|
453 chr_purerefdat<-subset(chr_dat,ratio<=threshold_lower)#;chr_purerefdat
|
|
454 chr_hetdat<-subset(chr_dat,ratio>threshold_lower & ratio<threshold_upper)#;chr_hetdat
|
|
455 } else if(allele_ratio=="AB"){
|
|
456 chr_purealtdat<-subset(chr_dat,adj_AB>=threshold_upper)#;chr_purealtdat
|
|
457 chr_purerefdat<-subset(chr_dat,adj_AB<=threshold_lower)#;chr_purerefdat
|
|
458 chr_hetdat<-subset(chr_dat,adj_AB>threshold_lower & adj_AB<threshold_upper)#;chr_hetdat
|
|
459 }
|
|
460 #if chromosome with data, count number of snps within each bin (positions rounded up to nearest bin), else skip to next chromosome
|
|
461 if(dim(chr_dat)[1]>0){
|
|
462 for(i in 1:dim(chr_dat)[1]){
|
|
463 chr_datind<-chr_dat[i,]
|
|
464 #round up to nearest bin-size interval
|
|
465 chr_datind_upper<-ceiling(chr_datind$POS/bin_size)*bin_size
|
|
466 interval_coln<-NULL;interval_rown<-NULL
|
|
467 #identify row and and counter column to increment
|
|
468 interval_coln<-which(names(interval_index)=="snp_counter")
|
|
469 interval_rown<-match(chr_datind_upper,interval_index$interval_end)
|
|
470 interval_index[interval_rown,interval_coln]<-c(interval_index$snp_counter[interval_rown]+1)
|
|
471 }
|
|
472 }else{
|
|
473 next
|
|
474 }
|
|
475 ##if chromosome with pure AO, count number of snps with each bin (positions rounded up to nearest bin)
|
|
476 if(dim(chr_purealtdat)[1]>0){
|
|
477 for(i in 1:dim(chr_purealtdat)[1]){
|
|
478 chr_purealtind<-chr_purealtdat[i,]
|
|
479 chr_purealtind_upper<-ceiling(chr_purealtind$POS/bin_size)*bin_size
|
|
480 interval_coln<-NULL;interval_rown<-NULL
|
|
481 interval_coln<-which(names(interval_index)=="purealt_counter")
|
|
482 interval_rown<-match(chr_purealtind_upper,interval_index$interval_end)
|
|
483 interval_index[interval_rown,interval_coln]<-c(interval_index$purealt_counter[interval_rown]+1)
|
|
484 }
|
|
485 }
|
|
486 #if chromosome with pure RO, count number of snps with each bin (positions rounded up to nearest bin)
|
|
487 if(dim(chr_purerefdat)[1]>0){
|
|
488 for(i in 1:dim(chr_purerefdat)[1]){
|
|
489 chr_purerefind<-chr_purerefdat[i,]
|
|
490 chr_purerefind_upper<-ceiling(chr_purerefind$POS/bin_size)*bin_size
|
|
491 interval_coln<-NULL;interval_rown<-NULL
|
|
492 interval_coln<-which(names(interval_index)=="pureref_counter")
|
|
493 interval_rown<-match(chr_purerefind_upper,interval_index$interval_end)
|
|
494 interval_index[interval_rown,interval_coln]<-c(interval_index$pureref_counter[interval_rown]+1)
|
|
495 }
|
|
496 }
|
|
497 #if chromosome with hets, count number of snps with each bin (positions rounded up to nearest bin)
|
|
498 if(dim(chr_hetdat)[1]>0){
|
|
499 for(i in 1:dim(chr_hetdat)[1]){
|
|
500 chr_hetind<-chr_hetdat[i,]
|
|
501 chr_hetind_upper<-ceiling(chr_hetind$POS/bin_size)*bin_size
|
|
502 interval_coln<-NULL;interval_rown<-NULL
|
|
503 interval_coln<-which(names(interval_index)=="het_counter")
|
|
504 interval_rown<-match(chr_hetind_upper,interval_index$interval_end)
|
|
505 interval_index[interval_rown,interval_coln]<-c(interval_index$het_counter[interval_rown]+1)
|
|
506 }
|
|
507 }
|
|
508 #irrespective of standardized x-axis, mean should be calculated from actual interval range of chromosome
|
|
509 chr_mean<-sum(interval_index$purealt_counter)/(interval_frame$INTERVAL[interval_frame$CHROM==chromind]*1000000/bin_size)
|
|
510 interval_index$chr_mean<-chr_mean
|
|
511 meanSNP_dat<-rbind(meanSNP_dat,data.frame(chromind,chr_mean))
|
|
512 #normalization treatment for if SNPs are AO=0, AO=total SNPs, or AO and RO in bin
|
|
513 for(i in 1:dim(interval_index)[1]){
|
|
514 chr_intind<-interval_index[i,]
|
|
515 #hom definition based on specified upper and lower thresholds
|
|
516 if(chr_intind$purealt_counter<=threshold_lower){
|
|
517 interval_coln<-NULL
|
|
518 interval_coln<-which(names(interval_index)=="normed_freq")
|
|
519 interval_index[i,interval_coln]=0
|
|
520 } else if (chr_intind$purealt_counter==chr_intind$snp_counter){
|
|
521 interval_coln<-NULL
|
|
522 interval_coln<-which(names(interval_index)=="normed_freq")
|
|
523 interval_index[i,interval_coln]=(chr_intind$purealt_counter)^2/chr_mean
|
|
524 } else {
|
|
525 interval_coln<-NULL
|
|
526 interval_coln<-which(names(interval_index)=="normed_freq")
|
|
527 interval_index[i,interval_coln]=chr_mean*(chr_intind$purealt_counter)^2/(chr_intind$snp_counter-chr_intind$purealt_counter)
|
|
528 }
|
|
529 }
|
|
530 location_index<-rbind(location_index,interval_index)
|
|
531 }
|
|
532
|
|
533 for(chromind in interval_frame$CHROM){
|
|
534 interval_index<-location_index[location_index$chromind==chromind,]
|
|
535 #assign 0 values to avoid empty datatable error
|
|
536 if(dim(interval_index)[1]==0){
|
|
537 interval_index[1,]<-rep(0,dim(interval_index)[2])
|
|
538 }
|
|
539 # }
|
|
540 #set up x_axis
|
|
541 if(xaxis_standard==TRUE){
|
|
542 #for standardized x-axis (max x-axis chromosome length)
|
|
543 bpupper_xaxis<-max(interval_frame$INTERVAL)
|
|
544 bpupper_xval<-bpupper_xaxis*interval_unit
|
|
545 }else if(xaxis_standard==FALSE){
|
|
546 bpupper_xaxis<-intervalind
|
|
547 bpupper_xval<-intervalind*interval_unit
|
|
548 }
|
|
549 #set up y_axis range for barplots
|
|
550 if(bfreq_norm==TRUE){
|
|
551 bp_yaxis<-5*ceiling(max(location_index$normed_freq)/5)
|
|
552 #assign non-0 value to yaxis to avoid error
|
|
553 if(bp_yaxis==0){
|
|
554 bp_yaxis<-10
|
|
555 }
|
|
556 if(xaxis_standard==TRUE){
|
|
557 bplot<-barplot(interval_index$normed_freq,space=0,ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Normalised Frequency')
|
|
558 }else if(xaxis_standard==FALSE){
|
|
559 bplot<-barplot(interval_index$normed_freq,space=0,xlim=c(0,bpupper_xaxis),ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Normalised Frequency')
|
|
560 }
|
|
561
|
|
562 }else if(bfreq_norm==FALSE){
|
|
563 bp_yaxis<-5*ceiling(max(location_index$purealt_counter)/5)
|
|
564 #assign non-0 value to yaxis to avoid error
|
|
565 if(bp_yaxis==0){
|
|
566 bp_yaxis<-10
|
|
567 }
|
|
568 if(xaxis_standard==TRUE){
|
|
569 bplot<-barplot(interval_index$purealt_counter,space=0,ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Frequency')
|
|
570 }else if(xaxis_standard==FALSE){
|
|
571 bplot<-barplot(interval_index$purealt_counter,space=0,xlim=c(0,bpupper_xaxis),ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Frequency')
|
|
572 }
|
|
573 }
|
|
574 bp_xaxis1<-as.numeric(bplot)
|
|
575 bp_xaxis2<-c(bp_xaxis1,tail(bp_xaxis1,1)+bp_xaxis1[2]-bp_xaxis1[1])
|
|
576 bp_xaxis<-bp_xaxis2-bp_xaxis1[1]
|
|
577
|
|
578 axis(1,at=bp_xaxis,labels=seq(0,bpupper_xaxis,by=c(bin_size/1000000)))
|
|
579
|
|
580 }
|
|
581 dev.off()
|
|
582 }
|
|
583
|
|
584
|
|
585 # myfunction(options$inf,options$itype,options$qual,options$thrup,options$thrlow,options$allr,options$snp,options$lsp,options$pcol,options$lcol,options$xstand,
|
|
586 # options$bsize,options$bnorm,options$exclf,options$exclthr,options$exclcol,options$parn,options$outn,options$pdfn)
|
|
587
|
|
588
|
|
589 myfunction(inf=options$inf,itype=options$itype,qual=options$qual,thrup=options$thrup,thrlow=options$thrlow,
|
|
590 allr=options$allr,snp=options$snp,lsp=options$lsp,pcol=options$pcol,lcol=options$lcol,xstand=options$xstand,bsize=options$bsize,
|
|
591 bnorm=options$bnorm,exclf=options$exclf,exclthr=options$exclthr,exclcol=options$exclcol,parn=options$parn,outn=options$outn,pdfn=options$pdfn)
|
|
592
|