# HG changeset patch # User davidvanzessen # Date 1486642809 18000 # Node ID 94765af0db1fdf0f438913cceae4fe994966bbf1 # Parent d5d203d38c8a07f3cbe4003e68496373fafd0623 Uploaded diff -r d5d203d38c8a -r 94765af0db1f report_clonality/RScript.r --- a/report_clonality/RScript.r Wed Feb 01 09:48:38 2017 -0500 +++ b/report_clonality/RScript.r Thu Feb 09 07:20:09 2017 -0500 @@ -158,7 +158,7 @@ #write the complete dataset that is left over, will be the input if 'none' for clonaltype and 'no' for filterproductive write.table(PRODF, "allUnique.txt", sep="\t",quote=F,row.names=F,col.names=T) #write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T) -write.table(UNPROD, "allUnproductive.csv", sep=",",quote=F,row.names=F,col.names=T) +write.table(UNPROD, "allUnproductive.txt", sep="\t",quote=F,row.names=F,col.names=T) print("SAMPLE TABLE:") print(table(PRODF$Sample)) @@ -697,6 +697,7 @@ } bak = PRODF +bakun = UNPROD imgtcolumns = c("X3V.REGION.trimmed.nt.nb","P3V.nt.nb", "N1.REGION.nt.nb", "P5D.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "P3D.nt.nb", "N2.REGION.nt.nb", "P5J.nt.nb", "X5J.REGION.trimmed.nt.nb", "X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb") if(all(imgtcolumns %in% colnames(inputdata))) @@ -724,8 +725,15 @@ PRODF.with.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) > 2,] PRODF.no.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) < 4,] + UNPROD.with.D = UNPROD[nchar(UNPROD$Top.D.Gene, keepNA=F) > 2,] + UNPROD.no.D = UNPROD[nchar(UNPROD$Top.D.Gene, keepNA=F) < 4,] + num_median = function(x, na.rm=T) { as.numeric(median(x, na.rm=na.rm)) } - + + print("---- table prod.with.d cdr3.length ----") + print(table(PRODF.with.D$CDR3.Length, useNA="ifany")) + print(median(PRODF.with.D$CDR3.Length, na.rm=T)) + newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), @@ -740,7 +748,7 @@ Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), - Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + Median.CDR3.l=as.double(median(as.numeric(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisProd_mean_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) @@ -759,12 +767,16 @@ Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), - Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + Median.CDR3.l=as.double(median(as.numeric(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisProd_median_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) - newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N, + print("---- table unprod.with.d cdr3.length ----") + print(table(UNPROD.with.D$CDR3.Length, useNA="ifany")) + print(median(UNPROD.with.D$CDR3.Length, na.rm=T)) + + newData = data.frame(data.table(UNPROD.with.D)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), @@ -778,12 +790,12 @@ Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), - Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisUnProd_mean_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) - newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N, + newData = data.frame(data.table(UNPROD.with.D)[,list(unique=.N, VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=num_median(.SD$P3V.nt.nb, na.rm=T), N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), @@ -797,7 +809,7 @@ Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), - Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisUnProd_median_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) @@ -813,9 +825,8 @@ Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), - Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) - print(newData) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisProd_mean_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) @@ -828,12 +839,12 @@ Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), - Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisProd_median_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) - newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, + newData = data.frame(data.table(UNPROD.no.D)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), N1=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), @@ -842,12 +853,12 @@ Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), - Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisUnProd_mean_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) - newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, + newData = data.frame(data.table(UNPROD.no.D)[,list(unique=.N, VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=num_median(.SD$P3V.nt.nb, na.rm=T), N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), @@ -856,13 +867,14 @@ Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), - Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisUnProd_median_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) } PRODF = bak +UNPROD = bakun # ---------------------- D reading frame ---------------------- @@ -939,3 +951,54 @@ median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(as.numeric(.SD$CDR3.Length, na.rm=T), na.rm=T))), by=c("Sample")]) write.table(median.aa.l, "AAMedianBySample.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) + +#generate the "Sequences that are present in more than one replicate" dataset +clonaltype.in.replicates = inputdata +clonaltype = unlist(strsplit(clonaltype, ",")) +clonaltype = clonaltype[-which(clonaltype == "Sample")] + +clonaltype.in.replicates$clonaltype = do.call(paste, c(clonaltype.in.replicates[clonaltype], sep = ":")) +clonaltype.in.replicates = clonaltype.in.replicates[,c("clonaltype","Replicate", "ID", "Sequence", "Sample")] + +print(head(clonaltype.in.replicates)) + +clonaltype.counts = data.frame(table(clonaltype.in.replicates$clonaltype)) +names(clonaltype.counts) = c("clonaltype", "coincidence") + +print(head(clonaltype.counts)) + +clonaltype.counts = clonaltype.counts[clonaltype.counts$coincidence > 1,] + +head(clonaltype.counts) + +clonaltype.in.replicates = clonaltype.in.replicates[clonaltype.in.replicates$clonaltype %in% clonaltype.counts$clonaltype,] +clonaltype.in.replicates = merge(clonaltype.in.replicates, clonaltype.counts, by="clonaltype") +print(head(clonaltype.in.replicates)) +clonaltype.in.replicates = clonaltype.in.replicates[order(clonaltype.in.replicates$clonaltype),c("coincidence","clonaltype", "Sample", "Replicate", "ID", "Sequence")] + +write.table(clonaltype.in.replicates, "clonaltypes_replicates.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T) + + + + + + + + + + + + + + + + + + + + + + + + + diff -r d5d203d38c8a -r 94765af0db1f report_clonality/r_wrapper.sh --- a/report_clonality/r_wrapper.sh Wed Feb 01 09:48:38 2017 -0500 +++ b/report_clonality/r_wrapper.sh Thu Feb 09 07:20:09 2017 -0500 @@ -396,6 +396,7 @@ echo "Clonality" >> $outputFile echo "The dataset used to calculate clonality score (Unique based on clonaltype, $clonalType)Download" >> $outputFile +echo "Sequences that are present in more than one replicateDownload" >> $outputFile echo "" >> $outputFile