Previous changeset 0:c33d93683a09 (2016-10-13) Next changeset 2:e85fec274cde (2016-10-27) |
Commit message:
Uploaded |
modified:
aa_histogram.r merge_and_filter.r pattern_plots.r shm_csr.py shm_csr.r shm_csr.xml wrapper.sh |
b |
diff -r c33d93683a09 -r faae21ba5c63 aa_histogram.r --- a/aa_histogram.r Thu Oct 13 10:52:24 2016 -0400 +++ b/aa_histogram.r Tue Oct 25 07:28:43 2016 -0400 |
b |
@@ -37,7 +37,7 @@ print("---------------- plot ----------------") - m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=13, colour="black")) m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i) m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1") m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1") |
b |
diff -r c33d93683a09 -r faae21ba5c63 merge_and_filter.r --- a/merge_and_filter.r Thu Oct 13 10:52:24 2016 -0400 +++ b/merge_and_filter.r Tue Oct 25 07:28:43 2016 -0400 |
[ |
@@ -32,7 +32,6 @@ gene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100 gene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")] colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match") - } input.sequence.count = nrow(summ) @@ -62,32 +61,9 @@ summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",] } -print(paste("Number of sequences after productive filter:", nrow(summ))) - -filtering.steps = rbind(filtering.steps, c("After productive filter", nrow(summ))) - -splt = strsplit(class.filter, "_")[[1]] -chunk_hit_threshold = as.numeric(splt[1]) -nt_hit_threshold = as.numeric(splt[2]) - -higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold) - -unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] +print(paste("Number of sequences after functionality filter:", nrow(summ))) -if(!all(higher_than, na.rm=T)){ #check for 'not all' because that would mean the unmatched set is empty - unmatched = summ[!higher_than,] - unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] - unmatched$best_match = paste("unmatched,", unmatched$best_match) - summ[!higher_than,"best_match"] = paste("unmatched,", summ[!higher_than,"best_match"]) -} - -if(any(higher_than, na.rm=T)){ - #summ = summ[higher_than,] -} - -if(nrow(summ) == 0){ - stop("No data remaining after filter") -} +filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ))) result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") @@ -114,22 +90,16 @@ result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) result$JGene = gsub("[*].*", "", result$JGene) -result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":")) - -result = result[!(duplicated(result$past)), ] - -result = result[,!(names(result) %in% c("past"))] - -print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result))) - -filtering.steps = rbind(filtering.steps, c("After duplicate filter", nrow(result))) - print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == ""))) print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == ""))) print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == ""))) print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == ""))) -if(empty.region.filter == "FR1"){ +if(empty.region.filter == "leader"){ + result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] + print(paste("Number of sequences after empty FR1, CDR1, FR2, CDR2 and FR3 column filter:", nrow(result))) + filtering.steps = rbind(filtering.steps, c("After empty FR1, CDR1, FR2, CDR2, FR3 filter", nrow(result))) +} else if(empty.region.filter == "FR1"){ result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result))) filtering.steps = rbind(filtering.steps, c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result))) @@ -143,7 +113,12 @@ filtering.steps = rbind(filtering.steps, c("After empty CDR2, FR3 filter", nrow(result))) } -if(empty.region.filter == "FR1"){ +print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result))) +print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),]))) + +if(empty.region.filter == "leader"){ + result = result[!(grepl("n|N", result$FR1.IMGT.seq) | grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] +} else if(empty.region.filter == "FR1"){ result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] } else if(empty.region.filter == "CDR1"){ result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] @@ -171,7 +146,9 @@ if(filter.unique != "no"){ clmns = names(result) - if(empty.region.filter == "FR1"){ + if(empty.region.filter == "leader"){ + result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) + } else if(empty.region.filter == "FR1"){ result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) } else if(empty.region.filter == "CDR1"){ rresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) @@ -199,10 +176,41 @@ #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T) } -print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result))) -print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),]))) +filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result))) + + +splt = strsplit(class.filter, "_")[[1]] +chunk_hit_threshold = as.numeric(splt[1]) +nt_hit_threshold = as.numeric(splt[2]) + +higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold) + +unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] + +if(!all(higher_than, na.rm=T)){ #check for 'not all' because that would mean the unmatched set is empty + unmatched = summ[!higher_than,] + unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] + unmatched$best_match = paste("unmatched,", unmatched$best_match) + summ[!higher_than,"best_match"] = paste("unmatched,", summ[!higher_than,"best_match"]) +} -filtering.steps = rbind(filtering.steps, c("After unique filter", nrow(result))) +if(any(higher_than, na.rm=T)){ + #summ = summ[higher_than,] +} + +if(nrow(summ) == 0){ + stop("No data remaining after filter") +} + +result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":")) + +result = result[!(duplicated(result$past)), ] + +result = result[,!(names(result) %in% c("past"))] + +print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result))) + +filtering.steps = rbind(filtering.steps, c("After remove duplicates based on filter", nrow(result))) print(paste("Number of rows in result:", nrow(result))) print(paste("Number of rows in unmatched:", nrow(unmatched))) |
b |
diff -r c33d93683a09 -r faae21ba5c63 pattern_plots.r --- a/pattern_plots.r Thu Oct 13 10:52:24 2016 -0400 +++ b/pattern_plots.r Tue Oct 25 07:28:43 2016 -0400 |
b |
@@ -45,7 +45,7 @@ write.table(data1, plot1.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) p = ggplot(data1, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of mutations") + guides(fill=guide_legend(title=NULL)) -p = p + theme(panel.background = element_rect(fill = "white", colour="black")) + scale_fill_manual(values=c("RGYW.WRCY" = "white", "TW.WA" = "blue4")) +p = p + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("RGYW.WRCY" = "white", "TW.WA" = "blue4")) #p = p + scale_colour_manual(values=c("RGYW.WRCY" = "black", "TW.WA" = "blue4")) png(filename=plot1.png, width=480, height=300) print(p) @@ -79,7 +79,7 @@ write.table(data2, plot2.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) p = ggplot(data2, aes(x=Class, y=value, fill=Type)) + geom_bar(position="fill", stat="identity", colour = "black") + scale_y_continuous(labels=percent_format()) + guides(fill=guide_legend(title=NULL)) + ylab("% of mutations") -p = p + theme(panel.background = element_rect(fill = "white", colour="black")) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white")) +p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white")) #p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black")) png(filename=plot2.png, width=480, height=300) print(p) @@ -115,7 +115,7 @@ write.table(data3, plot3.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) p = ggplot(data3, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of nucleotides") + guides(fill=guide_legend(title=NULL)) -p = p + theme(panel.background = element_rect(fill = "white", colour="black")) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white")) +p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white")) #p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black")) png(filename=plot3.png, width=480, height=300) print(p) |
b |
diff -r c33d93683a09 -r faae21ba5c63 shm_csr.py --- a/shm_csr.py Thu Oct 13 10:52:24 2016 -0400 +++ b/shm_csr.py Tue Oct 25 07:28:43 2016 -0400 |
[ |
@@ -7,15 +7,14 @@ parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") parser.add_argument("--genes", help="The genes available in the 'best_match' column") -parser.add_argument("--includefr1", help="Should the mutation/nucleotides in the FR1 region be included?") +parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2']) parser.add_argument("--output", help="Output file") args = parser.parse_args() infile = args.input genes = str(args.genes).split(",") -print "includefr1 =", args.includefr1 -include_fr1 = True if args.includefr1 == "yes" else False +empty_region_filter = args.empty_region_filter outfile = args.output genedic = dict() @@ -59,16 +58,14 @@ ID = linesplt[IDIndex] genedic[ID] = linesplt[best_matchIndex] try: - if linesplt[fr1Index] != "NA": - mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else [] - else: - mutationdic[ID + "_FR1"] = [] - mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if linesplt[cdr1Index] != "NA" else [] - mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if linesplt[fr2Index] != "NA" else [] - mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if linesplt[cdr2Index] != "NA" else [] + mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else [] + mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else [] + mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else [] + mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else [] mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else [] - except e: + except Exception as e: + print "Something went wrong while processing this line:" print linesplt print linecount print e @@ -189,8 +186,6 @@ linesplt = line.split("\t") gene = linesplt[best_matchIndex] ID = linesplt[IDIndex] - if ID == "ca2": - print linesplt RGYW = [(int(x), int(y), z) for (x, y, z) in [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] WRCY = [(int(x), int(y), z) for (x, y, z) in @@ -201,8 +196,7 @@ [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 - mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[ - ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] for mutation in mutationList: frm, where, to, AAfrm, AAwhere, AAto, junk = mutation mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) @@ -271,7 +265,7 @@ o.write(typ + " (%)") curr = dic[typ] for gene in genes: - geneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop.... + geneMatcher = geneMatchers[gene] if valuedic[gene + "_" + fname] is 0: o.write(",0,0,0") else: |
b |
diff -r c33d93683a09 -r faae21ba5c63 shm_csr.r --- a/shm_csr.r Thu Oct 13 10:52:24 2016 -0400 +++ b/shm_csr.r Tue Oct 25 07:28:43 2016 -0400 |
[ |
b'@@ -1,493 +1,516 @@\n-library(data.table)\r\n-library(ggplot2)\r\n-library(reshape2)\r\n-\r\n-args <- commandArgs(trailingOnly = TRUE)\r\n-\r\n-input = args[1]\r\n-genes = unlist(strsplit(args[2], ","))\r\n-outputdir = args[3]\r\n-include_fr1 = ifelse(args[4] == "yes", T, F)\r\n-setwd(outputdir)\r\n-\r\n-dat = read.table(input, header=T, sep="\\t", fill=T, stringsAsFactors=F)\r\n-\r\n-if(length(dat$Sequence.ID) == 0){\r\n- setwd(outputdir)\r\n- result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))\r\n- row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")\r\n- write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)\r\n- transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))\r\n- row.names(transitionTable) = c("A", "C", "G", "T")\r\n- transitionTable["A","A"] = NA\r\n- transitionTable["C","C"] = NA\r\n- transitionTable["G","G"] = NA\r\n- transitionTable["T","T"] = NA\r\n- write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)\r\n- cat("0", file="n.txt")\r\n- stop("No data")\r\n-}\r\n-\r\n-cleanup_columns = c("FR1.IMGT.c.a",\r\n- "FR2.IMGT.g.t",\r\n- "CDR1.IMGT.Nb.of.nucleotides",\r\n- "CDR2.IMGT.t.a",\r\n- "FR1.IMGT.c.g",\r\n- "CDR1.IMGT.c.t",\r\n- "FR2.IMGT.a.c",\r\n- "FR2.IMGT.Nb.of.mutations",\r\n- "FR2.IMGT.g.c",\r\n- "FR2.IMGT.a.g",\r\n- "FR3.IMGT.t.a",\r\n- "FR3.IMGT.t.c",\r\n- "FR2.IMGT.g.a",\r\n- "FR3.IMGT.c.g",\r\n- "FR1.IMGT.Nb.of.mutations",\r\n- "CDR1.IMGT.g.a",\r\n- "CDR1.IMGT.t.g",\r\n- "CDR1.IMGT.g.c",\r\n- "CDR2.IMGT.Nb.of.nucleotides",\r\n- "FR2.IMGT.a.t",\r\n- "CDR1.IMGT.Nb.of.mutations",\r\n- "CDR3.IMGT.Nb.of.nucleotides",\r\n- "CDR1.IMGT.a.g",\r\n- "FR3.IMGT.a.c",\r\n- "FR1.IMGT.g.a",\r\n- "FR3.IMGT.a.g",\r\n- "FR1.IMGT.a.t",\r\n- "CDR2.IMGT.a.g",\r\n- "CDR2.IMGT.Nb.of.mutations",\r\n- "CDR2.IMGT.g.t",\r\n- "CDR2.IMGT.a.c",\r\n- "CDR1.IMGT.t.c",\r\n- "FR3.IMGT.g.c",\r\n- "FR1.IMGT.g.t",\r\n- "FR3.IMGT.g.t",\r\n- "CDR1.IMGT.a.t",\r\n- "FR1.IMGT.a.g",\r\n- "FR3.IMGT.a.t",\r\n- "FR3.IMGT.Nb.of.nucleotides",\r\n- "FR2.IMGT.t.c",\r\n- "CDR2.IMGT.g.a",\r\n- "FR2.IMGT.t.a",\r\n- "CDR1.IMGT.t.a",\r\n- "FR2.IMGT.t.g",\r\n- "FR3.IMGT.t.g",\r\n- "FR2.IMGT.Nb.of.nucleotides",\r\n- "FR1.IMGT.t.a",\r\n- "FR1.IMGT.t.g",\r\n- "FR3.IMGT.c.t",\r\n- "FR1.IMGT.t.c",\r\n- "CDR2.IMGT.a.t",\r\n- "FR2.IMGT.c.t",\r\n- "CDR1.IMGT.g.t",\r\n- "CDR2.IMGT.t.g",\r\n- "FR1.IMGT.Nb.of.nucleotides",\r\n- "CDR1.IMGT.c.g",\r\n- "CDR2.IMGT.t.c",\r\n- "FR3.IMGT.g.a",\r\n- "CDR1.IMGT.a.c",\r\n- "FR2.IMGT.c.a",\r\n- "FR3.IMGT.Nb.of.mutations",\r\n- "FR2.IMGT.c.g",\r\n- "CDR2.IMGT.g.c",\r\n- "FR1.IMGT.g.c",\r\n- "CDR2.IMGT.c.t",\r\n- "FR3.IMGT.c.a",\r\n- "CDR1.IMGT.c.a",\r\n- "CDR2.IMGT.c.g",\r\n- "CDR2.IMGT.c.a",\r\n- "FR1.IMGT.c.t",\r\n- "F'..b'q, fill=Gene))\n+\tpc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred"))\n+\tpc = pc + coord_polar(theta="y") + scale_y_continuous(breaks=NULL)\n+\tpc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))\n+\tpc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGG subclasses", "( n =", sum(genesForPlot$Freq), ")"))\n+\twrite.table(genesForPlot, "IGG.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+\tpng(filename="IGG.png")\n+\tprint(pc)\n+\tdev.off()\n+}\n+\n+print("Plotting scatterplot")\n+\n+dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)\n+dat.clss = dat\n+\n+dat.clss$best_match = substr(dat.clss$best_match, 0, 3)\n+\n+dat.clss = rbind(dat, dat.clss)\n+\n+p = ggplot(dat.clss, aes(best_match, percentage_mutations))\n+p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA)\n+p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))\n+p = p + scale_fill_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet"))\n+p = p + scale_colour_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet"))\n+\n+png(filename="scatter.png")\n+print(p)\n+dev.off()\n+\n+write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+write.table(dat, input, sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+print("Plotting frequency ranges plot")\n+\n+dat$best_match_class = substr(dat$best_match, 0, 3)\n+freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")\n+dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)\n+\n+frequency_bins_sum = data.frame(data.table(dat)[, list(class_sum=sum(.N)), by=c("best_match_class")])\n+\n+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])\n+\n+frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class")\n+\n+frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)\n+\n+p = ggplot(frequency_bins_data, aes(frequency_bins, frequency))\n+p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))\n+p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + scale_fill_manual(values=c("IGA" = "blue4", "IGG" = "olivedrab3", "IGM" = "black"))\n+\n+png(filename="frequency_ranges.png")\n+print(p)\n+dev.off()\n+\n+frequency_bins_data_by_class = frequency_bins_data\n+\n+write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "best_match_class", "frequency_bins")])\n+\n+frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class")\n+\n+frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)\n+\n+write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n' |
b |
diff -r c33d93683a09 -r faae21ba5c63 shm_csr.xml --- a/shm_csr.xml Thu Oct 13 10:52:24 2016 -0400 +++ b/shm_csr.xml Tue Oct 25 07:28:43 2016 -0400 |
b |
@@ -1,24 +1,20 @@ <tool id="shm_csr" name="SHM & CSR pipeline" version="1.0"> <description></description> <command interpreter="bash"> - wrapper.sh $in_file custom $out_file $out_file.files_path ${in_file.name} ${include_fr1} $functionality $unique $naive_output_ca $naive_output_cg $naive_output_cm $filter_uniques $class_filter $empty_region_filter + wrapper.sh $in_file custom $out_file $out_file.files_path ${in_file.name} "-" $functionality $unique $naive_output_ca $naive_output_cg $naive_output_cm $filter_uniques $class_filter $empty_region_filter </command> <inputs> <param name="in_file" type="data" label="IMGT zip file to be analysed" /> <param name="empty_region_filter" type="select" label="Sequence starts at" help="" > + <option value="leader" selected="true">Leader: include FR1, CDR1, FR2, CDR2, FR3 in filters</option> <option value="FR1" selected="true">FR1: include CDR1,FR2,CDR2,FR3 in filters</option> <option value="CDR1">CDR1: include FR2,CDR2,FR3 in filters</option> <option value="FR2">FR2: include CDR2,FR3 in filters</option> </param> - <param name="include_fr1" type="select" label="Include mutations in FR1 region" help="" > - <option value="yes">yes</option> - <option value="no" selected="true">no</option> - </param> <param name="functionality" type="select" label="Functionality filter" help="" > - <option value="productive" selected="true">Productive: Keep "productive" and "productive (see comment)"</option> - <option value="unproductive">Unproductive: Keep "unproductive" and "unproductive (see comment)"</option> - <option value="remove_unknown">Remove "unknown" and "unknown (see comment)"</option> - <option value="dont_filter">Don't filter</option> + <option value="productive" selected="true">Productive (Productive and Productive see comment)</option> + <option value="unproductive">Unproductive (Unproductive and Unproductive see comment)</option> + <option value="remove_unknown">Productive and Unproductive (Productive, Productive see comment, Unproductive, Unproductive and Unproductive see comment)</option> </param> <param name="filter_uniques" type="select" label="Filter unique sequences" help="See below for an example."> <option value="remove">Remove uniques (Based on nucleotide sequence + C)</option> @@ -42,7 +38,7 @@ <option value="60_55">>60% class and >55% subclass</option> <option value="70_0">>70% class</option> <option value="60_0">>60% class</option> - <option value="101_101">No class</option> + <option value="101_101">Do not assign (sub)class</option> </param> <conditional name="naive_output_cond"> <param name="naive_output" type="select" label="Output new IMGT archives per class into your history?"> |
b |
diff -r c33d93683a09 -r faae21ba5c63 wrapper.sh --- a/wrapper.sh Thu Oct 13 10:52:24 2016 -0400 +++ b/wrapper.sh Tue Oct 25 07:28:43 2016 -0400 |
b |
@@ -158,13 +158,13 @@ classes="IGA,IGA1,IGA2,IGG,IGG1,IGG2,IGG3,IGG4,IGM,unmatched" echo "R mutation analysis" -Rscript $dir/shm_csr.r $outdir/merged.txt $classes $outdir ${include_fr1} 2>&1 +Rscript $dir/shm_csr.r $outdir/merged.txt $classes $outdir ${empty_region_filter} 2>&1 echo "---------------- shm_csr.py ----------------" echo "---------------- shm_csr.py ----------------<br />" >> $log -python $dir/shm_csr.py --input $outdir/merged.txt --genes $classes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt +python $dir/shm_csr.py --input $outdir/merged.txt --genes $classes --empty_region_filter "${empty_region_filter}" --output $outdir/hotspot_analysis.txt echo "---------------- aa_histogram.r ----------------" echo "---------------- aa_histogram.r ----------------<br />" >> $log @@ -575,6 +575,9 @@ cd $tmp +echo "Cleaning up *.RData files" +find $outdir/baseline -name "*.RData" -type f -delete + echo "---------------- naive_output.r ----------------" echo "---------------- naive_output.r ----------------<br />" >> $log |