Galaxy |

Changeset 1:faae21ba5c63 (2016-10-25)

Previous changeset 0:c33d93683a09 (2016-10-13) Next changeset 2:e85fec274cde (2016-10-27)

Commit message:
Uploaded

modified:
aa_histogram.r
merge_and_filter.r
pattern_plots.r
shm_csr.py
shm_csr.r
shm_csr.xml
wrapper.sh

diff -r c33d93683a09 -r faae21ba5c63 aa_histogram.r
--- a/aa_histogram.r Thu Oct 13 10:52:24 2016 -0400
+++ b/aa_histogram.r Tue Oct 25 07:28:43 2016 -0400

@@ -37,7 +37,7 @@

         print("---------------- plot ----------------")

-        m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
+        m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=13, colour="black"))
         m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i)
         m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
         m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")

diff -r c33d93683a09 -r faae21ba5c63 merge_and_filter.r
--- a/merge_and_filter.r Thu Oct 13 10:52:24 2016 -0400
+++ b/merge_and_filter.r Tue Oct 25 07:28:43 2016 -0400

[

@@ -32,7 +32,6 @@
gene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100
gene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")]
colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")
-
}

input.sequence.count = nrow(summ)
@@ -62,32 +61,9 @@
summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",]
}

-print(paste("Number of sequences after productive filter:", nrow(summ)))
-
-filtering.steps = rbind(filtering.steps, c("After productive filter", nrow(summ)))
-
-splt = strsplit(class.filter, "_")[[1]]
-chunk_hit_threshold = as.numeric(splt[1])
-nt_hit_threshold = as.numeric(splt[2])
-
-higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold)
-
-unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
+print(paste("Number of sequences after functionality filter:", nrow(summ)))

-if(!all(higher_than, na.rm=T)){ #check for 'not all' because that would mean the unmatched set is empty
- unmatched = summ[!higher_than,]
- unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
- unmatched$best_match = paste("unmatched,", unmatched$best_match)
- summ[!higher_than,"best_match"] = paste("unmatched,", summ[!higher_than,"best_match"])
-}
-
-if(any(higher_than, na.rm=T)){
- #summ = summ[higher_than,]
-}
-
-if(nrow(summ) == 0){
- stop("No data remaining after filter")
-}
+filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ)))

result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID")

@@ -114,22 +90,16 @@
result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
result$JGene = gsub("[*].*", "", result$JGene)

-result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))
-
-result = result[!(duplicated(result$past)), ]
-
-result = result[,!(names(result) %in% c("past"))]
-
-print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result)))
-
-filtering.steps = rbind(filtering.steps, c("After duplicate filter", nrow(result)))
-
print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "")))
print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "")))
print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "")))
print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "")))

-if(empty.region.filter == "FR1"){
+if(empty.region.filter == "leader"){
+ result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
+ print(paste("Number of sequences after empty FR1, CDR1, FR2, CDR2 and FR3 column filter:", nrow(result)))
+ filtering.steps = rbind(filtering.steps, c("After empty FR1, CDR1, FR2, CDR2, FR3 filter", nrow(result)))
+} else if(empty.region.filter == "FR1"){
result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result)))
filtering.steps = rbind(filtering.steps, c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result)))
@@ -143,7 +113,12 @@
filtering.steps = rbind(filtering.steps, c("After empty CDR2, FR3 filter", nrow(result)))
}

-if(empty.region.filter == "FR1"){
+print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
+print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),])))
+
+if(empty.region.filter == "leader"){
+ result = result[!(grepl("n|N", result$FR1.IMGT.seq) | grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]
+} else if(empty.region.filter == "FR1"){
result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]
} else if(empty.region.filter == "CDR1"){
result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]
@@ -171,7 +146,9 @@
if(filter.unique != "no"){
clmns = names(result)

- if(empty.region.filter == "FR1"){
+ if(empty.region.filter == "leader"){
+ result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+ } else if(empty.region.filter == "FR1"){
result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
} else if(empty.region.filter == "CDR1"){
rresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
@@ -199,10 +176,41 @@
#write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)
}

-print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
-print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),])))
+filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))
+
+
+splt = strsplit(class.filter, "_")[[1]]
+chunk_hit_threshold = as.numeric(splt[1])
+nt_hit_threshold = as.numeric(splt[2])
+
+higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold)
+
+unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
+
+if(!all(higher_than, na.rm=T)){ #check for 'not all' because that would mean the unmatched set is empty
+ unmatched = summ[!higher_than,]
+ unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
+ unmatched$best_match = paste("unmatched,", unmatched$best_match)
+ summ[!higher_than,"best_match"] = paste("unmatched,", summ[!higher_than,"best_match"])
+}

-filtering.steps = rbind(filtering.steps, c("After unique filter", nrow(result)))
+if(any(higher_than, na.rm=T)){
+ #summ = summ[higher_than,]
+}
+
+if(nrow(summ) == 0){
+ stop("No data remaining after filter")
+}
+
+result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))
+
+result = result[!(duplicated(result$past)), ]
+
+result = result[,!(names(result) %in% c("past"))]
+
+print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result)))
+
+filtering.steps = rbind(filtering.steps, c("After remove duplicates based on filter", nrow(result)))

print(paste("Number of rows in result:", nrow(result)))
print(paste("Number of rows in unmatched:", nrow(unmatched)))

diff -r c33d93683a09 -r faae21ba5c63 pattern_plots.r
--- a/pattern_plots.r Thu Oct 13 10:52:24 2016 -0400
+++ b/pattern_plots.r Tue Oct 25 07:28:43 2016 -0400

@@ -45,7 +45,7 @@
write.table(data1, plot1.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)

p = ggplot(data1, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of mutations") + guides(fill=guide_legend(title=NULL))
-p = p + theme(panel.background = element_rect(fill = "white", colour="black")) + scale_fill_manual(values=c("RGYW.WRCY" = "white", "TW.WA" = "blue4"))
+p = p + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("RGYW.WRCY" = "white", "TW.WA" = "blue4"))
#p = p + scale_colour_manual(values=c("RGYW.WRCY" = "black", "TW.WA" = "blue4"))
png(filename=plot1.png, width=480, height=300)
print(p)
@@ -79,7 +79,7 @@
write.table(data2, plot2.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)

p = ggplot(data2, aes(x=Class, y=value, fill=Type)) + geom_bar(position="fill", stat="identity", colour = "black") + scale_y_continuous(labels=percent_format()) + guides(fill=guide_legend(title=NULL)) + ylab("% of mutations")
-p = p + theme(panel.background = element_rect(fill = "white", colour="black")) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
+p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
#p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black"))
png(filename=plot2.png, width=480, height=300)
print(p)
@@ -115,7 +115,7 @@
write.table(data3, plot3.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)

p = ggplot(data3, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of nucleotides") + guides(fill=guide_legend(title=NULL))
-p = p + theme(panel.background = element_rect(fill = "white", colour="black")) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
+p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
#p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black"))
png(filename=plot3.png, width=480, height=300)
print(p)

diff -r c33d93683a09 -r faae21ba5c63 shm_csr.py
--- a/shm_csr.py Thu Oct 13 10:52:24 2016 -0400
+++ b/shm_csr.py Tue Oct 25 07:28:43 2016 -0400

[

@@ -7,15 +7,14 @@
parser.add_argument("--input",
help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
parser.add_argument("--genes", help="The genes available in the 'best_match' column")
-parser.add_argument("--includefr1", help="Should the mutation/nucleotides in the FR1 region be included?")
+parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2'])
parser.add_argument("--output", help="Output file")

args = parser.parse_args()

infile = args.input
genes = str(args.genes).split(",")
-print "includefr1 =", args.includefr1
-include_fr1 = True if args.includefr1 == "yes" else False
+empty_region_filter = args.empty_region_filter
outfile = args.output

genedic = dict()
@@ -59,16 +58,14 @@
ID = linesplt[IDIndex]
genedic[ID] = linesplt[best_matchIndex]
try:
- if linesplt[fr1Index] != "NA":
- mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else []
- else:
- mutationdic[ID + "_FR1"] = []
- mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if linesplt[cdr1Index] != "NA" else []
- mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if linesplt[fr2Index] != "NA" else []
- mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if linesplt[cdr2Index] != "NA" else []
+ mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else []
+ mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else []
+ mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else []
+ mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else []
mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []
- except e:
+ except Exception as e:
+ print "Something went wrong while processing this line:"
print linesplt
print linecount
print e
@@ -189,8 +186,6 @@
linesplt = line.split("\t")
gene = linesplt[best_matchIndex]
ID = linesplt[IDIndex]
- if ID == "ca2":
- print linesplt
RGYW = [(int(x), int(y), z) for (x, y, z) in
[hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]
WRCY = [(int(x), int(y), z) for (x, y, z) in
@@ -201,8 +196,7 @@
[hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0

- mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[
- ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
+ mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
for mutation in mutationList:
frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW])
@@ -271,7 +265,7 @@
o.write(typ + " (%)")
curr = dic[typ]
for gene in genes:
- geneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop....
+ geneMatcher = geneMatchers[gene]
if valuedic[gene + "_" + fname] is 0:
o.write(",0,0,0")
else:

diff -r c33d93683a09 -r faae21ba5c63 shm_csr.r
--- a/shm_csr.r Thu Oct 13 10:52:24 2016 -0400
+++ b/shm_csr.r Tue Oct 25 07:28:43 2016 -0400

[

b'@@ -1,493 +1,516 @@\n-library(data.table)\r\n-library(ggplot2)\r\n-library(reshape2)\r\n-\r\n-args <- commandArgs(trailingOnly = TRUE)\r\n-\r\n-input = args[1]\r\n-genes = unlist(strsplit(args[2], ","))\r\n-outputdir = args[3]\r\n-include_fr1 = ifelse(args[4] == "yes", T, F)\r\n-setwd(outputdir)\r\n-\r\n-dat = read.table(input, header=T, sep="\\t", fill=T, stringsAsFactors=F)\r\n-\r\n-if(length(dat$Sequence.ID) == 0){\r\n- setwd(outputdir)\r\n- result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))\r\n- row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")\r\n- write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)\r\n- transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))\r\n- row.names(transitionTable) = c("A", "C", "G", "T")\r\n- transitionTable["A","A"] = NA\r\n- transitionTable["C","C"] = NA\r\n- transitionTable["G","G"] = NA\r\n- transitionTable["T","T"] = NA\r\n- write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)\r\n- cat("0", file="n.txt")\r\n- stop("No data")\r\n-}\r\n-\r\n-cleanup_columns = c("FR1.IMGT.c.a",\r\n- "FR2.IMGT.g.t",\r\n- "CDR1.IMGT.Nb.of.nucleotides",\r\n- "CDR2.IMGT.t.a",\r\n- "FR1.IMGT.c.g",\r\n- "CDR1.IMGT.c.t",\r\n- "FR2.IMGT.a.c",\r\n- "FR2.IMGT.Nb.of.mutations",\r\n- "FR2.IMGT.g.c",\r\n- "FR2.IMGT.a.g",\r\n- "FR3.IMGT.t.a",\r\n- "FR3.IMGT.t.c",\r\n- "FR2.IMGT.g.a",\r\n- "FR3.IMGT.c.g",\r\n- "FR1.IMGT.Nb.of.mutations",\r\n- "CDR1.IMGT.g.a",\r\n- "CDR1.IMGT.t.g",\r\n- "CDR1.IMGT.g.c",\r\n- "CDR2.IMGT.Nb.of.nucleotides",\r\n- "FR2.IMGT.a.t",\r\n- "CDR1.IMGT.Nb.of.mutations",\r\n- "CDR3.IMGT.Nb.of.nucleotides",\r\n- "CDR1.IMGT.a.g",\r\n- "FR3.IMGT.a.c",\r\n- "FR1.IMGT.g.a",\r\n- "FR3.IMGT.a.g",\r\n- "FR1.IMGT.a.t",\r\n- "CDR2.IMGT.a.g",\r\n- "CDR2.IMGT.Nb.of.mutations",\r\n- "CDR2.IMGT.g.t",\r\n- "CDR2.IMGT.a.c",\r\n- "CDR1.IMGT.t.c",\r\n- "FR3.IMGT.g.c",\r\n- "FR1.IMGT.g.t",\r\n- "FR3.IMGT.g.t",\r\n- "CDR1.IMGT.a.t",\r\n- "FR1.IMGT.a.g",\r\n- "FR3.IMGT.a.t",\r\n- "FR3.IMGT.Nb.of.nucleotides",\r\n- "FR2.IMGT.t.c",\r\n- "CDR2.IMGT.g.a",\r\n- "FR2.IMGT.t.a",\r\n- "CDR1.IMGT.t.a",\r\n- "FR2.IMGT.t.g",\r\n- "FR3.IMGT.t.g",\r\n- "FR2.IMGT.Nb.of.nucleotides",\r\n- "FR1.IMGT.t.a",\r\n- "FR1.IMGT.t.g",\r\n- "FR3.IMGT.c.t",\r\n- "FR1.IMGT.t.c",\r\n- "CDR2.IMGT.a.t",\r\n- "FR2.IMGT.c.t",\r\n- "CDR1.IMGT.g.t",\r\n- "CDR2.IMGT.t.g",\r\n- "FR1.IMGT.Nb.of.nucleotides",\r\n- "CDR1.IMGT.c.g",\r\n- "CDR2.IMGT.t.c",\r\n- "FR3.IMGT.g.a",\r\n- "CDR1.IMGT.a.c",\r\n- "FR2.IMGT.c.a",\r\n- "FR3.IMGT.Nb.of.mutations",\r\n- "FR2.IMGT.c.g",\r\n- "CDR2.IMGT.g.c",\r\n- "FR1.IMGT.g.c",\r\n- "CDR2.IMGT.c.t",\r\n- "FR3.IMGT.c.a",\r\n- "CDR1.IMGT.c.a",\r\n- "CDR2.IMGT.c.g",\r\n- "CDR2.IMGT.c.a",\r\n- "FR1.IMGT.c.t",\r\n- "F'..b'q, fill=Gene))\n+\tpc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred"))\n+\tpc = pc + coord_polar(theta="y") + scale_y_continuous(breaks=NULL)\n+\tpc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))\n+\tpc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGG subclasses", "( n =", sum(genesForPlot$Freq), ")"))\n+\twrite.table(genesForPlot, "IGG.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+\tpng(filename="IGG.png")\n+\tprint(pc)\n+\tdev.off()\n+}\n+\n+print("Plotting scatterplot")\n+\n+dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)\n+dat.clss = dat\n+\n+dat.clss$best_match = substr(dat.clss$best_match, 0, 3)\n+\n+dat.clss = rbind(dat, dat.clss)\n+\n+p = ggplot(dat.clss, aes(best_match, percentage_mutations))\n+p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA)\n+p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))\n+p = p + scale_fill_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet"))\n+p = p + scale_colour_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet"))\n+\n+png(filename="scatter.png")\n+print(p)\n+dev.off()\n+\n+write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+write.table(dat, input, sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+print("Plotting frequency ranges plot")\n+\n+dat$best_match_class = substr(dat$best_match, 0, 3)\n+freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")\n+dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)\n+\n+frequency_bins_sum = data.frame(data.table(dat)[, list(class_sum=sum(.N)), by=c("best_match_class")])\n+\n+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])\n+\n+frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class")\n+\n+frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)\n+\n+p = ggplot(frequency_bins_data, aes(frequency_bins, frequency))\n+p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))\n+p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + scale_fill_manual(values=c("IGA" = "blue4", "IGG" = "olivedrab3", "IGM" = "black"))\n+\n+png(filename="frequency_ranges.png")\n+print(p)\n+dev.off()\n+\n+frequency_bins_data_by_class = frequency_bins_data\n+\n+write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "best_match_class", "frequency_bins")])\n+\n+frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class")\n+\n+frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)\n+\n+write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n'

diff -r c33d93683a09 -r faae21ba5c63 shm_csr.xml
--- a/shm_csr.xml Thu Oct 13 10:52:24 2016 -0400
+++ b/shm_csr.xml Tue Oct 25 07:28:43 2016 -0400

@@ -1,24 +1,20 @@
<tool id="shm_csr" name="SHM & CSR pipeline" version="1.0">
<description></description>
<command interpreter="bash">
- wrapper.sh $in_file custom $out_file $out_file.files_path ${in_file.name} ${include_fr1} $functionality $unique $naive_output_ca $naive_output_cg $naive_output_cm $filter_uniques $class_filter $empty_region_filter
+ wrapper.sh $in_file custom $out_file $out_file.files_path ${in_file.name} "-" $functionality $unique $naive_output_ca $naive_output_cg $naive_output_cm $filter_uniques $class_filter $empty_region_filter
</command>
<inputs>
<param name="in_file" type="data" label="IMGT zip file to be analysed" />
<param name="empty_region_filter" type="select" label="Sequence starts at" help="" >
+ <option value="leader" selected="true">Leader: include FR1, CDR1, FR2, CDR2, FR3 in filters</option>
<option value="FR1" selected="true">FR1: include CDR1,FR2,CDR2,FR3 in filters</option>
<option value="CDR1">CDR1: include FR2,CDR2,FR3 in filters</option>
<option value="FR2">FR2: include CDR2,FR3 in filters</option>
</param>
- <param name="include_fr1" type="select" label="Include mutations in FR1 region" help="" >
- <option value="yes">yes</option>
- <option value="no" selected="true">no</option>
- </param>
<param name="functionality" type="select" label="Functionality filter" help="" >
- <option value="productive" selected="true">Productive: Keep "productive" and "productive (see comment)"</option>
- <option value="unproductive">Unproductive: Keep "unproductive" and "unproductive (see comment)"</option>
- <option value="remove_unknown">Remove "unknown" and "unknown (see comment)"</option>
- <option value="dont_filter">Don't filter</option>
+ <option value="productive" selected="true">Productive (Productive and Productive see comment)</option>
+ <option value="unproductive">Unproductive (Unproductive and Unproductive see comment)</option>
+ <option value="remove_unknown">Productive and Unproductive (Productive, Productive see comment, Unproductive, Unproductive and Unproductive see comment)</option>
</param>
<param name="filter_uniques" type="select" label="Filter unique sequences" help="See below for an example.">
<option value="remove">Remove uniques (Based on nucleotide sequence + C)</option>
@@ -42,7 +38,7 @@
<option value="60_55">>60% class and >55% subclass</option>
<option value="70_0">>70% class</option>
<option value="60_0">>60% class</option>
- <option value="101_101">No class</option>
+ <option value="101_101">Do not assign (sub)class</option>
</param>
<conditional name="naive_output_cond">
<param name="naive_output" type="select" label="Output new IMGT archives per class into your history?">

diff -r c33d93683a09 -r faae21ba5c63 wrapper.sh
--- a/wrapper.sh Thu Oct 13 10:52:24 2016 -0400
+++ b/wrapper.sh Tue Oct 25 07:28:43 2016 -0400

@@ -158,13 +158,13 @@

classes="IGA,IGA1,IGA2,IGG,IGG1,IGG2,IGG3,IGG4,IGM,unmatched"
echo "R mutation analysis"
-Rscript $dir/shm_csr.r $outdir/merged.txt $classes $outdir ${include_fr1} 2>&1
+Rscript $dir/shm_csr.r $outdir/merged.txt $classes $outdir ${empty_region_filter} 2>&1

echo "---------------- shm_csr.py ----------------"
echo "---------------- shm_csr.py ----------------<br />" >> $log

-python $dir/shm_csr.py --input $outdir/merged.txt --genes $classes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt
+python $dir/shm_csr.py --input $outdir/merged.txt --genes $classes --empty_region_filter "${empty_region_filter}" --output $outdir/hotspot_analysis.txt

echo "---------------- aa_histogram.r ----------------"
echo "---------------- aa_histogram.r ----------------<br />" >> $log
@@ -575,6 +575,9 @@

cd $tmp

+echo "Cleaning up *.RData files"
+find $outdir/baseline -name "*.RData" -type f -delete
+
echo "---------------- naive_output.r ----------------"
echo "---------------- naive_output.r ----------------<br />" >> $log