changeset 47:64711f461c8e draft

Uploaded
author davidvanzessen
date Thu, 04 May 2017 07:43:09 -0400
parents cfc9a442e59d
children c5295dd10dfc
files imgt_loader.r merge_and_filter.r shm_csr.py shm_csr.xml wrapper.sh
diffstat 5 files changed, 451 insertions(+), 254 deletions(-) [+]
line wrap: on
line diff
--- a/imgt_loader.r	Wed Apr 12 04:28:16 2017 -0400
+++ b/imgt_loader.r	Thu May 04 07:43:09 2017 -0400
@@ -9,6 +9,22 @@
 aa = read.table(aa.file, sep="\t", header=T, quote="", fill=T)
 junction = read.table(junction.file, sep="\t", header=T, quote="", fill=T)
 
+fix_column_names = function(df){
+    if("V.DOMAIN.Functionality" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
+        print("found V.DOMAIN.Functionality, changed")
+    }
+    if("V.DOMAIN.Functionality.comment" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
+        print("found V.DOMAIN.Functionality.comment, changed")
+    }
+    return(df)
+}
+
+summ = fix_column_names(summ)
+aa = fix_column_names(aa)
+junction = fix_column_names(junction)
+
 old_summary_columns=c('Sequence.ID','JUNCTION.frame','V.GENE.and.allele','D.GENE.and.allele','J.GENE.and.allele','CDR1.IMGT.length','CDR2.IMGT.length','CDR3.IMGT.length','Orientation')
 old_sequence_columns=c('CDR1.IMGT','CDR2.IMGT','CDR3.IMGT')
 old_junction_columns=c('JUNCTION')
--- a/merge_and_filter.r	Wed Apr 12 04:28:16 2017 -0400
+++ b/merge_and_filter.r	Thu May 04 07:43:09 2017 -0400
@@ -26,6 +26,25 @@
 AAs = read.table(aafile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
 gene_identification = read.table(gene_identification_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
 
+fix_column_names = function(df){
+    if("V.DOMAIN.Functionality" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
+        print("found V.DOMAIN.Functionality, changed")
+    }
+    if("V.DOMAIN.Functionality.comment" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
+        print("found V.DOMAIN.Functionality.comment, changed")
+    }
+    return(df)
+}
+
+summ = fix_column_names(summ)
+sequences = fix_column_names(sequences)
+mutationanalysis = fix_column_names(mutationanalysis)
+mutationstats = fix_column_names(mutationstats)
+hotspots = fix_column_names(hotspots)
+AAs = fix_column_names(AAs)
+
 if(method == "blastn"){
 	#"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
 	gene_identification = gene_identification[!duplicated(gene_identification$qseqid),]
@@ -36,8 +55,8 @@
 	colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")
 }
 
-print("Summary analysis files columns")
-print(names(summ))
+#print("Summary analysis files columns")
+#print(names(summ))
 
 
 
@@ -75,6 +94,14 @@
 
 filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ)))
 
+if(FALSE){ #to speed up debugging
+    set.seed(1)
+    summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),]
+    print(paste("Number of sequences after sampling 5%:", nrow(summ)))
+
+    filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ)))
+}
+
 print("mutation analysis files columns")
 print(names(mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])]))
 
@@ -82,8 +109,8 @@
 
 print(paste("Number of sequences after merging with mutation analysis file:", nrow(result)))
 
-print("mutation stats files columns")
-print(names(mutationstats[,!(names(mutationstats) %in% names(result)[-1])]))
+#print("mutation stats files columns")
+#print(names(mutationstats[,!(names(mutationstats) %in% names(result)[-1])]))
 
 result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID")
 
@@ -135,10 +162,10 @@
 
 write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T)
 
-print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "")))
-print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "")))
-print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "")))
-print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "")))
+print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T)))
 
 if(empty.region.filter == "leader"){
 	result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
@@ -219,6 +246,8 @@
 #	result[i,"past"] = paste(result[i,cls], collapse=":")
 #}
 
+
+
 result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))
 
 result.matched = result[!grepl("unmatched", result$best_match),]
--- a/shm_csr.py	Wed Apr 12 04:28:16 2017 -0400
+++ b/shm_csr.py	Thu May 04 07:43:09 2017 -0400
@@ -1,287 +1,436 @@
-from __future__ import division
+import argparse
+import logging
+import sys
+import os
+import re
+
 from collections import defaultdict
-import re
-import argparse
+
+def main():
+	parser = argparse.ArgumentParser()
+	parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
+	parser.add_argument("--genes", help="The genes available in the 'best_match' column")
+	parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2'])
+	parser.add_argument("--output", help="Output file")
 
-parser = argparse.ArgumentParser()
-parser.add_argument("--input",
-					help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
-parser.add_argument("--genes", help="The genes available in the 'best_match' column")
-parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2'])
-parser.add_argument("--output", help="Output file")
+	args = parser.parse_args()
+
+	infile = args.input
+	genes = str(args.genes).split(",")
+	empty_region_filter = args.empty_region_filter
+	outfile = args.output
 
-args = parser.parse_args()
+	genedic = dict()
 
-infile = args.input
-genes = str(args.genes).split(",")
-empty_region_filter = args.empty_region_filter
-outfile = args.output
-
-genedic = dict()
+	mutationdic = dict()
+	mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
+	NAMatchResult = (None, None, None, None, None, None, '')
+	geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
+	linecount = 0
 
-mutationdic = dict()
-mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
-NAMatchResult = (None, None, None, None, None, None, '')
-linecount = 0
+	IDIndex = 0
+	best_matchIndex = 0
+	fr1Index = 0
+	cdr1Index = 0
+	fr2Index = 0
+	cdr2Index = 0
+	fr3Index = 0
+	first = True
+	IDlist = []
+	mutationList = []
+	mutationListByID = {}
+	cdr1LengthDic = {}
+	cdr2LengthDic = {}
+
+	fr1LengthDict = {}
+	fr2LengthDict = {}
+	fr3LengthDict = {}
+
+	cdr1LengthIndex = 0
+	cdr2LengthIndex = 0
 
-IDIndex = 0
-best_matchIndex = 0
-fr1Index = 0
-cdr1Index = 0
-fr2Index = 0
-cdr2Index = 0
-fr3Index = 0
-first = True
-IDlist = []
-mutationList = []
-mutationListByID = {}
-cdr1LengthDic = {}
-cdr2LengthDic = {}
+	fr1SeqIndex = 0
+	fr2SeqIndex = 0
+	fr3SeqIndex = 0
+
+	tandem_sum_by_class = defaultdict(int)
+	expected_tandem_sum_by_class = defaultdict(float)
 
-with open(infile, 'ru') as i:
-	for line in i:
-		if first:
+	with open(infile, 'ru') as i:
+		for line in i:
+			if first:
+				linesplt = line.split("\t")
+				IDIndex = linesplt.index("Sequence.ID")
+				best_matchIndex = linesplt.index("best_match")
+				fr1Index = linesplt.index("FR1.IMGT")
+				cdr1Index = linesplt.index("CDR1.IMGT")
+				fr2Index = linesplt.index("FR2.IMGT")
+				cdr2Index = linesplt.index("CDR2.IMGT")
+				fr3Index = linesplt.index("FR3.IMGT")
+				cdr1LengthIndex = linesplt.index("CDR1.IMGT.seq")
+				cdr2LengthIndex = linesplt.index("CDR2.IMGT.seq")
+				fr1SeqIndex = linesplt.index("FR1.IMGT.seq")
+				fr2SeqIndex = linesplt.index("FR2.IMGT.seq")
+				fr3SeqIndex = linesplt.index("FR3.IMGT.seq")
+				first = False
+				continue
+			linecount += 1
 			linesplt = line.split("\t")
-			IDIndex = linesplt.index("Sequence.ID")
-			best_matchIndex = linesplt.index("best_match")
-			fr1Index = linesplt.index("FR1.IMGT")
-			cdr1Index = linesplt.index("CDR1.IMGT")
-			fr2Index = linesplt.index("FR2.IMGT")
-			cdr2Index = linesplt.index("CDR2.IMGT")
-			fr3Index = linesplt.index("FR3.IMGT")
-			cdr1LengthIndex = linesplt.index("CDR1.IMGT.length")
-			cdr2LengthIndex = linesplt.index("CDR2.IMGT.length")
-			first = False
-			continue
-		linecount += 1
-		linesplt = line.split("\t")
-		ID = linesplt[IDIndex]
-		genedic[ID] = linesplt[best_matchIndex]
-		try:
-			mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else []
-			mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else []
-			mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else []
-			mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else []
-			mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
-			mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []
-		except Exception as e:
-			print "Something went wrong while processing this line:"
-			print linesplt
-			print linecount
-			print e
-		mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
-		mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
+			ID = linesplt[IDIndex]
+			genedic[ID] = linesplt[best_matchIndex]
+			try:
+				mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else []
+				mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else []
+				mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else []
+				mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else []
+				mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
+				mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []
+			except Exception as e:
+				print "Something went wrong while processing this line:"
+				print linesplt
+				print linecount
+				print e
+			mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
+			mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
+
+			cdr1Length = len(linesplt[cdr1LengthIndex])
+			cdr2Length = len(linesplt[cdr2LengthIndex])
+
+			#print linesplt[fr2SeqIndex]
+			fr1Length = len(linesplt[fr1SeqIndex]) if empty_region_filter == "leader" else 0
+			fr2Length = len(linesplt[fr2SeqIndex]) if empty_region_filter in ["leader", "FR1", "CDR1"] else 0
+			fr3Length = len(linesplt[fr3SeqIndex])
+
+			cdr1LengthDic[ID] = cdr1Length
+			cdr2LengthDic[ID] = cdr2Length
+
+			fr1LengthDict[ID] = fr1Length
+			fr2LengthDict[ID] = fr2Length
+			fr3LengthDict[ID] = fr3Length
+
+			IDlist += [ID]
+
+
+	#tandem mutation stuff
+	tandem_frequency = defaultdict(int)
+	mutation_frequency = defaultdict(int)
+
+	tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt")
+	with open(tandem_file, 'w') as o:
+		highest_tandem_length = 0
+
+		o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n")
+		for ID in IDlist:
+			mutations = mutationListByID[ID]
+			if len(mutations) == 0:
+				continue
+			last_mut = max(mutations, key=lambda x: int(x[1]))
+
+			last_mut_pos = int(last_mut[1])
+
+			mut_positions = [False] * (last_mut_pos + 1)
+
+			for mutation in mutations:
+				frm, where, to, frmAA, whereAA, toAA, thing = mutation
+				where = int(where)
+				mut_positions[where] = True
+
+			tandem_muts = []
+			tandem_start = -1
+			tandem_length = 0
+			for i in range(len(mut_positions)):
+				if mut_positions[i]:
+					if tandem_start == -1:
+						tandem_start = i
+					tandem_length += 1
+					#print "".join(["1" if x else "0" for x in mut_positions[:i+1]])
+				else:
+					if tandem_length > 1:
+						tandem_muts.append((tandem_start, tandem_length))
+						#print "{0}{1} {2}:{3}".format(" " * (i - tandem_length), "^" * tandem_length, tandem_start, tandem_length)
+					tandem_start = -1
+					tandem_length = 0
+			if tandem_length > 1:  # if the sequence ends with a tandem mutation
+				tandem_muts.append((tandem_start, tandem_length))
+
+			if len(tandem_muts) > 0:
+				if highest_tandem_length < len(tandem_muts):
+					highest_tandem_length = len(tandem_muts)
 
-		cdr1Length = linesplt[cdr1LengthIndex]
-		cdr2Length = linesplt[cdr2LengthIndex]
+			region_length = fr1LengthDict[ID] + cdr1LengthDic[ID] + fr2LengthDict[ID] + cdr2LengthDic[ID] + fr3LengthDict[ID]
+			longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0)
+			num_mutations = len(mutations)
+			f_num_mutations = float(num_mutations)
+			num_tandem_muts = len(tandem_muts)
+			expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length)
+			o.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(ID,
+																str(num_mutations),
+																str(num_tandem_muts),
+																str(region_length),
+																str(round(expected_tandem_muts, 2)),
+																str(longest_tandem[1]),
+																str(tandem_muts)))
+			gene = genedic[ID]
+			if gene.find("unmatched") == -1:
+				tandem_sum_by_class[gene] += num_tandem_muts
+				expected_tandem_sum_by_class[gene] += expected_tandem_muts
 
-		cdr1LengthDic[ID] = int(cdr1Length) if cdr1Length != "X" else 0
-		cdr2LengthDic[ID] = int(cdr2Length) if cdr2Length != "X" else 0
-		
-		IDlist += [ID]
+				tandem_sum_by_class["all"] += num_tandem_muts
+				expected_tandem_sum_by_class["all"] += expected_tandem_muts
 
-#print mutationList, linecount
+				gene = gene[:3]
+				if gene in ["IGA", "IGG"]:
+					tandem_sum_by_class[gene] += num_tandem_muts
+					expected_tandem_sum_by_class[gene] += expected_tandem_muts
+			else:
+				tandem_sum_by_class["unmatched"] += num_tandem_muts
+				expected_tandem_sum_by_class["unmatched"] += expected_tandem_muts
+
 
-AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1)  # [4] is the position of the AA mutation, None if silent
-if AALength < 60:
-	AALength = 64
+			for tandem_mut in tandem_muts:
+				tandem_frequency[str(tandem_mut[1])] += 1
+			#print "\t".join([ID, str(len(tandem_muts)), str(longest_tandem[1]) , str(tandem_muts)])
+
+	tandem_freq_file = os.path.join(os.path.dirname(outfile), "tandem_frequency.txt")
+	with open(tandem_freq_file, 'w') as o:
+		for frq in sorted([int(x) for x in tandem_frequency.keys()]):
+			o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)]))
 
-AA_mutation = [0] * AALength
-AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]}
-AA_mutation_empty = AA_mutation[:]
+	tandem_row = []
+	print genes
+	print tandem_sum_by_class
+	print expected_tandem_sum_by_class
+	genes_extra = list(genes)
+	genes_extra.append("all")
+	for x, y, in zip([tandem_sum_by_class[x] for x in genes_extra], [expected_tandem_sum_by_class[x] for x in genes_extra]):
+		if y != 0:
+			tandem_row += [x, round(y, 2), round(x / y, 2)]
+		else:
+			tandem_row += [x, round(y, 2), 0]
+	
+	"""
+	print tandem_row
+	tandem_row += tandem_row[-3:]
+	print tandem_row
+	all_expected_tandem = expected_tandem_sum_by_class["all"]
+	all_tandem = tandem_sum_by_class["all"]
+	if all_expected_tandem == 0:
+		tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), 0]
+	else:
+		tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), round(all_tandem / all_expected_tandem, 2)]
+	print tandem_row
+	"""
+	for i in range(len(genes_extra)):
+		gene = genes_extra[i]
+		print gene, tandem_row[i*3:i*3+3]
 
-aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt"
-with open(aa_mutations_by_id_file, 'w') as o:
-	o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
-	for ID in mutationListByID.keys():
-		AA_mutation_for_ID = AA_mutation_empty[:]
-		for mutation in mutationListByID[ID]:
-			if mutation[4]:
-				AA_mutation_position = int(mutation[4])
-				AA_mutation[AA_mutation_position] += 1
-				AA_mutation_for_ID[AA_mutation_position] += 1
-				clss = genedic[ID][:3]
-				AA_mutation_dic[clss][AA_mutation_position] += 1
-		o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n")
+	tandem_freq_file = os.path.join(os.path.dirname(outfile), "shm_overview_tandem_row.txt")
+	with open(tandem_freq_file, 'w') as o:
+		o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row])))
+
+	#print mutationList, linecount
+
+	AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1)  # [4] is the position of the AA mutation, None if silent
+	if AALength < 60:
+		AALength = 64
+
+	AA_mutation = [0] * AALength
+	AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]}
+	AA_mutation_empty = AA_mutation[:]
+
+	aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt"
+	with open(aa_mutations_by_id_file, 'w') as o:
+		o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
+		for ID in mutationListByID.keys():
+			AA_mutation_for_ID = AA_mutation_empty[:]
+			for mutation in mutationListByID[ID]:
+				if mutation[4]:
+					AA_mutation_position = int(mutation[4])
+					AA_mutation[AA_mutation_position] += 1
+					AA_mutation_for_ID[AA_mutation_position] += 1
+					clss = genedic[ID][:3]
+					AA_mutation_dic[clss][AA_mutation_position] += 1
+			o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n")
 
 
 
-#absent AA stuff
-absentAACDR1Dic = defaultdict(list)
-absentAACDR1Dic[5] = range(29,36)
-absentAACDR1Dic[6] = range(29,35)
-absentAACDR1Dic[7] = range(30,35)
-absentAACDR1Dic[8] = range(30,34)
-absentAACDR1Dic[9] = range(31,34)
-absentAACDR1Dic[10] = range(31,33)
-absentAACDR1Dic[11] = [32]
+	#absent AA stuff
+	absentAACDR1Dic = defaultdict(list)
+	absentAACDR1Dic[5] = range(29,36)
+	absentAACDR1Dic[6] = range(29,35)
+	absentAACDR1Dic[7] = range(30,35)
+	absentAACDR1Dic[8] = range(30,34)
+	absentAACDR1Dic[9] = range(31,34)
+	absentAACDR1Dic[10] = range(31,33)
+	absentAACDR1Dic[11] = [32]
 
-absentAACDR2Dic = defaultdict(list)
-absentAACDR2Dic[0] = range(55,65)
-absentAACDR2Dic[1] = range(56,65)
-absentAACDR2Dic[2] = range(56,64)
-absentAACDR2Dic[3] = range(57,64)
-absentAACDR2Dic[4] = range(57,63)
-absentAACDR2Dic[5] = range(58,63)
-absentAACDR2Dic[6] = range(58,62)
-absentAACDR2Dic[7] = range(59,62)
-absentAACDR2Dic[8] = range(59,61)
-absentAACDR2Dic[9] = [60]
+	absentAACDR2Dic = defaultdict(list)
+	absentAACDR2Dic[0] = range(55,65)
+	absentAACDR2Dic[1] = range(56,65)
+	absentAACDR2Dic[2] = range(56,64)
+	absentAACDR2Dic[3] = range(57,64)
+	absentAACDR2Dic[4] = range(57,63)
+	absentAACDR2Dic[5] = range(58,63)
+	absentAACDR2Dic[6] = range(58,62)
+	absentAACDR2Dic[7] = range(59,62)
+	absentAACDR2Dic[8] = range(59,61)
+	absentAACDR2Dic[9] = [60]
 
-absentAA = [len(IDlist)] * (AALength-1)
-for k, cdr1Length in cdr1LengthDic.iteritems():
-	for c in absentAACDR1Dic[cdr1Length]:
-		absentAA[c] -= 1
+	absentAA = [len(IDlist)] * (AALength-1)
+	for k, cdr1Length in cdr1LengthDic.iteritems():
+		for c in absentAACDR1Dic[cdr1Length]:
+			absentAA[c] -= 1
 
-for k, cdr2Length in cdr2LengthDic.iteritems():
-	for c in absentAACDR2Dic[cdr2Length]:
-		absentAA[c] -= 1
+	for k, cdr2Length in cdr2LengthDic.iteritems():
+		for c in absentAACDR2Dic[cdr2Length]:
+			absentAA[c] -= 1
 
 
-aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt"
-with open(aa_mutations_by_id_file, 'w') as o:
-	o.write("ID\tcdr1length\tcdr2length\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
-	for ID in IDlist:
-		absentAAbyID = [1] * (AALength-1)
-		cdr1Length = cdr1LengthDic[ID]
-		for c in absentAACDR1Dic[cdr1Length]:
-			absentAAbyID[c] -= 1
+	aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt"
+	with open(aa_mutations_by_id_file, 'w') as o:
+		o.write("ID\tcdr1length\tcdr2length\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
+		for ID in IDlist:
+			absentAAbyID = [1] * (AALength-1)
+			cdr1Length = cdr1LengthDic[ID]
+			for c in absentAACDR1Dic[cdr1Length]:
+				absentAAbyID[c] -= 1
 
-		cdr2Length = cdr2LengthDic[ID]
-		for c in absentAACDR2Dic[cdr2Length]:
-			absentAAbyID[c] -= 1
-		o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n")
+			cdr2Length = cdr2LengthDic[ID]
+			for c in absentAACDR2Dic[cdr2Length]:
+				absentAAbyID[c] -= 1
+			o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n")
 
-if linecount == 0:
-	print "No data, exiting"
-	with open(outfile, 'w') as o:
-		o.write("RGYW (%)," + ("0,0,0\n" * len(genes)))
-		o.write("WRCY (%)," + ("0,0,0\n" * len(genes)))
-		o.write("WA (%)," + ("0,0,0\n" * len(genes)))
-		o.write("TW (%)," + ("0,0,0\n" * len(genes)))
-	import sys
+	if linecount == 0:
+		print "No data, exiting"
+		with open(outfile, 'w') as o:
+			o.write("RGYW (%)," + ("0,0,0\n" * len(genes)))
+			o.write("WRCY (%)," + ("0,0,0\n" * len(genes)))
+			o.write("WA (%)," + ("0,0,0\n" * len(genes)))
+			o.write("TW (%)," + ("0,0,0\n" * len(genes)))
+		import sys
 
-	sys.exit()
+		sys.exit()
 
-hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)")
-RGYWCount = {}
-WRCYCount = {}
-WACount = {}
-TWCount = {}
+	hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)")
+	RGYWCount = {}
+	WRCYCount = {}
+	WACount = {}
+	TWCount = {}
 
-#IDIndex = 0
-ataIndex = 0
-tatIndex = 0
-aggctatIndex = 0
-atagcctIndex = 0
-first = True
-with open(infile, 'ru') as i:
-	for line in i:
-		if first:
+	#IDIndex = 0
+	ataIndex = 0
+	tatIndex = 0
+	aggctatIndex = 0
+	atagcctIndex = 0
+	first = True
+	with open(infile, 'ru') as i:
+		for line in i:
+			if first:
+				linesplt = line.split("\t")
+				ataIndex = linesplt.index("X.a.t.a")
+				tatIndex = linesplt.index("t.a.t.")
+				aggctatIndex = linesplt.index("X.a.g.g.c.t..a.t.")
+				atagcctIndex = linesplt.index("X.a.t..a.g.c.c.t.")
+				first = False
+				continue
 			linesplt = line.split("\t")
-			ataIndex = linesplt.index("X.a.t.a")
-			tatIndex = linesplt.index("t.a.t.")
-			aggctatIndex = linesplt.index("X.a.g.g.c.t..a.t.")
-			atagcctIndex = linesplt.index("X.a.t..a.g.c.c.t.")
-			first = False
-			continue
-		linesplt = line.split("\t")
-		gene = linesplt[best_matchIndex]
-		ID = linesplt[IDIndex]
-		RGYW = [(int(x), int(y), z) for (x, y, z) in
-				[hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]
-		WRCY = [(int(x), int(y), z) for (x, y, z) in
-				[hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]]
-		WA = [(int(x), int(y), z) for (x, y, z) in
-			  [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
-		TW = [(int(x), int(y), z) for (x, y, z) in
-			  [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
-		RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0
+			gene = linesplt[best_matchIndex]
+			ID = linesplt[IDIndex]
+			RGYW = [(int(x), int(y), z) for (x, y, z) in
+					[hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]
+			WRCY = [(int(x), int(y), z) for (x, y, z) in
+					[hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]]
+			WA = [(int(x), int(y), z) for (x, y, z) in
+				[hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
+			TW = [(int(x), int(y), z) for (x, y, z) in
+				[hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
+			RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0
 
-		mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
-		for mutation in mutationList:
-			frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
-			mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW])
-			mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY])
-			mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA])
-			mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW])
+			mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
+			for mutation in mutationList:
+				frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
+				mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW])
+				mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY])
+				mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA])
+				mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW])
 
-			in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW])
+				in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW])
 
-			if in_how_many_motifs > 0:
-				RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs
-				WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs
-				WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs
-				TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs
+				if in_how_many_motifs > 0:
+					RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs
+					WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs
+					WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs
+					TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs
 
 
-def mean(lst):
-	return (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0
+	def mean(lst):
+		return (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0
 
 
-def median(lst):
-	lst = sorted(lst)
-	l = len(lst)
-	if l == 0:
-		return 0
-	if l == 1:
-		return lst[0]
+	def median(lst):
+		lst = sorted(lst)
+		l = len(lst)
+		if l == 0:
+			return 0
+		if l == 1:
+			return lst[0]
+			
+		l = int(l / 2)
 		
-	l = int(l / 2)
-	
-	if len(lst) % 2 == 0:
-		return float(lst[l] + lst[(l - 1)]) / 2.0
-	else:
-		return lst[l]
-
-funcs = {"mean": mean, "median": median, "sum": sum}
+		if len(lst) % 2 == 0:
+			return float(lst[l] + lst[(l - 1)]) / 2.0
+		else:
+			return lst[l]
 
-directory = outfile[:outfile.rfind("/") + 1]
-value = 0
-valuedic = dict()
+	funcs = {"mean": mean, "median": median, "sum": sum}
 
-for fname in funcs.keys():
-	for gene in genes:
-		with open(directory + gene + "_" + fname + "_value.txt", 'r') as v:
-			valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())
-	with open(directory + "all_" + fname + "_value.txt", 'r') as v:
-		valuedic["total_" + fname] = float(v.readlines()[0].rstrip())
-	
+	directory = outfile[:outfile.rfind("/") + 1]
+	value = 0
+	valuedic = dict()
 
-def get_xyz(lst, gene, f, fname):
-	x = round(round(f(lst), 1))
-	y = valuedic[gene + "_" + fname]
-	z = str(round(x / float(y) * 100, 1)) if y != 0 else "0"
-	return (str(x), str(y), z)
+	for fname in funcs.keys():
+		for gene in genes:
+			with open(directory + gene + "_" + fname + "_value.txt", 'r') as v:
+				valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())
+		with open(directory + "all_" + fname + "_value.txt", 'r') as v:
+			valuedic["total_" + fname] = float(v.readlines()[0].rstrip())
+		
 
-dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
-arr = ["RGYW", "WRCY", "WA", "TW"]
+	def get_xyz(lst, gene, f, fname):
+		x = round(round(f(lst), 1))
+		y = valuedic[gene + "_" + fname]
+		z = str(round(x / float(y) * 100, 1)) if y != 0 else "0"
+		return (str(x), str(y), z)
 
-geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
+	dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
+	arr = ["RGYW", "WRCY", "WA", "TW"]
 
-for fname in funcs.keys():
-	func = funcs[fname]
-	foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
-	with open(foutfile, 'w') as o:
-		for typ in arr:
-			o.write(typ + " (%)")
-			curr = dic[typ]
-			for gene in genes:
-				geneMatcher = geneMatchers[gene]
-				if valuedic[gene + "_" + fname] is 0:
-					o.write(",0,0,0")
-				else:
-					x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
-					o.write("," + x + "," + y + "," + z)
-			x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)
-			#x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
-			o.write("," + x + "," + y + "," + z + "\n")
+	for fname in funcs.keys():
+		func = funcs[fname]
+		foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
+		with open(foutfile, 'w') as o:
+			for typ in arr:
+				o.write(typ + " (%)")
+				curr = dic[typ]
+				for gene in genes:
+					geneMatcher = geneMatchers[gene]
+					if valuedic[gene + "_" + fname] is 0:
+						o.write(",0,0,0")
+					else:
+						x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
+						o.write("," + x + "," + y + "," + z)
+				x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)
+				#x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
+				o.write("," + x + "," + y + "," + z + "\n")
 
 
-# for testing
-seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt"
-with open(seq_motif_file, 'w') as o:
-	o.write("ID\tRGYW\tWRCY\tWA\tTW\n")
-	for ID in IDlist:
-		#o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n")
-		o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n")
+	# for testing
+	seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt"
+	with open(seq_motif_file, 'w') as o:
+		o.write("ID\tRGYW\tWRCY\tWA\tTW\n")
+		for ID in IDlist:
+			#o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n")
+			o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n")
+
+if __name__ == "__main__":
+	main()
--- a/shm_csr.xml	Wed Apr 12 04:28:16 2017 -0400
+++ b/shm_csr.xml	Thu May 04 07:43:09 2017 -0400
@@ -39,6 +39,7 @@
 				<option value="60_55">>60% class and >55% subclass</option>
 				<option value="70_0">>70% class</option>
 				<option value="60_0">>60% class</option>
+                <option value="25_0">>25% class</option>
 				<option value="101_101">Do not assign (sub)class</option>
 			</param>
 		</conditional>
@@ -183,7 +184,8 @@
 
 *Chunk hit percentage*: The percentage of the chunks that is aligned 
 
-*Nt hit percentage*: The percentage of chunks covering the subclass specific nucleotide match with the different subclasses. The most stringent filter for the subclass is 70% ‘nt hit percentage’ which means that 5 out of 7 subclass specific nucleotides for Cα or 6 out of 8 subclass specific nucleotides of Cγ should match with the specific subclass.
+*Nt hit percentage*: The percentage of chunks covering the subclass specific nucleotide match with the different subclasses. The most stringent filter for the subclass is 70% ‘nt hit percentage’ which means that 5 out of 7 subclass specific nucleotides for Cα or 6 out of 8 subclass specific nucleotides of Cγ should match with the specific subclass. 
+The option “>25% class” can be chosen when you only are interested in the class (Cα/Cγ/Cµ/Cɛ) of  your sequences and the length of your sequence is not long enough to assign the subclasses.
 
 -----
 
--- a/wrapper.sh	Wed Apr 12 04:28:16 2017 -0400
+++ b/wrapper.sh	Thu May 04 07:43:09 2017 -0400
@@ -251,7 +251,7 @@
 	echo "---------------- $func table ----------------"
 	echo "---------------- $func table ----------------<br />" >> $log
 	
-	cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt
+	cat $outdir/mutations_${func}.txt $outdir/shm_overview_tandem_row.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt
 	
 	echo "---------------- pattern_plots.r ----------------"
 	echo "---------------- pattern_plots.r ----------------<br />" >> $log
@@ -276,7 +276,7 @@
 
 		while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz cex cey cez unx uny unz allx ally allz 
 		do
-			if [ "$name" == "FR R/S (ratio)" ] || [ "$name" == "CDR R/S (ratio)" ] ; then #meh
+			if [ "$name" == "FR R/S (ratio)" ] || [ "$name" == "CDR R/S (ratio)" ] || [ "$name" == "Tandems/Expected (ratio)" ] ; then #meh
 				echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${cex}/${cey} (${cez})</td><td>${allx}/${ally} (${allz})</td><td>${unx}/${uny} (${unz})</td></tr>" >> $output
 			elif [ "$name" == "Median of Number of Mutations (%)" ] ; then
 				echo "<tr><td>$name</td><td>${caz}%</td><td>${ca1z}%</td><td>${ca2z}%</td><td>${cgz}%</td><td>${cg1z}%</td><td>${cg2z}%</td><td>${cg3z}%</td><td>${cg4z}%</td><td>${cmz}%</td><td>${cez}%</td><td>${allz}%</td><td>${unz}%</td></tr>" >> $output
@@ -665,6 +665,7 @@
 echo "<tr><td>The data used to generate the percentage of mutations in AID and pol eta motives plot</td><td><a href='aid_motives.txt' download='aid_motives.txt' >Download</a></td></tr>" >> $output
 echo "<tr><td>The data used to generate the relative mutation patterns plot</td><td><a href='relative_mutations.txt' download='relative_mutations.txt' >Download</a></td></tr>" >> $output
 echo "<tr><td>The data used to generate the absolute mutation patterns plot</td><td><a href='absolute_mutations.txt' download='absolute_mutations.txt' >Download</a></td></tr>" >> $output
+echo "<tr><td>Data about tandem mutations by ID</td><td><a href='tandems_by_id.txt' download='tandems_by_id.txt' >Download</a></td></tr>" >> $output
 
 echo "<tr><td colspan='2' style='background-color:#E0E0E0;'>SHM Frequency</td></tr>" >> $output
 echo "<tr><td>The data  generate the frequency scatter plot</td><td><a href='scatter.txt' download='scatter.txt' >Download</a></td></tr>" >> $output