Repository 'shm_csr'
hg clone https://toolshed.g2.bx.psu.edu/repos/davidvanzessen/shm_csr

Changeset 47:64711f461c8e (2017-05-04)
Previous changeset 46:cfc9a442e59d (2017-04-12) Next changeset 48:c5295dd10dfc (2017-05-08)
Commit message:
Uploaded
modified:
imgt_loader.r
merge_and_filter.r
shm_csr.py
shm_csr.xml
wrapper.sh
diff -r cfc9a442e59d -r 64711f461c8e imgt_loader.r
--- a/imgt_loader.r Wed Apr 12 04:28:16 2017 -0400
+++ b/imgt_loader.r Thu May 04 07:43:09 2017 -0400
@@ -9,6 +9,22 @@
 aa = read.table(aa.file, sep="\t", header=T, quote="", fill=T)
 junction = read.table(junction.file, sep="\t", header=T, quote="", fill=T)
 
+fix_column_names = function(df){
+    if("V.DOMAIN.Functionality" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
+        print("found V.DOMAIN.Functionality, changed")
+    }
+    if("V.DOMAIN.Functionality.comment" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
+        print("found V.DOMAIN.Functionality.comment, changed")
+    }
+    return(df)
+}
+
+summ = fix_column_names(summ)
+aa = fix_column_names(aa)
+junction = fix_column_names(junction)
+
 old_summary_columns=c('Sequence.ID','JUNCTION.frame','V.GENE.and.allele','D.GENE.and.allele','J.GENE.and.allele','CDR1.IMGT.length','CDR2.IMGT.length','CDR3.IMGT.length','Orientation')
 old_sequence_columns=c('CDR1.IMGT','CDR2.IMGT','CDR3.IMGT')
 old_junction_columns=c('JUNCTION')
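
The fix_column_names helper added in this changeset renames the "V.DOMAIN.Functionality" columns that newer IMGT/HighV-QUEST exports use back to the plain "Functionality" names the rest of the pipeline expects. A minimal R sketch of that renaming on a toy data frame (the data values are made up for illustration, not taken from a real IMGT export):

# Illustrative only: a toy data frame with the newer IMGT column name.
df = data.frame(Sequence.ID=c("seq1", "seq2"),
                V.DOMAIN.Functionality=c("productive", "unproductive"),
                stringsAsFactors=F)
names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"  # what fix_column_names does
names(df)  # "Sequence.ID" "Functionality"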
diff -r cfc9a442e59d -r 64711f461c8e merge_and_filter.r
--- a/merge_and_filter.r Wed Apr 12 04:28:16 2017 -0400
+++ b/merge_and_filter.r Thu May 04 07:43:09 2017 -0400
@@ -26,6 +26,25 @@
 AAs = read.table(aafile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
 gene_identification = read.table(gene_identification_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
 
+fix_column_names = function(df){
+    if("V.DOMAIN.Functionality" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
+        print("found V.DOMAIN.Functionality, changed")
+    }
+    if("V.DOMAIN.Functionality.comment" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
+        print("found V.DOMAIN.Functionality.comment, changed")
+    }
+    return(df)
+}
+
+summ = fix_column_names(summ)
+sequences = fix_column_names(sequences)
+mutationanalysis = fix_column_names(mutationanalysis)
+mutationstats = fix_column_names(mutationstats)
+hotspots = fix_column_names(hotspots)
+AAs = fix_column_names(AAs)
+
 if(method == "blastn"){
  #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
  gene_identification = gene_identification[!duplicated(gene_identification$qseqid),]
@@ -36,8 +55,8 @@
  colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")
 }
 
-print("Summary analysis files columns")
-print(names(summ))
+#print("Summary analysis files columns")
+#print(names(summ))
 
 
 
@@ -75,6 +94,14 @@
 
 filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ)))
 
+if(FALSE){ #to speed up debugging
+    set.seed(1)
+    summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),]
+    print(paste("Number of sequences after sampling 5%:", nrow(summ)))
+
+    filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ)))
+}
+
 print("mutation analysis files columns")
 print(names(mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])]))
 
@@ -82,8 +109,8 @@
 
 print(paste("Number of sequences after merging with mutation analysis file:", nrow(result)))
 
-print("mutation stats files columns")
-print(names(mutationstats[,!(names(mutationstats) %in% names(result)[-1])]))
+#print("mutation stats files columns")
+#print(names(mutationstats[,!(names(mutationstats) %in% names(result)[-1])]))
 
 result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID")
 
@@ -135,10 +162,10 @@
 
 write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T)
 
-print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "")))
-print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "")))
-print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "")))
-print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "")))
+print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T)))
 
 if(empty.region.filter == "leader"){
  result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
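
The na.rm=T added to the empty-region counts above matters because those columns can contain NA after the merges, and in R a comparison against NA yields NA, which then propagates through sum(). A minimal sketch with a toy vector (not project data):

x = c("ACTG", "", NA, "")
sum(x == "")             # NA, because the NA comparison poisons the sum
sum(x == "", na.rm=T)    # 2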
@@ -219,6 +246,8 @@
 # result[i,"past"] = paste(result[i,cls], collapse=":")
 #}
 
+
+
 result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))
 
 result.matched = result[!grepl("unmatched", result$best_match),]
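
The result$past line shown in the context above builds one uniqueness key per sequence by pasting together the columns listed in unique.type, replacing the commented-out row-by-row loop with a single vectorised call. A small sketch of the same pattern (column names and values are chosen for illustration only):

result = data.frame(Sequence.ID=c("s1", "s2"),
                    best_match=c("IGA1", "IGG2"),
                    CDR3.IMGT.seq=c("TGTGCG", "TGTGCC"),
                    stringsAsFactors=F)
unique.type = "best_match,CDR3.IMGT.seq"
result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep=":"))
result$past  # "IGA1:TGTGCG" "IGG2:TGTGCC"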
diff -r cfc9a442e59d -r 64711f461c8e shm_csr.py
--- a/shm_csr.py Wed Apr 12 04:28:16 2017 -0400
+++ b/shm_csr.py Thu May 04 07:43:09 2017 -0400
@@ -1,287 +1,436 @@
-from __future__ import division
+import argparse
+import logging
+import sys
+import os
+import re
+
 from collections import defaultdict
-import re
-import argparse
+
+def main():
+	parser = argparse.ArgumentParser()
+	parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
+	parser.add_argument("--genes", help="The genes available in the 'best_match' column")
+	parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2'])
+	parser.add_argument("--output", help="Output file")
 
-parser = argparse.ArgumentParser()
-parser.add_argument("--input",
-					help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
-parser.add_argument("--genes", help="The genes available in the 'best_match' column")
-parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2'])
-parser.add_argument("--output", help="Output file")
+	args = parser.parse_args()
+
+	infile = args.input
+	genes = str(args.genes).split(",")
+	empty_region_filter = args.empty_region_filter
+	outfile = args.output
 
-args = parser.parse_args()
+	genedic = dict()
 
-infile = args.input
-genes = str(args.genes).split(",")
-empty_region_filter = args.empty_region_filter
-outfile = args.output
-
-genedic = dict()
+	mutationdic = dict()
+	mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
+	NAMatchResult = (None, None, None, None, None, None, '')
+	geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
+	linecount = 0
 
-mutationdic = dict()
-mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
-NAMatchResult = (None, None, None, None, None, None, '')
-linecount = 0
+	IDIndex = 0
+	best_matchIndex = 0
+	fr1Index = 0
+	cdr1Index = 0
+	fr2Index = 0
+	cdr2Index = 0
+	fr3Index = 0
+	first = True
+	IDlist = []
+	mutationList = []
+	mutationListByID = {}
+	cdr1LengthDic = {}
+	cdr2LengthDic = {}
+
+	fr1LengthDict = {}
+	fr2LengthDict = {}
+	fr3LengthDict = {}
+
+	cdr1LengthIndex = 0
+	cdr2LengthIndex = 0
 
-IDIndex = 0
-best_matchIndex = 0
-fr1Index = 0
-cdr1Index = 0
-fr2Index = 0
-cdr2Index = 0
-fr3Index = 0
-first = True
-IDlist = []
-mutationList = []
-mutationListByID = {}
-cdr1LengthDic = {}
-cdr2LengthDic = {}
+	fr1SeqIndex = 0
+	fr2SeqIndex = 0
+	fr3SeqIndex = 0
+
+	tandem_sum_by_class = defaultdict(int)
+	expected_tandem_sum_by_class = defaultdict(float)
 
-with open(infile, 'ru') as i:
-	for line in i:
-		if first:
+	with open(infile, 'ru') as i:
+		for line in i:
+			if first:
+				linesplt = line.split("\t")
+				IDIndex = linesplt.index("Sequence.ID")
+				best_matchIndex = linesplt.index("best_match")
+				fr1Index = linesplt.index("FR1.IMGT")
+				cdr1Index = linesplt.index("CDR1.IMGT")
+				fr2Index = linesplt.index("FR2.IMGT")
+				cdr2Index = linesplt.index("CDR2.IMGT")
+				fr3Index = linesplt.index("FR3.IMGT")
+				cdr1LengthIndex = linesplt.index("CDR1.IMGT.seq")
+				cdr2LengthIndex = linesplt.index("CDR2.IMGT.seq")
+				fr1SeqIndex = linesplt.index("FR1.IMGT.seq")
+				fr2SeqIndex = linesplt.index("FR2.IMGT.seq")
+				fr3SeqIndex = linesplt.index("FR3.IMGT.seq")
+				first = False
+				continue
+			linecount += 1
 			linesplt = line.split("\t")
-			IDIndex = linesplt.index("Sequence.ID")
-			best_matchIndex = linesplt.index("best_match")
-			fr1Index = linesplt.index("FR1.IMGT")
-			cdr1Index = linesplt.index("CDR1.IMGT")
-			fr2Index = linesplt.index("FR2.IMGT")
-			cdr2Index = linesplt.index("CDR2.IMGT")
-			fr3Index = linesplt.index("FR3.IMGT")
-			cdr1LengthIndex = linesplt.index("CDR1.IMGT.length")
-			cdr2LengthIndex = linesplt.index("CDR2.IMGT.length")
-			first = False
-			continue
-		linecount += 1
-		linesplt = line.split("\t")
-		ID = l

[… middle of the shm_csr.py diff is truncated in the changeset view …]

cs.keys():
-	for gene in genes:
-		with open(directory + gene + "_" + fname + "_value.txt", 'r') as v:
-			valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())
-	with open(directory + "all_" + fname + "_value.txt", 'r') as v:
-		valuedic["total_" + fname] = float(v.readlines()[0].rstrip())
-	
+	directory = outfile[:outfile.rfind("/") + 1]
+	value = 0
+	valuedic = dict()
 
-def get_xyz(lst, gene, f, fname):
-	x = round(round(f(lst), 1))
-	y = valuedic[gene + "_" + fname]
-	z = str(round(x / float(y) * 100, 1)) if y != 0 else "0"
-	return (str(x), str(y), z)
+	for fname in funcs.keys():
+		for gene in genes:
+			with open(directory + gene + "_" + fname + "_value.txt", 'r') as v:
+				valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())
+		with open(directory + "all_" + fname + "_value.txt", 'r') as v:
+			valuedic["total_" + fname] = float(v.readlines()[0].rstrip())
+		
 
-dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
-arr = ["RGYW", "WRCY", "WA", "TW"]
+	def get_xyz(lst, gene, f, fname):
+		x = round(round(f(lst), 1))
+		y = valuedic[gene + "_" + fname]
+		z = str(round(x / float(y) * 100, 1)) if y != 0 else "0"
+		return (str(x), str(y), z)
 
-geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
+	dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
+	arr = ["RGYW", "WRCY", "WA", "TW"]
 
-for fname in funcs.keys():
-	func = funcs[fname]
-	foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
-	with open(foutfile, 'w') as o:
-		for typ in arr:
-			o.write(typ + " (%)")
-			curr = dic[typ]
-			for gene in genes:
-				geneMatcher = geneMatchers[gene]
-				if valuedic[gene + "_" + fname] is 0:
-					o.write(",0,0,0")
-				else:
-					x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
-					o.write("," + x + "," + y + "," + z)
-			x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)
-			#x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
-			o.write("," + x + "," + y + "," + z + "\n")
+	for fname in funcs.keys():
+		func = funcs[fname]
+		foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
+		with open(foutfile, 'w') as o:
+			for typ in arr:
+				o.write(typ + " (%)")
+				curr = dic[typ]
+				for gene in genes:
+					geneMatcher = geneMatchers[gene]
+					if valuedic[gene + "_" + fname] is 0:
+						o.write(",0,0,0")
+					else:
+						x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
+						o.write("," + x + "," + y + "," + z)
+				x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)
+				#x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
+				o.write("," + x + "," + y + "," + z + "\n")
 
 
-# for testing
-seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt"
-with open(seq_motif_file, 'w') as o:
-	o.write("ID\tRGYW\tWRCY\tWA\tTW\n")
-	for ID in IDlist:
-		#o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n")
-		o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n")
+	# for testing
+	seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt"
+	with open(seq_motif_file, 'w') as o:
+		o.write("ID\tRGYW\tWRCY\tWA\tTW\n")
+		for ID in IDlist:
+			#o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n")
+			o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n")
+
+if __name__ == "__main__":
+	main()
diff -r cfc9a442e59d -r 64711f461c8e shm_csr.xml
--- a/shm_csr.xml Wed Apr 12 04:28:16 2017 -0400
+++ b/shm_csr.xml Thu May 04 07:43:09 2017 -0400
@@ -39,6 +39,7 @@
  <option value="60_55">>60% class and >55% subclass</option>
  <option value="70_0">>70% class</option>
  <option value="60_0">>60% class</option>
+                <option value="25_0">>25% class</option>
  <option value="101_101">Do not assign (sub)class</option>
  </param>
  </conditional>
@@ -183,7 +184,8 @@
 
 *Chunk hit percentage*: The percentage of the chunks that is aligned 
 
-*Nt hit percentage*: The percentage of chunks covering the subclass specific nucleotide match with the different subclasses. The most stringent filter for the subclass is 70% ‘nt hit percentage’ which means that 5 out of 7 subclass specific nucleotides for Cα or 6 out of 8 subclass specific nucleotides of Cγ should match with the specific subclass.
+*Nt hit percentage*: The percentage of chunks covering the subclass specific nucleotide match with the different subclasses. The most stringent filter for the subclass is 70% ‘nt hit percentage’ which means that 5 out of 7 subclass specific nucleotides for Cα or 6 out of 8 subclass specific nucleotides of Cγ should match with the specific subclass. 
+The option “>25% class” can be chosen when you are only interested in the class (Cα/Cγ/Cµ/Cɛ) of your sequences and your sequences are not long enough to assign the subclasses.
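
As the help text above explains, the 70% ‘nt hit percentage’ filter requires 5 of 7 subclass-specific nucleotides for Cα (5/7 ≈ 71.4%) or 6 of 8 for Cγ (6/8 = 75%) to match. A rough sketch of that arithmetic (illustrative helper, not tool code):

nt_hit_percentage = function(matching, total) 100 * matching / total
nt_hit_percentage(5, 7) >= 70  # TRUE: 5 of 7 Cα nucleotides is ~71.4%
nt_hit_percentage(6, 8) >= 70  # TRUE: 6 of 8 Cγ nucleotides is 75%
nt_hit_percentage(4, 7) >= 70  # FALSE: ~57.1% falls below the subclass filter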
 
 -----
 
diff -r cfc9a442e59d -r 64711f461c8e wrapper.sh
--- a/wrapper.sh Wed Apr 12 04:28:16 2017 -0400
+++ b/wrapper.sh Thu May 04 07:43:09 2017 -0400
@@ -251,7 +251,7 @@
  echo "---------------- $func table ----------------"
  echo "---------------- $func table ----------------<br />" >> $log
 
- cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt
+ cat $outdir/mutations_${func}.txt $outdir/shm_overview_tandem_row.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt
 
  echo "---------------- pattern_plots.r ----------------"
  echo "---------------- pattern_plots.r ----------------<br />" >> $log
@@ -276,7 +276,7 @@
 
  while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz cex cey cez unx uny unz allx ally allz 
  do
- if [ "$name" == "FR R/S (ratio)" ] || [ "$name" == "CDR R/S (ratio)" ] ; then #meh
+ if [ "$name" == "FR R/S (ratio)" ] || [ "$name" == "CDR R/S (ratio)" ] || [ "$name" == "Tandems/Expected (ratio)" ] ; then #meh
  echo "<tr><td>$name</td><td>${cax}/${cay} (${caz})</td><td>${ca1x}/${ca1y} (${ca1z})</td><td>${ca2x}/${ca2y} (${ca2z})</td><td>${cgx}/${cgy} (${cgz})</td><td>${cg1x}/${cg1y} (${cg1z})</td><td>${cg2x}/${cg2y} (${cg2z})</td><td>${cg3x}/${cg3y} (${cg3z})</td><td>${cg4x}/${cg4y} (${cg4z})</td><td>${cmx}/${cmy} (${cmz})</td><td>${cex}/${cey} (${cez})</td><td>${allx}/${ally} (${allz})</td><td>${unx}/${uny} (${unz})</td></tr>" >> $output
  elif [ "$name" == "Median of Number of Mutations (%)" ] ; then
  echo "<tr><td>$name</td><td>${caz}%</td><td>${ca1z}%</td><td>${ca2z}%</td><td>${cgz}%</td><td>${cg1z}%</td><td>${cg2z}%</td><td>${cg3z}%</td><td>${cg4z}%</td><td>${cmz}%</td><td>${cez}%</td><td>${allz}%</td><td>${unz}%</td></tr>" >> $output
@@ -665,6 +665,7 @@
 echo "<tr><td>The data used to generate the percentage of mutations in AID and pol eta motives plot</td><td><a href='aid_motives.txt' download='aid_motives.txt' >Download</a></td></tr>" >> $output
 echo "<tr><td>The data used to generate the relative mutation patterns plot</td><td><a href='relative_mutations.txt' download='relative_mutations.txt' >Download</a></td></tr>" >> $output
 echo "<tr><td>The data used to generate the absolute mutation patterns plot</td><td><a href='absolute_mutations.txt' download='absolute_mutations.txt' >Download</a></td></tr>" >> $output
+echo "<tr><td>Data about tandem mutations by ID</td><td><a href='tandems_by_id.txt' download='tandems_by_id.txt' >Download</a></td></tr>" >> $output
 
 echo "<tr><td colspan='2' style='background-color:#E0E0E0;'>SHM Frequency</td></tr>" >> $output
 echo "<tr><td>The data  generate the frequency scatter plot</td><td><a href='scatter.txt' download='scatter.txt' >Download</a></td></tr>" >> $output