Galaxy |

Changeset 83:729738462297 (2021-09-15)

Previous changeset 82:a103134ee6e0 (2021-02-25) | Next changeset 84:4db34e32dd47 (2021-10-27)

Commit message:
"planemo upload commit c0ffc68aec5836d5b20b543106493056a87edf57"

modified:
LICENSE
README.md
aa_histogram.r
baseline/Baseline_Functions.r
baseline/Baseline_Main.r
baseline/FiveS_Mutability.RData
baseline/FiveS_Substitution.RData
baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
baseline/IMGTVHreferencedataset20161215.fa
baseline/IMGTVHreferencedataset20161215.fasta
baseline/baseline_url.txt
baseline/comparePDFs.r
baseline/filter.r
baseline/script_imgt.py
baseline/script_xlsx.py
baseline/wrapper.sh
change_o/change_o_url.txt
change_o/define_clones.r
change_o/define_clones.sh
change_o/lr.txt
change_o/makedb.sh
change_o/select_first_in_clone.r
check_unique_id.r
datatypes_conf.xml
gene_identification.py
imgt_loader.r
merge.r
merge_and_filter.r
mutation_column_checker.py
naive_output.r
new_imgt.r
pattern_plots.r
plot_pdf.r
sequence_overview.r
shm_clonality.htm
shm_csr.htm
shm_csr.py
shm_csr.r
shm_csr.xml
shm_downloads.htm
shm_first.htm
shm_frequency.htm
shm_overview.htm
shm_selection.htm
shm_transition.htm
style.tar.gz
subclass_definition.db.nhr
subclass_definition.db.nin
subclass_definition.db.nsq
summary_to_fasta.py
wrapper.sh

removed:
.gitattributes
.gitignore

diff -r a103134ee6e0 -r 729738462297 .gitattributes
--- a/.gitattributes Thu Feb 25 10:32:32 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,4 +0,0 @@
-# Auto detect text files and perform LF normalization
-* text=auto
-# Convert to LF line endings on checkout.
-*.sh text eol=lf
\ No newline at end of file

diff -r a103134ee6e0 -r 729738462297 .gitignore
--- a/.gitignore Thu Feb 25 10:32:32 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,4 +0,0 @@
-
-shm_csr\.tar\.gz
-
-\.vscode/settings\.json

diff -r a103134ee6e0 -r 729738462297 CHANGELOG.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/CHANGELOG.md Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,8 @@
+version 1.1.0
+-------------
++ Added changeo as a dependency. Porting to python3 was necessary to achieve
+  this. This will make sure the shm_csr package can be installed on all
+  galaxies.
++ Make sure the wrapper script runs with `set -e -o pipefail` and fails on
+  error.
++ Updated all python scripts to work on python3

diff -r a103134ee6e0 -r 729738462297 LICENSE
--- a/LICENSE Thu Feb 25 10:32:32 2021 +0000
+++ b/LICENSE Wed Sep 15 12:24:06 2021 +0000

@@ -1,6 +1,7 @@
MIT License

-Copyright (c) 2019 david
+Copyright (c) 2019 David van Zessen
+Copyright (c) 2021 Leiden University Medical Center

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

diff -r a103134ee6e0 -r 729738462297 README.md
--- a/README.md Thu Feb 25 10:32:32 2021 +0000
+++ b/README.md Wed Sep 15 12:24:06 2021 +0000

[

@@ -1,13 +1,13 @@
-# SHM CSR
-
-Somatic hypermutation and class switch recombination pipeline.
-The docker version can be found [here](https://github.com/ErasmusMC-Bioinformatics/ARGalaxy-docker).
-
-# Dependencies
---------------------
-[Python 2.7](https://www.python.org/)
-[Change-O](https://changeo.readthedocs.io/en/version-0.4.4/)
-[Baseline](http://selection.med.yale.edu/baseline/)
-[R data.table](https://cran.r-project.org/web/packages/data.table/data.table.pdf)
-[R ggplot2](https://cran.r-project.org/web/packages/ggplot2/ggplot2.pdf)
-[R reshape2](https://cran.r-project.org/web/packages/reshape/reshape.pdf)
+# SHM CSR
+
+Somatic hypermutation and class switch recombination pipeline.
+The docker version can be found [here](https://github.com/ErasmusMC-Bioinformatics/ARGalaxy-docker).
+
+# Dependencies
+--------------------
+[Python 3.7](https://www.python.org/)
+[Change-O](https://changeo.readthedocs.io/en/version-0.4.4/)
+[Baseline](http://selection.med.yale.edu/baseline/)
+[R data.table](https://cran.r-project.org/web/packages/data.table/data.table.pdf)
+[R ggplot2](https://cran.r-project.org/web/packages/ggplot2/ggplot2.pdf)
+[R reshape2](https://cran.r-project.org/web/packages/reshape/reshape.pdf)

diff -r a103134ee6e0 -r 729738462297 aa_histogram.r
--- a/aa_histogram.r Thu Feb 25 10:32:32 2021 +0000
+++ b/aa_histogram.r Wed Sep 15 12:24:06 2021 +0000

[

@@ -1,69 +1,69 @@
-library(ggplot2)
-
-args <- commandArgs(trailingOnly = TRUE)
-
-mutations.by.id.file = args[1]
-absent.aa.by.id.file = args[2]
-genes = strsplit(args[3], ",")[[1]]
-genes = c(genes, "")
-outdir = args[4]
-
-
-print("---------------- read input ----------------")
-
-mutations.by.id = read.table(mutations.by.id.file, sep="\t", fill=T, header=T, quote="")
-absent.aa.by.id = read.table(absent.aa.by.id.file, sep="\t", fill=T, header=T, quote="")
-
-for(gene in genes){
- graph.title = paste(gene, "AA mutation frequency")
- if(gene == ""){
- mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),]
- absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),]
-
- graph.title = "AA mutation frequency all"
- } else {
- mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),]
- absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),]
- }
- print(paste("nrow", gene, nrow(absent.aa.by.id.gene)))
- if(nrow(mutations.by.id.gene) == 0){
- next
- }
-
- mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)])
- aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)])
-
- dat_freq = mutations.at.position / aa.at.position
- dat_freq[is.na(dat_freq)] = 0
- dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
-
-
- print("---------------- plot ----------------")
-
- m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=13, colour="black"))
- m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i)
- m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
- m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")
- m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2")
- m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2")
- m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3")
- m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(graph.title)
- m = m + theme(panel.background = element_rect(fill = "white", colour="black"), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
- #m = m + scale_colour_manual(values=c("black"))
-
- print("---------------- write/print ----------------")
-
-
- dat.sums = data.frame(index=1:length(mutations.at.position), mutations.at.position=mutations.at.position, aa.at.position=aa.at.position)
-
- write.table(dat.sums, paste(outdir, "/aa_histogram_sum_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
- write.table(mutations.by.id.gene, paste(outdir, "/aa_histogram_count_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
- write.table(absent.aa.by.id.gene, paste(outdir, "/aa_histogram_absent_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
- write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
-
- png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720)
- print(m)
- dev.off()
-
- ggsave(paste(outdir, "/aa_histogram_", gene, ".pdf", sep=""), m, width=14, height=7)
-}
+library(ggplot2)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+mutations.by.id.file = args[1]
+absent.aa.by.id.file = args[2]
+genes = strsplit(args[3], ",")[[1]]
+genes = c(genes, "")
+outdir = args[4]
+
+
+print("---------------- read input ----------------")
+
+mutations.by.id = read.table(mutations.by.id.file, sep="\t", fill=T, header=T, quote="")
+absent.aa.by.id = read.table(absent.aa.by.id.file, sep="\t", fill=T, header=T, quote="")
+
+for(gene in genes){
+ graph.title = paste(gene, "AA mutation frequency")
+ if(gene == ""){
+ mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),]
+ absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),]
+
+ graph.title = "AA mutation frequency all"
+ } else {
+ mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),]
+ absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),]
+ }
+ print(paste("nrow", gene, nrow(absent.aa.by.id.gene)))
+ if(nrow(mutations.by.id.gene) == 0){
+ next
+ }
+
+ mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)])
+ aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)])
+
+ dat_freq = mutations.at.position / aa.at.position
+ dat_freq[is.na(dat_freq)] = 0
+ dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
+
+
+ print("---------------- plot ----------------")
+
+ m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=13, colour="black"))
+ m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i)
+ m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
+ m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")
+ m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2")
+ m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2")
+ m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3")
+ m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(graph.title)
+ m = m + theme(panel.background = element_rect(fill = "white", colour="black"), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
+ #m = m + scale_colour_manual(values=c("black"))
+
+ print("---------------- write/print ----------------")
+
+
+ dat.sums = data.frame(index=1:length(mutations.at.position), mutations.at.position=mutations.at.position, aa.at.position=aa.at.position)
+
+ write.table(dat.sums, paste(outdir, "/aa_histogram_sum_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+ write.table(mutations.by.id.gene, paste(outdir, "/aa_histogram_count_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+ write.table(absent.aa.by.id.gene, paste(outdir, "/aa_histogram_absent_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+ write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+
+ png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720)
+ print(m)
+ dev.off()
+
+ ggsave(paste(outdir, "/aa_histogram_", gene, ".pdf", sep=""), m, width=14, height=7)
+}

diff -r a103134ee6e0 -r 729738462297 baseline/Baseline_Functions.r
--- a/baseline/Baseline_Functions.r Thu Feb 25 10:32:32 2021 +0000
+++ b/baseline/Baseline_Functions.r Wed Sep 15 12:24:06 2021 +0000

[

b'@@ -1,2287 +1,2287 @@\n-#########################################################################################\r\n-# License Agreement\r\n-# \r\n-# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE \r\n-# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER \r\n-# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE \r\n-# OR COPYRIGHT LAW IS PROHIBITED.\r\n-# \r\n-# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE \r\n-# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED \r\n-# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN \r\n-# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.\r\n-#\r\n-# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences\r\n-# Coded by: Mohamed Uduman & Gur Yaari\r\n-# Copyright 2012 Kleinstein Lab\r\n-# Version: 1.3 (01/23/2014)\r\n-#########################################################################################\r\n-\r\n-# Global variables \r\n- \r\n- FILTER_BY_MUTATIONS = 1000\r\n-\r\n- # Nucleotides\r\n- NUCLEOTIDES = c("A","C","G","T")\r\n- \r\n- # Amino Acids\r\n- AMINO_ACIDS <- c("F", "F", "L", "L", "S", "S", "S", "S", "Y", "Y", "*", "*", "C", "C", "*", "W", "L", "L", "L", "L", "P", "P", "P", "P", "H", "H", "Q", "Q", "R", "R", "R", "R", "I", "I", "I", "M", "T", "T", "T", "T", "N", "N", "K", "K", "S", "S", "R", "R", "V", "V", "V", "V", "A", "A", "A", "A", "D", "D", "E", "E", "G", "G", "G", "G")\r\n- names(AMINO_ACIDS) <- c("TTT", "TTC", "TTA", "TTG", "TCT", "TCC", "TCA", "TCG", "TAT", "TAC", "TAA", "TAG", "TGT", "TGC", "TGA", "TGG", "CTT", "CTC", "CTA", "CTG", "CCT", "CCC", "CCA", "CCG", "CAT", "CAC", "CAA", "CAG", "CGT", "CGC", "CGA", "CGG", "ATT", "ATC", "ATA", "ATG", "ACT", "ACC", "ACA", "ACG", "AAT", "AAC", "AAA", "AAG", "AGT", "AGC", "AGA", "AGG", "GTT", "GTC", "GTA", "GTG", "GCT", "GCC", "GCA", "GCG", "GAT", "GAC", 
"GAA", "GAG", "GGT", "GGC", "GGA", "GGG")\r\n- names(AMINO_ACIDS) <- names(AMINO_ACIDS)\r\n-\r\n- #Amino Acid Traits\r\n- #"*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V" "W" "Y"\r\n- #B = "Hydrophobic/Burried" N = "Intermediate/Neutral" S="Hydrophilic/Surface") \r\n- TRAITS_AMINO_ACIDS_CHOTHIA98 <- c("*","N","B","S","S","B","N","N","B","S","B","B","S","N","S","S","N","N","B","B","N")\r\n- names(TRAITS_AMINO_ACIDS_CHOTHIA98) <- sort(unique(AMINO_ACIDS))\r\n- TRAITS_AMINO_ACIDS <- array(NA,21)\r\n- \r\n- # Codon Table\r\n- CODON_TABLE <- as.data.frame(matrix(NA,ncol=64,nrow=12))\r\n-\r\n- # Substitution Model: Smith DS et al. 1996\r\n- substitution_Literature_Mouse <- matrix(c(0, 0.156222928, 0.601501588, 0.242275484, 0.172506739, 0, 0.241239892, 0.586253369, 0.54636291, 0.255795364, 0, 0.197841727, 0.290240811, 0.467680608, 0.24207858, 0),nrow=4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))\r\n- substitution_Flu_Human <- matrix(c(0,0.2795596,0.5026927,0.2177477,0.1693210,0,0.3264723,0.5042067,0.4983549,0.3328321,0,0.1688130,0.2021079,0.4696077,0.3282844,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))\r\n- substitution_Flu25_Human <- matrix(c(0,0.2580641,0.5163685,0.2255674,0.1541125,0,0.3210224,0.5248651,0.5239281,0.3101292,0,0.1659427,0.1997207,0.4579444,0.3423350,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))\r\n- load("FiveS_Substitution.RData")\r\n-\r\n- # Mutability Models: Shapiro GS et al. 
2002\r\n- triMutability_Literature_Human <- matrix(c(0.24, 1.2, 0.96, 0.43, 2.14, 2, 1.11, 1.9, 0.85, 1.83, 2.36, 1.31, 0.82, 0.52, 0.89, 1.33, 1.4, 0.82, 1.83, 0.73, 1.83, 1.62, 1.53, 0.57, 0.92, 0.42, 0.42, 1.47, 3.44, 2.58, 1.18, 0.47, 0.39, 1.12, 1.8, 0.68, 0.47, 2.19, 2.35, 2.19, 1.05, 1.84, 1.26, 0.28, 0.98, 2.37, 0.66, 1.58, 0.67, 0.92, 1.76, 0.83, 0.97, 0.56, 0.75, 0.62, 2.26, 0.62, 0.74, 1.11, 1.16, 0.61, 0.88, 0.67, 0.37, 0.07, 1.08, 0.46, 0.31, 0.94, 0.62, 0.57, 0.29, NA, 1.44, 0.46, 0.69, 0.57, 0.24, 0.37, 1.1, 0.99, 1.39, 0.6, 2.26, 1.24, 1.36, 0.52, 0.33, 0.26, 1.25, 0.37, 0.58, 1.03, 1.'..b'se{\n+ facGL <- factor(matInput[,2])\n+ facLevels = levels(facGL)\n+ LisGLs_MutabilityU = lapply(1:length(facLevels), function(x){\n+ computeMutabilities(facLevels[x])\n+ })\n+ facIndex = match(facGL,facLevels)\n+ \n+ LisGLs_Mutability = lapply(1:nrow(matInput), function(x){\n+ cInput = rep(NA,nchar(matInput[x,1]))\n+ cInput[s2c(matInput[x,1])!="N"] = 1\n+ LisGLs_MutabilityU[[facIndex[x]]] * cInput \n+ })\n+ \n+ LisGLs_Targeting = lapply(1:dim(matInput)[1], function(x){\n+ computeTargeting(matInput[x,2],LisGLs_Mutability[[x]])\n+ })\n+ \n+ LisGLs_MutationTypes = lapply(1:length(matInput[,2]),function(x){\n+ #print(x)\n+ computeMutationTypes(matInput[x,2])\n+ })\n+ \n+ LisGLs_R_Exp = lapply(1:nrow(matInput), function(x){\n+ Exp_R <- rollapply(as.zoo(1:readEnd),width=3,by=3,\n+ function(codonNucs){ \n+ RPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="R") \n+ sum( LisGLs_Targeting[[x]][,codonNucs][RPos], na.rm=T ) \n+ }\n+ ) \n+ })\n+ \n+ LisGLs_S_Exp = lapply(1:nrow(matInput), function(x){\n+ Exp_S <- rollapply(as.zoo(1:readEnd),width=3,by=3,\n+ function(codonNucs){ \n+ SPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="S") \n+ sum( LisGLs_Targeting[[x]][,codonNucs][SPos], na.rm=T )\n+ }\n+ ) \n+ }) \n+ \n+ Exp_R = matrix(unlist(LisGLs_R_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) \n+ Exp_S = matrix(unlist(LisGLs_S_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) \n+ 
return( list( "Expected_R"=Exp_R, "Expected_S"=Exp_S) ) \n+ }\n+}\n+\n+# getObservedMutationsByCodon <- function(listMutations){\n+# numbSeqs <- length(listMutations) \n+# obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3))))\n+# obsMu_S <- obsMu_R\n+# temp <- mclapply(1:length(listMutations), function(i){\n+# arrMutations = listMutations[[i]]\n+# RPos = as.numeric(names(arrMutations)[arrMutations=="R"])\n+# RPos <- sapply(RPos,getCodonNumb) \n+# if(any(RPos)){\n+# tabR <- table(RPos)\n+# obsMu_R[i,as.numeric(names(tabR))] <<- tabR\n+# } \n+# \n+# SPos = as.numeric(names(arrMutations)[arrMutations=="S"])\n+# SPos <- sapply(SPos,getCodonNumb)\n+# if(any(SPos)){\n+# tabS <- table(SPos)\n+# obsMu_S[i,names(tabS)] <<- tabS\n+# } \n+# }\n+# )\n+# return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) \n+# }\n+\n+getObservedMutationsByCodon <- function(listMutations){\n+ numbSeqs <- length(listMutations) \n+ obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3))))\n+ obsMu_S <- obsMu_R\n+ temp <- lapply(1:length(listMutations), function(i){\n+ arrMutations = listMutations[[i]]\n+ RPos = as.numeric(names(arrMutations)[arrMutations=="R"])\n+ RPos <- sapply(RPos,getCodonNumb) \n+ if(any(RPos)){\n+ tabR <- table(RPos)\n+ obsMu_R[i,as.numeric(names(tabR))] <<- tabR\n+ } \n+ \n+ SPos = as.numeric(names(arrMutations)[arrMutations=="S"])\n+ SPos <- sapply(SPos,getCodonNumb)\n+ if(any(SPos)){\n+ tabS <- table(SPos)\n+ obsMu_S[i,names(tabS)] <<- tabS\n+ } \n+ }\n+ )\n+ return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) \n+}\n+\n'

diff -r a103134ee6e0 -r 729738462297 baseline/Baseline_Main.r
--- a/baseline/Baseline_Main.r Thu Feb 25 10:32:32 2021 +0000
+++ b/baseline/Baseline_Main.r Wed Sep 15 12:24:06 2021 +0000

[

b'@@ -1,388 +1,388 @@\n-#########################################################################################\r\n-# License Agreement\r\n-# \r\n-# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE \r\n-# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER \r\n-# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE \r\n-# OR COPYRIGHT LAW IS PROHIBITED.\r\n-# \r\n-# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE \r\n-# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED \r\n-# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN \r\n-# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.\r\n-#\r\n-# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences\r\n-# Coded by: Mohamed Uduman & Gur Yaari\r\n-# Copyright 2012 Kleinstein Lab\r\n-# Version: 1.3 (01/23/2014)\r\n-#########################################################################################\r\n-\r\n-op <- options();\r\n-options(showWarnCalls=FALSE, showErrorCalls=FALSE, warn=-1)\r\n-library(\'seqinr\')\r\n-if( F & Sys.info()[1]=="Linux"){\r\n- library("multicore")\r\n-}\r\n-\r\n-# Load functions and initialize global variables\r\n-source("Baseline_Functions.r")\r\n-\r\n-# Initialize parameters with user provided arguments\r\n- arg <- commandArgs(TRUE) \r\n- #arg = c(2,1,5,5,0,1,"1:26:38:55:65:104:116", "test.fasta","","sample")\r\n- #arg = c(1,1,5,5,0,1,"1:38:55:65:104:116:200", "test.fasta","","sample")\r\n- #arg = c(1,1,5,5,1,1,"1:26:38:55:65:104:116", "/home/mu37/Wu/Wu_Cloned_gapped_sequences_D-masked.fasta","/home/mu37/Wu/","Wu")\r\n- testID <- as.numeric(arg[1]) # 1 = Focused, 2 = Local\r\n- species <- as.numeric(arg[2]) # 1 = Human. 2 = Mouse\r\n- substitutionModel <- as.numeric(arg[3]) # 0 = Uniform substitution, 1 = Smith DS et al. 
1996, 5 = FiveS\r\n- mutabilityModel <- as.numeric(arg[4]) # 0 = Uniform mutablity, 1 = Tri-nucleotide (Shapiro GS et al. 2002) , 5 = FiveS\r\n- clonal <- as.numeric(arg[5]) # 0 = Independent sequences, 1 = Clonally related, 2 = Clonally related & only non-terminal mutations\r\n- fixIndels <- as.numeric(arg[6]) # 0 = Do nothing, 1 = Try and fix Indels\r\n- region <- as.numeric(strsplit(arg[7],":")[[1]]) # StartPos:LastNucleotideF1:C1:F2:C2:F3:C3\r\n- inputFilePath <- arg[8] # Full path to input file\r\n- outputPath <- arg[9] # Full path to location of output files\r\n- outputID <- arg[10] # ID for session output \r\n- \r\n-\r\n- if(testID==5){\r\n- traitChangeModel <- 1\r\n- if( !is.na(any(arg[11])) ) traitChangeModel <- as.numeric(arg[11]) # 1 <- Chothia 1998\r\n- initializeTraitChange(traitChangeModel) \r\n- }\r\n- \r\n-# Initialize other parameters/variables\r\n- \r\n- # Initialzie the codon table ( definitions of R/S )\r\n- computeCodonTable(testID) \r\n-\r\n- # Initialize \r\n- # Test Name\r\n- testName<-"Focused"\r\n- if(testID==2) testName<-"Local"\r\n- if(testID==3) testName<-"Imbalanced" \r\n- if(testID==4) testName<-"ImbalancedSilent" \r\n- \r\n- # Indel placeholders initialization\r\n- indelPos <- NULL\r\n- delPos <- NULL\r\n- insPos <- NULL\r\n-\r\n- # Initialize in Tranistion & Mutability matrixes\r\n- substitution <- initializeSubstitutionMatrix(substitutionModel,species)\r\n- mutability <- initializeMutabilityMatrix(mutabilityModel,species)\r\n- \r\n- # FWR/CDR boundaries\r\n- flagTrim <- F\r\n- if( is.na(region[7])){\r\n- flagTrim <- T\r\n- region[7]<-region[6]\r\n- }\r\n- readStart = min(region,na.rm=T)\r\n- readEnd = max(region,na.rm=T)\r\n- if(readStart>1){\r\n- region = region - (readStart - 1)\r\n- }\r\n- region_Nuc = c( (region[1]*3-2) , (region[2:7]*3) )\r\n- region_Cod = region\r\n- \r\n- readStart = (readStart*3)-2\r\n- readEnd = (readEnd*3)\r\n- \r\n- FWR_Nuc <- c( rep(TRUE,(region_Nuc[2])),\r\n- 
'..b'fwr[G,],simgaP_groups_cdr[G],simgaP_groups_fwr[G])\n+ listPDFs[[rowNumb]] = list("CDR"=bayesPDF_groups_cdr[[G]],"FWR"=bayesPDF_groups_fwr[[G]])\n+ names(listPDFs)[rowNumb] = names(groups[groups==paste(G)])[1]\n+ #if(names(groups)[which(groups==G)[1]]!="All sequences combined"){\n+ gs = unique(germlines[groups==G])\n+ rowNumb = rowNumb+1\n+ if( !is.na(gs) ){\n+ for( g in gs ){\n+ matOutput[rowNumb,c(1,2,11:18)] = c("Germline",names(germlines)[germlines==g][1],bayes_germlines_cdr[g,],bayes_germlines_fwr[g,],simgaP_germlines_cdr[g],simgaP_germlines_fwr[g])\n+ listPDFs[[rowNumb]] = list("CDR"=bayesPDF_germlines_cdr[[g]],"FWR"=bayesPDF_germlines_fwr[[g]])\n+ names(listPDFs)[rowNumb] = names(germlines[germlines==paste(g)])[1]\n+ rowNumb = rowNumb+1\n+ indexesOfInterest = which(germlines==g)\n+ numbSeqsOfInterest = length(indexesOfInterest)\n+ rowNumb = seq(rowNumb,rowNumb+(numbSeqsOfInterest-1))\n+ matOutput[rowNumb,] = matrix( c( rep("Sequence",numbSeqsOfInterest),\n+ rownames(matInput)[indexesOfInterest],\n+ c(matMutationInfo[indexesOfInterest,1:4]),\n+ c(matMutationInfo[indexesOfInterest,5:8]),\n+ c(bayes_cdr[indexesOfInterest,]),\n+ c(bayes_fwr[indexesOfInterest,]),\n+ c(simgaP_cdr[indexesOfInterest]),\n+ c(simgaP_fwr[indexesOfInterest]) \n+ ), ncol=18, nrow=numbSeqsOfInterest,byrow=F)\n+ increment=0\n+ for( ioi in indexesOfInterest){\n+ listPDFs[[min(rowNumb)+increment]] = list("CDR"=bayesPDF_cdr[[ioi]] , "FWR"=bayesPDF_fwr[[ioi]])\n+ names(listPDFs)[min(rowNumb)+increment] = rownames(matInput)[ioi]\n+ increment = increment + 1\n+ }\n+ rowNumb=max(rowNumb)+1\n+\n+ }\n+ }\n+ }\n+ colsToFormat = 11:18\n+ matOutput[,colsToFormat] = formatC( matrix(as.numeric(matOutput[,colsToFormat]), nrow=nrow(matOutput), ncol=length(colsToFormat)) , digits=3)\n+ matOutput[matOutput== " NaN"] = NA\n+ \n+ \n+ \n+ colnames(matOutput) = c("Type", "ID", "Observed_CDR_R", "Observed_CDR_S", "Observed_FWR_R", "Observed_FWR_S",\n+ "Expected_CDR_R", "Expected_CDR_S", "Expected_FWR_R", 
"Expected_FWR_S",\n+ paste( rep(testName,6), rep(c("Sigma","CIlower","CIupper"),2),rep(c("CDR","FWR"),each=3), sep="_"),\n+ paste( rep(testName,2), rep("P",2),c("CDR","FWR"), sep="_")\n+ )\n+ fileName = paste(outputPath,outputID,".txt",sep="")\n+ write.table(matOutput,file=fileName,quote=F,sep="\\t",row.names=T,col.names=NA)\n+ fileName = paste(outputPath,outputID,".RData",sep="")\n+ save(listPDFs,file=fileName)\n+\n+indelWarning = FALSE\n+if(sum(indelPos)>0){\n+ indelWarning = "Warning: The following sequences have either gaps and/or deletions, and have been ommited from the analysis.";\n+ indelWarning = paste( indelWarning , "<UL>", sep="" )\n+ for(indels in names(indelPos)[indelPos]){\n+ indelWarning = paste( indelWarning , "<LI>", indels, "</LI>", sep="" )\n+ }\n+ indelWarning = paste( indelWarning , "</UL>", sep="" )\n+}\n+\n+cloneWarning = FALSE\n+if(clonal==1){\n+ if(sum(matInputErrors)>0){\n+ cloneWarning = "Warning: The following clones have sequences of unequal length.";\n+ cloneWarning = paste( cloneWarning , "<UL>", sep="" )\n+ for(clone in names(matInputErrors)[matInputErrors]){\n+ cloneWarning = paste( cloneWarning , "<LI>", names(germlines)[as.numeric(clone)], "</LI>", sep="" )\n+ }\n+ cloneWarning = paste( cloneWarning , "</UL>", sep="" )\n+ }\n+}\n+cat(paste("Success",outputID,indelWarning,cloneWarning,sep="|"))\n'

diff -r a103134ee6e0 -r 729738462297 baseline/comparePDFs.r
--- a/baseline/comparePDFs.r Thu Feb 25 10:32:32 2021 +0000
+++ b/baseline/comparePDFs.r Wed Sep 15 12:24:06 2021 +0000

[

b'@@ -1,225 +1,225 @@\n-options("warn"=-1)\r\n-\r\n-#from http://selection.med.yale.edu/baseline/Archive/Baseline%20Version%201.3/Baseline_Functions_Version1.3.r\r\n-# Compute p-value of two distributions\r\n-compareTwoDistsFaster <-function(sigma_S=seq(-20,20,length.out=4001), N=10000, dens1=runif(4001,0,1), dens2=runif(4001,0,1)){\r\n-#print(c(length(dens1),length(dens2)))\r\n-if(length(dens1)>1 & length(dens2)>1 ){\r\n-\tdens1<-dens1/sum(dens1)\r\n-\tdens2<-dens2/sum(dens2)\r\n-\tcum2 <- cumsum(dens2)-dens2/2\r\n-\ttmp<- sum(sapply(1:length(dens1),function(i)return(dens1[i]*cum2[i])))\r\n-\t#print(tmp)\r\n-\tif(tmp>0.5)tmp<-tmp-1\r\n-\treturn( tmp )\r\n-\t}\r\n-\telse {\r\n-\treturn(NA)\r\n-\t}\r\n-\t#return (sum(sapply(1:N,function(i)(sample(sigma_S,1,prob=dens1)>sample(sigma_S,1,prob=dens2))))/N)\r\n-} \r\n-\r\n-\r\n-require("grid")\r\n-arg <- commandArgs(TRUE)\r\n-#arg <- c("300143","4","5")\r\n-arg[!arg=="clonal"]\r\n-input <- arg[1]\r\n-output <- arg[2]\r\n-rowIDs <- as.numeric( sapply(arg[3:(max(3,length(arg)))],function(x){ gsub("chkbx","",x) } ) )\r\n-\r\n-numbSeqs = length(rowIDs)\r\n-\r\n-if ( is.na(rowIDs[1]) | numbSeqs>10 ) {\r\n- stop( paste("Error: Please select between one and 10 seqeunces to compare.") )\r\n-}\r\n-\r\n-#load( paste("output/",sessionID,".RData",sep="") )\r\n-load( input )\r\n-#input\r\n-\r\n-xMarks = seq(-20,20,length.out=4001)\r\n-\r\n-plot_grid_s<-function(pdf1,pdf2,Sample=100,cex=1,xlim=NULL,xMarks = seq(-20,20,length.out=4001)){\r\n- yMax = max(c(abs(as.numeric(unlist(listPDFs[pdf1]))),abs(as.numeric(unlist(listPDFs[pdf2]))),0),na.rm=T) * 1.1\r\n-\r\n- if(length(xlim==2)){\r\n- xMin=xlim[1]\r\n- xMax=xlim[2]\r\n- } else {\r\n- xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]\r\n- xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]\r\n- xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]\r\n- xMax_FWR = 
xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]\r\n- \r\n- xMin_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][1]\r\n- xMin_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][1]\r\n- xMax_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001])]\r\n- xMax_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001])]\r\n- \r\n- xMin=min(c(xMin_CDR,xMin_FWR,xMin_CDR2,xMin_FWR2,0),na.rm=TRUE)\r\n- xMax=max(c(xMax_CDR,xMax_FWR,xMax_CDR2,xMax_FWR2,0),na.rm=TRUE)\r\n- }\r\n-\r\n- sigma<-approx(xMarks,xout=seq(xMin,xMax,length.out=Sample))$x\r\n- grid.rect(gp = gpar(col=gray(0.6),fill="white",cex=cex))\r\n- x <- sigma\r\n- pushViewport(viewport(x=0.175,y=0.175,width=0.825,height=0.825,just=c("left","bottom"),default.units="npc"))\r\n- #pushViewport(plotViewport(c(1.8, 1.8, 0.25, 0.25)*cex))\r\n- pushViewport(dataViewport(x, c(yMax,-yMax),gp = gpar(cex=cex),extension=c(0.05)))\r\n- grid.polygon(c(0,0,1,1),c(0,0.5,0.5,0),gp=gpar(col=grey(0.95),fill=grey(0.95)),default.units="npc")\r\n- grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.9),fill=grey(0.9)),default.units="npc")\r\n- grid.rect()\r\n- grid.xaxis(gp = gpar(cex=cex/1.1))\r\n- yticks = pretty(c(-yMax,yMax),8)\r\n- yticks = yticks[yticks>(-yMax) & yticks<(yMax)]\r\n- grid.yaxis(at=yticks,label=abs(yticks),gp = gpar(cex=cex/1.1))\r\n- if(length(listPDFs[pdf1][[1]][["CDR"]])>1){\r\n- ycdr<-approx(xMarks,listPDFs[pdf1][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y\r\n- grid.lines(unit(x,"native"), unit(ycdr,"native"),gp=gpar(col=2,lwd=2))\r\n- }\r\n- if(length(listPDFs[pdf1][[1]][["FWR"]])>1){\r\n- yfwr<-approx(xMarks,listPDFs[pdf1][[1]][["FWR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y\r\n- grid.lines(unit(x,"native"), unit(-yfwr,"native"),gp=gpar(col=4,lwd=2))\r\n- }\r\n-\r\n- if(length(listPDFs[pdf2][[1]][["CDR"]])>1){\r\n- 
ycdr2<-approx(xMarks,listPDFs[pdf2][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y\r\n- grid.lines(unit(x,"native"), unit(ycdr2,"native"),gp=gpar(col=2,l'..b'= gpar(cex=cex))\n+ grid.text(formatC(as.numeric(pCDR1FWR2),digits=3), x = unit(0.75, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))\n+ grid.text(formatC(as.numeric(pCDR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))\n+ grid.text(formatC(as.numeric(pFWR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.25, "npc"),just=c("center", "center"),gp = gpar(cex=cex))\n+ \n+ \n+ # grid.text(paste("P = ",formatC(pCDRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.98, "npc"),just=c("center", "top"),gp = gpar(cex=cex))\n+ # grid.text(paste("P = ",formatC(pFWRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.02, "npc"),just=c("center", "bottom"),gp = gpar(cex=cex))\n+ }\n+ else{\n+ }\n+}\n+\n+\n+##################################################################################\n+################## The whole OCD\'s matrix ########################################\n+##################################################################################\n+\n+#pdf(width=4*numbSeqs+1/3,height=4*numbSeqs+1/3)\n+pdf( output ,width=4*numbSeqs+1/3,height=4*numbSeqs+1/3) \n+\n+pushViewport(viewport(x=0.02,y=0.02,just = c("left", "bottom"),w =0.96,height=0.96,layout = grid.layout(numbSeqs+1,numbSeqs+1,widths=unit.c(unit(rep(1,numbSeqs),"null"),unit(4,"lines")),heights=unit.c(unit(4,"lines"),unit(rep(1,numbSeqs),"null")))))\n+\n+for( seqOne in 1:numbSeqs+1){\n+ pushViewport(viewport(layout.pos.col = seqOne-1, layout.pos.row = 1))\n+ if(seqOne>2){ \n+ grid.polygon(c(0,0,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")\n+ grid.polygon(c(1,1,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")\n+ 
grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.5)),default.units="npc")\n+ \n+ grid.text(y=.25,x=0.75,"FWR",gp = gpar(cex=1.5),just="center")\n+ grid.text(y=.25,x=0.25,"CDR",gp = gpar(cex=1.5),just="center")\n+ }\n+ grid.rect(gp = gpar(col=grey(0.9)))\n+ grid.text(y=.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),just="center")\n+ popViewport(1)\n+}\n+\n+for( seqOne in 1:numbSeqs+1){\n+ pushViewport(viewport(layout.pos.row = seqOne, layout.pos.col = numbSeqs+1))\n+ if(seqOne<=numbSeqs){ \n+ grid.polygon(c(0,0.5,0.5,0),c(0,0,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")\n+ grid.polygon(c(0,0.5,0.5,0),c(1,1,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")\n+ grid.polygon(c(1,0.5,0.5,1),c(0,0,1,1),gp=gpar(col=grey(0.5)),default.units="npc")\n+ grid.text(x=.25,y=0.75,"CDR",gp = gpar(cex=1.5),just="center",rot=270)\n+ grid.text(x=.25,y=0.25,"FWR",gp = gpar(cex=1.5),just="center",rot=270)\n+ }\n+ grid.rect(gp = gpar(col=grey(0.9)))\n+ grid.text(x=0.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),rot=270,just="center")\n+ popViewport(1)\n+}\n+\n+for( seqOne in 1:numbSeqs+1){\n+ for(seqTwo in 1:numbSeqs+1){\n+ pushViewport(viewport(layout.pos.col = seqTwo-1, layout.pos.row = seqOne))\n+ if(seqTwo>seqOne){\n+ plot_pvals(rowIDs[seqOne-1],rowIDs[seqTwo-1],cex=2)\n+ grid.rect()\n+ } \n+ popViewport(1)\n+ }\n+}\n+ \n+\n+xMin=0\n+xMax=0.01\n+for(pdf1 in rowIDs){\n+ xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]\n+ xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]\n+ xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]\n+ xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]\n+ xMin=min(c(xMin_CDR,xMin_FWR,xMin),na.rm=TRUE)\n+ xMax=max(c(xMax_CDR,xMax_FWR,xMax),na.rm=TRUE)\n+}\n+\n+\n+\n+for(i in 1:numbSeqs+1){\n+ for(j in (i-1):numbSeqs){ \n+ 
pushViewport(viewport(layout.pos.col = i-1, layout.pos.row = j+1))\n+ grid.rect()\n+ plot_grid_s(rowIDs[i-1],rowIDs[j],cex=1)\n+ popViewport(1)\n+ }\n+}\n+\n+dev.off() \n+\n+cat("Success", paste(rowIDs,collapse="_"),sep=":")\n+\n'

diff -r a103134ee6e0 -r 729738462297 baseline/script_imgt.py
--- a/baseline/script_imgt.py Thu Feb 25 10:32:32 2021 +0000
+++ b/baseline/script_imgt.py Wed Sep 15 12:24:06 2021 +0000

[

@@ -1,86 +1,86 @@
-#import xlrd #avoid dep
-import argparse
-import re
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
-parser.add_argument("--ref", help="Reference file")
-parser.add_argument("--output", help="Output file")
-parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
-
-args = parser.parse_args()
-
-print "script_imgt.py"
-print "input:", args.input
-print "ref:", args.ref
-print "output:", args.output
-print "id:", args.id
-
-refdic = dict()
-with open(args.ref, 'rU') as ref:
- currentSeq = ""
- currentId = ""
- for line in ref:
- if line.startswith(">"):
- if currentSeq is not "" and currentId is not "":
- refdic[currentId[1:]] = currentSeq
- currentId = line.rstrip()
- currentSeq = ""
- else:
- currentSeq += line.rstrip()
- refdic[currentId[1:]] = currentSeq
-
-print "Have", str(len(refdic)), "reference sequences"
-
-vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
-# r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
-# r"(IGKV[0-3]D?-[0-9]{1,2})",
-# r"(IGLV[0-9]-[0-9]{1,2})",
-# r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
-# r"(TRGV[234589])",
-# r"(TRDV[1-3])"]
-
-#vPattern = re.compile(r"|".join(vPattern))
-vPattern = re.compile("|".join(vPattern))
-
-def filterGene(s, pattern):
-    if type(s) is not str:
-        return None
-    res = pattern.search(s)
-    if res:
-        return res.group(0)
-    return None
-
-
-
-currentSeq = ""
-currentId = ""
-first=True
-with open(args.input, 'r') as i:
- with open(args.output, 'a') as o:
- o.write(">>>" + args.id + "\n")
- outputdic = dict()
- for line in i:
- if first:
- first = False
- continue
- linesplt = line.split("\t")
- ref = filterGene(linesplt[1], vPattern)
- if not ref or not linesplt[2].rstrip():
- continue
- if ref in outputdic:
- outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
- else:
- outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
- #print outputdic
-
- for k in outputdic.keys():
- if k in refdic:
- o.write(">>" + k + "\n")
- o.write(refdic[k] + "\n")
- for seq in outputdic[k]:
- #print seq
- o.write(">" + seq[0] + "\n")
- o.write(seq[1] + "\n")
- else:
- print k + " not in reference, skipping " + k
+#import xlrd #avoid dep
+import argparse
+import re
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+parser.add_argument("--ref", help="Reference file")
+parser.add_argument("--output", help="Output file")
+parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
+
+args = parser.parse_args()
+
+print("script_imgt.py")
+print("input:", args.input)
+print("ref:", args.ref)
+print("output:", args.output)
+print("id:", args.id)
+
+refdic = dict()
+with open(args.ref, 'rU') as ref:
+ currentSeq = ""
+ currentId = ""
+ for line in ref:
+ if line.startswith(">"):
+ if currentSeq is not "" and currentId is not "":
+ refdic[currentId[1:]] = currentSeq
+ currentId = line.rstrip()
+ currentSeq = ""
+ else:
+ currentSeq += line.rstrip()
+ refdic[currentId[1:]] = currentSeq
+
+print("Have", str(len(refdic)), "reference sequences")
+
+vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
+# r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
+# r"(IGKV[0-3]D?-[0-9]{1,2})",
+# r"(IGLV[0-9]-[0-9]{1,2})",
+# r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
+# r"(TRGV[234589])",
+# r"(TRDV[1-3])"]
+
+#vPattern = re.compile(r"|".join(vPattern))
+vPattern = re.compile("|".join(vPattern))
+
+def filterGene(s, pattern):
+    if type(s) is not str:
+        return None
+    res = pattern.search(s)
+    if res:
+        return res.group(0)
+    return None
+
+
+
+currentSeq = ""
+currentId = ""
+first=True
+with open(args.input, 'r') as i:
+ with open(args.output, 'a') as o:
+ o.write(">>>" + args.id + "\n")
+ outputdic = dict()
+ for line in i:
+ if first:
+ first = False
+ continue
+ linesplt = line.split("\t")
+ ref = filterGene(linesplt[1], vPattern)
+ if not ref or not linesplt[2].rstrip():
+ continue
+ if ref in outputdic:
+ outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+ else:
+ outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+ #print outputdic
+
+ for k in list(outputdic.keys()):
+ if k in refdic:
+ o.write(">>" + k + "\n")
+ o.write(refdic[k] + "\n")
+ for seq in outputdic[k]:
+ #print seq
+ o.write(">" + seq[0] + "\n")
+ o.write(seq[1] + "\n")
+ else:
+ print(k + " not in reference, skipping " + k)

diff -r a103134ee6e0 -r 729738462297 baseline/script_xlsx.py
--- a/baseline/script_xlsx.py Thu Feb 25 10:32:32 2021 +0000
+++ b/baseline/script_xlsx.py Wed Sep 15 12:24:06 2021 +0000

[

@@ -1,58 +1,58 @@
-import xlrd
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
-parser.add_argument("--ref", help="Reference file")
-parser.add_argument("--output", help="Output file")
-
-args = parser.parse_args()
-
-gene_column = 6
-id_column = 7
-seq_column = 8
-LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
-
-
-refdic = dict()
-with open(args.ref, 'r') as ref:
- currentSeq = ""
- currentId = ""
- for line in ref.readlines():
- if line[0] is ">":
- if currentSeq is not "" and currentId is not "":
- refdic[currentId[1:]] = currentSeq
- currentId = line.rstrip()
- currentSeq = ""
- else:
- currentSeq += line.rstrip()
- refdic[currentId[1:]] = currentSeq
-
-currentSeq = ""
-currentId = ""
-with xlrd.open_workbook(args.input, 'r') as wb:
- with open(args.output, 'a') as o:
- for sheet in wb.sheets():
- if sheet.cell(1,gene_column).value.find("IGHV") < 0:
- print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name
- continue
- o.write(">>>" + sheet.name + "\n")
- outputdic = dict()
- for rowindex in range(1, sheet.nrows):
- ref = sheet.cell(rowindex, gene_column).value.replace(">", "")
- if ref in outputdic:
- outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
- else:
- outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
- #print outputdic
-
- for k in outputdic.keys():
- if k in refdic:
- o.write(">>" + k + "\n")
- o.write(refdic[k] + "\n")
- for seq in outputdic[k]:
- #print seq
- o.write(">" + seq[0] + "\n")
- o.write(seq[1] + "\n")
- else:
- print k + " not in reference, skipping " + k
+import xlrd
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+parser.add_argument("--ref", help="Reference file")
+parser.add_argument("--output", help="Output file")
+
+args = parser.parse_args()
+
+gene_column = 6
+id_column = 7
+seq_column = 8
+LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
+
+
+refdic = dict()
+with open(args.ref, 'r') as ref:
+ currentSeq = ""
+ currentId = ""
+ for line in ref.readlines():
+ if line[0] is ">":
+ if currentSeq is not "" and currentId is not "":
+ refdic[currentId[1:]] = currentSeq
+ currentId = line.rstrip()
+ currentSeq = ""
+ else:
+ currentSeq += line.rstrip()
+ refdic[currentId[1:]] = currentSeq
+
+currentSeq = ""
+currentId = ""
+with xlrd.open_workbook(args.input, 'r') as wb:
+ with open(args.output, 'a') as o:
+ for sheet in wb.sheets():
+ if sheet.cell(1,gene_column).value.find("IGHV") < 0:
+ print("Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name)
+ continue
+ o.write(">>>" + sheet.name + "\n")
+ outputdic = dict()
+ for rowindex in range(1, sheet.nrows):
+ ref = sheet.cell(rowindex, gene_column).value.replace(">", "")
+ if ref in outputdic:
+ outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
+ else:
+ outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
+ #print outputdic
+
+ for k in list(outputdic.keys()):
+ if k in refdic:
+ o.write(">>" + k + "\n")
+ o.write(refdic[k] + "\n")
+ for seq in outputdic[k]:
+ #print seq
+ o.write(">" + seq[0] + "\n")
+ o.write(seq[1] + "\n")
+ else:
+ print(k + " not in reference, skipping " + k)

diff -r a103134ee6e0 -r 729738462297 conda_environment.yml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conda_environment.yml Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,23 @@
+name: shm_csr
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - python=3.7
+  - changeo=0.4.4
+  - biopython=1.72  # Higher versions break changeo
+  - unzip=6.0
+  - bash=4.4.18
+  - tar=1.34
+  - xlrd=1.2.0
+  - r-ggplot2=3.0.0
+  - r-reshape2=1.4.3
+  - r-scales=0.5.0
+  - r-seqinr=3.4_5
+  - r-data.table=1.11.4
+  - file=5.39
+  # Test dependencies below
+  - pytest
+  # Add planemo so tool can be uploaded
+  - planemo

diff -r a103134ee6e0 -r 729738462297 gene_identification.py
--- a/gene_identification.py Thu Feb 25 10:32:32 2021 +0000
+++ b/gene_identification.py Wed Sep 15 12:24:06 2021 +0000

[

b'@@ -23,23 +23,23 @@\n seqIndex = 0\n \n with open(infile, \'r\') as f: #read all sequences into a dictionary as key = ID, value = sequence\n-\tfor line in f:\n-\t\ttotal += 1\n-\t\tlinesplt = line.split("\\t")\n-\t\tif first:\n-\t\t\tprint "linesplt", linesplt\n-\t\t\tIDIndex = linesplt.index("Sequence ID")\n-\t\t\tseqIndex = linesplt.index("Sequence")\n-\t\t\tfirst = False\n-\t\t\tcontinue\n-\t\t\n-\t\tID = linesplt[IDIndex]\n-\t\tif len(linesplt) < 28: #weird rows without a sequence\n-\t\t\tdic[ID] = ""\n-\t\telse:\n-\t\t\tdic[ID] = linesplt[seqIndex]\n-\t\t\t\n-print "Number of input sequences:", len(dic)\n+ for line in f:\n+ total += 1\n+ linesplt = line.split("\\t")\n+ if first:\n+ print("linesplt", linesplt)\n+ IDIndex = linesplt.index("Sequence ID")\n+ seqIndex = linesplt.index("Sequence")\n+ first = False\n+ continue\n+ \n+ ID = linesplt[IDIndex]\n+ if len(linesplt) < 28: #weird rows without a sequence\n+ dic[ID] = ""\n+ else:\n+ dic[ID] = linesplt[seqIndex]\n+ \n+print("Number of input sequences:", len(dic))\n \n #old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc\n #old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag\n@@ -73,13 +73,13 @@\n chunklength = 8\n \n #create the chunks of the reference sequence with regular expressions for the variable nucleotides\n-for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength / 2):\n+for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength // 2):\n pos = i\n chunk = searchstrings["ca"][i:i+chunklength]\n result = ""\n varsInResult = 0\n for c in chunk:\n- if pos in ca1.keys():\n+ if pos in list(ca1.keys()):\n varsInResult += 1\n result += "[" + ca1[pos] + ca2[pos] + "]"\n else:\n@@ -87,13 +87,13 @@\n pos += 1\n compiledregex["ca"].append((re.compile(result), varsInResult))\n \n-for i in range(0, len(searchstrings["cg"]) - chunklength, chunklength / 2):\n+for i in 
range(0, len(searchstrings["cg"]) - chunklength, chunklength // 2):\n pos = i\n chunk = searchstrings["cg"][i:i+chunklength]\n result = ""\n varsInResult = 0\n for c in chunk:\n- if pos in cg1.keys():\n+ if pos in list(cg1.keys()):\n varsInResult += 1\n result += "[" + "".join(set([cg1[pos], cg2[pos], cg3[pos], cg4[pos]])) + "]"\n else:\n@@ -101,10 +101,10 @@\n pos += 1\n compiledregex["cg"].append((re.compile(result), varsInResult))\n \n-for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2):\n+for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength // 2):\n compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False))\n \n-for i in range(0, len(searchstrings["ce"]) - chunklength + 1, chunklength / 2):\n+for i in range(0, len(searchstrings["ce"]) - chunklength + 1, chunklength // 2):\n compiledregex["ce"].append((re.compile(searchstrings["ce"][i:i+chunklength]), False))\n \n def removeAndReturnMaxIndex(x): #simplifies a list comprehension\n@@ -117,108 +117,108 @@\n start_location = dict()\n hits = dict()\n alltotal = 0\n-for key in compiledregex.keys(): #for ca/cg/cm/ce\n-\tregularexpressions = compiledregex[key] #get the compiled regular expressions\n-\tfor ID in dic.keys()[0:]: #for every ID\n-\t\tif ID not in hits.keys(): #ensure that the dictionairy that keeps track of the hits for every gene exists\n-\t\t\thits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ce_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0}\n-\t\tcurrentIDHits = hits[ID]\n-\t\tseq = dic[ID]\n-\t\tlastindex = 0\n-\t\tstart_zero = len(searchstrings[key]) #allows the reference sequence to start before search sequence (start_locations of < 0)\n-\t\tstart = [0] * (len(seq) + start_zero)\n-\t\tfor i, regexp in enumerate(regularexpressions): #for every regular expression\n-\t\t\trelativeStartLocation = lastindex - (chunklength / 2) * i\n-\t\t\tif relativeStartLocation >= l'..b'ne\n-\t\t\t\t\to.write(ID + "\\tIGG4\\t" 
+ str(round_int(cg4hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n-\t\t\telse: #its a cm or ce gene\n-\t\t\t\tif cmhits >= cehits:\n-\t\t\t\t\to.write(ID + "\\tIGM\\t100\\t" + str(round_int(cmhits / possiblecm * 100)) + "\\t" + start_location[ID + "_cm"] + "\\n")\n-\t\t\t\telse:\n-\t\t\t\t\to.write(ID + "\\tIGE\\t100\\t" + str(round_int(cehits / possiblece * 100)) + "\\t" + start_location[ID + "_ce"] + "\\n")\n-\t\t\tseq_write_count += 1\n+ with open(output, \'w\') as o:\n+ for line in f:\n+ total += 1\n+ if first:\n+ o.write("Sequence ID\\tbest_match\\tnt_hit_percentage\\tchunk_hit_percentage\\tstart_locations\\n")\n+ first = False\n+ continue\n+ linesplt = line.split("\\t")\n+ if linesplt[2] == "No results":\n+ pass\n+ ID = linesplt[1]\n+ currentIDHits = hits[ID]\n+ possibleca = float(len(compiledregex["ca"]))\n+ possiblecg = float(len(compiledregex["cg"]))\n+ possiblecm = float(len(compiledregex["cm"]))\n+ possiblece = float(len(compiledregex["ce"]))\n+ cahits = currentIDHits["ca_hits"]\n+ cghits = currentIDHits["cg_hits"]\n+ cmhits = currentIDHits["cm_hits"]\n+ cehits = currentIDHits["ce_hits"]\n+ if cahits >= cghits and cahits >= cmhits and cahits >= cehits: #its a ca gene\n+ ca1hits = currentIDHits["ca1"]\n+ ca2hits = currentIDHits["ca2"]\n+ if ca1hits >= ca2hits:\n+ o.write(ID + "\\tIGA1\\t" + str(round_int(ca1hits / varsInCA * 100)) + "\\t" + str(round_int(cahits / possibleca * 100)) + "\\t" + start_location[ID + "_ca"] + "\\n")\n+ else:\n+ o.write(ID + "\\tIGA2\\t" + str(round_int(ca2hits / varsInCA * 100)) + "\\t" + str(round_int(cahits / possibleca * 100)) + "\\t" + start_location[ID + "_ca"] + "\\n")\n+ elif cghits >= cahits and cghits >= cmhits and cghits >= cehits: #its a cg gene\n+ cg1hits = currentIDHits["cg1"]\n+ cg2hits = currentIDHits["cg2"]\n+ cg3hits = currentIDHits["cg3"]\n+ cg4hits = currentIDHits["cg4"]\n+ if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= 
cg4hits: #cg1 gene\n+ o.write(ID + "\\tIGG1\\t" + str(round_int(cg1hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+ elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene\n+ o.write(ID + "\\tIGG2\\t" + str(round_int(cg2hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+ elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene\n+ o.write(ID + "\\tIGG3\\t" + str(round_int(cg3hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+ else: #cg4 gene\n+ o.write(ID + "\\tIGG4\\t" + str(round_int(cg4hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+ else: #its a cm or ce gene\n+ if cmhits >= cehits:\n+ o.write(ID + "\\tIGM\\t100\\t" + str(round_int(cmhits / possiblecm * 100)) + "\\t" + start_location[ID + "_cm"] + "\\n")\n+ else:\n+ o.write(ID + "\\tIGE\\t100\\t" + str(round_int(cehits / possiblece * 100)) + "\\t" + start_location[ID + "_ce"] + "\\n")\n+ seq_write_count += 1\n \n-print "Time: %i" % (int(time.time() * 1000) - starttime)\n+print("Time: %i" % (int(time.time() * 1000) - starttime))\n \n-print "Number of sequences written to file:", seq_write_count\n+print("Number of sequences written to file:", seq_write_count)\n \n \n \n'

diff -r a103134ee6e0 -r 729738462297 merge_and_filter.r
--- a/merge_and_filter.r Thu Feb 25 10:32:32 2021 +0000
+++ b/merge_and_filter.r Wed Sep 15 12:24:06 2021 +0000

[

b'@@ -1,304 +1,304 @@\n-args <- commandArgs(trailingOnly = TRUE)\r\n-\r\n-\r\n-summaryfile = args[1]\r\n-sequencesfile = args[2]\r\n-mutationanalysisfile = args[3]\r\n-mutationstatsfile = args[4]\r\n-hotspotsfile = args[5]\r\n-aafile = args[6]\r\n-gene_identification_file= args[7]\r\n-output = args[8]\r\n-before.unique.file = args[9]\r\n-unmatchedfile = args[10]\r\n-method=args[11]\r\n-functionality=args[12]\r\n-unique.type=args[13]\r\n-filter.unique=args[14]\r\n-filter.unique.count=as.numeric(args[15])\r\n-class.filter=args[16]\r\n-empty.region.filter=args[17]\r\n-\r\n-print(paste("filter.unique.count:", filter.unique.count))\r\n-\r\n-summ = read.table(summaryfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-sequences = read.table(sequencesfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-mutationstats = read.table(mutationstatsfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-hotspots = read.table(hotspotsfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-AAs = read.table(aafile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-gene_identification = read.table(gene_identification_file, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-\r\n-fix_column_names = function(df){\r\n- if("V.DOMAIN.Functionality" %in% names(df)){\r\n- names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"\r\n- print("found V.DOMAIN.Functionality, changed")\r\n- }\r\n- if("V.DOMAIN.Functionality.comment" %in% names(df)){\r\n- names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"\r\n- print("found V.DOMAIN.Functionality.comment, changed")\r\n- }\r\n- return(df)\r\n-}\r\n-\r\n-fix_non_unique_ids = function(df){\r\n-\tdf$Sequence.ID = paste(df$Sequence.ID, 1:nrow(df))\r\n-\treturn(df)\r\n-}\r\n-\r\n-summ = 
fix_column_names(summ)\r\n-sequences = fix_column_names(sequences)\r\n-mutationanalysis = fix_column_names(mutationanalysis)\r\n-mutationstats = fix_column_names(mutationstats)\r\n-hotspots = fix_column_names(hotspots)\r\n-AAs = fix_column_names(AAs)\r\n-\r\n-if(method == "blastn"){\r\n-\t#"qseqid\\tsseqid\\tpident\\tlength\\tmismatch\\tgapopen\\tqstart\\tqend\\tsstart\\tsend\\tevalue\\tbitscore"\r\n-\tgene_identification = gene_identification[!duplicated(gene_identification$qseqid),]\r\n-\tref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52))\r\n-\tgene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T)\r\n-\tgene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100\r\n-\tgene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")]\r\n-\tcolnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")\r\n-}\r\n-\r\n-#print("Summary analysis files columns")\r\n-#print(names(summ))\r\n-\r\n-\r\n-\r\n-input.sequence.count = nrow(summ)\r\n-print(paste("Number of sequences in summary file:", input.sequence.count))\r\n-\r\n-filtering.steps = data.frame(character(0), numeric(0))\r\n-\r\n-filtering.steps = rbind(filtering.steps, c("Input", input.sequence.count))\r\n-\r\n-filtering.steps[,1] = as.character(filtering.steps[,1])\r\n-filtering.steps[,2] = as.character(filtering.steps[,2])\r\n-#filtering.steps[,3] = as.numeric(filtering.steps[,3])\r\n-\r\n-#print("summary files columns")\r\n-#print(names(summ))\r\n-\r\n-summ = merge(summ, gene_identification, by="Sequence.ID")\r\n-\r\n-print(paste("Number of sequences after merging with gene identification:", nrow(summ)))\r\n-\r\n-summ = summ[summ$Functionality != "No results",]\r\n-\r\n-print(paste("Number of sequences after \'No results\' filter:", 
nrow(summ)))\r\n-\r\n-filtering.steps = rbind(filtering.steps, c("After \'No results\' filter", nrow(summ)))\r\n-\r\n-if(functionality == "productive"){\r\n-\tsumm = summ[summ$Functionality == "productive (see comment)" | summ$Functionali'..b'.def = paste(result$VGene, result$JGene, result$CDR3.IMGT.AA)\n+\t} else if(empty.region.filter == "leader"){\n+\t\tresult$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\n+\t} else if(empty.region.filter == "FR1"){\n+\t\tresult$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\n+\t} else if(empty.region.filter == "CDR1"){\n+\t\tresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\n+\t} else if(empty.region.filter == "FR2"){\n+\t\tresult$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\n+\t}\n+\t\n+\tif(grepl("remove", filter.unique)){\n+\t\tresult = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]\n+\t\tunique.defs = data.frame(table(result$unique.def))\n+\t\tunique.defs = unique.defs[unique.defs$Freq >= filter.unique.count,]\n+\t\tresult = result[result$unique.def %in% unique.defs$Var1,]\n+\t}\n+\n+\tif(filter.unique != "remove_vjaa"){\n+\t\tresult$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don\'t have a class after it\n+\t}\n+\n+\tresult = result[!duplicated(result$unique.def),]\n+}\n+\n+write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\\t", quote=F,row.names=F,col.names=T)\n+\n+filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))\n+\n+print(paste("Number of sequences in result after unique filtering:", 
nrow(result)))\n+\n+if(nrow(summ) == 0){\n+\tstop("No data remaining after filter")\n+}\n+\n+result$best_match_class = gsub(",.*", "", result$best_match) #gsub so the unmatched don\'t have a class after it\n+\n+#result$past = ""\n+#cls = unlist(strsplit(unique.type, ","))\n+#for (i in 1:nrow(result)){\n+#\tresult[i,"past"] = paste(result[i,cls], collapse=":")\n+#}\n+\n+\n+\n+result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))\n+\n+result.matched = result[!grepl("unmatched", result$best_match),]\n+result.unmatched = result[grepl("unmatched", result$best_match),]\n+\n+result = rbind(result.matched, result.unmatched)\n+\n+result = result[!(duplicated(result$past)), ]\n+\n+result = result[,!(names(result) %in% c("past", "best_match_class"))]\n+\n+print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result)))\n+\n+filtering.steps = rbind(filtering.steps, c("After remove duplicates based on filter", nrow(result)))\n+\n+unmatched = result[grepl("^unmatched", result$best_match),c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]\n+\n+print(paste("Number of rows in result:", nrow(result)))\n+print(paste("Number of rows in unmatched:", nrow(unmatched)))\n+\n+matched.sequences = result[!grepl("^unmatched", result$best_match),]\n+\n+write.table(x=matched.sequences, file=gsub("merged.txt$", "filtered.txt", output), sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+matched.sequences.count = nrow(matched.sequences)\n+unmatched.sequences.count = sum(grepl("^unmatched", result$best_match))\n+if(matched.sequences.count <= unmatched.sequences.count){\n+\tprint("WARNING NO MATCHED (SUB)CLASS SEQUENCES!!")\n+}\n+\n+filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))\n+filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))\n+filtering.steps[,2] = 
as.numeric(filtering.steps[,2])\n+filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2)\n+\n+write.table(x=filtering.steps, file=gsub("unmatched", "filtering_steps", unmatchedfile), sep="\\t",quote=F,row.names=F,col.names=F)\n+\n+write.table(x=result, file=output, sep="\\t",quote=F,row.names=F,col.names=T)\n+write.table(x=unmatched, file=unmatchedfile, sep="\\t",quote=F,row.names=F,col.names=T)\n'

diff -r a103134ee6e0 -r 729738462297 mutation_column_checker.py
--- a/mutation_column_checker.py Thu Feb 25 10:32:32 2021 +0000
+++ b/mutation_column_checker.py Wed Sep 15 12:24:06 2021 +0000

[

@@ -1,27 +1,27 @@
-import re
-
-mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
-
-with open("7_V-REGION-mutation-and-AA-change-table.txt", 'r') as file_handle:
- first = True
- fr3_index = -1
- for i, line in enumerate(file_handle):
- line_split = line.split("\t")
- if first:
- fr3_index = line_split.index("FR3-IMGT")
- first = False
- continue
-
- if len(line_split) < fr3_index:
- continue
-
- fr3_data = line_split[fr3_index]
- if len(fr3_data) > 5:
- try:
- test = [mutationMatcher.match(x).groups() for x in fr3_data.split("|") if x]
- except:
- print(line_split[1])
- print("Something went wrong at line {line} with:".format(line=line_split[0]))
- #print([x for x in fr3_data.split("|") if not mutationMatcher.match(x)])
- if i % 100000 == 0:
- print(i)
+import re
+
+mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
+
+with open("7_V-REGION-mutation-and-AA-change-table.txt", 'r') as file_handle:
+ first = True
+ fr3_index = -1
+ for i, line in enumerate(file_handle):
+ line_split = line.split("\t")
+ if first:
+ fr3_index = line_split.index("FR3-IMGT")
+ first = False
+ continue
+
+ if len(line_split) < fr3_index:
+ continue
+
+ fr3_data = line_split[fr3_index]
+ if len(fr3_data) > 5:
+ try:
+ test = [mutationMatcher.match(x).groups() for x in fr3_data.split("|") if x]
+ except:
+ print((line_split[1]))
+ print(("Something went wrong at line {line} with:".format(line=line_split[0])))
+ #print([x for x in fr3_data.split("|") if not mutationMatcher.match(x)])
+ if i % 100000 == 0:
+ print(i)

diff -r a103134ee6e0 -r 729738462297 shm_clonality.htm
--- a/shm_clonality.htm Thu Feb 25 10:32:32 2021 +0000
+++ b/shm_clonality.htm Wed Sep 15 12:24:06 2021 +0000

[

b'@@ -1,144 +1,144 @@\n-<html>\r\n-\r\n-<head>\r\n-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">\r\n-<meta name=Generator content="Microsoft Word 14 (filtered)">\r\n-<style>\r\n-\r\n-</style>\r\n-\r\n-</head>\r\n-\r\n-<body lang=EN-US link=blue vlink=purple>\r\n-\r\n-<div class=WordSection1>\r\n-\r\n-References\r\n-\r\n-Gupta,\r\n-Namita T. and Vander Heiden, Jason A. and Uduman, Mohamed and Gadala-Maria,\r\n-Daniel and Yaari, Gur and Kleinstein, Steven H. (2015). <a name="OLE_LINK106"></a><a\r\n-name="OLE_LINK107"></a>Change-O: a toolkit for analyzing large-scale B cell\r\n-immunoglobulin repertoire sequencing data: Table 1. In<span\r\n-class=apple-converted-space> Bioinformatics, 31 (20), pp.\r\n-3356\x963358. [<a\r\n-href="http://dx.doi.org/10.1093/bioinformatics/btv359" target="_blank"><span\r\n-lang=EN-GB style=\'color:#303030\'>doi:10.1093/bioinformatics/btv359</a><span\r\n-lang=EN-GB style=\'color:black\'>][<a\r\n-href="http://dx.doi.org/10.1093/bioinformatics/btv359" target="_blank"><span\r\n-lang=EN-GB style=\'color:#303030\'>Link</a>]\r\n-\r\n- \r\n-\r\n-<a name="OLE_LINK110">All, IGA, IGG, IGM and IGE tabs</a>\r\n-\r\n-In\r\n-these tabs information on the clonal relation of transcripts can be found. To\r\n-calculate clonal relation Change-O is used (Gupta et'..b'ckground:white\'>Gupta,\n+Namita T. and Vander Heiden, Jason A. and Uduman, Mohamed and Gadala-Maria,\n+Daniel and Yaari, Gur and Kleinstein, Steven H. (2015). <a name="OLE_LINK106"></a><a\n+name="OLE_LINK107"></a>Change-O: a toolkit for analyzing large-scale B cell\n+immunoglobulin repertoire sequencing data: Table 1. In<span\n+class=apple-converted-space> Bioinformatics, 31 (20), pp.\n+3356\x963358. 
[<a\n+href="http://dx.doi.org/10.1093/bioinformatics/btv359" target="_blank"><span\n+lang=EN-GB style=\'color:#303030\'>doi:10.1093/bioinformatics/btv359</a><span\n+lang=EN-GB style=\'color:black\'>][<a\n+href="http://dx.doi.org/10.1093/bioinformatics/btv359" target="_blank"><span\n+lang=EN-GB style=\'color:#303030\'>Link</a>]\n+\n+ \n+\n+<a name="OLE_LINK110">All, IGA, IGG, IGM and IGE tabs</a>\n+\n+In\n+these tabs information on the clonal relation of transcripts can be found. To\n+calculate clonal relation Change-O is used (Gupta et al, PMID: 26069265).\n+Transcripts are considered clonally related if they have maximal three nucleotides\n+difference in their CDR3 sequence and the same first V segment (as assigned by\n+IMGT). Results are represented in a table format showing the clone size and the\n+number of clones or sequences with this clone size. Change-O settings used are\n+the nucleotide hamming distance substitution model with\n+a complete distance of maximal three. For clonal assignment the first gene\n+segments were used, and the distances were not normalized. In case of\n+asymmetric distances, the minimal distance was used. \n+\n+ \n+\n+Overlap\n+tab \n+\n+This\n+tab gives information on with which (sub)classe(s) each unique analyzed region\n+(based on the exact nucleotide sequence of the analyzes region and the CDR3\n+nucleotide sequence) is found with. 
This gives information if the combination\n+of the exact same nucleotide sequence of the analyzed region and the CDR3\n+sequence can be found in multiple (sub)classes.\n+\n+<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA8AAAAPCAYAAAA71pVKAAAAzElEQVQoka2TwQ2CQBBFpwTshw4ImW8ogJMlUIMmhNCDxgasAi50oSXA8XlAjCG7aqKTzGX/vsnM31mzR0gk7tTudO5MEizpzvQ4ryUSe408J3Xn+grE0p1rnpOamVmWsZG4rS+dzzAMsN8Hi9yyjI1JNGtxu4VxBJgLRLpoTKIPiW0LlwtUVRTubW2OBGUJu92cZRmdfbKQMAw8o+vi5v0fLorZ7Y9waGYJjsf38DJz0O1PsEQffOcv4Sa6YYfDDJ5Obzbsp93+5VfdATueO1fdLdI0AAAAAElFTkSuQmCC"> Please note that this tab is based on all\n+sequences before filter unique sequences and the remove duplicates based on\n+filters are applied. In this table only sequences occuring more than once are\n+included. \n+\n+</div>\n+\n+</body>\n+\n+</html>\n'

diff -r a103134ee6e0 -r 729738462297 shm_csr.htm
--- a/shm_csr.htm Thu Feb 25 10:32:32 2021 +0000
+++ b/shm_csr.htm Wed Sep 15 12:24:06 2021 +0000

@@ -1,95 +1,95 @@
-<html>
-
-<head>
-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
-<meta name=Generator content="Microsoft Word 14 (filtered)">
-<style>
-
-</style>
-
-</head>
-
-<body lang=EN-US link=blue vlink=purple>
-
-<div class=WordSection1>
-
-The
-graphs in this tab give insight into the subclass distribution of IGG and IGA
-transcripts. Human Cµ, Cα, Cγ and Cε
-constant genes are assigned using a custom script
-specifically designed for human (sub)class assignment in repertoire data as
-described in van Schouwenburg and IJspeert et al, submitted for publication. In
-this script the reference sequences for the subclasses are divided in 8
-nucleotide chunks which overlap by 4 nucleotides. These overlapping chunks are
-then individually aligned in the right order to each input sequence. The
-percentage of the chunks identified in each rearrangement is calculated in the
-“chunk hit percentage”. Cα and Cγ
-subclasses are very homologous and only differ in a few nucleotides. To assign
-subclasses the “nt hit percentage” is calculated.
-This percentage indicates how well the chunks covering the subclass specific
-nucleotide match with the different subclasses. Information
-on normal distribution of subclasses in healthy individuals of different ages
-can be found in IJspeert and van Schouwenburg et al, PMID: 27799928.
-
-<a name="OLE_LINK100"></a><a
-name="OLE_LINK99"></a><a name="OLE_LINK25">IGA
-subclass distribution</a>
-
-Pie
-chart showing the relative distribution of IGA1 and IGA2 transcripts in the
-sample.
-
-IGG
-subclass distribution
-
-Pie
-chart showing the relative distribution of IGG1, IGG2, IGG3 and IGG4
-transcripts in the sample.
-
-</div>
-
-</body>
-
-</html>
+<html>
+
+<head>
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
+<meta name=Generator content="Microsoft Word 14 (filtered)">
+<style>
+
+</style>
+
+</head>
+
+<body lang=EN-US link=blue vlink=purple>
+
+<div class=WordSection1>
+
+The
+graphs in this tab give insight into the subclass distribution of IGG and IGA
+transcripts. Human Cµ, Cα, Cγ and Cε
+constant genes are assigned using a custom script
+specifically designed for human (sub)class assignment in repertoire data as
+described in van Schouwenburg and IJspeert et al, submitted for publication. In
+this script the reference sequences for the subclasses are divided in 8
+nucleotide chunks which overlap by 4 nucleotides. These overlapping chunks are
+then individually aligned in the right order to each input sequence. The
+percentage of the chunks identified in each rearrangement is calculated in the
+“chunk hit percentage”. Cα and Cγ
+subclasses are very homologous and only differ in a few nucleotides. To assign
+subclasses the “nt hit percentage” is calculated.
+This percentage indicates how well the chunks covering the subclass specific
+nucleotide match with the different subclasses. Information
+on normal distribution of subclasses in healthy individuals of different ages
+can be found in IJspeert and van Schouwenburg et al, PMID: 27799928.
+
+<a name="OLE_LINK100"></a><a
+name="OLE_LINK99"></a><a name="OLE_LINK25">IGA
+subclass distribution</a>
+
+Pie
+chart showing the relative distribution of IGA1 and IGA2 transcripts in the
+sample.
+
+IGG
+subclass distribution
+
+Pie
+chart showing the relative distribution of IGG1, IGG2, IGG3 and IGG4
+transcripts in the sample.
+
+</div>
+
+</body>
+
+</html>

diff -r a103134ee6e0 -r 729738462297 shm_csr.py
--- a/shm_csr.py Thu Feb 25 10:32:32 2021 +0000
+++ b/shm_csr.py Wed Sep 15 12:24:06 2021 +0000

[

@@ -26,7 +26,7 @@
mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?")
mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
- mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
+ mutationMatcher = re.compile(r"^([nactg])(\d+).([nactg]),?[ ]?([A-Z*])?(\d+)?[>]?([A-Z*;])?(.*)?")
NAMatchResult = (None, None, None, None, None, None, '')
geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
linecount = 0
@@ -59,7 +59,7 @@
tandem_sum_by_class = defaultdict(int)
expected_tandem_sum_by_class = defaultdict(float)

- with open(infile, 'ru') as i:
+ with open(infile, 'r') as i:
for line in i:
if first:
linesplt = line.split("\t")
@@ -130,10 +130,10 @@
fr3LengthDict[ID] = fr3Length

IDlist += [ID]
- print "len(mutationdic) =", len(mutationdic)
+ print("len(mutationdic) =", len(mutationdic))

with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle:
- for ID, lst in mutationdic.iteritems():
+ for ID, lst in mutationdic.items():
for mut in lst:
out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut])))

@@ -230,7 +230,7 @@

tandem_freq_file = os.path.join(os.path.dirname(outfile), "tandem_frequency.txt")
with open(tandem_freq_file, 'w') as o:
- for frq in sorted([int(x) for x in tandem_frequency.keys()]):
+ for frq in sorted([int(x) for x in list(tandem_frequency.keys())]):
o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)]))

tandem_row = []
@@ -256,11 +256,11 @@
AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]}
AA_mutation_empty = AA_mutation[:]

- print "AALength:", AALength
+ print("AALength:", AALength)
aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt"
with open(aa_mutations_by_id_file, 'w') as o:
o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
- for ID in mutationListByID.keys():
+ for ID in list(mutationListByID.keys()):
AA_mutation_for_ID = AA_mutation_empty[:]
for mutation in mutationListByID[ID]:
if mutation[4] and mutation[5] != ";":
@@ -269,8 +269,8 @@
AA_mutation[AA_mutation_position] += 1
AA_mutation_for_ID[AA_mutation_position] += 1
except Exception as e:
- print e
- print mutation
+ print(e)
+ print(mutation)
sys.exit()
clss = genedic[ID][:3]
AA_mutation_dic[clss][AA_mutation_position] += 1
@@ -280,32 +280,32 @@

#absent AA stuff
absentAACDR1Dic = defaultdict(list)
- absentAACDR1Dic[5] = range(29,36)
- absentAACDR1Dic[6] = range(29,35)
- absentAACDR1Dic[7] = range(30,35)
- absentAACDR1Dic[8] = range(30,34)
- absentAACDR1Dic[9] = range(31,34)
- absentAACDR1Dic[10] = range(31,33)
+ absentAACDR1Dic[5] = list(range(29,36))
+ absentAACDR1Dic[6] = list(range(29,35))
+ absentAACDR1Dic[7] = list(range(30,35))
+ absentAACDR1Dic[8] = list(range(30,34))
+ absentAACDR1Dic[9] = list(range(31,34))
+ absentAACDR1Dic[10] = list(range(31,33))
absentAACDR1Dic[11] = [32]

absentAACDR2Dic = defaultdict(list)
- absentAACDR2Dic[0] = range(55,65)
- absentAACDR2Dic[1] = range(56,65)
- absentAACDR2Dic[2] = range(56,64)
- absentAACDR2Dic[3] = range(57,64)
- absentAACDR2Dic[4] = range(57,63)
- absentAACDR2Dic[5] = range(58,63)
- absentAACDR2Dic[6] = range(58,62)
- absentAACDR2Dic[7] = range(59,62)
- absentAACDR2Dic[8] = range(59,61)
+ absentAACDR2Dic[0] = list(range(55,65))
+ absentAACDR2Dic[1] = list(range(56,65))
+ absentAACDR2Dic[2] = list(range(56,64))
+ absentAACDR2Dic[3] = list(range(57,64))
+ absentAACDR2Dic[4] = list(range(57,63))
+ absentAACDR2Dic[5] = list(range(58,63))
+ absentAACDR2Dic[6] = list(range(58,62))
+ absentAACDR2Dic[7] = list(range(59,62))
+ absentAACDR2Dic[8] = list(range(59,61))
absentAACDR2Dic[9] = [60]

absentAA = [len(IDlist)] * (AALength-1)
- for k, cdr1Length in cdr1LengthDic.iteritems():
+ for k, cdr1Length in cdr1LengthDic.items():
for c in absentAACDR1Dic[cdr1Length]:
absentAA[c] -= 1

- for k, cdr2Length in cdr2LengthDic.iteritems():
+ for k, cdr2Length in cdr2LengthDic.items():
for c in absentAACDR2Dic[cdr2Length]:
absentAA[c] -= 1

@@ -325,14 +325,12 @@
o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n")

if linecount == 0:
- print "No data, exiting"
+ print("No data, exiting")
with open(outfile, 'w') as o:
o.write("RGYW (%)," + ("0,0,0\n" * len(genes)))
o.write("WRCY (%)," + ("0,0,0\n" * len(genes)))
o.write("WA (%)," + ("0,0,0\n" * len(genes)))
o.write("TW (%)," + ("0,0,0\n" * len(genes)))
- import sys
-
sys.exit()

hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)$(.*)$")
@@ -347,7 +345,7 @@
aggctatIndex = 0
atagcctIndex = 0
first = True
- with open(infile, 'ru') as i:
+ with open(infile, 'r') as i:
for line in i:
if first:
linesplt = line.split("\t")
@@ -412,7 +410,7 @@
motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW}
for mutation in mutationList:
frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
- for motif in motif_dic.keys():
+ for motif in list(motif_dic.keys()):

for start, end, region in motif_dic[motif]:
if start <= int(where) <= end:
@@ -460,7 +458,7 @@
value = 0
valuedic = dict()

- for fname in funcs.keys():
+ for fname in list(funcs.keys()):
for gene in genes:
with open(directory + gene + "_" + fname + "_value.txt", 'r') as v:
valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())
@@ -477,7 +475,7 @@
dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
arr = ["RGYW", "WRCY", "WA", "TW"]

- for fname in funcs.keys():
+ for fname in list(funcs.keys()):
func = funcs[fname]
foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
with open(foutfile, 'w') as o:
@@ -489,9 +487,9 @@
if valuedic[gene + "_" + fname] is 0:
o.write(",0,0,0")
else:
- x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
+ x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.items() if geneMatcher.match(z)]], gene, func, fname)
o.write("," + x + "," + y + "," + z)
- x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)
+ x, y, z = get_xyz([y for x, y in curr.items() if not genedic[x].startswith("unmatched")], "total", func, fname)
#x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
o.write("," + x + "," + y + "," + z + "\n")

diff -r a103134ee6e0 -r 729738462297 shm_csr.xml
--- a/shm_csr.xml Thu Feb 25 10:32:32 2021 +0000
+++ b/shm_csr.xml Wed Sep 15 12:24:06 2021 +0000

b'@@ -1,240 +1,247 @@\n-<tool id="shm_csr" name="SHM & CSR pipeline" version="1.0">\r\n-\t<description></description>\r\n-\t<requirements>\r\n-\t\t<requirement type="package" version="2.7">python</requirement>\r\n-\t\t<requirement type="package" version="1.16.0">numpy</requirement>\r\n-\t\t<requirement type="package" version="1.2.0">xlrd</requirement>\r\n-\t\t<requirement type="package" version="3.0.0">r-ggplot2</requirement>\r\n-\t\t<requirement type="package" version="1.4.3">r-reshape2</requirement>\r\n-\t\t<requirement type="package" version="0.5.0">r-scales</requirement>\r\n-\t\t<requirement type="package" version="3.4_5">r-seqinr</requirement>\r\n-\t\t<requirement type="package" version="1.11.4">r-data.table</requirement>\r\n-\t</requirements>\r\n-\t<command interpreter="bash">\r\n-\t\t#if str ( $filter_unique.filter_unique_select ) == "remove":\r\n-\t\t\twrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select $filter_unique.filter_unique_clone_count $class_filter_cond.class_filter $empty_region_filter $fast\r\n-\t\t#else:\r\n-\t\t\twrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select 2 $class_filter_cond.class_filter $empty_region_filter $fast\r\n-\t\t#end if\r\n-\t</command>\r\n-\t<inputs>\r\n-\t\t<param name="in_file" type="data" format="data" label="IMGT zip file to be analysed" />\r\n-\t\t<param name="empty_region_filter" type="select" label="Sequence starts at" help="" >\r\n-\t\t\t<option value="leader" selected="true">Leader: include FR1, CDR1, FR2, CDR2, FR3 in filters</option>\r\n-\t\t\t<option value="FR1" selected="true">FR1: include CDR1,FR2,CDR2,FR3 
in filters</option>\r\n-\t\t\t<option value="CDR1">CDR1: include FR2,CDR2,FR3 in filters</option>\r\n-\t\t\t<option value="FR2">FR2: include CDR2,FR3 in filters</option>\r\n-\t\t</param>\r\n-\t\t<param name="functionality" type="select" label="Functionality filter" help="" >\r\n-\t\t\t<option value="productive" selected="true">Productive (Productive and Productive see comment)</option>\r\n-\t\t\t<option value="unproductive">Unproductive (Unproductive and Unproductive see comment)</option>\r\n-\t\t\t<option value="remove_unknown">Productive and Unproductive (Productive, Productive see comment, Unproductive, Unproductive and Unproductive see comment)</option>\r\n-\t\t</param>\r\n-\t\t<conditional name="filter_unique">\r\n-\t\t\t<param name="filter_unique_select" type="select" label="Filter unique sequences" help="See below for an example.">\r\n-\t\t\t\t<option value="remove" selected="true">Remove uniques (Based on nucleotide sequence + C)</option>\r\n-\t\t\t\t<option value="remove_vjaa">Remove uniques (Based on V+J+CDR3 (AA))</option>\r\n-\t\t\t\t<option value="keep">Keep uniques (Based on nucleotide sequence + C)</option>\r\n-\t\t\t\t<option value="no">No</option>\r\n-\t\t\t</param>\r\n-\t\t\t<when value="remove">\r\n-\t\t\t\t<param name="filter_unique_clone_count" size="4" type="integer" label="How many sequences should be in a group to keep 1 of them" value="2" min="2"/>\r\n-\t\t\t</when>\r\n-\t\t\t<when value="keep"></when>\r\n-\t\t\t<when value="no"></when>\r\n-\t\t</conditional>\r\n-\t\t<param name="unique" type="select" label="Remove duplicates based on" help="" >\r\n-\t\t\t<option value="VGene,CDR3.IMGT.AA,best_match_class">Top.V.Gene, CDR3 (AA), C region</option>\r\n-\t\t\t<option value="VGene,CDR3.IMGT.AA">Top.V.Gene, CDR3 (AA)</option>\r\n-\t\t\t<option value="CDR3.IMGT.AA,best_match_class">CDR3 (AA), C region</option>\r\n-\t\t\t<option value="CDR3.IMGT.AA">CDR3 (AA)</option>\r\n-\t\t\t\r\n-\t\t\t<option 
value="VGene,CDR3.IMGT.seq,best_match_class">Top.V.Gene, CDR3 (nt), C region</option>\r\n-\t\t\t<option value="VGene,CDR3.IMGT.seq">Top.V.Gene, CDR3 (nt)</option>\r\n-\t\t\t<option value="CDR3.IMGT.seq,best_match_class">CDR3 (nt), C region</option>\r\n-\t\t\t<option value="CDR3.IMGT.seq">CDR3 (n'..b'me (sub)class).\n+\n+Example of the sequences that are included using either the \xe2\x80\x9cremove unique filter\xe2\x80\x9d or the \xe2\x80\x9ckeep unique filter\xe2\x80\x9d\n+\n++--------------------------+\n+| unique filter |\n++--------+--------+--------+\n+| values | remove | keep |\n++--------+--------+--------+\n+| A | A | A |\n++--------+--------+--------+\n+| A | B | B |\n++--------+--------+--------+\n+| B | D | C |\n++--------+--------+--------+\n+| B | | D |\n++--------+--------+--------+\n+| C | | |\n++--------+--------+--------+\n+| D | | |\n++--------+--------+--------+\n+| D | | |\n++--------+--------+--------+\n+\n+-----\n+ \n+**Remove duplicates based on**\n+\n+Allows the selection of a single sequence per clone. Different definitions of a clone can be chosen. \n+\n+.. class:: infomark\n+\n+Note: The first sequence (in the data set) of each clone is always included in the analysis. When the first matched sequence is unmatched (no subclass assigned) the first matched sequence will be included. This means that altering the data order (by for instance sorting) can change the sequence which is included in the analysis and therefore slightly influences the results. \n+\n+-----\n+\n+**Human Class/Subclass filter**\n+\n+.. class:: warningmark\n+\n+Note: This filter should only be applied when analysing human IGH data in which a (sub)class specific sequence is present. Otherwise please select the do not assign (sub)class option to prevent errors when running the pipeline. \n+\n+The class percentage is based on the \xe2\x80\x98chunk hit percentage\xe2\x80\x99 (see below). 
The subclass percentage is based on the \xe2\x80\x98nt hit percentage\xe2\x80\x99 (see below).\n+\n+The SHM & CSR pipeline identifies human C\xc2\xb5, C\xce\xb1, C\xce\xb3 and C\xce\xb5 constant genes by dividing the reference sequences for the subclasses (NG_001019) in 8 nucleotide chunks which overlap by 4 nucleotides. These overlapping chunks are then individually aligned in the right order to each input sequence. This alignment is used to calculate the chunck hit percentage and the nt hit percentage. \n+\n+*Chunk hit percentage*: The percentage of the chunks that is aligned \n+\n+*Nt hit percentage*: The percentage of chunks covering the subclass specific nucleotide match with the different subclasses. The most stringent filter for the subclass is 70% \xe2\x80\x98nt hit percentage\xe2\x80\x99 which means that 5 out of 7 subclass specific nucleotides for C\xce\xb1 or 6 out of 8 subclass specific nucleotides of C\xce\xb3 should match with the specific subclass. \n+The option \xe2\x80\x9c>25% class\xe2\x80\x9d can be chosen when you only are interested in the class (C\xce\xb1/C\xce\xb3/C\xc2\xb5/C\xc9\x9b) of your sequences and the length of your sequence is not long enough to assign the subclasses.\n+\n+-----\n+\n+**Output new IMGT archives per class into your history?**\n+\n+If yes is selected, additional output files (one for each class) will be added to the history which contain information of the sequences that passed the selected filtering criteria. These files are in the same format as the IMGT/HighV-QUEST output files and therefore are also compatible with many other analysis programs, such as the Immune repertoire pipeline. \n+\n+-----\n+\n+**Execute**\n+\n+Upon pressing execute a new analysis is added to your history (right side of the page). Initially this analysis will be grey, after initiating the analysis colour of the analysis in the history will change to yellow. When the analysis is finished it will turn green in the history. 
Now the analysis can be opened by clicking on the eye icon on the analysis of interest. When an analysis turns red an error has occurred when running the analysis. If you click on the analysis title additional information can be found on the analysis. In addition a bug icon appears. Here more information on the error can be found.\n+\n+]]>\n+\t</help>\n+\t<citations>\n+\t\t<citation type="doi">10.1093/nar/gks457</citation>\n+\t\t<citation type="doi">10.1093/bioinformatics/btv359</citation>\n+\t</citations>\n+</tool>\n'

diff -r a103134ee6e0 -r 729738462297 shm_downloads.htm
--- a/shm_downloads.htm Thu Feb 25 10:32:32 2021 +0000
+++ b/shm_downloads.htm Wed Sep 15 12:24:06 2021 +0000

b'@@ -1,538 +1,538 @@\n-<html>\r\n-\r\n-<head>\r\n-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">\r\n-<meta name=Generator content="Microsoft Word 14 (filtered)">\r\n-<style>\r\n-\r\n-</style>\r\n-\r\n-</head>\r\n-\r\n-<body lang=EN-US link=blue vlink=purple>\r\n-\r\n-<div class=WordSection1>\r\n-\r\n-Info\r\n-\r\n-The complete\r\n-dataset:\r\n-Allows downloading of the complete parsed data set.\r\n-\r\n-The filtered\r\n-dataset:\r\n-Allows downloading of all parsed IMGT information of all transcripts that\r\n-passed the chosen filter settings.\r\n-\r\n-The alignment\r\n-info on the unmatched sequences: Provides information of the subclass\r\n-alignment of all unmatched sequences. For each sequence the chunck hit\r\n-percentage and the nt hit percentage is shown together with the best matched\r\n-subclass.\r\n-\r\n-SHM Overview\r\n-\r\n-The SHM Overview\r\n-table as a dataset: Allows downloading of the SHM Overview\r\n-table as a data set.\xa0 \r\n-\r\n-Motif data per\r\n-sequence ID: Provides a file that contains information for each\r\n-transcript on the number of mutations present in WA/TW and RGYW/WRCY motives.\r\n-\r\n-Mutation data\r\n-per sequence ID: Provides a file containing information\r\n-on the number of sequences bases, the number and location of mutations and the\r\n-type of mutations found in each transcript. 
\r\n-\r\n-An IMGT archive\n+with just the matched and filtered IGA1 sequences: Downloads a\n+.txz file with the same format as downloaded IMGT files that contains all IGA1\n+sequences that have passed the chosen filter settings.\n+\n+An IMGT archive\n+with just the matched and filtered IGA2 sequences: Downloads a .txz\n+file with the same format as downloaded IMGT files that contains all IGA2\n+sequences that have passed the chosen filter settings.\n+\n+An IMGT archive\n+with just the matched and filtered IGG sequences: Downloads a .txz\n+file with the same format as downloaded IMGT files that contains all IGG\n+sequences that have passed the chosen filter settings.\n+\n+An IMGT archive\n+with just the matched and filtered IGG1 sequences: Downloads a\n+.txz file with the same format as downloaded IMGT files that contains all IGG1\n+sequences that have passed the chosen filter settings.\n+\n+An IMGT archive\n+with just the matched and filtered IGG2 sequences: Downloads a\n+.txz file with the same format as downloaded IMGT files that contains all IGG2\n+sequences that have passed the chosen filter settings.\n+\n+An IMGT archive\n+with just the matched and filtered IGG3 sequences: Downloads a .txz\n+file with the same format as downloaded IMGT files that contains all IGG3\n+sequences that have passed the chosen filter settings.\n+\n+An IMGT archive\n+with just the matched and filtered IGG4 sequences: Downloads a\n+.txz file with the same format as downloaded IMGT files that contains all IGG4\n+sequences that have passed the chosen filter settings.\n+\n+An IMGT archive\n+with just the matched and filtered IGM sequences: Downloads a .txz\n+file with the same format as downloaded IMGT files that contains all IGM\n+sequences that have passed the chosen filter settings.\n+\n+An IMGT archive\n+with just the matched and filtered IGE sequences: Downloads a\n+.txz file with the same format as downloaded IMGT files that contains all IGE\n+sequences that have passed the chosen 
filter settings.\n+\n+</div>\n+\n+</body>\n+\n+</html>\n'

diff -r a103134ee6e0 -r 729738462297 shm_first.htm
--- a/shm_first.htm Thu Feb 25 10:32:32 2021 +0000
+++ b/shm_first.htm Wed Sep 15 12:24:06 2021 +0000

b'@@ -1,127 +1,127 @@\n-<html>\r\n-\r\n-<head>\r\n-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">\r\n-<meta name=Generator content="Microsoft Word 14 (filtered)">\r\n-<style>\r\n-\r\n-</style>\r\n-\r\n-</head>\r\n-\r\n-<body lang=EN-US>\r\n-\r\n-<div class=WordSection1>\r\n-\r\n-Table showing the order of each\r\n-filtering step and the number and percentage of sequences after each filtering\r\n-step. \r\n-\r\n-Input:<span\r\n-lang=EN-GB style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> The\r\n-number of sequences in the original IMGT file. This is always 100% of the\r\n-sequences.\r\n-\r\n-After "no results" filter: <span\r\n-lang=EN-GB style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>IMGT\r\n-classifies sequences either as "productive", "unproductive", "unknown", or "no\r\n-results". Here, the number and percentages of sequences that are not classified\r\n-as "no results" are reported.\r\n-\r\n-After functionality filter:<span\r\n-lang=EN-GB style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> The\r\n-number and percentages of sequences that have passed the functionality filter. The\r\n-filtering performed is dependent on the settings of the functionality filter.\r\n-Details on the functionality filter <a name="OLE_LINK12"></a><a\r\n-name="OLE_LINK11"></a><a name="OLE_LINK10">can be found on the start page of\r\n-the SHM&CSR pipeline</a>.\r\n-\r\n-After\r\n-removal sequences that are missing a gene region:\r\n-In this step all sequences that are missing a gene region (FR1, CDR1, FR2,\r\n-CDR2, FR3) that should be present are removed from analysis. The sequence\r\n-regions that should be present are dependent on the settings of the sequence\r\n-starts at filter. 
<a name="OLE_LINK9"></a><a name="OLE_LINK8">The number and\r\n-percentage of sequences that pass this filter step are reported.</a> \r\n-\r\n-After\r\n-N filter: In this step all sequences that contain\r\n-an ambiguous base (n) in the analysed r'..b'>After functionality filter:<span\n+lang=EN-GB style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> The\n+number and percentages of sequences that have passed the functionality filter. The\n+filtering performed is dependent on the settings of the functionality filter.\n+Details on the functionality filter <a name="OLE_LINK12"></a><a\n+name="OLE_LINK11"></a><a name="OLE_LINK10">can be found on the start page of\n+the SHM&CSR pipeline</a>.\n+\n+After\n+removal sequences that are missing a gene region:\n+In this step all sequences that are missing a gene region (FR1, CDR1, FR2,\n+CDR2, FR3) that should be present are removed from analysis. The sequence\n+regions that should be present are dependent on the settings of the sequence\n+starts at filter. <a name="OLE_LINK9"></a><a name="OLE_LINK8">The number and\n+percentage of sequences that pass this filter step are reported.</a> \n+\n+After\n+N filter: In this step all sequences that contain\n+an ambiguous base (n) in the analysed region or the CDR3 are removed from the\n+analysis. The analysed region is determined by the setting of the sequence\n+starts at filter. The number and percentage of sequences that pass this filter\n+step are reported.\n+\n+After\n+filter unique sequences: The number and\n+percentage of sequences that pass the "filter unique sequences" filter. Details\n+on this filter can be found on the start page of\n+the SHM&CSR pipeline\n+\n+After\n+remove duplicate based on filter: The number and\n+percentage of sequences that passed the remove duplicate filter. 
Details on the\n+"remove duplicate filter based on filter" can be found on the start page of the\n+SHM&CSR pipeline.\n+\n+<a name="OLE_LINK17"></a><a\n+name="OLE_LINK16">Number of matches sequences:</a><span\n+lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>\n+The number and percentage of sequences that passed all the filters described\n+above and have a (sub)class assigned.\n+\n+Number\n+of unmatched sequences: The number and percentage\n+of sequences that passed all the filters described above and do not have\n+subclass assigned.\n+\n+ \n+\n+</div>\n+\n+</body>\n+\n+</html>\n'

diff -r a103134ee6e0 -r 729738462297 shm_frequency.htm
--- a/shm_frequency.htm Thu Feb 25 10:32:32 2021 +0000
+++ b/shm_frequency.htm Wed Sep 15 12:24:06 2021 +0000

[

@@ -1,87 +1,87 @@
-<html>
-
-<head>
-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
-<meta name=Generator content="Microsoft Word 14 (filtered)">
-<style>
-
-</style>
-
-</head>
-
-<body lang=EN-US>
-
-<div class=WordSection1>
-
-SHM
-frequency tab
-
-Graphs
-
-These
-graphs give insight into the level of SHM. The data represented in these graphs
-can be downloaded in the download tab. <a name="OLE_LINK24"></a><a
-name="OLE_LINK23"></a><a name="OLE_LINK90"></a><a name="OLE_LINK89">More
-information on the values found in healthy individuals of different ages can be
-found in IJspeert and van Schouwenburg et al, PMID: 27799928. </a>
-
-Frequency
-scatter plot
-
-A
-dot plot showing the percentage of SHM in each transcript divided into the
-different (sub)classes. In the graph each dot
-represents an individual transcript.
-
-Mutation
-frequency by class
-
-A
-bar graph showing the percentage of transcripts that contain 0%, 0-2%, 2-5%,
-5-10% 10-15%, 15-20% or more than 20% SHM for each subclass. 
-
-Hanna IJspeert, Pauline A. van
-Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,
-Andrew P. Stubbs, and Mirjam van der Burg (2016). Evaluation
-of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and
-Adults. In Frontiers in Immunolog, 7, pp. e410-410. [<a
-href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/">doi:10.3389/fimmu.2016.00410</a>][<a
-href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/">Link</a>]
-
-</div>
-
-</body>
-
-</html>
+<html>
+
+<head>
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
+<meta name=Generator content="Microsoft Word 14 (filtered)">
+<style>
+
+</style>
+
+</head>
+
+<body lang=EN-US>
+
+<div class=WordSection1>
+
+SHM
+frequency tab
+
+Graphs
+
+These
+graphs give insight into the level of SHM. The data represented in these graphs
+can be downloaded in the download tab. <a name="OLE_LINK24"></a><a
+name="OLE_LINK23"></a><a name="OLE_LINK90"></a><a name="OLE_LINK89">More
+information on the values found in healthy individuals of different ages can be
+found in IJspeert and van Schouwenburg et al, PMID: 27799928. </a>
+
+Frequency
+scatter plot
+
+A
+dot plot showing the percentage of SHM in each transcript divided into the
+different (sub)classes. In the graph each dot
+represents an individual transcript.
+
+Mutation
+frequency by class
+
+A
+bar graph showing the percentage of transcripts that contain 0%, 0-2%, 2-5%,
+5-10% 10-15%, 15-20% or more than 20% SHM for each subclass. 
+
+Hanna IJspeert, Pauline A. van
+Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,
+Andrew P. Stubbs, and Mirjam van der Burg (2016). Evaluation
+of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and
+Adults. In Frontiers in Immunolog, 7, pp. e410-410. [<a
+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/">doi:10.3389/fimmu.2016.00410</a>][<a
+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/">Link</a>]
+
+</div>
+
+</body>
+
+</html>

diff -r a103134ee6e0 -r 729738462297 shm_overview.htm
--- a/shm_overview.htm Thu Feb 25 10:32:32 2021 +0000
+++ b/shm_overview.htm Wed Sep 15 12:24:06 2021 +0000

[

b'@@ -1,332 +1,332 @@\n-<html>\r\n-\r\n-<head>\r\n-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">\r\n-<meta name=Generator content="Microsoft Word 14 (filtered)">\r\n-<style>\r\n-\r\n-</style>\r\n-\r\n-</head>\r\n-\r\n-<body lang=EN-US>\r\n-\r\n-<div class=WordSection1>\r\n-\r\n-Info\r\n-table\r\n-\r\n-This\r\n-table contains information on different characteristics of SHM. For all\r\n-characteristics information can be found for all sequences or only sequences of\r\n-a certain (sub)class. All results are based on the sequences that passed the filter\r\n-settings chosen on the start page of the SHM & CSR pipeline and only\r\n-include details on the analysed region as determined by the setting of the\r\n-sequence starts at filter. All data in this table can be downloaded via the\r\n-\x93downloads\x94 tab.\r\n-\r\n-Mutation\r\n-frequency:\r\n-\r\n-<a name="OLE_LINK83"></a><a\r\n-name="OLE_LINK82"></a><a name="OLE_LINK81">These values\r\n-give information on the level of SHM. 
</a><a name="OLE_LINK22"></a><a\r\n-name="OLE_LINK21"></a><a name="OLE_LINK20">More information\r\n-on the values found in healthy individuals of different ages can be found in </a><a\r\n-name="OLE_LINK15"></a><a name="OLE_LINK14"></a><a name="OLE_LINK13"><span\r\n-lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>IJspeert\r\n-and van Schouwenburg et al, PMID: 27799928</a>\r\n-\r\n-Number\r\n-of mutations: Shows the number of total\r\n-mutations / the number of sequenced bases (the % of mutated bases).\r\n-\r\n-Median\r\n-number of mutations: Shows the median % of\r\n-SHM of all sequences.\r\n-\r\n-Patterns\r\n-of SHM:\r\n-\r\n-<a name="OLE_LINK72"></a><a\r\n-name="OLE_LINK71"></a><a name="OLE_LINK70">These values\r\n-give insights into the targeting and patterns of '..b'-size:12.0pt;line-height:115%;\n+font-family:"Times New Roman","serif"\'>Shows the total number of sequenced <a\n+name="OLE_LINK59"></a><a name="OLE_LINK58">guanine</a>s / The total number of\n+sequenced bases (the percentage of sequenced bases that were guanines).\n+\n+<a name="OLE_LINK69"><span\n+lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Graphs</a>\n+\n+<a name="OLE_LINK75"></a><a\n+name="OLE_LINK74"></a><a name="OLE_LINK73">These graphs visualize\n+information on the patterns and targeting of SHM and thereby give information\n+into the repair pathways used to repair the U:G mismatches introduced by AID. The\n+data represented in these graphs can be downloaded in the download tab. 
More\n+information on the values found in healthy individuals of different ages can be\n+found in IJspeert and van Schouwenburg et al, PMID: 27799928</a><span\n+lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>.\n+<a name="OLE_LINK85"></a><a name="OLE_LINK84"></a>\n+\n+Percentage\n+of mutations in AID and pol eta motives\n+\n+Visualizes\n+<a name="OLE_LINK80"></a><a name="OLE_LINK79"></a><a name="OLE_LINK78">for each\n+(sub)class </a>the percentage of mutations that are present in AID (RGYW or\n+WRCY) or polymerase eta motives (WA or TW) in the different subclasses <span\n+lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>(R=Purine,\n+Y=pyrimidine, W = A or T).\n+\n+Relative\n+mutation patterns\n+\n+Visualizes\n+for each (sub)class the distribution of mutations between mutations at AT\n+locations and transitions or transversions at GC locations. \n+\n+Absolute\n+mutation patterns\n+\n+Visualized\n+for each (sub)class the percentage of sequenced AT and GC bases that are\n+mutated. The mutations at GC bases are divided into transition and transversion\n+mutations<a name="OLE_LINK77"></a><a name="OLE_LINK76">. </a>\n+\n+Hanna IJspeert, Pauline A. van\n+Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,\n+Andrew P. Stubbs, and Mirjam van der Burg (2016). <span\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Evaluation\n+of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and\n+Adults. In Frontiers in Immunolog, 7, pp. e410-410. [<a\n+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span\n+style=\'color:windowtext\'>doi:10.3389/fimmu.2016.00410</a>][<a\n+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span\n+style=\'color:windowtext\'>Link</a>]\n+\n+</div>\n+\n+</body>\n+\n+</html>\n'

diff -r a103134ee6e0 -r 729738462297 shm_selection.htm
--- a/shm_selection.htm Thu Feb 25 10:32:32 2021 +0000
+++ b/shm_selection.htm Wed Sep 15 12:24:06 2021 +0000

[

b'@@ -1,128 +1,128 @@\n-<html>\r\n-\r\n-<head>\r\n-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">\r\n-<meta name=Generator content="Microsoft Word 14 (filtered)">\r\n-<style>\r\n-\r\n-</style>\r\n-\r\n-</head>\r\n-\r\n-<body lang=EN-US link=blue vlink=purple>\r\n-\r\n-<div class=WordSection1>\r\n-\r\n-References\r\n-\r\n-Yaari, G. and Uduman, M. and Kleinstein, S. H. (2012). Quantifying\r\n-selection in high-throughput Immunoglobulin sequencing data sets. In<span\r\n-class=apple-converted-space> Nucleic Acids Research, 40 (17),\r\n-pp. e134\x96e134. [<span\r\n-lang=EN-GB><a href="http://dx.doi.org/10.1093/nar/gks457" target="_blank"><span\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";\r\n-color:#303030\'>doi:10.1093/nar/gks457</a>][<a\r\n-href="http://dx.doi.org/10.1093/nar/gks457" target="_blank"><span\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";\r\n-color:#303030\'>Link</a>]\r\n-\r\n-Graphs\r\n-\r\n-AA\r\n-mutation frequency\r\n-\r\n-For\r\n-each class, the frequency of replacement mutations at each amino acid position\r\n-is shown, which is calculated by dividing the number of replacement mutations\r\n-at a particular amino acid position/the number sequences that have an amino\r\n-acid at that particular position. Since the length of the CDR1 and CDR2 region\r\n-is not the same for every VH gene, some amino acids positions are absent.\r\n-Therefore we calculate the frequency using the number of amino acids present at\r\n-that that particular location. 
\r\n-\r\n-Antigen\r\n-selection (BASELINe)\r\n-\r\n-Shows\r\n-the results of the an'..b'target="_blank"><span\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";\n+color:#303030\'>doi:10.1093/nar/gks457</a>][<a\n+href="http://dx.doi.org/10.1093/nar/gks457" target="_blank"><span\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";\n+color:#303030\'>Link</a>]\n+\n+Graphs\n+\n+AA\n+mutation frequency\n+\n+For\n+each class, the frequency of replacement mutations at each amino acid position\n+is shown, which is calculated by dividing the number of replacement mutations\n+at a particular amino acid position/the number sequences that have an amino\n+acid at that particular position. Since the length of the CDR1 and CDR2 region\n+is not the same for every VH gene, some amino acids positions are absent.\n+Therefore we calculate the frequency using the number of amino acids present at\n+that that particular location. \n+\n+Antigen\n+selection (BASELINe)\n+\n+Shows\n+the results of the analysis of antigen selection as performed using BASELINe.\n+Details on the analysis performed by BASELINe can be found in Yaari et al,\n+PMID: 22641856. The settings used for the analysis are:\n+focused, SHM targeting model: human Tri-nucleotide, custom bounderies. The\n+custom boundries are dependent on the \x91sequence starts at filter\x92. \n+\n+Leader:\n+1:26:38:55:65:104:-\n+\n+FR1: 27:27:38:55:65:104:-\n+\n+CDR1: 27:27:38:55:65:104:-\n+\n+FR2: 27:27:38:55:65:104:-\n+\n+Hanna IJspeert, Pauline A. van\n+Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,\n+Andrew P. Stubbs, and Mirjam van der Burg (2016). <span\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Evaluation\n+of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and\n+Adults. In Frontiers in Immunolog, 7, pp. e410-410. 
[<a\n+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span\n+style=\'color:windowtext\'>doi:10.3389/fimmu.2016.00410</a>][<a\n+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span\n+style=\'color:windowtext\'>Link</a>]\n+\n+</div>\n+\n+</body>\n+\n+</html>\n'

diff -r a103134ee6e0 -r 729738462297 shm_transition.htm
--- a/shm_transition.htm Thu Feb 25 10:32:32 2021 +0000
+++ b/shm_transition.htm Wed Sep 15 12:24:06 2021 +0000

[

b'@@ -1,120 +1,120 @@\n-<html>\r\n-\r\n-<head>\r\n-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">\r\n-<meta name=Generator content="Microsoft Word 14 (filtered)">\r\n-<style>\r\n-\r\n-</style>\r\n-\r\n-</head>\r\n-\r\n-<body lang=EN-US link=blue vlink=purple>\r\n-\r\n-<div class=WordSection1>\r\n-\r\n-These graphs and\r\n-tables give insight into the targeting and patterns of SHM. This can give\r\n-insight into the DNA repair pathways used to solve the U:G mismatches\r\n-introduced by AID. More information on the values found in healthy individuals\r\n-of different ages can be found in IJspeert and van Schouwenburg et al, PMID:\r\n-27799928.\r\n-\r\n-<span\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Graphs\r\n-\r\n-\r\n-<a name="OLE_LINK93"></a><a\r\n-name="OLE_LINK92"></a><a name="OLE_LINK91">Heatmap transition\r\n-information</a>\r\n-\r\n-<a name="OLE_LINK98"></a><a\r\n-name="OLE_LINK97">Heatmaps visualizing for each subclass the frequency\r\n-of all possible substitutions. On the x-axes the original base is shown, while\r\n-the y-axes shows the new base. The darker the shade of blue, the more frequent\r\n-this type of substitution is occurring.\xa0 </a>\r\n-\r\n-<span\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Bargraph\r\n-transition information\r\n-\r\n-Bar graph\r\n-visualizing for each original base the distribution of substitutions into the other\r\n-bases. A graph is included for each (sub)class. \r\n-\r\n-<span\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Tables\r\n-\r\n-Transition\r\n-tables are shown for each (sub)class. All the original bases are listed\r\n-horizontally, while the new bases are listed vertically. 
\r\n-\r\n-<p class'..b't, div.msochpdefault\n+\t{mso-style-name:msochpdefault;\n+\tmargin-right:0in;\n+\tmargin-left:0in;\n+\tfont-size:12.0pt;\n+\tfont-family:"Calibri","sans-serif";}\n+p.msopapdefault, li.msopapdefault, div.msopapdefault\n+\t{mso-style-name:msopapdefault;\n+\tmargin-right:0in;\n+\tmargin-bottom:10.0pt;\n+\tmargin-left:0in;\n+\tline-height:115%;\n+\tfont-size:12.0pt;\n+\tfont-family:"Times New Roman","serif";}\n+span.apple-converted-space\n+\t{mso-style-name:apple-converted-space;}\n+.MsoChpDefault\n+\t{font-size:10.0pt;\n+\tfont-family:"Calibri","sans-serif";}\n+.MsoPapDefault\n+\t{margin-bottom:10.0pt;\n+\tline-height:115%;}\n+@page WordSection1\n+\t{size:8.5in 11.0in;\n+\tmargin:1.0in 1.0in 1.0in 1.0in;}\n+div.WordSection1\n+\t{page:WordSection1;}\n+-->\n+</style>\n+\n+</head>\n+\n+<body lang=EN-US link=blue vlink=purple>\n+\n+<div class=WordSection1>\n+\n+These graphs and\n+tables give insight into the targeting and patterns of SHM. This can give\n+insight into the DNA repair pathways used to solve the U:G mismatches\n+introduced by AID. More information on the values found in healthy individuals\n+of different ages can be found in IJspeert and van Schouwenburg et al, PMID:\n+27799928.\n+\n+<span\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Graphs\n+\n+\n+<a name="OLE_LINK93"></a><a\n+name="OLE_LINK92"></a><a name="OLE_LINK91">Heatmap transition\n+information</a>\n+\n+<a name="OLE_LINK98"></a><a\n+name="OLE_LINK97">Heatmaps visualizing for each subclass the frequency\n+of all possible substitutions. On the x-axes the original base is shown, while\n+the y-axes shows the new base. 
The darker the shade of blue, the more frequent\n+this type of substitution is occurring.\xa0 </a>\n+\n+<span\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Bargraph\n+transition information\n+\n+Bar graph\n+visualizing for each original base the distribution of substitutions into the other\n+bases. A graph is included for each (sub)class. \n+\n+<span\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Tables\n+\n+Transition\n+tables are shown for each (sub)class. All the original bases are listed\n+horizontally, while the new bases are listed vertically. \n+\n+Hanna IJspeert, Pauline A. van\n+Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,\n+Andrew P. Stubbs, and Mirjam van der Burg (2016). <span\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Evaluation\n+of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and\n+Adults. In Frontiers in Immunolog, 7, pp. e410-410. [<a\n+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span\n+style=\'color:windowtext\'>doi:10.3389/fimmu.2016.00410</a>][<a\n+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span\n+style=\'color:windowtext\'>Link</a>]\n+\n+</div>\n+\n+</body>\n+\n+</html>\n'

diff -r a103134ee6e0 -r 729738462297 summary_to_fasta.py
--- a/summary_to_fasta.py Thu Feb 25 10:32:32 2021 +0000
+++ b/summary_to_fasta.py Wed Sep 15 12:24:06 2021 +0000

@@ -37,6 +37,6 @@
o.write(">" + ID + "\n" + seq + "\n")
passed += 1

- print "No results:", no_results
- print "No sequences:", no_seqs
- print "Written to fasta file:", passed
+ print("No results:", no_results)
+ print("No sequences:", no_seqs)
+ print("Written to fasta file:", passed)

diff -r a103134ee6e0 -r 729738462297 tests/.pytest_cache/.gitignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/.pytest_cache/.gitignore Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,2 @@
+# Created by pytest automatically.
+*

diff -r a103134ee6e0 -r 729738462297 tests/.pytest_cache/CACHEDIR.TAG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/.pytest_cache/CACHEDIR.TAG Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,4 @@
+Signature: 8a477f597d28d172789f06886806bc55
+# This file is a cache directory tag created by pytest.
+# For information about cache directory tags, see:
+# http://www.bford.info/cachedir/spec.html

diff -r a103134ee6e0 -r 729738462297 tests/.pytest_cache/README.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/.pytest_cache/README.md Wed Sep 15 12:24:06 2021 +0000

[

@@ -0,0 +1,8 @@
+# pytest cache directory #
+
+This directory contains data from the pytest's cache plugin,
+which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
+
+**Do not** commit this to version control.
+
+See [the docs](https://docs.pytest.org/en/stable/cache.html) for more information.

diff -r a103134ee6e0 -r 729738462297 tests/.pytest_cache/v/cache/nodeids
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/.pytest_cache/v/cache/nodeids Wed Sep 15 12:24:06 2021 +0000

[

@@ -0,0 +1,3 @@
+[
+ "test_shm_csr.py::test_aa_histogram_sum"
+]
\ No newline at end of file

diff -r a103134ee6e0 -r 729738462297 tests/.pytest_cache/v/cache/stepwise
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/.pytest_cache/v/cache/stepwise Wed Sep 15 12:24:06 2021 +0000

[

@@ -0,0 +1,1 @@
+[]
\ No newline at end of file

diff -r a103134ee6e0 -r 729738462297 tests/__pycache__/test_shm_csr.cpython-37-pytest-6.2.4.pyc

Binary file tests/__pycache__/test_shm_csr.cpython-37-pytest-6.2.4.pyc has changed

diff -r a103134ee6e0 -r 729738462297 tests/data/CONTROL_NWK377_PB_IGHC_MID1_40nt_2.txz

Binary file tests/data/CONTROL_NWK377_PB_IGHC_MID1_40nt_2.txz has changed

diff -r a103134ee6e0 -r 729738462297 tests/sequence_overview/ntoverview.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/sequence_overview/ntoverview.txt Wed Sep 15 12:24:06 2021 +0000

b'@@ -0,0 +1,1577 @@\n+Sequence.ID\tbest_match\tSequence of the analysed region\tA\tC\tG\tT\n+JY8QFUQ01A0005\tIGG1\tggtggctccatcaacagtagaaattattat tggggctggatccgccagcccccagggaagggtttggagtggattggaaat atctattatagtgggaacacc tactacaatccgtccctcaagagtcgagtcaccgtatccgtagacaggtctaagaaccagttgtccctgaagctgacctctctgaccgccgcagacacggccgtatattactgt\t55\t55\t55\t51\n+JY8QFUQ01A004N\tIGG1\tggtggctccgtcagtaggagtgcctactac tggggctggatccgccagcccccagggaaggggctggagtggattgggacc atctattatagtgggaccaca tactccaatccgtccctcaagactcgagtcaccatgtccttggacacgtccaagaaccacatctccctgaagctgaattctgtgaccgccgcagacacggctgtttattactgt\t47\t63\t58\t48\n+JY8QFUQ01A006G\tIGG1\tggtgactccatcagtagtactcattactat tggggctggatccggcagcccccagggaggggactggagtgggttgggagt atccactacactgggagcacc tactacaactggtccctcaagcatcgagtctctatatcggtggacacatcgagtaaccagttctccctgaggttgaggtctgtgaccgccgctgacacggctgtatactactgt\t46\t57\t62\t51\n+JY8QFUQ01A018V\tIGA1\tggtgtctccatgagcaatgagtcctattac tggacgtggatccggcagcccgtcgggaagggaccggagtggattgggcgc atctacaccagtgggagcacc aattataatccttccctcaagagtcgagtcaccatgtccttagacacgtccaagaggcagttctccctgaagttgacctctatgaccgccgcagacacggccacatatttctgt\t50\t61\t57\t48\n+JY8QFUQ01A019O\tIGG1\tggatacatctttaatatccactgg atcgcctgggtccgccagatgcccgggaaaggcctggagtggatggggatc atctatcctggtaactctgagacc aaatatagcccggccttccaaggccaggtcaccatctcagccgacaggaccaccaataccgcctacctgcagtggcgcggcctgaaggcctcggacaccgccatgtattactgt\t49\t66\t56\t42\n+JY8QFUQ01A01KX\tIGG1\tggattcacatttagaagctattcc atgaattgggtccgccaggctccagggaaggggctggagtgggtctcagct gtcagtggtggtgggggcgccaca aactacgcggagtccgtgaagggccggttcaccatctccagagacaattccagggggacggtgttcttacaaatgaacagcctgagagtcgaagacacagccttatattattgt\t51\t48\t67\t47\n+JY8QFUQ01A0207\tIGG2\tggattcacctttaccaactatggc atgagctgggtccgccaggctccagggaaggggctggagtgggtctcaggt attagtggtagtggtgaaaccaca taccacgcagaatccgtgcagggccggttcaccatctccagagacaactccaagaacaatctgtatctgcaaatgaacagtctgagagccgaggacacggccatttattactgt\t56\t53\t59\t45\n+JY8QFUQ01A02HL\tIGA1\tggattcactttcagtaactactgg 
atgtactgggtccgccaagctccagggaaggggctggagtgggtctcacgt attaatggtgatggaagtagtaca agttacgtggactccgtgaagggccgattcaccatctccagagacaacgccaagaacaccctgtatctgcaaatgaacagtctgagagtcgacgacacggctgtttattactgt\t56\t48\t59\t50\n+JY8QFUQ01A02KS\tIGA2\tggattcacctttagtacctattgg atgacttgggtccgccaggctccagggaaggggctggagtgggtggccagc ataaaaaatgatggaagtgagaaa tcctatgtggactctgtaaagggccgattcaccatctccagagacaacgccgagaactcactgtatttgcaagtgaacaacctgagagccgaggacacggctgtatattactgt\t59\t46\t61\t47\n+JY8QFUQ01A02XZ\tIGG1\tggattcacctacagcagctatgcc atgagctgggtccgccaggctccagggaaggggctggagtgggtctcagca attagtggtggtggtgctagtaca taccacgcagactccgtgaagggccgattcaccatctccagagacaattccaagaacacgctgtatttgcaaatgaacagcctgagagccgacgacacggccgtatattactgt\t54\t55\t61\t43\n+JY8QFUQ01A03E3\tIGA2\tggattcaccttcagtaggtactgg atgcactgggtccgccaagttccagggaaggggccggtgtgggtctcacgt attaatgaagacggcagccacaca gatcacgcggactccgtgaagggccgattcaccatctccagagacaacgccaagaacaagttgtatttgcaaatgaacagtctgagaggcgaggacacggctgtctattattgt\t56\t51\t62\t44\n+JY8QFUQ01A03N6\tIGG1\tggtgactccatgagtagcgacacgtgg tggagctgggtccgccagacgccagagaagggactggaatggattggggag atcaatcaaagagggacgacc tcctacaacccgtccctcaggagtcgagtcgtcctgtcagtgggcgagtccaaaaatcaattctccctgaggctgacctctgtgaccgccgcggactcggccatctattattgt\t49\t57\t65\t42\n+JY8QFUQ01A08XO\tIGG1\tggtggctccgtcagcagtggtagttactac tggagctggatccggcagcccccagggaagggactggagtggattgggtat atctattacagtgggagcacc aactacaacccctccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgt\t50\t59\t61\t46\n+JY8QFUQ01A0939\tIGA2\tggggacagtgtctctaccacccgtgctgct tggaactggatcaggcagtccccatcgggaggccttgagtggctgggaagg acatactacaggtccaagtggcttaat gattatgcagtgtctgtgaaaagtcgaattaccatcaatccagacacatccaagaaccagttttccctgcagttgaaatctgtcattcccgaggacacggctgtttattactgt\t55\t54\t57\t56\n+JY8QFUQ01A09OY\tIGA2\tggattcatcttcagtgactactac atgacctggatccgccaggctccagggaaggggctggagtgggtttcatac attcgtagtaatgggagtcccata 
tacaacgcagactctgggaggggccgattcaccatctccagggacaacgccaagaactcactgtatctgcaaatgaatagtctgagagtcgaggacacggccgtgtattactgt\t55\t51\t59\t48\n+JY8QFUQ01A0C2Y\tIGG1\tggatttacttttaacaactattgg atgacctgggttcgccaggctccagggaaggggctggaatgggtggccaac ataaaacaacatggaggtgaaacg tactatgtggactctgtga'..b'cacatccgcgaacacagcctacatggagctgagcagcctgacatctgaagacacggctgtgtattactgt\t60\t55\t54\t42\n+JY8QFUQ01DG5KX\tunmatched, IGA2\tgggttctccgtcagtttcaactac atgagctgggtccgccaggctccagggaaggggctggagtgggtctcagtt atctatgccgatggaagtaca ttctatgcagactccgtgaagggccgattcatcatctccagagacaattcaaagaacacgctcaatcttcaaatgaatagtttgagagttgacgacacggctgtgtattactgt\t53\t47\t56\t54\n+JY8QFUQ01DG6GC\tunmatched, IGA2\tggattcacctttagtagatattcc atgcactgggtccgccaggctccaggcaaggggctagagtgggtggcactt atatcatacgatggaagtagaaga atctacgcagactccgtgaagggccgattcaccatctccagagacacttccaagaacacggtgtatctgcaaatgagtagcctgagacctgaggacacggctgtgtattactgt\t57\t50\t58\t48\n+JY8QFUQ01DHXHT\tunmatched, IGG1\tggtggctccgtcagtaggagtgcctactac tggggctggatccgccagcccccagggaaggggctggagtggattgggacc atctattatagtgggaccaca tactccaatccgtccctcaagactcgagtcaccatgtccttggacacgtccaagaaccacatctccctgaagctgaattctgtgaccgccgcagacacggctgtttattactgt\t47\t63\t58\t48\n+JY8QFUQ01DI39D\tunmatched, IGA1\tggaaaaaccctcactgaagtatcc atgcactgggtgcgacaggctcctggaaaagggcttgagtggatgggagga tttgatcctgaagatggtgaaata atctacgcacagaagttccagggcagaatcaccgtgaccgaggacacatctacagacacagcctacatggagctgagcagcctgagatctgaagacacggccgtgtattactgt\t63\t48\t61\t41\n+JY8QFUQ01DIBNC\tunmatched, IGG1\tggtggctccgtcagcagtggtagttactac tggagctggatccggcagcccccagggaagggactggagtggattgggtat atctattacagtgggagcacc aactacaacccctccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgt\t50\t59\t61\t46\n+JY8QFUQ01DJFIZ\tunmatched, IGG1\tggattcaacttggcgaagttcgcc atgagctgggtccgccaggctcctgggaaggggctggagtgggtctcagag atcagtggctccggtagtaaagtc 
ggatatgcggagtccgtgaagggccgattcaccatctccaaagacaattccaagaacacattgtacttgcaaatgaccgacctgagacccggcgacacggccatttattactgt\t52\t53\t63\t45\n+JY8QFUQ01DLDLD\tunmatched, IGA1\tggatacaccttcaccagctactat atacactgggtgcgacaggcccctggacaagggcttgagtggatgggaata atcgaccctagtggtggtgccaca agctacgcacagcagttccagggcagagtcaccatgaccagggacacgtccacgagcacagtctatatggagctgagcagcctgagatctgacgacacggccgtgtattactgt\t55\t57\t61\t40\n+JY8QFUQ01DMF0A\tunmatched, IGA1\tgttgacgccataagcgacctcggttatttc tgggcctgggtccgccagcccgccgcgaagggactggagtggatcggacat gcccttggtgatggatatacc gaatacaaccccgccctagagagtcgaatcaccgtgtcagtggacaagtccaagaaccagttttccctgacgttggagtccgtgaccgccgcagacacggccacttatttctgt\t46\t63\t61\t46\n+JY8QFUQ01DOVL5\tunmatched, IGA2\tggattcatcttcagcaaccttgcg atgcactgggtccgccaggctccaggcaaggggctggagtgggtggcaatt atatcatatgatggaggtattaag tactatgcagactccgtgaagggccgattcaccatctccagagacaattccaagaacacgctgtatctacaaatgaacaacctgagacttgaggacacggctgtgtattactgt\t58\t49\t56\t50\n+JY8QFUQ01DPT8R\tunmatched, IGA2\tggattcaccttcagtagctactgg atgcactgggtccgccaagctccagggaaggggctggtgtgggtctcacgt gttaatggtgatggggtagcaca gcctacgcggactccgtgaagggccgattcaccatctccagagacaacgccaagaacactctctatctccaaatgaacagtctgagagccgaggacacggctgtatattactgt\t51\t55\t61\t45\n+JY8QFUQ01DUD3U\tunmatched, IGA2\tggattcacttttaggagtcatatg atgagttgggtccgccagactccagggaaggggctggaatgggtctcaagt attcgagccagtggtgataggaca cactatgcagactccgtgaggggccgcttcaccatctccagagacaactccaagaacacgatgtatttgcaaatgcacagcctgagagtcgacgacacggccgtatactactgt\t56\t51\t60\t46\n+JY8QFUQ01DV4HU\tunmatched, IGG2\tggattcacctctcctagatactgg atgaattgggtccgccaggcttccgggaaggggctggagtgggtggccaac ataaagcaagacggaagtgaggaa aactttgtggactctgtgaagggccggttcaccatctccagagacagcgccaagaattcaatgtctctacaaatgaacagcctgagagtcgaggacacggctgtatattattgc\t58\t47\t63\t45\n+JY8QFUQ01DV8LF\tunmatched, IGA2\tggattcaccttcagtcgctactgg atgcactgggtccgccaagctccagggaagggcctggtgtgggtctcacgt attaaaagtgatgggattagcaca 
acgtacgcggactccgtgaagggccgattcaccatctccagagacaacgccaagaacacggtgtatctgcaaatgaacagtctgagagccgaggacacggctgtgtactactgt\t54\t54\t62\t43\n+JY8QFUQ01DVBU0\tunmatched, IGA2\tggattcatcttcagtagctactgg atgcactgggtccgccaagctccagggaaggggctggtgtgggtgtcacgt agtaatacggggggactgacaca gcctacgcggactccgtgaagggccgattcaccatctcccgagacaacgggaagaacacgctgtatctgcaaatgaacagtctgagagccgaggacacggctgtttattactgt\t51\t52\t66\t43\n+JY8QFUQ01DXDOM\tunmatched, IGA2\tggattcagtttcactggttttacc gtgatctgggtccgccaggctccaaggaaggggctggaatggatctcatcc gtcactactaatggtctcacg tactacgcagactcagtagagggccgattcaacatctccagggacaacgccaacaatttagtgtttctgcaaatgaacagcctgagagtcgaggacactggtgtatattattgt\t53\t49\t54\t54\n'

diff -r a103134ee6e0 -r 729738462297 tests/sort_by_time.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/sort_by_time.py Wed Sep 15 12:24:06 2021 +0000

[

@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+
+"""Small script to profile bash scripts that have been run with the following
+code inside:
+
+    exec 5> debug_output.txt
+    BASH_XTRACEFD="5"
+    PS4='$(date +%s.%N) $LINENO: '
+    set -x
+
+
+"""
+import calendar
+import time
+import sys
+
+import re
+
+# Leading "<seconds>.<fraction>" timestamp as written by the PS4 prompt
+# '$(date +%s.%N) $LINENO: ' from the module docstring. The dot is escaped
+# so only a literal decimal point may separate the two digit runs; an
+# unescaped "." would also accept e.g. the space in "1631708646 12: cmd".
+SECONDS_FINDER = re.compile(r"^(\d+\.\d+).*")
+
+
+def file_to_timestamped_lines(input_file):
+    """Yield ``(seconds_since_epoch, line)`` for every timestamped line.
+
+    :param input_file: path of the trace file to read (opened in text mode).
+    :return: generator of ``(float, str)`` tuples in file order. ``line`` is
+             the raw line as read, including its trailing newline.
+
+    Lines on which SECONDS_FINDER finds no leading timestamp are skipped.
+    Previously such a line raised AttributeError, because ``search()``
+    returned None. Presumably these are continuation lines of multi-line
+    commands in the xtrace output; verify against real traces.
+    """
+    with open(input_file, "rt") as file_h:
+        for line in file_h:
+            match = SECONDS_FINDER.search(line)
+            if match is None:
+                # No timestamp at the start of this line: nothing to time.
+                continue
+            yield float(match.group(1)), line
+
+
+def time_delta_lines(input_file):
+    """Yield ``(time_since, line)`` for the timestamped lines of a trace.
+
+    ``time_since`` is the timestamp of the next line minus the timestamp of
+    ``line`` itself, i.e. the time that passed until the next trace line was
+    written. The last line has no successor and is therefore never yielded.
+
+    :param input_file: path passed on to file_to_timestamped_lines().
+    :return: generator of ``(float, str)`` tuples in file order; empty when
+             the input holds fewer than two timestamped lines.
+    """
+    timestamped_lines = file_to_timestamped_lines(input_file)
+    # A bare next() would raise StopIteration on empty input, which Python
+    # turns into RuntimeError inside a generator (PEP 479). End cleanly.
+    first = next(timestamped_lines, None)
+    if first is None:
+        return
+    current_time, current_line = first
+    for next_time, next_line in timestamped_lines:
+        time_since = next_time - current_time
+        yield time_since, current_line
+        current_time = next_time
+        current_line = next_line
+
+
+if __name__ == "__main__":
+    # Durations above one year are probably nonsensical parsing errors.
+    max_plausible_seconds = 60*60*24*365
+    trace_path = sys.argv[1]
+    # Report the trace lines ordered from shortest to longest duration.
+    durations = list(time_delta_lines(trace_path))
+    durations.sort(key=lambda pair: pair[0])
+    for duration, trace_line in durations:
+        if duration > max_plausible_seconds:
+            continue
+        print(duration, trace_line.strip())

diff -r a103134ee6e0 -r 729738462297 tests/test_shm_csr.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_shm_csr.py Wed Sep 15 12:24:06 2021 +0000

[

@@ -0,0 +1,108 @@
+# Copyright (c) 2021 Leiden University Medical Center
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+import pytest
+
+# Absolute path (as a string) of the directory two levels above this file,
+# i.e. the parent of the directory that contains this test module.
+GIT_ROOT = str(Path(__file__).parent.parent.absolute())
+# Directory that contains this test module.
+TEST_DIR = Path(__file__).parent
+# Directory holding the input archive used by the tests.
+TEST_DATA_DIR = TEST_DIR / "data"
+# Directory whose files are listed to parametrize the validation test.
+VALIDATION_DATA_DIR = TEST_DIR / "validation_data"
+# Input archive handed to wrapper.sh as its first argument.
+CONTROL_NWK377_PB_IGHC_MID1_40nt_2 = TEST_DATA_DIR / "CONTROL_NWK377_PB_IGHC_MID1_40nt_2.txz"
+
+
+@pytest.fixture(scope="module")
+def shm_csr_result():
+    """Run wrapper.sh once per test module and yield its output directory.
+
+    GIT_ROOT is copied into a fresh temporary directory and
+    ``bash wrapper.sh`` is executed inside that copy on the control archive.
+    stdout and stderr of the script are forwarded to those of the test
+    process. Because of ``check=True`` a non-zero exit status raises
+    ``subprocess.CalledProcessError`` and fails the fixture.
+
+    :return: yields a ``pathlib.Path`` to the ``results`` directory inside
+             the temporary copy.
+    """
+    # mkdtemp() atomically creates a private directory, unlike the deprecated
+    # and race-prone mktemp(), which only returns a name. copytree() needs a
+    # destination that does not exist yet, so the copy goes into a
+    # sub directory of the freshly created one.
+    temp_root = tempfile.mkdtemp()
+    temp_dir = os.path.join(temp_root, "shm_csr")
+    shutil.copytree(GIT_ROOT, temp_dir)
+    # Named input_file so the builtin input() is not shadowed.
+    input_file = str(CONTROL_NWK377_PB_IGHC_MID1_40nt_2)
+    out_files_path = os.path.join(temp_dir, "results")
+    out_file = os.path.join(out_files_path, "result.html")
+    infile_name = "input_data"
+    functionality = "productive"
+    unique = "Sequence.ID"
+    naive_output = "no"
+    naive_output_ca = "None"
+    naive_output_cg = "None"
+    naive_output_cm = "None"
+    naive_output_ce = "None"
+    naive_output_all = "None"
+    filter_unique = "remove"
+    filter_unique_count = '2'
+    class_filter = '70_70'
+    empty_region_filter = 'FR1'
+    fast = 'no'
+    # Positional arguments in the order wrapper.sh is called with here.
+    cmd = [
+        "bash",
+        "wrapper.sh",
+        input_file,
+        "custom",
+        out_file,
+        out_files_path,
+        infile_name,
+        "-",
+        functionality,
+        unique,
+        naive_output,
+        naive_output_ca,
+        naive_output_cg,
+        naive_output_cm,
+        naive_output_ce,
+        naive_output_all,
+        filter_unique,
+        filter_unique_count,
+        class_filter,
+        empty_region_filter,
+        fast
+    ]
+    subprocess.run(cmd, cwd=temp_dir, stdout=sys.stdout, stderr=sys.stderr,
+                   check=True)
+    yield Path(out_files_path)
+    # NOTE(review): cleanup stays disabled as before, presumably so that the
+    # results can be inspected after a run; confirm before enabling.
+    #shutil.rmtree(temp_root)
+
+
+def test_check_output(shm_csr_result):
+    """The output directory yielded by the fixture must exist."""
+    output_dir_exists = shm_csr_result.exists()
+    assert output_dir_exists
+
+
+@pytest.mark.parametrize("filename", os.listdir(VALIDATION_DATA_DIR))
+def test_results_match_validation(shm_csr_result, filename):
+    """Compare one result file with its validation copy, line by line.
+
+    :param shm_csr_result: output directory yielded by the fixture.
+    :param filename: name of a file in VALIDATION_DATA_DIR; a file with the
+                     same name is opened in the output directory.
+    """
+    if filename == "shm_overview.txt":
+        # TODO: Fix errors in shm_overview.
+        # Reported as skipped rather than silently counted as passed.
+        pytest.skip("shm_overview.txt is known to contain errors")
+    with open(Path(shm_csr_result, filename)) as result_h:
+        with open(Path(VALIDATION_DATA_DIR, filename)) as validate_h:
+            for line in result_h:
+                assert line == validate_h.readline()
+            # The loop only follows the result file, so a truncated or empty
+            # result would pass. readline() returns "" only at end of file:
+            # make sure the validation file has no lines left.
+            assert validate_h.readline() == ""
+
+
+def test_nt_overview(shm_csr_result):
+    """Compare sequence_overview/ntoverview.txt with the stored copy.
+
+    :param shm_csr_result: output directory yielded by the fixture.
+    """
+    with open(Path(shm_csr_result, "sequence_overview", "ntoverview.txt")
+              ) as result_h:
+        with open(Path(TEST_DIR, "sequence_overview", "ntoverview.txt")
+                  ) as validate_h:
+            for line in result_h:
+                assert line == validate_h.readline()
+            # The loop only follows the result file, so a truncated or empty
+            # result would pass. readline() returns "" only at end of file:
+            # make sure the expected file has no lines left.
+            assert validate_h.readline() == ""

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/IGA_pie.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/IGA_pie.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,3 @@
+Gene Freq label
+IGA1 593 IGA1 - 593
+IGA2 324 IGA2 - 324

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/IGG_pie.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/IGG_pie.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,5 @@
+Gene Freq label
+IGG1 274 IGG1 - 274
+IGG2 150 IGG2 - 150
+IGG3 26 IGG3 - 26
+IGG4 19 IGG4 - 19

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/aa_histogram_sum.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/aa_histogram_sum.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,105 @@
+index mutations.at.position aa.at.position
+1 0 1387
+2 0 1387
+3 0 1387
+4 0 1387
+5 0 1387
+6 0 1387
+7 0 1387
+8 0 1387
+9 0 1387
+10 0 1387
+11 0 1387
+12 0 1387
+13 0 1387
+14 0 1387
+15 0 1387
+16 0 1387
+17 0 1387
+18 0 1387
+19 0 1387
+20 0 1387
+21 0 1387
+22 0 1387
+23 0 1387
+24 0 1387
+25 0 1387
+26 0 1387
+27 123 1387
+28 180 1387
+29 528 1387
+30 167 1383
+31 78 183
+32 0 0
+33 0 0
+34 56 161
+35 692 1345
+36 1166 1385
+37 451 1387
+38 432 1387
+39 123 1387
+40 601 1387
+41 9 1387
+42 95 1387
+43 3 1387
+44 36 1387
+45 279 1387
+46 47 1387
+47 65 1387
+48 172 1387
+49 41 1387
+50 82 1387
+51 82 1387
+52 81 1387
+53 156 1387
+54 126 1387
+55 671 1387
+56 383 1387
+57 495 1386
+58 947 1386
+59 598 1382
+60 31 63
+61 14 54
+62 251 999
+63 730 1386
+64 1074 1386
+65 467 1386
+66 736 1387
+67 162 1387
+68 203 1387
+69 153 1387
+70 85 1387
+71 61 1387
+72 402 1387
+73 0 1387
+74 101 1387
+75 14 1387
+76 106 1387
+77 255 1387
+78 295 1387
+79 69 1387
+80 209 1387
+81 37 1387
+82 238 1387
+83 162 1387
+84 330 1387
+85 394 1387
+86 223 1387
+87 344 1387
+88 301 1387
+89 25 1387
+90 341 1387
+91 95 1387
+92 566 1387
+93 265 1387
+94 51 1387
+95 217 1387
+96 443 1387
+97 205 1387
+98 3 1387
+99 64 1387
+100 41 1387
+101 632 1387
+102 8 1387
+103 239 1387
+104 1 1387

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/aa_histogram_sum_IGA.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/aa_histogram_sum_IGA.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,105 @@
+index mutations.at.position aa.at.position
+1 0 917
+2 0 917
+3 0 917
+4 0 917
+5 0 917
+6 0 917
+7 0 917
+8 0 917
+9 0 917
+10 0 917
+11 0 917
+12 0 917
+13 0 917
+14 0 917
+15 0 917
+16 0 917
+17 0 917
+18 0 917
+19 0 917
+20 0 917
+21 0 917
+22 0 917
+23 0 917
+24 0 917
+25 0 917
+26 0 917
+27 75 917
+28 99 917
+29 353 917
+30 88 913
+31 46 112
+32 0 0
+33 0 0
+34 43 98
+35 421 889
+36 759 915
+37 309 917
+38 319 917
+39 108 917
+40 429 917
+41 0 917
+42 82 917
+43 0 917
+44 28 917
+45 190 917
+46 24 917
+47 45 917
+48 112 917
+49 35 917
+50 38 917
+51 66 917
+52 50 917
+53 108 917
+54 89 917
+55 425 917
+56 263 917
+57 326 916
+58 685 916
+59 399 913
+60 23 47
+61 14 38
+62 181 674
+63 450 916
+64 754 916
+65 294 916
+66 552 917
+67 91 917
+68 155 917
+69 86 917
+70 73 917
+71 53 917
+72 236 917
+73 0 917
+74 53 917
+75 8 917
+76 63 917
+77 174 917
+78 179 917
+79 58 917
+80 137 917
+81 20 917
+82 142 917
+83 114 917
+84 207 917
+85 257 917
+86 141 917
+87 218 917
+88 202 917
+89 17 917
+90 199 917
+91 64 917
+92 353 917
+93 162 917
+94 29 917
+95 130 917
+96 314 917
+97 152 917
+98 3 917
+99 37 917
+100 29 917
+101 421 917
+102 8 917
+103 156 917
+104 1 917

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/aa_histogram_sum_IGG.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/aa_histogram_sum_IGG.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,105 @@
+index mutations.at.position aa.at.position
+1 0 469
+2 0 469
+3 0 469
+4 0 469
+5 0 469
+6 0 469
+7 0 469
+8 0 469
+9 0 469
+10 0 469
+11 0 469
+12 0 469
+13 0 469
+14 0 469
+15 0 469
+16 0 469
+17 0 469
+18 0 469
+19 0 469
+20 0 469
+21 0 469
+22 0 469
+23 0 469
+24 0 469
+25 0 469
+26 0 469
+27 48 469
+28 81 469
+29 175 469
+30 79 469
+31 32 71
+32 0 0
+33 0 0
+34 13 63
+35 271 455
+36 407 469
+37 142 469
+38 113 469
+39 15 469
+40 172 469
+41 9 469
+42 13 469
+43 3 469
+44 8 469
+45 89 469
+46 23 469
+47 20 469
+48 60 469
+49 6 469
+50 44 469
+51 16 469
+52 31 469
+53 48 469
+54 37 469
+55 246 469
+56 120 469
+57 169 469
+58 262 469
+59 199 468
+60 8 16
+61 0 16
+62 70 324
+63 280 469
+64 320 469
+65 173 469
+66 184 469
+67 71 469
+68 48 469
+69 67 469
+70 12 469
+71 8 469
+72 166 469
+73 0 469
+74 48 469
+75 6 469
+76 43 469
+77 81 469
+78 116 469
+79 11 469
+80 72 469
+81 17 469
+82 96 469
+83 48 469
+84 123 469
+85 137 469
+86 82 469
+87 126 469
+88 99 469
+89 8 469
+90 142 469
+91 31 469
+92 213 469
+93 103 469
+94 22 469
+95 87 469
+96 129 469
+97 53 469
+98 0 469
+99 27 469
+100 12 469
+101 211 469
+102 0 469
+103 83 469
+104 0 469

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/absolute_mutations.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/absolute_mutations.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,37 @@
+Class Type value
+IGA G/C transitions 5.8
+IGA1 G/C transitions 6.2
+IGA2 G/C transitions 5
+IGG G/C transitions 5.4
+IGG1 G/C transitions 5.6
+IGG2 G/C transitions 5.3
+IGG3 G/C transitions 3.8
+IGG4 G/C transitions 4.8
+IGM G/C transitions 0
+IGE G/C transitions 0
+un G/C transitions 5.2
+all G/C transitions 5.6
+IGA G/C transversions 4.3
+IGA1 G/C transversions 4.4
+IGA2 G/C transversions 3.9
+IGG G/C transversions 4.2
+IGG1 G/C transversions 4.4
+IGG2 G/C transversions 4
+IGG3 G/C transversions 3.1
+IGG4 G/C transversions 4.7
+IGM G/C transversions 0
+IGE G/C transversions 0
+un G/C transversions 4
+all G/C transversions 4.2
+IGA A/T 8.3
+IGA1 A/T 8.6
+IGA2 A/T 7.7
+IGG A/T 8.4
+IGG1 A/T 8.7
+IGG2 A/T 8
+IGG3 A/T 7
+IGG4 A/T 9.3
+IGM A/T 0
+IGE A/T 0
+un A/T 8
+all A/T 8.3

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/frequency_ranges_classes.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/frequency_ranges_classes.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,16 @@
+best_match_class frequency_bins frequency_count class_sum frequency
+IGA 0 or lower 2 917 0.22
+IGA 0 to 2 12 917 1.31
+IGA 2 to 5 111 917 12.1
+IGA 5 to 10 453 917 49.4
+IGA 10 to 15 264 917 28.79
+IGA 15 to 20 60 917 6.54
+IGA 20 or higher 15 917 1.64
+IGG 0 or lower 18 469 3.84
+IGG 0 to 2 5 469 1.07
+IGG 2 to 5 34 469 7.25
+IGG 5 to 10 245 469 52.24
+IGG 10 to 15 120 469 25.59
+IGG 15 to 20 41 469 8.74
+IGG 20 or higher 6 469 1.28
+IGM 0 or lower 1 1 100

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/frequency_ranges_subclasses.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/frequency_ranges_subclasses.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,40 @@
+best_match best_match_class frequency_bins frequency_count class_sum frequency
+IGA1 IGA 0 or lower 2 593 0.34
+IGA1 IGA 0 to 2 5 593 0.84
+IGA1 IGA 2 to 5 58 593 9.78
+IGA1 IGA 5 to 10 282 593 47.55
+IGA1 IGA 10 to 15 188 593 31.7
+IGA1 IGA 15 to 20 45 593 7.59
+IGA1 IGA 20 or higher 13 593 2.19
+IGA2 IGA 0 to 2 7 324 2.16
+IGA2 IGA 2 to 5 53 324 16.36
+IGA2 IGA 5 to 10 171 324 52.78
+IGA2 IGA 10 to 15 76 324 23.46
+IGA2 IGA 15 to 20 15 324 4.63
+IGA2 IGA 20 or higher 2 324 0.62
+IGG1 IGG 0 or lower 8 274 2.92
+IGG1 IGG 0 to 2 4 274 1.46
+IGG1 IGG 2 to 5 19 274 6.93
+IGG1 IGG 5 to 10 139 274 50.73
+IGG1 IGG 10 to 15 70 274 25.55
+IGG1 IGG 15 to 20 30 274 10.95
+IGG1 IGG 20 or higher 4 274 1.46
+IGG2 IGG 0 or lower 5 150 3.33
+IGG2 IGG 2 to 5 11 150 7.33
+IGG2 IGG 5 to 10 83 150 55.33
+IGG2 IGG 10 to 15 43 150 28.67
+IGG2 IGG 15 to 20 8 150 5.33
+IGG3 IGG 0 or lower 4 26 15.38
+IGG3 IGG 0 to 2 1 26 3.85
+IGG3 IGG 2 to 5 2 26 7.69
+IGG3 IGG 5 to 10 14 26 53.85
+IGG3 IGG 10 to 15 3 26 11.54
+IGG3 IGG 15 to 20 1 26 3.85
+IGG3 IGG 20 or higher 1 26 3.85
+IGG4 IGG 0 or lower 1 19 5.26
+IGG4 IGG 2 to 5 2 19 10.53
+IGG4 IGG 5 to 10 9 19 47.37
+IGG4 IGG 10 to 15 4 19 21.05
+IGG4 IGG 15 to 20 2 19 10.53
+IGG4 IGG 20 or higher 1 19 5.26
+IGM IGM 0 or lower 1 1 100

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/mutation_by_id.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/mutation_by_id.txt Wed Sep 15 12:24:06 2021 +0000

b'@@ -0,0 +1,1577 @@\n+Sequence.ID\tVRegionMutations\tVRegionNucleotides\ttransitionMutations\ttransversionMutations\ttransitionMutationsAtGC\ttransitionMutationsAtAT\tsilentMutationsFR\tnonSilentMutationsFR\tsilentMutationsCDR\tnonSilentMutationsCDR\n+JY8QFUQ01A0005\t18\t216\t12\t6\t11\t1\t6\t6\t2\t4\n+JY8QFUQ01A004N\t20\t216\t10\t10\t4\t6\t2\t10\t2\t6\n+JY8QFUQ01A006G\t32\t216\t20\t12\t11\t9\t10\t12\t3\t7\n+JY8QFUQ01A018V\t31\t216\t19\t12\t12\t7\t7\t14\t5\t5\n+JY8QFUQ01A019O\t19\t213\t10\t9\t6\t4\t2\t10\t0\t7\n+JY8QFUQ01A01KX\t29\t213\t18\t11\t8\t10\t6\t13\t3\t7\n+JY8QFUQ01A0207\t16\t213\t6\t10\t4\t2\t2\t8\t0\t6\n+JY8QFUQ01A02HL\t13\t213\t9\t4\t8\t1\t3\t5\t3\t2\n+JY8QFUQ01A02KS\t14\t213\t9\t5\t6\t3\t3\t7\t1\t3\n+JY8QFUQ01A02XZ\t10\t213\t6\t4\t3\t3\t3\t2\t1\t4\n+JY8QFUQ01A03E3\t21\t213\t14\t7\t8\t6\t4\t8\t3\t6\n+JY8QFUQ01A03N6\t39\t213\t22\t17\t11\t11\t8\t19\t2\t10\n+JY8QFUQ01A08XO\t0\t216\t0\t0\t0\t0\t0\t0\t0\t0\n+JY8QFUQ01A0939\t15\t222\t7\t8\t4\t3\t7\t3\t0\t5\n+JY8QFUQ01A09OY\t12\t213\t6\t6\t5\t1\t2\t5\t1\t4\n+JY8QFUQ01A0C2Y\t18\t213\t12\t6\t8\t4\t3\t4\t4\t7\n+JY8QFUQ01A0C33\t13\t213\t5\t8\t3\t2\t2\t3\t0\t8\n+JY8QFUQ01A0C4X\t13\t213\t4\t9\t3\t1\t2\t3\t0\t8\n+JY8QFUQ01A0D2K\t10\t213\t4\t6\t0\t4\t0\t2\t2\t6\n+JY8QFUQ01A0D5E\t28\t212\t14\t14\t11\t3\t5\t10\t2\t11\n+JY8QFUQ01A0DA8\t23\t213\t12\t11\t4\t8\t5\t11\t4\t3\n+JY8QFUQ01A0DCS\t22\t213\t12\t10\t6\t6\t4\t9\t0\t9\n+JY8QFUQ01A0EF3\t12\t213\t8\t4\t7\t1\t4\t3\t1\t4\n+JY8QFUQ01A0ESJ\t17\t213\t8\t9\t5\t3\t4\t7\t0\t6\n+JY8QFUQ01A0FII\t26\t213\t15\t11\t8\t7\t5\t9\t3\t9\n+JY8QFUQ01A0FO5\t11\t213\t6\t5\t2\t4\t4\t4\t0\t3\n+JY8QFUQ01A0GVR\t24\t216\t17\t7\t10\t7\t9\t5\t4\t6\n+JY8QFUQ01A0GVY\t30\t213\t22\t8\t15\t7\t8\t9\t4\t9\n+JY8QFUQ01A0HBK\t0\t213\t0\t0\t0\t0\t0\t0\t0\t0\n+JY8QFUQ01A0IZI\t18\t213\t11\t7\t7\t4\t3\t5\t0\t10\n+JY8QFUQ01A0LAJ\t29\t219\t15\t14\t8\t7\t4\t14\t1\t10\n+JY8QFUQ01A0LBC\t31\t213\t14\t17\t7\t7\t7\t12\t1\t11\n+JY8QFUQ01A0LEW\t9\t213\t5\t4\t2\t3\t2\t4\t3\t0\n+JY8QFUQ01A0LZ5\t20\t213\t14\t6\t11\t3\t4\t5\t
2\t9\n+JY8QFUQ01A0N2E\t0\t213\t0\t0\t0\t0\t0\t0\t0\t0\n+JY8QFUQ01A0N8H\t13\t222\t9\t4\t5\t4\t2\t6\t0\t5\n+JY8QFUQ01A0OC8\t12\t212\t6\t6\t2\t4\t3\t5\t0\t4\n+JY8QFUQ01A0OMH\t26\t213\t16\t10\t12\t4\t9\t8\t2\t7\n+JY8QFUQ01A0OTP\t15\t209\t10\t5\t3\t7\t6\t4\t1\t4\n+JY8QFUQ01A0QXW\t0\t210\t0\t0\t0\t0\t0\t0\t0\t0\n+JY8QFUQ01A0RJS\t17\t212\t11\t6\t7\t4\t3\t4\t3\t7\n+JY8QFUQ01A0S1H\t16\t213\t10\t6\t4\t6\t5\t3\t5\t3\n+JY8QFUQ01A0TAV\t12\t213\t4\t8\t2\t2\t2\t5\t1\t4\n+JY8QFUQ01A0TNI\t24\t213\t12\t12\t7\t5\t8\t11\t0\t5\n+JY8QFUQ01A0UZS\t20\t213\t12\t8\t8\t4\t2\t11\t2\t5\n+JY8QFUQ01A0VIE\t23\t213\t16\t7\t8\t8\t5\t12\t1\t5\n+JY8QFUQ01A0WDV\t18\t210\t10\t8\t6\t4\t5\t7\t0\t6\n+JY8QFUQ01A0WZB\t34\t210\t20\t14\t11\t9\t11\t9\t3\t11\n+JY8QFUQ01A0X8W\t10\t213\t5\t5\t2\t3\t5\t1\t1\t3\n+JY8QFUQ01A0XE3\t27\t211\t17\t10\t8\t9\t6\t12\t2\t7\n+JY8QFUQ01A0Z64\t9\t213\t7\t2\t5\t2\t1\t3\t0\t5\n+JY8QFUQ01A0ZW5\t27\t216\t15\t12\t7\t8\t3\t13\t2\t9\n+JY8QFUQ01A0ZX6\t12\t213\t8\t4\t5\t3\t4\t3\t1\t4\n+JY8QFUQ01A110D\t14\t213\t8\t6\t6\t2\t3\t8\t1\t2\n+JY8QFUQ01A12BY\t14\t213\t6\t8\t4\t2\t1\t5\t1\t7\n+JY8QFUQ01A12KV\t29\t213\t17\t12\t11\t6\t8\t11\t1\t9\n+JY8QFUQ01A12V0\t28\t213\t18\t10\t13\t5\t6\t13\t2\t7\n+JY8QFUQ01A14EE\t30\t213\t20\t10\t8\t12\t8\t10\t1\t11\n+JY8QFUQ01A152R\t19\t213\t12\t7\t8\t4\t4\t7\t1\t7\n+JY8QFUQ01A15L6\t31\t215\t17\t14\t6\t11\t3\t17\t3\t8\n+JY8QFUQ01A15SR\t48\t216\t21\t27\t13\t8\t6\t17\t1\t24\n+JY8QFUQ01A16XV\t0\t213\t0\t0\t0\t0\t0\t0\t0\t0\n+JY8QFUQ01A17D9\t17\t213\t10\t7\t6\t4\t2\t7\t2\t6\n+JY8QFUQ01A17TV\t14\t210\t9\t5\t3\t6\t2\t7\t1\t4\n+JY8QFUQ01A18L5\t12\t213\t7\t5\t4\t3\t3\t3\t2\t4\n+JY8QFUQ01A1963\t12\t212\t8\t4\t5\t3\t2\t2\t1\t7\n+JY8QFUQ01A1ALH\t22\t216\t15\t7\t6\t9\t3\t10\t1\t8\n+JY8QFUQ01A1AYP\t14\t213\t6\t8\t4\t2\t5\t1\t2\t6\n+JY8QFUQ01A1BK7\t14\t213\t10\t4\t6\t4\t2\t5\t2\t5\n+JY8QFUQ01A1BT3\t24\t210\t16\t8\t11\t5\t7\t12\t2\t3\n+JY8QFUQ01A1CLZ\t18\t216\t11\t7\t8\t3\t6\t4\t1\t7\n+JY8QFUQ01A1CTT\t15\t213\t9\t6\t7\t2\t2\t4\t0\t9\n+JY8QFUQ01A1DJR\t14\t213\t11\t3\t8\t3\t2\t
5\t2\t5\n+JY8QFUQ01A1DVA\t35\t213\t26\t9\t11\t15\t7\t11\t0\t17\n+JY8QFUQ01A1E6T\t14\t213\t7\t7\t6\t1\t3\t5\t0\t6\n+JY8QFUQ01A1GYW\t41\t210\t25\t16\t13\t12\t7\t25\t2\t7\n+JY8QFUQ01A1GZY\t26\t212\t16\t10\t7\t9\t5\t10\t3\t8\n+JY8QFUQ01A1ISV\t11\t213\t8\t3\t5\t3\t2\t5\t0\t4\n+JY8QFUQ01A1IV8\t24\t209\t13\t11\t7\t6\t4\t12\t0\t8\n+JY8QFUQ01A1IYG\t13\t210\t6\t7\t5\t1\t2\t5\t2\t4\n+JY8QFUQ01A1K37\t9\t213\t6\t3\t5\t1\t1\t2\t1\t5\n+JY8QFUQ01A1KQO\t20\t213\t12\t8\t6\t6\t2\t9\t3\t6\n+JY8QFUQ01A1L2W\t12\t213\t5\t7\t3\t2\t3\t7\t0\t2\n+JY8QFUQ01A1LNA\t14\t213\t7\t7\t5\t2\t5\t4\t0\t5\n+JY8QFUQ01A1MBV\t21\t213\t11\t10\t4\t7\t1\t10\t2\t8\n+JY8QFUQ01A1MJG\t17\t210\t11\t6\t8\t3\t5\t8\t0\t4\n+JY8QFUQ01A1MJU\t16\t213\t9\t7\t5\t4\t5\t8\t0\t3\n+JY8QFUQ01A1OLP\t38\t213\t22\t16\t14\t8\t9\t14\t4\t11\n+JY8QFUQ01A1PLD\t20\t213\t12\t8\t6\t6\t2\t9\t3\t6\n+JY8QFUQ01A1Q3N\t32\t213\t20\t12\t16\t4\t7\t17\t3\t5\n+JY8QFUQ01A1QLN\t11\t209\t7\t4\t4\t3\t2\t4\t2\t3\n+JY8QFUQ01A1R7K\t26\t212\t12\t14\t9\t3\t6\t14\t1\t5\n+JY8QFUQ01A1RAE\t16\t213\t6\t10\t3\t3\t1'..b'\t6\t9\t5\t1\t4\t5\t0\t6\n+JY8QFUQ01BCZ6T\t18\t210\t9\t9\t7\t2\t3\t6\t0\t9\n+JY8QFUQ01BDHSG\t12\t213\t7\t5\t5\t2\t4\t4\t0\t4\n+JY8QFUQ01BF6IL\t10\t213\t7\t3\t3\t4\t2\t3\t0\t5\n+JY8QFUQ01BG2KZ\t17\t213\t10\t7\t3\t7\t0\t9\t2\t6\n+JY8QFUQ01BG9LH\t19\t213\t15\t4\t8\t7\t4\t7\t1\t7\n+JY8QFUQ01BGU0C\t20\t210\t11\t9\t8\t3\t3\t8\t0\t9\n+JY8QFUQ01BHYPA\t9\t213\t6\t3\t3\t3\t1\t4\t1\t3\n+JY8QFUQ01BJC1Y\t8\t213\t5\t3\t2\t3\t1\t2\t0\t5\n+JY8QFUQ01BKLR9\t20\t213\t8\t12\t6\t2\t5\t4\t5\t6\n+JY8QFUQ01BLF36\t38\t213\t21\t17\t10\t11\t6\t21\t3\t8\n+JY8QFUQ01BLJYE\t35\t213\t18\t17\t9\t9\t6\t19\t0\t10\n+JY8QFUQ01BLLRQ\t12\t213\t9\t3\t3\t6\t2\t3\t2\t5\n+JY8QFUQ01BM2SX\t26\t213\t19\t7\t10\t9\t6\t13\t1\t6\n+JY8QFUQ01BM631\t10\t213\t7\t3\t2\t5\t0\t6\t1\t3\n+JY8QFUQ01BMPYC\t19\t213\t12\t7\t9\t3\t4\t10\t0\t5\n+JY8QFUQ01BMULR\t13\t213\t10\t3\t7\t3\t3\t3\t1\t6\n+JY8QFUQ01BNJBB\t27\t210\t15\t12\t8\t7\t7\t12\t4\t4\n+JY8QFUQ01BNJGF\t28\t213\t18\t10\t13\t5\t6\t13\t2\t7\n+JY8QFUQ01BP3
M1\t14\t213\t10\t4\t6\t4\t2\t5\t2\t5\n+JY8QFUQ01BPT8C\t16\t216\t8\t8\t3\t5\t3\t6\t0\t7\n+JY8QFUQ01BPXZS\t31\t216\t19\t12\t12\t7\t7\t14\t5\t5\n+JY8QFUQ01BR9V1\t8\t213\t4\t4\t2\t2\t1\t4\t1\t2\n+JY8QFUQ01BRGSI\t9\t213\t7\t2\t2\t5\t2\t3\t0\t4\n+JY8QFUQ01BRNFF\t17\t213\t12\t5\t9\t3\t6\t7\t2\t2\n+JY8QFUQ01BSGO4\t11\t213\t5\t6\t4\t1\t1\t4\t0\t6\n+JY8QFUQ01BT0O2\t19\t213\t12\t7\t8\t4\t4\t7\t1\t7\n+JY8QFUQ01BT4AX\t11\t213\t8\t3\t5\t3\t2\t5\t0\t4\n+JY8QFUQ01BT86M\t10\t213\t7\t3\t4\t3\t2\t3\t1\t4\n+JY8QFUQ01BTQAH\t26\t213\t12\t14\t7\t5\t3\t9\t1\t13\n+JY8QFUQ01BURMR\t36\t213\t21\t15\t7\t14\t5\t18\t3\t10\n+JY8QFUQ01BV9YG\t22\t213\t13\t9\t5\t8\t5\t6\t1\t10\n+JY8QFUQ01BW9QL\t9\t213\t6\t3\t5\t1\t1\t2\t1\t5\n+JY8QFUQ01BWI2D\t29\t213\t17\t12\t11\t6\t6\t13\t2\t8\n+JY8QFUQ01BXYLF\t15\t213\t9\t6\t7\t2\t3\t9\t0\t3\n+JY8QFUQ01BY231\t19\t216\t11\t8\t7\t4\t3\t11\t0\t5\n+JY8QFUQ01BYGN8\t20\t213\t13\t7\t8\t5\t5\t8\t2\t5\n+JY8QFUQ01C2NGE\t15\t210\t10\t5\t3\t7\t6\t4\t1\t4\n+JY8QFUQ01C2ROO\t7\t213\t3\t4\t2\t1\t2\t2\t1\t2\n+JY8QFUQ01C3QHH\t13\t213\t8\t5\t4\t4\t2\t6\t1\t4\n+JY8QFUQ01C4MHW\t11\t213\t5\t6\t4\t1\t3\t5\t0\t3\n+JY8QFUQ01C5Q2O\t16\t213\t5\t11\t2\t3\t3\t4\t0\t9\n+JY8QFUQ01C8QWZ\t11\t213\t4\t7\t3\t1\t0\t4\t1\t6\n+JY8QFUQ01C92F8\t13\t222\t9\t4\t5\t4\t2\t6\t0\t5\n+JY8QFUQ01C966Y\t10\t213\t4\t6\t1\t3\t2\t3\t0\t5\n+JY8QFUQ01C98A9\t11\t213\t8\t3\t6\t2\t3\t3\t1\t4\n+JY8QFUQ01CANL1\t20\t213\t13\t7\t7\t6\t4\t5\t0\t11\n+JY8QFUQ01CD8ZK\t30\t213\t20\t10\t8\t12\t8\t10\t1\t11\n+JY8QFUQ01CD9VK\t15\t213\t9\t6\t7\t2\t2\t4\t0\t9\n+JY8QFUQ01CDZ5R\t23\t212\t16\t7\t8\t8\t5\t12\t2\t4\n+JY8QFUQ01CE8P9\t18\t213\t13\t5\t8\t5\t6\t10\t1\t1\n+JY8QFUQ01CF06T\t12\t213\t8\t4\t5\t3\t4\t3\t1\t4\n+JY8QFUQ01CG8U2\t18\t210\t11\t7\t8\t3\t5\t5\t1\t7\n+JY8QFUQ01CGLTX\t15\t219\t9\t6\t7\t2\t1\t9\t1\t4\n+JY8QFUQ01CGQFF\t21\t213\t13\t8\t8\t5\t4\t7\t2\t8\n+JY8QFUQ01CHDDF\t23\t219\t11\t12\t8\t3\t4\t9\t1\t9\n+JY8QFUQ01CHKLB\t8\t213\t5\t3\t2\t3\t1\t4\t1\t2\n+JY8QFUQ01CHW93\t13\t213\t7\t6\t5\t2\t3\t4\t2\t4\n+JY8QFUQ01CII5W\t30\t210\t1
5\t15\t3\t12\t6\t11\t1\t12\n+JY8QFUQ01CINZT\t5\t213\t2\t3\t0\t2\t0\t1\t1\t3\n+JY8QFUQ01CJLXK\t22\t213\t12\t10\t9\t3\t6\t9\t0\t7\n+JY8QFUQ01CJT9B\t31\t213\t18\t13\t10\t8\t5\t16\t3\t7\n+JY8QFUQ01CK1VY\t28\t216\t19\t9\t12\t7\t7\t14\t2\t5\n+JY8QFUQ01CKN3U\t25\t213\t17\t8\t8\t9\t4\t9\t4\t8\n+JY8QFUQ01CLP4K\t14\t213\t7\t7\t5\t2\t5\t4\t0\t5\n+JY8QFUQ01CNCW4\t10\t213\t5\t5\t3\t2\t2\t2\t0\t6\n+JY8QFUQ01CO019\t16\t213\t10\t6\t5\t5\t5\t5\t0\t6\n+JY8QFUQ01CPVUP\t25\t213\t11\t14\t8\t3\t6\t10\t2\t7\n+JY8QFUQ01CPYJ0\t14\t213\t7\t7\t6\t1\t3\t5\t0\t6\n+JY8QFUQ01CQH14\t14\t213\t11\t3\t8\t3\t2\t5\t2\t5\n+JY8QFUQ01CST8T\t17\t213\t10\t7\t6\t4\t2\t4\t1\t10\n+JY8QFUQ01CU1XI\t17\t209\t8\t9\t5\t3\t6\t6\t0\t5\n+JY8QFUQ01CU5CB\t20\t213\t13\t7\t9\t4\t5\t7\t2\t6\n+JY8QFUQ01CUGFD\t30\t213\t13\t17\t8\t5\t6\t10\t2\t12\n+JY8QFUQ01CURPS\t32\t213\t20\t12\t16\t4\t7\t17\t3\t5\n+JY8QFUQ01CY2WW\t12\t213\t6\t6\t2\t4\t3\t5\t0\t4\n+JY8QFUQ01CY6MC\t16\t213\t9\t7\t6\t3\t2\t7\t0\t7\n+JY8QFUQ01CYWC2\t12\t213\t7\t5\t4\t3\t1\t3\t2\t6\n+JY8QFUQ01DA9FW\t13\t213\t9\t4\t6\t3\t3\t6\t2\t2\n+JY8QFUQ01DA9S4\t0\t216\t0\t0\t0\t0\t0\t0\t0\t0\n+JY8QFUQ01DC4QI\t21\t213\t12\t9\t7\t5\t5\t9\t2\t5\n+JY8QFUQ01DC8OC\t26\t210\t11\t15\t4\t7\t4\t13\t0\t9\n+JY8QFUQ01DCPGQ\t19\t213\t14\t5\t6\t8\t7\t6\t1\t5\n+JY8QFUQ01DEKWC\t17\t213\t6\t11\t5\t1\t2\t8\t1\t6\n+JY8QFUQ01DG2P7\t11\t211\t4\t7\t3\t1\t0\t4\t1\t6\n+JY8QFUQ01DG5KX\t21\t210\t11\t10\t8\t3\t6\t5\t4\t6\n+JY8QFUQ01DG6GC\t15\t213\t6\t9\t2\t4\t1\t6\t2\t6\n+JY8QFUQ01DHXHT\t20\t216\t10\t10\t4\t6\t2\t10\t2\t6\n+JY8QFUQ01DI39D\t8\t213\t4\t4\t3\t1\t2\t2\t0\t4\n+JY8QFUQ01DIBNC\t0\t216\t0\t0\t0\t0\t0\t0\t0\t0\n+JY8QFUQ01DJFIZ\t43\t213\t23\t20\t12\t11\t8\t15\t5\t15\n+JY8QFUQ01DLDLD\t7\t213\t4\t3\t2\t2\t1\t3\t0\t3\n+JY8QFUQ01DMF0A\t53\t216\t27\t26\t12\t15\t10\t22\t2\t19\n+JY8QFUQ01DOVL5\t15\t213\t10\t5\t6\t4\t1\t4\t2\t8\n+JY8QFUQ01DPT8R\t8\t212\t4\t4\t1\t3\t4\t2\t0\t2\n+JY8QFUQ01DUD3U\t29\t213\t18\t11\t12\t6\t7\t9\t2\t11\n+JY8QFUQ01DV4HU\t22\t213\t14\t8\t8\t6\t5\t10\t2\t5\n+JY8QFUQ01DV8LF\t6\t2
13\t1\t5\t0\t1\t2\t1\t0\t3\n+JY8QFUQ01DVBU0\t15\t212\t6\t9\t2\t4\t3\t4\t1\t7\n+JY8QFUQ01DXDOM\t36\t210\t22\t14\t12\t10\t7\t12\t0\t17\n'

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/relative_mutations.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/relative_mutations.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,37 @@
+Class Type value
+IGA A/T 42.7
+IGA1 A/T 42.3
+IGA2 A/T 43.6
+IGG A/T 43.9
+IGG1 A/T 43.7
+IGG2 A/T 43.6
+IGG3 A/T 47.9
+IGG4 A/T 46.2
+IGM A/T 0
+IGE A/T 0
+un A/T 43.5
+all A/T 43.1
+IGA G/C transitions 33
+IGA1 G/C transitions 33.6
+IGA2 G/C transitions 31.7
+IGG G/C transitions 31.4
+IGG1 G/C transitions 31.6
+IGG2 G/C transitions 31.9
+IGG3 G/C transitions 28.9
+IGG4 G/C transitions 27.2
+IGM G/C transitions 0
+IGE G/C transitions 0
+un G/C transitions 31.9
+all G/C transitions 32.5
+IGA G/C transversions 24.3
+IGA1 G/C transversions 24.1
+IGA2 G/C transversions 24.7
+IGG G/C transversions 24.7
+IGG1 G/C transversions 24.7
+IGG2 G/C transversions 24.6
+IGG3 G/C transversions 23.2
+IGG4 G/C transversions 26.6
+IGM G/C transversions 0
+IGE G/C transversions 0
+un G/C transversions 24.6
+all G/C transversions 24.4

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/scatter.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/scatter.txt Wed Sep 15 12:24:06 2021 +0000

b'@@ -0,0 +1,1388 @@\n+Sequence.ID\tbest_match\tVRegionMutations\tVRegionNucleotides\tpercentage_mutations\n+JY8QFUQ01A0005\tIGG1\t18\t216\t8.33\n+JY8QFUQ01A004N\tIGG1\t20\t216\t9.26\n+JY8QFUQ01A006G\tIGG1\t32\t216\t14.81\n+JY8QFUQ01A018V\tIGA1\t31\t216\t14.35\n+JY8QFUQ01A019O\tIGG1\t19\t213\t8.92\n+JY8QFUQ01A01KX\tIGG1\t29\t213\t13.62\n+JY8QFUQ01A0207\tIGG2\t16\t213\t7.51\n+JY8QFUQ01A02HL\tIGA1\t13\t213\t6.1\n+JY8QFUQ01A02KS\tIGA2\t14\t213\t6.57\n+JY8QFUQ01A02XZ\tIGG1\t10\t213\t4.69\n+JY8QFUQ01A03E3\tIGA2\t21\t213\t9.86\n+JY8QFUQ01A03N6\tIGG1\t39\t213\t18.31\n+JY8QFUQ01A08XO\tIGG1\t0\t216\t0\n+JY8QFUQ01A0939\tIGA2\t15\t222\t6.76\n+JY8QFUQ01A09OY\tIGA2\t12\t213\t5.63\n+JY8QFUQ01A0C2Y\tIGG1\t18\t213\t8.45\n+JY8QFUQ01A0C33\tIGG1\t13\t213\t6.1\n+JY8QFUQ01A0C4X\tIGG1\t13\t213\t6.1\n+JY8QFUQ01A0D2K\tIGG4\t10\t213\t4.69\n+JY8QFUQ01A0D5E\tIGA1\t28\t212\t13.21\n+JY8QFUQ01A0DA8\tIGA1\t23\t213\t10.8\n+JY8QFUQ01A0DCS\tIGG1\t22\t213\t10.33\n+JY8QFUQ01A0EF3\tIGA1\t12\t213\t5.63\n+JY8QFUQ01A0ESJ\tIGA1\t17\t213\t7.98\n+JY8QFUQ01A0FII\tIGA1\t26\t213\t12.21\n+JY8QFUQ01A0FO5\tIGA2\t11\t213\t5.16\n+JY8QFUQ01A0GVR\tIGG1\t24\t216\t11.11\n+JY8QFUQ01A0GVY\tIGA1\t30\t213\t14.08\n+JY8QFUQ01A0HBK\tIGA1\t0\t213\t0\n+JY8QFUQ01A0IZI\tIGG1\t18\t213\t8.45\n+JY8QFUQ01A0LAJ\tIGA1\t29\t219\t13.24\n+JY8QFUQ01A0LBC\tIGA1\t31\t213\t14.55\n+JY8QFUQ01A0LEW\tIGA2\t9\t213\t4.23\n+JY8QFUQ01A0LZ5\tIGA1\t20\t213\t9.39\n+JY8QFUQ01A0N2E\tIGM\t0\t213\t0\n+JY8QFUQ01A0N8H\tIGA1\t13\t222\t5.86\n+JY8QFUQ01A0OC8\tIGA2\t12\t212\t5.66\n+JY8QFUQ01A0OMH\tIGA1\t26\t213\t12.21\n+JY8QFUQ01A0OTP\tIGG1\t15\t209\t7.18\n+JY8QFUQ01A0QXW\tIGG1\t0\t210\t0\n+JY8QFUQ01A0RJS\tIGG1\t17\t212\t8.02\n+JY8QFUQ01A0S1H\tIGA1\t16\t213\t7.51\n+JY8QFUQ01A0TAV\tIGG2\t12\t213\t5.63\n+JY8QFUQ01A0TNI\tIGG1\t24\t213\t11.27\n+JY8QFUQ01A0UZS\tIGA1\t20\t213\t9.39\n+JY8QFUQ01A0VIE\tIGA1\t23\t213\t10.8\n+JY8QFUQ01A0WDV\tIGG2\t18\t210\t8.57\n+JY8QFUQ01A0WZB\tIGA1\t34\t210\t16.19\n+JY8QFUQ01A0X8W\tIGG1\t10\t213\t4.69\n+JY8QFUQ01A0XE3\tIGG1\t27\t211\t12.8\
n+JY8QFUQ01A0Z64\tIGA2\t9\t213\t4.23\n+JY8QFUQ01A0ZW5\tIGG4\t27\t216\t12.5\n+JY8QFUQ01A0ZX6\tIGG1\t12\t213\t5.63\n+JY8QFUQ01A110D\tIGA1\t14\t213\t6.57\n+JY8QFUQ01A12BY\tIGG1\t14\t213\t6.57\n+JY8QFUQ01A12KV\tIGG1\t29\t213\t13.62\n+JY8QFUQ01A12V0\tIGA1\t28\t213\t13.15\n+JY8QFUQ01A14EE\tIGA1\t30\t213\t14.08\n+JY8QFUQ01A152R\tIGA1\t19\t213\t8.92\n+JY8QFUQ01A15L6\tIGA1\t31\t215\t14.42\n+JY8QFUQ01A15SR\tIGA1\t48\t216\t22.22\n+JY8QFUQ01A16XV\tIGG1\t0\t213\t0\n+JY8QFUQ01A17D9\tIGA1\t17\t213\t7.98\n+JY8QFUQ01A17TV\tIGG2\t14\t210\t6.67\n+JY8QFUQ01A18L5\tIGA1\t12\t213\t5.63\n+JY8QFUQ01A1963\tIGA2\t12\t212\t5.66\n+JY8QFUQ01A1ALH\tIGA1\t22\t216\t10.19\n+JY8QFUQ01A1AYP\tIGG2\t14\t213\t6.57\n+JY8QFUQ01A1BK7\tIGA1\t14\t213\t6.57\n+JY8QFUQ01A1BT3\tIGA2\t24\t210\t11.43\n+JY8QFUQ01A1CLZ\tIGG1\t18\t216\t8.33\n+JY8QFUQ01A1CTT\tIGG1\t15\t213\t7.04\n+JY8QFUQ01A1DJR\tIGG1\t14\t213\t6.57\n+JY8QFUQ01A1DVA\tIGG2\t35\t213\t16.43\n+JY8QFUQ01A1E6T\tIGA2\t14\t213\t6.57\n+JY8QFUQ01A1GYW\tIGG1\t41\t210\t19.52\n+JY8QFUQ01A1GZY\tIGG2\t26\t212\t12.26\n+JY8QFUQ01A1ISV\tIGG1\t11\t213\t5.16\n+JY8QFUQ01A1IV8\tIGG4\t24\t209\t11.48\n+JY8QFUQ01A1IYG\tIGG1\t13\t210\t6.19\n+JY8QFUQ01A1K37\tIGG1\t9\t213\t4.23\n+JY8QFUQ01A1KQO\tIGA2\t20\t213\t9.39\n+JY8QFUQ01A1L2W\tIGA1\t12\t213\t5.63\n+JY8QFUQ01A1LNA\tIGA2\t14\t213\t6.57\n+JY8QFUQ01A1MBV\tIGA1\t21\t213\t9.86\n+JY8QFUQ01A1MJG\tIGA1\t17\t210\t8.1\n+JY8QFUQ01A1MJU\tIGG4\t16\t213\t7.51\n+JY8QFUQ01A1OLP\tIGA1\t38\t213\t17.84\n+JY8QFUQ01A1PLD\tIGA1\t20\t213\t9.39\n+JY8QFUQ01A1Q3N\tIGA1\t32\t213\t15.02\n+JY8QFUQ01A1QLN\tIGG1\t11\t209\t5.26\n+JY8QFUQ01A1R7K\tIGA1\t26\t212\t12.26\n+JY8QFUQ01A1RAE\tIGA1\t16\t213\t7.51\n+JY8QFUQ01A1SIW\tIGG1\t14\t213\t6.57\n+JY8QFUQ01A1U7S\tIGG1\t43\t213\t20.19\n+JY8QFUQ01A1U87\tIGA2\t13\t213\t6.1\n+JY8QFUQ01A1UND\tIGG1\t19\t210\t9.05\n+JY8QFUQ01A1UXL\tIGA2\t20\t213\t9.39\n+JY8QFUQ01A1UXZ\tIGG1\t34\t210\t16.19\n+JY8QFUQ01A1W6G\tIGG1\t15\t216\t6.94\n+JY8QFUQ01A1WCP\tIGA2\t22\t213\t10.33\n+JY8QFUQ01A1X35\tIGA1\t21\t216\t9.72\n+JY8QFUQ01A1X5
2\tIGA1\t10\t213\t4.69\n+JY8QFUQ01A1Y2H\tIGA1\t16\t213\t7.51\n+JY8QFUQ01A1YN6\tIGA1\t6\t213\t2.82\n+JY8QFUQ01A1Z5H\tIGA2\t13\t213\t6.1\n+JY8QFUQ01A23OB\tIGG2\t16\t213\t7.51\n+JY8QFUQ01A23UZ\tIGG2\t28\t210\t13.33\n+JY8QFUQ01A26C2\tIGG1\t13\t213\t6.1\n+JY8QFUQ01A26DA\tIGA1\t38\t219\t17.35\n+JY8QFUQ01A27H2\tIGA2\t17\t213\t7.98\n+JY8QFUQ01A27QT\tIGA1\t29\t210\t13.81\n+JY8QFUQ01A287O\tIGG1\t11\t210\t5.24\n+JY8QFUQ01A29EP\tIGG2\t0\t213\t0\n+JY8QFUQ01A2AEH\tIGA1\t20\t213\t9.39\n+JY8QFUQ01A2ANY\tIGA1\t29\t213\t13.62\n+JY8QFUQ01A2AVP\tIGA2\t9\t213\t4.23\n+JY8QFUQ01A2B2A\tIGG1\t3\t210\t1.43\n+JY8QFUQ'..b'01CJ990\tIGA1\t8\t212\t3.77\n+JY8QFUQ01CJG1W\tIGA1\t31\t213\t14.55\n+JY8QFUQ01CJV6U\tIGA2\t29\t213\t13.62\n+JY8QFUQ01CJYGN\tIGG2\t15\t213\t7.04\n+JY8QFUQ01CK280\tIGA1\t19\t216\t8.8\n+JY8QFUQ01CKFL6\tIGA1\t21\t213\t9.86\n+JY8QFUQ01CKI1Y\tIGA2\t4\t213\t1.88\n+JY8QFUQ01CKO1P\tIGA1\t34\t218\t15.6\n+JY8QFUQ01CKPJN\tIGA1\t10\t213\t4.69\n+JY8QFUQ01CKX6X\tIGA2\t17\t209\t8.13\n+JY8QFUQ01CL7VE\tIGA1\t23\t209\t11\n+JY8QFUQ01CLEAH\tIGA2\t22\t213\t10.33\n+JY8QFUQ01CLIT1\tIGA1\t27\t210\t12.86\n+JY8QFUQ01CLMWX\tIGG4\t21\t213\t9.86\n+JY8QFUQ01CLR99\tIGA1\t13\t210\t6.19\n+JY8QFUQ01CLXOH\tIGA2\t13\t213\t6.1\n+JY8QFUQ01CM33P\tIGA2\t14\t216\t6.48\n+JY8QFUQ01CM5UQ\tIGA1\t10\t213\t4.69\n+JY8QFUQ01CN7M5\tIGG1\t16\t213\t7.51\n+JY8QFUQ01CNEDF\tIGG2\t12\t213\t5.63\n+JY8QFUQ01CNH24\tIGA2\t15\t213\t7.04\n+JY8QFUQ01CNLWR\tIGG2\t31\t213\t14.55\n+JY8QFUQ01CNYY7\tIGG1\t28\t212\t13.21\n+JY8QFUQ01CO06K\tIGA2\t21\t216\t9.72\n+JY8QFUQ01CO7ZE\tIGA2\t21\t210\t10\n+JY8QFUQ01COIZ4\tIGA2\t8\t213\t3.76\n+JY8QFUQ01COT7A\tIGG1\t20\t213\t9.39\n+JY8QFUQ01COULV\tIGG1\t22\t216\t10.19\n+JY8QFUQ01CP6A0\tIGA2\t12\t219\t5.48\n+JY8QFUQ01CPEX7\tIGG2\t29\t213\t13.62\n+JY8QFUQ01CPKFW\tIGA2\t24\t219\t10.96\n+JY8QFUQ01CQ2DI\tIGA2\t19\t213\t8.92\n+JY8QFUQ01CQFLG\tIGA2\t20\t216\t9.26\n+JY8QFUQ01CQHUH\tIGA1\t21\t213\t9.86\n+JY8QFUQ01CQKI9\tIGA1\t15\t213\t7.04\n+JY8QFUQ01CQOIV\tIGA1\t38\t213\t17.84\n+JY8QFUQ01CQRVK\tIGA2\t11\t212\t5.19\
n+JY8QFUQ01CQSBL\tIGG1\t19\t210\t9.05\n+JY8QFUQ01CQWKF\tIGG1\t26\t212\t12.26\n+JY8QFUQ01CR76J\tIGG1\t33\t216\t15.28\n+JY8QFUQ01CR7U0\tIGA2\t27\t209\t12.92\n+JY8QFUQ01CR8IO\tIGA1\t35\t213\t16.43\n+JY8QFUQ01CRBJ3\tIGA2\t29\t216\t13.43\n+JY8QFUQ01CRNW3\tIGA1\t25\t213\t11.74\n+JY8QFUQ01CRPMT\tIGA1\t23\t213\t10.8\n+JY8QFUQ01CRXKV\tIGA1\t32\t216\t14.81\n+JY8QFUQ01CRXS1\tIGA1\t22\t210\t10.48\n+JY8QFUQ01CS8O3\tIGA2\t9\t210\t4.29\n+JY8QFUQ01CSGBR\tIGG1\t18\t212\t8.49\n+JY8QFUQ01CSVFI\tIGA2\t24\t210\t11.43\n+JY8QFUQ01CSWQD\tIGA2\t14\t213\t6.57\n+JY8QFUQ01CT0HD\tIGG2\t17\t211\t8.06\n+JY8QFUQ01CT3CN\tIGA1\t19\t212\t8.96\n+JY8QFUQ01CTI25\tIGA1\t9\t213\t4.23\n+JY8QFUQ01CTJ46\tIGA2\t29\t213\t13.62\n+JY8QFUQ01CU8BS\tIGA1\t15\t211\t7.11\n+JY8QFUQ01CU8RS\tIGG2\t13\t210\t6.19\n+JY8QFUQ01CVKGA\tIGG2\t14\t213\t6.57\n+JY8QFUQ01CVRND\tIGG1\t37\t213\t17.37\n+JY8QFUQ01CVY8N\tIGA1\t22\t210\t10.48\n+JY8QFUQ01CW65U\tIGG2\t14\t213\t6.57\n+JY8QFUQ01CWEVR\tIGA1\t12\t213\t5.63\n+JY8QFUQ01CWPJP\tIGA1\t26\t216\t12.04\n+JY8QFUQ01CWQZU\tIGA2\t13\t213\t6.1\n+JY8QFUQ01CWYA5\tIGG1\t42\t210\t20\n+JY8QFUQ01CXAGM\tIGA1\t23\t213\t10.8\n+JY8QFUQ01CXD17\tIGA1\t28\t213\t13.15\n+JY8QFUQ01CXIGS\tIGA2\t30\t213\t14.08\n+JY8QFUQ01CXM4M\tIGA2\t13\t221\t5.88\n+JY8QFUQ01CY2HZ\tIGA1\t5\t213\t2.35\n+JY8QFUQ01CY3ZT\tIGA2\t22\t211\t10.43\n+JY8QFUQ01CYT8I\tIGA2\t10\t213\t4.69\n+JY8QFUQ01CYTAI\tIGG1\t40\t210\t19.05\n+JY8QFUQ01CYU3K\tIGG3\t13\t213\t6.1\n+JY8QFUQ01CYX7E\tIGA2\t25\t216\t11.57\n+JY8QFUQ01CZ1IL\tIGA2\t17\t213\t7.98\n+JY8QFUQ01CZDE0\tIGA1\t30\t213\t14.08\n+JY8QFUQ01CZH0L\tIGA2\t16\t212\t7.55\n+JY8QFUQ01DABGE\tIGG2\t12\t213\t5.63\n+JY8QFUQ01DARQJ\tIGA2\t15\t213\t7.04\n+JY8QFUQ01DDQ8A\tIGG2\t15\t213\t7.04\n+JY8QFUQ01DECT5\tIGG2\t41\t213\t19.25\n+JY8QFUQ01DEK3I\tIGG3\t12\t213\t5.63\n+JY8QFUQ01DEX6Z\tIGG1\t22\t215\t10.23\n+JY8QFUQ01DFG5Q\tIGA1\t26\t210\t12.38\n+JY8QFUQ01DFJK6\tIGG2\t14\t213\t6.57\n+JY8QFUQ01DFMQ1\tIGA1\t9\t210\t4.29\n+JY8QFUQ01DFNKY\tIGA1\t3\t219\t1.37\n+JY8QFUQ01DFY56\tIGA1\t25\t213\t11.74\n+JY8QFUQ0
1DG3GX\tIGA2\t12\t213\t5.63\n+JY8QFUQ01DG853\tIGA1\t17\t213\t7.98\n+JY8QFUQ01DGFLY\tIGA2\t15\t213\t7.04\n+JY8QFUQ01DGFY7\tIGA1\t10\t213\t4.69\n+JY8QFUQ01DH08T\tIGA1\t18\t210\t8.57\n+JY8QFUQ01DH0MW\tIGA2\t6\t213\t2.82\n+JY8QFUQ01DHC55\tIGG1\t25\t213\t11.74\n+JY8QFUQ01DHH0D\tIGG3\t16\t219\t7.31\n+JY8QFUQ01DHJ30\tIGA2\t5\t219\t2.28\n+JY8QFUQ01DI98R\tIGA2\t25\t213\t11.74\n+JY8QFUQ01DIWKI\tIGG3\t9\t213\t4.23\n+JY8QFUQ01DJ168\tIGG1\t14\t212\t6.6\n+JY8QFUQ01DJCKC\tIGA2\t13\t213\t6.1\n+JY8QFUQ01DJQ7I\tIGA1\t13\t213\t6.1\n+JY8QFUQ01DJU2K\tIGA1\t30\t210\t14.29\n+JY8QFUQ01DKCDX\tIGG3\t18\t213\t8.45\n+JY8QFUQ01DKKJ0\tIGA1\t32\t213\t15.02\n+JY8QFUQ01DLA1D\tIGA1\t5\t212\t2.36\n+JY8QFUQ01DM6TL\tIGA2\t23\t213\t10.8\n+JY8QFUQ01DN21P\tIGA1\t12\t213\t5.63\n+JY8QFUQ01DNADY\tIGG2\t28\t213\t13.15\n+JY8QFUQ01DO347\tIGA2\t15\t213\t7.04\n+JY8QFUQ01DP55Y\tIGG1\t21\t216\t9.72\n+JY8QFUQ01DP8YH\tIGG2\t21\t213\t9.86\n+JY8QFUQ01DPFIO\tIGA1\t13\t213\t6.1\n+JY8QFUQ01DQV12\tIGG4\t16\t213\t7.51\n+JY8QFUQ01DQVYX\tIGG2\t21\t213\t9.86\n+JY8QFUQ01DRFXI\tIGG2\t24\t215\t11.16\n+JY8QFUQ01DSBEO\tIGA2\t30\t213\t14.08\n+JY8QFUQ01DU1MH\tIGA2\t4\t210\t1.9\n+JY8QFUQ01DUKA9\tIGG1\t28\t213\t13.15\n+JY8QFUQ01DVTFY\tIGA1\t24\t213\t11.27\n'

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/shm_overview.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/shm_overview.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,22 @@
+ IGA.x IGA.y IGA.z IGA1.x IGA1.y IGA1.z IGA2.x IGA2.y IGA2.z IGG.x IGG.y IGG.z IGG1.x IGG1.y IGG1.z IGG2.x IGG2.y IGG2.z IGG3.x IGG3.y IGG3.z IGG4.x IGG4.y IGG4.z IGM.x IGM.y IGM.z IGE.x IGE.y IGE.z all.x all.y all.z un.x un.y un.z
+Number of Mutations (%) 17977 195251 9.2 12223 126298 9.7 5754 68953 8.3 9036 99835 9.1 5492 58332 9.4 2777 31881 8.7 388 5570 7 379 4052 9.4 0 213 0 0 0 0 27013 295299 9.1 3486 40247 8.7
+Median of Number of Mutations (%) 19 213 8.9 20 213 9.4 16.5 213 7.8 18 213 8.5 18 213 8.5 17 213 8.1 13.5 213 6.3 18 213 8.5 0 213 0 0 0 0 18 213 8.5 16 213 7.5
+Transitions (%) 10318 17977 57.4 7067 12223 57.8 3251 5754 56.5 5045 9036 55.8 3062 5492 55.8 1573 2777 56.6 211 388 54.4 199 379 52.5 0 0 0 0 0 15363 27013 56.9 1983 3486 56.9
+Transversions (%) 7659 17977 42.6 5156 12223 42.2 2503 5754 43.5 3991 9036 44.2 2430 5492 44.2 1204 2777 43.4 177 388 45.6 180 379 47.5 0 0 0 0 0 11650 27013 43.1 1503 3486 43.1
+Transitions at G C (%) 5933 10299 57.6 4111 7056 58.3 1822 3243 56.2 2836 5066 56 1736 3093 56.1 885 1567 56.5 112 202 55.4 103 204 50.5 0 0 0 0 0 8769 15365 57.1 1111 1968 56.5
+Targeting of G C (%) 10299 17977 57.3 7056 12223 57.7 3243 5754 56.4 5066 9036 56.1 3093 5492 56.3 1567 2777 56.4 202 388 52.1 204 379 53.8 0 0 0 0 0 15365 27013 56.9 1968 3486 56.5
+Transitions at A T (%) 4385 7678 57.1 2956 5167 57.2 1429 2511 56.9 2209 3970 55.6 1326 2399 55.3 688 1210 56.9 99 186 53.2 96 175 54.9 0 0 0 0 0 6594 11648 56.6 872 1518 57.4
+Targeting of A T (%) 7678 17977 42.7 5167 12223 42.3 2511 5754 43.6 3970 9036 43.9 2399 5492 43.7 1210 2777 43.6 186 388 47.9 175 379 46.2 0 0 0 0 0 11648 27013 43.1 1518 3486 43.5
+FR R/S (ratio) 7123 3750 1.9 4909 2586 1.9 2214 1164 1.9 3622 1847 2 2246 1148 2 1045 560 1.9 164 68 2.4 167 71 2.4 0 0 0 0 0 10745 5597 1.9 1336 719 1.9
+CDR R/S (ratio) 5901 1203 4.9 3873 855 4.5 2028 348 5.8 2962 605 4.9 1717 381 4.5 998 174 5.7 126 30 4.2 121 20 6 0 0 0 0 0 8863 1808 4.9 1208 223 5.4
+nt in FR 151127 195251 77.4 97738 126298 77.4 53389 68953 77.4 77252 99835 77.4 45155 58332 77.4 24674 31881 77.4 4289 5570 77 3134 4052 77.3 165 213 77.5 0 0 0 228544 295299 77.4 31179 40247 77.5
+nt in CDR 44124 195251 22.6 28560 126298 22.6 15564 68953 22.6 22583 99835 22.6 13177 58332 22.6 7207 31881 22.6 1281 5570 23 918 4052 22.7 48 213 22.5 0 0 0 66755 295299 22.6 9068 40247 22.5
+Tandems/Expected (ratio) 2438 2262.33 1.08 1680 1601.93 1.05 758 660.41 1.15 1212 1128.79 1.07 769 710.06 1.08 357 321.62 1.11 43 45.72 0.94 43 51.4 0.84 0 0 0 0 0 0 3650 3391.13 1.08 482 411.6 1.17
+RGYW (%) 3163 17977 17.6 2186 12223 17.9 978 5754 17 1524 9036 16.9 911 5492 16.6 485 2777 17.5 63 388 16.2 65 379 17.2 0 0 0 0 0 0 4687 27013 17.4 602 3486 17.3
+WRCY (%) 2984 17977 16.6 2060 12223 16.9 924 5754 16.1 1445 9036 16 888 5492 16.2 455 2777 16.4 52 388 13.4 50 379 13.2 0 0 0 0 0 0 4429 27013 16.4 614 3486 17.6
+WA (%) 2610 17977 14.5 1715 12223 14 895 5754 15.6 1420 9036 15.7 859 5492 15.6 433 2777 15.6 67 388 17.3 61 379 16.1 0 0 0 0 0 0 4030 27013 14.9 540 3486 15.5
+TW (%) 1561 17977 8.7 1044 12223 8.5 517 5754 9 818 9036 9.1 474 5492 8.6 265 2777 9.5 34 388 8.8 46 379 12.1 0 0 0 0 0 0 2379 27013 8.8 330 3486 9.5
+A 49123 195251 25.16 31648 126298 25.06 17475 68953 25.34 24685 99835 24.73 14372 58332 24.64 7945 31881 24.92 1399 5570 25.12 969 4052 23.91 58 213 27.23 0 0 0 73866 295299 25.01 10122 40247 25.15
+C 48130 195251 24.65 31276 126298 24.76 16854 68953 24.44 25076 99835 25.12 14646 58332 25.11 8009 31881 25.12 1389 5570 24.94 1032 4052 25.47 51 213 23.94 0 0 0 73257 295299 24.81 9848 40247 24.47
+T 43599 195251 22.33 28348 126298 22.45 15251 68953 22.12 22415 99835 22.45 13167 58332 22.57 7087 31881 22.23 1243 5570 22.32 918 4052 22.66 47 213 22.07 0 0 0 66061 295299 22.37 8943 40247 22.22
+G 54399 195251 27.86 35026 126298 27.73 19373 68953 28.1 27659 99835 27.7 16147 58332 27.68 8840 31881 27.73 1539 5570 27.63 1133 4052 27.96 57 213 26.76 0 0 0 82115 295299 27.81 11334 40247 28.16

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/transitions_IGA1_sum.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/transitions_IGA1_sum.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,5 @@
+,A,C,G,T
+A,NA,723,1905,754
+C,397,NA,749,2044
+G,2067,1272,NA,527
+T,369,1051,365,NA

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/transitions_IGA2_sum.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/transitions_IGA2_sum.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,5 @@
+,A,C,G,T
+A,NA,330,1008,370
+C,180,NA,347,833
+G,989,626,NA,268
+T,191,421,191,NA

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/transitions_IGA_sum.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/transitions_IGA_sum.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,5 @@
+,A,C,G,T
+A,NA,1053,2913,1124
+C,577,NA,1096,2877
+G,3056,1898,NA,795
+T,560,1472,556,NA

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/transitions_IGE_sum.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/transitions_IGE_sum.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,5 @@
+,A,C,G,T
+A,NA,0,0,0
+C,0,NA,0,0
+G,0,0,NA,0
+T,0,0,0,NA

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/transitions_IGG1_sum.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/transitions_IGG1_sum.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,5 @@
+,A,C,G,T
+A,NA,370,858,350
+C,165,NA,347,840
+G,896,585,NA,260
+T,176,468,177,NA

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/transitions_IGG2_sum.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/transitions_IGG2_sum.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,5 @@
+,A,C,G,T
+A,NA,207,431,171
+C,78,NA,164,411
+G,474,309,NA,131
+T,74,257,70,NA

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/transitions_IGG3_sum.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/transitions_IGG3_sum.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,5 @@
+,A,C,G,T
+A,NA,41,69,25
+C,11,NA,13,52
+G,60,50,NA,16
+T,10,30,11,NA

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/transitions_IGG4_sum.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/transitions_IGG4_sum.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,5 @@
+,A,C,G,T
+A,NA,27,54,26
+C,14,NA,19,55
+G,48,50,NA,18
+T,12,42,14,NA

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/transitions_IGG_sum.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/transitions_IGG_sum.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,5 @@
+,A,C,G,T
+A,NA,645,1412,572
+C,268,NA,543,1358
+G,1478,994,NA,425
+T,272,797,272,NA

diff -r a103134ee6e0 -r 729738462297 tests/validation_data/transitions_all_sum.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/validation_data/transitions_all_sum.txt Wed Sep 15 12:24:06 2021 +0000

@@ -0,0 +1,5 @@
+,A,C,G,T
+A,NA,1698,4325,1696
+C,845,NA,1639,4235
+G,4534,2892,NA,1220
+T,832,2269,828,NA

diff -r a103134ee6e0 -r 729738462297 wrapper.sh
--- a/wrapper.sh Thu Feb 25 10:32:32 2021 +0000
+++ b/wrapper.sh Wed Sep 15 12:24:06 2021 +0000

@@ -1,5 +1,5 @@
-#!/bin/bash
-#set -e
+#!/usr/bin/env bash
+set -e -o pipefail
dir="$(cd "$(dirname "$0")" && pwd)"
input=$1
method=$2
@@ -22,7 +22,12 @@
empty_region_filter=${18}
fast=${19}

-mkdir $outdir
+#exec 5> debug_output.txt
+#BASH_XTRACEFD="5"
+#PS4='$(date +%s.%N) $LINENO: '
+#set -x
+
+mkdir -p $outdir

tar -xzf $dir/style.tar.gz -C $outdir

@@ -447,7 +452,7 @@
echo "---------------- baseline ---------------- " >> $log
tmp="$PWD"

- mkdir $outdir/baseline
+ mkdir -p $outdir/baseline

echo "<center><h1>BASELINe</h1>" >> $output
header_substring="Based on CDR1, FR2, CDR2, FR3 (27:27:38:55:65:104:-)"
@@ -557,7 +562,7 @@

echo "---------------- change-o MakeDB ----------------"

- mkdir $outdir/change_o
+ mkdir -p $outdir/change_o

tmp="$PWD"