Previous changeset: 4:5ffd52fc35c4 (2016-12-12)
Next changeset: 6:d001d0c05dbe (2016-12-16)
Commit message:
Uploaded
added:
complete.sh
complete_immunerepertoire.xml
experimental_design/experimental_design.py
experimental_design/experimental_design.r
experimental_design/experimental_design.sh
igblast/igblast.r
igblast/igblast.sh
igblastparser/igparse.pl
imgt_loader/imgt_loader.py
imgt_loader/imgt_loader.r
imgt_loader/imgt_loader.sh
report_clonality/RScript.r
report_clonality/circos/LTe50046.ttf
report_clonality/circos/LTe50048.ttf
report_clonality/circos/LTe50050.ttf
report_clonality/circos/LTe50054.ttf
report_clonality/circos/circos.conf
report_clonality/circos/circos.tar.gz
report_clonality/circos/etc_colors.conf
report_clonality/circos/fonts.conf
report_clonality/circos/housekeeping.conf
report_clonality/circos/ideogram.conf
report_clonality/circos/parse-table.conf
report_clonality/circos/pragmata.ttf
report_clonality/circos/ticks.conf
report_clonality/circos/wingding.ttf
report_clonality/genes.txt
report_clonality/jquery-1.11.0.min.js
report_clonality/pure-min.css
report_clonality/r_wrapper.sh
report_clonality/script.js
report_clonality/style.css
report_clonality/tabber.js
tool_dependencies.xml

removed:
aa_histogram.r
baseline/Baseline_Functions.r
baseline/Baseline_Main.r
baseline/FiveS_Mutability.RData
baseline/FiveS_Substitution.RData
baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
baseline/comparePDFs.r
baseline/filter.r
baseline/script_imgt.py
baseline/script_xlsx.py
baseline/wrapper.sh
change_o/DefineClones.py
change_o/MakeDb.py
change_o/define_clones.r
change_o/define_clones.sh
change_o/makedb.sh
datatypes_conf.xml
gene_identification.py
imgt_loader.r
merge.r
merge_and_filter.r
naive_output.r
new_imgt.r
pattern_plots.r
sequence_overview.r
shm_csr.py
shm_csr.r
shm_csr.xml
style.tar.gz
subclass_definition.db.nhr
subclass_definition.db.nin
subclass_definition.db.nsq
summary_to_fasta.py
wrapper.sh

diff -r 5ffd52fc35c4 -r bcec7bb4e089 aa_histogram.r
--- a/aa_histogram.r	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-library(ggplot2)
-
-args <- commandArgs(trailingOnly = TRUE)
-
-mutations.by.id.file = args[1]
-absent.aa.by.id.file = args[2]
-genes = strsplit(args[3], ",")[[1]]
-genes = c(genes, "")
-outdir = args[4]
-
-
-print("---------------- read input ----------------")
-
-mutations.by.id = read.table(mutations.by.id.file, sep="\t", fill=T, header=T, quote="")
-absent.aa.by.id = read.table(absent.aa.by.id.file, sep="\t", fill=T, header=T, quote="")
-
-for(gene in genes){
-    if(gene == ""){
-        mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),]
-        absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),]
-    } else {
-        mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),]
-        absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),]
-    }
-    print(paste("nrow", gene, nrow(absent.aa.by.id.gene)))
-    if(nrow(mutations.by.id.gene) == 0){
-        next
-    }
-
-    mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)])
-    aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)])
-
-    dat_freq = mutations.at.position / aa.at.position
-    dat_freq[is.na(dat_freq)] = 0
-    dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
-
-    print("---------------- plot ----------------")
-
-    m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=13, colour="black"))
-    m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i)
-    m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
-    m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")
-    m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2")
-    m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2")
-    m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3")
-    m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(paste(gene, "AA mutation frequency"))
-    m = m + theme(panel.background = element_rect(fill = "white", colour="black"), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
-    #m = m + scale_colour_manual(values=c("black"))
-
-    print("---------------- write/print ----------------")
-
-
-    dat.sums = data.frame(index=1:length(mutations.at.position), mutations.at.position=mutations.at.position, aa.at.position=aa.at.position)
-
-    write.table(dat.sums, paste(outdir, "/aa_histogram_sum_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
-    write.table(mutations.by.id.gene, paste(outdir, "/aa_histogram_count_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
-    write.table(absent.aa.by.id.gene, paste(outdir, "/aa_histogram_absent_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
-    write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
-
-    png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720)
-    print(m)
-    dev.off()
-}

diff -r 5ffd52fc35c4 -r bcec7bb4e089 baseline/Baseline_Functions.r
--- a/baseline/Baseline_Functions.r	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2287 +0,0 @@
-#########################################################################################
-# License Agreement
-#
-# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE
-# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER
-# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE
-# OR COPYRIGHT LAW IS PROHIBITED.
-#
-# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE
-# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED
-# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN
-# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
-#
-# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences
-# Coded by: Mohamed Uduman & Gur Yaari
-# Copyright 2012 Kleinstein Lab
-# Version: 1.3 (01/23/2014)
-#########################################################################################
-
-# Global variables
-
- FILTER_BY_MUTATIONS = 1000
-
- # Nucleotides
- NUCLEOTIDES = c("A","C","G","T")
-
- # Amino Acids
- AMINO_ACIDS <- c("F", "F", "L", "L", "S", "S", "S", "S", "Y", "Y", "*", "*", "C", "C", "*", "W", "L", "L", "L", "L", "P", "P", "P", "P", "H", "H", "Q", "Q", "R", "R", "R", "R", "I", "I", "I", "M", "T", "T", "T", "T", "N", "N", "K", "K", "S", "S", "R", "R", "V", "V", "V", "V", "A", "A", "A", "A", "D", "D", "E", "E", "G", "G", "G", "G")
- names(AMINO_ACIDS) <- c("TTT", "TTC", "TTA", "TTG", "TCT", "TCC", "TCA", "TCG", "TAT", "TAC", "TAA", "TAG", "TGT", "TGC", "TGA", "TGG", "CTT", "CTC", "CTA", "CTG", "CCT", "CCC", "CCA", "CCG", "CAT", "CAC", "CAA", "CAG", "CGT", "CGC", "CGA", "CGG", "ATT", "ATC", "ATA", "ATG", "ACT", "ACC", "ACA", "ACG", "AAT", "AAC", "AAA", "AAG", "AGT", "AGC", "AGA", "AGG", "GTT", "GTC", "GTA", "GTG", "GCT", "GCC", "GCA", "GCG", "GAT", "GAC", "GAA", "GAG", "GGT", "GGC", "GGA", "GGG")
- names(AMINO_ACIDS) <- names(AMINO_ACIDS)
-
- #Amino Acid Traits
- #"*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V" "W" "Y"
- #B = "Hydrophobic/Burried" N = "Intermediate/Neutral" S="Hydrophilic/Surface")
- TRAITS_AMINO_ACIDS_CHOTHIA98 <- c("*","N","B","S","S","B","N","N","B","S","B","B","S","N","S","S","N","N","B","B","N")
- names(TRAITS_AMINO_ACIDS_CHOTHIA98) <- sort(unique(AMINO_ACIDS))
- TRAITS_AMINO_ACIDS <- array(NA,21)
-
- # Codon Table
- CODON_TABLE <- as.data.frame(matrix(NA,ncol=64,nrow=12))
-
- # Substitution Model: Smith DS et al. 1996
- substitution_Literature_Mouse <- matrix(c(0, 0.156222928, 0.601501588, 0.242275484, 0.172506739, 0, 0.241239892, 0.586253369, 0.54636291, 0.255795364, 0, 0.197841727, 0.290240811, 0.467680608, 0.24207858, 0),nrow=4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))
- substitution_Flu_Human <- matrix(c(0,0.2795596,0.5026927,0.2177477,0.1693210,0,0.3264723,0.5042067,0.4983549,0.3328321,0,0.1688130,0.2021079,0.4696077,0.3282844,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))
- substitution_Flu25_Human <- matrix(c(0,0.2580641,0.5163685,0.2255674,0.1541125,0,0.3210224,0.5248651,0.5239281,0.3101292,0,0.1659427,0.1997207,0.4579444,0.3423350,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))
- load("FiveS_Substitution.RData")
-
- # Mutability Models: Shapiro GS et al. 2002
- triMutability_Literature_Human <- matrix(c(0.24, 1.2, 0.96, 0.43, 2.14, 2, 1.11, 1.9, 0.85, 1.83, 2.36, 1.31, 0.82, 0.52, 0.89, 1.33, 1.4, 0.82, 1.83, 0.73, 1.83, 1.62, 1.53, 0.57, 0.92, 0.42, 0.42, 1.47, 3.44, 2.58, 1.18, 0.47, 0.39, 1.12, 1.8, 0.68, 0.47, 2.19, 2.35, 2.19, 1.05, 1.84, 1.26, 0.28, 0.98, 2.37, 0.66, 1.58, 0.67, 0.92, 1.76, 0.83, 0.97, 0.56, 0.75, 0.62, 2.26, 0.62, 0.74, 1.11, 1.16, 0.61, 0.88, 0.67, 0.37, 0.07, 1.08, 0.46, 0.31, 0.94, 0.62, 0.57, 0.29, NA, 1.44, 0.46, 0.69, 0.57, 0.24, 0.37, 1.1, 0.99, 1.39, 0.6, 2.26, 1.24, 1.36, 0.52, 0.33, 0.26, 1.25, 0.37, 0.58, 1.03, 1.2,
[...]
-U = lapply(1:length(facLevels), function(x){
-    computeMutabilities(facLevels[x])
-  })
-  facIndex = match(facGL,facLevels)
-
-  LisGLs_Mutability = lapply(1:nrow(matInput), function(x){
-    cInput = rep(NA,nchar(matInput[x,1]))
-    cInput[s2c(matInput[x,1])!="N"] = 1
-    LisGLs_MutabilityU[[facIndex[x]]] * cInput
-  })
-
-  LisGLs_Targeting = lapply(1:dim(matInput)[1], function(x){
-    computeTargeting(matInput[x,2],LisGLs_Mutability[[x]])
-  })
-
-  LisGLs_MutationTypes = lapply(1:length(matInput[,2]),function(x){
-    #print(x)
-    computeMutationTypes(matInput[x,2])
-  })
-
-  LisGLs_R_Exp = lapply(1:nrow(matInput), function(x){
-    Exp_R <- rollapply(as.zoo(1:readEnd),width=3,by=3,
-                       function(codonNucs){
-                         RPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="R")
-                         sum( LisGLs_Targeting[[x]][,codonNucs][RPos], na.rm=T )
-                       }
-    )
-  })
-
-  LisGLs_S_Exp = lapply(1:nrow(matInput), function(x){
-    Exp_S <- rollapply(as.zoo(1:readEnd),width=3,by=3,
-                       function(codonNucs){
-                         SPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="S")
-                         sum( LisGLs_Targeting[[x]][,codonNucs][SPos], na.rm=T )
-                       }
-    )
-  })
-
-  Exp_R = matrix(unlist(LisGLs_R_Exp),nrow=nrow(matInput),ncol=readEnd/3,T)
-  Exp_S = matrix(unlist(LisGLs_S_Exp),nrow=nrow(matInput),ncol=readEnd/3,T)
-  return( list( "Expected_R"=Exp_R, "Expected_S"=Exp_S) )
-  }
-}
-
-# getObservedMutationsByCodon <- function(listMutations){
-#   numbSeqs <- length(listMutations)
-#   obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3))))
-#   obsMu_S <- obsMu_R
-#   temp <- mclapply(1:length(listMutations), function(i){
-#     arrMutations = listMutations[[i]]
-#     RPos = as.numeric(names(arrMutations)[arrMutations=="R"])
-#     RPos <- sapply(RPos,getCodonNumb)
-#     if(any(RPos)){
-#       tabR <- table(RPos)
-#       obsMu_R[i,as.numeric(names(tabR))] <<- tabR
-#     }
-#
-#     SPos = as.numeric(names(arrMutations)[arrMutations=="S"])
-#     SPos <- sapply(SPos,getCodonNumb)
-#     if(any(SPos)){
-#       tabS <- table(SPos)
-#       obsMu_S[i,names(tabS)] <<- tabS
-#     }
-#   }
-#   )
-#   return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) )
-# }
-
-getObservedMutationsByCodon <- function(listMutations){
-  numbSeqs <- length(listMutations)
-  obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3))))
-  obsMu_S <- obsMu_R
-  temp <- lapply(1:length(listMutations), function(i){
-    arrMutations = listMutations[[i]]
-    RPos = as.numeric(names(arrMutations)[arrMutations=="R"])
-    RPos <- sapply(RPos,getCodonNumb)
-    if(any(RPos)){
-      tabR <- table(RPos)
-      obsMu_R[i,as.numeric(names(tabR))] <<- tabR
-    }
-
-    SPos = as.numeric(names(arrMutations)[arrMutations=="S"])
-    SPos <- sapply(SPos,getCodonNumb)
-    if(any(SPos)){
-      tabS <- table(SPos)
-      obsMu_S[i,names(tabS)] <<- tabS
-    }
-  }
-  )
-  return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) )
-}

diff -r 5ffd52fc35c4 -r bcec7bb4e089 baseline/Baseline_Main.r
--- a/baseline/Baseline_Main.r	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,388 +0,0 @@
-#########################################################################################
-# License Agreement
-#
-# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE
-# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER
-# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE
-# OR COPYRIGHT LAW IS PROHIBITED.
-#
-# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE
-# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED
-# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN
-# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
-#
-# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences
-# Coded by: Mohamed Uduman & Gur Yaari
-# Copyright 2012 Kleinstein Lab
-# Version: 1.3 (01/23/2014)
-#########################################################################################
-
-op <- options();
-options(showWarnCalls=FALSE, showErrorCalls=FALSE, warn=-1)
-library('seqinr')
-if( F & Sys.info()[1]=="Linux"){
-  library("multicore")
-}
-
-# Load functions and initialize global variables
-source("Baseline_Functions.r")
-
-# Initialize parameters with user provided arguments
-  arg <- commandArgs(TRUE)
-  #arg = c(2,1,5,5,0,1,"1:26:38:55:65:104:116", "test.fasta","","sample")
-  #arg = c(1,1,5,5,0,1,"1:38:55:65:104:116:200", "test.fasta","","sample")
-  #arg = c(1,1,5,5,1,1,"1:26:38:55:65:104:116", "/home/mu37/Wu/Wu_Cloned_gapped_sequences_D-masked.fasta","/home/mu37/Wu/","Wu")
-  testID <- as.numeric(arg[1])                    # 1 = Focused, 2 = Local
-  species <- as.numeric(arg[2])                   # 1 = Human. 2 = Mouse
-  substitutionModel <- as.numeric(arg[3])         # 0 = Uniform substitution, 1 = Smith DS et al. 1996, 5 = FiveS
-  mutabilityModel <- as.numeric(arg[4])           # 0 = Uniform mutablity, 1 = Tri-nucleotide (Shapiro GS et al. 2002) , 5 = FiveS
-  clonal <- as.numeric(arg[5])                    # 0 = Independent sequences, 1 = Clonally related, 2 = Clonally related & only non-terminal mutations
-  fixIndels <- as.numeric(arg[6])                 # 0 = Do nothing, 1 = Try and fix Indels
-  region <- as.numeric(strsplit(arg[7],":")[[1]]) # StartPos:LastNucleotideF1:C1:F2:C2:F3:C3
-  inputFilePath <- arg[8]                         # Full path to input file
-  outputPath <- arg[9]                            # Full path to location of output files
-  outputID <- arg[10]                             # ID for session output
-
-
-  if(testID==5){
-    traitChangeModel <- 1
-    if( !is.na(any(arg[11])) ) traitChangeModel <- as.numeric(arg[11]) # 1 <- Chothia 1998
-    initializeTraitChange(traitChangeModel)
-  }
-
-# Initialize other parameters/variables
-
-  # Initialzie the codon table ( definitions of R/S )
-  computeCodonTable(testID)
-
-  # Initialize
-  # Test Name
-  testName<-"Focused"
-  if(testID==2) testName<-"Local"
-  if(testID==3) testName<-"Imbalanced"
-  if(testID==4) testName<-"ImbalancedSilent"
-
-  # Indel placeholders initialization
-  indelPos <- NULL
-  delPos <- NULL
-  insPos <- NULL
-
-  # Initialize in Tranistion & Mutability matrixes
-  substitution <- initializeSubstitutionMatrix(substitutionModel,species)
-  mutability <- initializeMutabilityMatrix(mutabilityModel,species)
-
-  # FWR/CDR boundaries
-  flagTrim <- F
-  if( is.na(region[7])){
-    flagTrim <- T
-    region[7]<-region[6]
-  }
-  readStart = min(region,na.rm=T)
-  readEnd = max(region,na.rm=T)
-  if(readStart>1){
-    region = region - (readStart - 1)
-  }
-  region_Nuc = c( (region[1]*3-2) , (region[2:7]*3) )
-  region_Cod = region
-
-  readStart = (readStart*3)-2
-  readEnd = (readEnd*3)
-
-  FWR_Nuc <- c( rep(TRUE,(region_Nuc[2])),
[...]
-umb]] = list("CDR"=bayesPDF_groups_cdr[[G]],"FWR"=bayesPDF_groups_fwr[[G]])
-    names(listPDFs)[rowNumb] = names(groups[groups==paste(G)])[1]
-    #if(names(groups)[which(groups==G)[1]]!="All sequences combined"){
-    gs = unique(germlines[groups==G])
-    rowNumb = rowNumb+1
-    if( !is.na(gs) ){
-      for( g in gs ){
-        matOutput[rowNumb,c(1,2,11:18)] = c("Germline",names(germlines)[germlines==g][1],bayes_germlines_cdr[g,],bayes_germlines_fwr[g,],simgaP_germlines_cdr[g],simgaP_germlines_fwr[g])
-        listPDFs[[rowNumb]] = list("CDR"=bayesPDF_germlines_cdr[[g]],"FWR"=bayesPDF_germlines_fwr[[g]])
-        names(listPDFs)[rowNumb] = names(germlines[germlines==paste(g)])[1]
-        rowNumb = rowNumb+1
-        indexesOfInterest = which(germlines==g)
-        numbSeqsOfInterest = length(indexesOfInterest)
-        rowNumb = seq(rowNumb,rowNumb+(numbSeqsOfInterest-1))
-        matOutput[rowNumb,] = matrix( c( rep("Sequence",numbSeqsOfInterest),
-                                         rownames(matInput)[indexesOfInterest],
-                                         c(matMutationInfo[indexesOfInterest,1:4]),
-                                         c(matMutationInfo[indexesOfInterest,5:8]),
-                                         c(bayes_cdr[indexesOfInterest,]),
-                                         c(bayes_fwr[indexesOfInterest,]),
-                                         c(simgaP_cdr[indexesOfInterest]),
-                                         c(simgaP_fwr[indexesOfInterest])
-                                       ), ncol=18, nrow=numbSeqsOfInterest,byrow=F)
-        increment=0
-        for( ioi in indexesOfInterest){
-          listPDFs[[min(rowNumb)+increment]] = list("CDR"=bayesPDF_cdr[[ioi]] , "FWR"=bayesPDF_fwr[[ioi]])
-          names(listPDFs)[min(rowNumb)+increment] = rownames(matInput)[ioi]
-          increment = increment + 1
-        }
-        rowNumb=max(rowNumb)+1
-
-      }
-    }
-  }
-  colsToFormat = 11:18
-  matOutput[,colsToFormat] = formatC( matrix(as.numeric(matOutput[,colsToFormat]), nrow=nrow(matOutput), ncol=length(colsToFormat)) , digits=3)
-  matOutput[matOutput== " NaN"] = NA
-
-
-
-  colnames(matOutput) = c("Type", "ID", "Observed_CDR_R", "Observed_CDR_S", "Observed_FWR_R", "Observed_FWR_S",
-                          "Expected_CDR_R", "Expected_CDR_S", "Expected_FWR_R", "Expected_FWR_S",
-                          paste( rep(testName,6), rep(c("Sigma","CIlower","CIupper"),2),rep(c("CDR","FWR"),each=3), sep="_"),
-                          paste( rep(testName,2), rep("P",2),c("CDR","FWR"), sep="_")
-                        )
-  fileName = paste(outputPath,outputID,".txt",sep="")
-  write.table(matOutput,file=fileName,quote=F,sep="\t",row.names=T,col.names=NA)
-  fileName = paste(outputPath,outputID,".RData",sep="")
-  save(listPDFs,file=fileName)
-
-indelWarning = FALSE
-if(sum(indelPos)>0){
-  indelWarning = "<P>Warning: The following sequences have either gaps and/or deletions, and have been ommited from the analysis.";
-  indelWarning = paste( indelWarning , "<UL>", sep="" )
-  for(indels in names(indelPos)[indelPos]){
-    indelWarning = paste( indelWarning , "<LI>", indels, "</LI>", sep="" )
-  }
-  indelWarning = paste( indelWarning , "</UL></P>", sep="" )
-}
-
-cloneWarning = FALSE
-if(clonal==1){
-  if(sum(matInputErrors)>0){
-    cloneWarning = "<P>Warning: The following clones have sequences of unequal length.";
-    cloneWarning = paste( cloneWarning , "<UL>", sep="" )
-    for(clone in names(matInputErrors)[matInputErrors]){
-      cloneWarning = paste( cloneWarning , "<LI>", names(germlines)[as.numeric(clone)], "</LI>", sep="" )
-    }
-    cloneWarning = paste( cloneWarning , "</UL></P>", sep="" )
-  }
-}
-cat(paste("Success",outputID,indelWarning,cloneWarning,sep="|"))

diff -r 5ffd52fc35c4 -r bcec7bb4e089 baseline/FiveS_Mutability.RData
Binary file baseline/FiveS_Mutability.RData has changed

diff -r 5ffd52fc35c4 -r bcec7bb4e089 baseline/FiveS_Substitution.RData
Binary file baseline/FiveS_Substitution.RData has changed

diff -r 5ffd52fc35c4 -r bcec7bb4e089 baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
--- a/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,703 +0,0 @@
->IGHV1-18*01
-caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga
->IGHV1-18*02
-caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctaagatctgacgacacggcc
->IGHV1-18*03
-caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacatggccgtgtattactgtgcgagaga
->IGHV1-18*04
-caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctacggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga
->IGHV1-2*01
-caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccagtaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga
->IGHV1-2*02
-caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga
->IGHV1-2*03
-caggtgcagctggtgcagtctggggct...gaggtgaagaagcttggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcnacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga
->IGHV1-2*04
-caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggctgggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga
->IGHV1-2*05
-caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga
->IGHV1-24*01
-caggtccagctggtacagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggtttccggatacaccctc............actgaattatccatgcactgggtgcgacaggctcctggaaaagggcttgagtggatgggaggttttgatcctgaa......gatggtgaaacaatctacgcacagaagttccag...ggcagagtcaccatgaccgaggacacatctacagacacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcaacaga
->IGHV1-3*01
-caggtccagcttgtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggatcaacgctggc......aatggtaacacaaaatattcacagaagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaagacacggctgtgtattactgtgcgagaga
->IGHV1-3*02
-caggttcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggagcaacgctggc......aatggtaacacaaaatattcacaggagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaggacatg
[...]
-aggtgcagctgttgcagtctgcagca...gaggtgaaaagacccggggagtctctgaggatctcctgtaagacttctggatacagcttt............accagctactggatccactgggtgcgccagatgcccgggaaagaactggagtggatggggagcatctatcctggg......aactctgataccagatacagcccatccttccaa...ggccacgtcaccatctcagccgacagctccagcagcaccgcctacctgcagtggagcagcctgaaggcctcggacgccgccatgtattattgtgtgaga
->IGHV6-1*01
-caggtacagctgcagcagtcaggtcca...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga
->IGHV6-1*02
-caggtacagctgcagcagtcaggtccg...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga
->IGHV7-34-1*01
-...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......actgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta
->IGHV7-34-1*02
-...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......aatgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta
->IGHV7-4-1*01
-caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatctgcagcctaaaggctgaggacactgccgtgtattactgtgcgaga
->IGHV7-4-1*02
-caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga
->IGHV7-4-1*03
-caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcacgctaaaggctgaggacactg
->IGHV7-4-1*04
-caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga
->IGHV7-4-1*05
-caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtgttactgtgcgagaga
->AIGHV7-40*03|
-ttttcaatagaaaagtcaaataatcta...agtgtcaatcagtggatgattagataaaatatgatatatgtaaatcatggaatactatgc............agccagtatggtatgaattcagtgtgaccagcccctggacaagggcttgagtggatgggatggatcatcacctac......actgggaacccaacatataccaacggcttcaca...ggacggtttctattctccatggacacctctgtcagcatggcgtatctgcagatcagcagcctaaaggctgaggacacggccgtgtatgactgtatgagaga
->IGHV7-81*01
-caggtgcagctggtgcagtctggccat...gaggtgaagcagcctggggcctcagtgaaggtctcctgcaaggcttctggttacagtttc............accacctatggtatgaattgggtgccacaggcccctggacaagggcttgagtggatgggatggttcaacacctac......actgggaacccaacatatgcccagggcttcaca...ggacggtttgtcttctccatggacacctctgccagcacagcatacctgcagatcagcagcctaaaggctgaggacatggccatgtattactgtgcgagata

diff -r 5ffd52fc35c4 -r bcec7bb4e089 baseline/comparePDFs.r
--- a/baseline/comparePDFs.r	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,225 +0,0 @@
-options("warn"=-1)
-
-#from http://selection.med.yale.edu/baseline/Archive/Baseline%20Version%201.3/Baseline_Functions_Version1.3.r
-# Compute p-value of two distributions
-compareTwoDistsFaster <-function(sigma_S=seq(-20,20,length.out=4001), N=10000, dens1=runif(4001,0,1), dens2=runif(4001,0,1)){
-#print(c(length(dens1),length(dens2)))
-if(length(dens1)>1 & length(dens2)>1 ){
-	dens1<-dens1/sum(dens1)
-	dens2<-dens2/sum(dens2)
-	cum2 <- cumsum(dens2)-dens2/2
-	tmp<- sum(sapply(1:length(dens1),function(i)return(dens1[i]*cum2[i])))
-	#print(tmp)
-	if(tmp>0.5)tmp<-tmp-1
-	return( tmp )
-	}
-	else {
-	return(NA)
-	}
-	#return (sum(sapply(1:N,function(i)(sample(sigma_S,1,prob=dens1)>sample(sigma_S,1,prob=dens2))))/N)
-}
-
-
-require("grid")
-arg <- commandArgs(TRUE)
-#arg <- c("300143","4","5")
-arg[!arg=="clonal"]
-input <- arg[1]
-output <- arg[2]
-rowIDs <- as.numeric( sapply(arg[3:(max(3,length(arg)))],function(x){ gsub("chkbx","",x) } ) )
-
-numbSeqs = length(rowIDs)
-
-if ( is.na(rowIDs[1]) | numbSeqs>10 ) {
-  stop( paste("Error: Please select between one and 10 seqeunces to compare.") )
-}
-
-#load( paste("output/",sessionID,".RData",sep="") )
-load( input )
-#input
-
-xMarks = seq(-20,20,length.out=4001)
-
-plot_grid_s<-function(pdf1,pdf2,Sample=100,cex=1,xlim=NULL,xMarks = seq(-20,20,length.out=4001)){
-  yMax = max(c(abs(as.numeric(unlist(listPDFs[pdf1]))),abs(as.numeric(unlist(listPDFs[pdf2]))),0),na.rm=T) * 1.1
-
-  if(length(xlim==2)){
-    xMin=xlim[1]
-    xMax=xlim[2]
-  } else {
-    xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]
-    xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]
-    xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]
-    xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]
-
-    xMin_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][1]
-    xMin_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][1]
-    xMax_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001])]
-    xMax_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001])]
-
-    xMin=min(c(xMin_CDR,xMin_FWR,xMin_CDR2,xMin_FWR2,0),na.rm=TRUE)
-    xMax=max(c(xMax_CDR,xMax_FWR,xMax_CDR2,xMax_FWR2,0),na.rm=TRUE)
-  }
-
-  sigma<-approx(xMarks,xout=seq(xMin,xMax,length.out=Sample))$x
-  grid.rect(gp = gpar(col=gray(0.6),fill="white",cex=cex))
-  x <- sigma
-  pushViewport(viewport(x=0.175,y=0.175,width=0.825,height=0.825,just=c("left","bottom"),default.units="npc"))
-  #pushViewport(plotViewport(c(1.8, 1.8, 0.25, 0.25)*cex))
-  pushViewport(dataViewport(x, c(yMax,-yMax),gp = gpar(cex=cex),extension=c(0.05)))
-  grid.polygon(c(0,0,1,1),c(0,0.5,0.5,0),gp=gpar(col=grey(0.95),fill=grey(0.95)),default.units="npc")
-  grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.9),fill=grey(0.9)),default.units="npc")
-  grid.rect()
-  grid.xaxis(gp = gpar(cex=cex/1.1))
-  yticks = pretty(c(-yMax,yMax),8)
-  yticks = yticks[yticks>(-yMax) & yticks<(yMax)]
-  grid.yaxis(at=yticks,label=abs(yticks),gp = gpar(cex=cex/1.1))
-  if(length(listPDFs[pdf1][[1]][["CDR"]])>1){
-    ycdr<-approx(xMarks,listPDFs[pdf1][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y
-    grid.lines(unit(x,"native"), unit(ycdr,"native"),gp=gpar(col=2,lwd=2))
-  }
-  if(length(listPDFs[pdf1][[1]][["FWR"]])>1){
-    yfwr<-approx(xMarks,listPDFs[pdf1][[1]][["FWR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y
-    grid.lines(unit(x,"native"), unit(-yfwr,"native"),gp=gpar(col=4,lwd=2))
-  }
-
-  if(length(listPDFs[pdf2][[1]][["CDR"]])>1){
-    ycdr2<-approx(xMarks,listPDFs[pdf2][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y
-    grid.lines(unit(x,"native"), unit(ycdr2,"native"),gp=gpar(col=2,lwd
[...]
-npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))
-    grid.text(formatC(as.numeric(pCDR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))
-    grid.text(formatC(as.numeric(pFWR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.25, "npc"),just=c("center", "center"),gp = gpar(cex=cex))
-
-
-    # grid.text(paste("P = ",formatC(pCDRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.98, "npc"),just=c("center", "top"),gp = gpar(cex=cex))
-    # grid.text(paste("P = ",formatC(pFWRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.02, "npc"),just=c("center", "bottom"),gp = gpar(cex=cex))
-  }
-  else{
-  }
-}
-
-
-##################################################################################
-################## The whole OCD's matrix ########################################
-##################################################################################
-
-#pdf(width=4*numbSeqs+1/3,height=4*numbSeqs+1/3)
-pdf( output ,width=4*numbSeqs+1/3,height=4*numbSeqs+1/3)
-
-pushViewport(viewport(x=0.02,y=0.02,just = c("left", "bottom"),w =0.96,height=0.96,layout = grid.layout(numbSeqs+1,numbSeqs+1,widths=unit.c(unit(rep(1,numbSeqs),"null"),unit(4,"lines")),heights=unit.c(unit(4,"lines"),unit(rep(1,numbSeqs),"null")))))
-
-for( seqOne in 1:numbSeqs+1){
-  pushViewport(viewport(layout.pos.col = seqOne-1, layout.pos.row = 1))
-  if(seqOne>2){
-    grid.polygon(c(0,0,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")
-    grid.polygon(c(1,1,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")
-    grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.5)),default.units="npc")
-
-    grid.text(y=.25,x=0.75,"FWR",gp = gpar(cex=1.5),just="center")
-    grid.text(y=.25,x=0.25,"CDR",gp = gpar(cex=1.5),just="center")
-  }
-  grid.rect(gp = gpar(col=grey(0.9)))
-  grid.text(y=.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),just="center")
-  popViewport(1)
-}
-
-for( seqOne in 1:numbSeqs+1){
-  pushViewport(viewport(layout.pos.row = seqOne, layout.pos.col = numbSeqs+1))
-  if(seqOne<=numbSeqs){
-    grid.polygon(c(0,0.5,0.5,0),c(0,0,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")
-    grid.polygon(c(0,0.5,0.5,0),c(1,1,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")
-    grid.polygon(c(1,0.5,0.5,1),c(0,0,1,1),gp=gpar(col=grey(0.5)),default.units="npc")
-    grid.text(x=.25,y=0.75,"CDR",gp = gpar(cex=1.5),just="center",rot=270)
-    grid.text(x=.25,y=0.25,"FWR",gp = gpar(cex=1.5),just="center",rot=270)
-  }
-  grid.rect(gp = gpar(col=grey(0.9)))
-  grid.text(x=0.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),rot=270,just="center")
-  popViewport(1)
-}
-
-for( seqOne in 1:numbSeqs+1){
-  for(seqTwo in 1:numbSeqs+1){
-    pushViewport(viewport(layout.pos.col = seqTwo-1, layout.pos.row = seqOne))
-    if(seqTwo>seqOne){
-      plot_pvals(rowIDs[seqOne-1],rowIDs[seqTwo-1],cex=2)
-      grid.rect()
-    }
-    popViewport(1)
-  }
-}
-
-
-xMin=0
-xMax=0.01
-for(pdf1 in rowIDs){
-  xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]
-  xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]
-  xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]
-  xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]
-  xMin=min(c(xMin_CDR,xMin_FWR,xMin),na.rm=TRUE)
-  xMax=max(c(xMax_CDR,xMax_FWR,xMax),na.rm=TRUE)
-}
-
-
-
-for(i in 1:numbSeqs+1){
-  for(j in (i-1):numbSeqs){
-    pushViewport(viewport(layout.pos.col = i-1, layout.pos.row = j+1))
-    grid.rect()
-    plot_grid_s(rowIDs[i-1],rowIDs[j],cex=1)
-    popViewport(1)
-  }
-}
-
-dev.off()
-
-cat("Success", paste(rowIDs,collapse="_"),sep=":")

diff -r 5ffd52fc35c4 -r bcec7bb4e089 baseline/filter.r
--- a/baseline/filter.r	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,35 +0,0 @@
-arg = commandArgs(TRUE)
-summaryfile = arg[1]
-gappedfile = arg[2]
-selection = arg[3]
-output = arg[4]
-print(paste("selection = ", selection))
-
-
-summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
-gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
-
-#dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))
-
-dat = cbind(gappeddat, summarydat$AA.JUNCTION)
-
-colnames(dat)[length(dat)] = "AA.JUNCTION"
-
-dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele)
-dat$VGene = gsub("[*].*", "", dat$VGene)
-
-dat$DGene = gsub("^Homsap ", "", dat$D.GENE.and.allele)
-dat$DGene = gsub("[*].*", "", dat$DGene)
-
-dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
-dat$JGene = gsub("[*].*", "", dat$JGene)
-
-#print(str(dat))
-
-dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":"))
-
-dat = dat[!duplicated(dat$past), ]
-
-dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",]
-
-write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)

diff -r 5ffd52fc35c4 -r bcec7bb4e089 baseline/script_imgt.py
--- a/baseline/script_imgt.py	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,79 +0,0 @@
-#import xlrd #avoid dep
-import argparse
-import re
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
-parser.add_argument("--ref", help="Reference file")
-parser.add_argument("--output", help="Output file")
-parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
-
-args = parser.parse_args()
-
-refdic = dict()
-with open(args.ref, 'r') as ref:
-    currentSeq = ""
-    currentId = ""
-    for line in ref:
-        if line[0] is ">":
-            if currentSeq is not "" and currentId is not "":
-                refdic[currentId[1:]] = currentSeq
-            currentId = line.rstrip()
-            currentSeq = ""
-        else:
-            currentSeq += line.rstrip()
-    refdic[currentId[1:]] = currentSeq
-
-
-vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
-#            r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
-#            r"(IGKV[0-3]D?-[0-9]{1,2})",
-#            r"(IGLV[0-9]-[0-9]{1,2})",
-#            r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
-#            r"(TRGV[234589])",
-#            r"(TRDV[1-3])"]
-
-#vPattern = re.compile(r"|".join(vPattern))
-vPattern = re.compile("|".join(vPattern))
-
-def filterGene(s, pattern):
-    if type(s) is not str:
-        return None
-    res = pattern.search(s)
-    if res:
-        return res.group(0)
-    return None
-
-
-
-currentSeq = ""
-currentId = ""
-first=True
-with open(args.input, 'r') as i:
-    with open(args.output, 'a') as o:
-        o.write(">>>" + args.id + "\n")
-        outputdic = dict()
-        for line in i:
-            if first:
-                first = False
-                continue
-            linesplt = line.split("\t")
-            ref = filterGene(linesplt[1], vPattern)
-            if not ref or not linesplt[2].rstrip():
-                continue
-            if ref in outputdic:
-                outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
-            else:
-                outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
-        #print outputdic
-
-        for k in outputdic.keys():
-            if k in refdic:
-                o.write(">>" + k + "\n")
-                o.write(refdic[k] + "\n")
-                for seq in outputdic[k]:
-                    #print seq
-                    o.write(">" + seq[0] + "\n")
-                    o.write(seq[1] + "\n")
-            else:
-                print k + " not in reference, skipping " + k

diff -r 5ffd52fc35c4 -r bcec7bb4e089 baseline/script_xlsx.py
--- a/baseline/script_xlsx.py	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,58 +0,0 @@
-import xlrd
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
-parser.add_argument("--ref", help="Reference file")
-parser.add_argument("--output", help="Output file")
-
-args = parser.parse_args()
-
-gene_column = 6
-id_column = 7
-seq_column = 8
-LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
-
-
-refdic = dict()
-with open(args.ref, 'r') as ref:
-    currentSeq = ""
-    currentId = ""
-    for line in ref.readlines():
-        if line[0] is ">":
-            if currentSeq is not "" and currentId is not "":
-                refdic[currentId[1:]] = currentSeq
-            currentId = line.rstrip()
-            currentSeq = ""
-        else:
-            currentSeq += line.rstrip()
-    refdic[currentId[1:]] = currentSeq
-
-currentSeq = ""
-currentId = ""
-with xlrd.open_workbook(args.input, 'r') as wb:
-    with open(args.output, 'a') as o:
-        for sheet in wb.sheets():
-            if sheet.cell(1,gene_column).value.find("IGHV") < 0:
-                print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name
-                continue
-            o.write(">>>" + sheet.name + "\n")
-            outputdic = dict()
-            for rowindex in range(1, sheet.nrows):
-                ref = sheet.cell(rowindex, gene_column).value.replace(">", "")
-                if ref in outputdic:
-                    outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
-                else:
-                    outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
-            #print outputdic
-
-            for k in outputdic.keys():
-                if k in refdic:
-                    o.write(">>" + k + "\n")
-                    o.write(refdic[k] + "\n")
-                    for seq in outputdic[k]:
-                        #print seq
-                        o.write(">" + seq[0] + "\n")
-                        o.write(seq[1] + "\n")
-                else:
-                    print k + " not in reference, skipping " + k

diff -r 5ffd52fc35c4 -r bcec7bb4e089 baseline/wrapper.sh
--- a/baseline/wrapper.sh	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,104 +0,0 @@
-#!/bin/bash
-dir="$(cd "$(dirname "$0")" && pwd)"
-
-testID=$1
-species=$2
-substitutionModel=$3
-mutabilityModel=$4
-clonal=$5
-fixIndels=$6
-region=$7
-inputs=$8
-inputs=($inputs)
-IDs=$9
-IDs=($IDs)
-ref=${10}
-output=${11}
-selection=${12}
-output_table=${13}
-outID="result"
-
-echo "$PWD"
-
-echo "testID = $testID"
-echo "species = $species"
-echo "substitutionModel = $substitutionModel"
-echo "mutabilityModel = $mutabilityModel"
-echo "clonal = $clonal"
-echo "fixIndels = $fixIndels"
-echo "region = $region"
-echo "inputs = ${inputs[@]}"
-echo "IDs = ${IDs[@]}"
-echo "ref = $ref"
-echo "output = $output"
-echo "outID = $outID"
-
-fasta="$PWD/baseline.fasta"
-
-
-count=0
-for current in ${inputs[@]}
-do
-	f=$(file $current)
-	zipType="Zip archive"
-	if [[ "$f" == *"$zipType"* ]] || [[ "$f" == *"XZ compressed data"* ]]
-	then
-		id=${IDs[$count]}
-		echo "id=$id"
-		if [[ "$f" == *"Zip archive"* ]] ; then
-			echo "Zip archive"
-			echo "unzip $input -d $PWD/files/"
-			unzip $current -d "$PWD/$id/"
-		elif [[ "$f" == *"XZ compressed data"* ]] ; then
-			echo "ZX archive"
-			echo "tar -xJf $input -C $PWD/files/"
-			mkdir -p "$PWD/$id/files"
-			tar -xJf $current -C "$PWD/$id/files/"
-		fi
-		summaryfile="$PWD/summary_${id}.txt"
-		gappedfile="$PWD/gappednt_${id}.txt"
-		filtered="$PWD/filtered_${id}.txt"
-		filecount=`ls -l $PWD/$id/ | wc -l`
-		if [[ "$filecount" -eq "2" ]]
-		then
-			cat $PWD/$id/*/1_* > $summaryfile
-			cat $PWD/$id/*/2_* > $gappedfile
-		else
-			cat $PWD/$id/1_* > $summaryfile
-			cat $PWD/$id/2_* > $gappedfile
-		fi
-		Rscript $dir/filter.r $summaryfile $gappedfile "$selection" $filtered 2>&1
-
-		final="$PWD/final_${id}.txt"
-		cat $filtered | cut -f2,4,7 > $final
-		python $dir/script_imgt.py --input $final --ref $ref --output $fasta --id $id
-	else
-		python $dir/script_xlsx.py --input $current --ref $ref --output $fasta
-	fi
-	count=$((count+1))
-done
-
-if [[ $(wc -l < $fasta) -eq "1" ]]; then
-	echo "No sequences in the fasta file, exiting"
-	exit 0
-fi
-
-workdir="$PWD"
-cd $dir
-echo "file: ${inputs[0]}"
-#Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region ${inputs[0]} $workdir/ $outID 2>&1
-Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region $fasta $workdir/ $outID 2>&1
-
-echo "$workdir/${outID}.txt"
-
-rows=`tail -n +2 $workdir/${outID}.txt | grep -v "All sequences combined" | grep -n 'Group' | grep -Eoh '^[0-9]+' | tr '\n' ' '`
-rows=($rows)
-#unset rows[${#rows[@]}-1]
-
-cd $dir
-Rscript --verbose $dir/comparePDFs.r $workdir/${outID}.RData $output ${rows[@]} 2>&1
-cp $workdir/result.txt ${output_table}

diff -r 5ffd52fc35c4 -r bcec7bb4e089 change_o/DefineClones.py
--- a/change_o/DefineClones.py	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1052 +0,0 @@
-#!/usr/bin/env python3
-"""
-Assign Ig sequences into clones
-"""
-# Info
-__author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman'
-from changeo import __version__, __date__
-
-# Imports
-import os
-import re
-import sys
-import numpy as np
-from argparse import ArgumentParser
-from collections import OrderedDict
-from itertools import chain
-from textwrap import dedent
-from time import time
-from Bio import pairwise2
-from Bio.Seq import translate
-
-# Presto and changeo imports
-from presto.Defaults import default_out_args
-from presto.IO import getFileType, getOutputHandle, printLog, printProgress
-from presto.Multiprocessing import manageProcesses
-from presto.Sequence import getDNAScoreDict
-from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
-from changeo.Distance import getDNADistMatrix, getAADistMatrix, \
-                             hs1f_model, m1n_model, hs5f_model, \
-                             calcDistances, formClusters
-from changeo.IO import getDbWriter, readDbFile, countDbFile
-from changeo.Multiprocessing import DbData, DbResult
-
-# Defaults
-default_translate = False
-default_distance = 0.0
-default_bygroup_model = 'hs1f'
-default_hclust_model = 'chen2010'
-default_seq_field = 'JUNCTION'
-default_norm = 'len'
-default_sym = 'avg'
-default_linkage = 'single'
-
-# TODO: should be in Distance, but need to be after function definitions
-# Amino acid Hamming distance
-aa_model = getAADistMatrix(mask_dist=1, gap_dist=0)
-
-# DNA Hamming distance
-ham_model = getDNADistMatrix(mask_dist=0, gap_dist=0)
-
-
-# TODO: this function is an abstraction to facilitate later cleanup
-def getModelMatrix(model):
-    """
-    Simple wrapper to get distance matrix from model name
-
-    Arguments:
-    model = model name
-
-    Return:
-    a pandas.DataFrame containing the character distance matrix
-    """
-    if model == 'aa':
-        return(aa_model)
-    elif model == 'ham':
-        return(ham_model)
-    elif model == 'm1n':
-        return(m1n_model)
-    elif model == 'hs1f':
-        return(hs1f_model)
-    elif model == 'hs5f':
-        return(hs5f_model)
-    else:
-        sys.stderr.write('Unrecognized distance model: %s.\n' % model)
-
-
-def indexJunctions(db_iter, fields=None, mode='gene', action='first'):
-    """
-    Identifies preclonal groups by V, J and junction length
-
-    Arguments:
-    db_iter = an iterator of IgRecords defined by readDbFile
-    fields = additional annotation fields to use to group preclones;
-             if None use only V, J and junction length
-    mode = specificity of alignment call to use for assigning preclones;
-           one of ('allele', 'gene')
-    action = how to handle multiple value fields when assigning preclones;
-             one of ('first', 'set')
-
-    Returns:
-    a dictionary of {(V, J, junction length):[IgRecords]}
-    """
-    # Define functions for grouping keys
-    if mode == 'allele' and fields is None:
-        def _get_key(rec, act):
-            return (rec.getVAllele(act), rec.getJAllele(act),
-                    None if rec.junction is None else len(rec.junction))
-    elif mode == 'gene' and fields is None:
-        def _get_key(rec, act):
-            return (rec.getVGene(act), rec.getJGene(act),
-                    None if rec.junction is None else len(rec.junction))
-    elif mode == 'allele' and fields is not None:
-        def _get_key(rec, act):
-            vdj = [rec.getVAllele(act), rec.getJAllele(act),
-                   None if rec.junction is None else len(rec.junction)]
-            ann = [rec.toDict().get(k, None) for k in fields]
-            return tuple(chain(vdj, ann))
-    elif mode == 'gene' and fields is not None:
-        def _get_key(rec, act):
-            vdj = [rec.getVGene(act), rec.getJGene(act),
-                   None if rec.junction is None else len(rec.junction)]
-            ann = [rec.toDict().get(k, None
[...]
-    parser_bygroup.add_argument('--sf', action='store', dest='seq_field',
-                                default=default_seq_field,
-                                help='''The name of the field to be used to calculate
-                                     distance between records''')
-    parser_bygroup.set_defaults(feed_func=feedQueue)
-    parser_bygroup.set_defaults(work_func=processQueue)
-    parser_bygroup.set_defaults(collect_func=collectQueue)
-    parser_bygroup.set_defaults(group_func=indexJunctions)
-    parser_bygroup.set_defaults(clone_func=distanceClones)
-
-
-    # Hierarchical clustering cloning method
-    parser_hclust = subparsers.add_parser('hclust', parents=[parser_parent],
-                                          formatter_class=CommonHelpFormatter,
-                                          help='Defines clones by specified distance metric on CDR3s and \
-                                                cutting of hierarchical clustering tree')
-#    parser_hclust.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
-#                               help='Fields to use for grouping clones (non VDJ)')
-    parser_hclust.add_argument('--method', action='store', dest='method',
-                               choices=('chen2010', 'ademokun2011'), default=default_hclust_model,
-                               help='Specifies which cloning method to use for calculating distance \
-                                     between CDR3s, computing linkage, and cutting clusters')
-    parser_hclust.set_defaults(feed_func=feedQueueClust)
-    parser_hclust.set_defaults(work_func=processQueueClust)
-    parser_hclust.set_defaults(collect_func=collectQueueClust)
-    parser_hclust.set_defaults(cluster_func=hierClust)
-
-    return parser
-
-
-if __name__ == '__main__':
-    """
-    Parses command line arguments and calls main function
-    """
-    # Parse arguments
-    parser = getArgParser()
-    args = parser.parse_args()
-    args_dict = parseCommonArgs(args)
-    # Convert case of fields
-    if 'seq_field' in args_dict:
-        args_dict['seq_field'] = args_dict['seq_field'].upper()
-    if 'fields' in args_dict and args_dict['fields'] is not None:
-        args_dict['fields'] = [f.upper() for f in args_dict['fields']]
-
-    # Define clone_args
-    if args.command == 'bygroup':
-        args_dict['group_args'] = {'fields': args_dict['fields'],
-                                   'action': args_dict['action'],
-                                   'mode':args_dict['mode']}
-        args_dict['clone_args'] = {'model':  args_dict['model'],
-                                   'distance':  args_dict['distance'],
-                                   'norm': args_dict['norm'],
-                                   'sym': args_dict['sym'],
-                                   'linkage': args_dict['linkage'],
-                                   'seq_field':  args_dict['seq_field']}
-
-        # TODO:  can be cleaned up with abstract model class
-        args_dict['clone_args']['dist_mat'] = getModelMatrix(args_dict['model'])
-
-        del args_dict['fields']
-        del args_dict['action']
-        del args_dict['mode']
-        del args_dict['model']
-        del args_dict['distance']
-        del args_dict['norm']
-        del args_dict['sym']
-        del args_dict['linkage']
-        del args_dict['seq_field']
-
-    # Define clone_args
-    if args.command == 'hclust':
-        dist_funcs = {'chen2010':distChen2010, 'ademokun2011':distAdemokun2011}
-        args_dict['clone_func'] = dist_funcs[args_dict['method']]
-        args_dict['cluster_args'] = {'method':  args_dict['method']}
-        #del args_dict['fields']
-        del args_dict['method']
-
-    # Call defineClones
-    del args_dict['command']
-    del args_dict['db_files']
-    for f in args.__dict__['db_files']:
-        args_dict['db_file'] = f
-        defineClones(**args_dict)
\ No newline at end of file

diff -r 5ffd52fc35c4 -r bcec7bb4e089 change_o/MakeDb.py
--- a/change_o/MakeDb.py	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1025 +0,0 @@
-#!/usr/bin/env python3
-"""
-Create tab-delimited database file to store sequence alignment information
-"""
-# Info
-__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
-from changeo import __version__, __date__
-
-# Imports
-import csv
-import os
-import re
-import sys
-import pandas as pd
-import tarfile
-import zipfile
-from argparse import ArgumentParser
-from collections import OrderedDict
-from itertools import groupby
-from shutil import rmtree
-from tempfile import mkdtemp
-from textwrap import dedent
-from time import time
-from Bio import SeqIO
-from Bio.Seq import Seq
-from Bio.Alphabet import IUPAC
-
-# Presto and changeo imports
-from presto.Defaults import default_out_args
-from presto.Annotation import parseAnnotation
-from presto.IO import countSeqFile, printLog, printProgress
-from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
-from changeo.IO import getDbWriter, countDbFile, getRepo
-from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \
-                             j_allele_regex
-
-# Default parameters
-default_delimiter = ('\t', ',', '-')
-
-
-def gapV(ig_dict, repo_dict):
-    """
-    Insert gaps into V region and update alignment information
-
-    Arguments:
-      ig_dict : Dictionary of parsed IgBlast output
-      repo_dict : Dictionary of IMGT gapped germline sequences
-
-    Returns:
-      dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields
-    """
-
-    seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ']
-
-    # Find gapped germline V segment
-    vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first')
-    vkey = (vgene, )
-    #TODO: Figure out else case
-    if vkey in repo_dict:
-        vgap = repo_dict[vkey]
-        # Iterate over gaps in the germline segment
-        gaps = re.finditer(r'\.', vgap)
-        gapcount = int(ig_dict['V_GERM_START_VDJ'])-1
-        for gap in gaps:
-            i = gap.start()
-            # Break if gap begins after V region
-            if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount:
-                break
-            # Insert gap into IMGT sequence
-            seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
-            # Update gap counter
-            gapcount += 1
-        ig_dict['SEQUENCE_IMGT'] = seq_imgt
-        # Update IMGT positioning information for V
-        ig_dict['V_GERM_START_IMGT'] = 1
-        ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount
-
-    return ig_dict
-
-
-def getIMGTJunc(ig_dict, repo_dict):
-    """
-    Identify junction region by IMGT definition
-
-    Arguments:
-      ig_dict : Dictionary of parsed IgBlast output
-      repo_dict : Dictionary of IMGT gapped germline sequences
-
-    Returns:
-      dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields
-    """
-    # Find germline J segment
-    jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first')
-    jkey = (jgene, )
-    #TODO: Figure out else case
-    if jkey in repo_dict:
-        # Get germline J sequence
-        jgerm = repo_dict[jkey]
-        jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1]
-        # Look for (F|W)GXG aa motif in nt sequence
-        motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm)
-        aa_end = len(ig_dict['SEQUENCE_IMGT'])
-        #TODO: Figure out else case
-        if motif:
-            # print('\n', motif.group())
-            aa_end = motif.start() - len(jgerm) + 3
-        # Add fields to dict
-        ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end]
-        ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION'])
-
-    return ig_dict
-
-
-def getRegions(ig_dict):
-    """
-    Identify FWR and CDR regions by IMGT definition
-
-    Arguments:
-      ig_dict : Dictionary of parsed alignment output
-
-    Returns:
-      dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMG
[...]
-P, J_SCORE, J_IDENTITY,
-                                          J_BTOP, and J_EVALUE columns.''')
-    parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
-                                help='''Specify if IMGT framework and CDR regions should be
-                                     included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
-                                     FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
-                                     CDR3_IMGT columns.''')
-
-    # IMGT aligner
-    parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output',
-                                        parents=[parser_parent],
-                                        formatter_class=CommonHelpFormatter)
-    imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True)
-    imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
-                                help='''Either zipped IMGT output files (.zip) or a folder
-                                     containing unzipped IMGT output files (which must
-                                     include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
-                                     and 6_Junction).''')
-    parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
-                             required=False,
-                             help='List of input FASTA files containing sequences')
-    parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
-                             help='''Specify if input IDs should not be parsed to add new
-                                  columns to database.''')
-    parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
-                             help='''Specify if alignment score metrics should be
-                                  included in the output. Adds the V_SCORE, V_IDENTITY,
-                                  J_SCORE and J_IDENTITY. Note, this will also add
-                                  the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
-                                  but they will be empty for IMGT output.''')
-    parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
-                             help='''Specify if IMGT framework and CDR regions should be
-                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
-                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
-                                  CDR3_IMGT columns.''')
-    parser_imgt.set_defaults(func=parseIMGT)
-
-    return parser
-
-
-if __name__ == "__main__":
-    """
-    Parses command line arguments and calls main
-    """
-    parser = getArgParser()
-    args = parser.parse_args()
-    args_dict = parseCommonArgs(args, in_arg='aligner_files')
-
-    # Set no ID parsing if sequence files are not provided
-    if 'seq_files' in args_dict and not args_dict['seq_files']:
-        args_dict['no_parse'] = True
-
-    # Delete
-    if 'seq_files' in args_dict: del args_dict['seq_files']
-    if 'aligner_files' in args_dict: del args_dict['aligner_files']
-    if 'command' in args_dict: del args_dict['command']
-    if 'func' in args_dict: del args_dict['func']
-
-    if args.command == 'imgt':
-        for i in range(len(args.__dict__['aligner_files'])):
-            args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
-            args_dict['seq_file'] = args.__dict__['seq_files'][i] \
-                                    if args.__dict__['seq_files'] else None
-            args.func(**args_dict)
-    elif args.command == 'igblast':
-        for i in range(len(args.__dict__['aligner_files'])):
-            args_dict['igblast_output'] = args.__dict__['aligner_files'][i]
-            args_dict['seq_file'] = args.__dict__['seq_files'][i]
-            args.func(**args_dict)
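The gapV() routine in the removed MakeDb.py is the heart of the IMGT gapping step: it pads the query with dots up to the germline start position, then copies every gap position from the IMGT-gapped germline V segment into the query. A minimal, self-contained sketch of that idea follows (toy sequences, no changeo dependencies; all names here are illustrative, not part of the removed file):

    import re

    def insert_imgt_gaps(seq_vdj, germ_start, germ_length, gapped_germline):
        """Copy IMGT gap positions ('.') from a gapped germline into a query sequence."""
        # Pad the query so index 0 lines up with germline position 1
        seq_imgt = '.' * (germ_start - 1) + seq_vdj
        gapcount = germ_start - 1
        for gap in re.finditer(r'\.', gapped_germline):
            i = gap.start()
            if i >= germ_length + gapcount:   # gap falls after the aligned V region
                break
            seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
            gapcount += 1
        return seq_imgt

    # Toy example: the germline carries gaps at 0-based positions 2 and 3
    print(insert_imgt_gaps("ACGTACGT", 1, 8, "AC..GTACGT"))  # -> AC..GTACGT

The real function additionally records V_GERM_START_IMGT/V_GERM_LENGTH_IMGT; the sketch only shows the sequence transformation.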
diff -r 5ffd52fc35c4 -r bcec7bb4e089 change_o/define_clones.r
--- a/change_o/define_clones.r	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-args <- commandArgs(trailingOnly = TRUE)
-
-input=args[1]
-output=args[2]
-
-change.o = read.table(input, header=T, sep="\t", quote="", stringsAsFactors=F)
-
-freq = data.frame(table(change.o$CLONE))
-freq2 = data.frame(table(freq$Freq))
-
-freq2$final = as.numeric(freq2$Freq) * as.numeric(as.character(freq2$Var1))
-
-names(freq2) = c("Clone size", "Nr of clones", "Nr of sequences")
-
-write.table(x=freq2, file=output, sep="\t",quote=F,row.names=F,col.names=T)
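This little report builds a clone-size distribution: table(CLONE) counts sequences per clone, a second table counts clones per clone size, and size times count recovers the number of sequences in each size class. The same three-column table in pandas, as a sketch (column names mirror the R script; the toy data is invented):

    import pandas as pd

    clones = pd.Series([1, 1, 1, 2, 2, 3, 4, 4, 4, 4])   # CLONE id per sequence

    per_clone = clones.value_counts()                    # sequences per clone
    per_size = per_clone.value_counts().sort_index()     # clones per clone size

    report = pd.DataFrame({
        "Clone size": per_size.index,
        "Nr of clones": per_size.values,
    })
    report["Nr of sequences"] = report["Clone size"] * report["Nr of clones"]
    print(report)   # sizes 1..4, one clone each, 1/2/3/4 sequences respectively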
diff -r 5ffd52fc35c4 -r bcec7bb4e089 change_o/define_clones.sh
--- a/change_o/define_clones.sh	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-#!/bin/bash
-dir="$(cd "$(dirname "$0")" && pwd)"
-
-#define_clones.sh $input $noparse $scores $regions $out_file
-
-type=$1
-input=$2
-
-mkdir -p $PWD/outdir
-
-cp $input $PWD/input.tab #file has to have a ".tab" extension
-
-if [ "bygroup" == "$type" ] ; then
-    mode=$3
-    act=$4
-    model=$5
-    norm=$6
-    sym=$7
-    link=$8
-    dist=$9
-    output=${10}
-    output2=${11}
-
-    python3 $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
-    #/data/users/david/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
-    #/home/galaxy/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
-
-    Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
-else
-    method=$3
-    output=$4
-    output2=$5
-
-    python3 $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
-    #/data/users/david/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
-    #/home/galaxy/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
-
-    Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
-fi
-
-cp $PWD/outdir/output_clone-pass.tab $output
-
-rm -rf $PWD/outdir/
diff -r 5ffd52fc35c4 -r bcec7bb4e089 change_o/makedb.sh
--- a/change_o/makedb.sh	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
-#!/bin/bash
-dir="$(cd "$(dirname "$0")" && pwd)"
-
-input=$1
-noparse=$2
-scores=$3
-regions=$4
-output=$5
-
-if [ "true" == "$noparse" ] ; then
-    noparse="--noparse"
-else
-    noparse=""
-fi
-
-if [ "true" == "$scores" ] ; then
-    scores="--scores"
-else
-    scores=""
-fi
-
-if [ "true" == "$regions" ] ; then
-    regions="--regions"
-else
-    regions=""
-fi
-
-mkdir $PWD/outdir
-
-echo "makedb: $PWD/outdir"
-
-python3 $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
-#/data/users/david/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
-#/home/galaxy/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
-
-mv $PWD/outdir/output_db-pass.tab $output
-
-rm -rf $PWD/outdir/
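makedb.sh and define_clones.sh above follow the same wrapper pattern: Galaxy hands the tool its boolean parameters as the strings "true"/"false", and the script translates each into an optional CLI flag before calling the Python tool. A compact Python sketch of that translation (the function name and argument order are illustrative only):

    def galaxy_flags(noparse, scores, regions):
        """Map Galaxy's 'true'/'false' strings onto optional MakeDb.py flags."""
        mapping = [("--noparse", noparse), ("--scores", scores), ("--regions", regions)]
        return [flag for flag, value in mapping if value == "true"]

    # makedb-style invocation: argv = [input, noparse, scores, regions, output]
    print(galaxy_flags("true", "false", "true"))   # ['--noparse', '--regions']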
diff -r 5ffd52fc35c4 -r bcec7bb4e089 complete.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/complete.sh	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,67 @@
+#!/bin/bash
+set -e
+inputFiles=($1)
+outputDir=$3
+outputFile=$3/index.html #$1
+clonalType=$4
+species=$5
+locus=$6
+filterproductive=$7
+clonality_method=$8
+
+html=$2
+dir="$(cd "$(dirname "$0")" && pwd)"
+array=("$@")
+echo "<html><h3>Progress</h3><table><tr><td>info</td></tr>" > $html
+echo "<tr><td>-----------------------------------</td></tr>" >> $html
+
+#mkdir $PWD/igblastdatabase
+#unzip $dir/database.zip -d $PWD/igblastdatabase/
+#export IGDATA=$PWD/igblastdatabase/
+
+id=""
+forwardSlash="/"
+mergerInput=()
+echo "Before loop"
+count=1
+for current in "${inputFiles[@]}"
+do
+    if [[ "$current" != *"$forwardSlash"* ]]; then
+        id="$current"
+        mergerInput+=($id)
+        count=1
+        continue
+    fi
+    echo "working on $current"
+    fileName=$(basename $current)
+    fileName="${fileName%.*}"
+    parsedFileName="$PWD/$fileName.parsed"
+    f=$(file $current)
+    zipType="Zip archive"
+    zxType="XZ compressed data"
+    if [[ "$f" == *"$zipType"* ]] || [[ "$f" == *"$zxType"* ]]
+    then
+        echo "<tr><td>Sample $count of patient $id is an archive file, using IMGT Loader</td></tr>" >> $html
+        fileName=$(basename $current)
+        bash ${dir}/imgt_loader/imgt_loader.sh $current $parsedFileName "${fileName}"
+    else
+        echo "<tr><td>Sample $count of patient $id is not a zip file so assuming fasta/fastq, using igBLASTn</td></tr>" >> $html
+        bash ${dir}/igblast/igblast.sh $current $species $locus $parsedFileName
+    fi
+    mergerInput+=($parsedFileName)
+    count=$((count+1))
+done
+
+echo "<tr><td>-----------------------------------</td></tr>" >> $html
+echo "<tr><td>merging</td></tr>" >> $html
+
+bash $dir/experimental_design/experimental_design.sh ${mergerInput[*]} $PWD/merged.txt
+
+echo "<tr><td>done</td></tr>" >> $html
+echo "<tr><td>-----------------------------------</td></tr>" >> $html
+echo "<tr><td>plotting</td></tr>" >> $html
+
+echo "after ED"
+
+bash $dir/report_clonality/r_wrapper.sh $PWD/merged.txt $2 $outputDir $clonalType "$species" "$locus" $filterproductive $clonality_method
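complete.sh distinguishes donor IDs from sample files purely by the presence of a "/": Galaxy dataset paths always contain one, while the free-text IDs normally do not, so the flat argument list can be regrouped into per-donor batches. The same grouping rule in a few lines of Python (a sketch; it assumes, as the script does, that IDs never contain a slash):

    def group_by_donor(args):
        """Split a flat [id, file, file, id, file, ...] list into {id: [files]}."""
        groups, current = {}, None
        for arg in args:
            if "/" not in arg:      # no slash: a donor ID, not a file path
                current = arg
                groups[current] = []
            else:                   # a path: belongs to the most recent ID
                groups[current].append(arg)
        return groups

    print(group_by_donor(["pt1", "/data/s1.fasta", "/data/s2.txz", "pt2", "/data/s3.fasta"]))
    # {'pt1': ['/data/s1.fasta', '/data/s2.txz'], 'pt2': ['/data/s3.fasta']}

experimental_design.r (added below) applies exactly the same slash test on its own argument list.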
diff -r 5ffd52fc35c4 -r bcec7bb4e089 complete_immunerepertoire.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/complete_immunerepertoire.xml	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,203 @@
+<tool id="complete_immunerepertoire_igg" name="Immune Repertoire pipeline" version="1.0">
+  <description> </description>
+  <command interpreter="bash">
+complete.sh "
+#for $i, $f in enumerate($patients)
+  "${f.id}"
+  #for $j, $g in enumerate($f.samples)
+    ${g.sample}
+  #end for
+#end for
+" $out_file $out_file.files_path "$clonaltype"
+#if $gene_selection.source == "imgtdb"
+  "${gene_selection.species}" "${gene_selection.locus}" $filterproductive ${clonality_method}
+#else
+  "custom" "${gene_selection.vgenes};${gene_selection.dgenes};${gene_selection.jgenes}" $filterproductive $clonality_method
+#end if
+  </command>
+  <inputs>
+    <repeat name="patients" title="Donor" min="1" default="1">
+      <repeat name="samples" title="Sample" min="1" default="1">
+        <param name="sample" type="data" label="Sample to Process" />
+      </repeat>
+      <param name="id" type="text" label="ID" />
+    </repeat>
+    <param name="clonaltype" type="select" label="Clonal Type Definition">
+      <option value="none">Don't remove duplicates based on clonaltype</option>
+      <option value="Top.V.Gene,CDR3.Seq">Top.V.Gene, CDR3 (AA)</option>
+      <option value="Top.V.Gene,CDR3.Seq.DNA">Top.V.Gene, CDR3 (nt)</option>
+      <option value="Top.V.Gene,Top.J.Gene,CDR3.Seq">Top.V.Gene, Top.J.Gene, CDR3 (AA)</option>
+      <option value="Top.V.Gene,Top.J.Gene,CDR3.Seq.DNA">Top.V.Gene, Top.J.Gene, CDR3 (nt)</option>
+      <option value="Top.V.Gene,Top.D.Gene,Top.J.Gene,CDR3.Seq.DNA">Top.V.Gene, Top.D.Gene, Top.J.Gene, CDR3 (nt)</option>
+    </param>
+
+    <conditional name="gene_selection" >
+      <param name="source" type="select" label="Order of V(D)J genes in graphs" help="" >
+        <option value="imgtdb" selected="true">IMGT-DB</option>
+        <option value="custom">User defined</option>
+      </param>
+      <when value="imgtdb">
+        <param name="species" type="select" label="Species">
+          <option value="Homo sapiens functional">Homo sapiens functional</option>
+          <option value="Homo sapiens">Homo sapiens</option>
+          <option value="Homo sapiens non-functional">Homo sapiens non-functional</option>
+          <option value="Bos taurus">Bos taurus</option>
+          <option value="Bos taurus functional">Bos taurus functional</option>
+          <option value="Bos taurus non-functional">Bos taurus non-functional</option>
+          <option value="Camelus dromedarius">Camelus dromedarius</option>
+          <option value="Camelus dromedarius functional">Camelus dromedarius functional</option>
+          <option value="Camelus dromedarius non-functional">Camelus dromedarius non-functional</option>
+          <option value="Canis lupus familiaris">Canis lupus familiaris</option>
+          <option value="Canis lupus familiaris functional">Canis lupus familiaris functional</option>
+          <option value="Canis lupus familiaris non-functional">Canis lupus familiaris non-functional</option>
+          <option value="Danio rerio">Danio rerio</option>
+          <option value="Danio rerio functional">Danio rerio functional</option>
+          <option value="Danio rerio non-functional">Danio rerio non-functional</option>
+          <option value="Macaca mulatta">Macaca mulatta</option>
+          <option value="Macaca mulatta functional">Macaca mulatta functional</option>
+          <option value="Macaca mulatta non-functional">Macaca mulatta non-functional</option>
+          <option value="Mus musculus">Mus musculus</option>
+          <option value="Mus musculus functional">Mus musculus functional</option>
+          <option value="Mus musculus non-functional">Mus musculus non-functional</option>
+          <option value="Mus spretus">Mus spretus</option>
+          <option value="Mus spretus functional">Mus spretus functional</option>
+          <option value="Mus spretus non-functional">Mus spretus non-functional</option>
+          <option value="Oncorhynchus mykiss">Oncorhynchus mykiss</option>
+          <option value="Oncorhynchus mykiss functional">Oncorhynchus mykiss functional</option>
+          <option value="Oncorhynchus mykiss non-functional">Oncorhynchus m
[...]
+    <data format="html" name="out_file" />
+  </outputs>
+  <requirements>
+    <requirement type="package" version="0.6">igblastwrp</requirement>
+    <requirement type="package" version="3.3">weblogo</requirement>
+    <!--<requirement type="package" version="0.20">circostools</requirement>-->
+  </requirements>
+  <help>
+    The entire Immune Repertoire pipeline as a single tool, input several FASTA files or IMGT zip/txz files, give them an ID and it will BLAST/parse, merge and plot them.
+
+    .. class:: warningmark
+
+Custom gene ordering based on position on genome: 
+
+**Human**
+
+IGH::
+
+ V:
+ IGHV7-81,IGHV3-74,IGHV3-73,IGHV3-72,IGHV3-71,IGHV2-70,IGHV1-69,IGHV3-66,IGHV3-64,IGHV4-61,IGHV4-59,IGHV1-58,IGHV3-53,IGHV3-52,IGHV5-a,IGHV5-51,IGHV3-49,IGHV3-48,IGHV3-47,IGHV1-46,IGHV1-45,IGHV3-43,IGHV4-39,IGHV3-35,IGHV4-34,IGHV3-33,IGHV4-31,IGHV4-30-4,IGHV4-30-2,IGHV3-30-3,IGHV3-30,IGHV4-28,IGHV2-26,IGHV1-24,IGHV3-23,IGHV3-22,IGHV3-21,IGHV3-20,IGHV3-19,IGHV1-18,IGHV3-15,IGHV3-13,IGHV3-11,IGHV3-9,IGHV1-8,IGHV3-7,IGHV2-5,IGHV7-4-1,IGHV4-4,IGHV4-b,IGHV1-3,IGHV1-2,IGHV6-1
+ D:
+ IGHD1-1,IGHD2-2,IGHD3-3,IGHD6-6,IGHD1-7,IGHD2-8,IGHD3-9,IGHD3-10,IGHD4-11,IGHD5-12,IGHD6-13,IGHD1-14,IGHD2-15,IGHD3-16,IGHD4-17,IGHD5-18,IGHD6-19,IGHD1-20,IGHD2-21,IGHD3-22,IGHD4-23,IGHD5-24,IGHD6-25,IGHD1-26,IGHD7-27
+ J:
+ IGHJ1,IGHJ2,IGHJ3,IGHJ4,IGHJ5,IGHJ6
+
+
+IGK::
+
+ V:
+ IGKV3D-7,IGKV1D-8,IGKV1D-43,IGKV3D-11,IGKV1D-12,IGKV1D-13,IGKV3D-15,IGKV1D-16,IGKV1D-17,IGKV3D-20,IGKV2D-26,IGKV2D-28,IGKV2D-29,IGKV2D-30,IGKV1D-33,IGKV1D-39,IGKV2D-40,IGKV2-40,IGKV1-39,IGKV1-33,IGKV2-30,IGKV2-29,IGKV2-28,IGKV1-27,IGKV2-24,IGKV3-20,IGKV1-17,IGKV1-16,IGKV3-15,IGKV1-13,IGKV1-12,IGKV3-11,IGKV1-9,IGKV1-8,IGKV1-6,IGKV1-5,IGKV5-2,IGKV4-1
+ J:
+ IGKJ1,IGKJ2,IGKJ3,IGKJ4,IGKJ5
+
+
+IGL::
+
+ V:
+ IGLV4-69,IGLV8-61,IGLV4-60,IGLV6-57,IGLV5-52,IGLV1-51,IGLV9-49,IGLV1-47,IGLV7-46,IGLV5-45,IGLV1-44,IGLV7-43,IGLV1-41,IGLV1-40,IGLV5-39,IGLV5-37,IGLV1-36,IGLV3-27,IGLV3-25,IGLV2-23,IGLV3-22,IGLV3-21,IGLV3-19,IGLV2-18,IGLV3-16,IGLV2-14,IGLV3-12,IGLV2-11,IGLV3-10,IGLV3-9,IGLV2-8,IGLV4-3,IGLV3-1
+ J:
+ IGLJ1,IGLJ2,IGLJ3,IGLJ6,IGLJ7
+
+
+TRB::
+
+ V:
+ TRBV2,TRBV3-1,TRBV4-1,TRBV5-1,TRBV6-1,TRBV4-2,TRBV6-2,TRBV4-3,TRBV6-3,TRBV7-2,TRBV6-4,TRBV7-3,TRBV9,TRBV10-1,TRBV11-1,TRBV10-2,TRBV11-2,TRBV6-5,TRBV7-4,TRBV5-4,TRBV6-6,TRBV5-5,TRBV7-6,TRBV5-6,TRBV6-8,TRBV7-7,TRBV6-9,TRBV7-8,TRBV5-8,TRBV7-9,TRBV13,TRBV10-3,TRBV11-3,TRBV12-3,TRBV12-4,TRBV12-5,TRBV14,TRBV15,TRBV16,TRBV18,TRBV19,TRBV20-1,TRBV24-1,TRBV25-1,TRBV27,TRBV28,TRBV29-1,TRBV30
+ D:
+ TRBD1,TRBD2
+ J:
+ TRBJ1-1,TRBJ1-2,TRBJ1-3,TRBJ1-4,TRBJ1-5,TRBJ1-6,TRBJ2-1,TRBJ2-2,TRBJ2-3,TRBJ2-4,TRBJ2-5,TRBJ2-6,TRBJ2-7
+
+
+TRA::
+
+ V:
+ TRAV1-1,TRAV1-2,TRAV2,TRAV3,TRAV4,TRAV5,TRAV6,TRAV7,TRAV8-1,TRAV9-1,TRAV10,TRAV12-1,TRAV8-2,TRAV8-3,TRAV13-1,TRAV12-2,TRAV8-4,TRAV13-2,TRAV14/DV4,TRAV9-2,TRAV12-3,TRAV8-6,TRAV16,TRAV17,TRAV18,TRAV19,TRAV20,TRAV21,TRAV22,TRAV23/DV6,TRAV24,TRAV25,TRAV26-1,TRAV27,TRAV29/DV5,TRAV30,TRAV26-2,TRAV34,TRAV35,TRAV36/DV7,TRAV38-1,TRAV38-2/DV8,TRAV39,TRAV40,TRAV41
+ J:
+ TRAJ57,TRAJ56,TRAJ54,TRAJ53,TRAJ52,TRAJ50,TRAJ49,TRAJ48,TRAJ47,TRAJ46,TRAJ45,TRAJ44,TRAJ43,TRAJ42,TRAJ41,TRAJ40,TRAJ39,TRAJ38,TRAJ37,TRAJ36,TRAJ34,TRAJ33,TRAJ32,TRAJ31,TRAJ30,TRAJ29,TRAJ28,TRAJ27,TRAJ26,TRAJ24,TRAJ23,TRAJ22,TRAJ21,TRAJ20,TRAJ18,TRAJ17,TRAJ16,TRAJ15,TRAJ14,TRAJ13,TRAJ12,TRAJ11,TRAJ10,TRAJ9,TRAJ8,TRAJ7,TRAJ6,TRAJ5,TRAJ4,TRAJ3
+
+
+TRG::
+
+ V:
+ TRGV9,TRGV8,TRGV5,TRGV4,TRGV3,TRGV2
+ J:
+ TRGJ2,TRGJP2,TRGJ1,TRGJP1
+
+
+TRD::
+
+ V:
+ TRDV1,TRDV2,TRDV3
+ D:
+ TRDD1,TRDD2,TRDD3
+ J:
+ TRDJ1,TRDJ4,TRDJ2,TRDJ3
+
+
+**Mouse**
+
+TRB::
+
+ V:
+ TRBV1,TRBV2,TRBV3,TRBV4,TRBV5,TRBV12-1,TRBV13-1,TRBV12-2,TRBV13-2,TRBV13-3,TRBV14,TRBV15,TRBV16,TRBV17,TRBV19,TRBV20,TRBV23,TRBV24,TRBV26,TRBV29,TRBV30,TRBV31
+ D:
+ TRBD1,TRBD2
+ J:
+ TRBJ1-1,TRBJ1-2,TRBJ1-3,TRBJ1-4,TRBJ1-5,TRBJ2-1,TRBJ2-2,TRBJ2-3,TRBJ2-4,TRBJ2-5,TRBJ2-6,TRBJ2-7
+
+  </help>
+
+</tool>
diff -r 5ffd52fc35c4 -r bcec7bb4e089 datatypes_conf.xml
--- a/datatypes_conf.xml	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<datatypes>
-    <registration>
-        <datatype extension="imgt_archive" type="galaxy.datatypes.binary:CompressedArchive" display_in_upload="True" subclass="True"/>
-    </registration>
-</datatypes>
diff -r 5ffd52fc35c4 -r bcec7bb4e089 experimental_design/experimental_design.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/experimental_design/experimental_design.py	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,50 @@
+import sys
+import pandas as pd
+
+def main():
+    patients = {}
+    files = []
+    sample_id = sys.argv[1]
+    imgt_files = 0
+    blast_files = 0
+    #organize files
+    for arg in sys.argv[2:-2]:
+        if arg.find("/") == -1:
+            patients[sample_id] = files
+            files = []
+            sample_id = arg
+        else:
+            df = pd.read_csv(arg, sep="\t", dtype=object, error_bad_lines=False)
+            if "Functionality" in list(df.columns.values):
+                df["VDJ Frame"][df["Functionality"] != "productive"] = "In-frame with stop codon"
+                imgt_files += 1
+            else:
+                blast_files += 1
+            files.append(df)
+    patients[sample_id] = files
+    columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length',
+               u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Functionality', 'V-REGION identity %',
+               'V-REGION identity nt', 'D-REGION reading frame', 'AA JUNCTION', 'Functionality comment', 'Sequence', 'FR1-IMGT', 'FR2-IMGT',
+               'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION', 'J-REGION', 'FR4-IMGT', 'P3V-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb', 'P3D-nt nb', 'N2-REGION-nt nb',
+               'P5J-nt nb', '3V-REGION trimmed-nt nb', '5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb', '5J-REGION trimmed-nt nb', u'Sample', u'Replicate']
+    if "N-REGION-nt nb" in files[0].columns:
+        columns.insert(30, "N-REGION-nt nb")
+    if blast_files != 0:
+        print "Has a parsed blastn file, using limited columns."
+        columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Sample', u'Replicate']
+
+    result = None
+    for patient_id, samples in patients.iteritems():
+        count = 1
+        for sample in samples:
+            sample['Sample'] = patient_id
+            sample['Replicate'] = str(count)
+            count += 1
+            if result is None:
+                result = sample[columns]
+            else:
+                result = result.append(sample[columns])
+    result.to_csv(sys.argv[-1], sep="\t", index=False, index_label="index")
+
+if __name__ == "__main__":
+    main()
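The merge step above annotates every parsed sample with its donor ID and a running replicate number before concatenating everything into one table. With a current pandas this bookkeeping is a tagged concat; a sketch under that assumption (Python 3 / pandas >= 1.0, unlike the Python 2 script itself):

    import pandas as pd

    patients = {
        "pt1": [pd.DataFrame({"ID": [1, 2]}), pd.DataFrame({"ID": [3]})],
        "pt2": [pd.DataFrame({"ID": [4]})],
    }

    tagged = []
    for patient_id, samples in patients.items():
        for replicate, sample in enumerate(samples, start=1):
            # Every row gets its donor ID and replicate number, like the script does
            tagged.append(sample.assign(Sample=patient_id, Replicate=str(replicate)))

    merged = pd.concat(tagged, ignore_index=True)
    print(merged)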
diff -r 5ffd52fc35c4 -r bcec7bb4e089 experimental_design/experimental_design.r
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/experimental_design/experimental_design.r	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,38 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+print(args)
+
+inputs = args[1:(length(args) - 1)]
+output = args[length(args)]
+
+current.id = ""
+counter = 1
+
+result = NULL
+
+for(current in inputs){
+    if(grepl("/", current)){ #its a path to a file
+        print(paste("Adding file", counter, "to", current.id))
+        dat = read.table(current, sep="\t", header=T, quote="", fill=T)
+
+        #IMGT check
+
+        dat$Sample = current.id
+        dat$Replicate = counter
+
+        if(is.null(result)){
+            result = dat[NULL,]
+        }
+
+        result = rbind(result, dat)
+
+        counter = counter + 1
+
+    } else { #its an ID of a patient
+        print(paste("New patient", current))
+        current.id = current
+        counter = 1
+    }
+}
+
+write.table(result, output, sep="\t", quote=F, row.names=F, col.names=T)
diff -r 5ffd52fc35c4 -r bcec7bb4e089 experimental_design/experimental_design.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/experimental_design/experimental_design.sh	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,4 @@
+
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+Rscript --verbose $dir/experimental_design.r $@ 2>&1
diff -r 5ffd52fc35c4 -r bcec7bb4e089 gene_identification.py
--- a/gene_identification.py	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,226 +0,0 @@
-import re
-import argparse
-import time
-starttime= int(time.time() * 1000)
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file")
-parser.add_argument("--output", help="The annotated output file to be merged back with the summary file")
-
-args = parser.parse_args()
-
-infile = args.input
-#infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt"
-output = args.output
-#outfile = "identified.txt"
-
-dic = dict()
-total = 0
-
-
-first = True
-IDIndex = 0
-seqIndex = 0
-
-with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
-    for line in f:
-        total += 1
-        linesplt = line.split("\t")
-        if first:
-            print "linesplt", linesplt
-            IDIndex = linesplt.index("Sequence ID")
-            seqIndex = linesplt.index("Sequence")
-            first = False
-            continue
-
-        ID = linesplt[IDIndex]
-        if len(linesplt) < 28: #weird rows without a sequence
-            dic[ID] = ""
-        else:
-            dic[ID] = linesplt[seqIndex]
-
-print "Number of input sequences:", len(dic)
-
-#old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc
-#old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag
-
-#lambda/kappa reference sequence
-searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",
-                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",
-                 "ce": "gcctccacacagagcccatccgtcttccccttgacccgctgctgcaaaaacattccctcc",
-                 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence
-
-compiledregex = {"ca": [],
-                 "cg": [],
-                 "ce": [],
-                 "cm": []}
-
-#lambda/kappa reference sequence variable nucleotides
-ca1 = {38: 't', 39: 'g', 48: 'a', 49: 'g', 51: 'c', 68: 'a', 73: 'c'}
-ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'}
-cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
-cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'}
-cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
-cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'}
-
-#remove last snp for shorter cg sequence --- note, also change varsInCG
-del cg1[132]
-del cg2[132]
-del cg3[132]
-del cg4[132]
-
-#reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap
-chunklength = 8
-
-#create the chunks of the reference sequence with regular expressions for the variable nucleotides
-for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength / 2):
-    pos = i
-    chunk = searchstrings["ca"][i:i+chunklength]
-    result = ""
-    varsInResult = 0
-    for c in chunk:
-        if pos in ca1.keys():
-            varsInResult += 1
-            result += "[" + ca1[pos] + ca2[pos] + "]"
-        else:
-            result += c
-        pos += 1
-    compiledregex["ca"].append((re.compile(result), varsInResult))
-
-for i in range(0, len(searchstrings["cg"]) - chunklength, chunklength / 2):
-    pos = i
-    chunk = searchstrings["cg"][i:i+chunklength]
-    result = ""
-    varsInResult = 0
-    for c in chunk:
-        if pos in cg1.keys():
-            varsInResult += 1
-            result += "[" + "".join(set([cg1[pos], cg2[pos], cg3[pos], cg4[pos]])) + "]"
-        else:
-            result += c
-        pos += 1
-    compiledregex["cg"].append((re.compile(result), varsInResult))
-
-for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2):
-    compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False))
-
-for i in range(0, len(searchstrings["ce"]) - chunklength + 1, chunklength / 2):
-    compiledregex["ce"].append((re.compile(searchstrings["ce"][i:i+chunklength]), False))
-
-def removeAndReturnMaxIndex(x): #simplifies a list compr
[...]
-                    [...]kstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]])
-                    else: #key == "cm" #no variable regions in 'cm' or 'ce'
-                        pass
-                break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped
-            else: #only runs if there were no hits
-                continue
-            #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i]
-            currentIDHits[key + "_hits"] += 1
-        start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1 - start_zero) for x in range(5) if len(start) > 0 and max(start) > 1])
-        #start_location[ID + "_" + key] = str(start.index(max(start)))
-
-
-varsInCA = float(len(ca1.keys()) * 2)
-varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice
-varsInCM = 0
-varsInCE = 0
-
-def round_int(val):
-    return int(round(val))
-
-first = True
-seq_write_count=0
-with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
-    with open(output, 'w') as o:
-        for line in f:
-            total += 1
-            if first:
-                o.write("Sequence ID\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
-                first = False
-                continue
-            linesplt = line.split("\t")
-            if linesplt[2] == "No results":
-                pass
-            ID = linesplt[1]
-            currentIDHits = hits[ID]
-            possibleca = float(len(compiledregex["ca"]))
-            possiblecg = float(len(compiledregex["cg"]))
-            possiblecm = float(len(compiledregex["cm"]))
-            possiblece = float(len(compiledregex["ce"]))
-            cahits = currentIDHits["ca_hits"]
-            cghits = currentIDHits["cg_hits"]
-            cmhits = currentIDHits["cm_hits"]
-            cehits = currentIDHits["ce_hits"]
-            if cahits >= cghits and cahits >= cmhits and cahits >= cehits: #its a ca gene
-                ca1hits = currentIDHits["ca1"]
-                ca2hits = currentIDHits["ca2"]
-                if ca1hits >= ca2hits:
-                    o.write(ID + "\tIGA1\t" + str(round_int(ca1hits / varsInCA * 100)) + "\t" + str(round_int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
-                else:
-                    o.write(ID + "\tIGA2\t" + str(round_int(ca2hits / varsInCA * 100)) + "\t" + str(round_int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
-            elif cghits >= cahits and cghits >= cmhits and cghits >= cehits: #its a cg gene
-                cg1hits = currentIDHits["cg1"]
-                cg2hits = currentIDHits["cg2"]
-                cg3hits = currentIDHits["cg3"]
-                cg4hits = currentIDHits["cg4"]
-                if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene
-                    o.write(ID + "\tIGG1\t" + str(round_int(cg1hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-                elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene
-                    o.write(ID + "\tIGG2\t" + str(round_int(cg2hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-                elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene
-                    o.write(ID + "\tIGG3\t" + str(round_int(cg3hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-                else: #cg4 gene
-                    o.write(ID + "\tIGG4\t" + str(round_int(cg4hits / varsInCG * 100)) + "\t" + str(round_int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
-            else: #its a cm or ce gene
-                if cmhits >= cehits:
-                    o.write(ID + "\tIGM\t100\t" + str(round_int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n")
-                else:
-                    o.write(ID + "\tIGE\t100\t" + str(round_int(cehits / possiblece * 100)) + "\t" + start_location[ID + "_ce"] + "\n")
-            seq_write_count += 1
-
-print "Time: %i" % (int(time.time() * 1000) - starttime)
-
-print "Number of sequences written to file:", seq_write_count
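The removed subclass typer never aligns: it slices each constant-region reference into overlapping 8-nt chunks (4-nt step), turns subclass-discriminating positions into character classes like [tg], and then counts how many chunk regexes hit each read. A small sketch of the chunk-compilation idea (toy reference and SNP table; the real script does this for the ca/cg/cm/ce references above):

    import re

    reference = "catccccgaccagcccc"      # toy constant-region reference
    variants = {3: "ct", 9: "cg"}        # position -> alternatives across subclasses

    chunklength = 8
    chunks = []
    for i in range(0, len(reference) - chunklength, chunklength // 2):
        pattern = ""
        for pos in range(i, i + chunklength):
            if pos in variants:
                pattern += "[" + variants[pos] + "]"   # variable nt -> character class
            else:
                pattern += reference[pos]
        chunks.append(re.compile(pattern))

    read = "cattcccgagcagcccc"           # same toy sequence with both SNPs flipped
    hits = sum(1 for c in chunks if c.search(read))
    print(hits, "of", len(chunks), "chunks match")   # 3 of 3, despite the two SNPs

Counting chunk hits rather than requiring one full-length match is what makes the approach tolerant of sequencing errors and partial reads.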
diff -r 5ffd52fc35c4 -r bcec7bb4e089 igblast/igblast.r
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/igblast/igblast.r	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,56 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+infile=args[1]
+outfile=args[2]
+
+blasted = read.table(infile, header=T, sep="\t", fill=T, stringsAsFactors=F, comment.char="")
+
+blasted$ID = 1:nrow(blasted)
+blasted$VDJ.Frame = "Out-of-frame"
+
+search = blasted$inFrame == "true" & blasted$noStop == "false"
+if(sum(search) > 0){
+    blasted[search ,]$VDJ.Frame = "In-frame with stop codon"
+}
+
+search = blasted$inFrame == "true" & blasted$noStop == "true"
+if(sum(search) > 0){
+    blasted[search ,]$VDJ.Frame = "In-frame"
+}
+
+blasted$Top.V.Gene = blasted$vSegment
+blasted$Top.D.Gene = blasted$dSegment
+blasted$Top.J.Gene = blasted$jSegment
+blasted$CDR1.Seq = blasted$cdr1aa
+blasted$CDR1.Length = nchar(blasted$CDR1.Seq)
+blasted$CDR2.Seq = blasted$cdr2aa
+blasted$CDR2.Length = nchar(blasted$CDR2.Seq)
+blasted$CDR3.Seq = blasted$cdr3aa
+blasted$CDR3.Length = nchar(blasted$CDR3.Seq)
+blasted$CDR3.Seq.DNA = blasted$cdr3nt
+blasted$CDR3.Length.DNA = nchar(blasted$CDR3.Seq.DNA)
+blasted$Strand = "+/-"
+blasted$CDR3.Found.How = "found"
+
+search = blasted$cdr3nt == ""
+if(sum(search) > 0){
+    blasted[search,]$CDR3.Found.How = "NOT_FOUND"
+}
+
+blasted$AA.JUNCTION = blasted$CDR3.Seq
+
+n = c("X.reads_count", "ID", "VDJ.Frame", "Top.V.Gene", "Top.D.Gene", "Top.J.Gene", "CDR1.Seq", "CDR1.Length", "CDR2.Seq", "CDR2.Length", "CDR3.Seq", "CDR3.Length", "CDR3.Seq.DNA", "CDR3.Length.DNA", "Strand", "CDR3.Found.How", "Functionality", "AA.JUNCTION")
+
+n[!(n %in% names(blasted))]
+
+blasted = blasted[,c("X.reads_count", "ID", "VDJ.Frame", "Top.V.Gene", "Top.D.Gene", "Top.J.Gene", "CDR1.Seq", "CDR1.Length", "CDR2.Seq", "CDR2.Length", "CDR3.Seq", "CDR3.Length", "CDR3.Seq.DNA", "CDR3.Length.DNA", "Strand", "CDR3.Found.How", "AA.JUNCTION")]
+
+names(blasted) = c("frequency.count", "ID", "VDJ Frame", "Top V Gene", "Top D Gene", "Top J Gene", "CDR1 Seq", "CDR1 Length", "CDR2 Seq", "CDR2 Length", "CDR3 Seq", "CDR3 Length", "CDR3 Seq DNA", "CDR3 Length DNA", "Strand", "CDR3 Found How", "AA JUNCTION")
+
+#duplicate rows based on frequency.count
+blasted = blasted[rep(seq_len(nrow(blasted)), blasted$frequency.count),]
+blasted$ID = 1:nrow(blasted)
+
+blasted = blasted[,c("ID", "VDJ Frame", "Top V Gene", "Top D Gene", "Top J Gene", "CDR1 Seq", "CDR1 Length", "CDR2 Seq", "CDR2 Length", "CDR3 Seq", "CDR3 Length", "CDR3 Seq DNA", "CDR3 Length DNA", "Strand", "CDR3 Found How", "AA JUNCTION")]
+
+write.table(blasted, outfile, quote=F, sep="\t", row.names=F, col.names=T)
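igblast.r saves work by letting igblastwrp report each unique read once with a reads_count, then re-expands rows by that count so downstream clonality tools see one row per read. The same expansion in pandas, as a sketch (the 'freq' column name is invented for the example):

    import pandas as pd

    df = pd.DataFrame({"cdr3": ["CARW", "CAKD"], "freq": [3, 1]})

    # Repeat each row 'freq' times, then renumber IDs, exactly like the R script
    expanded = df.loc[df.index.repeat(df["freq"])].reset_index(drop=True)
    expanded["ID"] = range(1, len(expanded) + 1)
    print(expanded)   # 4 rows: CARW three times, CAKD once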
diff -r 5ffd52fc35c4 -r bcec7bb4e089 igblast/igblast.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/igblast/igblast.sh	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,15 @@
+set -e
+
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+input=$1
+species=$2
+locus=$3
+output=$4
+
+
+echo "$input $species $locus $output"
+
+java -Xmx64G -jar $IGBLASTWRP/igblastwrp.jar -p 4 -S $species -R $locus ${input} $PWD/blasted_output 2>&1
+
+Rscript --verbose $dir/igblast.r "$PWD/blasted_output.L2.txt" "$output" 2>&1
diff -r 5ffd52fc35c4 -r bcec7bb4e089 igblastparser/igparse.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/igblastparser/igparse.pl	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,1252 @@
+#!/usr/bin/perl
+=head1 IGBLAST_simple.pl
+
+This version (1.4) has been heavily adapted since the original program was first created back in October 2012.
+Bas Horsman (EMC, Rotterdam, The Netherlands) has contributed with minor - though important - code changes.
+
+From V 1.2 onwards a 'Change Log' is included at the end of the program
+
+=head2 Usage
+
+Requires no modules in general use; the Data::Dumper (supplied as part of the Perl Core module set) might be useful for debugging/adjustment 
+as it allows inspection of the data stores.
+
+The program takes a text file of the 
+
+ ./IGBLAST_simple.pl igBLASTOutput.txt <-optional: index of record to process->
+ 
+Supply the text version of the igBLAST report in the format as in the example below.
+The extra command line argument is the record number (aka. BLAST report) to process. 
+If 0 or absent all are processed, if supplied that record (base 1) is processed and the program dies afterwards. 
+
+=head2 Example Input
+
+A standard igBLAST record or set of them in a file; this being typical:
+
+ BLASTN 2.2.27+
+
+
+Reference: Stephen F. Altschul, Thomas L. Madden, Alejandro A.
+Schaffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J.
+Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of
+protein database search programs", Nucleic Acids Res. 25:3389-3402.
+
+
+
+Database: human_gl_V; human_gl_D; human_gl_J
+ 674 sequences; 179,480 total letters
+
+
+
+Query= HL67IUI01D26LR length=433 xy=1559_1437 region=1
+run=R_2012_04_10_11_57_56_
+
+Length=433
+                                                       Score     E
+Sequences producing significant alignments:           (Bits)  Value
+
+lcl|IGHV3-30*04                                          330   2e-92
+lcl|IGHV3-30-3*01                                        330   2e-92
+lcl|IGHV3-30*01                                          327   2e-91
+lcl|IGHD3-16*01                                         14.4      11
+lcl|IGHD3-16*02                                         14.4      11
+lcl|IGHD1-14*01                                         12.4      43
+lcl|IGHJ4*02                                            78.3   1e-18
+lcl|IGHJ5*02                                            70.3   4e-16
+lcl|IGHJ4*01                                            68.3   2e-15
+
+
+Domain classification requested: imgt
+
+
+V(D)J rearrangement summary for query sequence (Top V gene match, Top D gene match, Top J gene match, Chain type, V-J Frame, Strand):
+IGHV3-30*04	IGHD3-16*01	IGHJ4*02	VH	In-frame	+
+
+V(D)J junction details (V end, V-D junction, D region, D-J junction, J start). Note that possible overlapping nucleotides at VDJ junction (i.e, nucleotides that could be assigned to either joining gene segment) are indicated in parentheses (i.e., (TACT)) but are not included under V, D, or J gene itself
+AGAGA	TATGAGCCCCATCATGACA	ACGTTTG	CCGGAA	ACTAC	
+
+Alignment summary between query and top germline V gene hit (from, to, length, matches, mismatches, gaps, percent identity)
+FWR1	27	38	12	11	1	0	91.7
+CDR1	39	62	24	22	2	0	91.7
+FWR2	63	113	51	50	1	0	98
+CDR2	114	137	24	23	1	0	95.8
+FWR3	138	251	114	109	5	0	95.6
+CDR3 (V region only)	252	259	8	7	1	0	87.5
+Total	N/A	N/A	233	222	11	0	95.3
+
+
+Alignments
+
+                                     <----FWR1--><----------CDR1--------><-----------------------FWR2------
+                                      W  A  A  S  G  F  T  F  N  T  Y  A  V  H  W  V  R  Q  A  P  G  K  G
+ Query_1                        27   TGGGCAGCCTCTGGATTCACCTTCAATACCTATGCTGTGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGC 96
+V 95.3% (222/233) IGHV3-30*04   64   ..T......................G..G.......A................................. 133
+                                      C  A  A  S  G  F  T  F  S  S  Y  A  M  H  W 
[...]
+[...]plit (/\s+/,$InfoPanel);	#Split on spaces
+##Enable if you need to know what we just found: 
+#		#print "D: Fields are (Germclass, PID, PID_Counts, Allele) \t$Germclass, $PID, $PID_Counts, $Allele\n";
+#		#A reality check: we should have an Allele - or some text here.
+#		unless (defined $Allele && $Allele ne "")
+#			{	warn "Cannot get Allele for Line '$C_Line' - implies improper parsing: '",substr ($Lines[$C_Line],0,15),"...'\n";	}
+#		if (exists ($Alginments {$Germclass}{$Allele}))
+#			{	$Alginments {$Germclass}{$Allele}	=	$Alginments {$Germclass}{$Allele}.$CurrentAASequence;	}	#Carry on adding
+#		else	#more work needed as we need to 'pad' the sequence with fake gap characters)
+#			{
+##Do we still need this padding? I don't think so
+#
+#
+#			my $PaddingChars = ($ThisQueryStart-$Query_Start);
+#			print "D: New gene found: need to pad it with ($ThisQueryStart-$Query_Start) i.e. '$PaddingChars' characters\n";
+#			#To help testing, calculate this first:
+#			my $PaddingString = " "x $PaddingChars;
+#			$Alginments {$Germclass}{$Allele}	=	$CurrentAASequence;	
+#			}
+#		next
+
+=head3 Demonstration of Pattern match positions
+
+my $Text = "12345TTT TTAAAAA";
+my $TestPat = "TTT\\s+TT";
+(my $Result)= $Text =~ m/$TestPat/;
+print "D: Two vars are: - = ",$-[0], " & + =", $+[0]," for test pattern '$TestPat'\n";
+
+sub printCDR3 {
+
+=head3 Subroutine: printCDR3 ($CDR3_Start, $CDR3_End, "SUMMARY_TABLE", $AAQuerySequence, $DNAQuerySequence);
+
+???? IS THIS FUNCTION IN USE ?????
+	
+Handles the printing of the output when passed information about the CDR3 region.
+
+
+The result is returned as a text string in this version, hence use it like this if you want to send it to STDOUT:
+
+ print printCDR3 ($CDR3_Start, $CDR3_End, "SUMMARY_TABLE", $AAQuerySequence, $DNAQuerySequence), "\n";
+ 
+#=cut 	
+
+#Despite the similarity in names, these are all local copies passed to us:
+
+my ($Start, $End, $Tag, $FullAAQuerySequence, $FullDNAQuerySequence) = @_;
+
+#For DNA:
+my ($CDR_DNA_Seq) = substr ($FullDNAQuerySequence, $Start, $Start+$End);
+my ($CDR_DNA_Length) = length ($CDR_DNA_Seq);
+
+#For AA:
+my ($CDR_AA_Seq) = substr ($FullAAQuerySequence, $Start, $Start+$End);
+my ($CDR_AA_Length) = length ($CDR_AA_Seq);
+
+my $ReturnString = join ("\t", $CDR_DNA_Seq, $CDR_DNA_Length, $CDR_AA_Seq, $CDR_AA_Length, $Tag); #Create here so we can inspect it / post process it if needed:
+print "D: SUB: printCDR3: As returned: '$ReturnString'\n";
+return ($ReturnString);
+
+}
+
+=cut 
+
+
+
+=head2 Change Log
+
+=head3 Version 1.2
+
+ 1) Fixed the 'Process record request' feature [was failed increment in $Record]
+ 2) Deleted / Deactivated the function 'printCDR3' [wasn't in use; kept if useful for parts]. 
+ 	This function is replaced by the more general printOUTPUTData()
+ 3) A tag for the CDR3 status is now output for every record / read. 
+ Initially this is set to "NOT_FOUND" and changed if evidence for the CDR3 is found. 
+
+=head4 Version 1.3
+
+ 1) The tophit line was split on whitespace, however sometimes the VJFrame is something like “In-frame with stop codon”, 
+ which means the line is also split on the spaces therein. It now splits on tabs only, and this seems to work properly.
+ - found by Bas Horsman. 
+
+=head4 Version 1.3a
+
+ 1) "MOTIF_FOUND_IN_AA" reported correctly (was impossible previously due to addition error to the $MotifFound var (never could == 3)
+ 
+=cut 
+
+=head4 Version 1.4
+
+ 1) Now processes files using Mac/Unix/MS-DOS newline characters:
+ 
+ $_ =~ s/\r\n/\n/g;		#In case line ends are MS-DOS
+ $_ =~ s/\r/\n/g;		#In case line ends are Mac
+ #The whole record - one per read - is now stored in $_
+ my @Lines =split (/\R/,$_);	#Split on new lines 
+
+=head4 Version 1.4a
+
+1) Fixed the length of the CDR3 AA string being reported correctly:
+
+ $OUTPUT_Data{"CDR3 Length"} = $CDR3_Length; 
+ to: 
+ $OUTPUT_Data{"CDR3 Length"} = $CDR3_Seq_AA_Length;
+ 
\ No newline at end of file
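igparse.pl walks a plain-text igBLAST report one record at a time, keying on the "Query=" header of each BLAST block and normalising Mac/DOS line endings first (see the Version 1.4 note in its change log). A minimal Python sketch of that record-splitting stage only (illustrative; the Perl script extracts far more per record):

    import re

    def split_igblast_records(text):
        """Yield one igBLAST report block per query, normalising line endings first."""
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        # Every record starts at a 'Query=' line; drop the preamble before the first one
        parts = re.split(r"(?m)^(?=Query= )", text)
        return [p for p in parts if p.startswith("Query= ")]

    report = "BLASTN 2.2.27+\npreamble\nQuery= read1\nLength=433\nQuery= read2\nLength=210\n"
    for record in split_igblast_records(report):
        print(record.splitlines()[0])   # Query= read1 / Query= read2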
diff -r 5ffd52fc35c4 -r bcec7bb4e089 imgt_loader.r
--- a/imgt_loader.r	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,82 +0,0 @@
-args <- commandArgs(trailingOnly = TRUE)
-
-summ.file = args[1]
-aa.file = args[2]
-junction.file = args[3]
-out.file = args[4]
-
-summ = read.table(summ.file, sep="\t", header=T, quote="", fill=T)
-aa = read.table(aa.file, sep="\t", header=T, quote="", fill=T)
-junction = read.table(junction.file, sep="\t", header=T, quote="", fill=T)
-
-old_summary_columns=c('Sequence.ID','JUNCTION.frame','V.GENE.and.allele','D.GENE.and.allele','J.GENE.and.allele','CDR1.IMGT.length','CDR2.IMGT.length','CDR3.IMGT.length','Orientation')
-old_sequence_columns=c('CDR1.IMGT','CDR2.IMGT','CDR3.IMGT')
-old_junction_columns=c('JUNCTION')
-
-added_summary_columns=c('Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence')
-added_sequence_columns=c('FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT')
-
-added_junction_columns=c('P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION')
-added_junction_columns=c(added_junction_columns, 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')
-
-out=summ[,c("Sequence.ID","JUNCTION.frame","V.GENE.and.allele","D.GENE.and.allele","J.GENE.and.allele")]
-
-out[,"CDR1.Seq"] = aa[,"CDR1.IMGT"]
-out[,"CDR1.Length"] = summ[,"CDR1.IMGT.length"]
-
-out[,"CDR2.Seq"] = aa[,"CDR2.IMGT"]
-out[,"CDR2.Length"] = summ[,"CDR2.IMGT.length"]
-
-out[,"CDR3.Seq"] = aa[,"CDR3.IMGT"]
-out[,"CDR3.Length"] = summ[,"CDR3.IMGT.length"]
-
-out[,"CDR3.Seq.DNA"] = junction[,"JUNCTION"]
-out[,"CDR3.Length.DNA"] = nchar(as.character(junction[,"JUNCTION"]))
-out[,"Strand"] = summ[,"Orientation"]
-out[,"CDR3.Found.How"] = "a"
-
-out[,added_summary_columns] = summ[,added_summary_columns]
-
-out[,added_sequence_columns] = aa[,added_sequence_columns]
-
-out[,added_junction_columns] = junction[,added_junction_columns]
-
-out[,"Top V Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"V.GENE.and.allele"]))
-out[,"Top D Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"D.GENE.and.allele"]))
-out[,"Top J Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"J.GENE.and.allele"]))
-
-out = out[,c('Sequence.ID','JUNCTION.frame','Top V Gene','Top D Gene','Top J Gene','CDR1.Seq','CDR1.Length','CDR2.Seq','CDR2.Length','CDR3.Seq','CDR3.Length','CDR3.Seq.DNA','CDR3.Length.DNA','Strand','CDR3.Found.How','Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence','FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT','P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')]
-
-names(out) = c('ID','VDJ Frame','Top V Gene','Top D Gene','Top J Gene','CDR1 Seq','CDR1 Length','CDR2 Seq','CDR2 Length','CDR3 Seq','CDR3 Length','CDR3 Seq DNA','CDR3 Length DNA','Strand','CDR3 Found How','Functionality','V-REGION identity %','V-REGION identity nt','D-REGION reading frame','AA JUNCTION','Functionality comment','Sequence','FR1-IMGT','FR2-IMGT','FR3-IMGT','CDR3-IMGT','JUNCTION','J-REGION','FR4-IMGT','P3V-nt nb','N-REGION-nt nb','N1-REGION-nt nb','P5D-nt nb','P3D-nt nb','N2-REGION-nt nb','P5J-nt nb','3V-REGION trimmed-nt nb','5D-REGION trimmed-nt nb','3D-REGION trimmed-nt nb','5J-REGION trimmed-nt nb','N-REGION','N1-REGION','N2-REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')
-
-out[,"VDJ Frame"] = as.character(out[,"VDJ Frame"])
-
-fltr = out[,"VDJ Frame"] == "in-frame"
-if(any(fltr)){
-    out[fltr, "VDJ Frame"] = "In-frame"
-}
-
-fltr = out[,"VDJ Frame"] == "null"
-if(any(fltr)){
-    out[fltr, "VDJ Frame"] = "Out-of-frame"
-}
-
-fltr = out[,"VDJ Frame"] == "out-of-frame"
-if(any(fltr)){
-    out[fltr, "VDJ Frame"] = "Out-of-frame"
-}
-
-fltr = out[,"VDJ Frame"] == ""
-if(any(fltr)){
-    out[fltr, "VDJ Frame"] = "Out-of-frame"
-}
-
-for(col in c('Top V Gene','Top D Gene','Top J Gene')){
-    out[,col] = as.character(out[,col])
-    fltr = out[,col] == ""
-    if(any(fltr)){
-        out[fltr,col] = "NA"
-    }
-}
-
-write.table(out, out.file, sep="\t", quote=F, row.names=F, col.names=T)
diff -r 5ffd52fc35c4 -r bcec7bb4e089 imgt_loader/imgt_loader.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/imgt_loader/imgt_loader.py	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,148 @@
+import pandas as pd
+try:
+    pd.options.mode.chained_assignment = None  # default='warn'
+except:
+    pass
+import re
+import argparse
+import os
+import sys  # required by stop_err() below
+
+def stop_err( msg, ret=1 ):
+    sys.stderr.write( msg )
+    sys.exit( ret )
+
+#docs.python.org/dev/library/argparse.html
+parser = argparse.ArgumentParser()
+parser.add_argument("--summ", help="The 1_Summary file from the imgt output")
+parser.add_argument("--aa", help="The 5_AA-Sequence file from the imgt output")
+parser.add_argument("--junction", help="The 6_Junction file from the imgt output")
+parser.add_argument("--output", help="Output file")
+
+args = parser.parse_args()
+
+old_summary_columns = [u'Sequence ID', u'JUNCTION frame', u'V-GENE and allele', u'D-GENE and allele', u'J-GENE and allele', u'CDR1-IMGT length', u'CDR2-IMGT length', u'CDR3-IMGT length', u'Orientation']
+old_sequence_columns = [u'CDR1-IMGT', u'CDR2-IMGT', u'CDR3-IMGT']
+old_junction_columns = [u'JUNCTION']
+
+added_summary_columns = [u'Functionality', u'V-REGION identity %', u'V-REGION identity nt', u'D-REGION reading frame', u'AA JUNCTION', u'Functionality comment', u'Sequence']
+added_sequence_columns = [u'FR1-IMGT', u'FR2-IMGT', u'FR3-IMGT', u'CDR3-IMGT', u'JUNCTION', u'J-REGION', u'FR4-IMGT']
+added_junction_columns = [u"P3'V-nt nb", u'N-REGION-nt nb', u'N1-REGION-nt nb', u"P5'D-nt nb", u"P3'D-nt nb", u'N2-REGION-nt nb', u"P5'J-nt nb", u"3'V-REGION trimmed-nt nb",
+                          u"5'D-REGION trimmed-nt nb", u"3'D-REGION trimmed-nt nb", u"5'J-REGION trimmed-nt nb", u"N-REGION", u"N1-REGION", u"N2-REGION"]
+
+outFile = args.output
+
+#fSummary = pd.read_csv(triplets[0][0], sep="\t", low_memory=False)
+fSummary = pd.read_csv(args.summ, sep="\t", dtype=object)
+#fSequence = pd.read_csv(triplets[0][1], sep="\t", low_memory=False)
+fSequence = pd.read_csv(args.aa, sep="\t", dtype=object)
+#fJunction = pd.read_csv(triplets[0][2], sep="\t", low_memory=False)
+fJunction = pd.read_csv(args.junction, sep="\t", dtype=object)
+tmp = fSummary[["Sequence ID", "JUNCTION frame", "V-GENE and allele", "D-GENE and allele", "J-GENE and allele"]]
+
+tmp["CDR1 Seq"] = fSequence["CDR1-IMGT"]
+tmp["CDR1 Length"] = fSummary["CDR1-IMGT length"]
+
+tmp["CDR2 Seq"] = fSequence["CDR2-IMGT"]
+tmp["CDR2 Length"] = fSummary["CDR2-IMGT length"]
+
+tmp["CDR3 Seq"] = fSequence["CDR3-IMGT"]
+tmp["CDR3 Length"] = fSummary["CDR3-IMGT length"]
+
+tmp["CDR3 Seq DNA"] = fJunction["JUNCTION"]
+tmp["CDR3 Length DNA"] = '1'
+tmp["Strand"] = fSummary["Orientation"]
+tmp["CDR3 Found How"] = 'a'
+
+for col in added_summary_columns:
+    tmp[col] = fSummary[col]
+
+for col in added_sequence_columns:
+    tmp[col] = fSequence[col]
+
+for col in added_junction_columns:
+    tmp[col] = fJunction[col]
+
+outFrame = tmp
+
+outFrame.columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length',
+                    u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', u'Functionality', 'V-REGION identity %', 'V-REGION identity nt', 'D-REGION reading frame',
+                    'AA JUNCTION', 'Functionality comment', 'Sequence', 'FR1-IMGT', 'FR2-IMGT', 'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION', 'J-REGION', 'FR4-IMGT', 'P3V-nt nb',
+                    'N-REGION-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb', 'P3D-nt nb', 'N2-REGION-nt nb', 'P5J-nt nb', '3V-REGION trimmed-nt nb', '5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb',
+                    '5J-REGION trimmed-nt nb', "N-REGION", "N1-REGION", "N2-REGION"]
+
+"""
+IGHV[0-9]-[0-9ab]+-?[0-9]?D?
+TRBV[0-9]{1,2}-?[0-9]?-?[123]?
+IGKV[0-3]D?-[0-9]{1,2}
+IGLV[0-9]-[0-9]{1,2}
+TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?
+TRGV[234589]
+TRDV[1-3]
+
+IGHD[0-9]-[0-9ab]+
+TRBD[12]
+TRDD[1-3]
+
+IGHJ[1-6]
+TRBJ[12]-[1-7]
+IGKJ[1-5]
+IGLJ[12367]
+TRAJ[0-9]{1,2}
+TRGJP?[12]
+TRDJ[1-4]
+"""
+
+vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?)",
+            r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
+            r"(IGKV[0-3]D?-[0-9]{1,2})",
+            r"(IGLV[0-9]-[0-9]{1,2})",
+            r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
+            r"(TRGV[234589])",
+            r"(TRDV[1-3])",
+            r"(IGHV[0-9]S[0-9]+)"]
+
+dPattern = [r"(IGHD[0-9]-[0-9ab]+)",
+            r"(TRBD[12])",
+            r"(TRDD[1-3])"]
+
+jPattern = [r"(IGHJ[1-6])",
+            r"(TRBJ[12]-[1-7])",
+            r"(IGKJ[1-5])",
+            r"(IGLJ[12367])",
+            r"(TRAJ[0-9]{1,2})",
+            r"(TRGJP?[12])",
+            r"(TRDJ[1-4])"]
+
+vPattern = re.compile(r"|".join(vPattern))
+
+dPattern = re.compile(r"|".join(dPattern))
+
+jPattern = re.compile(r"|".join(jPattern))
+
+
+def filterGenes(s, pattern):
+    if type(s) is not str:
+        return "NA"
+    res = pattern.search(s)
+    if res:
+        return res.group(0)
+    return "NA"
+
+
+
+outFrame["Top V Gene"] = outFrame["Top V Gene"].apply(lambda x: filterGenes(x, vPattern))
+outFrame["Top D Gene"] = outFrame["Top D Gene"].apply(lambda x: filterGenes(x, dPattern))
+outFrame["Top J Gene"] = outFrame["Top J Gene"].apply(lambda x: filterGenes(x, jPattern))
+
+
+tmp = outFrame["VDJ Frame"]
+tmp = tmp.replace("in-frame", "In-frame")
+tmp = tmp.replace("null", "Out-of-frame")
+tmp = tmp.replace("out-of-frame", "Out-of-frame")
+outFrame["VDJ Frame"] = tmp
+outFrame["CDR3 Length DNA"] = outFrame["CDR3 Seq DNA"].map(str).map(len)
+safeLength = lambda x: len(x) if type(x) == str else 0
+#outFrame = outFrame[(outFrame["CDR3 Seq DNA"].map(safeLength) > 0) & (outFrame["Top V Gene"] != "NA") & (outFrame["Top J Gene"] != "NA")] #filter out weird rows?
+#outFrame = outFrame[(outFrame["CDR3 Seq DNA"].map(safeLength) > 0) & (outFrame["Top V Gene"] != "NA") & (outFrame["Top D Gene"] != "NA") & (outFrame["Top J Gene"] != "NA")] #filter out weird rows?
+outFrame.to_csv(outFile, sep="\t", index=False, index_label="index")
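The vPattern/dPattern/jPattern alternations above reduce IMGT's verbose "V-GENE and allele" text (e.g. "Homsap IGHV3-30*04 F") to a bare gene name, or "NA" when nothing matches. A quick usage sketch of the same idea with one of those patterns (the pattern line is copied from the file; the inputs are made up):

    import re

    vPattern = re.compile(r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?)")   # one alternative from the file

    def filterGenes(s, pattern):
        if not isinstance(s, str):
            return "NA"
        res = pattern.search(s)
        return res.group(0) if res else "NA"

    print(filterGenes("Homsap IGHV3-30*04 F, or Homsap IGHV3-30-3*01 F", vPattern))  # IGHV3-30
    print(filterGenes(None, vPattern))                                               # NA

Because search() stops at the first hit, ties in IMGT's "or" lists resolve to the first-listed gene, which matches the Top-gene semantics of the loader.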
diff -r 5ffd52fc35c4 -r bcec7bb4e089 imgt_loader/imgt_loader.r
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/imgt_loader/imgt_loader.r	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,88 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+summ.file = args[1]
+aa.file = args[2]
+junction.file = args[3]
+out.file = args[4]
+
+summ = read.table(summ.file, sep="\t", header=T, quote="", fill=T)
+aa = read.table(aa.file, sep="\t", header=T, quote="", fill=T)
+junction = read.table(junction.file, sep="\t", header=T, quote="", fill=T)
+
+old_summary_columns=c('Sequence.ID','JUNCTION.frame','V.GENE.and.allele','D.GENE.and.allele','J.GENE.and.allele','CDR1.IMGT.length','CDR2.IMGT.length','CDR3.IMGT.length','Orientation')
+old_sequence_columns=c('CDR1.IMGT','CDR2.IMGT','CDR3.IMGT')
+old_junction_columns=c('JUNCTION')
+
+added_summary_columns=c('Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence')
+added_sequence_columns=c('FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT')
+
+added_junction_columns=c('P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION')
+added_junction_columns=c(added_junction_columns, 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')
+
+out=summ[,c("Sequence.ID","JUNCTION.frame","V.GENE.and.allele","D.GENE.and.allele","J.GENE.and.allele")]
+
+out[,"CDR1.Seq"] = aa[,"CDR1.IMGT"]
+out[,"CDR1.Length"] = summ[,"CDR1.IMGT.length"]
+
+out[,"CDR2.Seq"] = aa[,"CDR2.IMGT"]
+out[,"CDR2.Length"] = summ[,"CDR2.IMGT.length"]
+
+out[,"CDR3.Seq"] = aa[,"CDR3.IMGT"]
+out[,"CDR3.Length"] = summ[,"CDR3.IMGT.length"]
+
+out[,"CDR3.Seq.DNA"] = junction[,"JUNCTION"]
+out[,"CDR3.Length.DNA"] = nchar(as.character(junction[,"JUNCTION"]))
+out[,"Strand"] = summ[,"Orientation"]
+out[,"CDR3.Found.How"] = "a"
+
+out[,added_summary_columns] = summ[,added_summary_columns]
+
+out[,added_sequence_columns] = aa[,added_sequence_columns]
+
+out[,added_junction_columns] = junction[,added_junction_columns]
+
+out[,"Top V Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"V.GENE.and.allele"]))
+out[,"Top D Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"D.GENE.and.allele"]))
+out[,"Top J Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"J.GENE.and.allele"]))
+
+#build one mask over summ before subsetting out, so the mask length always matches the rows being dropped
+keep = !grepl("Less than", summ[,"V.GENE.and.allele"])
+keep = keep & !grepl("Less than", summ[,"D.GENE.and.allele"])
+keep = keep & !grepl("Less than", summ[,"J.GENE.and.allele"])
+out = out[keep,]
+
+out = out[,c('Sequence.ID','JUNCTION.frame','Top V Gene','Top D Gene','Top J Gene','CDR1.Seq','CDR1.Length','CDR2.Seq','CDR2.Length','CDR3.Seq','CDR3.Length','CDR3.Seq.DNA','CDR3.Length.DNA','Strand','CDR3.Found.How','Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence','FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT','P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')]
+
+names(out) = c('ID','VDJ Frame','Top V Gene','Top D Gene','Top J Gene','CDR1 Seq','CDR1 Length','CDR2 Seq','CDR2 Length','CDR3 Seq','CDR3 Length','CDR3 Seq DNA','CDR3 Length DNA','Strand','CDR3 Found How','Functionality','V-REGION identity %','V-REGION identity nt','D-REGION reading frame','AA JUNCTION','Functionality comment','Sequence','FR1-IMGT','FR2-IMGT','FR3-IMGT','CDR3-IMGT','JUNCTION','J-REGION','FR4-IMGT','P3V-nt nb','N-REGION-nt nb','N1-REGION-nt nb','P5D-nt nb','P3D-nt nb','N2-REGION-nt nb','P5J-nt nb','3V-REGION trimmed-nt nb','5D-REGION trimmed-nt nb','3D-REGION trimmed-nt nb','5J-REGION trimmed-nt nb','N-REGION','N1-REGION','N2-REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')
+
+out[,"VDJ Frame"] = as.character(out[,"VDJ Frame"])
+
+fltr = out[,"VDJ Frame"] == "in-frame"
+if(any(fltr)){
+    out[fltr, "VDJ Frame"] = "In-frame"
+}
+
+fltr = out[,"VDJ Frame"] == "null"
+if(any(fltr)){
+    out[fltr, "VDJ Frame"] = "Out-of-frame"
+}
+
+fltr = out[,"VDJ Frame"] == "out-of-frame"
+if(any(fltr)){
+    out[fltr, "VDJ Frame"] = "Out-of-frame"
+}
+
+fltr = out[,"VDJ Frame"] == ""
+if(any(fltr)){
+    out[fltr, "VDJ Frame"] = "Out-of-frame"
+}
+
+for(col in c('Top V Gene','Top D Gene','Top J Gene')){
+    out[,col] = as.character(out[,col])
+    fltr = out[,col] == ""
+    fltr[is.na(fltr)] = T
+    if(any(fltr)){
+        out[fltr,col] = "NA"
+    }
+}
+
+write.table(out, out.file, sep="\t", quote=F, row.names=F, col.names=T)
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 imgt_loader/imgt_loader.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/imgt_loader/imgt_loader.sh Mon Dec 12 05:22:57 2016 -0500 |
| @@ -0,0 +1,69 @@ +#!/bin/bash +input=$1 +output=$2 +name=$3 +dir="$(cd "$(dirname "$0")" && pwd)" +mkdir -p $PWD/$name/files +f=$(file $input) +zip7Type="7-zip archive" +tarType="tar archive" +bzip2Type="bzip2 compressed" +gzipType="gzip compressed" +zipType="Zip archive" +rarType="RAR archive" +zxType="XZ compressed data" + +if [[ "$f" == *"$zip7Type"* ]]; then + echo "7-zip" + echo "Trying: 7za e $input -o$PWD/$name/files" + 7za e $input -o$PWD/$name/files +fi + +if [[ "$f" == *"$tarType"* ]] +then + echo "tar archive" + echo "Trying: tar -xvf $input -C $PWD/$name/files" + tar -xvf $input -C $PWD/$name/files +fi + +if [[ "$f" == *"$bzip2Type"* ]] +then + echo "bzip2 compressed data" + echo "Trying: tar -jxf $input -C $PWD/$name/files" + tar -jxf $input -C $PWD/$name/files +fi + +if [[ "$f" == *"$gzipType"* ]] +then + echo "gzip compressed data" + echo "Trying: tar -xvzf $input -C $PWD/$name/files" + tar -xvzf $input -C $PWD/$name/files +fi + +if [[ "$f" == *"$zipType"* ]] +then + echo "Zip archive" + echo "Trying: unzip $input -d $PWD/$name/files" + unzip $input -d $PWD/$name/files > $PWD/unziplog.log +fi + +if [[ "$f" == *"$rarType"* ]] +then + echo "RAR archive" + echo "Trying: unrar e $input $PWD/$name/files" + unrar e $input $PWD/$name/files +fi + +if [[ "$f" == *"$zxType"* ]] +then + echo "xz compressed data" + echo "Trying: tar -xJf $input -C $PWD/$name/files" + tar -xJf $input -C $PWD/$name/files +fi +find $PWD/$name/files -iname "1_*" -exec cat {} + > $PWD/$name/summ.txt +find $PWD/$name/files -iname "5_*" -exec cat {} + > $PWD/$name/aa.txt +find $PWD/$name/files -iname "6_*" -exec cat {} + > $PWD/$name/junction.txt + +#python $dir/imgt_loader.py --summ $PWD/$name/summ.txt --aa $PWD/$name/aa.txt --junction $PWD/$name/junction.txt --output $output + +Rscript --verbose $dir/imgt_loader.r $PWD/$name/summ.txt $PWD/$name/aa.txt $PWD/$name/junction.txt $output 2>&1 |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 merge.r --- a/merge.r Mon Dec 12 05:22:37 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
| @@ -1,27 +0,0 @@ -args <- commandArgs(trailingOnly = TRUE) - -input.1 = args[1] -input.2 = args[2] - -fields.1 = args[3] -fields.2 = args[4] - -field.1 = args[5] -field.2 = args[6] - -output = args[7] - -dat1 = read.table(input.1, header=T, sep="\t", quote="", stringsAsFactors=F, fill=T, row.names=NULL) -if(fields.1 != "all"){ - fields.1 = unlist(strsplit(fields.1, ",")) - dat1 = dat1[,fields.1] -} -dat2 = read.table(input.2, header=T, sep="\t", quote="", stringsAsFactors=F, fill=T, row.names=NULL) -if(fields.2 != "all"){ - fields.2 = unlist(strsplit(fields.2, ",")) - dat2 = dat2[,fields.2] -} - -dat3 = merge(dat1, dat2, by.x=field.1, by.y=field.2) - -write.table(dat3, output, sep="\t",quote=F,row.names=F,col.names=T) |
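| Note: merge.r (removed in this changeset) is a thin wrapper around base R's merge(), with field.1/field.2 naming the join keys and fields.1/fields.2 optionally subsetting the columns first. A minimal sketch of the core join on hypothetical toy frames:
dat1 = data.frame(Sequence.ID=c("s1","s2"), Top.V.Gene=c("IGHV1-2","IGHV3-23"), stringsAsFactors=F)
dat2 = data.frame(ID=c("s2","s3"), CDR3.Seq=c("CAKGTW","CARDYW"), stringsAsFactors=F)
merge(dat1, dat2, by.x="Sequence.ID", by.y="ID")  # inner join, as in the script: only "s2" remains |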
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 merge_and_filter.r --- a/merge_and_filter.r Mon Dec 12 05:22:37 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
| @@ -1,229 +0,0 @@ -args <- commandArgs(trailingOnly = TRUE) - - -summaryfile = args[1] -sequencesfile = args[2] -mutationanalysisfile = args[3] -mutationstatsfile = args[4] -hotspotsfile = args[5] -aafile = args[6] -gene_identification_file= args[7] -output = args[8] -before.unique.file = args[9] -unmatchedfile = args[10] -method=args[11] -functionality=args[12] -unique.type=args[13] -filter.unique=args[14] -class.filter=args[15] -empty.region.filter=args[16] - 
-summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") -sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") -mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") -mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") -hotspots = read.table(hotspotsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") -AAs = read.table(aafile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") -gene_identification = read.table(gene_identification_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") - 
-if(method == "blastn"){ - #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" - gene_identification = gene_identification[!duplicated(gene_identification$qseqid),] - ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52)) - gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T) - gene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100 - gene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")] - colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match") -} - 
-input.sequence.count = nrow(summ) -print(paste("Number of sequences in summary file:", input.sequence.count)) - -filtering.steps = data.frame(character(0), numeric(0)) - -filtering.steps = rbind(filtering.steps, c("Input", input.sequence.count)) - -filtering.steps[,1] = as.character(filtering.steps[,1]) -filtering.steps[,2] = as.character(filtering.steps[,2]) -#filtering.steps[,3] = as.numeric(filtering.steps[,3]) - -summ = merge(summ, gene_identification, by="Sequence.ID") - -summ = summ[summ$Functionality != "No results",] - -print(paste("Number of sequences after 'No results' filter:", nrow(summ))) - -filtering.steps = rbind(filtering.steps, c("After 'No results' filter", nrow(summ))) - 
-if(functionality == "productive"){ - summ = summ[summ$Functionality == "productive (see comment)" | summ$Functionality == "productive",] -} else if (functionality == "unproductive"){ - summ = summ[summ$Functionality == "unproductive (see comment)" | summ$Functionality == "unproductive",] -} else if (functionality == "remove_unknown"){ - summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",] -} - -print(paste("Number of sequences after functionality filter:", nrow(summ))) - -filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ))) - 
-result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") - -print(paste("Number of sequences after merging with mutation analysis file:", nrow(result))) - -result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID") - -print(paste("Number of sequences after merging with mutation stats file:", nrow(result))) - -result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID") - -print(paste("Number of sequences after merging with hotspots file:", nrow(result))) - -sequences = sequences[,c("Sequenc[...]esult))) - 
-cleanup_columns = c("FR1.IMGT.Nb.of.mutations", - "CDR1.IMGT.Nb.of.mutations", - "FR2.IMGT.Nb.of.mutations", - "CDR2.IMGT.Nb.of.mutations", - "FR3.IMGT.Nb.of.mutations") - -for(col in cleanup_columns){ - result[,col] = gsub("\\(.*\\)", "", result[,col]) - result[,col] = as.numeric(result[,col]) - result[is.na(result[,col]),] = 0 -} - -write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) - 
-if(filter.unique != "no"){ - clmns = names(result) - - if(empty.region.filter == "leader"){ - result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) - } else if(empty.region.filter == "FR1"){ - result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) - } else if(empty.region.filter == "CDR1"){ - result$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) - } else if(empty.region.filter == "FR2"){ - result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) - } - - if(filter.unique == "remove"){ - result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] - } - result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it - - result = result[!duplicated(result$unique.def),] -} - 
-write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\t", quote=F,row.names=F,col.names=T) - -filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result))) - -if(nrow(summ) == 0){ - stop("No data remaining after filter") -} - -result$best_match_class = gsub(",.*", "", result$best_match) #gsub so the unmatched don't have a class after it - -result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":")) - - - - -result.matched = result[!grepl("unmatched", result$best_match),] -result.unmatched = result[grepl("unmatched", result$best_match),] - -result = rbind(result.matched, result.unmatched) - -result = result[!(duplicated(result$past)), ] - -result = result[,!(names(result) %in% c("past", "best_match_class"))] - -print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result))) - -filtering.steps = rbind(filtering.steps, c("After remove duplicates based on filter", nrow(result))) - 
-unmatched = result[grepl("^unmatched", result$best_match),c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] - -print(paste("Number of rows in result:", nrow(result))) -print(paste("Number of rows in unmatched:", nrow(unmatched))) - -matched.sequences = result[!grepl("^unmatched", result$best_match),] - -write.table(x=matched.sequences, file=gsub("merged.txt$", "filtered.txt", output), sep="\t",quote=F,row.names=F,col.names=T) - -matched.sequences.count = nrow(matched.sequences) -unmatched.sequences.count = sum(grepl("^unmatched", result$best_match)) - -filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count)) -filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count)) -filtering.steps[,2] = as.numeric(filtering.steps[,2]) -filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2) - -write.table(x=filtering.steps, file=gsub("unmatched", "filtering_steps", unmatchedfile), sep="\t",quote=F,row.names=F,col.names=F) - -write.table(x=result, file=output, sep="\t",quote=F,row.names=F,col.names=T) -write.table(x=unmatched, file=unmatchedfile, sep="\t",quote=F,row.names=F,col.names=T) |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 naive_output.r --- a/naive_output.r Mon Dec 12 05:22:37 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
| @@ -1,45 +0,0 @@ -args <- commandArgs(trailingOnly = TRUE) - -naive.file = args[1] -shm.file = args[2] -output.file.ca = args[3] -output.file.cg = args[4] -output.file.cm = args[5] - -naive = read.table(naive.file, sep="\t", header=T, quote="", fill=T) -shm.merge = read.table(shm.file, sep="\t", header=T, quote="", fill=T) - - -final = merge(naive, shm.merge[,c("Sequence.ID", "best_match")], by.x="ID", by.y="Sequence.ID") -print(paste("nrow final:", nrow(final))) -names(final)[names(final) == "best_match"] = "Sample" -final.numeric = final[,sapply(final, is.numeric)] -final.numeric[is.na(final.numeric)] = 0 -final[,sapply(final, is.numeric)] = final.numeric - -final.ca = final[grepl("^ca", final$Sample),] -final.cg = final[grepl("^cg", final$Sample),] -final.cm = final[grepl("^cm", final$Sample),] - -if(nrow(final.ca) > 0){ - final.ca$Replicate = 1 -} - -if(nrow(final.cg) > 0){ - final.cg$Replicate = 1 -} - -if(nrow(final.cm) > 0){ - final.cm$Replicate = 1 -} - -#print(paste("nrow final:", nrow(final))) -#final2 = final -#final2$Sample = gsub("[0-9]", "", final2$Sample) -#final = rbind(final, final2) -#final$Replicate = 1 - -write.table(final.ca, output.file.ca, quote=F, sep="\t", row.names=F, col.names=T) -write.table(final.cg, output.file.cg, quote=F, sep="\t", row.names=F, col.names=T) -write.table(final.cm, output.file.cm, quote=F, sep="\t", row.names=F, col.names=T) - |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 new_imgt.r --- a/new_imgt.r Mon Dec 12 05:22:37 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
| @@ -1,29 +0,0 @@ -args <- commandArgs(trailingOnly = TRUE) - -imgt.dir = args[1] -merged.file = args[2] -gene = args[3] - -merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F) - -if(gene != "-"){ - merged = merged[grepl(paste("^", gene, sep=""), merged$best_match),] -} else { - merged = merged[!grepl("unmatched", merged$best_match),] -} - -merged = merged[!grepl("unmatched", merged$best_match),] - -for(f in list.files(imgt.dir, pattern="*.txt$")){ - #print(paste("filtering", f)) - path = paste(imgt.dir, f, sep="") - dat = read.table(path, header=T, sep="\t", fill=T, quote="", stringsAsFactors=F, check.names=FALSE) - - dat = dat[dat[,"Sequence ID"] %in% merged$Sequence.ID,] - - if(nrow(dat) > 0 & grepl("^8_", f)){ #change the FR1 columns to 0 in the "8_..." file - dat[,grepl("^FR1", names(dat))] = 0 - } - - write.table(dat, path, quote=F, sep="\t", row.names=F, col.names=T, na="") -} |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 pattern_plots.r --- a/pattern_plots.r Mon Dec 12 05:22:37 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
| @@ -1,159 +0,0 @@ -library(ggplot2) -library(reshape2) -library(scales) - -args <- commandArgs(trailingOnly = TRUE) - -input.file = args[1] #the data that's get turned into the "SHM overview" table in the html report "data_sum.txt" - -plot1.path = args[2] -plot1.png = paste(plot1.path, ".png", sep="") -plot1.txt = paste(plot1.path, ".txt", sep="") - -plot2.path = args[3] -plot2.png = paste(plot2.path, ".png", sep="") -plot2.txt = paste(plot2.path, ".txt", sep="") - -plot3.path = args[4] -plot3.png = paste(plot3.path, ".png", sep="") -plot3.txt = paste(plot3.path, ".txt", sep="") - -clean.output = args[5] - -dat = read.table(input.file, header=F, sep=",", quote="", stringsAsFactors=F, fill=T, row.names=1) - - - -classes = c("IGA", "IGA1", "IGA2", "IGG", "IGG1", "IGG2", "IGG3", "IGG4", "IGM", "IGE") -xyz = c("x", "y", "z") -new.names = c(paste(rep(classes, each=3), xyz, sep="."), paste("un", xyz, sep="."), paste("all", xyz, sep=".")) - -names(dat) = new.names - -clean.dat = dat -clean.dat = clean.dat[,c(paste(rep(classes, each=3), xyz, sep="."), paste("all", xyz, sep="."), paste("un", xyz, sep="."))] - -write.table(clean.dat, clean.output, quote=F, sep="\t", na="", row.names=T, col.names=NA) - -dat["RGYW.WRCY",] = colSums(dat[c(13,14),], na.rm=T) -dat["TW.WA",] = colSums(dat[c(15,16),], na.rm=T) - -data1 = dat[c("RGYW.WRCY", "TW.WA"),] - -data1 = data1[,names(data1)[grepl(".z", names(data1))]] -names(data1) = gsub("\\..*", "", names(data1)) - -data1 = melt(t(data1)) - -names(data1) = c("Class", "Type", "value") - -data1 = data1[order(data1$Type),] - -write.table(data1, plot1.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) - -p = ggplot(data1, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of mutations") + guides(fill=guide_legend(title=NULL)) -p = p + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("RGYW.WRCY" = "white", "TW.WA" = "blue4")) -#p = p + scale_colour_manual(values=c("RGYW.WRCY" = "black", "TW.WA" = "blue4")) -png(filename=plot1.png, width=480, height=300) -print(p) -dev.off() - -data2 = dat[c(1, 5:8),] - -data2 = data2[,names(data2)[grepl("\\.x", names(data2))]] -names(data2) = gsub(".x", "", names(data2)) - -data2["A/T",] = dat["Targeting of A T (%)",names(dat)[grepl("\\.z", names(dat))]] - -data2["G/C transitions",] = round(data2["Transitions at G C (%)",] / data2["Number of Mutations (%)",] * 100, 1) - -data2["mutation.at.gc",] = dat["Transitions at G C (%)",names(dat)[grepl("\\.y", names(dat))]] -data2["G/C transversions",] = round((data2["mutation.at.gc",] - data2["Transitions at G C (%)",]) / data2["Number of Mutations (%)",] * 100, 1) - -data2["G/C transversions",is.nan(unlist(data2["G/C transversions",]))] = 0 -data2["G/C transversions",is.infinite(unlist(data2["G/C transversions",]))] = 0 -data2["G/C transitions",is.nan(unlist(data2["G/C transitions",]))] = 0 -data2["G/C transitions",is.infinite(unlist(data2["G/C transitions",]))] = 0 - -data2 = melt(t(data2[c("A/T","G/C transitions","G/C transversions"),])) - -names(data2) = c("Class", "Type", "value") - -data2 = data2[order(data2$Type),] - -write.table(data2, plot2.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) - -p = ggplot(data2, aes(x=Class, y=value, fill=Type)) + geom_bar(position="fill", stat="identity", colour = "black") + scale_y_continuous(labels=percent_format()) + 
guides(fill=guide_legend(title=NULL)) + ylab("% of mutations") -p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white")) -#p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black")) -png(filename=plot2.png, width=480, height=300) -print(p) -dev.off() - -data3 = dat[c(5, 6, 8, 17:20),] -data3 = data3[,names(data3)[grepl("\\.x", names(data3))]] -names(data3) = gsub(".x", "", names(data3)) - -data3[is.na(data3)] = 0 -#data3[is.infinite(data3)] = 0 - -data3["G/C transitions",] = round(data3["Transitions at G C (%)",] / (data3["C",] + data3["G",]) * 100, 1) - -data3["G/C transversions",] = round((data3["Targeting of G C (%)",] - data3["Transitions at G C (%)",]) / (data3["C",] + data3["G",]) * 100, 1) - -data3["A/T",] = round(data3["Targeting of A T (%)",] / (data3["A",] + data3["T",]) * 100, 1) - -data3["G/C transitions",is.nan(unlist(data3["G/C transitions",]))] = 0 -data3["G/C transitions",is.infinite(unlist(data3["G/C transitions",]))] = 0 - -data3["G/C transversions",is.nan(unlist(data3["G/C transversions",]))] = 0 -data3["G/C transversions",is.infinite(unlist(data3["G/C transversions",]))] = 0 - -data3["A/T",is.nan(unlist(data3["A/T",]))] = 0 -data3["A/T",is.infinite(unlist(data3["A/T",]))] = 0 - -data3 = melt(t(data3[8:10,])) -names(data3) = c("Class", "Type", "value") - -data3 = data3[order(data3$Type),] - -write.table(data3, plot3.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) - -p = ggplot(data3, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of nucleotides") + guides(fill=guide_legend(title=NULL)) -p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white")) -#p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black")) -png(filename=plot3.png, width=480, height=300) -print(p) -dev.off() - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
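| Note: in pattern_plots.r (removed here) the G/C transversion percentage is derived by subtraction, since the SHM overview reports total targeting of G/C and transitions at G/C separately: transversions = (targeting - transitions) / number of mutations * 100. A worked example with illustrative numbers, not taken from any real sample:
n.mutations = 200     # hypothetical total mutation count
targeting.gc = 90     # hypothetical mutations at G/C positions
transitions.gc = 60   # hypothetical transitions at G/C
round((targeting.gc - transitions.gc) / n.mutations * 100, 1)  # G/C transversions -> 15 |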
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/RScript.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/report_clonality/RScript.r Mon Dec 12 05:22:57 2016 -0500 |
| @@ -0,0 +1,830 @@ +# ---------------------- load/install packages ---------------------- + +if (!("gridExtra" %in% rownames(installed.packages()))) { + install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") +} +library(gridExtra) +if (!("ggplot2" %in% rownames(installed.packages()))) { + install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") +} +library(ggplot2) +if (!("plyr" %in% rownames(installed.packages()))) { + install.packages("plyr", repos="http://cran.xl-mirror.nl/") +} +library(plyr) + +if (!("data.table" %in% rownames(installed.packages()))) { + install.packages("data.table", repos="http://cran.xl-mirror.nl/") +} +library(data.table) + +if (!("reshape2" %in% rownames(installed.packages()))) { + install.packages("reshape2", repos="http://cran.xl-mirror.nl/") +} +library(reshape2) + +if (!("lymphclon" %in% rownames(installed.packages()))) { + install.packages("lymphclon", repos="http://cran.xl-mirror.nl/") +} +library(lymphclon) + 
+# ---------------------- parameters ---------------------- + +args <- commandArgs(trailingOnly = TRUE) + +infile = args[1] #path to input file +outfile = args[2] #path to output file +outdir = args[3] #path to output folder (html/images/data) +clonaltype = args[4] #clonaltype definition, or 'none' for no unique filtering +ct = unlist(strsplit(clonaltype, ",")) +species = args[5] #human or mouse +locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD +filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no) +clonality_method = args[8] + + 
+# ---------------------- Data preparation ---------------------- + +print("Report Clonality - Data preparation") + +inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="") + +print(paste("nrows: ", nrow(inputdata))) + +setwd(outdir) + +# remove weird rows +inputdata = inputdata[inputdata$Sample != "",] + +print(paste("nrows: ", nrow(inputdata))) + +#remove the allele from the V,D and J genes +inputdata$Top.V.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.V.Gene) +inputdata$Top.D.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.D.Gene) +inputdata$Top.J.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.J.Gene) + +print(paste("nrows: ", nrow(inputdata))) + +#filter uniques +inputdata.removed = inputdata[NULL,] + +print(paste("nrows: ", nrow(inputdata))) + +inputdata$clonaltype = 1:nrow(inputdata) + +#keep track of the count of sequences in samples or samples/replicates for the front page overview +input.sample.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample")]) +input.rep.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample", "Replicate")]) + 
+PRODF = inputdata +UNPROD = inputdata +if(filterproductive){ + if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column + #PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] + PRODF = inputdata[inputdata$Functionality %in% c("productive (see comment)","productive"),] + + PRODF.count = data.frame(data.table(PRODF)[, list(count=.N), by=c("Sample")]) + + UNPROD = inputdata[inputdata$Functionality %in% c("unproductive (see comment)","unproductive"), ] + } else { + PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ] + UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ] + } +} + 
+prod.sample.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample")]) +prod.rep.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample", "Replicate")]) + +unprod.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample")]) +unprod.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample", "Replicate")]) + +clonalityF[...] P3=num_median(.SD$P3D.nt.nb, na.rm=T), + N2=num_median(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), + P4=num_median(.SD$P5J.nt.nb, na.rm=T), + DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), + Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), + Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), + Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), + Median.CDR3.l=median(.SD$CDR3.Length.DNA)), + by=c("Sample")]) + + newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) + write.table(newData, "junctionAnalysisUnProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) +} + +PRODF = bak + + 
+# ---------------------- D reading frame ---------------------- + +D.REGION.reading.frame = PRODF$D.REGION.reading.frame + +D.REGION.reading.frame[is.na(D.REGION.reading.frame)] = "No D" + +D.REGION.reading.frame = data.frame(table(D.REGION.reading.frame)) + +write.table(D.REGION.reading.frame, "DReadingFrame.csv" , sep="\t",quote=F,row.names=F,col.names=T) + +D.REGION.reading.frame = ggplot(D.REGION.reading.frame) +D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frame") + ylab("Frequency") + +png("DReadingFrame.png") +D.REGION.reading.frame +dev.off() + + + + 
+# ---------------------- AA composition in CDR3 ---------------------- + +AACDR3 = PRODF[,c("Sample", "CDR3.Seq")] + +TotalPerSample = data.frame(data.table(AACDR3)[, list(total=sum(nchar(as.character(.SD$CDR3.Seq)))), by=Sample]) + +AAfreq = list() + +for(i in 1:nrow(TotalPerSample)){ + sample = TotalPerSample$Sample[i] + AAfreq[[i]] = data.frame(table(unlist(strsplit(as.character(AACDR3[AACDR3$Sample == sample,c("CDR3.Seq")]), "")))) + AAfreq[[i]]$Sample = sample +} + +AAfreq = ldply(AAfreq, data.frame) +AAfreq = merge(AAfreq, TotalPerSample, by="Sample", all.x = T) +AAfreq$freq_perc = as.numeric(AAfreq$Freq / AAfreq$total * 100) + + +AAorder = read.table(sep="\t", header=TRUE, text="order.aa\tAA\n1\tR\n2\tK\n3\tN\n4\tD\n5\tQ\n6\tE\n7\tH\n8\tP\n9\tY\n10\tW\n11\tS\n12\tT\n13\tG\n14\tA\n15\tM\n16\tC\n17\tF\n18\tL\n19\tV\n20\tI") +AAfreq = merge(AAfreq, AAorder, by.x='Var1', by.y='AA', all.x=TRUE) + +AAfreq = AAfreq[!is.na(AAfreq$order.aa),] + +AAfreqplot = ggplot(AAfreq) +AAfreqplot = AAfreqplot + geom_bar(aes( x=factor(reorder(Var1, order.aa)), y = freq_perc, fill = Sample), stat='identity', position='dodge' ) +AAfreqplot = AAfreqplot + annotate("rect", xmin = 0.5, xmax = 2.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) +AAfreqplot = AAfreqplot + annotate("rect", xmin = 3.5, xmax = 4.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) +AAfreqplot = AAfreqplot + annotate("rect", xmin = 5.5, xmax = 6.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) +AAfreqplot = AAfreqplot + annotate("rect", xmin = 6.5, xmax = 7.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) +AAfreqplot = AAfreqplot + ggtitle("Amino Acid Composition in the CDR3") + xlab("Amino Acid, from Hydrophilic (left) to Hydrophobic (right)") + ylab("Percentage") + +png("AAComposition.png",width = 1280, height = 720) +AAfreqplot +dev.off() +write.table(AAfreq, "AAComposition.csv" , sep=",",quote=F,na="-",row.names=F,col.names=T) + + |
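| Note: the per-sample and per-replicate bookkeeping in RScript.r leans on data.table's .N, the row count per group. A self-contained sketch of the idiom on hypothetical toy data:
library(data.table)
dt = data.table(Sample=c("s1","s1","s2"), Replicate=c(1,2,1))
data.frame(dt[, list(All=.N), by=c("Sample")])              # s1=2, s2=1
data.frame(dt[, list(All=.N), by=c("Sample", "Replicate")]) # one count per sample/replicate pair |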
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/LTe50046.ttf |
| Binary file report_clonality/circos/LTe50046.ttf has changed |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/LTe50048.ttf |
| Binary file report_clonality/circos/LTe50048.ttf has changed |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/LTe50050.ttf |
| Binary file report_clonality/circos/LTe50050.ttf has changed |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/LTe50054.ttf |
| Binary file report_clonality/circos/LTe50054.ttf has changed |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/circos.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/report_clonality/circos/circos.conf Mon Dec 12 05:22:57 2016 -0500 |
| @@ -0,0 +1,148 @@ + +# This is the main configuration file for the Circos tableviewer. This file also +# depends on colors.conf (definitions of basic colors), ideogram.conf (size and spacing of +# row/cell segments), and ticks.conf (tick spacing and label definitions - these are turned +# off by default). +# +# In addition to these configuration files, the bin/make-conf script creates +# colors.conf (colors of row/col segments) and colors_percentile.conf (colors based on +# cell percentile values). These configuration files are also included via the <<include>> directive. +# +# Some elements of the output image are toggled off by default (e.g. row and column highlights, +# anchor links to segment labels, tick marks). + +<colors> +<<include DATA_DIR/etc_colors.conf>> +<<include DATA_DIR/colors.conf>> +<<include DATA_DIR/colors_percentile.conf>> +</colors> + +<fonts> +<<include DATA_DIR/fonts.conf>> +</fonts> + +<<include DATA_DIR/ideogram.conf>> +<<include DATA_DIR/ticks.conf>> + +karyotype = DATA_DIR/karyotype.txt + 
+<image> +dir = DATA_DIR +file = circos.png +24bit = yes +svg = no +png = yes +pdf = no +# radius of inscribed circle in image +radius = 1500p +background = white +# by default angle=0 is at 3 o'clock position +angle_offset = -180 +auto_alpha_colors = yes +auto_alpha_steps = 5 +</image> + +chromosomes_units = 10 +chromosomes_display_default = yes +chromosomes_order_by_karyotype = yes + 
+<highlights> + +show = yes + +<highlight> +show = no +file = DATA_DIR/row.txt +r0 = 1r+200p +r1 = 1r+220p +stroke_color = black +stroke_thickness = 2 +</highlight> + +<highlight> +show = no +file = DATA_DIR/col.txt +r0 = 1r+230p +r1 = 1r+250p +stroke_color = black +stroke_thickness = 2 +</highlight> + +<highlight> +show = no +file = DATA_DIR/all.txt +r0 = 1r+10p +r1 = 1r+35p +stroke_color = black +stroke_thickness = 2 +</highlight> + +</highlights> + 
+<plots> + +<plot> +type = text +file = DATA_DIR/segmentlabel.txt +label_font = condensedbold +color = black +label_size = 30p +r0 = 1r+50p +r1 = 1r+500p +rpadding = 0p +padding = 0p + +show_links = no +link_dims = 0p,10p,32p,10p,5p +link_thickness = 3p +link_color = black + +label_snuggle = no +# shift label up to its height in pixels in the angular direction +max_snuggle_distance = 2r +snuggle_sampling = 2 +snuggle_tolerance = 0.25r + +</plot> + +</plots> + 
+<links> + +<link cellvalues> +ribbon = yes +flat = yes +file = DATA_DIR/cells.txt +bezier_radius = 0.0r +radius = 0.999r-15p +thickness = 1 +color = grey +stroke_color = black +stroke_thickness = 1 +<rules> + +<rule> +importance = 95 +condition = 1 +radius1 = 0.999r+2p +flow = continue +</rule> + +</rules> + +</link> + +</links> + 
+#anglestep = 0.5 +#minslicestep = 10 +#beziersamples = 40 +#debug = no +#warnings = no +#imagemap = no + +#units_ok = bupr +#units_nounit = n + +<<include DATA_DIR/housekeeping.conf>> + |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/circos.tar.gz |
| Binary file report_clonality/circos/circos.tar.gz has changed |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/etc_colors.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/report_clonality/circos/etc_colors.conf Mon Dec 12 05:22:57 2016 -0500 |
| @@ -0,0 +1,142 @@ +# RGB color definition. Colors are referred to within configuration files +# by their name. In order to use a color, you must define it here. +# +# e.g. if you really must use 'bisque', then add +# +# bisque = 255,228,196 +# +# Many useful colors are already defined. In general, given a HUE, these +# colors are defined +# +# vlHUE (very light HUE, e.g. vlred) +# lHUE (light HUE, e.g. lred) +# HUE (e.g. red) +# dHUE (dark HUE, e.g. dred) +# +# In addition to hues, two other color groups are defined. +# +# - cytogenetic band colors (e.g. gposNNN, acen, stalk, etc.) which +# correspond to colors on ideogram bands +# - UCSC chromosome color palette (e.g. chrNN, chrUn, chrNA) + 
+optblue = 55,133,221 +optgreen = 55,221,125 +optyellow = 221,215,55 +optorange = 221,164,55 +optred = 221,55,55 +optviolet = 145,55,221 +optpurple = 219,55,221 + +white = 255,255,255 +vvvvlgrey = 250,250,250 +vvvlgrey = 240,240,240 +vvlgrey = 230,230,230 +vlgrey = 220,220,220 +lgrey = 210,210,210 +grey = 200,200,200 +dgrey = 170,170,170 +vdgrey = 140,140,140 +vvdgrey = 100,100,100 +vvvdgrey = 70,70,70 +vvvvdgrey = 40,40,40 +black = 0,0,0 + +vlred = 255,193,200 +lred = 255,122,137 +red = 247,42,66 +dred = 205,51,69 + +vlgreen = 204,255,218 +lgreen = 128,255,164 +green = 51,204,94 +dgreen = 38,153,71 + +vlblue = 128,176,255 +lblue = 64,137,255 +blue = 54,116,217 +dblue = 38,82,153 + +vlpurple= 242,128,255 +lpurple = 236,64,255 +purple = 189,51,204 +dpurple = 118,32,128 + +vlyellow = 255,253,202 +lyellow = 255,252,150 +yellow = 255,255,0 +dyellow = 191,186,48 + +lime = 186,255,0 + +vlorange = 255,228,193 +lorange = 255,187,110 +orange = 255,136,0 +dorange = 221,143,55 + 
+# karyotype colors + +gpos100 = 0,0,0 +gpos = 0,0,0 +gpos75 = 130,130,130 +gpos66 = 160,160,160 +gpos50 = 200,200,200 +gpos33 = 210,210,210 +gpos25 = 200,200,200 +gvar = 220,220,220 +gneg = 255,255,255 +acen = 217,47,39 +stalk = 100,127,164 + +# others + +select = 135,177,255 + +# new york times cmyk-safe + +# roygbiv - normal +nyt_blue = 104,152,178 +nyt_green = 137,129,96 +nyt_yellow = 241,221,117 +nyt_orange = 230,146,57 +nyt_red = 217,47,39 + 
+# chromosome color map (UCSC) + +chr1 = 153,102,0 +chr2 = 102,102,0 +chr3 = 153,153,30 +chr4 = 204,0,0 +chr5 = 255,0,0 +chr6 = 255,0,204 +chr7 = 255,204,204 +chr8 = 255,153,0 +chr9 = 255,204,0 +chr10 = 255,255,0 +chr11 = 204,255,0 +chr12 = 0,255,0 +chr13 = 53,128,0 +chr14 = 0,0,204 +chr15 = 102,153,255 +chr16 = 153,204,255 +chr17 = 0,255,255 +chr18 = 204,255,255 +chr19 = 153,0,204 +chr20 = 204,51,255 +chr21 = 204,153,255 +chr22 = 102,102,102 +chr23 = 153,153,153 +chrX = 153,153,153 +chr24 = 204,204,204 +chrY = 204,204,204 +chrM = 204,204,153 +chr0 = 204,204,153 +chrUn = 121,204,61 +chrNA = 255,255,255 + + + + + + + + |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/fonts.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/report_clonality/circos/fonts.conf Mon Dec 12 05:22:57 2016 -0500 |
| @@ -0,0 +1,8 @@ + +default = LTe50046.ttf +normal = LTe50046.ttf +bold = LTe50048.ttf +condensed = LTe50050.ttf +condensedbold = LTe50054.ttf +mono = pragmata.ttf +glyph = wingding.ttf |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/housekeeping.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/report_clonality/circos/housekeeping.conf Mon Dec 12 05:22:57 2016 -0500 |
| @@ -0,0 +1,230 @@ +# Housekeeping parameters, which must be included +# at the top level of the configuration. +# +# Don't adjust these, unless you know what you are doing, or +# feel like experimenting + +anglestep = 0.5 +minslicestep = 10 +beziersamples = 40 # bezier curves are drawn piece-wise + # linear, with this many samples +debug = no +warnings = no +imagemap = no +paranoid = yes + +units_ok = bupr +units_nounit = n + 
+# \t tab +# \s any whitespace +file_delim = \s +# collapse adjacent whitespace +# e.g. two spaces are treated as one, not as a missing field +file_delim_collapse = yes + +# Record delimiter for parameter values that are lists, such as +# hs1:0.25;hs2:0.10. By default, both ; and , are accepted +# +# e.g. hs1:0.25,hs2:0.10 +# hs1:0.25;hs2:0.10 +list_record_delim = \s*[;,]\s* +# Field delimiter specifies the assignment operator, e.g. +list_field_delim = \s*[:=]\s* + 
+# Rule fields and other parameters accept var(VARIABLE) syntax +# to reference parameters of data points. By default, if +# VARIABLE does not exist, Circos quits with an error, unless +# the skip parameter below is set. +# +# This feature is useful when you have data that don't always +# have the same options. For example, +# +# chr1 10 20 a=10 +# chr1 50 60 b=10 +skip_missing_expression_vars = no + +# In old versions, data point parameters were referenced using _NAME_ +# syntax. This has been replaced with var(NAME). The _NAME_ syntax is +# deprecated (for example, it will break when dealing with fields like +# gene_a_1). If you must use it, set the parameter below. + +legacy_underline_expression_syntax = no + 
+# Magnification factor for text in SVG files. +svg_font_scale = 1.3 + +# default font - pick one of the keys from <fonts> block +default_font = default +# default font name is used for SVG files for cases where +# the font definition does not include a name +# see etc/fonts.conf for details +default_font_name = Arial +default_font_color = black + +# default color for cases when color is not specified +default_color = black + +<guides> +thickness = 1 +size = 5 +type = outline +<object> +all = no +ideogram = no +ideogram_label = no +</object> +<color> +default = lblue +text = red +</color> +</guides> + 
+# Receive debug messages about actions +# +# Comma-separated list of one or more of the following +# +# summary - top level indications of what's happening +# chrfilter - ideogram filtering (parsing 'chromosomes' parameter) +# conf - configuration file +# counter - plot counters +# spacing - ideogram spacing +# scale - ideogram scaling +# ideogram - ideogram reporting +# anglepos - report angle positions for base pair coordinates +# zoom - zoom regions +# layers - specific plot z-layers +# rules - dynamic rules +# text - text tracks +# heatmap - detailed heatmap reports +# brush - brushes +# color - color allocation and definition +# ticks - tick marks and labels +# timers - some benchmark timings +# cache - caches +# _all - turn on all groups +debug_group = summary + +# run length duration after which timing report is automatically +# generated at the end of the run +debug_auto_timer_report = 30 + +debug_word_separator = " " +debug_undef_text = _undef_ +debug_empty_text = _emptylist_ + +# parameters passed to functions can be +# validated to check consistency. turn this +# off to speed things up +debug_validate = yes + +# Reformat numbers in debug output for consistency. +# If you have a lot of debug output (e.g. -debug_group _all) +# this will slow things considerably. 
+debug_output_tidy = no + +# pixel sub-sampling for text tracks +text_pixel_subsampling = 1 +# array or span - use 'span' if applying snuggle refinement +text_snuggle_method = array + +# restrict names of parameters? +# if 'yes' then only certain named parameters are allowed within +# blocks and option fields for data +# +# if 'no' then you can define parameters with any name what-so-ever, +# useful if you wish to define states or labels for your data +# +# e.g. hs1 10 20 0.5 paired=yes,special=no,myvar=0.5 +# +# ordinarily, 'paired', 'special' and 'myvar' would not be allowed +restrict_parameter_names = no + +# Unless set to 'yes', parameter names will be converted to lowercase +case_sensitive_parameter_names = no + +# The location of configuration and data files will be guessed if +# (a) configuration file is not specified +# (b) data file paths are relative +# Circos will look in the following locations, where +# SCRIPTPATH is the location of the 'circos' script (e.g. /usr/local/bin) and +# CWD is the current directory (where the 'circos' command was executed). +# All paths under CWD will be scanned first, then under SCRIPTPATH. +# +# {CWD,SCRIPTPATH}/. +# {CWD,SCRIPTPATH}/.. +# {CWD,SCRIPTPATH}/etc/ +# {CWD,SCRIPTPATH}/../etc +# {CWD,SCRIPTPATH}/../../etc +# {CWD,SCRIPTPATH}/data +# {CWD,SCRIPTPATH}/../data +# {CWD,SCRIPTPATH}/../../data +# +# If you would like to prepend this list with custom directories for +# data files, enter them as a CSV list here +# data_path = /home/martink/circos-tutorials +# If the cache is static, it will always be used and will not be updated +# unless it is deleted (use -color_cache_rebuild on the command line). +# Otherwise, the cache will be updated if +# - config file is newer than cache file +# - list of colors in config file is different than in cache file +color_cache_static = yes +color_cache_file = circos.colorlist +color_lists_use = yes +# if the directory is not defined, then the system will guess a temporary +# directory compatible with your operating system (using File::Temp) +# color_cache_dir = /tmp + +# Make some functions faster. This should always be 'yes' unless you +# want things to run slowly or suspect deep issues. +memoize = yes + +# This is a debugging flag and should be set to 'no' for regular use +quit_on_dump = yes + +offsets = 0,0 + +# Maximum number of image and data elements. If these are exceeded, +# Circos will quit with an error. These values are arbitrary, but in +# my experience images with significantly more data points than this +# are uninterpretable. + +max_ticks = 5000 +max_ideograms = 200 +max_links = 25000 +max_points_per_track = 25000 + +# What to do when data is found for an ideogram that does not appear in the karyotype file. + +# Set to 'skip' or 'exit' +undefined_ideogram = skip + +# Number of iterations for determining ideogram sizes when +# relative scale is used. +relative_scale_iterations = 10 + +# min, max, average, mode - based on scale statistics of ALL ideograms +# minadj, maxadj, averageadj, modeadj - based on scale statistics of adjacent ideograms +# +# You can specify a fixed scale for spacing using a floating value +# +# e.g. relative_scale_spacing = 1.5 +relative_scale_spacing = mode + +# What to do with out-of-range data. Either 'clip' or 'hide' +data_out_of_range = clip + +# Track default directory +track_defaults = etc/tracks + +# Use round brushes for elements with thickness greater than round_brush_min_thickness? 
+round_brush_use = yes +round_brush_min_thickness = 5 + +# Use anti aliasing, where possible? I've seen bugs in some gd libraries +# that cause artefacts to appear when lines are anti-aliased. If your +# image contains unexpected elements, turn aa off. +anti_aliasing = yes + +# A parameter that must be set. Checks whether this file was imported. +housekeeping = yes |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/ideogram.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/report_clonality/circos/ideogram.conf Mon Dec 12 05:22:57 2016 -0500 |
| @@ -0,0 +1,37 @@ + +<ideogram> + +<spacing> + +default = 0.0025r + +</spacing> + +thickness = 50p +stroke_thickness = 0 +# ideogram border color +stroke_color = black +fill = yes +# the default chromosome color is set here and any value +# defined in the karyotype file overrides it +fill_color = black + +# fractional radius position of chromosome ideogram within image +radius = 0.85r +show_label = no +label_font = condensedbold +label_radius = 0.99r +label_size = 36 + +# cytogenetic bands +band_stroke_thickness = 2 + +# show_bands determines whether the outline of cytogenetic bands +# will be seen +show_bands = no +# in order to fill the bands with the color defined in the karyotype +# file you must set fill_bands +fill_bands = yes + +</ideogram> + |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/parse-table.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/report_clonality/circos/parse-table.conf Mon Dec 12 05:22:57 2016 -0500 |
| b"@@ -0,0 +1,391 @@\n+\n+################################################################\n+#\n+# This is a fairly complicated configuration file. Take your time in\n+# experimenting and adjust one thing at a time :)\n+#\n+################################################################\n+\n+max_col_num = 200\n+max_row_num = 200\n+\n+# skip this many rows before reading in header and data\n+skip_rows = 0\n+\n+# is there a header line that identifies the columns?\n+header = yes\n+\n+# is there a row that specifies the order of columns in the image?\n+# - if so, this must be the first line of the header\n+# - if the line exists (col_order_row=yes), employ the use_col_order_row to toggle whether it is used\n+col_order_row = no\n+use_col_order_row = no\n+\n+# is there a row that specifies the size of columns in the image?\n+# - if so, this must be the next line of the header\n+# - if the line exists (col_size_row=yes), employ the use_col_size_row to toggle whether it is used\n+col_size_row = no\n+use_col_size_row = no\n+\n+# is there a row that specifies the color of each column segment in the image?\n+# - if so, this must be the next line of the header\n+# - if the line exists (col_color_row=yes), employ the use_col_color_row to toggle whether it is used\n+col_color_row = no\n+use_col_color_row = no\n+\n+# is there a column that specifies the order of rows in the image?\n+# - if so, this must be the first column\n+# - if the line exists (row_order_col=yes), employ the use_row_order_col to toggle whether it is used\n+row_order_col = no\n+use_row_order_col = no\n+\n+# is there a column that specifies the color of each row segment in the image?\n+# - if so, this must be the second column\n+# - if the line exists (row_color_col=yes), employ the use_row_color_col to toggle whether it is used\n+row_color_col = no\n+use_row_color_col = no\n+\n+# if you do not have a column/row that explicitly defines order\n+# of segments in the image, you can set this here. 
Use one (or more) of \n+# these values to specify how segments should be ordered.\n+# - row_major (row segments first, then column)\n+# - col_major (col segments first, then row)\n+# - ascii (asciibetic order)\n+# - row_size (total of rows for the segment - useful if the segment has both row and column contributions)\n+# - col_size (total of colums for the segment - useful if the segment has both row and column contributions)\n+# - row_to_col_ratio (ratio of total of rows to columns for the segment)\n+# - col_to_row_ratio (ratio of total of rows to columns for the segment)\n+# - size_asc (size, in ascending order)\n+# - size_desc (size, in descending order)\n+\n+#segment_order = row_to_col_ratio,size_desc # col_major,size_desc\n+#segment_order = size_desc\n+segment_order = row_major,size_desc\n+#segment_order = ascii\n+#segment_order = file:etc/order-by-table-remapped.txt\n+#segment_order = size_desc,row_to_col_ratio\n+segment_color_order = row_major,size_desc\n+\n+# values for segments can be normalized if the use_segment_normalization is set to yes\n+use_segment_normalization = no\n+\n+# the normalization function can be one of the following, and is applied to\n+# all values that correspond to the segment's label\n+# total - sum of cell values for the segment label (row and col)\n+# average - average of cell values for the segment label (row and col)\n+# row_total, row_average - sum or average for cell values for the segment row\n+# col_total, col_average - sum or average for cell values for the segment col\n+# row_size, col_size, total_size - based on the optional size column (see col_size_row and row_size_col above)\n+# VALUE - segments are scaled to a constant VALUE (e.g. 1000)\n+segment_normalization_function = 1000\n+\n+# normalization can be performed by either altering the actual data values or\n+# by applying a visual scaling of the segments. When 'value' is used, the data\n+# is changed. When 'visual' is used, then a chromosomes_scale line is reported\n+# by this script which you must include in circos.conf for "..b' values or only unique values when\n+# calculating percentiles\n+percentile_unique_only = yes\n+\n+# use a function, f(X), to remap cell values when calculating percentiles\n+# for the purpose of color mapping. This allows you to apply a remapping to how\n+# colors are calculated, without actually changing the values. The remap\n+# applies only if percentile_method=value\n+\n+# percentile_remap = sqrt(X)\n+\n+# Which cell value set to use for percentile color mapping\n+# raw - original values\n+# filtered - values that pass min/max filters\n+# scaled - filtered values that have been scaled if use_scaling is set\n+percentile_data_domain = raw\n+\n+<colors>\n+h0 = 0\n+s0 = 1\n+v0 = 1\n+h1 = 300\n+s1 = 1\n+v1 = 1\n+</colors>\n+\n+# You can control the color and stroke of ribbons for each\n+# quartile (q1, q2, q3, q4). Any values defined here will\n+# overwrite colors determined by remapping. \n+#\n+# For example, if you have a lot of cells and wish to attenuate\n+# the visibility of ribbons associated with small values, you can\n+# set cell_q1_color=vvlgrey,cell_q1_nostroke=yes to fade the\n+# ribbons into the background.\n+\n+#cell_q1_color = vvlgrey\n+#cell_q2_color = vlgrey\n+#cell_q3_color = lgrey\n+#cell_q4_color = red\n+#cell_q1_nostroke = yes\n+#cell_q2_nostroke = yes\n+#cell_q3_nostroke = yes\n+#cell_q4_nostroke = yes\n+\n+# cell value multiplier, required when all data is small (e.g. 
<1), in which\n+# case set the multiplier to something like 1000 because Circos\n+# works only with integer scales\n+\n+data_mult = 1\n+\n+################################################################\n+# Segment labels can be optionally set to a size that is\n+# proportional to the size of the segment. Set min/max size\n+# values here. If this line is commented out, then the label\n+# size is determined by the circos.conf file used to draw the image\n+\n+#segment_label_size_range = 60,60\n+\n+# progression controls how fast the label size changes from\n+# min to max (larger value of progression means values close to max\n+# are achieved for smaller segments)\n+\n+segment_label_size_progression = 4\n+\n+segment_label_uppercase = no\n+\n+################################################################\n+# Segment colors can be specified in the data file (in this\n+# case use row_color_col and col_color_row), otherwise colors\n+# are interpolated within an HSV range. Color interpolation can be\n+# done in two ways: based on segment index (interpolation steps through\n+# colors uniformly for each segment) and total size (interpolation\n+# steps through colors in proportion to segment size).\n+\n+<segment_colors>\n+interpolate_type = size # size or count\n+h0 = 0\n+s0 = 0.8\n+v0 = 0.9\n+h1 = 300\n+s1 = 0.8\n+v1 = 0.9\n+</segment_colors>\n+\n+################################################################\n+# Shorten the labels of segments. Specify whether to do this\n+# with shorten_text=yes|no parameter and provide regular\n+# expressions in string_replace which define the text to\n+# replace. \n+\n+shorten_text = yes\n+\n+<string_replace>\n+IGH = \n+</string_replace>\n+\n+# exit on any error\n+strict_sanity = yes\n+\n+################################################################\n+# if the segment_prefix is set, then rows and columns will be\n+# renamed to internal fields segment_prefix + DIGIT\n+\n+#segment_prefix = id\n+color_prefix = color\n+\n+################################################################\n+# Delimiters\n+\n+# field delimiter regular expression\n+# if this is not defined, any whitespace will be considered a delimiter\n+field_delim = \\s\n+\n+# collapse adjacent delimiters?\n+field_delim_collapse = yes\n+\n+# remove any leading space in the input file\n+# by default, this is on - if you set this to "no", make sure that you don\'t have any leading spaces in your table!\n+strip_leading_space = yes\n+\n+# remove quotes and thousand separators - concatenate characters to remove\n+#\n+# e.g. to remove characters a b c set remove_cell_rx=abc\n+# e.g. to remove characters " \' , set remove_cell_rx="\',\n+remove_cell_rx = "\',\n+\n' |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/pragmata.ttf |
| Binary file report_clonality/circos/pragmata.ttf has changed |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/ticks.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/report_clonality/circos/ticks.conf Mon Dec 12 05:22:57 2016 -0500 |
| @@ -0,0 +1,65 @@ + +show_ticks = no +show_tick_labels = no +chrticklabels = yes +chrticklabelfont = default + +grid_start = dims(ideogram,radius_inner)-0.5r +grid_end = dims(ideogram,radius_outer)+100 + +<ticks> +skip_first_label = no +skip_last_label = no +radius = dims(ideogram,radius_outer) +label_offset = 0p +color = black + +<tick> +spacing = 5u +spacing_type = relative +rspacing = 0.1 +size = 3p +thickness = 2p +color = dgrey +show_label = yes +label_size = 16p +label_offset = 3p +label_relative = yes +format = %d +grid = yes +grid_color = dgrey +grid_thickness = 1p +suffix = % +rmultiplier = 100 +offset = 40p +</tick> + +<tick> +spacing = 50u +size = 3p +thickness = 2p +color = black +show_label = yes +label_size = 12p +label_offset = 3p +format = %d +grid = yes +grid_color = dgrey +grid_thickness = 1p +</tick> + +<tick> +spacing = 10u +size = 2p +thickness = 1p +color = black +show_label = no +label_size = 12p +label_offset = 3p +format = %d +grid = yes +grid_color = dgrey +grid_thickness = 1p +</tick> + +</ticks> |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/circos/wingding.ttf |
| Binary file report_clonality/circos/wingding.ttf has changed |
| diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/genes.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/report_clonality/genes.txt Mon Dec 12 05:22:57 2016 -0500 |
@@ -0,0 +1,3306 @@
+Species	IMGT.GENE.DB	locus	region	functional	chr.order
+Bos taurus functional	TRAJ1	TRA	J	TRUE	1
+Bos taurus functional	TRAJ10	TRA	J	TRUE	2
+Bos taurus functional	TRAJ11	TRA	J	TRUE	3
+Bos taurus non-functional	TRAJ12	TRA	J	FALSE	4
+Bos taurus functional	TRAJ13	TRA	J	TRUE	5
+Bos taurus functional	TRAJ14	TRA	J	TRUE	6
+Bos taurus functional	TRAJ15	TRA	J	TRUE	7
+Bos taurus functional	TRAJ16	TRA	J	TRUE	8
+Bos taurus functional	TRAJ17	TRA	J	TRUE	9
+Bos taurus functional	TRAJ18	TRA	J	TRUE	10
+Bos taurus functional	TRAJ19	TRA	J	TRUE	11
+Bos taurus functional	TRAJ2	TRA	J	TRUE	12
+Bos taurus functional	TRAJ20	TRA	J	TRUE	13
+Bos taurus functional	TRAJ21	TRA	J	TRUE	14
+Bos taurus functional	TRAJ22	TRA	J	TRUE	15
+Bos taurus functional	TRAJ23	TRA	J	TRUE	16
+Bos taurus functional	TRAJ24	TRA	J	TRUE	17
+Bos taurus functional	TRAJ25	TRA	J	TRUE	18
+Bos taurus functional	TRAJ26	TRA	J	TRUE	19
+Bos taurus non-functional	TRAJ27	TRA	J	FALSE	20
+Bos taurus functional	TRAJ28	TRA	J	TRUE	21
+Bos taurus functional	TRAJ29	TRA	J	TRUE	22
+Bos taurus functional	TRAJ3	TRA	J	TRUE	23
+Bos taurus non-functional	TRAJ30	TRA	J	FALSE	24
+Bos taurus functional	TRAJ31	TRA	J	TRUE	25
+Bos taurus functional	TRAJ32	TRA	J	TRUE	26
+Bos taurus functional	TRAJ33	TRA	J	TRUE	27
+Bos taurus functional	TRAJ34	TRA	J	TRUE	28
+Bos taurus functional	TRAJ35	TRA	J	TRUE	29
+Bos taurus functional	TRAJ36	TRA	J	TRUE	30
+Bos taurus functional	TRAJ37	TRA	J	TRUE	31
+Bos taurus functional	TRAJ38	TRA	J	TRUE	32
+Bos taurus functional	TRAJ39	TRA	J	TRUE	33
+Bos taurus non-functional	TRAJ4	TRA	J	FALSE	34
+Bos taurus functional	TRAJ40	TRA	J	TRUE	35
+Bos taurus functional	TRAJ41	TRA	J	TRUE	36
+Bos taurus functional	TRAJ42	TRA	J	TRUE	37
+Bos taurus functional	TRAJ43	TRA	J	TRUE	38
+Bos taurus functional	TRAJ44	TRA	J	TRUE	39
+Bos taurus functional	TRAJ45	TRA	J	TRUE	40
+Bos taurus functional	TRAJ46	TRA	J	TRUE	41
+Bos taurus non-functional	TRAJ47	TRA	J	FALSE	42
+Bos taurus functional	TRAJ48	TRA	J	TRUE	43
+Bos taurus functional	TRAJ49	TRA	J	TRUE	44
+Bos taurus functional	TRAJ5	TRA	J	TRUE	45
+Bos taurus functional	TRAJ50	TRA	J	TRUE	46
+Bos taurus functional	TRAJ51	TRA	J	TRUE	47
+Bos taurus non-functional	TRAJ52	TRA	J	FALSE	48
+Bos taurus functional	TRAJ6	TRA	J	TRUE	49
+Bos taurus functional	TRAJ7	TRA	J	TRUE	50
+Bos taurus functional	TRAJ8	TRA	J	TRUE	51
+Bos taurus functional	TRAJ9	TRA	J	TRUE	52
+Bos taurus functional	TRDD1	TRD	D	TRUE	53
+Bos taurus functional	TRDD2	TRD	D	TRUE	54
+Bos taurus functional	TRDD3	TRD	D	TRUE	55
+Bos taurus functional	TRDD4	TRD	D	TRUE	56
+Bos taurus functional	TRDD5	TRD	D	TRUE	57
+Bos taurus functional	TRDJ1	TRD	J	TRUE	58
+Bos taurus functional	TRDJ2	TRD	J	TRUE	59
+Bos taurus functional	TRDJ3	TRD	J	TRUE	60
+Bos taurus functional	TRDV1S1	TRD	V	TRUE	61
+Bos taurus functional	TRDV1S10	TRD	V	TRUE	62
+Bos taurus functional	TRDV1S11	TRD	V	TRUE	63
+Bos taurus functional	TRDV1S12	TRD	V	TRUE	64
+Bos taurus functional	TRDV1S13-1	TRD	V	TRUE	65
+Bos taurus functional	TRDV1S13-2	TRD	V	TRUE	66
+Bos taurus functional	TRDV1S14	TRD	V	TRUE	67
+Bos taurus functional	TRDV1S15-1	TRD	V	TRUE	68
+Bos taurus functional	TRDV1S15-2	TRD	V	TRUE	69
+Bos taurus functional	TRDV1S16	TRD	V	TRUE	70
+Bos taurus functional	TRDV1S17	TRD	V	TRUE	71
+Bos taurus functional	TRDV1S18-1	TRD	V	TRUE	72
+Bos taurus functional	TRDV1S18-2	TRD	V	TRUE	73
+Bos taurus functional	TRDV1S19	TRD	V	TRUE	74
+Bos taurus non-functional	TRDV1S2-1	TRD	V	FALSE	75
+Bos taurus non-functional	TRDV1S2-2	TRD	V	FALSE	76
+Bos taurus functional	TRDV1S20	TRD	V	TRUE	77
+Bos taurus functional	TRDV1S21-1	TRD	V	TRUE	78
+Bos taurus functional	TRDV1S21-2	TRD	V	TRUE	79
+Bos taurus non-functional	TRDV1S22	TRD	V	FALSE	80
+Bos taurus functional	TRDV1S23	TRD	V	TRUE	81
+Bos taurus functional	TRDV1S24	TRD	V	TRUE	82
+Bos taurus non-functional	TRDV1S25-1	TRD	V	FALSE	83
+Bos taurus non-functional	TRDV1S25-2	TRD	V	FALSE	84
+Bos taurus functional	TRDV1S26	TRD	V	TRUE	85
...TRUE	3225
+Rattus norvegicus functional	IGLV3S1	IGL	V	TRUE	3226
+Rattus norvegicus functional	IGLV3S2	IGL	V	TRUE	3227
+Rattus norvegicus functional	IGLV3S3	IGL	V	TRUE	3228
+Rattus norvegicus functional	IGLV3S4	IGL	V	TRUE	3229
+Rattus norvegicus functional	IGLV3S5	IGL	V	TRUE	3230
+Sus scrofa functional	IGHD	IGH	D	TRUE	3231
+Sus scrofa functional	IGHD1	IGH	D	TRUE	3232
+Sus scrofa functional	IGHD2	IGH	D	TRUE	3233
+Sus scrofa functional	IGHD3	IGH	D	TRUE	3234
+Sus scrofa functional	IGHD4	IGH	D	TRUE	3235
+Sus scrofa functional	IGHJ1	IGH	J	TRUE	3236
+Sus scrofa functional	IGHJ2	IGH	J	TRUE	3237
+Sus scrofa functional	IGHJ3	IGH	J	TRUE	3238
+Sus scrofa functional	IGHJ4	IGH	J	TRUE	3239
+Sus scrofa functional	IGHJ5	IGH	J	TRUE	3240
+Sus scrofa non-functional	IGHV1-1	IGH	V	FALSE	3241
+Sus scrofa functional	IGHV1-10	IGH	V	TRUE	3242
+Sus scrofa functional	IGHV1-11	IGH	V	TRUE	3243
+Sus scrofa functional	IGHV1-12	IGH	V	TRUE	3244
+Sus scrofa non-functional	IGHV1-13	IGH	V	FALSE	3245
+Sus scrofa functional	IGHV1-14	IGH	V	TRUE	3246
+Sus scrofa functional	IGHV1-15	IGH	V	TRUE	3247
+Sus scrofa functional	IGHV1-2	IGH	V	TRUE	3248
+Sus scrofa non-functional	IGHV1-3	IGH	V	FALSE	3249
+Sus scrofa functional	IGHV1-4	IGH	V	TRUE	3250
+Sus scrofa functional	IGHV1-5	IGH	V	TRUE	3251
+Sus scrofa functional	IGHV1-6	IGH	V	TRUE	3252
+Sus scrofa non-functional	IGHV1-7	IGH	V	FALSE	3253
+Sus scrofa functional	IGHV1-8	IGH	V	TRUE	3254
+Sus scrofa non-functional	IGHV1-9	IGH	V	FALSE	3255
+Sus scrofa functional	IGHV1S2	IGH	V	TRUE	3256
+Sus scrofa non-functional	IGHV1S3	IGH	V	FALSE	3257
+Sus scrofa functional	IGHV1S5	IGH	V	TRUE	3258
+Sus scrofa functional	IGHV1S6	IGH	V	TRUE	3259
+Sus scrofa non-functional	IGHV1S7	IGH	V	FALSE	3260
+Sus scrofa non-functional	IGHV1S8	IGH	V	FALSE	3261
+Sus scrofa functional	IGKJ1	IGK	J	TRUE	3262
+Sus scrofa functional	IGKJ2	IGK	J	TRUE	3263
+Sus scrofa functional	IGKJ3	IGK	J	TRUE	3264
+Sus scrofa functional	IGKJ4	IGK	J	TRUE	3265
+Sus scrofa functional	IGKJ5	IGK	J	TRUE	3266
+Sus scrofa functional	IGKV1-11	IGK	V	TRUE	3267
+Sus scrofa functional	IGKV1-14	IGK	V	TRUE	3268
+Sus scrofa functional	IGKV1-7	IGK	V	TRUE	3269
+Sus scrofa functional	IGKV1-9	IGK	V	TRUE	3270
+Sus scrofa functional	IGKV1D-11	IGK	V	TRUE	3271
+Sus scrofa functional	IGKV2-10	IGK	V	TRUE	3272
+Sus scrofa functional	IGKV2-12	IGK	V	TRUE	3273
+Sus scrofa functional	IGKV2-13	IGK	V	TRUE	3274
+Sus scrofa non-functional	IGKV2-5	IGK	V	FALSE	3275
+Sus scrofa functional	IGKV2-6	IGK	V	TRUE	3276
+Sus scrofa functional	IGKV2-8	IGK	V	TRUE	3277
+Sus scrofa non-functional	IGKV2/OR3-1	IGK	V	FALSE	3278
+Sus scrofa functional	IGKV2D-12	IGK	V	TRUE	3279
+Sus scrofa non-functional	IGKV3-3	IGK	V	FALSE	3280
+Sus scrofa non-functional	IGKV5-4	IGK	V	FALSE	3281
+Sus scrofa functional	IGLJ2	IGL	J	TRUE	3282
+Sus scrofa functional	IGLJ3	IGL	J	TRUE	3283
+Sus scrofa functional	IGLJ4	IGL	J	TRUE	3284
+Sus scrofa non-functional	IGLV(III)-8	IGL	V	FALSE	3285
+Sus scrofa non-functional	IGLV1-15	IGL	V	FALSE	3286
+Sus scrofa non-functional	IGLV1-20	IGL	V	FALSE	3287
+Sus scrofa functional	IGLV2-6	IGL	V	TRUE	3288
+Sus scrofa non-functional	IGLV3-1	IGL	V	FALSE	3289
+Sus scrofa functional	IGLV3-2	IGL	V	TRUE	3290
+Sus scrofa functional	IGLV3-3	IGL	V	TRUE	3291
+Sus scrofa functional	IGLV3-4	IGL	V	TRUE	3292
+Sus scrofa functional	IGLV3-5	IGL	V	TRUE	3293
+Sus scrofa non-functional	IGLV5-11	IGL	V	FALSE	3294
+Sus scrofa functional	IGLV5-14	IGL	V	TRUE	3295
+Sus scrofa non-functional	IGLV5-17	IGL	V	FALSE	3296
+Sus scrofa non-functional	IGLV5-22	IGL	V	FALSE	3297
+Sus scrofa non-functional	IGLV7-7	IGL	V	FALSE	3298
+Sus scrofa non-functional	IGLV7-9	IGL	V	FALSE	3299
+Sus scrofa functional	IGLV8-10	IGL	V	TRUE	3300
+Sus scrofa functional	IGLV8-13	IGL	V	TRUE	3301
+Sus scrofa non-functional	IGLV8-16	IGL	V	FALSE	3302
+Sus scrofa functional	IGLV8-18	IGL	V	TRUE	3303
+Sus scrofa functional	IGLV8-19	IGL	V	TRUE	3304
+Sus scrofa non-functional	IGLV8-21	IGL	V	FALSE	3305
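Editor's note: the wrapper scripts query this reference table at runtime — r_wrapper.sh later in this changeset greps it to decide whether the chosen species/locus combination has D-region genes. A minimal Python sketch of the same lookup, assuming only the tab-separated layout and the column names shown in the header above (the function name and the path argument are illustrative, not part of the tool):

    import csv

    def locus_has_d_region(genes_txt_path, species, locus):
        # True if any row for this species/locus describes a D-region gene,
        # e.g. the TRD rows with region == "D" above. Column names follow
        # the header of genes.txt; species matching is a substring test,
        # mirroring the shell's grep "$species.*${locus}D".
        with open(genes_txt_path, newline="") as handle:
            for row in csv.DictReader(handle, delimiter="\t"):
                if (species in row["Species"]
                        and row["locus"] == locus
                        and row["region"] == "D"):
                    return True
        return False
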
diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/jquery-1.11.0.min.js
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/report_clonality/jquery-1.11.0.min.js	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,4 @@
+/*! jQuery v1.11.0 | (c) 2005, 2014 jQuery Foundation, Inc. | jquery.org/license */
+!function(a,b){"object"==typeof module&&"object"==typeof module.exports?module.exports=a.document?b(a,!0):function(a){if(!a.document)throw new Error("jQuery requires a window with a document");return b(a)}:b(a)}("undefined"!=typeof window?window:this,function(a,b){var c=[],d=c.slice,e=c.concat,f=c.push,g=c.indexOf,h={},i=h.toString,j=h.hasOwnProperty,k="".trim,l={},m="1.11.0",n=function(a,b){return new n.fn.init(a,b)},o=/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g,p=/^-ms-/,q=/-([\da-z])/gi,r=function(a,b){return b.toUpperCase()};n.fn=n.prototype={jquery:m,constructor:n,selector:"",length:0,toArray:function(){return d.call(this)},get:function(a){return null!=a?0>a?this[a+this.length]:this[a]:d.call(this)},pushStack:function(a){var b=n.merge(this.constructor(),a);return b.prevObject=this,b.context=this.context,b},each:function(a,b){return n.each(this,a,b)},map:function(a){return this.pushStack(n.map(this,function(b,c){return a.call(b,c,b)}))},slice:function(){return this.pushStack(d.apply(this,arguments))},first:function(){return this.eq(0)},last:function(){return this.eq(-1)},eq:function(a){var b=this.length,c=+a+(0>a?b:0);return this.pushStack(c>=0&&b>c?[this[c]]:[])},end:function(){return this.prevObject||this.constructor(null)},push:f,sort:c.sort,splice:c.splice},n.extend=n.fn.extend=function(){var a,b,c,d,e,f,g=arguments[0]||{},h=1,i=arguments.length,j=!1;for("boolean"==typeof g&&(j=g,g=arguments[h]||{},h++),"object"==typeof g||n.isFunction(g)||(g={}),h===i&&(g=this,h--);i>h;h++)if(null!=(e=arguments[h]))for(d in e)a=g[d],c=e[d],g!==c&&(j&&c&&(n.isPlainObject(c)||(b=n.isArray(c)))?(b?(b=!1,f=a&&n.isArray(a)?a:[]):f=a&&n.isPlainObject(a)?a:{},g[d]=n.extend(j,f,c)):void 0!==c&&(g[d]=c));return g},n.extend({expando:"jQuery"+(m+Math.random()).replace(/\D/g,""),isReady:!0,error:function(a){throw new Error(a)},noop:function(){},isFunction:function(a){return"function"===n.type(a)},isArray:Array.isArray||function(a){return"array"===n.type(a)},isWindow:function(a){return null!=a&&a==a.window},isNumeric:function(a){return a-parseFloat(a)>=0},isEmptyObject:function(a){var b;for(b in a)return!1;return!0},isPlainObject:function(a){var b;if(!a||"object"!==n.type(a)||a.nodeType||n.isWindow(a))return!1;try{if(a.constructor&&!j.call(a,"constructor")&&!j.call(a.constructor.prototype,"isPrototypeOf"))return!1}catch(c){return!1}if(l.ownLast)for(b in a)return j.call(a,b);for(b in a);return void 0===b||j.call(a,b)},type:function(a){return null==a?a+"":"object"==typeof a||"function"==typeof a?h[i.call(a)]||"object":typeof a},globalEval:function(b){b&&n.trim(b)&&(a.execScript||function(b){a.eval.call(a,b)})(b)},camelCase:function(a){return a.replace(p,"ms-").replace(q,r)},nodeName:function(a,b){return a.nodeName&&a.nodeName.toLowerCase()===b.toLowerCase()},each:function(a,b,c){var d,e=0,f=a.length,g=s(a);if(c){if(g){for(;f>e;e++)if(d=b.apply(a[e],c),d===!1)break}else for(e in a)if(d=b.apply(a[e],c),d===!1)break}else if(g){for(;f>e;e++)if(d=b.call(a[e],e,a[e]),d===!1)break}else for(e in a)if(d=b.call(a[e],e,a[e]),d===!1)break;return a},trim:k&&!k.call("\ufeff\xa0")?function(a){return null==a?"":k.call(a)}:function(a){return null==a?"":(a+"").replace(o,"")},makeArray:function(a,b){var c=b||[];return null!=a&&(s(Object(a))?n.merge(c,"string"==typeof a?[a]:a):f.call(c,a)),c},inArray:function(a,b,c){var d;if(b){if(g)return g.call(b,a,c);for(d=b.length,c=c?0>c?Math.max(0,d+c):c:0;d>c;c++)if(c in b&&b[c]===a)return c}return-1},merge:function(a,b){var c=+b.length,d=0,e=a.length;while(c>d)a[e++]=b[d++];if(c!==c)while(void 0!==b[d])a[e++]=b[d++];return a.length=e,a},grep:function(a,b,c){for(var d,e=[],f=0,g=a.length,h=!c;g>f;f++)d=!b(a[f],f),d!==h&&e.push(a[f]);return e},map:function(a,b,c){var d,f=0,g=a.length,h=s(a),i=[];if(h)for(;g>f;f++)d=b(a[f],f,c),null!=d&&i.push(d);else for(f in a)d=b(a[f],f,c),null!=d&&i.push(d);return e.apply([],i)},guid:1,proxy:functio ... =!1&&(b.url+=(xc.test(b.url)?"&":"?")+b.jsonp+"="+e),b.converters["script json"]=function(){return g||n.error(e+" was not called"),g[0]},b.dataTypes[0]="json",f=a[e],a[e]=function(){g=arguments},d.always(function(){a[e]=f,b[e]&&(b.jsonpCallback=c.jsonpCallback,ad.push(e)),g&&n.isFunction(f)&&f(g[0]),g=f=void 0}),"script"):void 0}),n.parseHTML=function(a,b,c){if(!a||"string"!=typeof a)return null;"boolean"==typeof b&&(c=b,b=!1),b=b||z;var d=v.exec(a),e=!c&&[];return d?[b.createElement(d[1])]:(d=n.buildFragment([a],b,e),e&&e.length&&n(e).remove(),n.merge([],d.childNodes))};var cd=n.fn.load;n.fn.load=function(a,b,c){if("string"!=typeof a&&cd)return cd.apply(this,arguments);var d,e,f,g=this,h=a.indexOf(" ");return h>=0&&(d=a.slice(h,a.length),a=a.slice(0,h)),n.isFunction(b)?(c=b,b=void 0):b&&"object"==typeof b&&(f="POST"),g.length>0&&n.ajax({url:a,type:f,dataType:"html",data:b}).done(function(a){e=arguments,g.html(d?n("<div>").append(n.parseHTML(a)).find(d):a)}).complete(c&&function(a,b){g.each(c,e||[a.responseText,b,a])}),this},n.expr.filters.animated=function(a){return n.grep(n.timers,function(b){return a===b.elem}).length};var dd=a.document.documentElement;function ed(a){return n.isWindow(a)?a:9===a.nodeType?a.defaultView||a.parentWindow:!1}n.offset={setOffset:function(a,b,c){var d,e,f,g,h,i,j,k=n.css(a,"position"),l=n(a),m={};"static"===k&&(a.style.position="relative"),h=l.offset(),f=n.css(a,"top"),i=n.css(a,"left"),j=("absolute"===k||"fixed"===k)&&n.inArray("auto",[f,i])>-1,j?(d=l.position(),g=d.top,e=d.left):(g=parseFloat(f)||0,e=parseFloat(i)||0),n.isFunction(b)&&(b=b.call(a,c,h)),null!=b.top&&(m.top=b.top-h.top+g),null!=b.left&&(m.left=b.left-h.left+e),"using"in b?b.using.call(a,m):l.css(m)}},n.fn.extend({offset:function(a){if(arguments.length)return void 0===a?this:this.each(function(b){n.offset.setOffset(this,a,b)});var b,c,d={top:0,left:0},e=this[0],f=e&&e.ownerDocument;if(f)return b=f.documentElement,n.contains(b,e)?(typeof e.getBoundingClientRect!==L&&(d=e.getBoundingClientRect()),c=ed(f),{top:d.top+(c.pageYOffset||b.scrollTop)-(b.clientTop||0),left:d.left+(c.pageXOffset||b.scrollLeft)-(b.clientLeft||0)}):d},position:function(){if(this[0]){var a,b,c={top:0,left:0},d=this[0];return"fixed"===n.css(d,"position")?b=d.getBoundingClientRect():(a=this.offsetParent(),b=this.offset(),n.nodeName(a[0],"html")||(c=a.offset()),c.top+=n.css(a[0],"borderTopWidth",!0),c.left+=n.css(a[0],"borderLeftWidth",!0)),{top:b.top-c.top-n.css(d,"marginTop",!0),left:b.left-c.left-n.css(d,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var a=this.offsetParent||dd;while(a&&!n.nodeName(a,"html")&&"static"===n.css(a,"position"))a=a.offsetParent;return a||dd})}}),n.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(a,b){var c=/Y/.test(b);n.fn[a]=function(d){return W(this,function(a,d,e){var f=ed(a);return void 0===e?f?b in f?f[b]:f.document.documentElement[d]:a[d]:void(f?f.scrollTo(c?n(f).scrollLeft():e,c?e:n(f).scrollTop()):a[d]=e)},a,d,arguments.length,null)}}),n.each(["top","left"],function(a,b){n.cssHooks[b]=Mb(l.pixelPosition,function(a,c){return c?(c=Kb(a,b),Ib.test(c)?n(a).position()[b]+"px":c):void 0})}),n.each({Height:"height",Width:"width"},function(a,b){n.each({padding:"inner"+a,content:b,"":"outer"+a},function(c,d){n.fn[d]=function(d,e){var f=arguments.length&&(c||"boolean"!=typeof d),g=c||(d===!0||e===!0?"margin":"border");return W(this,function(b,c,d){var e;return n.isWindow(b)?b.document.documentElement["client"+a]:9===b.nodeType?(e=b.documentElement,Math.max(b.body["scroll"+a],e["scroll"+a],b.body["offset"+a],e["offset"+a],e["client"+a])):void 0===d?n.css(b,c,g):n.style(b,c,d,g)},b,f?d:void 0,f,null)}})}),n.fn.size=function(){return this.length},n.fn.andSelf=n.fn.addBack,"function"==typeof define&&define.amd&&define("jquery",[],function(){return n});var fd=a.jQuery,gd=a.$;return n.noConflict=function(b){return a.$===n&&(a.$=gd),b&&a.jQuery===n&&(a.jQuery=fd),n},typeof b===L&&(a.jQuery=a.$=n),n});
diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/pure-min.css
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/report_clonality/pure-min.css	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,11 @@
+/*!
+Pure v0.6.0
+Copyright 2014 Yahoo! Inc. All rights reserved.
+Licensed under the BSD License.
+https://github.com/yahoo/pure/blob/master/LICENSE.md
+*/
+/*!
+normalize.css v^3.0 | MIT License | git.io/normalize
+Copyright (c) Nicolas Gallagher and Jonathan Neal
+*/
+/*! normalize.css v3.0.2 | MIT License | git.io/normalize */html{font-family:sans-serif;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%}body{margin:0}article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section,summary{display:block}audio,canvas,progress,video{display:inline-block;vertical-align:baseline}audio:not([controls]){display:none;height:0}[hidden],template{display:none}a{background-color:transparent}a:active,a:hover{outline:0}abbr[title]{border-bottom:1px dotted}b,strong{font-weight:700}dfn{font-style:italic}h1{font-size:2em;margin:.67em 0}mark{background:#ff0;color:#000}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-.5em}sub{bottom:-.25em}img{border:0}svg:not(:root){overflow:hidden}figure{margin:1em 40px}hr{-moz-box-sizing:content-box;box-sizing:content-box;height:0}pre{overflow:auto}code,kbd,pre,samp{font-family:monospace,monospace;font-size:1em}button,input,optgroup,select,textarea{color:inherit;font:inherit;margin:0}button{overflow:visible}button,select{text-transform:none}button,html input[type=button],input[type=reset],input[type=submit]{-webkit-appearance:button;cursor:pointer}button[disabled],html input[disabled]{cursor:default}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}input{line-height:normal}input[type=checkbox],input[type=radio]{box-sizing:border-box;padding:0}input[type=number]::-webkit-inner-spin-button,input[type=number]::-webkit-outer-spin-button{height:auto}input[type=search]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration{-webkit-appearance:none}fieldset{border:1px solid silver;margin:0 2px;padding:.35em .625em .75em}legend{border:0;padding:0}textarea{overflow:auto}optgroup{font-weight:700}table{border-collapse:collapse;border-spacing:0}td,th{padding:0}.hidden,[hidden]{display:none!important}.pure-img{max-width:100%;height:auto;display:block}.pure-g{letter-spacing:-.31em;*letter-spacing:normal;*word-spacing:-.43em;text-rendering:optimizespeed;font-family:FreeSans,Arimo,"Droid Sans",Helvetica,Arial,sans-serif;display:-webkit-flex;-webkit-flex-flow:row wrap;display:-ms-flexbox;-ms-flex-flow:row wrap;-ms-align-content:flex-start;-webkit-align-content:flex-start;align-content:flex-start}.opera-only :-o-prefocus,.pure-g{word-spacing:-.43em}.pure-u{display:inline-block;*display:inline;zoom:1;letter-spacing:normal;word-spacing:normal;vertical-align:top;text-rendering:auto}.pure-g [class*="pure-u"]{font-family:sans-serif}.pure-u-1,.pure-u-1-1,.pure-u-1-2,.pure-u-1-3,.pure-u-2-3,.pure-u-1-4,.pure-u-3-4,.pure-u-1-5,.pure-u-2-5,.pure-u-3-5,.pure-u-4-5,.pure-u-5-5,.pure-u-1-6,.pure-u-5-6,.pure-u-1-8,.pure-u-3-8,.pure-u-5-8,.pure-u-7-8,.pure-u-1-12,.pure-u-5-12,.pure-u-7-12,.pure-u-11-12,.pure-u-1-24,.pure-u-2-24,.pure-u-3-24,.pure-u-4-24,.pure-u-5-24,.pure-u-6-24,.pure-u-7-24,.pure-u-8-24,.pure-u-9-24,.pure-u-10-24,.pure-u-11-24,.pure-u-12-24,.pure-u-13-24,.pure-u-14-24,.pure-u-15-24,.pure-u-16-24,.pure-u-17-24,.pure-u-18-24,.pure-u-19-24,.pure-u-20-24,.pure-u-21-24,.pure-u-22-24,.pure-u-23-24,.pure-u-24-24{display:inline-block;*display:inline;zoom:1;letter-spacing:normal;word-spacing:normal;vertical-align:top;text-rendering:auto}.pure-u-1-24{width:4.1667%;*width:4.1357%}.pure-u-1-12,.pure-u-2-24{width:8.3333%;*width:8.3023%}.pure-u-1-8,.pure-u-3-24{width:12.5%;*width:12.469%}.pure-u-1-6,.pure-u-4-24{width:16.6667%;*width:16.6357%}.pure-u-1-5{width:20%;*width:19.969%}.pure-u-5-24{width:20.8333%;*width:20.8023%}.pure-u-1-4,.pure-u-6-24{width:25%;*width:24.969 ... roup input:not([type]),.pure-group input[type=text],.pure-group input[type=password],.pure-group input[type=email],.pure-group input[type=url],.pure-group input[type=date],.pure-group input[type=month],.pure-group input[type=time],.pure-group input[type=datetime],.pure-group input[type=datetime-local],.pure-group input[type=week],.pure-group input[type=number],.pure-group input[type=search],.pure-group input[type=tel],.pure-group input[type=color]{margin-bottom:0}.pure-form-aligned .pure-control-group label{margin-bottom:.3em;text-align:left;display:block;width:100%}.pure-form-aligned .pure-controls{margin:1.5em 0 0}.pure-form .pure-help-inline,.pure-form-message-inline,.pure-form-message{display:block;font-size:.75em;padding:.2em 0 .8em}}.pure-menu{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.pure-menu-fixed{position:fixed;left:0;top:0;z-index:3}.pure-menu-list,.pure-menu-item{position:relative}.pure-menu-list{list-style:none;margin:0;padding:0}.pure-menu-item{padding:0;margin:0;height:100%}.pure-menu-link,.pure-menu-heading{display:block;text-decoration:none;white-space:nowrap}.pure-menu-horizontal{width:100%;white-space:nowrap}.pure-menu-horizontal .pure-menu-list{display:inline-block}.pure-menu-horizontal .pure-menu-item,.pure-menu-horizontal .pure-menu-heading,.pure-menu-horizontal .pure-menu-separator{display:inline-block;*display:inline;zoom:1;vertical-align:middle}.pure-menu-item .pure-menu-item{display:block}.pure-menu-children{display:none;position:absolute;left:100%;top:0;margin:0;padding:0;z-index:3}.pure-menu-horizontal .pure-menu-children{left:0;top:auto;width:inherit}.pure-menu-allow-hover:hover>.pure-menu-children,.pure-menu-active>.pure-menu-children{display:block;position:absolute}.pure-menu-has-children>.pure-menu-link:after{padding-left:.5em;content:"\25B8";font-size:small}.pure-menu-horizontal .pure-menu-has-children>.pure-menu-link:after{content:"\25BE"}.pure-menu-scrollable{overflow-y:scroll;overflow-x:hidden}.pure-menu-scrollable .pure-menu-list{display:block}.pure-menu-horizontal.pure-menu-scrollable .pure-menu-list{display:inline-block}.pure-menu-horizontal.pure-menu-scrollable{white-space:nowrap;overflow-y:hidden;overflow-x:auto;-ms-overflow-style:none;-webkit-overflow-scrolling:touch;padding:.5em 0}.pure-menu-horizontal.pure-menu-scrollable::-webkit-scrollbar{display:none}.pure-menu-separator{background-color:#ccc;height:1px;margin:.3em 0}.pure-menu-horizontal .pure-menu-separator{width:1px;height:1.3em;margin:0 .3em}.pure-menu-heading{text-transform:uppercase;color:#565d64}.pure-menu-link{color:#777}.pure-menu-children{background-color:#fff}.pure-menu-link,.pure-menu-disabled,.pure-menu-heading{padding:.5em 1em}.pure-menu-disabled{opacity:.5}.pure-menu-disabled .pure-menu-link:hover{background-color:transparent}.pure-menu-active>.pure-menu-link,.pure-menu-link:hover,.pure-menu-link:focus{background-color:#eee}.pure-menu-selected .pure-menu-link,.pure-menu-selected .pure-menu-link:visited{color:#000}.pure-table{border-collapse:collapse;border-spacing:0;empty-cells:show;border:1px solid #cbcbcb}.pure-table caption{color:#000;font:italic 85%/1 arial,sans-serif;padding:1em 0;text-align:center}.pure-table td,.pure-table th{border-left:1px solid #cbcbcb;border-width:0 0 0 1px;font-size:inherit;margin:0;overflow:visible;padding:.5em 1em}.pure-table td:first-child,.pure-table th:first-child{border-left-width:0}.pure-table thead{background-color:#e0e0e0;color:#000;text-align:left;vertical-align:bottom}.pure-table td{background-color:transparent}.pure-table-odd td{background-color:#f2f2f2}.pure-table-striped tr:nth-child(2n-1) td{background-color:#f2f2f2}.pure-table-bordered td{border-bottom:1px solid #cbcbcb}.pure-table-bordered tbody>tr:last-child>td{border-bottom-width:0}.pure-table-horizontal td,.pure-table-horizontal th{border-width:0 0 1px;border-bottom:1px solid #cbcbcb}.pure-table-horizontal tbody>tr:last-child>td{border-bottom-width:0}
\ No newline at end of file
diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/r_wrapper.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/report_clonality/r_wrapper.sh	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,295 @@
+#!/bin/bash
+
+inputFile=$1
+outputDir=$3
+outputFile=$3/index.html #$2
+clonalType=$4
+species=$5
+locus=$6
+filterproductive=$7
+clonality_method=$8
+
+dir="$(cd "$(dirname "$0")" && pwd)"
+useD="false"
+if grep -q "$species.*${locus}D" "$dir/genes.txt" ; then
+	echo "species D region in reference db"
+	useD="true"
+fi
+echo "$species"
+if [[ "$species" == *"custom"* ]] ; then
+	loci=(${locus//;/ })
+	useD="true"
+	echo "${loci[@]}"
+	if [[ "${#loci[@]}" -eq "2" ]] ; then
+		useD="false"
+	fi
+fi
+mkdir $3
+cp $dir/genes.txt $outputDir
+Rscript --verbose $dir/RScript.r $inputFile $outputDir $outputDir $clonalType "$species" "$locus" $filterproductive ${clonality_method} 2>&1
+cp $dir/tabber.js $outputDir
+cp $dir/style.css $outputDir
+cp $dir/script.js $outputDir
+cp $dir/jquery-1.11.0.min.js $outputDir
+cp $dir/pure-min.css $outputDir
+samples=`cat $outputDir/samples.txt`
+
+echo "<html><center><h1><a href='index.html'>Click here for the results</a></h1>Tip: Open it in a new tab (middle mouse button or right mouse button -> 'open in new tab' on the link above)<br />" > $2
+echo "<table border = 1>" >> $2
+echo "<thead><tr><th>Sample/Replicate</th><th>All</th><th>Productive</th><th>Unique Productive</th><th>Unproductive</th><th>Unique Unproductive</th></tr></thead>" >> $2
+while IFS=, read sample all productive perc_prod productive_unique perc_prod_un unproductive perc_unprod unproductive_unique perc_unprod_un
+	do
+		echo "<tr><td>$sample</td>" >> $2
+		echo "<td>$all</td>" >> $2
+		echo "<td>$productive (${perc_prod}%)</td>" >> $2
+		echo "<td>$productive_unique (${perc_prod_un}%)</td>" >> $2
+		echo "<td>$unproductive (${perc_unprod}%)</td>" >> $2
+		echo "<td>$unproductive_unique (${perc_unprod_un}%)</td></tr>" >> $2
+done < $outputDir/productive_counting.txt
+echo "</table border></center></html>" >> $2
+
+echo "<html><head><title>Report on:" >> $outputFile
+
+mkdir $outputDir/circos
+cp $dir/circos/* $outputDir/circos/
+#CIRCOSTOOLS="/data/galaxy/galaxy-dist/toolsheddependencies/circos/0.64/saskia-hiltemann/cg_circos_plots/bbfdd52d64fd/circos-tools-0.21/tools"
+#CIRCOSDIR="/data/galaxy/galaxy-dist/toolsheddependencies/circos/0.64/saskia-hiltemann/cg_circos_plots/bbfdd52d64fd/bin/"
+
+#CIRCOSTOOLS="/home/galaxy/circos/circos-tools-0.22/tools"
+#CIRCOSDIR="/home/galaxy/Anaconda3/bin"
+
+USECIRCOS="no"
+if [ -d "$CIRCOSDIR" ]; then
+	USECIRCOS="yes"
+else
+	if [ -d "/home/galaxy/Anaconda3/bin" ]; then #hopefully temporary fix
+		USECIRCOS="yes"
+		CIRCOSTOOLS="/data/galaxy/galaxy-dist/toolsheddependencies/circos/0.64/saskia-hiltemann/cg_circos_plots/bbfdd52d64fd/circos-tools-0.21/tools"
+		CIRCOSDIR="/data/galaxy/galaxy-dist/toolsheddependencies/circos/0.64/saskia-hiltemann/cg_circos_plots/bbfdd52d64fd/bin/"
+	fi
+fi
+echo "Using Circos: $USECIRCOS"
+sed -i "s%DATA_DIR%$outputDir/circos%" $outputDir/circos/circos.conf
+for sample in $samples; do #output the samples to a file and create the circos plots with the R script output
+	echo " $sample" >> $outputFile
+	
+	if [[ "$USECIRCOS" != "yes" ]]; then
+		continue
+	fi
+	
+	circos_file="$outputDir/${sample}_VJ_circos.txt"
+	echo -e -n "labels$(cat ${circos_file})" > ${circos_file}
+	cat "${circos_file}" | $CIRCOSTOOLS/tableviewer/bin/parse-table -configfile $dir/circos/parse-table.conf 2>&1 | $CIRCOSTOOLS/tableviewer/bin/make-conf -dir $outputDir/circos/
+	$CIRCOSDIR/circos -conf $outputDir/circos/circos.conf 2>&1
+	mv $outputDir/circos/circos.png $outputDir/circosVJ_${sample}.png
+	
+	
+	if [[ "$useD" == "true" ]] ; then
+		circos_file="$outputDir/${sample}_VD_circos.txt"
+		echo -e -n "labels$(cat ${circos_file})" > ${circos_file}
+		cat "${circos_file}" | $CIRCOSTOOLS/tableviewer/bin/parse-table -configfile $dir/circos/parse-table.conf 2>&1 | $CIRCOSTOOLS/tableviewer/bin/make-conf -dir $outputDir/circos/
+		$CIRCOSDIR/circos -conf $outputDir/circos/circos.conf 2>&1
+		mv $outputDir/circos/circos.png $outputDir/circosVD_${sample}.png
+		
+		circos_file
... pure-table-striped' id='junction_table'> <caption>Unproductive median</caption><thead><tr><th>Sample</th><th>count</th><th>V.DEL</th><th>P1</th><th>N1</th><th>P2</th><th>DEL.D</th><th>D.DEL</th><th>P3</th><th>N2</th><th>P4</th><th>DEL.J</th><th>Total.Del</th><th>Total.N</th><th>Total.P</th><th>Median.CDR3</th><thead></tr><tbody>" >> $outputFile
+	while IFS=, read Sample unique VDEL P1 N1 P2 DELD DDEL P3 N2 P4 DELJ TotalDel TotalN TotalP median
+	do
+		echo "<tr><td>$Sample</td><td>$unique</td><td>$VDEL</td><td>$P1</td><td>$N1</td><td>$P2</td><td>$DELD</td><td>$DDEL</td><td>$P3</td><td>$N2</td><td>$P4</td><td>$DELJ</td><td>$TotalDel</td><td>$TotalN</td><td>$TotalP</td><td>$median</td></tr>" >> $outputFile
+	done < $outputDir/junctionAnalysisUnProd_median.csv
+	echo "</tbody></table>" >> $outputFile
+	
+	echo "</div>" >> $outputFile
+fi
+
+echo "<div class='tabbertab' title='Comparison'><table class='pure-table pure-table-striped'><thead><tr><th>ID</th><th>Include</th></tr></thead>" >> $outputFile
+for sample in $samples; do
+	echo "<tr><td>$sample</td><td><input type='checkbox' onchange=\"javascript:compareAdd('$sample')\" id='compare_checkbox_$sample'/></td></tr>" >> $outputFile
+done
+echo "</table><div name='comparisonarea'>" >> $outputFile
+echo "<table><tr id='comparison_table_vd'></tr></table>" >> $outputFile
+echo "<table><tr id='comparison_table_vj'></tr></table>" >> $outputFile
+echo "<table><tr id='comparison_table_dj'></tr></table>" >> $outputFile
+echo "</div></div>" >> $outputFile
+
+echo "<div class='tabbertab' title='Downloads'>" >> $outputFile
+echo "<table class='pure-table pure-table-striped'>" >> $outputFile
+echo "<thead><tr><th>Description</th><th>Link</th></tr></thead>" >> $outputFile
+echo "<tr><td>The dataset used to generate the frequency graphs and the heatmaps (Unique based on clonaltype, $clonalType)</td><td><a href='allUnique.csv'>Download</a></td></tr>" >> $outputFile
+echo "<tr><td>The dataset used to calculate clonality score (Unique based on clonaltype, $clonalType)</td><td><a href='clonalityComplete.csv'>Download</a></td></tr>" >> $outputFile
+
+echo "<tr><td>The dataset used to generate the CDR3 length frequency graph</td><td><a href='CDR3LengthPlot.csv'>Download</a></td></tr>" >> $outputFile
+
+echo "<tr><td>The dataset used to generate the V gene family frequency graph</td><td><a href='VFFrequency.csv'>Download</a></td></tr>" >> $outputFile
+if [[ "$useD" == "true" ]] ; then
+	echo "<tr><td>The dataset used to generate the D gene family frequency graph</td><td><a href='DFFrequency.csv'>Download</a></td></tr>" >> $outputFile
+fi
+
+echo "<tr><td>The dataset used to generate the V gene frequency graph</td><td><a href='VFrequency.csv'>Download</a></td></tr>" >> $outputFile
+if [[ "$useD" == "true" ]] ; then
+	echo "<tr><td>The dataset used to generate the D gene frequency graph</td><td><a href='DFrequency.csv'>Download</a></td></tr>" >> $outputFile
+fi
+echo "<tr><td>The dataset used to generate the J gene frequency graph</td><td><a href='JFrequency.csv'>Download</a></td></tr>" >> $outputFile
+echo "<tr><td>The dataset used to generate the AA composition graph</td><td><a href='AAComposition.csv'>Download</a></td></tr>" >> $outputFile
+
+for sample in $samples; do
+	if [[ "$useD" == "true" ]] ; then
+		echo "<tr><td>The data used to generate the VD heatmap for $sample.</td><td><a href='HeatmapVD_$sample.csv'>Download</a></td></tr>" >> $outputFile
+	fi
+	echo "<tr><td>The data used to generate the VJ heatmap for $sample.</td><td><a href='HeatmapVJ_$sample.csv'>Download</a></td></tr>" >> $outputFile
+	if [[ "$useD" == "true" ]] ; then
+		echo "<tr><td>The data used to generate the DJ heatmap for $sample.</td><td><a href='HeatmapDJ_$sample.csv'>Download</a></td></tr>" >> $outputFile
+	fi
+done
+
+echo "<tr><td>A frequency count of V Gene + J Gene + CDR3</td><td><a href='VJCDR3_count.txt'>Download</a></td></tr>" >> $outputFile
+
+echo "</table>" >> $outputFile
+echo "</div></html>" >> $outputFile
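Editor's note: the wrapper's first loop turns $outputDir/productive_counting.txt (one comma-separated row per sample/replicate) into the HTML summary table. A hedged Python equivalent of that one step, assuming the ten-field row layout implied by the `while IFS=, read ...` line above (field names mirror the shell variables; the helper itself is illustrative):

    import csv

    # Columns as consumed by the wrapper's read loop.
    FIELDS = ["sample", "all", "productive", "perc_prod", "productive_unique",
              "perc_prod_un", "unproductive", "perc_unprod",
              "unproductive_unique", "perc_unprod_un"]

    def counting_rows_to_html(path):
        # Yield one HTML table row per sample, mirroring the echo statements.
        with open(path, newline="") as handle:
            for rec in csv.DictReader(handle, fieldnames=FIELDS):
                yield ("<tr><td>{sample}</td><td>{all}</td>"
                       "<td>{productive} ({perc_prod}%)</td>"
                       "<td>{productive_unique} ({perc_prod_un}%)</td>"
                       "<td>{unproductive} ({perc_unprod}%)</td>"
                       "<td>{unproductive_unique} ({perc_unprod_un}%)</td></tr>"
                       ).format(**rec)
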
diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/script.js
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/report_clonality/script.js	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,36 @@
+function compareAdd(id){
+ var img = document.createElement('img');
+ img.setAttribute('src', 'HeatmapVD_' + id + '.png');
+ var td = document.createElement('td');
+ td.setAttribute('id', "comparison_vd_" + id);
+ td.appendChild(img)
+ $('#comparison_table_vd').append(td);
+
+ img = document.createElement('img');
+ img.setAttribute('src', 'HeatmapVJ_' + id + '.png');
+ td = document.createElement('td');
+ td.setAttribute('id', "comparison_vj_" + id);
+ td.appendChild(img)
+ $('#comparison_table_vj').append(td);
+
+ img = document.createElement('img');
+ img.setAttribute('src', 'HeatmapDJ_' + id + '.png');
+ td = document.createElement('td');
+ td.setAttribute('id', "comparison_dj_" + id);
+ td.appendChild(img)
+ $('#comparison_table_dj').append(td);
+
+ $('#compare_checkbox_' + id).attr('onchange', "javascript:compareRemove('" + id + "')");
+}
+
+
+function compareRemove(id){
+ $("#comparison_vd_" + id).remove()
+ $("#comparison_vj_" + id).remove()
+ $("#comparison_dj_" + id).remove()
+ $("#compare_checkbox_" + id).attr('onchange', "javascript:compareAdd('" + id + "')");
+}
+
+$( document ).ready(function () {
+ $('#junction_table').tablesorter();
+})
diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/style.css
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/report_clonality/style.css	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,109 @@
+/* $Id: example.css,v 1.5 2006/03/27 02:44:36 pat Exp $ */
+
+/*--------------------------------------------------
+ REQUIRED to hide the non-active tab content.
+ But do not hide them in the print stylesheet!
+ --------------------------------------------------*/
+.tabberlive .tabbertabhide {
+ display:none;
+}
+
+/*--------------------------------------------------
+ .tabber = before the tabber interface is set up
+ .tabberlive = after the tabber interface is set up
+ --------------------------------------------------*/
+.tabber {
+}
+.tabberlive {
+ margin-top:1em;
+}
+
+/*--------------------------------------------------
+ ul.tabbernav = the tab navigation list
+ li.tabberactive = the active tab
+ --------------------------------------------------*/
+ul.tabbernav
+{
+ margin:0;
+ padding: 3px 0;
+ border-bottom: 1px solid #778;
+ font: bold 12px Verdana, sans-serif;
+}
+
+ul.tabbernav li
+{
+ list-style: none;
+ margin: 0;
+ display: inline;
+}
+
+ul.tabbernav li a
+{
+ padding: 3px 0.5em;
+ margin-left: 3px;
+ border: 1px solid #778;
+ border-bottom: none;
+ background: #DDE;
+ text-decoration: none;
+}
+
+ul.tabbernav li a:link { color: #448; }
+ul.tabbernav li a:visited { color: #667; }
+
+ul.tabbernav li a:hover
+{
+ color: #000;
+ background: #AAE;
+ border-color: #227;
+}
+
+ul.tabbernav li.tabberactive a
+{
+ background-color: #fff;
+ border-bottom: 1px solid #fff;
+}
+
+ul.tabbernav li.tabberactive a:hover
+{
+ color: #000;
+ background: white;
+ border-bottom: 1px solid white;
+}
+
+/*--------------------------------------------------
+ .tabbertab = the tab content
+ Add style only after the tabber interface is set up (.tabberlive)
+ --------------------------------------------------*/
+.tabberlive .tabbertab {
+ padding:5px;
+ border:1px solid #aaa;
+ border-top:0;
+
+ /* If you don't want the tab size changing whenever a tab is changed
+    you can set a fixed height */
+
+ /* height:200px; */
+
+ /* If you set a fix height set overflow to auto and you will get a
+    scrollbar when necessary */
+
+ /* overflow:auto; */
+}
+
+/* If desired, hide the heading since a heading is provided by the tab */
+.tabberlive .tabbertab h2 {
+ display:none;
+}
+.tabberlive .tabbertab h3 {
+ display:none;
+}
+
+/* Example of using an ID to set different styles for the tabs on the page */
+.tabberlive#tab1 {
+}
+.tabberlive#tab2 {
+}
+.tabberlive#tab2 .tabbertab {
+ height:200px;
+ overflow:auto;
+}
diff -r 5ffd52fc35c4 -r bcec7bb4e089 report_clonality/tabber.js
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/report_clonality/tabber.js	Mon Dec 12 05:22:57 2016 -0500
@@ -0,0 +1,40 @@
+/* Copyright (c) 2006 Patrick Fitzgerald */
+
+function tabberObj(argsObj)
+{var arg;this.div=null;this.classMain="tabber";this.classMainLive="tabberlive";this.classTab="tabbertab";this.classTabDefault="tabbertabdefault";this.classNav="tabbernav";this.classTabHide="tabbertabhide";this.classNavActive="tabberactive";this.titleElements=['h2','h3','h4','h5','h6'];this.titleElementsStripHTML=true;this.removeTitle=true;this.addLinkId=false;this.linkIdFormat='<tabberid>nav<tabnumberone>';for(arg in argsObj){this[arg]=argsObj[arg];}
+this.REclassMain=new RegExp('\\b'+this.classMain+'\\b','gi');this.REclassMainLive=new RegExp('\\b'+this.classMainLive+'\\b','gi');this.REclassTab=new RegExp('\\b'+this.classTab+'\\b','gi');this.REclassTabDefault=new RegExp('\\b'+this.classTabDefault+'\\b','gi');this.REclassTabHide=new RegExp('\\b'+this.classTabHide+'\\b','gi');this.tabs=new Array();if(this.div){this.init(this.div);this.div=null;}}
+tabberObj.prototype.init=function(e)
+{var
+childNodes,i,i2,t,defaultTab=0,DOM_ul,DOM_li,DOM_a,aId,headingElement;if(!document.getElementsByTagName){return false;}
+if(e.id){this.id=e.id;}
+this.tabs.length=0;childNodes=e.childNodes;for(i=0;i<childNodes.length;i++){if(childNodes[i].className&&childNodes[i].className.match(this.REclassTab)){t=new Object();t.div=childNodes[i];this.tabs[this.tabs.length]=t;if(childNodes[i].className.match(this.REclassTabDefault)){defaultTab=this.tabs.length-1;}}}
+DOM_ul=document.createElement("ul");DOM_ul.className=this.classNav;for(i=0;i<this.tabs.length;i++){t=this.tabs[i];t.headingText=t.div.title;if(this.removeTitle){t.div.title='';}
+if(!t.headingText){for(i2=0;i2<this.titleElements.length;i2++){headingElement=t.div.getElementsByTagName(this.titleElements[i2])[0];if(headingElement){t.headingText=headingElement.innerHTML;if(this.titleElementsStripHTML){t.headingText.replace(/<br>/gi," ");t.headingText=t.headingText.replace(/<[^>]+>/g,"");}
+break;}}}
+if(!t.headingText){t.headingText=i+1;}
+DOM_li=document.createElement("li");t.li=DOM_li;DOM_a=document.createElement("a");DOM_a.appendChild(document.createTextNode(t.headingText));DOM_a.href="javascript:void(null);";DOM_a.title=t.headingText;DOM_a.onclick=this.navClick;DOM_a.tabber=this;DOM_a.tabberIndex=i;if(this.addLinkId&&this.linkIdFormat){aId=this.linkIdFormat;aId=aId.replace(/<tabberid>/gi,this.id);aId=aId.replace(/<tabnumberzero>/gi,i);aId=aId.replace(/<tabnumberone>/gi,i+1);aId=aId.replace(/<tabtitle>/gi,t.headingText.replace(/[^a-zA-Z0-9\-]/gi,''));DOM_a.id=aId;}
+DOM_li.appendChild(DOM_a);DOM_ul.appendChild(DOM_li);}
+e.insertBefore(DOM_ul,e.firstChild);e.className=e.className.replace(this.REclassMain,this.classMainLive);this.tabShow(defaultTab);if(typeof this.onLoad=='function'){this.onLoad({tabber:this});}
+return this;};tabberObj.prototype.navClick=function(event)
+{var
+rVal,a,self,tabberIndex,onClickArgs;a=this;if(!a.tabber){return false;}
+self=a.tabber;tabberIndex=a.tabberIndex;a.blur();if(typeof self.onClick=='function'){onClickArgs={'tabber':self,'index':tabberIndex,'event':event};if(!event){onClickArgs.event=window.event;}
+rVal=self.onClick(onClickArgs);if(rVal===false){return false;}}
+self.tabShow(tabberIndex);return false;};tabberObj.prototype.tabHideAll=function()
+{var i;for(i=0;i<this.tabs.length;i++){this.tabHide(i);}};tabberObj.prototype.tabHide=function(tabberIndex)
+{var div;if(!this.tabs[tabberIndex]){return false;}
+div=this.tabs[tabberIndex].div;if(!div.className.match(this.REclassTabHide)){div.className+=' '+this.classTabHide;}
+this.navClearActive(tabberIndex);return this;};tabberObj.prototype.tabShow=function(tabberIndex)
+{var div;if(!this.tabs[tabberIndex]){return false;}
+this.tabHideAll();div=this.tabs[tabberIndex].div;div.className=div.className.replace(this.REclassTabHide,'');this.navSetActive(tabberIndex);if(typeof this.onTabDisplay=='function'){this.onTabDisplay({'tabber':this,'index':tabberIndex});}
+return this;};tabberObj.prototype.navSetActive=function(tabberIndex)
+{this.tabs[tabberIndex].li.className=this.classNavActive;return this;};tabberObj.prototype.navClearActive=function(tabberIndex)
+{this.tabs[tabberIndex].li.className='';return this;};function tabberAutomatic(tabberArgs)
+{var
+tempObj,divs,i;if(!tabberArgs){tabberArgs={};}
+tempObj=new tabberObj(tabberArgs);divs=document.getElementsByTagName("div");for(i=0;i<divs.length;i++){if(divs[i].className&&divs[i].className.match(tempObj.REclassMain)){tabberArgs.div=divs[i];divs[i].tabber=new tabberObj(tabberArgs);}}
+return this;}
+function tabberAutomaticOnLoad(tabberArgs)
+{var oldOnLoad;if(!tabberArgs){tabberArgs={};}
+oldOnLoad=window.onload;if(typeof window.onload!='function'){window.onload=function(){tabberAutomatic(tabberArgs);};}else{window.onload=function(){oldOnLoad();tabberAutomatic(tabberArgs);};}}
+if(typeof tabberOptions=='undefined'){tabberAutomaticOnLoad();}else{if(!tabberOptions['manualStartup']){tabberAutomaticOnLoad(tabberOptions);}}
\ No newline at end of file
diff -r 5ffd52fc35c4 -r bcec7bb4e089 sequence_overview.r
--- a/sequence_overview.r	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,340 +0,0 @@
-library(reshape2)
-
-args <- commandArgs(trailingOnly = TRUE)
-
-before.unique.file = args[1]
-merged.file = args[2]
-outputdir = args[3]
-gene.classes = unlist(strsplit(args[4], ","))
-hotspot.analysis.sum.file = args[5]
-NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")
-NTsum.file = paste(outputdir, "ntsum.txt", sep="/")
-main.html = "index.html"
-empty.region.filter = args[6]
-
-
-setwd(outputdir)
-
-before.unique = read.table(before.unique.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
-merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
-hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="")
-
-#before.unique = before.unique[!grepl("unmatched", before.unique$best_match),]
-
-if(empty.region.filter == "leader"){
-	before.unique$seq_conc = paste(before.unique$FR1.IMGT.seq, before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq)
-} else if(empty.region.filter == "FR1"){
-	before.unique$seq_conc = paste(before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq)
-} else if(empty.region.filter == "CDR1"){
-	before.unique$seq_conc = paste(before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq)
-} else if(empty.region.filter == "FR2"){
-	before.unique$seq_conc = paste(before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq)
-}
-
-IDs = before.unique[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")]
-IDs$best_match = as.character(IDs$best_match)
-
-dat = data.frame(table(before.unique$seq_conc))
-
-names(dat) = c("seq_conc", "Freq")
-
-dat$seq_conc = factor(dat$seq_conc)
-
-dat = dat[order(as.character(dat$seq_conc)),]
-
-#writing html from R...
-get.bg.color = function(val){
-	if(val %in% c("TRUE", "FALSE", "T", "F")){ #if its a logical value, give the background a green/red color
-		return(ifelse(val,"#eafaf1","#f9ebea"))
-	} else if (!is.na(as.numeric(val))) { #if its a numerical value, give it a grey tint if its >0
-		return(ifelse(val > 0,"#eaecee","white"))
-	} else {
-		return("white")
-	}
-}
-td = function(val) {
- return(paste("<td bgcolor='", get.bg.color(val), "'>", val, "</td>", sep=""))
-}
-tr = function(val) { 
-	return(paste(c("<tr>", sapply(val, td), "</tr>"), collapse="")) 
-}
-
-make.link = function(id, clss, val) { 
-	paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="") 
-}
-tbl = function(df) {
-	res = "<table border='1'>"
-	for(i in 1:nrow(df)){ 
-		res = paste(res, tr(df[i,]), sep="")
-	}
-	res = paste(res, "</table>")
-}
-
-cat("<table border='1' class='pure-table pure-table-striped'>", file=main.html, append=F)
-
-if(empty.region.filter == "leader"){
-	cat("<caption>FR1+CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
-} else if(empty.region.filter == "FR1"){
-	cat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
-} else if(empty.region.filter == "CDR1"){
-	cat("<caption>FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
-} else if(empty.region.filter == "FR2"){
-	cat("<caption>CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
-}
-
-cat("<tr>", file=main.html, append=T)
-cat("<th>Sequence</th><th>Functionality</th><th>ca1</th><th>ca2</th><th>cg1</th><th>cg2</th><th>cg3</th><th>cg4</th><th>cm</th><th>un</th>", file=main.html, append=T)
-cat("<th>total CA</th><th>total CG</th><th>number of subclasses</th><th>present in both Ca and Cg</th><th>Ca1+Ca2</th>", file=main.html, append=T)
-cat("<th>Cg1+Cg2</th><th>Cg1+Cg3</th><th>Cg1+Cg4</th><th>Cg2+Cg3</th><th>Cg2+Cg4</th><th>Cg3+Cg4</th>", file=main.html, append=T)
-cat("<th>Cg1+Cg2+Cg3</th><th>Cg2+Cg3+Cg4</th><th>Cg1+Cg2+Cg4</th><th>Cg1+Cg3+Cg4</th><th>Cg1+Cg2
...cg4.n > 0)
-	
-	in.cg1.cg2.cg3 = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0)
-	in.cg2.cg3.cg4 = (cg2.n > 0 & cg3.n > 0 & cg4.n > 0)
-	in.cg1.cg2.cg4 = (cg1.n > 0 & cg2.n > 0 & cg4.n > 0)
-	in.cg1.cg3.cg4 = (cg1.n > 0 & cg3.n > 0 & cg4.n > 0)
-	
-	in.cg.all = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0 & cg4.n > 0)
-	
-	
-	
-	
-	#rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html)
-	rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html)
-	rw = c(rw, ca.n, cg.n, in.classes, in.ca.cg, in.ca1.ca2, in.cg1.cg2, in.cg1.cg3, in.cg1.cg4, in.cg2.cg3, in.cg2.cg4, in.cg3.cg4, in.cg1.cg2.cg3, in.cg2.cg3.cg4, in.cg1.cg2.cg4, in.cg1.cg3.cg4, in.cg.all)
-
-	cat(tr(rw), file=main.html, append=T)
-	
-	
-	for(i in 1:nrow(allc)){ #generate html by id
-		html = make.link(id, allc[i,"best_match"], allc[i,"Sequence.ID"])
-		cat(paste(html, "<br />"), file=sequence.id.page, append=T)
-	}
-}
-
-cat("</table>", file=main.html, append=T)
-
-print(paste("Single sequences:", single.sequences))
-print(paste("Sequences in multiple subclasses:", in.multiple))
-print(paste("Multiple sequences in one subclass:", multiple.in.one))
-print(paste("Matched with unmatched:", some.unmatched))
-print(paste("Count that should match 'matched' sequences:", matched))
-
-#ACGT overview
-
-#NToverview = merged[!grepl("^unmatched", merged$best_match),]
-NToverview = merged
-
-if(empty.region.filter == "leader"){
-	NToverview$seq = paste(NToverview$FR1.IMGT.seq, NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
-} else if(empty.region.filter == "FR1"){
-	NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
-} else if(empty.region.filter == "CDR1"){
-	NToverview$seq = paste(NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
-} else if(empty.region.filter == "FR2"){
-	NToverview$seq = paste(NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
-}
-
-NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))
-NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))
-NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))
-NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))
-
-#Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))
-
-#NToverview = rbind(NToverview, NTsum)
-
-NTresult = data.frame(nt=c("A", "C", "T", "G"))
-
-for(clazz in gene.classes){
-	print(paste("class:", clazz))
-	NToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),]
-	print(paste("nrow:", nrow(NToverview.sub)))
-	new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))
-	new.col.y = sum(new.col.x)
-	new.col.z = round(new.col.x / new.col.y * 100, 2)
-	
-	tmp = names(NTresult)
-	NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
-	names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))
-}
-
-write.table(NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")], NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
-
-NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]
-
-new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))
-new.col.y = sum(new.col.x)
-new.col.z = round(new.col.x / new.col.y * 100, 2)
-
-tmp = names(NTresult)
-NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
-names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep=""))
-
-names(hotspot.analysis.sum) = names(NTresult)
-
-hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)
-
-write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
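Editor's note: the ACGT overview above counts each base by deleting everything else with gsub() and taking nchar(), then converts the four sums to percentages. The same bookkeeping in a small, illustrative Python form (the function name and the example sequence are made up for this note):

    def base_composition(seq):
        # Case-insensitive A/C/G/T counts, matching the gsub("[^Aa]", ...)
        # plus nchar() trick in the R code.
        seq = seq.upper()
        counts = {base: seq.count(base) for base in "ACGT"}
        total = sum(counts.values()) or 1  # guard against empty input
        percentages = {base: round(100.0 * n / total, 2)
                       for base, n in counts.items()}
        return counts, percentages

    # e.g. the concatenated FR2+CDR2+FR3 sequence of one record:
    counts, perc = base_composition("tgtgcgaga")
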
diff -r 5ffd52fc35c4 -r bcec7bb4e089 shm_csr.py
--- a/shm_csr.py	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,285 +0,0 @@
-from __future__ import division
-from collections import defaultdict
-import re
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input",
-					help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation")
-parser.add_argument("--genes", help="The genes available in the 'best_match' column")
-parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2'])
-parser.add_argument("--output", help="Output file")
-
-args = parser.parse_args()
-
-infile = args.input
-genes = str(args.genes).split(",")
-empty_region_filter = args.empty_region_filter
-outfile = args.output
-
-genedic = dict()
-
-mutationdic = dict()
-mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
-NAMatchResult = (None, None, None, None, None, None, '')
-linecount = 0
-
-IDIndex = 0
-best_matchIndex = 0
-fr1Index = 0
-cdr1Index = 0
-fr2Index = 0
-cdr2Index = 0
-fr3Index = 0
-first = True
-IDlist = []
-mutationList = []
-mutationListByID = {}
-cdr1LengthDic = {}
-cdr2LengthDic = {}
-
-with open(infile, 'r') as i:
-	for line in i:
-		if first:
-			linesplt = line.split("\t")
-			IDIndex = linesplt.index("Sequence.ID")
-			best_matchIndex = linesplt.index("best_match")
-			fr1Index = linesplt.index("FR1.IMGT")
-			cdr1Index = linesplt.index("CDR1.IMGT")
-			fr2Index = linesplt.index("FR2.IMGT")
-			cdr2Index = linesplt.index("CDR2.IMGT")
-			fr3Index = linesplt.index("FR3.IMGT")
-			cdr1LengthIndex = linesplt.index("CDR1.IMGT.length")
-			cdr2LengthIndex = linesplt.index("CDR2.IMGT.length")
-			first = False
-			continue
-		linecount += 1
-		linesplt = line.split("\t")
-		ID = linesplt[IDIndex]
-		genedic[ID] = linesplt[best_matchIndex]
-		try:
-			mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else []
-			mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else []
-			mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else []
-			mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else []
-			mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
-			mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []
-		except Exception as e:
-			print "Something went wrong while processing this line:"
-			print linesplt
-			print linecount
-			print e
-		mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
-		mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
-
-		cdr1Length = linesplt[cdr1LengthIndex]
-		cdr2Length = linesplt[cdr2LengthIndex]
-
-		cdr1LengthDic[ID] = int(cdr1Length) if cdr1Length != "X" else 0
-		cdr2LengthDic[ID] = int(cdr2Length) if cdr2Length != "X" else 0
-		
-		IDlist += [ID]
-
-AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent
-if AALength < 60:
-	AALength = 64
-
-AA_mutation = [0] * AALength
-AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]}
-AA_mutation_empty = AA_mutation[:]
-
-aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa
...otspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]
-		WRCY = [(int(x), int(y), z) for (x, y, z) in
-				[hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]]
-		WA = [(int(x), int(y), z) for (x, y, z) in
-			 [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
-		TW = [(int(x), int(y), z) for (x, y, z) in
-			 [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
-		RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0
-
-		mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
-		for mutation in mutationList:
-			frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
-			mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW])
-			mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY])
-			mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA])
-			mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW])
-
-			in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW])
-
-			if in_how_many_motifs > 0:
-				RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs
-				WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs
-				WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs
-				TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs
-
-
-def mean(lst):
-	return (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0
-
-
-def median(lst):
-	lst = sorted(lst)
-	l = len(lst)
-	if l == 0:
-		return 0
-	if l == 1:
-		return lst[0]
-		
-	l = int(l / 2)
-	
-	if len(lst) % 2 == 0:
-		return float(lst[l] + lst[(l - 1)]) / 2.0
-	else:
-		return lst[l]
-
-funcs = {"mean": mean, "median": median, "sum": sum}
-
-directory = outfile[:outfile.rfind("/") + 1]
-value = 0
-valuedic = dict()
-
-for fname in funcs.keys():
-	for gene in genes:
-		with open(directory + gene + "_" + fname + "_value.txt", 'r') as v:
-			valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())
-	with open(directory + "all_" + fname + "_value.txt", 'r') as v:
-		valuedic["total_" + fname] = float(v.readlines()[0].rstrip())
-	
-
-def get_xyz(lst, gene, f, fname):
-	x = round(round(f(lst), 1))
-	y = valuedic[gene + "_" + fname]
-	z = str(round(x / float(y) * 100, 1)) if y != 0 else "0"
-	return (str(x), str(y), z)
-
-dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
-arr = ["RGYW", "WRCY", "WA", "TW"]
-
-geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
-
-for fname in funcs.keys():
-	func = funcs[fname]
-	foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
-	with open(foutfile, 'w') as o:
-		for typ in arr:
-			o.write(typ + " (%)")
-			curr = dic[typ]
-			for gene in genes:
-				geneMatcher = geneMatchers[gene]
-				if valuedic[gene + "_" + fname] is 0:
-					o.write(",0,0,0")
-				else:
-					x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
-					o.write("," + x + "," + y + "," + z)
-			x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)
-			#x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)
-			o.write("," + x + "," + y + "," + z + "\n")
-
-
-# for testing
-seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt"
-with open(seq_motif_file, 'w') as o:
-	o.write("ID\tRGYW\tWRCY\tWA\tTW\n")
-	for ID in IDlist:
-		#o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n")
-		o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n")
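Editor's note: the hotspot loop in the removed shm_csr.py avoids double-counting — a mutation that falls inside several motif windows (RGYW/WRCY/WA/TW) contributes 1/n to each of the n matching counters, so every mutation adds exactly 1.0 in total. An illustrative Python reduction of that rule (the function and its arguments are hypothetical, not part of the tool):

    def motif_fractions(position, motif_ranges):
        # motif_ranges maps motif name -> list of (start, end) windows.
        # A mutation's weight is split evenly over every motif whose
        # window covers its position, as in the original loop.
        hits = [name for name, ranges in motif_ranges.items()
                if any(start <= position <= end for start, end in ranges)]
        return {name: 1.0 / len(hits) for name in hits} if hits else {}

    # A mutation at position 12 inside both an RGYW and a WA window
    # contributes 0.5 to each motif's count:
    print(motif_fractions(12, {"RGYW": [(10, 13)], "WA": [(11, 12)], "TW": []}))
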
diff -r 5ffd52fc35c4 -r bcec7bb4e089 shm_csr.r
--- a/shm_csr.r	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,542 +0,0 @@
-library(data.table)
-library(ggplot2)
-library(reshape2)
-
-args <- commandArgs(trailingOnly = TRUE)
-
-input = args[1]
-genes = unlist(strsplit(args[2], ","))
-outputdir = args[3]
-empty.region.filter = args[4]
-setwd(outputdir)
-
-dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
-
-if(length(dat$Sequence.ID) == 0){
- setwd(outputdir)
- result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
- row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of G C (%)")
- write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
- transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
- row.names(transitionTable) = c("A", "C", "G", "T")
- transitionTable["A","A"] = NA
- transitionTable["C","C"] = NA
- transitionTable["G","G"] = NA
- transitionTable["T","T"] = NA
-
- write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
- cat("0", file="n.txt")
- stop("No data")
-}
-
-cleanup_columns = c("FR1.IMGT.c.a",
-                    "FR2.IMGT.g.t",
-                    "CDR1.IMGT.Nb.of.nucleotides",
-                    "CDR2.IMGT.t.a",
-                    "FR1.IMGT.c.g",
-                    "CDR1.IMGT.c.t",
-                    "FR2.IMGT.a.c",
-                    "FR2.IMGT.Nb.of.mutations",
-                    "FR2.IMGT.g.c",
-                    "FR2.IMGT.a.g",
-                    "FR3.IMGT.t.a",
-                    "FR3.IMGT.t.c",
-                    "FR2.IMGT.g.a",
-                    "FR3.IMGT.c.g",
-                    "FR1.IMGT.Nb.of.mutations",
-                    "CDR1.IMGT.g.a",
-                    "CDR1.IMGT.t.g",
-                    "CDR1.IMGT.g.c",
-                    "CDR2.IMGT.Nb.of.nucleotides",
-                    "FR2.IMGT.a.t",
-                    "CDR1.IMGT.Nb.of.mutations",
-                    "CDR3.IMGT.Nb.of.nucleotides",
-                    "CDR1.IMGT.a.g",
-                    "FR3.IMGT.a.c",
-                    "FR1.IMGT.g.a",
-                    "FR3.IMGT.a.g",
-                    "FR1.IMGT.a.t",
-                    "CDR2.IMGT.a.g",
-                    "CDR2.IMGT.Nb.of.mutations",
-                    "CDR2.IMGT.g.t",
-                    "CDR2.IMGT.a.c",
-                    "CDR1.IMGT.t.c",
-                    "FR3.IMGT.g.c",
-                    "FR1.IMGT.g.t",
-                    "FR3.IMGT.g.t",
-                    "CDR1.IMGT.a.t",
-                    "FR1.IMGT.a.g",
-                    "FR3.IMGT.a.t",
-                    "FR3.IMGT.Nb.of.nucleotides",
-                    "FR2.IMGT.t.c",
-                    "CDR2.IMGT.g.a",
-                    "FR2.IMGT.t.a",
-                    "CDR1.IMGT.t.a",
-                    "FR2.IMGT.t.g",
-                    "FR3.IMGT.t.g",
-                    "FR2.IMGT.Nb.of.nucleotides",
-                    "FR1.IMGT.t.a",
-                    "FR1.IMGT.t.g",
-                    "FR3.IMGT.c.t",
-                    "FR1.IMGT.t.c",
-                    "CDR2.IMGT.a.t",
-                    "FR2.IMGT.c.t",
-                    "CDR1.IMGT.g.t",
-                    "CDR2.IMGT.t.g",
-                    "FR1.IMGT.Nb.of.nucleotides",
-                    "CDR1.IMGT.c.g",
-                    "CDR2.IMGT.t.c",
-                    "FR3.IMGT.g.a",
-                    "CDR1.IMGT.a.c",
-                    "FR2.IMGT.c.a",
-                    "FR3.IMGT.Nb.of.mutations",
-                    "FR2.IMGT.c.g",
-                    "CDR2.IMGT.g.c",
-                    "FR1.IMGT.g.c",
-                    "CDR2.IMGT.c.t",
-                    "FR3.IMGT.c.a",
-                    "CDR1.IMGT.c.a",
-                    "CDR2.IMGT.c.g",
-                    "CDR2.IMGT.c.a",
-                    "FR1.IMGT.c.t",
-                    "FR1.IMGT.Nb.of.silent.mutations",
-                    "FR2.IMGT.Nb.of.silent.mutations",
-                    "FR3.IMGT.Nb.of.silent.mutations",
-                    "FR1.IMGT.Nb.of.nonsilent.mutations",
-                    "FR2.IMGT.Nb.of.nonsilent.mutations",
-                    "FR3.IMGT.Nb.of.nonsilent.mutations")
-
-print("Cleaning up columns")
-
-for(col in cleanup_columns){
- dat[,col] = gsub("\\(.*\\)", "", dat[,col])
- #dat[dat[,col] == "",] = "0"
- dat[,col] = as.numeric(dat[,col])
- dat[is.na(dat[,col]),col] = 0
-}
-
-regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3")
-if(empty.region.filter == "FR1") {
-    regions = c("CDR1", "FR2", "CDR2", "FR3")
-} else if (empty.region.filter == "CDR1") {
-    regions = c("FR2", "CDR2", "FR3")
-} else if (empty.region.filter == "FR2") {
-    regions = c("CDR2", "FR3")
-}
-
-sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) }
-
-print("aggregating data into new columns")
-
-VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="")
-dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns)
-
-VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="")
-dat$FR3.IMGT.Nb.of.nucleotides = nchar([...]
[...]hape = NA)
-p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"))
-p = p + scale_fill_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))
-p = p + scale_colour_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))
-
-png(filename="scatter.png")
-print(p)
-dev.off()
-
-write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
-
-print("Plotting frequency ranges plot")
-
-dat$best_match_class = substr(dat$best_match, 0, 3)
-freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")
-dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)
-
-frequency_bins_sum = data.frame(data.table(dat)[, list(class_sum=sum(.N)), by=c("best_match_class")])
-
-frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])
-
-frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class")
-
-frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)
-
-p = ggplot(frequency_bins_data, aes(frequency_bins, frequency))
-p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"))
-p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + scale_fill_manual(values=c("IGA" = "blue4", "IGG" = "olivedrab3", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))
-
-png(filename="frequency_ranges.png")
-print(p)
-dev.off()
-
-frequency_bins_data_by_class = frequency_bins_data
-
-frequency_bins_data_by_class = frequency_bins_data_by_class[order(frequency_bins_data_by_class$best_match_class, frequency_bins_data_by_class$frequency_bins),]
-
-frequency_bins_data_by_class$frequency_bins = gsub("-", " to ", frequency_bins_data_by_class$frequency_bins)
-frequency_bins_data_by_class[frequency_bins_data_by_class$frequency_bins == "20", c("frequency_bins")] = "20 or higher"
-frequency_bins_data_by_class[frequency_bins_data_by_class$frequency_bins == "0", c("frequency_bins")] = "0 or lower"
-
-write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\t",quote=F,row.names=F,col.names=T)
-
-frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "best_match_class", "frequency_bins")])
-
-frequency_bins_sum = data.frame(data.table(dat)[, list(class_sum=sum(.N)), by=c("best_match")])
-
-frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match")
-
-frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)
-
-frequency_bins_data = frequency_bins_data[order(frequency_bins_data$best_match, frequency_bins_data$frequency_bins),]
-frequency_bins_data$frequency_bins = gsub("-", " to ", frequency_bins_data$frequency_bins)
-frequency_bins_data[frequency_bins_data$frequency_bins == "20", c("frequency_bins")] = "20 or higher"
-frequency_bins_data[frequency_bins_data$frequency_bins == "0", c("frequency_bins")] = "0 or lower"
-
-write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T)
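The deleted shm_csr.r bins each sequence's mutation percentage with cut() into fixed, right-closed ranges before tabulating counts per (sub)class. A rough Python 3 sketch of the same binning step, reusing the break points and labels from the script; the sample percentages are invented:

import bisect

# Same break points as the R call cut(..., breaks=c(-Inf, 0, 2, 5, 10, 15, 20, Inf)).
breaks = [0, 2, 5, 10, 15, 20]
labels = ["0", "0-2", "2-5", "5-10", "10-15", "15-20", "20"]

def frequency_bin(percentage):
    # bisect_left counts break points strictly below the value, which
    # reproduces R's right-closed intervals (e.g. 2.0 falls in "0-2").
    return labels[bisect.bisect_left(breaks, percentage)]

# Invented example values, not taken from the changeset.
for pct in [0.0, 1.5, 2.0, 7.3, 25.0]:
    print(pct, "->", frequency_bin(pct))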
diff -r 5ffd52fc35c4 -r bcec7bb4e089 shm_csr.xml
--- a/shm_csr.xml	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,196 +0,0 @@
-<tool id="shm_csr" name="SHM & CSR pipeline" version="1.0">
-    <description></description>
-    <command interpreter="bash">
-        wrapper.sh $in_file custom $out_file $out_file.files_path ${in_file.name} "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_uniques $class_filter_cond.class_filter $empty_region_filter $fast
-    </command>
-    <inputs>
-        <param name="in_file" type="data" label="IMGT zip file to be analysed" />
-        <param name="empty_region_filter" type="select" label="Sequence starts at" help="" >
-            <option value="leader" selected="true">Leader: include FR1, CDR1, FR2, CDR2, FR3 in filters</option>
-            <option value="FR1" selected="true">FR1: include CDR1,FR2,CDR2,FR3 in filters</option>
-            <option value="CDR1">CDR1: include FR2,CDR2,FR3 in filters</option>
-            <option value="FR2">FR2: include CDR2,FR3 in filters</option>
-        </param>
-        <param name="functionality" type="select" label="Functionality filter" help="" >
-            <option value="productive" selected="true">Productive (Productive and Productive see comment)</option>
-            <option value="unproductive">Unproductive (Unproductive and Unproductive see comment)</option>
-            <option value="remove_unknown">Productive and Unproductive (Productive, Productive see comment, Unproductive, Unproductive and Unproductive see comment)</option>
-        </param>
-        <param name="filter_uniques" type="select" label="Filter unique sequences" help="See below for an example.">
-            <option value="remove" selected="true">Remove uniques (Based on nucleotide sequence + C)</option>
-            <option value="keep">Keep uniques (Based on nucleotide sequence + C)</option>
-            <option value="no">No</option>
-        </param>
-        <param name="unique" type="select" label="Remove duplicates based on" help="" >
-            <option value="VGene,CDR3.IMGT.AA,best_match_class">Top.V.Gene, CDR3 (AA), C region</option>
-            <option value="VGene,CDR3.IMGT.AA">Top.V.Gene, CDR3 (AA)</option>
-            <option value="CDR3.IMGT.AA,best_match_class">CDR3 (AA), C region</option>
-            <option value="CDR3.IMGT.AA">CDR3 (AA)</option>
-
-            <option value="VGene,CDR3.IMGT.seq,best_match_class">Top.V.Gene, CDR3 (nt), C region</option>
-            <option value="VGene,CDR3.IMGT.seq">Top.V.Gene, CDR3 (nt)</option>
-            <option value="CDR3.IMGT.seq,best_match_class">CDR3 (nt), C region</option>
-            <option value="CDR3.IMGT.seq">CDR3 (nt)</option>
-            <option value="Sequence.ID" selected="true">Don't remove duplicates</option>
-        </param>
-        <conditional name="class_filter_cond">
-            <param name="class_filter" type="select" label="Human Class/Subclass filter" help="" >
-                <option value="70_70" selected="true">>70% class and >70% subclass</option>
-                <option value="60_55">>60% class and >55% subclass</option>
-                <option value="70_0">>70% class</option>
-                <option value="60_0">>60% class</option>
-                <option value="101_101">Do not assign (sub)class</option>
-            </param>
-        </conditional>
-        <conditional name="naive_output_cond">
-            <param name="naive_output" type="select" label="Output new IMGT archives per class into your history?">
-                <option value="yes">Yes</option>
-                <option value="no" selected="true">No</option>
-            </param>
-        </conditional>
-        <param name="fast" type="select" label="Fast" help="Skips generating the new ZIP files and Change-O/Baseline" >
-            <option value="yes">Yes</option>
-            <option value="no" selected="true">No</option>
-        </param>
-    </inputs>
-    <outputs>
-        <data format="html" name="out_file" label = "SHM & CSR on ${in_file.name}"/>
-        <data format="imgt_archive" name="naive_output_ca" label = "Naive CA input data from ${in_file.name}" >
-            <filter>naive_output_cond['naive_output'] == "yes"</filter>
-            <filter>class_filter_cond['class_filter'] != "101_101"</filter>
-        </data>
-        <data format="imgt_archive" name="naive_output_cg" label = "Naive CG input data from ${in_file.name}" >
-            <f[...]
[...]lysed region are excluded
-- All other filtering/analysis is based on the analysed region
-
------
-
-**Functionality filter**
-
-Allows filtering on productive rearrangements, unproductive rearrangements or both, based on the assignment provided by IMGT.
-
-**Filter unique sequences**
-
-*Remove unique:*
-
-This filter consists of two different steps.
-
-Step 1: removes all sequences of which the nucleotide sequence in the “analysed region” (see sequence starts at filter) occurs only once. (Sub)classes are not taken into account in this filter step.
-
-Step 2: removes all duplicate sequences (sequences with the exact same nucleotide sequence in the analysed region and the same (sub)class).
-
-.. class:: infomark
-
-Note: This means that sequences with the same nucleotide sequence but a different (sub)class will be included in the results of both (sub)classes.
-
-*Keep unique:*
-
-Removes all duplicate sequences (sequences with the exact same nucleotide sequence in the analysed region and the same (sub)class).
-
-Example of the sequences that are included using either the “remove unique filter” or the “keep unique filter”:
-
-+--------------------------+
-| unique filter            |
-+--------+--------+--------+
-| values | remove | keep   |
-+--------+--------+--------+
-| A      | A      | A      |
-+--------+--------+--------+
-| A      | B      | B      |
-+--------+--------+--------+
-| B      | D      | C      |
-+--------+--------+--------+
-| B      |        | D      |
-+--------+--------+--------+
-| C      |        |        |
-+--------+--------+--------+
-| D      |        |        |
-+--------+--------+--------+
-| D      |        |        |
-+--------+--------+--------+
-
------
-
-**Remove duplicates based on**
-
-Allows the selection of a single sequence per clone. Different definitions of a clone can be chosen.
-
-.. class:: infomark
-
-Note: The first sequence (in the data set) of each clone is always included in the analysis. When the first sequence of a clone is unmatched (no subclass assigned), the first matched sequence will be included instead. This means that altering the data order (for instance by sorting) can change which sequence is included in the analysis and therefore slightly influence results.
-
------
-
-**Human Class/Subclass filter**
-
-.. class:: warningmark
-
-Note: This filter should only be applied when analysing human IGH data in which a (sub)class specific sequence is present. Otherwise please select the "do not assign (sub)class" option to prevent errors when running the pipeline.
-
-The class percentage is based on the ‘chunk hit percentage’ (see below). The subclass percentage is based on the ‘nt hit percentage’ (see below).
-
-The SHM & CSR pipeline identifies human Cµ, Cα, Cγ and Cε constant genes by dividing the reference sequences for the subclasses (NG_001019) into 8-nucleotide chunks which overlap by 4 nucleotides. These overlapping chunks are then individually aligned in the right order to each input sequence. This alignment is used to calculate the chunk hit percentage and the nt hit percentage.
-
-*Chunk hit percentage*: the percentage of the chunks that is aligned.
-
-*Nt hit percentage*: the percentage of chunks covering the subclass-specific nucleotide that match the respective subclass. The most stringent filter for the subclass is a 70% ‘nt hit percentage’, which means that 5 out of 7 subclass-specific nucleotides for Cα, or 6 out of 8 subclass-specific nucleotides for Cγ, should match the specific subclass.
-
------
-
-**Output new IMGT archives per class into your history?**
-
-If yes is selected, additional output files (one for each class) will be added to the history, containing information on the sequences that passed the selected filtering criteria. These files are in the same format as the IMGT/HighV-QUEST output files and are therefore also compatible with many other analysis programs, such as IGGalaxy.
-
-]]>
-    </help>
-</tool>
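The help text above is the clearest statement of how (sub)class identification works: the subclass reference sequences (NG_001019) are cut into 8-nucleotide chunks that overlap by 4 nucleotides, and each chunk is aligned in order to the read. A minimal Python 3 sketch of just the chunking step, assuming a toy sequence in place of the real reference:

def chunk_reference(seq, size=8, step=4):
    # Cut a reference sequence into `size`-nt chunks; consecutive chunks
    # start `step` nt apart, so they overlap by `size - step` nt (8 and 4
    # here, matching the help text above).
    return [seq[i:i + size] for i in range(0, len(seq) - size + 1, step)]

# Invented toy sequence, standing in for a subclass reference from NG_001019.
print(chunk_reference("ACGTACGTACGTACGT"))
# ['ACGTACGT', 'ACGTACGT', 'ACGTACGT']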
diff -r 5ffd52fc35c4 -r bcec7bb4e089 style.tar.gz
Binary file style.tar.gz has changed
diff -r 5ffd52fc35c4 -r bcec7bb4e089 subclass_definition.db.nhr
Binary file subclass_definition.db.nhr has changed
diff -r 5ffd52fc35c4 -r bcec7bb4e089 subclass_definition.db.nin
Binary file subclass_definition.db.nin has changed
diff -r 5ffd52fc35c4 -r bcec7bb4e089 subclass_definition.db.nsq
Binary file subclass_definition.db.nsq has changed
diff -r 5ffd52fc35c4 -r bcec7bb4e089 summary_to_fasta.py
--- a/summary_to_fasta.py	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
| @@ -1,42 +0,0 @@ -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument("--input", help="The 1_Summary file of an IMGT zip file") -parser.add_argument("--fasta", help="The output fasta file") - -args = parser.parse_args() - -infile = args.input -fasta = args.fasta - -with open(infile, 'r') as i, open(fasta, 'w') as o: - first = True - id_col = 0 - seq_col = 0 - no_results = 0 - no_seqs = 0 - passed = 0 - for line in i: - splt = line.split("\t") - if first: - id_col = splt.index("Sequence ID") - seq_col = splt.index("Sequence") - first = False - continue - if len(splt) < 5: - no_results += 1 - continue - - ID = splt[id_col] - seq = splt[seq_col] - - if not len(seq) > 0: - no_seqs += 1 - continue - - o.write(">" + ID + "\n" + seq + "\n") - passed += 1 - - print "No results:", no_results - print "No sequences:", no_seqs - print "Written to fasta file:", passed |
diff -r 5ffd52fc35c4 -r bcec7bb4e089 tool_dependencies.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Dec 12 05:22:57 2016 -0500
| @@ -0,0 +1,36 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="igblastwrp" version="0.6"> + <install version="1.0"> + <actions> + <action type="download_by_url">https://github.com/mikessh/higblast/releases/download/v0.6/igblastwrapper_linux64.tar.gz</action> + <action type="move_file"> + <source>bin</source> + <destination>$INSTALL_DIR/</destination> + </action> + <action type="move_file"> + <source>data</source> + <destination>$INSTALL_DIR/</destination> + </action> + <action type="move_file"> + <source>igblastwrp.jar</source> + <destination>$INSTALL_DIR/</destination> + </action> + <action type="set_environment"> + <environment_variable action="set_to" name="IGBLASTWRP">$INSTALL_DIR/</environment_variable> + </action> + </actions> + </install> + <readme> +Downloads https://github.com/mikessh/higblast/ + </readme> + </package> + <package name="weblogo" version="3.3"> + <repository changeset_revision="648e4b32f15c" name="package_weblogo_3_3" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <!-- + <package name="circostools" version="0.20"> + <repository name="package_circostools_0_20" owner="iuc" /> + </package> + --> +</tool_dependency> |
diff -r 5ffd52fc35c4 -r bcec7bb4e089 wrapper.sh
--- a/wrapper.sh	Mon Dec 12 05:22:37 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,772 +0,0 @@
-#!/bin/bash
-#set -e
-dir="$(cd "$(dirname "$0")" && pwd)"
-input=$1
-method=$2
-log=$3 #becomes the main html page at the end
-outdir=$4
-output="$outdir/index.html" #copied to $log location at the end
-title=$5
-include_fr1=$6
-functionality=$7
-unique=$8
-naive_output=$9
-naive_output_ca=${10}
-naive_output_cg=${11}
-naive_output_cm=${12}
-naive_output_ce=${13}
-naive_output_all=${14}
-filter_unique=${15}
-class_filter=${16}
-empty_region_filter=${17}
-fast=${18}
-mkdir $outdir
-
-tar -xzf $dir/style.tar.gz -C $outdir
-
-echo "---------------- read parameters ----------------"
-echo "---------------- read parameters ----------------<br />" > $log
-
-echo "unpacking IMGT file"
-
-type="`file $input`"
-if [[ "$type" == *"Zip archive"* ]] ; then
-	echo "Zip archive"
-	echo "unzip $input -d $PWD/files/"
-	unzip $input -d $PWD/files/
-elif [[ "$type" == *"XZ compressed data"* ]] ; then
-	echo "XZ archive"
-	echo "tar -xJf $input -C $PWD/files/"
-	mkdir -p $PWD/files/$title
-	tar -xJf $input -C $PWD/files/$title
-fi
-
-cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
-cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
-cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
-cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
-cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
-cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
-cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
-
-if [[ ${#BLASTN_DIR} -ge 5 ]] ; then
-	echo "On server, using BLASTN_DIR env: ${BLASTN_DIR}"
-else
-	BLASTN_DIR="/home/galaxy/Downloads/ncbi-blast-2.4.0+/bin"
-	echo "Dev Galaxy set BLASTN_DIR to: ${BLASTN_DIR}"
-fi
-
-echo "---------------- class identification ----------------"
-echo "---------------- class identification ----------------<br />" >> $log
-
-python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
-
-echo "---------------- merge_and_filter.r ----------------"
-echo "---------------- merge_and_filter.r ----------------<br />" >> $log
-
-Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $PWD/aa.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} ${empty_region_filter} 2>&1
-
-if [[ "$fast" == "no" ]] ; then
-
-	echo "---------------- creating new IMGT zips ----------------"
-	echo "---------------- creating new IMGT zips ----------------<br />" >> $log
-
-	mkdir $outdir/new_IMGT
-
-	cat `find $PWD/files/ -name "1_*"` > "$outdir/new_IMGT/1_Summary.txt"
-	cat `find $PWD/files/ -name "2_*"` > "$outdir/new_IMGT/2_IMGT-gapped-nt-sequences.txt"
-	cat `find $PWD/files/ -name "3_*"` > "$outdir/new_IMGT/3_Nt-sequences.txt"
-	cat `find $PWD/files/ -name "4_*"` > "$outdir/new_IMGT/4_IMGT-gapped-AA-sequences.txt"
-	cat `find $PWD/files/ -name "5_*"` > "$outdir/new_IMGT/5_AA-sequences.txt"
-	cat `find $PWD/files/ -name "6_*"` > "$outdir/new_IMGT/6_Junction.txt"
-	cat `find $PWD/files/ -name "7_*"` > "$outdir/new_IMGT/7_V-REGION-mutation-and-AA-change-table.txt"
-	cat `find $PWD/files/ -name "8_*"` > "$outdir/new_IMGT/8_V-REGION-nt-mutation-statistics.txt"
-	cat `find $PWD/files/ -name "9_*"` > "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt"
-	cat `find $PWD/files/ -name "10_*"` > "$outdir/new_IMGT/10_V-REGION-mutation-hotspots.txt"
-
-	mkdir $outdir/new_IMGT_IGA
-	cp $outdir/new_IMGT/* $outdir/new_IMGT_IGA
-
-	mkdir $outdir/new_IMGT_IGA1
-	cp $outdir/new_IMGT/* $outdir/new_IMGT_IGA1
-
-	mkdir $outdir/new_IMGT_IGA2
-	cp $outdir/new_IMGT/* $outdir/new_IMGT_IGA2
-
-	mkdir $outdir/new_IMGT_IGG
-	cp $outdir/new_IMGT/* $outdir/new_IMGT_IGG
-
-	mkdir $outdir/new_IMGT_IGG1
-	cp $outdir/new_IMGT/* $outdir/new_IMGT_IGG1
-
-	mkdir $outdir/new_IMGT_IGG2
-	cp $outdir/new_IMGT/* $outdir/new_IMGT_IGG2
-
-	mkdir $outdir/new_IMGT_IGG3
-	cp $outdir/new_IMGT/* $outdir/ne[...]
[...]$output
-
-echo "</div>" >> $output #downloads tab end
-
-echo "</div>" >> $output #tabs end
-
-echo "</html>" >> $output
-
-
-if [[ "$fast" == "no" ]] ; then
-
-	echo "---------------- baseline ----------------"
-	echo "---------------- baseline ----------------<br />" >> $log
-	tmp="$PWD"
-
-	mkdir $outdir/baseline
-
-
-	mkdir $outdir/baseline/IGA_IGG_IGM
-	if [[ $(wc -l < $outdir/new_IMGT/1_Summary.txt) -gt "1" ]]; then
-		cd $outdir/baseline/IGA_IGG_IGM
-		bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT.txz "IGA_IGG_IGM" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt"
-	else
-		echo "No sequences" > "$outdir/baseline.txt"
-	fi
-
-	mkdir $outdir/baseline/IGA
-	if [[ $(wc -l < $outdir/new_IMGT_IGA/1_Summary.txt) -gt "1" ]]; then
-		cd $outdir/baseline/IGA
-		bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_IGA.txz "IGA" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_IGA.pdf" "Sequence.ID" "$outdir/baseline_IGA.txt"
-	else
-		echo "No IGA sequences" > "$outdir/baseline_IGA.txt"
-	fi
-
-	mkdir $outdir/baseline/IGG
-	if [[ $(wc -l < $outdir/new_IMGT_IGG/1_Summary.txt) -gt "1" ]]; then
-		cd $outdir/baseline/IGG
-		bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_IGG.txz "cg" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_IGG.pdf" "Sequence.ID" "$outdir/baseline_IGG.txt"
-	else
-		echo "No IGG sequences" > "$outdir/baseline_IGG.txt"
-	fi
-
-	mkdir $outdir/baseline/IGM
-	if [[ $(wc -l < $outdir/new_IMGT_IGM/1_Summary.txt) -gt "1" ]]; then
-		cd $outdir/baseline/IGM
-		bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_IGM.txz "IGM" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_IGM.pdf" "Sequence.ID" "$outdir/baseline_IGM.txt"
-	else
-		echo "No IGM sequences" > "$outdir/baseline_IGM.txt"
-	fi
-
-	mkdir $outdir/baseline/IGE
-	if [[ $(wc -l < $outdir/new_IMGT_IGE/1_Summary.txt) -gt "1" ]]; then
-		cd $outdir/baseline/IGE
-		bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_IGE.txz "IGE" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_IGE.pdf" "Sequence.ID" "$outdir/baseline_IGE.txt"
-	else
-		echo "No IGE sequences" > "$outdir/baseline_IGE.txt"
-	fi
-
-	cd $tmp
-
-	echo "Cleaning up *.RData files"
-	find $outdir/baseline -name "*.RData" -type f -delete
-
-fi
-
-echo "---------------- naive_output.r ----------------"
-echo "---------------- naive_output.r ----------------<br />" >> $log
-
-if [[ "$naive_output" == "yes" ]]
-then
-	echo "output naive output"
-	if [[ "${class_filter}" == "101_101" ]]
-	then
-		echo "copy new_IMGT.txz to ${naive_output_all}"
-		cp $outdir/new_IMGT.txz ${naive_output_all}
-	else
-		echo "copy for classes"
-		cp $outdir/new_IMGT_IGA.txz ${naive_output_ca}
-		cp $outdir/new_IMGT_IGG.txz ${naive_output_cg}
-		cp $outdir/new_IMGT_IGM.txz ${naive_output_cm}
-		cp $outdir/new_IMGT_IGE.txz ${naive_output_ce}
-	fi
-fi
-
-echo "</table>" >> $outdir/base_overview.html
-
-mv $log $outdir/log.html
-
-echo "<html><center><h1><a href='index.html'>Click here for the results</a></h1>Tip: Open it in a new tab (middle mouse button or right mouse button -> 'open in new tab' on the link above)<br />" > $log
-echo "<table border = 1>" >> $log
-echo "<thead><tr><th>Info</th><th>Sequences</th><th>Percentage</th></tr></thead>" >> $log
-tIFS="$TMP"
-IFS=$'\t'
-while read step seq perc
-	do
-		echo "<tr>" >> $log
-		echo "<td>$step</td>" >> $log
-		echo "<td>$seq</td>" >> $log
-		echo "<td>${perc}%</td>" >> $log
-		echo "</tr>" >> $log
-done < $outdir/filtering_steps.txt
-echo "</table border></center></html>" >> $log
-
-IFS="$tIFS"
-
-
-echo "---------------- Done! ----------------"
-echo "---------------- Done! ----------------<br />" >> $outdir/log.html
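The deleted wrapper.sh branches on the output of `file` to decide whether the IMGT upload is a Zip archive or an XZ-compressed tarball before unpacking. A Python 3 sketch of the same dispatch, for illustration only: the path and destination are placeholders, and the real script shells out to unzip and tar instead.

import tarfile
import zipfile

def unpack_imgt_archive(path, dest):
    # Mirrors the wrapper's branch on the archive type: try Zip first,
    # then fall back to an XZ-compressed tar archive.
    if zipfile.is_zipfile(path):
        with zipfile.ZipFile(path) as z:
            z.extractall(dest)
    elif tarfile.is_tarfile(path):
        with tarfile.open(path, mode="r:xz") as t:
            t.extractall(dest)
    else:
        raise ValueError("unsupported IMGT archive: " + path)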