Repository 'mutation_analysis'
hg clone https://toolshed.g2.bx.psu.edu/repos/davidvanzessen/mutation_analysis

Changeset 0:8a5a2abbb870 (2016-08-29)
Commit message:
Uploaded
added:
aa_histogram.r
baseline/Baseline_Functions.r
baseline/Baseline_Main.r
baseline/FiveS_Mutability.RData
baseline/FiveS_Substitution.RData
baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
baseline/comparePDFs.r
baseline/filter.r
baseline/script_imgt.py
baseline/script_xlsx.py
baseline/wrapper.sh
change_o/DefineClones.py
change_o/MakeDb.py
change_o/define_clones.r
change_o/define_clones.sh
change_o/makedb.sh
datatypes_conf.xml
gene_identification.py
imgt_loader.r
merge.r
merge_and_filter.r
mutation_analysis.py
mutation_analysis.r
mutation_analysis.xml
naive_output.r
new_imgt.r
pattern_plots.r
sequence_overview.r
style.tar.gz
subclass_definition.db.nhr
subclass_definition.db.nin
subclass_definition.db.nsq
summary_to_fasta.py
tool_dependencies.xml
wrapper.sh
b
diff -r 000000000000 -r 8a5a2abbb870 aa_histogram.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aa_histogram.r Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,61 @@
+library(ggplot2)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+mutations.by.id.file = args[1]
+absent.aa.by.id.file = args[2]
+genes = strsplit(args[3], ",")[[1]]
+genes = c(genes, "")
+outdir = args[4]
+
+
+print("---------------- read input ----------------")
+
+mutations.by.id = read.table(mutations.by.id.file, sep="\t", fill=T, header=T, quote="")
+absent.aa.by.id = read.table(absent.aa.by.id.file, sep="\t", fill=T, header=T, quote="")
+
+for(gene in genes){
+
+        if(gene == ""){
+                mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),]
+                absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),]
+        } else {
+                mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),]
+                absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),]
+        }
+        print(paste("nrow", gene, nrow(absent.aa.by.id.gene)))
+        if(nrow(mutations.by.id.gene) == 0){
+                next
+        }
+
+        mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)])
+        aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)])
+
+        dat_freq = mutations.at.position / aa.at.position
+        dat_freq[is.na(dat_freq)] = 0
+        dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
+
+        print("---------------- plot ----------------")
+
+        m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
+        m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i)
+        m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
+        m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")
+        m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2")
+        m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2")
+        m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3")
+        m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(paste(gene, "AA mutation frequency"))
+
+        print("---------------- write/print ----------------")
+
+        png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720)
+        print(m)
+        dev.off()
+
+        dat.sums = data.frame(index=1:length(mutations.at.position), mutations.at.position=mutations.at.position, aa.at.position=aa.at.position)
+
+        write.table(dat.sums, paste(outdir, "/aa_histogram_sum_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+        write.table(mutations.by.id.gene, paste(outdir, "/aa_histogram_count_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+        write.table(absent.aa.by.id.gene, paste(outdir, "/aa_histogram_absent_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+        write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+}
b
diff -r 000000000000 -r 8a5a2abbb870 baseline/Baseline_Functions.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/Baseline_Functions.r Mon Aug 29 05:36:10 2016 -0400
[
b'@@ -0,0 +1,2287 @@\n+#########################################################################################\r\n+# License Agreement\r\n+# \r\n+# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE \r\n+# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER \r\n+# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE \r\n+# OR COPYRIGHT LAW IS PROHIBITED.\r\n+# \r\n+# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE \r\n+# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED \r\n+# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN \r\n+# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.\r\n+#\r\n+# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences\r\n+# Coded by: Mohamed Uduman & Gur Yaari\r\n+# Copyright 2012 Kleinstein Lab\r\n+# Version: 1.3 (01/23/2014)\r\n+#########################################################################################\r\n+\r\n+# Global variables  \r\n+  \r\n+  FILTER_BY_MUTATIONS = 1000\r\n+\r\n+  # Nucleotides\r\n+  NUCLEOTIDES = c("A","C","G","T")\r\n+  \r\n+  # Amino Acids\r\n+  AMINO_ACIDS <- c("F", "F", "L", "L", "S", "S", "S", "S", "Y", "Y", "*", "*", "C", "C", "*", "W", "L", "L", "L", "L", "P", "P", "P", "P", "H", "H", "Q", "Q", "R", "R", "R", "R", "I", "I", "I", "M", "T", "T", "T", "T", "N", "N", "K", "K", "S", "S", "R", "R", "V", "V", "V", "V", "A", "A", "A", "A", "D", "D", "E", "E", "G", "G", "G", "G")\r\n+  names(AMINO_ACIDS) <- c("TTT", "TTC", "TTA", "TTG", "TCT", "TCC", "TCA", "TCG", "TAT", "TAC", "TAA", "TAG", "TGT", "TGC", "TGA", "TGG", "CTT", "CTC", "CTA", "CTG", "CCT", "CCC", "CCA", "CCG", "CAT", "CAC", "CAA", "CAG", "CGT", "CGC", "CGA", "CGG", "ATT", "ATC", "ATA", "ATG", "ACT", "ACC", "ACA", "ACG", "AAT", "AAC", "AAA", "AAG", "AGT", "AGC", "AGA", "AGG", "GTT", "GTC", "GTA", "GTG", "GCT", "GCC", "GCA", "GCG", "GAT", 
"GAC", "GAA", "GAG", "GGT", "GGC", "GGA", "GGG")\r\n+  names(AMINO_ACIDS) <- names(AMINO_ACIDS)\r\n+\r\n+  #Amino Acid Traits\r\n+  #"*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V" "W" "Y"\r\n+  #B = "Hydrophobic/Burried"  N = "Intermediate/Neutral"  S="Hydrophilic/Surface") \r\n+  TRAITS_AMINO_ACIDS_CHOTHIA98 <- c("*","N","B","S","S","B","N","N","B","S","B","B","S","N","S","S","N","N","B","B","N")\r\n+  names(TRAITS_AMINO_ACIDS_CHOTHIA98) <- sort(unique(AMINO_ACIDS))\r\n+  TRAITS_AMINO_ACIDS <- array(NA,21)\r\n+  \r\n+  # Codon Table\r\n+  CODON_TABLE <- as.data.frame(matrix(NA,ncol=64,nrow=12))\r\n+\r\n+  # Substitution Model: Smith DS et al. 1996\r\n+  substitution_Literature_Mouse <- matrix(c(0, 0.156222928, 0.601501588, 0.242275484, 0.172506739, 0, 0.241239892, 0.586253369, 0.54636291, 0.255795364, 0, 0.197841727, 0.290240811, 0.467680608, 0.24207858, 0),nrow=4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))\r\n+  substitution_Flu_Human <- matrix(c(0,0.2795596,0.5026927,0.2177477,0.1693210,0,0.3264723,0.5042067,0.4983549,0.3328321,0,0.1688130,0.2021079,0.4696077,0.3282844,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))\r\n+  substitution_Flu25_Human <- matrix(c(0,0.2580641,0.5163685,0.2255674,0.1541125,0,0.3210224,0.5248651,0.5239281,0.3101292,0,0.1659427,0.1997207,0.4579444,0.3423350,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))\r\n+  load("FiveS_Substitution.RData")\r\n+\r\n+  # Mutability Models: Shapiro GS et al. 
2002\r\n+  triMutability_Literature_Human <- matrix(c(0.24, 1.2, 0.96, 0.43, 2.14, 2, 1.11, 1.9, 0.85, 1.83, 2.36, 1.31, 0.82, 0.52, 0.89, 1.33, 1.4, 0.82, 1.83, 0.73, 1.83, 1.62, 1.53, 0.57, 0.92, 0.42, 0.42, 1.47, 3.44, 2.58, 1.18, 0.47, 0.39, 1.12, 1.8, 0.68, 0.47, 2.19, 2.35, 2.19, 1.05, 1.84, 1.26, 0.28, 0.98, 2.37, 0.66, 1.58, 0.67, 0.92, 1.76, 0.83, 0.97, 0.56, 0.75, 0.62, 2.26, 0.62, 0.74, 1.11, 1.16, 0.61, 0.88, 0.67, 0.37, 0.07, 1.08, 0.46, 0.31, 0.94, 0.62, 0.57, 0.29, NA, 1.44, 0.46, 0.69, 0.57, 0.24, 0.37, 1.1, 0.99, 1.39, 0.6, 2.26, 1.24, 1.36, 0.52, 0.33, 0.26, 1.25, 0.37, 0.58, 1.03, 1.2, '..b'U = lapply(1:length(facLevels),  function(x){\r\n+      computeMutabilities(facLevels[x])\r\n+    })\r\n+    facIndex = match(facGL,facLevels)\r\n+    \r\n+    LisGLs_Mutability = lapply(1:nrow(matInput),  function(x){\r\n+      cInput = rep(NA,nchar(matInput[x,1]))\r\n+      cInput[s2c(matInput[x,1])!="N"] = 1\r\n+      LisGLs_MutabilityU[[facIndex[x]]] * cInput                                                   \r\n+    })\r\n+    \r\n+    LisGLs_Targeting =  lapply(1:dim(matInput)[1],  function(x){\r\n+      computeTargeting(matInput[x,2],LisGLs_Mutability[[x]])\r\n+    })\r\n+    \r\n+    LisGLs_MutationTypes  = lapply(1:length(matInput[,2]),function(x){\r\n+      #print(x)\r\n+      computeMutationTypes(matInput[x,2])\r\n+    })\r\n+    \r\n+    LisGLs_R_Exp = lapply(1:nrow(matInput),  function(x){\r\n+      Exp_R <-  rollapply(as.zoo(1:readEnd),width=3,by=3,\r\n+                          function(codonNucs){                                                      \r\n+                            RPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="R") \r\n+                            sum( LisGLs_Targeting[[x]][,codonNucs][RPos], na.rm=T ) \r\n+                          }\r\n+      )                                                   \r\n+    })\r\n+    \r\n+    LisGLs_S_Exp = lapply(1:nrow(matInput),  function(x){\r\n+      Exp_S <-  
rollapply(as.zoo(1:readEnd),width=3,by=3,\r\n+                          function(codonNucs){                                                      \r\n+                            SPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="S")   \r\n+                            sum( LisGLs_Targeting[[x]][,codonNucs][SPos], na.rm=T )\r\n+                          }\r\n+      )                                                 \r\n+    })                                                \r\n+    \r\n+    Exp_R = matrix(unlist(LisGLs_R_Exp),nrow=nrow(matInput),ncol=readEnd/3,T)  \r\n+    Exp_S = matrix(unlist(LisGLs_S_Exp),nrow=nrow(matInput),ncol=readEnd/3,T)  \r\n+    return( list( "Expected_R"=Exp_R, "Expected_S"=Exp_S) )    \r\n+  }\r\n+}\r\n+\r\n+# getObservedMutationsByCodon <- function(listMutations){\r\n+#   numbSeqs <- length(listMutations) \r\n+#   obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3))))\r\n+#   obsMu_S <- obsMu_R\r\n+#   temp <- mclapply(1:length(listMutations), function(i){\r\n+#     arrMutations = listMutations[[i]]\r\n+#     RPos = as.numeric(names(arrMutations)[arrMutations=="R"])\r\n+#     RPos <- sapply(RPos,getCodonNumb)                                                                    \r\n+#     if(any(RPos)){\r\n+#       tabR <- table(RPos)\r\n+#       obsMu_R[i,as.numeric(names(tabR))] <<- tabR\r\n+#     }                                    \r\n+#     \r\n+#     SPos = as.numeric(names(arrMutations)[arrMutations=="S"])\r\n+#     SPos <- sapply(SPos,getCodonNumb)\r\n+#     if(any(SPos)){\r\n+#       tabS <- table(SPos)\r\n+#       obsMu_S[i,names(tabS)] <<- tabS\r\n+#     }                                          \r\n+#   }\r\n+#   )\r\n+#   return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) \r\n+# }\r\n+\r\n+getObservedMutationsByCodon <- function(listMutations){\r\n+  numbSeqs <- length(listMutations) \r\n+  obsMu_R <- 
matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3))))\r\n+  obsMu_S <- obsMu_R\r\n+  temp <- lapply(1:length(listMutations), function(i){\r\n+    arrMutations = listMutations[[i]]\r\n+    RPos = as.numeric(names(arrMutations)[arrMutations=="R"])\r\n+    RPos <- sapply(RPos,getCodonNumb)                                                                    \r\n+    if(any(RPos)){\r\n+      tabR <- table(RPos)\r\n+      obsMu_R[i,as.numeric(names(tabR))] <<- tabR\r\n+    }                                    \r\n+    \r\n+    SPos = as.numeric(names(arrMutations)[arrMutations=="S"])\r\n+    SPos <- sapply(SPos,getCodonNumb)\r\n+    if(any(SPos)){\r\n+      tabS <- table(SPos)\r\n+      obsMu_S[i,names(tabS)] <<- tabS\r\n+    }                                          \r\n+  }\r\n+  )\r\n+  return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) \r\n+}\r\n+\r\n'
b
diff -r 000000000000 -r 8a5a2abbb870 baseline/Baseline_Main.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/Baseline_Main.r Mon Aug 29 05:36:10 2016 -0400
[
b'@@ -0,0 +1,388 @@\n+#########################################################################################\r\n+# License Agreement\r\n+# \r\n+# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE \r\n+# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER \r\n+# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE \r\n+# OR COPYRIGHT LAW IS PROHIBITED.\r\n+# \r\n+# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE \r\n+# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED \r\n+# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN \r\n+# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.\r\n+#\r\n+# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences\r\n+# Coded by: Mohamed Uduman & Gur Yaari\r\n+# Copyright 2012 Kleinstein Lab\r\n+# Version: 1.3 (01/23/2014)\r\n+#########################################################################################\r\n+\r\n+op <- options();\r\n+options(showWarnCalls=FALSE, showErrorCalls=FALSE, warn=-1)\r\n+library(\'seqinr\')\r\n+if( F & Sys.info()[1]=="Linux"){\r\n+  library("multicore")\r\n+}\r\n+\r\n+# Load functions and initialize global variables\r\n+source("Baseline_Functions.r")\r\n+\r\n+# Initialize parameters with user provided arguments\r\n+  arg <- commandArgs(TRUE)                       \r\n+  #arg = c(2,1,5,5,0,1,"1:26:38:55:65:104:116", "test.fasta","","sample")\r\n+  #arg = c(1,1,5,5,0,1,"1:38:55:65:104:116:200", "test.fasta","","sample")\r\n+  #arg = c(1,1,5,5,1,1,"1:26:38:55:65:104:116", "/home/mu37/Wu/Wu_Cloned_gapped_sequences_D-masked.fasta","/home/mu37/Wu/","Wu")\r\n+  testID <- as.numeric(arg[1])                    # 1 = Focused, 2 = Local\r\n+  species <- as.numeric(arg[2])                   # 1 = Human. 
2 = Mouse\r\n+  substitutionModel <- as.numeric(arg[3])         # 0 = Uniform substitution, 1 = Smith DS et al. 1996, 5 = FiveS\r\n+  mutabilityModel <- as.numeric(arg[4])           # 0 = Uniform mutablity, 1 = Tri-nucleotide (Shapiro GS et al. 2002)  , 5 = FiveS\r\n+  clonal <- as.numeric(arg[5])                    # 0 = Independent sequences, 1 = Clonally related, 2 = Clonally related & only non-terminal mutations\r\n+  fixIndels <- as.numeric(arg[6])                 # 0 = Do nothing, 1 = Try and fix Indels\r\n+  region <- as.numeric(strsplit(arg[7],":")[[1]]) # StartPos:LastNucleotideF1:C1:F2:C2:F3:C3\r\n+  inputFilePath <- arg[8]                         # Full path to input file\r\n+  outputPath <- arg[9]                            # Full path to location of output files\r\n+  outputID <- arg[10]                             # ID for session output  \r\n+  \r\n+\r\n+  if(testID==5){\r\n+    traitChangeModel <- 1\r\n+    if( !is.na(any(arg[11])) ) traitChangeModel <- as.numeric(arg[11])    # 1 <- Chothia 1998\r\n+    initializeTraitChange(traitChangeModel)    \r\n+  }\r\n+  \r\n+# Initialize other parameters/variables\r\n+    \r\n+  # Initialzie the codon table ( definitions of R/S )\r\n+  computeCodonTable(testID) \r\n+\r\n+  # Initialize   \r\n+  # Test Name\r\n+  testName<-"Focused"\r\n+  if(testID==2) testName<-"Local"\r\n+  if(testID==3) testName<-"Imbalanced"    \r\n+  if(testID==4) testName<-"ImbalancedSilent"    \r\n+    \r\n+  # Indel placeholders initialization\r\n+  indelPos <- NULL\r\n+  delPos <- NULL\r\n+  insPos <- NULL\r\n+\r\n+  # Initialize in Tranistion & Mutability matrixes\r\n+  substitution <- initializeSubstitutionMatrix(substitutionModel,species)\r\n+  mutability <- initializeMutabilityMatrix(mutabilityModel,species)\r\n+  \r\n+  # FWR/CDR boundaries\r\n+  flagTrim <- F\r\n+  if( is.na(region[7])){\r\n+    flagTrim <- T\r\n+    region[7]<-region[6]\r\n+  }\r\n+  readStart = min(region,na.rm=T)\r\n+  readEnd = max(region,na.rm=T)\r\n+  
if(readStart>1){\r\n+    region = region - (readStart - 1)\r\n+  }\r\n+  region_Nuc = c( (region[1]*3-2) , (region[2:7]*3) )\r\n+  region_Cod = region\r\n+  \r\n+  readStart = (readStart*3)-2\r\n+  readEnd = (readEnd*3)\r\n+    \r\n+    FWR_Nuc <- c( rep(TRUE,(region_Nuc[2])),\r\n+                  '..b'umb]] = list("CDR"=bayesPDF_groups_cdr[[G]],"FWR"=bayesPDF_groups_fwr[[G]])\r\n+      names(listPDFs)[rowNumb] = names(groups[groups==paste(G)])[1]\r\n+      #if(names(groups)[which(groups==G)[1]]!="All sequences combined"){\r\n+      gs = unique(germlines[groups==G])\r\n+      rowNumb = rowNumb+1\r\n+      if( !is.na(gs) ){\r\n+        for( g in gs ){\r\n+          matOutput[rowNumb,c(1,2,11:18)] = c("Germline",names(germlines)[germlines==g][1],bayes_germlines_cdr[g,],bayes_germlines_fwr[g,],simgaP_germlines_cdr[g],simgaP_germlines_fwr[g])\r\n+          listPDFs[[rowNumb]] = list("CDR"=bayesPDF_germlines_cdr[[g]],"FWR"=bayesPDF_germlines_fwr[[g]])\r\n+          names(listPDFs)[rowNumb] = names(germlines[germlines==paste(g)])[1]\r\n+          rowNumb = rowNumb+1\r\n+          indexesOfInterest = which(germlines==g)\r\n+          numbSeqsOfInterest =  length(indexesOfInterest)\r\n+          rowNumb = seq(rowNumb,rowNumb+(numbSeqsOfInterest-1))\r\n+          matOutput[rowNumb,] = matrix(   c(  rep("Sequence",numbSeqsOfInterest),\r\n+                                              rownames(matInput)[indexesOfInterest],\r\n+                                              c(matMutationInfo[indexesOfInterest,1:4]),\r\n+                                              c(matMutationInfo[indexesOfInterest,5:8]),\r\n+                                              c(bayes_cdr[indexesOfInterest,]),\r\n+                                              c(bayes_fwr[indexesOfInterest,]),\r\n+                                              c(simgaP_cdr[indexesOfInterest]),\r\n+                                              c(simgaP_fwr[indexesOfInterest])                                           
   \r\n+          ), ncol=18, nrow=numbSeqsOfInterest,byrow=F)\r\n+          increment=0\r\n+          for( ioi in indexesOfInterest){\r\n+            listPDFs[[min(rowNumb)+increment]] =  list("CDR"=bayesPDF_cdr[[ioi]] , "FWR"=bayesPDF_fwr[[ioi]])\r\n+            names(listPDFs)[min(rowNumb)+increment] = rownames(matInput)[ioi]\r\n+            increment = increment + 1\r\n+          }\r\n+          rowNumb=max(rowNumb)+1\r\n+\r\n+        }\r\n+      }\r\n+    }\r\n+    colsToFormat = 11:18\r\n+    matOutput[,colsToFormat] = formatC(  matrix(as.numeric(matOutput[,colsToFormat]), nrow=nrow(matOutput), ncol=length(colsToFormat)) ,  digits=3)\r\n+    matOutput[matOutput== " NaN"] = NA\r\n+    \r\n+    \r\n+    \r\n+    colnames(matOutput) = c("Type", "ID", "Observed_CDR_R", "Observed_CDR_S", "Observed_FWR_R", "Observed_FWR_S",\r\n+                            "Expected_CDR_R", "Expected_CDR_S", "Expected_FWR_R", "Expected_FWR_S",\r\n+                            paste( rep(testName,6), rep(c("Sigma","CIlower","CIupper"),2),rep(c("CDR","FWR"),each=3), sep="_"),\r\n+                            paste( rep(testName,2), rep("P",2),c("CDR","FWR"), sep="_")\r\n+    )\r\n+    fileName = paste(outputPath,outputID,".txt",sep="")\r\n+    write.table(matOutput,file=fileName,quote=F,sep="\\t",row.names=T,col.names=NA)\r\n+    fileName = paste(outputPath,outputID,".RData",sep="")\r\n+    save(listPDFs,file=fileName)\r\n+\r\n+indelWarning = FALSE\r\n+if(sum(indelPos)>0){\r\n+  indelWarning = "<P>Warning: The following sequences have either gaps and/or deletions, and have been ommited from the analysis.";\r\n+  indelWarning = paste( indelWarning , "<UL>", sep="" )\r\n+  for(indels in names(indelPos)[indelPos]){\r\n+    indelWarning = paste( indelWarning , "<LI>", indels, "</LI>", sep="" )\r\n+  }\r\n+  indelWarning = paste( indelWarning , "</UL></P>", sep="" )\r\n+}\r\n+\r\n+cloneWarning = FALSE\r\n+if(clonal==1){\r\n+  if(sum(matInputErrors)>0){\r\n+    cloneWarning = "<P>Warning: The 
following clones have sequences of unequal length.";\r\n+    cloneWarning = paste( cloneWarning , "<UL>", sep="" )\r\n+    for(clone in names(matInputErrors)[matInputErrors]){\r\n+      cloneWarning = paste( cloneWarning , "<LI>", names(germlines)[as.numeric(clone)], "</LI>", sep="" )\r\n+    }\r\n+    cloneWarning = paste( cloneWarning , "</UL></P>", sep="" )\r\n+  }\r\n+}\r\n+cat(paste("Success",outputID,indelWarning,cloneWarning,sep="|"))\r\n'
b
diff -r 000000000000 -r 8a5a2abbb870 baseline/FiveS_Mutability.RData
b
Binary file baseline/FiveS_Mutability.RData has changed
b
diff -r 000000000000 -r 8a5a2abbb870 baseline/FiveS_Substitution.RData
b
Binary file baseline/FiveS_Substitution.RData has changed
b
diff -r 000000000000 -r 8a5a2abbb870 baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa Mon Aug 29 05:36:10 2016 -0400
b
b'@@ -0,0 +1,703 @@\n+>IGHV1-18*01\n+caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga\n+>IGHV1-18*02\n+caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctaagatctgacgacacggcc\n+>IGHV1-18*03\n+caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacatggccgtgtattactgtgcgagaga\n+>IGHV1-18*04\n+caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctacggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga\n+>IGHV1-2*01\n+caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccagtaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga\n+>IGHV1-2*02\n+caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgaca
cggccgtgtattactgtgcgagaga\n+>IGHV1-2*03\n+caggtgcagctggtgcagtctggggct...gaggtgaagaagcttggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcnacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga\n+>IGHV1-2*04\n+caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggctgggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga\n+>IGHV1-2*05\n+caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga\n+>IGHV1-24*01\n+caggtccagctggtacagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggtttccggatacaccctc............actgaattatccatgcactgggtgcgacaggctcctggaaaagggcttgagtggatgggaggttttgatcctgaa......gatggtgaaacaatctacgcacagaagttccag...ggcagagtcaccatgaccgaggacacatctacagacacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcaacaga\n+>IGHV1-3*01\n+caggtccagcttgtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggatcaacgctggc......aatggtaacacaaaatattcacagaagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaagacacggctgtgtattactgtgcgagaga\n+>IGHV1-3*02\n+caggttcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggagcaacgctggc......aatggtaacacaaaatattcacaggagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagct
gagcagcctgagatctgaggacatg'..b'aggtgcagctgttgcagtctgcagca...gaggtgaaaagacccggggagtctctgaggatctcctgtaagacttctggatacagcttt............accagctactggatccactgggtgcgccagatgcccgggaaagaactggagtggatggggagcatctatcctggg......aactctgataccagatacagcccatccttccaa...ggccacgtcaccatctcagccgacagctccagcagcaccgcctacctgcagtggagcagcctgaaggcctcggacgccgccatgtattattgtgtgaga\n+>IGHV6-1*01\n+caggtacagctgcagcagtcaggtcca...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga\n+>IGHV6-1*02\n+caggtacagctgcagcagtcaggtccg...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga\n+>IGHV7-34-1*01\n+...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......actgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta\n+>IGHV7-34-1*02\n+...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......aatgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta\n+>IGHV7-4-1*01\n+caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatctgcagcc
taaaggctgaggacactgccgtgtattactgtgcgaga\n+>IGHV7-4-1*02\n+caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga\n+>IGHV7-4-1*03\n+caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcacgctaaaggctgaggacactg\n+>IGHV7-4-1*04\n+caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga\n+>IGHV7-4-1*05\n+caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtgttactgtgcgagaga\n+>AIGHV7-40*03|\n+ttttcaatagaaaagtcaaataatcta...agtgtcaatcagtggatgattagataaaatatgatatatgtaaatcatggaatactatgc............agccagtatggtatgaattcagtgtgaccagcccctggacaagggcttgagtggatgggatggatcatcacctac......actgggaacccaacatataccaacggcttcaca...ggacggtttctattctccatggacacctctgtcagcatggcgtatctgcagatcagcagcctaaaggctgaggacacggccgtgtatgactgtatgagaga\n+>IGHV7-81*01\n+caggtgcagctggtgcagtctggccat...gaggtgaagcagcctggggcctcagtgaaggtctcctgcaaggcttctggttacagtttc............accacctatggtatgaattgggtgccacaggcccctggacaagggcttgagtggatgggatggttcaacacctac......actgggaacccaacatatgcccagggcttcaca...ggacggtttgtcttctccatggacacctctgccagcacagcatacctgcag
atcagcagcctaaaggctgaggacatggccatgtattactgtgcgagata\n'
b
diff -r 000000000000 -r 8a5a2abbb870 baseline/comparePDFs.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/comparePDFs.r Mon Aug 29 05:36:10 2016 -0400
[
b'@@ -0,0 +1,225 @@\n+options("warn"=-1)\r\n+\r\n+#from http://selection.med.yale.edu/baseline/Archive/Baseline%20Version%201.3/Baseline_Functions_Version1.3.r\r\n+# Compute p-value of two distributions\r\n+compareTwoDistsFaster <-function(sigma_S=seq(-20,20,length.out=4001), N=10000, dens1=runif(4001,0,1), dens2=runif(4001,0,1)){\r\n+#print(c(length(dens1),length(dens2)))\r\n+if(length(dens1)>1 & length(dens2)>1 ){\r\n+\tdens1<-dens1/sum(dens1)\r\n+\tdens2<-dens2/sum(dens2)\r\n+\tcum2 <- cumsum(dens2)-dens2/2\r\n+\ttmp<- sum(sapply(1:length(dens1),function(i)return(dens1[i]*cum2[i])))\r\n+\t#print(tmp)\r\n+\tif(tmp>0.5)tmp<-tmp-1\r\n+\treturn( tmp )\r\n+\t}\r\n+\telse {\r\n+\treturn(NA)\r\n+\t}\r\n+\t#return (sum(sapply(1:N,function(i)(sample(sigma_S,1,prob=dens1)>sample(sigma_S,1,prob=dens2))))/N)\r\n+}  \r\n+\r\n+\r\n+require("grid")\r\n+arg <- commandArgs(TRUE)\r\n+#arg <- c("300143","4","5")\r\n+arg[!arg=="clonal"]\r\n+input <- arg[1]\r\n+output <- arg[2]\r\n+rowIDs <- as.numeric(  sapply(arg[3:(max(3,length(arg)))],function(x){ gsub("chkbx","",x) } )  )\r\n+\r\n+numbSeqs = length(rowIDs)\r\n+\r\n+if ( is.na(rowIDs[1]) | numbSeqs>10 ) {\r\n+  stop( paste("Error: Please select between one and 10 seqeunces to compare.") )\r\n+}\r\n+\r\n+#load( paste("output/",sessionID,".RData",sep="") )\r\n+load( input )\r\n+#input\r\n+\r\n+xMarks = seq(-20,20,length.out=4001)\r\n+\r\n+plot_grid_s<-function(pdf1,pdf2,Sample=100,cex=1,xlim=NULL,xMarks = seq(-20,20,length.out=4001)){\r\n+  yMax = max(c(abs(as.numeric(unlist(listPDFs[pdf1]))),abs(as.numeric(unlist(listPDFs[pdf2]))),0),na.rm=T) * 1.1\r\n+\r\n+  if(length(xlim==2)){\r\n+    xMin=xlim[1]\r\n+    xMax=xlim[2]\r\n+  } else {\r\n+    xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]\r\n+    xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]\r\n+    xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]\r\n+    xMax_FWR = 
xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]\r\n+  \r\n+    xMin_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][1]\r\n+    xMin_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][1]\r\n+    xMax_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001])]\r\n+    xMax_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001])]\r\n+  \r\n+    xMin=min(c(xMin_CDR,xMin_FWR,xMin_CDR2,xMin_FWR2,0),na.rm=TRUE)\r\n+    xMax=max(c(xMax_CDR,xMax_FWR,xMax_CDR2,xMax_FWR2,0),na.rm=TRUE)\r\n+  }\r\n+\r\n+  sigma<-approx(xMarks,xout=seq(xMin,xMax,length.out=Sample))$x\r\n+  grid.rect(gp = gpar(col=gray(0.6),fill="white",cex=cex))\r\n+  x <- sigma\r\n+  pushViewport(viewport(x=0.175,y=0.175,width=0.825,height=0.825,just=c("left","bottom"),default.units="npc"))\r\n+  #pushViewport(plotViewport(c(1.8, 1.8, 0.25, 0.25)*cex))\r\n+  pushViewport(dataViewport(x, c(yMax,-yMax),gp = gpar(cex=cex),extension=c(0.05)))\r\n+  grid.polygon(c(0,0,1,1),c(0,0.5,0.5,0),gp=gpar(col=grey(0.95),fill=grey(0.95)),default.units="npc")\r\n+  grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.9),fill=grey(0.9)),default.units="npc")\r\n+  grid.rect()\r\n+  grid.xaxis(gp = gpar(cex=cex/1.1))\r\n+  yticks = pretty(c(-yMax,yMax),8)\r\n+  yticks = yticks[yticks>(-yMax) & yticks<(yMax)]\r\n+  grid.yaxis(at=yticks,label=abs(yticks),gp = gpar(cex=cex/1.1))\r\n+  if(length(listPDFs[pdf1][[1]][["CDR"]])>1){\r\n+    ycdr<-approx(xMarks,listPDFs[pdf1][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y\r\n+    grid.lines(unit(x,"native"), unit(ycdr,"native"),gp=gpar(col=2,lwd=2))\r\n+  }\r\n+  if(length(listPDFs[pdf1][[1]][["FWR"]])>1){\r\n+    yfwr<-approx(xMarks,listPDFs[pdf1][[1]][["FWR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y\r\n+    grid.lines(unit(x,"native"), unit(-yfwr,"native"),gp=gpar(col=4,lwd=2))\r\n+   }\r\n+\r\n+  
if(length(listPDFs[pdf2][[1]][["CDR"]])>1){\r\n+    ycdr2<-approx(xMarks,listPDFs[pdf2][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y\r\n+    grid.lines(unit(x,"native"), unit(ycdr2,"native"),gp=gpar(col=2,lwd'..b'npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))\r\n+    grid.text(formatC(as.numeric(pCDR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))\r\n+    grid.text(formatC(as.numeric(pFWR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.25, "npc"),just=c("center", "center"),gp = gpar(cex=cex))\r\n+    \r\n+           \r\n+ #   grid.text(paste("P = ",formatC(pCDRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.98, "npc"),just=c("center", "top"),gp = gpar(cex=cex))\r\n+ #   grid.text(paste("P = ",formatC(pFWRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.02, "npc"),just=c("center", "bottom"),gp = gpar(cex=cex))\r\n+  }\r\n+  else{\r\n+  }\r\n+}\r\n+\r\n+\r\n+##################################################################################\r\n+################## The whole OCD\'s matrix ########################################\r\n+##################################################################################\r\n+\r\n+#pdf(width=4*numbSeqs+1/3,height=4*numbSeqs+1/3)\r\n+pdf( output ,width=4*numbSeqs+1/3,height=4*numbSeqs+1/3) \r\n+\r\n+pushViewport(viewport(x=0.02,y=0.02,just = c("left", "bottom"),w =0.96,height=0.96,layout = grid.layout(numbSeqs+1,numbSeqs+1,widths=unit.c(unit(rep(1,numbSeqs),"null"),unit(4,"lines")),heights=unit.c(unit(4,"lines"),unit(rep(1,numbSeqs),"null")))))\r\n+\r\n+for( seqOne in 1:numbSeqs+1){\r\n+  pushViewport(viewport(layout.pos.col = seqOne-1, layout.pos.row = 1))\r\n+  if(seqOne>2){ \r\n+    grid.polygon(c(0,0,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")\r\n+    grid.polygon(c(1,1,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")\r\n+    
grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.5)),default.units="npc")\r\n+       \r\n+    grid.text(y=.25,x=0.75,"FWR",gp = gpar(cex=1.5),just="center")\r\n+    grid.text(y=.25,x=0.25,"CDR",gp = gpar(cex=1.5),just="center")\r\n+  }\r\n+  grid.rect(gp = gpar(col=grey(0.9)))\r\n+  grid.text(y=.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),just="center")\r\n+  popViewport(1)\r\n+}\r\n+\r\n+for( seqOne in 1:numbSeqs+1){\r\n+  pushViewport(viewport(layout.pos.row = seqOne, layout.pos.col = numbSeqs+1))\r\n+  if(seqOne<=numbSeqs){   \r\n+    grid.polygon(c(0,0.5,0.5,0),c(0,0,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")\r\n+    grid.polygon(c(0,0.5,0.5,0),c(1,1,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")\r\n+    grid.polygon(c(1,0.5,0.5,1),c(0,0,1,1),gp=gpar(col=grey(0.5)),default.units="npc")\r\n+    grid.text(x=.25,y=0.75,"CDR",gp = gpar(cex=1.5),just="center",rot=270)\r\n+    grid.text(x=.25,y=0.25,"FWR",gp = gpar(cex=1.5),just="center",rot=270)\r\n+  }\r\n+  grid.rect(gp = gpar(col=grey(0.9)))\r\n+  grid.text(x=0.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),rot=270,just="center")\r\n+  popViewport(1)\r\n+}\r\n+\r\n+for( seqOne in 1:numbSeqs+1){\r\n+  for(seqTwo in 1:numbSeqs+1){\r\n+    pushViewport(viewport(layout.pos.col = seqTwo-1, layout.pos.row = seqOne))\r\n+    if(seqTwo>seqOne){\r\n+      plot_pvals(rowIDs[seqOne-1],rowIDs[seqTwo-1],cex=2)\r\n+      grid.rect()\r\n+    }    \r\n+    popViewport(1)\r\n+  }\r\n+}\r\n+   \r\n+\r\n+xMin=0\r\n+xMax=0.01\r\n+for(pdf1 in rowIDs){\r\n+  xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]\r\n+  xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]\r\n+  xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]\r\n+  xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]\r\n+  
xMin=min(c(xMin_CDR,xMin_FWR,xMin),na.rm=TRUE)\r\n+  xMax=max(c(xMax_CDR,xMax_FWR,xMax),na.rm=TRUE)\r\n+}\r\n+\r\n+\r\n+\r\n+for(i in 1:numbSeqs+1){\r\n+  for(j in (i-1):numbSeqs){    \r\n+    pushViewport(viewport(layout.pos.col = i-1, layout.pos.row = j+1))\r\n+    grid.rect()\r\n+    plot_grid_s(rowIDs[i-1],rowIDs[j],cex=1)\r\n+    popViewport(1)\r\n+  }\r\n+}\r\n+\r\n+dev.off() \r\n+\r\n+cat("Success", paste(rowIDs,collapse="_"),sep=":")\r\n+\r\n'
b
diff -r 000000000000 -r 8a5a2abbb870 baseline/filter.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/filter.r Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,35 @@
# Filter IMGT output for Baseline analysis: attach AA.JUNCTION from the
# summary table to the gapped-nucleotide table, derive clean V/D/J gene
# names, de-duplicate on a user-selected column set, and drop sequences
# without a usable functionality call.
arg <- commandArgs(TRUE)
summaryfile <- arg[1]
gappedfile <- arg[2]
selection <- arg[3]
output <- arg[4]
print(paste("selection = ", selection))


summarydat <- read.table(summaryfile, header = TRUE, sep = "\t", fill = TRUE, stringsAsFactors = FALSE)
gappeddat <- read.table(gappedfile, header = TRUE, sep = "\t", fill = TRUE, stringsAsFactors = FALSE)

#dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))

# Row order is assumed to match between the two IMGT files, so a plain
# cbind is used instead of a keyed merge.
dat <- cbind(gappeddat, summarydat$AA.JUNCTION)
colnames(dat)[length(dat)] <- "AA.JUNCTION"

# Strip the species prefix ("Homsap ") and the allele suffix ("*NN...")
# from an IMGT gene annotation, leaving the bare gene name.
clean.gene <- function(annotation) {
  gsub("[*].*", "", gsub("^Homsap ", "", annotation))
}

dat$VGene <- clean.gene(dat$V.GENE.and.allele)
dat$DGene <- clean.gene(dat$D.GENE.and.allele)
dat$JGene <- clean.gene(dat$J.GENE.and.allele)

# Build the de-duplication key by pasting the user-selected columns.
dat$past <- do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":"))

dat <- dat[!duplicated(dat$past), ]

dat <- dat[dat$Functionality != "No results" & dat$Functionality != "unproductive", ]

write.table(x = dat, file = output, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
b
diff -r 000000000000 -r 8a5a2abbb870 baseline/script_imgt.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/script_imgt.py Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,79 @@
+#import xlrd #avoid dep
+import argparse
+import re
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+parser.add_argument("--ref", help="Reference file")
+parser.add_argument("--output", help="Output file")
+parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
+
+args = parser.parse_args()
+
+refdic = dict()
+with open(args.ref, 'r') as ref:
+ currentSeq = ""
+ currentId = ""
+ for line in ref:
+ if line[0] is ">":
+ if currentSeq is not "" and currentId is not "":
+ refdic[currentId[1:]] = currentSeq
+ currentId = line.rstrip()
+ currentSeq = ""
+ else:
+ currentSeq += line.rstrip()
+ refdic[currentId[1:]] = currentSeq
+
+
# Regex matching an annotated IGHV gene/allele, e.g. "IGHV3-23*01".
# Patterns for other receptors (TRBV, IGKV, IGLV, TRAV, TRGV, TRDV)
# existed here historically but are currently disabled; the single
# remaining pattern makes the old list-and-join construction redundant.
vPattern = re.compile(r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})")
+
def filterGene(s, pattern):
    """Return the first substring of `s` matching `pattern`, or None.

    Non-string input (e.g. a missing/float cell value) also yields None.
    The exact `type(s) is not str` check is kept deliberately: it also
    rejects str subclasses, matching the original behavior.
    """
    if type(s) is not str:
        return None
    hit = pattern.search(s)
    return hit.group(0) if hit else None
+
+
+
currentSeq = ""
currentId = ""
first = True
with open(args.input, 'r') as i:
    with open(args.output, 'a') as o:
        # Each invocation appends one ">>>id" group to the shared output.
        o.write(">>>" + args.id + "\n")
        outputdic = dict()
        for line in i:
            if first:
                # Skip the header line of the tab-separated input.
                first = False
                continue
            linesplt = line.split("\t")
            ref = filterGene(linesplt[1], vPattern)
            # Skip rows without a recognizable gene or with an empty sequence.
            if not ref or not linesplt[2].rstrip():
                continue
            # Group (sequence id, sequence) pairs by matched reference gene;
            # setdefault replaces the duplicated if/else append branches.
            outputdic.setdefault(ref, []).append(
                (linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip()))

        for k in outputdic.keys():
            if k in refdic:
                o.write(">>" + k + "\n")
                o.write(refdic[k] + "\n")
                for seq in outputdic[k]:
                    o.write(">" + seq[0] + "\n")
                    o.write(seq[1] + "\n")
            else:
                # Parenthesized single-argument print is valid under both
                # Python 2 and Python 3 (original was a py2-only statement).
                print(k + " not in reference, skipping " + k)
b
diff -r 000000000000 -r 8a5a2abbb870 baseline/script_xlsx.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/script_xlsx.py Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,58 @@
import xlrd
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
parser.add_argument("--ref", help="Reference file")
parser.add_argument("--output", help="Output file")

args = parser.parse_args()

# 0-based column indices. NOTE(review): the help text says the sequence is
# in column J, but index 8 is column I -- confirm against real input files.
gene_column = 6
id_column = 7
seq_column = 8
LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]


# Parse the FASTA reference file into {id (without '>'): sequence}.
refdic = dict()
with open(args.ref, 'r') as ref:
    currentSeq = ""
    currentId = ""
    for line in ref:
        if line.startswith(">"):
            # Compare by value: the original used 'is'/'is not' on strings,
            # which only worked through CPython small-string interning.
            if currentSeq != "" and currentId != "":
                refdic[currentId[1:]] = currentSeq
            currentId = line.rstrip()
            currentSeq = ""
        else:
            currentSeq += line.rstrip()
    # Flush the final record, guarded like the in-loop flush: the original
    # unconditionally inserted refdic[""] = "" for an empty reference file.
    if currentSeq != "" and currentId != "":
        refdic[currentId[1:]] = currentSeq
+
currentSeq = ""
currentId = ""
# NOTE(review): older xlrd releases do not document Book as a context
# manager; if this 'with' raises at runtime, switch to
# wb = xlrd.open_workbook(args.input) without the 'with'.
with xlrd.open_workbook(args.input, 'r') as wb:
    with open(args.output, 'a') as o:
        for sheet in wb.sheets():
            # Skip sheets whose gene column does not contain IGHV annotations.
            if sheet.cell(1, gene_column).value.find("IGHV") < 0:
                # Parenthesized single-argument print works on Python 2 and 3
                # (original was a py2-only print statement).
                print("Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name)
                continue
            o.write(">>>" + sheet.name + "\n")
            outputdic = dict()
            for rowindex in range(1, sheet.nrows):
                ref = sheet.cell(rowindex, gene_column).value.replace(">", "")
                # Group (sequence id, sequence) pairs by reference gene;
                # setdefault replaces the duplicated if/else append branches.
                outputdic.setdefault(ref, []).append(
                    (sheet.cell(rowindex, id_column).value.replace(">", ""),
                     sheet.cell(rowindex, seq_column).value))

            for k in outputdic.keys():
                if k in refdic:
                    o.write(">>" + k + "\n")
                    o.write(refdic[k] + "\n")
                    for seq in outputdic[k]:
                        o.write(">" + seq[0] + "\n")
                        o.write(seq[1] + "\n")
                else:
                    print(k + " not in reference, skipping " + k)
b
diff -r 000000000000 -r 8a5a2abbb870 baseline/wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/wrapper.sh Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,104 @@
#!/bin/bash
# Wrapper for the BASELINe selection-pressure pipeline: unpacks IMGT
# archives (zip or .txz) or reads xlsx input, filters sequences, builds
# a combined FASTA, runs Baseline_Main.r, and renders the comparison PDF.
#
# Fixes vs. original: all path/variable expansions quoted (paths with
# spaces), backticks replaced with $(...), useless 'cat | cut' removed,
# debug echoes referencing the undefined $input dropped, "ZX" typo fixed.
dir="$(cd "$(dirname "$0")" && pwd)"

testID=$1
species=$2
substitutionModel=$3
mutabilityModel=$4
clonal=$5
fixIndels=$6
region=$7
inputs=$8
inputs=($inputs)   # intentional word-splitting into an array
IDs=$9
IDs=($IDs)         # intentional word-splitting into an array
ref=${10}
output=${11}
selection=${12}
output_table=${13}
outID="result"

echo "$PWD"

echo "testID = $testID"
echo "species = $species"
echo "substitutionModel = $substitutionModel"
echo "mutabilityModel = $mutabilityModel"
echo "clonal = $clonal"
echo "fixIndels = $fixIndels"
echo "region = $region"
echo "inputs = ${inputs[@]}"
echo "IDs = ${IDs[@]}"
echo "ref = $ref"
echo "output = $output"
echo "outID = $outID"

fasta="$PWD/baseline.fasta"


count=0
for current in "${inputs[@]}"
do
	f=$(file "$current")
	zipType="Zip archive"
	if [[ "$f" == *"$zipType"* ]] || [[ "$f" == *"XZ compressed data"* ]]
	then
		id=${IDs[$count]}
		echo "id=$id"
		if [[ "$f" == *"Zip archive"* ]] ; then
			echo "Zip archive"
			unzip "$current" -d "$PWD/$id/"
		elif [[ "$f" == *"XZ compressed data"* ]] ; then
			echo "XZ archive"
			mkdir -p "$PWD/$id/files"
			tar -xJf "$current" -C "$PWD/$id/files/"
		fi
		summaryfile="$PWD/summary_${id}.txt"
		gappedfile="$PWD/gappednt_${id}.txt"
		filtered="$PWD/filtered_${id}.txt"
		# 'ls -l' emits a "total" header line, so 2 lines == one entry
		# (i.e. the archive unpacked into a single subdirectory).
		filecount=$(ls -l "$PWD/$id/" | wc -l)
		if [[ "$filecount" -eq "2" ]]
		then
			cat "$PWD/$id"/*/1_* > "$summaryfile"
			cat "$PWD/$id"/*/2_* > "$gappedfile"
		else
			cat "$PWD/$id"/1_* > "$summaryfile"
			cat "$PWD/$id"/2_* > "$gappedfile"
		fi
		Rscript "$dir/filter.r" "$summaryfile" "$gappedfile" "$selection" "$filtered" 2>&1

		final="$PWD/final_${id}.txt"
		cut -f2,4,7 "$filtered" > "$final"
		python "$dir/script_imgt.py" --input "$final" --ref "$ref" --output "$fasta" --id "$id"
	else
		python "$dir/script_xlsx.py" --input "$current" --ref "$ref" --output "$fasta"
	fi
	count=$((count+1))
done

# Only the ">>>" header means no sequences survived filtering.
if [[ $(wc -l < "$fasta") -eq "1" ]]; then
	echo "No sequences in the fasta file, exiting"
	exit 0
fi

workdir="$PWD"
cd "$dir"
echo "file: ${inputs[0]}"
Rscript --verbose "$dir/Baseline_Main.r" "$testID" "$species" "$substitutionModel" "$mutabilityModel" "$clonal" "$fixIndels" "$region" "$fasta" "$workdir/" "$outID" 2>&1

echo "$workdir/${outID}.txt"

# Line numbers of per-group result rows in the Baseline output table,
# passed on to comparePDFs.r as the rows to plot.
rows=$(tail -n +2 "$workdir/${outID}.txt" | grep -v "All sequences combined" | grep -n 'Group' | grep -Eoh '^[0-9]+' | tr '\n' ' ')
rows=($rows)

cd "$dir"
Rscript --verbose "$dir/comparePDFs.r" "$workdir/${outID}.RData" "$output" "${rows[@]}" 2>&1
cp "$workdir/result.txt" "${output_table}"
+
+
+
+
b
diff -r 000000000000 -r 8a5a2abbb870 change_o/DefineClones.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/DefineClones.py Mon Aug 29 05:36:10 2016 -0400
[
b'@@ -0,0 +1,1052 @@\n+#!/usr/bin/env python3\n+"""\n+Assign Ig sequences into clones\n+"""\n+# Info\n+__author__ = \'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman\'\n+from changeo import __version__, __date__\n+\n+# Imports\n+import os\n+import re\n+import sys\n+import numpy as np\n+from argparse import ArgumentParser\n+from collections import OrderedDict\n+from itertools import chain\n+from textwrap import dedent\n+from time import time\n+from Bio import pairwise2\n+from Bio.Seq import translate\n+\n+# Presto and changeo imports\n+from presto.Defaults import default_out_args\n+from presto.IO import getFileType, getOutputHandle, printLog, printProgress\n+from presto.Multiprocessing import manageProcesses\n+from presto.Sequence import getDNAScoreDict\n+from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs\n+from changeo.Distance import getDNADistMatrix, getAADistMatrix, \\\n+                             hs1f_model, m1n_model, hs5f_model, \\\n+                             calcDistances, formClusters\n+from changeo.IO import getDbWriter, readDbFile, countDbFile\n+from changeo.Multiprocessing import DbData, DbResult\n+\n+# Defaults\n+default_translate = False\n+default_distance = 0.0\n+default_bygroup_model = \'hs1f\'\n+default_hclust_model = \'chen2010\'\n+default_seq_field = \'JUNCTION\'\n+default_norm = \'len\'\n+default_sym = \'avg\'\n+default_linkage = \'single\'\n+\n+# TODO:  should be in Distance, but need to be after function definitions\n+# Amino acid Hamming distance\n+aa_model = getAADistMatrix(mask_dist=1, gap_dist=0)\n+\n+# DNA Hamming distance\n+ham_model = getDNADistMatrix(mask_dist=0, gap_dist=0)\n+\n+\n+# TODO:  this function is an abstraction to facilitate later cleanup\n+def getModelMatrix(model):\n+    """\n+    Simple wrapper to get distance matrix from model name\n+\n+    Arguments:\n+    model = model name\n+\n+    Return:\n+    a pandas.DataFrame containing the character distance 
matrix\n+    """\n+    if model == \'aa\':\n+        return(aa_model)\n+    elif model == \'ham\':\n+        return(ham_model)\n+    elif model == \'m1n\':\n+        return(m1n_model)\n+    elif model == \'hs1f\':\n+        return(hs1f_model)\n+    elif model == \'hs5f\':\n+        return(hs5f_model)\n+    else:\n+        sys.stderr.write(\'Unrecognized distance model: %s.\\n\' % model)\n+\n+\n+def indexJunctions(db_iter, fields=None, mode=\'gene\', action=\'first\'):\n+    """\n+    Identifies preclonal groups by V, J and junction length\n+\n+    Arguments: \n+    db_iter = an iterator of IgRecords defined by readDbFile\n+    fields = additional annotation fields to use to group preclones;\n+             if None use only V, J and junction length\n+    mode = specificity of alignment call to use for assigning preclones;\n+           one of (\'allele\', \'gene\')\n+    action = how to handle multiple value fields when assigning preclones;\n+             one of (\'first\', \'set\')\n+    \n+    Returns: \n+    a dictionary of {(V, J, junction length):[IgRecords]}\n+    """\n+    # Define functions for grouping keys\n+    if mode == \'allele\' and fields is None:\n+        def _get_key(rec, act):\n+            return (rec.getVAllele(act), rec.getJAllele(act),\n+                    None if rec.junction is None else len(rec.junction))\n+    elif mode == \'gene\' and fields is None:\n+        def _get_key(rec, act):  \n+            return (rec.getVGene(act), rec.getJGene(act),\n+                    None if rec.junction is None else len(rec.junction))\n+    elif mode == \'allele\' and fields is not None:\n+        def _get_key(rec, act):\n+            vdj = [rec.getVAllele(act), rec.getJAllele(act),\n+                    None if rec.junction is None else len(rec.junction)]\n+            ann = [rec.toDict().get(k, None) for k in fields]\n+            return tuple(chain(vdj, ann))\n+    elif mode == \'gene\' and fields is not None:\n+        def _get_key(rec, act):\n+       
     vdj = [rec.getVGene(act), rec.getJGene(act),\n+                    None if rec.junction is None else len(rec.junction)]\n+            ann = [rec.toDict().get(k, None'..b'\n+    parser_bygroup.add_argument(\'--sf\', action=\'store\', dest=\'seq_field\',\n+                                default=default_seq_field,\n+                                help=\'\'\'The name of the field to be used to calculate\n+                                     distance between records\'\'\')\n+    parser_bygroup.set_defaults(feed_func=feedQueue)\n+    parser_bygroup.set_defaults(work_func=processQueue)\n+    parser_bygroup.set_defaults(collect_func=collectQueue)  \n+    parser_bygroup.set_defaults(group_func=indexJunctions)  \n+    parser_bygroup.set_defaults(clone_func=distanceClones)\n+    \n+    \n+    # Hierarchical clustering cloning method\n+    parser_hclust = subparsers.add_parser(\'hclust\', parents=[parser_parent],\n+                                        formatter_class=CommonHelpFormatter,\n+                                        help=\'Defines clones by specified distance metric on CDR3s and \\\n+                                              cutting of hierarchical clustering tree\')\n+#     parser_hclust.add_argument(\'-f\', nargs=\'+\', action=\'store\', dest=\'fields\', default=None,\n+#                              help=\'Fields to use for grouping clones (non VDJ)\')\n+    parser_hclust.add_argument(\'--method\', action=\'store\', dest=\'method\', \n+                             choices=(\'chen2010\', \'ademokun2011\'), default=default_hclust_model, \n+                             help=\'Specifies which cloning method to use for calculating distance \\\n+                                   between CDR3s, computing linkage, and cutting clusters\')\n+    parser_hclust.set_defaults(feed_func=feedQueueClust)\n+    parser_hclust.set_defaults(work_func=processQueueClust)\n+    parser_hclust.set_defaults(collect_func=collectQueueClust)\n+    
parser_hclust.set_defaults(cluster_func=hierClust)\n+        \n+    return parser\n+\n+\n+if __name__ == \'__main__\':\n+    """\n+    Parses command line arguments and calls main function\n+    """\n+    # Parse arguments\n+    parser = getArgParser()\n+    args = parser.parse_args()\n+    args_dict = parseCommonArgs(args)\n+    # Convert case of fields\n+    if \'seq_field\' in args_dict:\n+        args_dict[\'seq_field\'] = args_dict[\'seq_field\'].upper()\n+    if \'fields\' in args_dict and args_dict[\'fields\'] is not None:  \n+        args_dict[\'fields\'] = [f.upper() for f in args_dict[\'fields\']]\n+    \n+    # Define clone_args\n+    if args.command == \'bygroup\':\n+        args_dict[\'group_args\'] = {\'fields\': args_dict[\'fields\'],\n+                                   \'action\': args_dict[\'action\'], \n+                                   \'mode\':args_dict[\'mode\']}\n+        args_dict[\'clone_args\'] = {\'model\':  args_dict[\'model\'],\n+                                   \'distance\':  args_dict[\'distance\'],\n+                                   \'norm\': args_dict[\'norm\'],\n+                                   \'sym\': args_dict[\'sym\'],\n+                                   \'linkage\': args_dict[\'linkage\'],\n+                                   \'seq_field\': args_dict[\'seq_field\']}\n+\n+        # TODO:  can be cleaned up with abstract model class\n+        args_dict[\'clone_args\'][\'dist_mat\'] = getModelMatrix(args_dict[\'model\'])\n+\n+        del args_dict[\'fields\']\n+        del args_dict[\'action\']\n+        del args_dict[\'mode\']\n+        del args_dict[\'model\']\n+        del args_dict[\'distance\']\n+        del args_dict[\'norm\']\n+        del args_dict[\'sym\']\n+        del args_dict[\'linkage\']\n+        del args_dict[\'seq_field\']\n+\n+    # Define clone_args\n+    if args.command == \'hclust\':\n+        dist_funcs = {\'chen2010\':distChen2010, \'ademokun2011\':distAdemokun2011}\n+        
args_dict[\'clone_func\'] = dist_funcs[args_dict[\'method\']]\n+        args_dict[\'cluster_args\'] = {\'method\':  args_dict[\'method\']}\n+        #del args_dict[\'fields\']\n+        del args_dict[\'method\']\n+    \n+    # Call defineClones\n+    del args_dict[\'command\']\n+    del args_dict[\'db_files\']\n+    for f in args.__dict__[\'db_files\']:\n+        args_dict[\'db_file\'] = f\n+        defineClones(**args_dict)\n\\ No newline at end of file\n'
b
diff -r 000000000000 -r 8a5a2abbb870 change_o/MakeDb.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/MakeDb.py Mon Aug 29 05:36:10 2016 -0400
[
b'@@ -0,0 +1,1025 @@\n+#!/usr/bin/env python3\n+"""\n+Create tab-delimited database file to store sequence alignment information\n+"""\n+# Info\n+__author__ = \'Namita Gupta, Jason Anthony Vander Heiden\'\n+from changeo import __version__, __date__\n+\n+# Imports\n+import csv\n+import os\n+import re\n+import sys\n+import pandas as pd\n+import tarfile\n+import zipfile\n+from argparse import ArgumentParser\n+from collections import OrderedDict\n+from itertools import groupby\n+from shutil import rmtree\n+from tempfile import mkdtemp\n+from textwrap import dedent\n+from time import time\n+from Bio import SeqIO\n+from Bio.Seq import Seq\n+from Bio.Alphabet import IUPAC\n+\n+# Presto and changeo imports\n+from presto.Defaults import default_out_args\n+from presto.Annotation import parseAnnotation\n+from presto.IO import countSeqFile, printLog, printProgress\n+from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs\n+from changeo.IO import getDbWriter, countDbFile, getRepo\n+from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \\\n+                             j_allele_regex\n+\n+# Default parameters\n+default_delimiter = (\'\\t\', \',\', \'-\')\n+\n+\n+def gapV(ig_dict, repo_dict):\n+    """\n+    Insert gaps into V region and update alignment information\n+\n+    Arguments:\n+      ig_dict : Dictionary of parsed IgBlast output\n+      repo_dict : Dictionary of IMGT gapped germline sequences\n+\n+    Returns:\n+      dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields\n+    """\n+\n+    seq_imgt = \'.\' * (int(ig_dict[\'V_GERM_START_VDJ\'])-1) + ig_dict[\'SEQUENCE_VDJ\']\n+\n+    # Find gapped germline V segment\n+    vgene = parseAllele(ig_dict[\'V_CALL\'], v_allele_regex, \'first\')\n+    vkey = (vgene, )\n+    #TODO: Figure out else case\n+    if vkey in repo_dict:\n+        vgap = repo_dict[vkey]\n+        # Iterate over gaps in the germline segment\n+        gaps = 
re.finditer(r\'\\.\', vgap)\n+        gapcount = int(ig_dict[\'V_GERM_START_VDJ\'])-1\n+        for gap in gaps:\n+            i = gap.start()\n+            # Break if gap begins after V region\n+            if i >= ig_dict[\'V_GERM_LENGTH_VDJ\'] + gapcount:\n+                break\n+            # Insert gap into IMGT sequence\n+            seq_imgt = seq_imgt[:i] + \'.\' + seq_imgt[i:]\n+            # Update gap counter\n+            gapcount += 1\n+        ig_dict[\'SEQUENCE_IMGT\'] = seq_imgt\n+        # Update IMGT positioning information for V\n+        ig_dict[\'V_GERM_START_IMGT\'] = 1\n+        ig_dict[\'V_GERM_LENGTH_IMGT\'] = ig_dict[\'V_GERM_LENGTH_VDJ\'] + gapcount\n+\n+    return ig_dict\n+\n+\n+def getIMGTJunc(ig_dict, repo_dict):\n+    """\n+    Identify junction region by IMGT definition\n+\n+    Arguments:\n+      ig_dict : Dictionary of parsed IgBlast output\n+      repo_dict : Dictionary of IMGT gapped germline sequences\n+\n+    Returns:\n+      dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields\n+    """\n+    # Find germline J segment\n+    jgene = parseAllele(ig_dict[\'J_CALL\'], j_allele_regex, \'first\')\n+    jkey = (jgene, )\n+    #TODO: Figure out else case\n+    if jkey in repo_dict:\n+        # Get germline J sequence\n+        jgerm = repo_dict[jkey]\n+        jgerm = jgerm[:ig_dict[\'J_GERM_START\']+ig_dict[\'J_GERM_LENGTH\']-1]\n+        # Look for (F|W)GXG aa motif in nt sequence\n+        motif = re.search(r\'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]\', jgerm)\n+        aa_end = len(ig_dict[\'SEQUENCE_IMGT\'])\n+        #TODO: Figure out else case\n+        if motif:\n+            # print(\'\\n\', motif.group())\n+            aa_end = motif.start() - len(jgerm) + 3\n+        # Add fields to dict\n+        ig_dict[\'JUNCTION\'] = ig_dict[\'SEQUENCE_IMGT\'][309:aa_end]\n+        ig_dict[\'JUNCTION_LENGTH\'] = len(ig_dict[\'JUNCTION\'])\n+\n+    return ig_dict\n+\n+\n+def getRegions(ig_dict):\n+    """\n+    Identify FWR and CDR 
regions by IMGT definition\n+\n+    Arguments:\n+      ig_dict : Dictionary of parsed alignment output\n+\n+    Returns:\n+      dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMG'..b'P, J_SCORE, J_IDENTITY,\n+                                     J_BTOP, and J_EVALUE columns.\'\'\')\n+    parser_igblast.add_argument(\'--regions\', action=\'store_true\', dest=\'region_fields\',\n+                                help=\'\'\'Specify if IMGT framework and CDR regions should be\n+                                     included in the output. Adds the FWR1_IMGT, FWR2_IMGT,\n+                                     FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and\n+                                     CDR3_IMGT columns.\'\'\')\n+    \n+    # IMGT aligner\n+    parser_imgt = subparsers.add_parser(\'imgt\', help=\'Process IMGT/HighV-Quest output\', \n+                                        parents=[parser_parent], \n+                                        formatter_class=CommonHelpFormatter)\n+    imgt_arg_group =  parser_imgt.add_mutually_exclusive_group(required=True)\n+    imgt_arg_group.add_argument(\'-i\', nargs=\'+\', action=\'store\', dest=\'aligner_files\',\n+                                help=\'\'\'Either zipped IMGT output files (.zip) or a folder\n+                                     containing unzipped IMGT output files (which must\n+                                     include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,\n+                                     and 6_Junction).\'\'\')\n+    parser_imgt.add_argument(\'-s\', nargs=\'*\', action=\'store\', dest=\'seq_files\',\n+                             required=False,\n+                             help=\'List of input FASTA files containing sequences\')\n+    parser_imgt.add_argument(\'--noparse\', action=\'store_true\', dest=\'no_parse\', \n+                             help=\'\'\'Specify if input IDs should not be parsed to add new\n+                                  columns to database.\'\'\')\n+    
parser_imgt.add_argument(\'--scores\', action=\'store_true\', dest=\'score_fields\',\n+                             help=\'\'\'Specify if alignment score metrics should be\n+                                  included in the output. Adds the V_SCORE, V_IDENTITY,\n+                                  J_SCORE and J_IDENTITY. Note, this will also add\n+                                  the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,\n+                                  but they will be empty for IMGT output.\'\'\')\n+    parser_imgt.add_argument(\'--regions\', action=\'store_true\', dest=\'region_fields\',\n+                             help=\'\'\'Specify if IMGT framework and CDR regions should be\n+                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,\n+                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and\n+                                  CDR3_IMGT columns.\'\'\')\n+    parser_imgt.set_defaults(func=parseIMGT)\n+\n+    return parser\n+    \n+    \n+if __name__ == "__main__":\n+    """\n+    Parses command line arguments and calls main\n+    """\n+    parser = getArgParser()    \n+    args = parser.parse_args()\n+    args_dict = parseCommonArgs(args, in_arg=\'aligner_files\')\n+\n+    # Set no ID parsing if sequence files are not provided\n+    if \'seq_files\' in args_dict and not args_dict[\'seq_files\']:\n+        args_dict[\'no_parse\'] = True\n+\n+    # Delete\n+    if \'seq_files\' in args_dict: del args_dict[\'seq_files\']\n+    if \'aligner_files\' in args_dict: del args_dict[\'aligner_files\']\n+    if \'command\' in args_dict: del args_dict[\'command\']\n+    if \'func\' in args_dict: del args_dict[\'func\']           \n+    \n+    if args.command == \'imgt\':\n+        for i in range(len(args.__dict__[\'aligner_files\'])):\n+            args_dict[\'imgt_output\'] = args.__dict__[\'aligner_files\'][i]\n+            args_dict[\'seq_file\'] = args.__dict__[\'seq_files\'][i] \\\n+                 
                   if args.__dict__[\'seq_files\'] else None\n+            args.func(**args_dict)\n+    elif args.command == \'igblast\':\n+        for i in range(len(args.__dict__[\'aligner_files\'])):\n+            args_dict[\'igblast_output\'] =  args.__dict__[\'aligner_files\'][i]\n+            args_dict[\'seq_file\'] = args.__dict__[\'seq_files\'][i]\n+            args.func(**args_dict)\n'
b
diff -r 000000000000 -r 8a5a2abbb870 change_o/define_clones.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/define_clones.r Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,15 @@
+# Summarise Change-O clone assignments into a clone-size distribution:
+# for every clone size, how many clones have that size and how many
+# sequences they account for in total.
+args <- commandArgs(trailingOnly = TRUE)
+
+input=args[1]   # clone-pass .tab file produced by DefineClones.py
+output=args[2]  # destination for the clone-size summary table
+
+change.o = read.table(input, header=T, sep="\t", quote="", stringsAsFactors=F)
+
+# First table: sequences per clone ID; second table: clones per clone size.
+clone.sizes = data.frame(table(change.o$CLONE))
+size.distribution = data.frame(table(clone.sizes$Freq))
+
+# Sequences covered by each clone size = clone size * number of such clones.
+# Var1 is a factor, so go through as.character before as.numeric.
+size.distribution$final = as.numeric(size.distribution$Freq) * as.numeric(as.character(size.distribution$Var1))
+
+names(size.distribution) = c("Clone size", "Nr of clones", "Nr of sequences")
+
+write.table(x=size.distribution, file=output, sep="\t",quote=F,row.names=F,col.names=T)
b
diff -r 000000000000 -r 8a5a2abbb870 change_o/define_clones.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/define_clones.sh Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,43 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+#define_clones.sh $input $noparse $scores $regions $out_file
+
+type=$1
+input=$2
+
+mkdir -p $PWD/outdir
+
+cp $input $PWD/input.tab #file has to have a ".tab" extension
+
+if [ "bygroup" == "$type" ] ; then
+ mode=$3
+ act=$4
+ model=$5
+ norm=$6
+ sym=$7
+ link=$8
+ dist=$9
+ output=${10}
+ output2=${11}
+
+ python3 $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+ #/data/users/david/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+ #/home/galaxy/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+
+ Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
+else
+ method=$3
+ output=$4
+ output2=$5
+
+ python3 $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+ #/data/users/david/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+ #/home/galaxy/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+
+ Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
+fi
+
+cp $PWD/outdir/output_clone-pass.tab $output
+
+rm -rf $PWD/outdir/
b
diff -r 000000000000 -r 8a5a2abbb870 change_o/makedb.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/makedb.sh Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,38 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+input=$1
+noparse=$2
+scores=$3
+regions=$4
+output=$5
+
+if [ "true" == "$noparse" ] ; then
+ noparse="--noparse"
+else
+ noparse=""
+fi
+
+if [ "true" == "$scores" ] ; then
+ scores="--scores"
+else
+ scores=""
+fi
+
+if [ "true" == "$regions" ] ; then
+ regions="--regions"
+else
+ regions=""
+fi
+
+mkdir $PWD/outdir
+
+echo "makedb: $PWD/outdir"
+
+python3 $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+#/data/users/david/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+#/home/galaxy/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+
+mv $PWD/outdir/output_db-pass.tab $output
+
+rm -rf $PWD/outdir/
b
diff -r 000000000000 -r 8a5a2abbb870 datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml Mon Aug 29 05:36:10 2016 -0400
b
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<!-- Galaxy datatype registration: declares 'imgt_archive' as an uploadable
+     subclass of CompressedArchive for IMGT/HighV-QUEST result archives. -->
+<datatypes>
+    <registration>
+        <datatype extension="imgt_archive" type="galaxy.datatypes.binary:CompressedArchive" display_in_upload="True" subclass="True"/>
+    </registration>
+</datatypes>
b
diff -r 000000000000 -r 8a5a2abbb870 gene_identification.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_identification.py Mon Aug 29 05:36:10 2016 -0400
[
b'@@ -0,0 +1,220 @@\n+import re\n+import argparse\n+import time\n+starttime= int(time.time() * 1000)\n+\n+parser = argparse.ArgumentParser()\n+parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file")\n+parser.add_argument("--output", help="The annotated output file to be merged back with the summary file")\n+\n+args = parser.parse_args()\n+\n+infile = args.input\n+#infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt"\n+output = args.output\n+#outfile = "identified.txt"\n+\n+dic = dict()\n+total = 0\n+\n+\n+first = True\n+IDIndex = 0\n+seqIndex = 0\n+\n+with open(infile, \'r\') as f: #read all sequences into a dictionary as key = ID, value = sequence\n+\tfor line in f:\n+\t\ttotal += 1\n+\t\tlinesplt = line.split("\\t")\n+\t\tif first:\n+\t\t\tprint "linesplt", linesplt\n+\t\t\tIDIndex = linesplt.index("Sequence ID")\n+\t\t\tseqIndex = linesplt.index("Sequence")\n+\t\t\tfirst = False\n+\t\t\tcontinue\n+\t\t\n+\t\tID = linesplt[IDIndex]\n+\t\tif len(linesplt) < 28: #weird rows without a sequence\n+\t\t\tdic[ID] = ""\n+\t\telse:\n+\t\t\tdic[ID] = linesplt[seqIndex]\n+\t\t\t\n+print "Number of input sequences:", len(dic)\n+\n+#old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc\n+#old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag\n+\n+#lambda/kappa reference sequence\n+searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",\n+                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",\n+                 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence\n+\n+compiledregex = {"ca": [],\n+                 "cg": [],\n+                 "cm": []}\n+\n+#lambda/kappa reference sequence variable nucleotides\n+ca1 = {38: \'t\', 39: \'g\', 48: \'a\', 49: \'g\', 51: \'c\', 68: \'a\', 73: \'c\'}\n+ca2 = {38: \'g\', 39: 
\'a\', 48: \'c\', 49: \'c\', 51: \'a\', 68: \'g\', 73: \'a\'}\n+cg1 = {0: \'c\', 33: \'a\', 38: \'c\', 44: \'a\', 54: \'t\', 56: \'g\', 58: \'g\', 66: \'g\', 132: \'c\'}\n+cg2 = {0: \'c\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'c\', 56: \'a\', 58: \'a\', 66: \'g\', 132: \'t\'}\n+cg3 = {0: \'t\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'t\', 56: \'g\', 58: \'g\', 66: \'g\', 132: \'c\'}\n+cg4 = {0: \'t\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'c\', 56: \'a\', 58: \'a\', 66: \'c\', 132: \'c\'}\n+\n+#remove last snp for shorter cg sequence --- note, also change varsInCG\n+del cg1[132]\n+del cg2[132]\n+del cg3[132]\n+del cg4[132]\n+\n+#reference sequences are cut into smaller parts of \'chunklength\' length, and with \'chunklength\' / 2 overlap\n+chunklength = 8\n+\n+#create the chunks of the reference sequence with regular expressions for the variable nucleotides\n+for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength / 2):\n+  pos = i\n+  chunk = searchstrings["ca"][i:i+chunklength]\n+  result = ""\n+  varsInResult = 0\n+  for c in chunk:\n+    if pos in ca1.keys():\n+      varsInResult += 1\n+      result += "[" + ca1[pos] + ca2[pos] + "]"\n+    else:\n+      result += c\n+    pos += 1\n+  compiledregex["ca"].append((re.compile(result), varsInResult))\n+\n+for i in range(0, len(searchstrings["cg"]) - chunklength, chunklength / 2):\n+  pos = i\n+  chunk = searchstrings["cg"][i:i+chunklength]\n+  result = ""\n+  varsInResult = 0\n+  for c in chunk:\n+    if pos in cg1.keys():\n+      varsInResult += 1\n+      result += "[" + "".join(set([cg1[pos], cg2[pos], cg3[pos], cg4[pos]])) + "]"\n+    else:\n+      result += c\n+    pos += 1\n+  compiledregex["cg"].append((re.compile(result), varsInResult))\n+\n+for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2):\n+  compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False))\n+\n+\n+\n+def removeAndReturnMaxIndex(x): #simplifies a list comprehension\n+  m = 
max(x)\n+  index = x.index(m)\n+  x[index] = 0\n+  return index\n+  \n+\n+start_location = dict()\n+hits = dict()\n+alltotal = 0\n+for key in compiledregex.keys(): #for ca/cg/cm\n+\tregularexpressions = compiledregex[key] #get the compiled regular expressions\n+\tfor ID in dic.key'..b'ts["cg2"] += len([1 for x in cg2 if chunkstart <= x < chunkend and cg2[x] == seq[lastindex + x - chunkstart]])\n+\t\t\t\t\t\tcurrentIDHits["cg3"] += len([1 for x in cg3 if chunkstart <= x < chunkend and cg3[x] == seq[lastindex + x - chunkstart]])\n+\t\t\t\t\t\tcurrentIDHits["cg4"] += len([1 for x in cg4 if chunkstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]])\n+\t\t\t\t\telse: #key == "cm" #no variable regions in \'cm\'\n+\t\t\t\t\t\tpass\n+\t\t\t\tbreak #this only breaks when there was a match with the regex, breaking means the \'else:\' clause is skipped\n+\t\t\telse: #only runs if there were no hits\n+\t\t\t\tcontinue\n+\t\t\t#print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it\'s now:", start[lastindex - chunklength / 2 * i]\n+\t\t\tcurrentIDHits[key + "_hits"] += 1\n+\t\tstart_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1 - start_zero) for x in range(5) if len(start) > 0 and max(start) > 1])\n+\t\t#start_location[ID + "_" + key] = str(start.index(max(start)))\n+\n+\n+chunksInCA = len(compiledregex["ca"])\n+chunksInCG = len(compiledregex["cg"])\n+chunksInCM = len(compiledregex["cm"])\n+requiredChunkPercentage = 0.7\n+varsInCA = float(len(ca1.keys()) * 2)\n+varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn\'t hit the first and last nt twice\n+varsInCM = 0\n+\n+\n+\n+first = True\n+seq_write_count=0\n+with open(infile, \'r\') as f: #read all sequences into a dictionary as key = ID, value = sequence\n+\twith open(output, \'w\') as o:\n+\t\tfor line in f:\n+\t\t\ttotal += 1\n+\t\t\tif first:\n+\t\t\t\to.write("Sequence 
ID\\tbest_match\\tnt_hit_percentage\\tchunk_hit_percentage\\tstart_locations\\n")\n+\t\t\t\tfirst = False\n+\t\t\t\tcontinue\n+\t\t\tlinesplt = line.split("\\t")\n+\t\t\tif linesplt[2] == "No results":\n+\t\t\t\tpass\n+\t\t\tID = linesplt[1]\n+\t\t\tcurrentIDHits = hits[ID]\n+\t\t\tpossibleca = float(len(compiledregex["ca"]))\n+\t\t\tpossiblecg = float(len(compiledregex["cg"]))\n+\t\t\tpossiblecm = float(len(compiledregex["cm"]))\n+\t\t\tcahits = currentIDHits["ca_hits"]\n+\t\t\tcghits = currentIDHits["cg_hits"]\n+\t\t\tcmhits = currentIDHits["cm_hits"]\n+\t\t\tif cahits >= cghits and cahits >= cmhits: #its a ca gene\n+\t\t\t\tca1hits = currentIDHits["ca1"]\n+\t\t\t\tca2hits = currentIDHits["ca2"]\n+\t\t\t\tif ca1hits >= ca2hits:\n+\t\t\t\t\to.write(ID + "\\tca1\\t" + str(int(ca1hits / varsInCA * 100)) + "\\t" + str(int(cahits / possibleca * 100)) + "\\t" + start_location[ID + "_ca"] + "\\n")\n+\t\t\t\telse:\n+\t\t\t\t\to.write(ID + "\\tca2\\t" + str(int(ca2hits / varsInCA * 100)) + "\\t" + str(int(cahits / possibleca * 100)) + "\\t" + start_location[ID + "_ca"] + "\\n")\n+\t\t\telif cghits >= cahits and cghits >= cmhits: #its a cg gene\n+\t\t\t\tcg1hits = currentIDHits["cg1"]\n+\t\t\t\tcg2hits = currentIDHits["cg2"]\n+\t\t\t\tcg3hits = currentIDHits["cg3"]\n+\t\t\t\tcg4hits = currentIDHits["cg4"]\n+\t\t\t\tif cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene\n+\t\t\t\t\to.write(ID + "\\tcg1\\t" + str(int(cg1hits / varsInCG * 100)) + "\\t" + str(int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\t\telif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene\n+\t\t\t\t\to.write(ID + "\\tcg2\\t" + str(int(cg2hits / varsInCG * 100)) + "\\t" + str(int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\t\telif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene\n+\t\t\t\t\to.write(ID + "\\tcg3\\t" + str(int(cg3hits / varsInCG * 100)) + 
"\\t" + str(int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\t\telse: #cg4 gene\n+\t\t\t\t\to.write(ID + "\\tcg4\\t" + str(int(cg4hits / varsInCG * 100)) + "\\t" + str(int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\telse: #its a cm gene\n+\t\t\t\to.write(ID + "\\tcm\\t100\\t" + str(int(cmhits / possiblecm * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\tseq_write_count += 1\n+\n+print "Time: %i" % (int(time.time() * 1000) - starttime)\n+\n+print "Number of sequences written to file:", seq_write_count\n+\n+\n+\n+\n+\n'
b
diff -r 000000000000 -r 8a5a2abbb870 imgt_loader.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/imgt_loader.r Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,82 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+# Input tables from an IMGT/HighV-QUEST export (presumably the Summary,
+# AA-sequences and Junction files -- confirm against the caller) plus the
+# output path for the merged, renamed table.
+summ.file = args[1]
+aa.file = args[2]
+junction.file = args[3]
+out.file = args[4]
+
+# fill=T tolerates rows with missing trailing columns; quote="" keeps any
+# quote characters inside the annotation fields literal.
+summ = read.table(summ.file, sep="\t", header=T, quote="", fill=T)
+aa = read.table(aa.file, sep="\t", header=T, quote="", fill=T)
+junction = read.table(junction.file, sep="\t", header=T, quote="", fill=T)
+
+# Column groups pulled from each IMGT table. The 'old_*' vectors appear to
+# be unused below -- NOTE(review): candidates for removal, confirm first.
+old_summary_columns=c('Sequence.ID','JUNCTION.frame','V.GENE.and.allele','D.GENE.and.allele','J.GENE.and.allele','CDR1.IMGT.length','CDR2.IMGT.length','CDR3.IMGT.length','Orientation')
+old_sequence_columns=c('CDR1.IMGT','CDR2.IMGT','CDR3.IMGT')
+old_junction_columns=c('JUNCTION')
+
+added_summary_columns=c('Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence')
+added_sequence_columns=c('FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT')
+
+# NOTE(review): 'N2.REGION.nt.nb' and 'P3.D2.nt.nb' each occur twice in the
+# combined vector ('P3.D2.nt.nb' is possibly a typo for 'P3.D3.nt.nb');
+# the duplicates are carried through consistently to the select and rename
+# steps below, so changing them would alter the output header -- confirm
+# against downstream consumers before fixing.
+added_junction_columns=c('P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION')
+added_junction_columns=c(added_junction_columns, 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')
+
+# Start the output from the summary table's identity/gene-call columns.
+out=summ[,c("Sequence.ID","JUNCTION.frame","V.GENE.and.allele","D.GENE.and.allele","J.GENE.and.allele")]
+
+# CDR sequences come from the AA table, lengths from the summary table.
+# Rows are assumed to be aligned across the three tables -- TODO confirm.
+out[,"CDR1.Seq"] = aa[,"CDR1.IMGT"]
+out[,"CDR1.Length"] = summ[,"CDR1.IMGT.length"]
+
+out[,"CDR2.Seq"] = aa[,"CDR2.IMGT"]
+out[,"CDR2.Length"] = summ[,"CDR2.IMGT.length"]
+
+out[,"CDR3.Seq"] = aa[,"CDR3.IMGT"]
+out[,"CDR3.Length"] = summ[,"CDR3.IMGT.length"]
+
+# DNA-level CDR3 is taken from the junction table.
+out[,"CDR3.Seq.DNA"] = junction[,"JUNCTION"]
+out[,"CDR3.Length.DNA"] = nchar(as.character(junction[,"JUNCTION"]))
+out[,"Strand"] = summ[,"Orientation"]
+out[,"CDR3.Found.How"] = "a"
+
+out[,added_summary_columns] = summ[,added_summary_columns]
+
+out[,added_sequence_columns] = aa[,added_sequence_columns]
+
+out[,added_junction_columns] = junction[,added_junction_columns]
+
+# Strip allele suffix ("*01") and any leading qualifier to get the bare
+# top gene name, e.g. "Homsap IGHV3-23*01 F" -> "IGHV3-23".
+out[,"Top V Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"V.GENE.and.allele"]))
+out[,"Top D Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"D.GENE.and.allele"]))
+out[,"Top J Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"J.GENE.and.allele"]))
+
+# Fix the final column order, then rename to the pipeline's canonical names.
+out = out[,c('Sequence.ID','JUNCTION.frame','Top V Gene','Top D Gene','Top J Gene','CDR1.Seq','CDR1.Length','CDR2.Seq','CDR2.Length','CDR3.Seq','CDR3.Length','CDR3.Seq.DNA','CDR3.Length.DNA','Strand','CDR3.Found.How','Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence','FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT','P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')]
+
+names(out) = c('ID','VDJ Frame','Top V Gene','Top D Gene','Top J Gene','CDR1 Seq','CDR1 Length','CDR2 Seq','CDR2 Length','CDR3 Seq','CDR3 Length','CDR3 Seq DNA','CDR3 Length DNA','Strand','CDR3 Found How','Functionality','V-REGION identity %','V-REGION identity nt','D-REGION reading frame','AA JUNCTION','Functionality comment','Sequence','FR1-IMGT','FR2-IMGT','FR3-IMGT','CDR3-IMGT','JUNCTION','J-REGION','FR4-IMGT','P3V-nt nb','N-REGION-nt nb','N1-REGION-nt nb','P5D-nt nb','P3D-nt nb','N2-REGION-nt nb','P5J-nt nb','3V-REGION trimmed-nt nb','5D-REGION trimmed-nt nb','3D-REGION trimmed-nt nb','5J-REGION trimmed-nt nb','N-REGION','N1-REGION','N2-REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')
+
+out[,"VDJ Frame"] = as.character(out[,"VDJ Frame"])
+
+# Normalise IMGT frame annotations to the pipeline's canonical values:
+# "in-frame" -> "In-frame"; "null", "out-of-frame" and "" -> "Out-of-frame".
+# NA-safe: the previous per-value '==' comparisons yielded NA for NA
+# entries, making 'if(any(fltr))' evaluate to if(NA) (an error) and
+# producing illegal NA logical subscripts in the assignment.
+frame.values = out[,"VDJ Frame"]
+frame.values[!is.na(frame.values) & frame.values == "in-frame"] = "In-frame"
+frame.values[frame.values %in% c("null", "out-of-frame", "")] = "Out-of-frame"
+out[,"VDJ Frame"] = frame.values
+
+# Empty top-gene calls become the literal string "NA" (kept as a string,
+# matching the original behaviour). Genuine NA values are left untouched;
+# the bare '== ""' comparison would otherwise error on them (see above).
+for(col in c('Top V Gene','Top D Gene','Top J Gene')){
+ out[,col] = as.character(out[,col])
+ fltr = !is.na(out[,col]) & out[,col] == ""
+ if(any(fltr)){
+  out[fltr,col] = "NA"
+ }
+}
+
+write.table(out, out.file, sep="\t", quote=F, row.names=F, col.names=T)
b
diff -r 000000000000 -r 8a5a2abbb870 merge.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/merge.r Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,27 @@
+# Merge two tab-separated tables on a pair of key columns, optionally
+# restricting each table to a comma-separated subset of columns first.
+args <- commandArgs(trailingOnly = TRUE)
+
+input.1 = args[1]
+input.2 = args[2]
+
+fields.1 = args[3]  # comma-separated column subset for table 1, or "all"
+fields.2 = args[4]  # comma-separated column subset for table 2, or "all"
+
+field.1 = args[5]   # merge key column in table 1
+field.2 = args[6]   # merge key column in table 2
+
+output = args[7]
+
+# Read one table and, unless 'fields' is "all", keep only the named columns.
+read.selected = function(path, fields){
+ dat = read.table(path, header=T, sep="\t", quote="", stringsAsFactors=F, fill=T, row.names=NULL)
+ if(fields != "all"){
+  dat = dat[,unlist(strsplit(fields, ","))]
+ }
+ dat
+}
+
+dat1 = read.selected(input.1, fields.1)
+dat2 = read.selected(input.2, fields.2)
+
+dat3 = merge(dat1, dat2, by.x=field.1, by.y=field.2)
+
+write.table(dat3, output, sep="\t",quote=F,row.names=F,col.names=T)
b
diff -r 000000000000 -r 8a5a2abbb870 merge_and_filter.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_and_filter.r Mon Aug 29 05:36:10 2016 -0400
[
b'@@ -0,0 +1,209 @@\n+args <- commandArgs(trailingOnly = TRUE)\r\n+\r\n+\r\n+summaryfile = args[1]\r\n+sequencesfile = args[2]\r\n+mutationanalysisfile = args[3]\r\n+mutationstatsfile = args[4]\r\n+hotspotsfile = args[5]\r\n+gene_identification_file= args[6]\r\n+output = args[7]\r\n+before.unique.file = args[8]\r\n+unmatchedfile = args[9]\r\n+method=args[10]\r\n+functionality=args[11]\r\n+unique.type=args[12]\r\n+filter.unique=args[13]\r\n+class.filter=args[14]\r\n+empty.region.filter=args[15]\r\n+\r\n+summ = read.table(summaryfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+sequences = read.table(sequencesfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+mutationstats = read.table(mutationstatsfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+hotspots = read.table(hotspotsfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+gene_identification = read.table(gene_identification_file, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+\r\n+if(method == "blastn"){\r\n+\t"qseqid\\tsseqid\\tpident\\tlength\\tmismatch\\tgapopen\\tqstart\\tqend\\tsstart\\tsend\\tevalue\\tbitscore"\r\n+\tgene_identification = gene_identification[!duplicated(gene_identification$qseqid),]\r\n+\tref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52))\r\n+\tgene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T)\r\n+\tgene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100\r\n+\tgene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")]\r\n+\tcolnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", 
"best_match")\r\n+\t\r\n+}\r\n+\r\n+input.sequence.count = nrow(summ)\r\n+print(paste("Number of sequences in summary file:", input.sequence.count))\r\n+\r\n+filtering.steps = data.frame(character(0), numeric(0))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("Input", input.sequence.count))\r\n+\r\n+filtering.steps[,1] = as.character(filtering.steps[,1])\r\n+filtering.steps[,2] = as.character(filtering.steps[,2])\r\n+#filtering.steps[,3] = as.numeric(filtering.steps[,3])\r\n+\r\n+summ = merge(summ, gene_identification, by="Sequence.ID")\r\n+\r\n+summ = summ[summ$Functionality != "No results",]\r\n+\r\n+print(paste("Number of sequences after \'No results\' filter:", nrow(summ)))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("After \'No results\' filter", nrow(summ)))\r\n+\r\n+if(functionality == "productive"){\r\n+\tsumm = summ[summ$Functionality == "productive (see comment)" | summ$Functionality == "productive",]\r\n+} else if (functionality == "unproductive"){\r\n+\tsumm = summ[summ$Functionality == "unproductive (see comment)" | summ$Functionality == "unproductive",]\r\n+} else if (functionality == "remove_unknown"){\r\n+\tsumm = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",]\r\n+}\r\n+\r\n+print(paste("Number of sequences after productive filter:", nrow(summ)))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("After productive filter", nrow(summ)))\r\n+\r\n+splt = strsplit(class.filter, "_")[[1]]\r\n+chunk_hit_threshold = as.numeric(splt[1])\r\n+nt_hit_threshold = as.numeric(splt[2])\r\n+\r\n+higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold)\r\n+\r\n+unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]\r\n+\r\n+if(!all(higher_than, na.rm=T)){ #check for \'not all\' because that would mean the unmatched set is empty\r\n+\tunmatched = 
summ[!higher_than,]\r\n+\tunmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]\r\n+\tunmatched$best_match = paste("unmatched,", unmatched$best_match)\r\n+\tsumm[!higher_than,"best_match"] = paste("unmatche'..b'.steps = rbind(filtering.steps, c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result)))\r\n+} else if(empty.region.filter == "CDR1"){\r\n+\tresult = result[result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]\r\n+\tprint(paste("Number of sequences after empty FR2, CDR2 and FR3 column filter:", nrow(result)))\r\n+\tfiltering.steps = rbind(filtering.steps, c("After empty FR2, CDR2, FR3 filter", nrow(result)))\r\n+} else if(empty.region.filter == "FR2"){\r\n+\tresult = result[result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]\r\n+\tprint(paste("Number of sequences after empty CDR2 and FR3 column filter:", nrow(result)))\r\n+\tfiltering.steps = rbind(filtering.steps, c("After empty CDR2, FR3 filter", nrow(result)))\r\n+}\r\n+\r\n+result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]\r\n+\r\n+print(paste("Number of sequences in result after n filtering:", nrow(result)))\r\n+filtering.steps = rbind(filtering.steps, c("After N filter", nrow(result)))\r\n+\r\n+cleanup_columns = c("FR1.IMGT.Nb.of.mutations", \r\n+                    "CDR1.IMGT.Nb.of.mutations", \r\n+                    "FR2.IMGT.Nb.of.mutations", \r\n+                    "CDR2.IMGT.Nb.of.mutations", \r\n+                    "FR3.IMGT.Nb.of.mutations")\r\n+\r\n+for(col in cleanup_columns){\r\n+  result[,col] = gsub("\\\\(.*\\\\)", "", result[,col])\r\n+  result[,col] = as.numeric(result[,col])\r\n+  result[is.na(result[,col]),] = 0\r\n+}\r\n+\r\n+write.table(result, before.unique.file, sep="\\t", 
quote=F,row.names=F,col.names=T)\r\n+\r\n+if(filter.unique != "no"){\r\n+\tclmns = names(result)\r\n+\t\r\n+\tif(grepl("_c", filter.unique)){\r\n+\t\tresult$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq, result$best_match)\r\n+\t} else {\r\n+\t\tresult$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\r\n+\t}\r\n+\r\n+\t#fltr = result$unique.def %in% result.filtered$unique.def\r\n+\t\t\r\n+\tif(grepl("keep", filter.unique)){\r\n+\t\tresult$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes\r\n+\t\tresult = result[!duplicated(result$unique.def),]\r\n+\t} else {\r\n+\t\tresult = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]\r\n+\t\tresult$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes\r\n+\t\tresult = result[!duplicated(result$unique.def),]\r\n+\t}\r\n+\t\r\n+\t#result = result[,clmns]\r\n+\t\r\n+\t#write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)\r\n+}\r\n+\r\n+print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))\r\n+print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),])))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("After unique filter", nrow(result)))\r\n+\r\n+print(paste("Number of rows in result:", nrow(result)))\r\n+print(paste("Number of rows in unmatched:", nrow(unmatched)))\r\n+\r\n+matched.sequences.count = sum(!grepl("^unmatched", result$best_match))\r\n+unmatched.sequences.count = sum(grepl("^unmatched", result$best_match))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))\r\n+filtering.steps = rbind(filtering.steps, 
c("Number of unmatched sequences", unmatched.sequences.count))\r\n+filtering.steps[,2] = as.numeric(filtering.steps[,2])\r\n+filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2)\r\n+\r\n+write.table(x=filtering.steps, file=gsub("unmatched", "filtering_steps", unmatchedfile), sep="\\t",quote=F,row.names=F,col.names=F)\r\n+\r\n+write.table(x=result, file=output, sep="\\t",quote=F,row.names=F,col.names=T)\r\n+write.table(x=unmatched, file=unmatchedfile, sep="\\t",quote=F,row.names=F,col.names=T)\r\n'
b
diff -r 000000000000 -r 8a5a2abbb870 mutation_analysis.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mutation_analysis.py Mon Aug 29 05:36:10 2016 -0400
[
b'@@ -0,0 +1,290 @@\n+from __future__ import division\n+from collections import defaultdict\n+import re\n+import argparse\n+\n+parser = argparse.ArgumentParser()\n+parser.add_argument("--input",\n+\t\t\t\t\thelp="The \'7_V-REGION-mutation-and-AA-change-table\' and \'10_V-REGION-mutation-hotspots\' merged together, with an added \'best_match\' annotation")\n+parser.add_argument("--genes", help="The genes available in the \'best_match\' column")\n+parser.add_argument("--includefr1", help="Should the mutation/nucleotides in the FR1 region be included?")\n+parser.add_argument("--output", help="Output file")\n+\n+args = parser.parse_args()\n+\n+infile = args.input\n+genes = str(args.genes).split(",")\n+print "includefr1 =", args.includefr1\n+include_fr1 = True if args.includefr1 == "yes" else False\n+outfile = args.output\n+\n+genedic = dict()\n+\n+mutationdic = dict()\n+mutationMatcher = re.compile("^(.)(\\d+).(.),?(.)?(\\d+)?.?(.)?(.?.?.?.?.?)?")\n+NAMatchResult = (None, None, None, None, None, None, \'\')\n+linecount = 0\n+\n+IDIndex = 0\n+best_matchIndex = 0\n+fr1Index = 0\n+cdr1Index = 0\n+fr2Index = 0\n+cdr2Index = 0\n+fr3Index = 0\n+first = True\n+IDlist = []\n+mutationList = []\n+mutationListByID = {}\n+cdr1LengthDic = {}\n+cdr2LengthDic = {}\n+\n+with open(infile, \'r\') as i:\n+\tfor line in i:\n+\t\tif first:\n+\t\t\tlinesplt = line.split("\\t")\n+\t\t\tIDIndex = linesplt.index("Sequence.ID")\n+\t\t\tbest_matchIndex = linesplt.index("best_match")\n+\t\t\tfr1Index = linesplt.index("FR1.IMGT")\n+\t\t\tcdr1Index = linesplt.index("CDR1.IMGT")\n+\t\t\tfr2Index = linesplt.index("FR2.IMGT")\n+\t\t\tcdr2Index = linesplt.index("CDR2.IMGT")\n+\t\t\tfr3Index = linesplt.index("FR3.IMGT")\n+\t\t\tcdr1LengthIndex = linesplt.index("CDR1.IMGT.length")\n+\t\t\tcdr2LengthIndex = linesplt.index("CDR2.IMGT.length")\n+\t\t\tfirst = False\n+\t\t\tcontinue\n+\t\tlinecount += 1\n+\t\tlinesplt = line.split("\\t")\n+\t\tID = linesplt[IDIndex]\n+\t\tgenedic[ID] = 
linesplt[best_matchIndex]\n+\t\ttry:\n+\t\t\tif linesplt[fr1Index] != "NA":\n+\t\t\t\tmutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else []\n+\t\t\telse:\n+\t\t\t\tmutationdic[ID + "_FR1"] = []\n+\t\t\tmutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if linesplt[cdr1Index] != "NA" else []\n+\t\t\tmutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if linesplt[fr2Index] != "NA" else []\n+\t\t\tmutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if linesplt[cdr2Index] != "NA" else []\n+\t\t\tmutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]\n+\t\t\tmutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []\n+\t\texcept e:\n+\t\t\tprint linesplt\n+\t\t\tprint linecount\n+\t\t\tprint e\n+\t\tmutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]\n+\t\tmutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]\n+\n+\t\tcdr1Length = linesplt[cdr1LengthIndex]\n+\t\tcdr2Length = linesplt[cdr2LengthIndex]\n+\n+\t\tcdr1LengthDic[ID] = int(cdr1Length) if cdr1Length != "X" else 0\n+\t\tcdr2LengthDic[ID] = int(cdr2Length) if cdr2Length != "X" else 0\n+\t\t\n+\t\tIDlist += [ID]\n+\n+AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1)  # [4] is the position of the AA mutation, None if silent\n+if AALength < 60:\n+\tAALength = 64\n+\n+AA_mutation = [0] * AALength\n+AA_mutation_dic = {"ca": AA_mutation[:], "cg": AA_mutation[:], "cm": AA_mutation[:], "un": AA_mutation[:]}\n+AA_mutation_empty = 
AA_mutation[:]\n+\n+aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt"\n+with open(aa_mutations_by_id_file, \'w\') as o:\n+\to.write("ID\\tbest_match\\t" + "\\t".join([str(x) for x in range(1,AALength)]) + "\\n")\n+\tfor ID in mutat'..b'ndex]\n+\t\tID = linesplt[IDIndex]\n+\t\tif ID == "ca2":\n+\t\t\tprint linesplt\n+\t\tRGYW = [(int(x), int(y), z) for (x, y, z) in\n+\t\t\t\t[hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]\n+\t\tWRCY = [(int(x), int(y), z) for (x, y, z) in\n+\t\t\t\t[hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]]\n+\t\tWA = [(int(x), int(y), z) for (x, y, z) in\n+\t\t\t  [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]\n+\t\tTW = [(int(x), int(y), z) for (x, y, z) in\n+\t\t\t  [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]\n+\t\tRGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0\n+\n+\t\tmutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[\n+\t\t\tID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]\n+\t\tfor mutation in mutationList:\n+\t\t\tfrm, where, to, AAfrm, AAwhere, AAto, junk = mutation\n+\t\t\tmutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW])\n+\t\t\tmutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY])\n+\t\t\tmutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA])\n+\t\t\tmutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW])\n+\n+\t\t\tin_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW])\n+\n+\t\t\tif in_how_many_motifs > 0:\n+\t\t\t\tRGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs\n+\t\t\t\tWRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs\n+\t\t\t\tWACount[ID] += (1.0 * 
int(mutation_in_WA)) / in_how_many_motifs\n+\t\t\t\tTWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs\n+\n+\n+def mean(lst):\n+\treturn (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0\n+\n+\n+def median(lst):\n+\tlst = sorted(lst)\n+\tl = len(lst)\n+\tif l == 0:\n+\t\treturn 0\n+\tif l == 1:\n+\t\treturn lst[0]\n+\t\t\n+\tl = int(l / 2)\n+\t\n+\tif len(lst) % 2 == 0:\n+\t\treturn float(lst[l] + lst[(l - 1)]) / 2.0\n+\telse:\n+\t\treturn lst[l]\n+\n+funcs = {"mean": mean, "median": median, "sum": sum}\n+\n+directory = outfile[:outfile.rfind("/") + 1]\n+value = 0\n+valuedic = dict()\n+\n+for fname in funcs.keys():\n+\tfor gene in genes:\n+\t\twith open(directory + gene + "_" + fname + "_value.txt", \'r\') as v:\n+\t\t\tvaluedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())\n+\twith open(directory + "all_" + fname + "_value.txt", \'r\') as v:\n+\t\tvaluedic["total_" + fname] = float(v.readlines()[0].rstrip())\n+\t\n+\n+def get_xyz(lst, gene, f, fname):\n+\tx = int(round(f(lst)))\n+\ty = valuedic[gene + "_" + fname]\n+\tz = str(round(x / float(y) * 100, 1)) if y != 0 else "0"\n+\treturn (str(x), str(y), z)\n+\n+dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}\n+arr = ["RGYW", "WRCY", "WA", "TW"]\n+\n+geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}\n+\n+for fname in funcs.keys():\n+\tfunc = funcs[fname]\n+\tfoutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"\n+\twith open(foutfile, \'w\') as o:\n+\t\tfor typ in arr:\n+\t\t\to.write(typ + " (%)")\n+\t\t\tcurr = dic[typ]\n+\t\t\tfor gene in genes:\n+\t\t\t\tgeneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop....\n+\t\t\t\tif valuedic[gene + "_" + fname] is 0:\n+\t\t\t\t\to.write(",0,0,0")\n+\t\t\t\telse:\n+\t\t\t\t\tx, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)\n+\t\t\t\t\to.write("," + x + "," + y + "," + 
z)\n+\n+\t\t\tx, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)\n+\t\t\to.write("," + x + "," + y + "," + z + "\\n")\n+\n+\n+# for testing\n+seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt"\n+with open(seq_motif_file, \'w\') as o:\n+\to.write("ID\\tRGYWC\\tWRCY\\tWA\\tTW\\n")\n+\tfor ID in IDlist:\n+\t\to.write(ID + "\\t" + str(round(RGYWCount[ID], 2)) + "\\t" + str(round(WRCYCount[ID], 2)) + "\\t" + str(round(WACount[ID], 2)) + "\\t" + str(round(TWCount[ID], 2)) + "\\n")\n'
b
diff -r 000000000000 -r 8a5a2abbb870 mutation_analysis.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mutation_analysis.r Mon Aug 29 05:36:10 2016 -0400
[
b'@@ -0,0 +1,477 @@\n+library(data.table)\r\n+library(ggplot2)\r\n+library(reshape2)\r\n+\r\n+args <- commandArgs(trailingOnly = TRUE)\r\n+\r\n+input = args[1]\r\n+genes = unlist(strsplit(args[2], ","))\r\n+outputdir = args[3]\r\n+include_fr1 = ifelse(args[4] == "yes", T, F)\r\n+setwd(outputdir)\r\n+\r\n+dat = read.table(input, header=T, sep="\\t", fill=T, stringsAsFactors=F)\r\n+\r\n+if(length(dat$Sequence.ID) == 0){\r\n+  setwd(outputdir)\r\n+  result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))\r\n+  row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")\r\n+  write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)\r\n+  transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))\r\n+  row.names(transitionTable) = c("A", "C", "G", "T")\r\n+  transitionTable["A","A"] = NA\r\n+  transitionTable["C","C"] = NA\r\n+  transitionTable["G","G"] = NA\r\n+  transitionTable["T","T"] = NA\r\n+  write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)\r\n+  cat("0", file="n.txt")\r\n+  stop("No data")\r\n+}\r\n+\r\n+cleanup_columns = c("FR1.IMGT.c.a",\r\n+                    "FR2.IMGT.g.t",\r\n+                    "CDR1.IMGT.Nb.of.nucleotides",\r\n+                    "CDR2.IMGT.t.a",\r\n+                    "FR1.IMGT.c.g",\r\n+                    "CDR1.IMGT.c.t",\r\n+                    "FR2.IMGT.a.c",\r\n+                    "FR2.IMGT.Nb.of.mutations",\r\n+                    "FR2.IMGT.g.c",\r\n+                    "FR2.IMGT.a.g",\r\n+                    "FR3.IMGT.t.a",\r\n+                    "FR3.IMGT.t.c",\r\n+                    "FR2.IMGT.g.a",\r\n+                    "FR3.IMGT.c.g",\r\n+                    "FR1.IMGT.Nb.of.mutations",\r\n+                    "CDR1.IMGT.g.a",\r\n+                    "CDR1.IMGT.t.g",\r\n+                    "CDR1.IMGT.g.c",\r\n+                    
"CDR2.IMGT.Nb.of.nucleotides",\r\n+                    "FR2.IMGT.a.t",\r\n+                    "CDR1.IMGT.Nb.of.mutations",\r\n+                    "CDR3.IMGT.Nb.of.nucleotides",\r\n+                    "CDR1.IMGT.a.g",\r\n+                    "FR3.IMGT.a.c",\r\n+                    "FR1.IMGT.g.a",\r\n+                    "FR3.IMGT.a.g",\r\n+                    "FR1.IMGT.a.t",\r\n+                    "CDR2.IMGT.a.g",\r\n+                    "CDR2.IMGT.Nb.of.mutations",\r\n+                    "CDR2.IMGT.g.t",\r\n+                    "CDR2.IMGT.a.c",\r\n+                    "CDR1.IMGT.t.c",\r\n+                    "FR3.IMGT.g.c",\r\n+                    "FR1.IMGT.g.t",\r\n+                    "FR3.IMGT.g.t",\r\n+                    "CDR1.IMGT.a.t",\r\n+                    "FR1.IMGT.a.g",\r\n+                    "FR3.IMGT.a.t",\r\n+                    "FR3.IMGT.Nb.of.nucleotides",\r\n+                    "FR2.IMGT.t.c",\r\n+                    "CDR2.IMGT.g.a",\r\n+                    "FR2.IMGT.t.a",\r\n+                    "CDR1.IMGT.t.a",\r\n+                    "FR2.IMGT.t.g",\r\n+                    "FR3.IMGT.t.g",\r\n+                    "FR2.IMGT.Nb.of.nucleotides",\r\n+                    "FR1.IMGT.t.a",\r\n+                    "FR1.IMGT.t.g",\r\n+                    "FR3.IMGT.c.t",\r\n+                    "FR1.IMGT.t.c",\r\n+                    "CDR2.IMGT.a.t",\r\n+                    "FR2.IMGT.c.t",\r\n+                    "CDR1.IMGT.g.t",\r\n+                    "CDR2.IMGT.t.g",\r\n+                    "FR1.IMGT.Nb.of.nucleotides",\r\n+                    "CDR1.IMGT.c.g",\r\n+                    "CDR2.IMGT.t.c",\r\n+                    "FR3.IMGT.g.a",\r\n+                    "CDR1.IMGT.a.c",\r\n+                    "FR2.IMGT.c.a",\r\n+                    "FR3.IMGT.Nb.of.mutations",\r\n+                    "FR2.IMGT.c.g",\r\n+                    "CDR2.IMGT.g.c",\r\n+                    "FR1.IMGT.g.c",\r\n+                    "CDR2.IMGT.c.t",\r\n+              
      "FR3.IMGT.c.a",\r\n+                    "CDR1.IMGT.c.a",\r\n+                    "CDR2.IMGT.c.g",\r\n+                    "CDR2.IMGT.c.a",\r\n+                    "FR1.IMGT.c.t",\r\n+                    "FR1'..b'ble[,1] = as.character(new.table[,1])\r\n+new.table[2,1] = "Median of Number of Mutations (%)"\r\n+\r\n+#sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),]\r\n+\r\n+write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F)\r\n+\r\n+\r\n+print("Plotting ca piechart")\r\n+\r\n+dat = dat[!grepl("^unmatched", dat$best_match),]\r\n+\r\n+#blegh\r\n+genesForPlot = dat[grepl("ca", dat$best_match),]$best_match\r\n+if(length(genesForPlot) > 0){\r\n+\tgenesForPlot = data.frame(table(genesForPlot))\r\n+\tcolnames(genesForPlot) = c("Gene","Freq")\r\n+\tgenesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)\r\n+\r\n+\tpc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))\r\n+\tpc = pc + geom_bar(width = 1, stat = "identity")\r\n+\tpc = pc + coord_polar(theta="y")\r\n+\tpc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA subclasses", "( n =", sum(genesForPlot$Freq), ")"))\r\n+\twrite.table(genesForPlot, "ca.txt", sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+\tpng(filename="ca.png")\r\n+\tprint(pc)\r\n+\tdev.off()\r\n+}\r\n+\r\n+print("Plotting cg piechart")\r\n+\r\n+genesForPlot = dat[grepl("cg", dat$best_match),]$best_match\r\n+if(length(genesForPlot) > 0){\r\n+\tgenesForPlot = data.frame(table(genesForPlot))\r\n+\tcolnames(genesForPlot) = c("Gene","Freq")\r\n+\tgenesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)\r\n+\r\n+\tpc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))\r\n+\tpc = pc + geom_bar(width = 1, stat = 
"identity")\r\n+\tpc = pc + coord_polar(theta="y")\r\n+\tpc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG subclasses", "( n =", sum(genesForPlot$Freq), ")"))\r\n+\twrite.table(genesForPlot, "cg.txt", sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+\tpng(filename="cg.png")\r\n+\tprint(pc)\r\n+\tdev.off()\r\n+}\r\n+\r\n+\r\n+print("Plotting scatterplot")\r\n+\r\n+dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)\r\n+\r\n+p = ggplot(dat, aes(best_match, percentage_mutations))\r\n+p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA)\r\n+p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot")\r\n+\r\n+png(filename="scatter.png")\r\n+print(p)\r\n+dev.off()\r\n+\r\n+write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+write.table(dat, input, sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+\r\n+print("Plotting frequency ranges plot")\r\n+\r\n+dat$best_match_class = substr(dat$best_match, 0, 2)\r\n+freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")\r\n+dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)\r\n+\r\n+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])\r\n+\r\n+p = ggplot(frequency_bins_data, aes(frequency_bins, frequency_count))\r\n+p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge")\r\n+p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class")\r\n+\r\n+png(filename="frequency_ranges.png")\r\n+print(p)\r\n+dev.off()\r\n+\r\n+frequency_bins_data_by_class = frequency_bins_data\r\n+\r\n+write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", 
sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "frequency_bins")])\r\n+\r\n+write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+\r\n+#frequency_bins_data_by_class\r\n+#frequency_ranges_subclasses.txt\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n'
b
diff -r 000000000000 -r 8a5a2abbb870 mutation_analysis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mutation_analysis.xml Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,101 @@
+<tool id="mutation_analysis_shm" name="Mutation Analysis" version="1.0">
+ <description></description>
+ <command interpreter="bash">
+ wrapper.sh $in_file $method $out_file $out_file.files_path ${in_file.name} ${include_fr1} $functionality $unique $naive_output_ca $naive_output_cg $naive_output_cm $filter_uniques $class_filter $empty_region_filter
+ </command>
+ <inputs>
+ <param name="in_file" type="data" label="IMGT zip file to be analysed" />
+ <param name="method" type="select" label="Method of identification of C region" help="" >
+ <option value="custom" selected="true">custom</option>
+ <option value="blastn">blastn</option>
+ </param>
+ <param name="include_fr1" type="select" label="Include mutations in FR1 region" help="" >
+ <option value="yes">yes</option>
+ <option value="no" selected="true">no</option>
+ </param>
+ <param name="functionality" type="select" label="Functionality filter" help="" >
+ <option value="productive" selected="true">Productive: Keep "productive" and "productive (see comment)"</option>
+ <option value="unproductive">Unproductive: Keep "unproductive" and "unproductive (see comment)"</option>
+ <option value="remove_unknown">Remove "unknown" and "unknown (see comment)"</option>
+ <option value="dont_filter">Don't filter</option>
+ </param>
+ <param name="filter_uniques" type="select" label="Filter unique sequences" help="See below for an example.">
+ <option value="remove">Remove uniques (Based on nucleotide sequence + C)</option>
+ <option value="keep">Keep uniques (Based on nucleotide sequence + C)</option>
+ <option value="no" selected="true">No</option>
+ </param>
+ <!-- NOTE(review): two options below are both marked selected="true"
+      ("VGene,AA.JUNCTION,best_match" and "VGene,CDR3.IMGT.seq,best_match").
+      A Galaxy select param should have a single default; confirm which one
+      is intended and drop the other selected attribute. -->
+ <param name="unique" type="select" label="Remove duplicates based on" help="" >
+ <option value="VGene,AA.JUNCTION,best_match" selected="true">Top.V.Gene, CDR3.AA.Seq, C region</option>
+ <option value="VGene,AA.JUNCTION">Top.V.Gene, CDR3.AA.Seq</option>
+ <option value="AA.JUNCTION,best_match">CDR3.AA.Seq, C region</option>
+ <option value="AA.JUNCTION">CDR3.AA.Seq</option>
+
+ <option value="VGene,CDR3.IMGT.seq,best_match" selected="true">Top.V.Gene, CDR3.nt.Seq, C region</option>
+ <option value="VGene,CDR3.IMGT.seq">Top.V.Gene, CDR3.nt.Seq</option>
+ <option value="CDR3.IMGT.seq,best_match">CDR3.nt.Seq, C region</option>
+ <option value="CDR3.IMGT.seq">CDR3.nt.Seq</option>
+ <option value="Sequence.ID">Don't remove duplicates</option>
+ </param>
+ <param name="class_filter" type="select" label="Class/Subclass filter" help="" >
+ <option value="70_70" selected="true">>70% class and >70% subclass</option>
+ <option value="60_55">>60% class and >55% subclass</option>
+ <option value="70_0">>70% class</option>
+ <option value="60_0">>60% class</option>
+ </param>
+ <param name="empty_region_filter" type="select" label="Sequence starts at" help="" >
+ <option value="FR1" selected="true">FR1: exclude empty CDR1,FR2,CDR2,FR3</option>
+ <option value="CDR1">CDR1: exclude empty FR2,CDR2,FR3</option>
+ <option value="FR2">FR2: exclude empty CDR2,FR3</option>
+ </param>
+ <conditional name="naive_output_cond">
+ <param name="naive_output" type="select" label="Output new IMGT archives per class into your history?">
+ <option value="yes">Yes</option>
+ <option value="no" selected="true">No</option>
+ </param>
+ </conditional>
+ </inputs>
+ <outputs>
+ <data format="html" name="out_file" label = "Mutation analysis on ${in_file.name}"/>
+ <data format="imgt_archive" name="naive_output_ca" label = "Naive CA input data from ${in_file.name}" >
+     <filter>naive_output_cond['naive_output'] == "yes"</filter>
+ </data>
+ <data format="imgt_archive" name="naive_output_cg" label = "Naive CG input data from ${in_file.name}" >
+     <filter>naive_output_cond['naive_output'] == "yes"</filter>
+ </data>
+ <data format="imgt_archive" name="naive_output_cm" label = "Naive CM input data from ${in_file.name}" >
+     <filter>naive_output_cond['naive_output'] == "yes"</filter>
+ </data>
+ </outputs>
+ <citations>
+ <citation type="doi">10.1093/nar/gks457</citation>
+ <citation type="doi">10.1093/bioinformatics/btv359</citation>
+ <citation type="doi">10.1186/1471-2105-10-421</citation>
+ </citations>
+ <help>
+ Takes an IMGT zip (http://www.imgt.org/HighV-QUEST/search.action) file and creates a summarization of the mutation analysis.  
+
+ +--------------------------+
+ |       unique filter      |
+ +--------+--------+--------+
+ | values | remove | keep   |
+ +--------+--------+--------+
+ |   A    |   A    |   A    |
+ +--------+--------+--------+
+ |   A    |   B    |   B    |
+ +--------+--------+--------+
+ |   B    |   D    |   C    |
+ +--------+--------+--------+
+ |   B    |        |   D    |
+ +--------+--------+--------+
+ |   C    |        |        |
+ +--------+--------+--------+
+ |   D    |        |        |
+ +--------+--------+--------+
+ |   D    |        |        |
+ +--------+--------+--------+
+
+ </help>
+ <requirements>
+    <requirement type="package" version="1.0">blastn</requirement>
+ </requirements>
+</tool>
b
diff -r 000000000000 -r 8a5a2abbb870 naive_output.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/naive_output.r Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,45 @@
+# naive_output.r
+# Splits a merged dataset into per-subclass tables (ca/cg/cm) so each can be
+# written out as a separate "naive" input file for downstream tools.
+#
+# Usage: Rscript naive_output.r <naive.file> <shm.file> <out.ca> <out.cg> <out.cm>
+args <- commandArgs(trailingOnly = TRUE)
+
+naive.file = args[1]
+shm.file = args[2]
+output.file.ca = args[3]
+output.file.cg = args[4]
+output.file.cm = args[5]
+
+naive = read.table(naive.file, sep="\t", header=T, quote="", fill=T)
+shm.merge = read.table(shm.file, sep="\t", header=T, quote="", fill=T)
+
+
+# Attach the subclass call (best_match) from the SHM table to each naive row,
+# joining naive$ID against shm.merge$Sequence.ID (inner join: unmatched IDs drop out).
+final = merge(naive, shm.merge[,c("Sequence.ID", "best_match")], by.x="ID", by.y="Sequence.ID")
+print(paste("nrow final:", nrow(final)))
+names(final)[names(final) == "best_match"] = "Sample"
+# Replace NA with 0 in every numeric column; non-numeric columns are untouched.
+final.numeric = final[,sapply(final, is.numeric)]
+final.numeric[is.na(final.numeric)] = 0
+final[,sapply(final, is.numeric)] = final.numeric
+
+# Partition by subclass prefix of the Sample column.
+final.ca = final[grepl("^ca", final$Sample),]
+final.cg = final[grepl("^cg", final$Sample),]
+final.cm = final[grepl("^cm", final$Sample),]
+
+# Add a constant Replicate column only when the subset is non-empty
+# (assigning a column to a 0-row data.frame would error).
+if(nrow(final.ca) > 0){
+ final.ca$Replicate = 1
+}
+
+if(nrow(final.cg) > 0){
+ final.cg$Replicate = 1
+}
+
+if(nrow(final.cm) > 0){
+ final.cm$Replicate = 1
+}
+
+#print(paste("nrow final:", nrow(final)))
+#final2 = final
+#final2$Sample = gsub("[0-9]", "", final2$Sample)
+#final = rbind(final, final2)
+#final$Replicate = 1
+
+# One tab-separated output file per subclass (written even if empty).
+write.table(final.ca, output.file.ca, quote=F, sep="\t", row.names=F, col.names=T)
+write.table(final.cg, output.file.cg, quote=F, sep="\t", row.names=F, col.names=T)
+write.table(final.cm, output.file.cm, quote=F, sep="\t", row.names=F, col.names=T)
+
b
diff -r 000000000000 -r 8a5a2abbb870 new_imgt.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/new_imgt.r Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,27 @@
+# new_imgt.r
+# Filters every IMGT "*.txt" table in a directory IN PLACE, keeping only rows
+# whose "Sequence ID" appears in the merged dataset (optionally restricted to
+# one gene/subclass).
+#
+# Usage: Rscript new_imgt.r <imgt.dir> <merged.file> <gene | "-">
+args <- commandArgs(trailingOnly = TRUE)
+
+imgt.dir = args[1]
+merged.file = args[2]
+gene = args[3]
+
+merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F)
+
+# "-" means no gene filter; otherwise keep rows whose best_match contains the
+# gene string (substring match, not anchored).
+if(gene != "-"){
+ merged = merged[grepl(gene, merged$best_match),]
+}
+
+# Always drop sequences whose subclass could not be determined.
+merged = merged[!grepl("unmatched", merged$best_match),]
+
+# NOTE(review): paste(imgt.dir, f, sep="") assumes imgt.dir already ends with
+# a path separator -- confirm against the caller.
+for(f in list.files(imgt.dir, pattern="*.txt$")){
+ #print(paste("filtering", f))
+ path = paste(imgt.dir, f, sep="")
+ dat = read.table(path, header=T, sep="\t", fill=T, quote="", stringsAsFactors=F, check.names=FALSE)
+
+ dat = dat[dat[,"Sequence ID"] %in% merged$Sequence.ID,]
+
+ if(nrow(dat) > 0 & grepl("^8_", f)){ #change the FR1 columns to 0 in the "8_..." file
+ dat[,grepl("^FR1", names(dat))] = 0
+ }
+
+ # Overwrites the original IMGT file with the filtered rows.
+ write.table(dat, path, quote=F, sep="\t", row.names=F, col.names=T, na="")
+}
b
diff -r 000000000000 -r 8a5a2abbb870 pattern_plots.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pattern_plots.r Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,139 @@
+# pattern_plots.r
+# Reads the SHM overview summary table ("data_sum.txt") and produces three
+# bar plots (PNG) plus their underlying data (TXT):
+#   plot1 - % of mutations in RGYW/WRCY vs TW/WA hotspot motifs, per class
+#   plot2 - relative share of A/T vs G/C transition/transversion mutations
+#   plot3 - mutation % normalised by nucleotide counts, per class
+library(ggplot2)
+library(reshape2)
+library(scales)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+input.file = args[1] #the data that's get turned into the "SHM overview" table in the html report "data_sum.txt"
+
+plot1.path = args[2]
+plot1.png = paste(plot1.path, ".png", sep="")
+plot1.txt = paste(plot1.path, ".txt", sep="")
+
+plot2.path = args[3]
+plot2.png = paste(plot2.path, ".png", sep="")
+plot2.txt = paste(plot2.path, ".txt", sep="")
+
+plot3.path = args[4]
+plot3.png = paste(plot3.path, ".png", sep="")
+plot3.txt = paste(plot3.path, ".txt", sep="")
+
+# Headerless CSV; first column becomes row names. Row order is assumed to
+# match the fixed layout used by the positional indexing below (rows 5:8,
+# 13:16, 17:20) -- TODO confirm against the producer of data_sum.txt.
+dat = read.table(input.file, header=F, sep=",", quote="", stringsAsFactors=F, fill=T, row.names=1)
+
+
+
+# Columns come in (x, y, z) triplets per class, plus "un" and "all".
+classes = c("ca", "ca1", "ca2", "cg", "cg1", "cg2", "cg3", "cg4", "cm")
+xyz = c("x", "y", "z")
+new.names = c(paste(rep(classes, each=3), xyz, sep="."), paste("un", xyz, sep="."), paste("all", xyz, sep="."))
+
+names(dat) = new.names
+
+# Rows 13/14 are assumed to be RGYW and WRCY counts, rows 15/16 TW and WA.
+dat["RGYW.WRCY",] = colSums(dat[c(13,14),])
+dat["TW.WA",] = colSums(dat[c(15,16),])
+
+# ---- plot 1: hotspot motif percentages ----
+data1 = dat[c("RGYW.WRCY", "TW.WA"),]
+
+# NOTE(review): pattern ".z" has an unescaped dot (matches any char + "z");
+# with the names generated above only the ".z" columns match, so behavior is
+# as intended here.
+data1 = data1[,names(data1)[grepl(".z", names(data1))]]
+names(data1) = gsub("\\..*", "", names(data1))
+
+data1 = melt(t(data1))
+
+names(data1) = c("Class", "Type", "value")
+
+write.table(data1, plot1.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
+
+p = ggplot(data1, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge") + ylab("% of mutations") + guides(fill=guide_legend(title=NULL))
+png(filename=plot1.png)
+print(p)
+dev.off()
+
+# ---- plot 2: mutation-type proportions (rows 5:8 assumed to hold the
+# G/C transition, G/C transversion and A/T counts) ----
+data2 = dat[5:8,]
+
+data2["sum",] = colSums(data2)
+
+data2 = data2[,names(data2)[grepl("\\.x", names(data2))]]
+names(data2) = gsub(".x", "", names(data2))
+
+data2["A/T",] = round(colSums(data2[3:4,]) / data2["sum",] * 100, 1)
+# Classes with a zero total produce NaN (0/0); reset those to 0.
+data2["A/T",is.nan(unlist(data2["A/T",]))] = 0
+
+data2["G/C transversions",] = round(data2[2,] / data2["sum",] * 100, 1)
+data2["G/C transitions",] = round(data2[1,] / data2["sum",] * 100, 1)
+
+
+data2["G/C transversions",is.nan(unlist(data2["G/C transversions",]))] = 0
+data2["G/C transversions",is.infinite(unlist(data2["G/C transversions",]))] = 0
+data2["G/C transitions",is.nan(unlist(data2["G/C transitions",]))] = 0
+data2["G/C transitions",is.infinite(unlist(data2["G/C transitions",]))] = 0
+
+# Keep only the three derived percentage rows (rows 6:8 after the appends).
+data2 = melt(t(data2[6:8,]))
+
+names(data2) = c("Class", "Type", "value")
+
+write.table(data2, plot2.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
+
+p = ggplot(data2, aes(x=Class, y=value, fill=Type)) + geom_bar(position="fill", stat="identity") + scale_y_continuous(labels=percent_format()) + guides(fill=guide_legend(title=NULL)) + ylab("% of mutations")
+png(filename=plot2.png)
+print(p)
+dev.off()
+
+# ---- plot 3: mutation counts normalised by nucleotide totals
+# (rows 17:20 assumed to be the per-base nucleotide counts) ----
+data3 = dat[c(5, 6, 8, 17:20),]
+data3 = data3[,names(data3)[grepl("\\.x", names(data3))]]
+names(data3) = gsub(".x", "", names(data3))
+
+data3["G/C transitions",] = round(data3[1,] / (data3[5,] + data3[7,]) * 100, 1)
+
+data3["G/C transversions",] = round(data3[2,] / (data3[5,] + data3[7,]) * 100, 1)
+
+data3["A/T",] = round(data3[3,] / (data3[4,] + data3[6,]) * 100, 1)
+
+# As above: zero denominators yield NaN/Inf; reset to 0 for plotting.
+data3["G/C transitions",is.nan(unlist(data3["G/C transitions",]))] = 0
+data3["G/C transitions",is.infinite(unlist(data3["G/C transitions",]))] = 0
+
+data3["G/C transversions",is.nan(unlist(data3["G/C transversions",]))] = 0
+data3["G/C transversions",is.infinite(unlist(data3["G/C transversions",]))] = 0
+
+data3["A/T",is.nan(unlist(data3["A/T",]))] = 0
+data3["A/T",is.infinite(unlist(data3["A/T",]))] = 0
+
+data3 = melt(t(data3[8:10,]))
+names(data3) = c("Class", "Type", "value")
+
+write.table(data3, plot3.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
+
+p = ggplot(data3, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge") + ylab("% of nucleotides") + guides(fill=guide_legend(title=NULL))
+png(filename=plot3.png)
+print(p)
+dev.off()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
b
diff -r 000000000000 -r 8a5a2abbb870 sequence_overview.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sequence_overview.r Mon Aug 29 05:36:10 2016 -0400
[
b'@@ -0,0 +1,315 @@\n+library(reshape2)\n+\n+args <- commandArgs(trailingOnly = TRUE)\n+\n+before.unique.file = args[1]\n+merged.file = args[2]\n+outputdir = args[3]\n+gene.classes = unlist(strsplit(args[4], ","))\n+hotspot.analysis.sum.file = args[5]\n+NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")\n+NTsum.file = paste(outputdir, "ntsum.txt", sep="/")\n+main.html = "index.html"\n+\n+setwd(outputdir)\n+\n+before.unique = read.table(before.unique.file, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\n+merged = read.table(merged.file, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\n+hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="")\n+\n+#before.unique = before.unique[!grepl("unmatched", before.unique$best_match),]\n+\n+before.unique$seq_conc = paste(before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)\n+\n+IDs = before.unique[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")]\n+IDs$best_match = as.character(IDs$best_match)\n+\n+#dat = data.frame(data.table(dat)[, list(freq=.N), by=c("best_match", "seq_conc")])\n+\n+dat = data.frame(table(before.unique$seq_conc))\n+#dat = data.frame(table(merged$seq_conc, merged$Functionality))\n+\n+#dat = dat[dat$Freq > 1,]\n+\n+#names(dat) = c("seq_conc", "Functionality", "Freq")\n+names(dat) = c("seq_conc", "Freq")\n+\n+dat$seq_conc = factor(dat$seq_conc)\n+\n+dat = dat[order(as.character(dat$seq_conc)),]\n+\n+#writing html from R...\n+get.bg.color = function(val){\n+\tif(val %in% c("TRUE", "FALSE", "T", "F")){ #if its a logical value, give the background a green/red color\n+\t\treturn(ifelse(val,"#eafaf1","#f9ebea"))\n+\t} else if (!is.na(as.numeric(val))) { #if its a numerical value, give it a grey tint if its >0\n+\t\treturn(ifelse(val > 0,"#eaecee","white"))\n+\t} else {\n+\t\treturn("white")\n+\t}\n+}\n+td = function(val) 
{\n+  return(paste("<td bgcolor=\'", get.bg.color(val), "\'>", val, "</td>", sep=""))\n+}\n+tr = function(val) { \n+\treturn(paste(c("<tr>", sapply(val, td), "</tr>"), collapse="")) \n+}\n+\n+make.link = function(id, clss, val) { \n+\tpaste("<a href=\'", clss, "_", id, ".html\'>", val, "</a>", sep="") \n+}\n+tbl = function(df) {\n+\tres = "<table border=\'1\'>"\n+\tfor(i in 1:nrow(df)){ \n+\t\tres = paste(res, tr(df[i,]), sep="")\n+\t}\n+\tres = paste(res, "</table>")\n+}\n+\n+cat("<table border=\'1\'>", file=main.html, append=F)\n+cat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)\n+cat("<tr>", file=main.html, append=T)\n+cat("<th>Sequence</th><th>Functionality</th><th>ca1</th><th>ca2</th><th>cg1</th><th>cg2</th><th>cg3</th><th>cg4</th><th>cm</th><th>un</th>", file=main.html, append=T)\n+cat("<th>total CA</th><th>total CG</th><th>number of subclasses</th><th>present in both Ca and Cg</th><th>Ca1+Ca2</th>", file=main.html, append=T)\n+cat("<th>Cg1+Cg2</th><th>Cg1+Cg3</th><th>Cg1+Cg4</th><th>Cg2+Cg3</th><th>Cg2+Cg4</th><th>Cg3+Cg4</th>", file=main.html, append=T)\n+cat("<th>Cg1+Cg2+Cg3</th><th>Cg2+Cg3+Cg4</th><th>Cg1+Cg2+Cg4</th><th>Cg1+Cg3+Cg4</th><th>Cg1+Cg2+Cg3+Cg4</th>", file=main.html, append=T)\n+cat("</tr>", file=main.html, append=T)\n+\n+\n+\n+single.sequences=0 #sequence only found once, skipped\n+in.multiple=0 #same sequence across multiple subclasses\n+multiple.in.one=0 #same sequence multiple times in one subclass\n+unmatched=0 #all of the sequences are unmatched\n+some.unmatched=0 #one or more sequences in a clone are unmatched\n+matched=0 #should be the same als matched sequences\n+\n+sequence.id.page="by_id.html"\n+\n+for(i in 1:nrow(dat)){\n+\t\n+\tca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca1", IDs$best_match),]\n+\tca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca2", IDs$best_match),]\n+\t\n+\tcg1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg1", 
IDs$best_match),]\n+\tcg2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg2", IDs$best_match),]\n+\tcg3 = IDs[IDs$seq_conc == dat[i,c("'..b'd, "cg2", cg2.n)\n+\tcg3.html = make.link(id, "cg3", cg3.n)\n+\tcg4.html = make.link(id, "cg4", cg4.n)\n+\t\n+\tcm.html = make.link(id, "cm", cm.n)\n+\t\n+\tun.html = make.link(id, "un", un.n)\n+\t\n+\t#extra columns\n+\tca.n = ca1.n + ca2.n\n+\t\n+\tcg.n = cg1.n + cg2.n + cg3.n + cg4.n\n+\t\n+\t#in.classes\n+\t\n+\tin.ca.cg = (ca.n > 0 & cg.n > 0)\n+\t\n+\tin.ca1.ca2 = (ca1.n > 0 & ca2.n > 0)\n+\t\n+\tin.cg1.cg2 = (cg1.n > 0 & cg2.n > 0)\n+\tin.cg1.cg3 = (cg1.n > 0 & cg3.n > 0)\n+\tin.cg1.cg4 = (cg1.n > 0 & cg4.n > 0)\n+\tin.cg2.cg3 = (cg2.n > 0 & cg3.n > 0)\n+\tin.cg2.cg4 = (cg2.n > 0 & cg4.n > 0)\n+\tin.cg3.cg4 = (cg3.n > 0 & cg4.n > 0)\n+\t\n+\tin.cg1.cg2.cg3 = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0)\n+\tin.cg2.cg3.cg4 = (cg2.n > 0 & cg3.n > 0 & cg4.n > 0)\n+\tin.cg1.cg2.cg4 = (cg1.n > 0 & cg2.n > 0 & cg4.n > 0)\n+\tin.cg1.cg3.cg4 = (cg1.n > 0 & cg3.n > 0 & cg4.n > 0)\n+\t\n+\tin.cg.all = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0 & cg4.n > 0)\n+\t\n+\t\n+\t\n+\t\n+\t#rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html)\n+\trw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html)\n+\trw = c(rw, ca.n, cg.n, in.classes, in.ca.cg, in.ca1.ca2, in.cg1.cg2, in.cg1.cg3, in.cg1.cg4, in.cg2.cg3, in.cg2.cg4, in.cg3.cg4, in.cg1.cg2.cg3, in.cg2.cg3.cg4, in.cg1.cg2.cg4, in.cg1.cg3.cg4, in.cg.all)\n+\n+\tcat(tr(rw), file=main.html, append=T)\n+\t\n+\t\n+\tfor(i in 1:nrow(allc)){ #generate html by id\n+\t\thtml = make.link(id, allc[i,"best_match"], allc[i,"Sequence.ID"])\n+\t\tcat(paste(html, "<br />"), file=sequence.id.page, append=T)\n+\t}\n+}\n+\n+cat("</table>", file=main.html, append=T)\n+\n+print(paste("Single sequences:", single.sequences))\n+print(paste("Sequences in multiple 
subclasses:", in.multiple))\n+print(paste("Multiple sequences in one subclass:", multiple.in.one))\n+print(paste("Matched with unmatched:", some.unmatched))\n+print(paste("Count that should match \'matched\' sequences:", matched))\n+\n+#ACGT overview\n+\n+NToverview = merged[!grepl("^unmatched", merged$best_match),]\n+\n+NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq, sep="_")\n+\n+NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))\n+NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))\n+NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))\n+NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))\n+\n+#Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))\n+\n+#NToverview = rbind(NToverview, NTsum)\n+\n+NTresult = data.frame(nt=c("A", "C", "T", "G"))\n+\n+for(clazz in gene.classes){\n+\tNToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),]\n+\tnew.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))\n+\tnew.col.y = sum(new.col.x)\n+\tnew.col.z = round(new.col.x / new.col.y * 100, 2)\n+\t\n+\ttmp = names(NTresult)\n+\tNTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))\n+\tnames(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))\n+}\n+\n+write.table(NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")], NToverview.file, quote=F, sep="\\t", row.names=F, col.names=T)\n+\n+NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]\n+\n+new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))\n+new.col.y = sum(new.col.x)\n+new.col.z = round(new.col.x / new.col.y * 100, 2)\n+\n+tmp = names(NTresult)\n+NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))\n+names(NTresult) = c(tmp, paste("all", c("x", 
"y", "z"), sep=""))\n+\n+names(hotspot.analysis.sum) = names(NTresult)\n+\n+hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)\n+\n+write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n'
b
diff -r 000000000000 -r 8a5a2abbb870 style.tar.gz
b
Binary file style.tar.gz has changed
b
diff -r 000000000000 -r 8a5a2abbb870 subclass_definition.db.nhr
b
Binary file subclass_definition.db.nhr has changed
b
diff -r 000000000000 -r 8a5a2abbb870 subclass_definition.db.nin
b
Binary file subclass_definition.db.nin has changed
b
diff -r 000000000000 -r 8a5a2abbb870 subclass_definition.db.nsq
b
Binary file subclass_definition.db.nsq has changed
b
diff -r 000000000000 -r 8a5a2abbb870 summary_to_fasta.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/summary_to_fasta.py Mon Aug 29 05:36:10 2016 -0400
[
@@ -0,0 +1,42 @@
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="The 1_Summary file of an IMGT zip file")
+parser.add_argument("--fasta", help="The output fasta file")
+
+args = parser.parse_args()
+
+infile = args.input
+fasta = args.fasta
+
+with open(infile, 'r') as i, open(fasta, 'w') as o:
+ first = True
+ id_col = 0
+ seq_col = 0
+ no_results = 0
+ no_seqs = 0
+ passed = 0
+ for line in i:
+ splt = line.split("\t")
+ if first:
+ id_col = splt.index("Sequence ID")
+ seq_col = splt.index("Sequence")
+ first = False
+ continue
+ if len(splt) < 5:
+ no_results += 1
+ continue
+
+ ID = splt[id_col]
+ seq = splt[seq_col]
+
+ if not len(seq) > 0:
+ no_seqs += 1
+ continue
+
+ o.write(">" + ID + "\n" + seq + "\n")
+ passed += 1
+
+ print "No results:", no_results
+ print "No sequences:", no_seqs
+ print "Written to fasta file:", passed
b
diff -r 000000000000 -r 8a5a2abbb870 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Mon Aug 29 05:36:10 2016 -0400
b
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<tool_dependency>
+ <set_environment version="1.0">            
+  </set_environment>
+    
+ <package name="blastn" version="1.0"> 
+        <install version="1.0">
+            <actions>
+ <action type="download_by_url">ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.30/ncbi-blast-2.2.30+-x64-linux.tar.gz</action>
+ <action type="move_file">
+ <source>bin/blastn</source>
+ <destination>$INSTALL_DIR</destination>
+ </action>
+ <action type="set_environment">
+ <environment_variable name="BLASTN_DIR" action="set_to">$INSTALL_DIR</environment_variable>
+ </action>
+            </actions>
+        </install>
+        <readme>
+ downloads blast (ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.30/ncbi-blast-2.2.30+-x64-linux.tar.gz) and keeps the blastn executable
+        </readme>
+    </package>
+</tool_dependency>
+
b
diff -r 000000000000 -r 8a5a2abbb870 wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/wrapper.sh Mon Aug 29 05:36:10 2016 -0400
[
b'@@ -0,0 +1,603 @@\n+#!/bin/bash\n+#set -e\n+dir="$(cd "$(dirname "$0")" && pwd)"\n+input=$1\n+method=$2\n+log=$3 #becomes the main html page at the end\n+outdir=$4\n+output="$outdir/index.html" #copied to $log location at the end\n+title=$5\n+include_fr1=$6\n+functionality=$7\n+unique=$8\n+naive_output_ca=$9\n+naive_output_cg=${10}\n+naive_output_cm=${11}\n+filter_unique=${12}\n+class_filter=${13}\n+empty_region_filter=${14}\n+mkdir $outdir\n+\n+tar -xzf $dir/style.tar.gz -C $outdir\n+\n+echo "---------------- read parameters ----------------"\n+echo "---------------- read parameters ----------------<br />" > $log\n+\n+echo "unpacking IMGT file"\n+\n+type="`file $input`"\n+if [[ "$type" == *"Zip archive"* ]] ; then\n+\techo "Zip archive"\n+\techo "unzip $input -d $PWD/files/"\n+\tunzip $input -d $PWD/files/\n+elif [[ "$type" == *"XZ compressed data"* ]] ; then\n+\techo "ZX archive"\n+\techo "tar -xJf $input -C $PWD/files/"\n+\tmkdir -p $PWD/files/$title\n+\ttar -xJf $input -C $PWD/files/$title\n+fi\n+\n+cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt\n+cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt\n+cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt\n+cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt\n+cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt\n+cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt\n+cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt\n+\n+if [[ ${#BLASTN_DIR} -ge 5 ]] ; then\n+\techo "On server, using BLASTN_DIR env: ${BLASTN_DIR}"\n+else\n+\tBLASTN_DIR="/home/galaxy/Downloads/ncbi-blast-2.4.0+/bin"\n+\techo "Dev Galaxy set BLASTN_DIR to: ${BLASTN_DIR}"\n+fi\n+\n+echo "---------------- identification ($method) ----------------"\n+echo "---------------- identification ($method) ----------------<br />" >> $log\n+\n+if [[ "${method}" == "custom" ]] ; then\n+\tpython $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt\n+else\n+\techo "---------------- 
summary_to_fasta.py ----------------"\n+\techo "---------------- summary_to_fasta.py ----------------<br />" >> $log\n+\t\n+\tpython $dir/summary_to_fasta.py --input $PWD/summary.txt --fasta $PWD/sequences.fasta\n+\t\n+\techo -e "qseqid\\tsseqid\\tpident\\tlength\\tmismatch\\tgapopen\\tqstart\\tqend\\tsstart\\tsend\\tevalue\\tbitscore" > $outdir/identified_genes.txt\n+\t${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt\n+fi\n+\n+echo "---------------- merge_and_filter.r ----------------"\n+echo "---------------- merge_and_filter.r ----------------<br />" >> $log\n+\n+Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} ${empty_region_filter} 2>&1\n+\n+echo "---------------- creating new IMGT zip ----------------"\n+echo "---------------- creating new IMGT zip ----------------<br />" >> $log\n+\n+mkdir $outdir/new_IMGT\n+\n+cat `find $PWD/files/ -name "1_*"` > "$outdir/new_IMGT/1_Summary.txt"\n+cat `find $PWD/files/ -name "2_*"` > "$outdir/new_IMGT/2_IMGT-gapped-nt-sequences.txt"\n+cat `find $PWD/files/ -name "3_*"` > "$outdir/new_IMGT/3_Nt-sequences.txt"\n+cat `find $PWD/files/ -name "4_*"` > "$outdir/new_IMGT/4_IMGT-gapped-AA-sequences.txt"\n+cat `find $PWD/files/ -name "5_*"` > "$outdir/new_IMGT/5_AA-sequences.txt"\n+cat `find $PWD/files/ -name "6_*"` > "$outdir/new_IMGT/6_Junction.txt"\n+cat `find $PWD/files/ -name "7_*"` > "$outdir/new_IMGT/7_V-REGION-mutation-and-AA-change-table.txt"\n+cat `find $PWD/files/ -name "8_*"` > "$outdir/new_IMGT/8_V-REGION-nt-mutation-statistics.txt"\n+cat `find $PWD/files/ -name "9_*"` > "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt"\n+cat `find $PWD/files/ -name "10_*"` > 
"$outdir/new_IMGT/10_V-REGION-mutation-hotspots.txt"\n+\n+mkdir $outdir/new_IMGT_ca\n+cp $outdir/new_IMGT/* $outdir/new_IMGT_ca\n+\n+mkdir $outdir/new_IMGT_ca'..b'ed_clones-ca.txt\'>Download</a></td></tr>" >> $output\n+echo "<tr><td>The Change-O DB defined clones summary file of ca</td><td><a href=\'change_o/change-o-defined_clones-summary-ca.txt\'>Download</a></td></tr>" >> $output\n+echo "<tr><td>The Change-O DB file with defined clones of cg</td><td><a href=\'change_o/change-o-db-defined_clones-cg.txt\'>Download</a></td></tr>" >> $output\n+echo "<tr><td>The Change-O DB defined clones summary file of cg</td><td><a href=\'change_o/change-o-defined_clones-summary-cg.txt\'>Download</a></td></tr>" >> $output\n+echo "<tr><td>The Change-O DB file with defined clones of cm</td><td><a href=\'change_o/change-o-db-defined_clones-cm.txt\'>Download</a></td></tr>" >> $output\n+echo "<tr><td>The Change-O DB defined clones summary file of cm</td><td><a href=\'change_o/change-o-defined_clones-summary-cm.txt\'>Download</a></td></tr>" >> $output\n+\n+echo "</table>" >> $output\n+\n+echo "</div>" >> $output #downloads tab end\n+\n+echo "</div>" >> $output #tabs end \n+\n+echo "</html>" >> $output\n+\n+echo "---------------- baseline ----------------"\n+echo "---------------- baseline ----------------<br />" >> $log\n+tmp="$PWD"\n+\n+mkdir $outdir/baseline\n+\n+\n+mkdir $outdir/baseline/ca_cg_cm\n+if [[ $(wc -l < $outdir/new_IMGT/1_Summary.txt) -gt "1" ]]; then\n+\tcd $outdir/baseline/ca_cg_cm\n+\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT.txz "ca_cg_cm" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt"\t\n+else\n+\techo "No sequences" > "$outdir/baseline.txt"\t\n+fi\n+\n+mkdir $outdir/baseline/ca\n+if [[ $(wc -l < $outdir/new_IMGT_ca/1_Summary.txt) -gt "1" ]]; then\n+\tcd $outdir/baseline/ca\n+\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" 
$outdir/new_IMGT_ca.txz "ca" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_ca.pdf" "Sequence.ID" "$outdir/baseline_ca.txt"\n+else\n+\techo "No ca sequences" > "$outdir/baseline_ca.txt"\t\n+fi\n+\n+mkdir $outdir/baseline/cg\n+if [[ $(wc -l < $outdir/new_IMGT_cg/1_Summary.txt) -gt "1" ]]; then\n+\tcd $outdir/baseline/cg\n+\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cg.txz "cg" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cg.pdf" "Sequence.ID" "$outdir/baseline_cg.txt"\n+else\n+\techo "No cg sequences" > "$outdir/baseline_cg.txt"\t\n+fi\n+\n+mkdir $outdir/baseline/cm\n+if [[ $(wc -l < $outdir/new_IMGT_cm/1_Summary.txt) -gt "1" ]]; then\n+\tcd $outdir/baseline/cm\n+\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cm.txz "cm" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cm.pdf" "Sequence.ID" "$outdir/baseline_cm.txt"\n+else\n+\techo "No cm sequences" > "$outdir/baseline_cm.txt"\t\n+fi\n+\n+cd $tmp\n+\n+echo "---------------- naive_output.r ----------------"\n+echo "---------------- naive_output.r ----------------<br />" >> $log\n+\n+if [[ "$naive_output" != "None" ]]\n+then\n+\tcp $outdir/new_IMGT_ca.txz ${naive_output_ca}\n+\tcp $outdir/new_IMGT_cg.txz ${naive_output_cg}\n+\tcp $outdir/new_IMGT_cm.txz ${naive_output_cm}\n+fi\n+\n+echo "</table>" >> $outdir/base_overview.html\n+\n+mv $log $outdir/log.html\n+\n+echo "<html><center><h1><a href=\'index.html\'>Click here for the results</a></h1>Tip: Open it in a new tab (middle mouse button or right mouse button -> \'open in new tab\' on the link above)<br />" > $log\n+echo "<table border = 1>" >> $log\n+echo "<thead><tr><th>Info</th><th>Sequences</th><th>Percentage</th></tr></thead>" >> $log\n+tIFS="$TMP"\n+IFS=$\'\\t\'\n+while read step seq perc\n+\tdo\n+\t\techo "<tr>" >> $log\n+\t\techo "<td>$step</td>" >> $log\n+\t\techo "<td>$seq</td>" >> $log\n+\t\techo 
"<td>${perc}%</td>" >> $log\n+\t\techo "</tr>" >> $log\n+done < $outdir/filtering_steps.txt\n+echo "</table border></center></html>" >> $log\n+\n+IFS="$tIFS"\n+\n+\n+echo "---------------- Done! ----------------"\n+echo "---------------- Done! ----------------<br />" >> $outdir/log.html\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n'