# HG changeset patch
# User davidvanzessen
# Date 1472463370 14400
# Node ID 8a5a2abbb870aff9f04a95f7cfbdcbd794b6e074
# Uploaded
# diff -r 000000000000 -r 8a5a2abbb870 aa_histogram.r
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/aa_histogram.r Mon Aug 29 05:36:10 2016 -0400
# @@ -0,0 +1,61 @@

# aa_histogram.r
#
# Plots, per gene prefix, the per-position amino-acid mutation frequency
# (column sums of the mutation table divided by column sums of the
# "absent aa" table) and writes the plot plus four tab-separated tables.
#
# Command line arguments (positional):
#   1  path to the tab-separated per-sequence mutation table
#   2  path to the tab-separated per-sequence "absent aa" table
#   3  comma-separated gene prefixes matched against the `best_match` column
#   4  output directory
#
# Both tables must have a `best_match` column. The first 2 columns of the
# mutation table and the first 4 columns of the absent table are excluded
# from the column sums.

library(ggplot2)

args <- commandArgs(trailingOnly = TRUE)

# Fail early with a readable message instead of a cryptic read.table(NA) error.
if (length(args) < 4) {
  stop(
    "expected 4 arguments: <mutations.by.id file> <absent.aa.by.id file> ",
    "<gene1,gene2,...> <outdir>",
    call. = FALSE
  )
}

mutations.by.id.file <- args[1]
absent.aa.by.id.file <- args[2]
# The trailing "" entry stands for "all genes" (every row not marked unmatched).
genes <- c(strsplit(args[3], ",")[[1]], "")
outdir <- args[4]

print("---------------- read input ----------------")

mutations.by.id <- read.table(mutations.by.id.file, sep = "\t", fill = TRUE,
                              header = TRUE, quote = "")
absent.aa.by.id <- read.table(absent.aa.by.id.file, sep = "\t", fill = TRUE,
                              header = TRUE, quote = "")

# Region annotations drawn below the bars: one segment and one label each.
# Coordinates are the values used by the original five annotate() statements.
regions <- data.frame(
  label  = c("FR1", "CDR1", "FR2", "CDR2", "FR3"),
  start  = c(0.5, 26.5, 38.5, 55.5, 65.5),
  end    = c(26.5, 38.5, 55.5, 65.5, 104.5),
  line.y = c(-0.05, -0.07, -0.05, -0.07, -0.05),
  text.x = c(13, 32.5, 47, 60.5, 85),
  text.y = c(-0.1, -0.15, -0.1, -0.15, -0.1),
  colour = c("darkgreen", "darkblue", "darkgreen", "darkblue", "darkgreen"),
  stringsAsFactors = FALSE
)

for (gene in genes) {

  if (gene == "") {
    keep.mut <- !grepl("unmatched", mutations.by.id$best_match)
    keep.abs <- !grepl("unmatched", absent.aa.by.id$best_match)
  } else {
    # `gene` is used as a regular-expression prefix, as in the original.
    gene.pattern <- paste0("^", gene)
    keep.mut <- grepl(gene.pattern, mutations.by.id$best_match)
    keep.abs <- grepl(gene.pattern, absent.aa.by.id$best_match)
  }
  mutations.by.id.gene <- mutations.by.id[keep.mut, ]
  absent.aa.by.id.gene <- absent.aa.by.id[keep.abs, ]

  print(paste("nrow", gene, nrow(absent.aa.by.id.gene)))
  if (nrow(mutations.by.id.gene) == 0) {
    next
  }

  mutations.at.position <- colSums(mutations.by.id.gene[, -c(1, 2)])
  aa.at.position <- colSums(absent.aa.by.id.gene[, -c(1, 2, 3, 4)])

  dat_freq <- mutations.at.position / aa.at.position
  # 0/0 gives NaN and x/0 gives Inf; neither is a plottable frequency, so both
  # are reported as 0 (the original only caught the NaN/NA case).
  dat_freq[!is.finite(dat_freq)] <- 0
  dat_dt <- data.frame(i = seq_along(dat_freq), freq = dat_freq)

  print("---------------- plot ----------------")

  m <- ggplot(dat_dt, aes(x = i, y = freq)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    geom_bar(stat = "identity", colour = "black", fill = "darkgrey",
             alpha = 0.8) +
    scale_x_continuous(breaks = dat_dt$i, labels = dat_dt$i)

  for (r in seq_len(nrow(regions))) {
    m <- m +
      annotate("segment", x = regions$start[r], y = regions$line.y[r],
               xend = regions$end[r], yend = regions$line.y[r],
               colour = regions$colour[r], size = 1) +
      annotate("text", x = regions$text.x[r], y = regions$text.y[r],
               label = regions$label[r])
  }

  m <- m +
    expand_limits(y = c(-0.1, 1)) +
    xlab("AA position") +
    ylab("Frequency") +
    ggtitle(paste(gene, "AA mutation frequency"))

  print("---------------- write/print ----------------")

  png(filename = paste0(outdir, "/aa_histogram_", gene, ".png"),
      width = 1280, height = 720)
  # Close the device even if rendering fails, so later iterations do not draw
  # into a leaked device.
  tryCatch(print(m), finally = dev.off())

  dat.sums <- data.frame(
    index = seq_along(mutations.at.position),
    mutations.at.position = mutations.at.position,
    aa.at.position = aa.at.position
  )

  write.table(dat.sums,
              paste0(outdir, "/aa_histogram_sum_", gene, ".txt"),
              sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
  write.table(mutations.by.id.gene,
              paste0(outdir, "/aa_histogram_count_", gene, ".txt"),
              sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
  write.table(absent.aa.by.id.gene,
              paste0(outdir, "/aa_histogram_absent_", gene, ".txt"),
              sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
  write.table(dat_dt,
              paste0(outdir, "/aa_histogram_", gene, ".txt"),
              sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
}

# diff -r 000000000000 -r 8a5a2abbb870 baseline/Baseline_Functions.r
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/baseline/Baseline_Functions.r Mon Aug 29 05:36:10 2016 -0400
# @@ -0,0 +1,2287 @@
#########################################################################################
# License Agreement
#
# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE
# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER
# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE
# OR COPYRIGHT LAW IS PROHIBITED.
#
# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE
# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED
# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN
# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
#
# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences
# Coded by: Mohamed Uduman & Gur Yaari
# Copyright 2012 Kleinstein Lab
# Version: 1.3 (01/23/2014)
#########################################################################################

# Global variables

FILTER_BY_MUTATIONS <- 1000

# Nucleotides
NUCLEOTIDES <- c("A", "C", "G", "T")

# Amino Acids: one-letter amino acid ("*" = stop) for each of the 64 codons,
# stored as a character vector named by codon so AMINO_ACIDS["TTT"] gives "F".
# (The original followed this with `names(AMINO_ACIDS) <- names(AMINO_ACIDS)`,
# a self-assignment with no effect; it has been dropped.)
AMINO_ACIDS <- c(
  TTT = "F", TTC = "F", TTA = "L", TTG = "L",
  TCT = "S", TCC = "S", TCA = "S", TCG = "S",
  TAT = "Y", TAC = "Y", TAA = "*", TAG = "*",
  TGT = "C", TGC = "C", TGA = "*", TGG = "W",
  CTT = "L", CTC = "L", CTA = "L", CTG = "L",
  CCT = "P", CCC = "P", CCA = "P", CCG = "P",
  CAT = "H", CAC = "H", CAA = "Q", CAG = "Q",
  CGT = "R", CGC = "R", CGA = "R", CGG = "R",
  ATT = "I", ATC = "I", ATA = "I", ATG = "M",
  ACT = "T", ACC = "T", ACA = "T", ACG = "T",
  AAT = "N", AAC = "N", AAA = "K", AAG = "K",
  AGT = "S", AGC = "S", AGA = "R", AGG = "R",
  GTT = "V", GTC = "V", GTA = "V", GTG = "V",
  GCT = "A", GCC = "A", GCA = "A", GCG = "A",
  GAT = "D", GAC = "D", GAA = "E", GAG = "E",
  GGT = "G", GGC = "G", GGA = "G", GGG = "G"
)

# Amino Acid Traits
# "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V" "W" "Y"
# B = "Hydrophobic/Burried"  N = "Intermediate/Neutral"  S = "Hydrophilic/Surface"
TRAITS_AMINO_ACIDS_CHOTHIA98 <- c(
  "*", "N", "B", "S", "S", "B", "N", "N", "B", "S", "B",
  "B", "S", "N", "S", "S", "N", "N", "B", "B", "N"
)
# The trait vector is positional: it lines up with the sorted distinct amino
# acid symbols.
# NOTE(review): this relies on sort() placing "*" before the letters, as in the
# legend above -- confirm for the collation locale in use.
names(TRAITS_AMINO_ACIDS_CHOTHIA98) <- sort(unique(AMINO_ACIDS))
# Placeholder with 21 NA entries; overwritten via `<<-` by initializeTraitChange().
TRAITS_AMINO_ACIDS <- array(NA, 21)

# Codon Table: 12 rows x 64 columns placeholder, all NA.
CODON_TABLE <- as.data.frame(matrix(NA, ncol = 64, nrow = 12))

# Substitution Model: Smith DS et al. 1996
# Each model is a 4 x 4 matrix filled row-wise, with NUCLEOTIDES as both row
# and column names and a zero diagonal.
substitution_Literature_Mouse <- matrix(
  c(0, 0.156222928, 0.601501588, 0.242275484,
    0.172506739, 0, 0.241239892, 0.586253369,
    0.54636291, 0.255795364, 0, 0.197841727,
    0.290240811, 0.467680608, 0.24207858, 0),
  nrow = 4, byrow = TRUE, dimnames = list(NUCLEOTIDES, NUCLEOTIDES)
)
substitution_Flu_Human <- matrix(
  c(0, 0.2795596, 0.5026927, 0.2177477,
    0.1693210, 0, 0.3264723, 0.5042067,
    0.4983549, 0.3328321, 0, 0.1688130,
    0.2021079, 0.4696077, 0.3282844, 0),
  4, 4, byrow = TRUE, dimnames = list(NUCLEOTIDES, NUCLEOTIDES)
)
substitution_Flu25_Human <- matrix(
  c(0, 0.2580641, 0.5163685, 0.2255674,
    0.1541125, 0, 0.3210224, 0.5248651,
    0.5239281, 0.3101292, 0, 0.1659427,
    0.1997207, 0.4579444, 0.3423350, 0),
  4, 4, byrow = TRUE, dimnames = list(NUCLEOTIDES, NUCLEOTIDES)
)
# Loaded from the working directory.
# NOTE(review): presumably defines FiveS_Substitution, which
# initializeSubstitutionMatrix() reads -- confirm against the .RData contents.
load("FiveS_Substitution.RData")

# Mutability Models: Shapiro GS et al.
# 2002 (completes the citation in the preceding comment: the tri-nucleotide
# mutability models below are attributed to Shapiro GS et al. 2002)

# 64 x 3 matrices filled row-wise (192 values each). Row names
# (triMutability_Names) and column names 1:3 are attached in
# initializeMutabilityMatrix().
triMutability_Literature_Human <- matrix(
  c(0.24, 1.2, 0.96, 0.43, 2.14, 2, 1.11, 1.9, 0.85, 1.83, 2.36, 1.31,
    0.82, 0.52, 0.89, 1.33, 1.4, 0.82, 1.83, 0.73, 1.83, 1.62, 1.53, 0.57,
    0.92, 0.42, 0.42, 1.47, 3.44, 2.58, 1.18, 0.47, 0.39, 1.12, 1.8, 0.68,
    0.47, 2.19, 2.35, 2.19, 1.05, 1.84, 1.26, 0.28, 0.98, 2.37, 0.66, 1.58,
    0.67, 0.92, 1.76, 0.83, 0.97, 0.56, 0.75, 0.62, 2.26, 0.62, 0.74, 1.11,
    1.16, 0.61, 0.88, 0.67, 0.37, 0.07, 1.08, 0.46, 0.31, 0.94, 0.62, 0.57,
    0.29, NA, 1.44, 0.46, 0.69, 0.57, 0.24, 0.37, 1.1, 0.99, 1.39, 0.6,
    2.26, 1.24, 1.36, 0.52, 0.33, 0.26, 1.25, 0.37, 0.58, 1.03, 1.2, 0.34,
    0.49, 0.33, 2.62, 0.16, 0.4, 0.16, 0.35, 0.75, 1.85, 0.94, 1.61, 0.85,
    2.09, 1.39, 0.3, 0.52, 1.33, 0.29, 0.51, 0.26, 0.51, 3.83, 2.01, 0.71,
    0.58, 0.62, 1.07, 0.28, 1.2, 0.74, 0.25, 0.59, 1.09, 0.91, 1.36, 0.45,
    2.89, 1.27, 3.7, 0.69, 0.28, 0.41, 1.17, 0.56, 0.93, 3.41, 1, 1,
    NA, 5.9, 0.74, 2.51, 2.24, 2.24, 1.95, 3.32, 2.34, 1.3, 2.3, 1,
    0.66, 0.73, 0.93, 0.41, 0.65, 0.89, 0.65, 0.32, NA, 0.43, 0.85, 0.43,
    0.31, 0.31, 0.23, 0.29, 0.57, 0.71, 0.48, 0.44, 0.76, 0.51, 1.7, 0.85,
    0.74, 2.23, 2.08, 1.16, 0.51, 0.51, 1, 0.5, NA, NA, 0.71, 2.14),
  nrow = 64, byrow = TRUE
)
triMutability_Literature_Mouse <- matrix(
  c(1.31, 1.35, 1.42, 1.18, 2.02, 2.02, 1.02, 1.61, 1.99, 1.42, 2.01, 1.03,
    2.02, 0.97, 0.53, 0.71, 1.19, 0.83, 0.96, 0.96, 0, 1.7, 2.22, 0.59,
    1.24, 1.07, 0.51, 1.68, 3.36, 3.36, 1.14, 0.29, 0.33, 0.9, 1.11, 0.63,
    1.08, 2.07, 2.27, 1.74, 0.22, 1.19, 2.37, 1.15, 1.15, 1.56, 0.81, 0.34,
    0.87, 0.79, 2.13, 0.49, 0.85, 0.97, 0.36, 0.82, 0.66, 0.63, 1.15, 0.94,
    0.85, 0.25, 0.93, 1.19, 0.4, 0.2, 0.44, 0.44, 0.88, 1.06, 0.77, 0.39,
    0, 0, 0, 0, 0, 0, 0.43, 0.43, 0.86, 0.59, 0.59, 0,
    1.18, 0.86, 2.9, 1.66, 0.4, 0.2, 1.54, 0.43, 0.69, 1.71, 0.68, 0.55,
    0.91, 0.7, 1.71, 0.09, 0.27, 0.63, 0.2, 0.45, 1.01, 1.63, 0.96, 1.48,
    2.18, 1.2, 1.31, 0.66, 2.13, 0.49, 0, 0, 0, 2.97, 2.8, 0.79,
    0.4, 0.5, 0.4, 0.11, 1.68, 0.42, 0.13, 0.44, 0.93, 0.71, 1.11, 1.19,
    2.71, 1.08, 3.43, 0.4, 0.67, 0.47, 1.02, 0.14, 1.56, 1.98, 0.53, 0.33,
    0.63, 2.06, 1.77, 1.46, 3.74, 2.93, 2.1, 2.18, 0.78, 0.73, 2.93, 0.63,
    0.57, 0.17, 0.85, 0.52, 0.31, 0.31, 0, 0, 0.51, 0.29, 0.83, 0.54,
    0.28, 0.47, 0.9, 0.99, 1.24, 2.47, 0.73, 0.23, 1.13, 0.24, 2.12, 0.24,
    0.33, 0.83, 1.41, 0.62, 0.28, 0.35, 0.77, 0.17, 0.72, 0.58, 0.45, 0.41),
  nrow = 64, byrow = TRUE
)
triMutability_Names <- c(
  "AAA", "AAC", "AAG", "AAT", "ACA", "ACC", "ACG", "ACT",
  "AGA", "AGC", "AGG", "AGT", "ATA", "ATC", "ATG", "ATT",
  "CAA", "CAC", "CAG", "CAT", "CCA", "CCC", "CCG", "CCT",
  "CGA", "CGC", "CGG", "CGT", "CTA", "CTC", "CTG", "CTT",
  "GAA", "GAC", "GAG", "GAT", "GCA", "GCC", "GCG", "GCT",
  "GGA", "GGC", "GGG", "GGT", "GTA", "GTC", "GTG", "GTT",
  "TAA", "TAC", "TAG", "TAT", "TCA", "TCC", "TCG", "TCT",
  "TGA", "TGC", "TGG", "TGT", "TTA", "TTC", "TTG", "TTT"
)
# Loaded from the working directory.
# NOTE(review): presumably defines FiveS_Mutability, which
# initializeMutabilityMatrix() reads -- confirm against the .RData contents.
load("FiveS_Mutability.RData")

# Functions

# Translate codon to amino acid.
#   Codon: character vector of codons used to index the codon-named vector
#          AMINO_ACIDS.
#   Returns the matching entries of AMINO_ACIDS (NA for names not present).
translateCodonToAminoAcid <- function(Codon) {
  return(AMINO_ACIDS[Codon])
}

# Translate amino acid to trait change.
#   AminoAcid: index (name or position) into the global TRAITS_AMINO_ACIDS.
#   Returns the matching entries of TRAITS_AMINO_ACIDS.
translateAminoAcidToTraitChange <- function(AminoAcid) {
  return(TRAITS_AMINO_ACIDS[AminoAcid])
}

# Initialize Amino Acid Trait Changes.
# Sets the global TRAITS_AMINO_ACIDS (via `<<-`) either from a tab-separated
# file or, when no file is given, from TRAITS_AMINO_ACIDS_CHOTHIA98.
#   traitChangeModel, species: accepted but not used by this function.
#   traitChangeFileName: optional path to a tab-separated file with a header.
# On a read error an "Error|..." line is printed and the R session is
# terminated with q().
initializeTraitChange <- function(traitChangeModel=1,species=1,traitChangeFileName=NULL){
  if (!is.null(traitChangeFileName)) {
    traitChange <- tryCatch(
      read.delim(traitChangeFileName, sep = "\t", header = TRUE),
      error = function(ex) {
        cat("Error|Error reading trait changes. Please check file name/path and format.\n")
        q()
      }
    )
  } else {
    traitChange <- TRAITS_AMINO_ACIDS_CHOTHIA98
  }
  TRAITS_AMINO_ACIDS <<- traitChange
}

# Read in formatted nucleotide substitution matrix.
#   substitutionModel: 0 = uniform (1/3 off-diagonal, 0 on the diagonal),
#                      1 = substitution_Literature_Mouse,
#                      2 = substitution_Flu_Human,
#                      3 = substitution_Flu25_Human,
#                      5 = FiveS_Substitution (returned unchanged).
#   species: accepted but not used by this function.
#   subsMatFileName: optional tab-separated file with a header; when given it
#                    replaces models 1-3 and its rows are rescaled to sum to 1
#                    unless all four already do.
# Returns FiveS_Substitution for model 5; otherwise a 6 x 4 numeric matrix
# with rows A, C, G, T, N, - (the N and - rows are NA) and columns A, C, G, T.
# On a read error an "Error|..." line is printed and the session is ended
# with q(). An unsupported model with no file raises an explicit error; the
# original failed with "object 'subsMat' not found".
initializeSubstitutionMatrix <- function(substitutionModel,species,subsMatFileName=NULL){
  subsMat <- NULL

  if (!is.null(subsMatFileName)) {
    subsMat <- tryCatch(
      read.delim(subsMatFileName, sep = "\t", header = TRUE),
      error = function(ex) {
        cat("Error|Error reading substitution matrix. Please check file name/path and format.\n")
        q()
      }
    )
    if (sum(apply(subsMat, 1, sum) == 1) != 4) {
      subsMat <- t(apply(subsMat, 1, function(x) x / sum(x)))
    }
  } else {
    if (substitutionModel == 1) subsMat <- substitution_Literature_Mouse
    if (substitutionModel == 2) subsMat <- substitution_Flu_Human
    if (substitutionModel == 3) subsMat <- substitution_Flu25_Human
  }

  # Model 0 overrides whatever was selected or read above.
  if (substitutionModel == 0) {
    subsMat <- matrix(1 / 3, 4, 4)
    diag(subsMat) <- 0
  }

  if (substitutionModel == 5) {
    return(FiveS_Substitution)
  }

  if (is.null(subsMat)) {
    stop("Unsupported substitutionModel '", substitutionModel,
         "' and no substitution matrix file was supplied.")
  }

  NUCLEOTIDESN <- c(NUCLEOTIDES, "N", "-")
  subsMat <- rbind(subsMat, rep(NA, 4), rep(NA, 4))
  return(matrix(data.matrix(subsMat), 6, 4,
                dimnames = list(NUCLEOTIDESN, NUCLEOTIDES)))
}


# Read in formatted Mutability file.
#   mutabilityModel: 0 = all-ones 64 x 3 matrix, 5 = FiveS_Mutability
#                    (returned unchanged); any other value uses the file or
#                    the built-in literature matrix.
#   species: 2 selects triMutability_Literature_Mouse; any other value keeps
#            triMutability_Literature_Human (only when no file is given).
#   mutabilityMatFileName: optional tab-separated file with a header.
# Returns FiveS_Mutability for model 5; otherwise a 64 x 3 numeric matrix with
# triMutability_Names as row names and 1:3 as column names.
# On a read error an "Error|..." line is printed and the session is ended
# with q().
initializeMutabilityMatrix <- function(mutabilityModel=1, species=1,mutabilityMatFileName=NULL){
  if (!is.null(mutabilityMatFileName)) {
    mutabilityMat <- tryCatch(
      read.delim(mutabilityMatFileName, sep = "\t", header = TRUE),
      error = function(ex) {
        cat("Error|Error reading mutability matrix. Please check file name/path and format.\n")
        q()
      }
    )
  } else {
    mutabilityMat <- triMutability_Literature_Human
    if (species == 2) mutabilityMat <- triMutability_Literature_Mouse
  }

  if (mutabilityModel == 0) {
    mutabilityMat <- matrix(1, 64, 3)
  }

  if (mutabilityModel == 5) {
    return(FiveS_Mutability)
  }
  return(matrix(data.matrix(mutabilityMat), 64, 3,
                dimnames = list(triMutability_Names, 1:3)))
}

# Read FASTA file formats
# Modified from read.fasta from the seqinR package
#
# Differences from a plain FASTA read that are visible in this code:
#   * lines starting with ">>>" are group headers; if the file has none, a
#     ">>>All sequences combined" header is prepended;
#   * a placeholder line "THIS IS A FAKE SEQUENCE" is inserted after every
#     group header, so each group header has a body line;
#   * any header whose next line is also a header is dropped;
#   * names keep the whole header text after the first character;
#   * for DNA, "." is replaced by "-" before the case conversion.
#
#   file:            path or connection passed to readLines().
#   seqtype:         "DNA" or "AA" (checked with match.arg).
#   as.string:       if FALSE, each sequence is split with s2c().
#   forceDNAtolower: lower-case (TRUE) or upper-case (FALSE) DNA sequences.
#   set.attributes:  attach name, Annot and class attributes to each sequence.
#   legacy.mode:     drop comment lines starting with ";".
#   seqonly:         return only the list of sequence strings.
#   strip.desc:      drop the leading character of the header in Annot.
#   sizeof.longlong, endian, apply.mask: accepted but not used here.
# Returns a list of sequences named by header text. Stops with
# "no line starting with a > character found" when no header remains.
baseline.read.fasta <-
function (file = system.file("sequences/sample.fasta", package = "seqinr"),
    seqtype = c("DNA", "AA"), as.string = FALSE, forceDNAtolower = TRUE,
    set.attributes = TRUE, legacy.mode = TRUE, seqonly = FALSE,
    strip.desc = FALSE, sizeof.longlong = .Machine$sizeof.longlong,
    endian = .Platform$endian, apply.mask = TRUE)
{
  seqtype <- match.arg(seqtype)

  lines <- readLines(file)

  if (legacy.mode) {
    comments <- grep("^;", lines)
    if (length(comments) > 0) {
      lines <- lines[-comments]
    }
  }

  ind_groups <- which(substr(lines, 1L, 3L) == ">>>")
  lines_mod <- lines

  if (!length(ind_groups)) {
    lines_mod <- c(">>>All sequences combined", lines)
  }

  ind_groups <- which(substr(lines_mod, 1L, 3L) == ">>>")

  # Make room for one placeholder line directly after every group header.
  # The k-th header shifts down by k-1 inserted lines, so its placeholder
  # lands at ind_groups[k] + k.
  lines <- array("BLA", dim = (length(ind_groups) + length(lines_mod)))
  id <- ind_groups + seq_along(ind_groups)
  lines[id] <- "THIS IS A FAKE SEQUENCE"
  lines[-id] <- lines_mod
  rm(lines_mod)

  ind <- which(substr(lines, 1L, 1L) == ">")
  nseq <- length(ind)
  if (nseq == 0) {
    stop("no line starting with a > character found")
  }
  start <- ind + 1
  end <- ind - 1

  # Repeatedly drop headers that are immediately followed by another header.
  while (any(ind %in% end)) {
    ind <- ind[-which(ind %in% end)]
    nseq <- length(ind)
    if (nseq == 0) {
      stop("no line starting with a > character found")
    }
    start <- ind + 1
    end <- ind - 1
  }

  end <- c(end[-1], length(lines))
  sequences <- lapply(seq_len(nseq), function(i) {
    paste(lines[start[i]:end[i]], collapse = "")
  })
  if (seqonly) {
    return(sequences)
  }
  nomseq <- lapply(seq_len(nseq), function(i) {
    #firstword <- strsplit(lines[ind[i]], " ")[[1]][1]
    substr(lines[ind[i]], 2, nchar(lines[ind[i]]))
  })
  if (seqtype == "DNA") {
    if (forceDNAtolower) {
      sequences <- as.list(tolower(chartr(".", "-", sequences)))
    } else {
      sequences <- as.list(toupper(chartr(".", "-", sequences)))
    }
  }
  if (as.string == FALSE) {
    sequences <- lapply(sequences, s2c)
  }
  if (set.attributes) {
    for (i in seq_len(nseq)) {
      Annot <- lines[ind[i]]
      if (strip.desc) {
        Annot <- substr(Annot, 2L, nchar(Annot))
      }
      attributes(sequences[[i]]) <- list(
        name = nomseq[[i]],
        Annot = Annot,
        class = switch(seqtype, AA = "SeqFastaAA", DNA = "SeqFastadna")
      )
    }
  }
  names(sequences) <- nomseq
  return(sequences)
}


# Replaces non FASTA characters in input files with N
# NOTE(review): the character class also contains [:punct:] twice, so
# punctuation is left untouched as well -- confirm this is intended. The
# pattern is reproduced unchanged.
replaceNonFASTAChars <-function(inSeq="ACGTN-AApA"){
  gsub('[^ACGTNacgt[:punct:]-[:punct:].]','N',inSeq,perl=TRUE)
}

# Find the germlines in the FASTA list.
# TRUE for IDs that start with exactly one ">".
# sapply() over an unnamed character vector names the result by the IDs
# themselves; expandTillNext() reads those names, so sapply() is kept
# deliberately.
germlinesInFile <- function(seqIDs){
  firstChar <- sapply(seqIDs, function(x) substr(x, 1, 1))
  secondChar <- sapply(seqIDs, function(x) substr(x, 2, 2))
  return(firstChar == ">" & secondChar != ">")
}

# Find the groups in the FASTA list.
# TRUE for IDs that start with ">>". The result is named by the IDs (see
# germlinesInFile).
groupsInFile <- function(seqIDs){
  sapply(seqIDs, function(x) substr(x, 1, 2)) == ">>"
}

# In the process of finding germlines/groups, expand from the start to end of
# the group.
#   vecPosToID: named logical vector (names = sequence IDs) marking the
#               entries that start a germline/group.
# Returns an integer vector with one entry per ID holding the running block
# number, named by the block's ID with every ">" removed. The first block
# always starts at position 1, even if the first marked entry is later.
# Built with rep(times = ) so the result is always a plain vector; the
# original unlist(sapply()) could carry a dim attribute when all blocks had
# the same length. Its initial all-NA `expandedID` was a dead store.
expandTillNext <- function(vecPosToID){
  IDs <- names(vecPosToID)
  posOfInterests <- which(vecPosToID)

  expandedIDNames <- gsub(">", "", IDs[posOfInterests])
  startIndexes <- c(1, posOfInterests[-1])
  stopIndexes <- c(posOfInterests[-1] - 1, length(IDs))
  blockLengths <- stopIndexes - startIndexes + 1
  blockNumbers <- seq_along(startIndexes)

  expandedID <- rep(blockNumbers, times = blockLengths)
  names(expandedID) <- rep(expandedIDNames[blockNumbers], times = blockLengths)
  return(expandedID)
}

# Process FASTA (list) to return a matrix[input, germline)
#   inputFASTA: named list of sequences; names carry the ">" / ">>" prefixes
#               tested by germlinesInFile() and groupsInFile().
# Returns a list with
#   matInput               character matrix, one row per input sequence,
#                          columns "Input" and "Germline";
#   germlines              germline block number per input sequence;
#   groups                 group block number per input sequence;
#   conservationDefinition entries pulled out as conservation rows, or NA.
# The statement order is kept as in the original. Only locals that were never
# read (numbSeqs recomputation aside: numbGermlines, numbGroups and the
# posGermlines = 2 placeholder) were removed.
processInputAdvanced <- function(inputFASTA){

  seqIDs <- names(inputFASTA)
  numbSeqs <- length(seqIDs)
  posGermlines1 <- germlinesInFile(seqIDs)
  posGroups1 <- groupsInFile(seqIDs)
  consDef <- NA

  glPositionsSum <- cumsum(posGermlines1)
  glPositions <- table(glPositionsSum)
  #Find the position of the conservation row
  consDefPos <- as.numeric(names(glPositions[names(glPositions)!=0 & glPositions==1]))+1
  if (length(consDefPos) > 0) {
    consDefID <- match(consDefPos, glPositionsSum)
    #The conservation rows need to be pulled out and stored separately
    consDef <- inputFASTA[consDefID]
    inputFASTA <- inputFASTA[-consDefID]

    # Recompute the per-ID flags for the reduced list.
    seqIDs <- names(inputFASTA)
    numbSeqs <- length(seqIDs)
    posGermlines1 <- germlinesInFile(seqIDs)
    posGroups1 <- groupsInFile(seqIDs)
  }

  posGroups <- expandTillNext(posGroups1)
  posGermlines <- expandTillNext(posGermlines1)
  posGermlines[posGroups1] = 0
  names(posGermlines)[posGroups1] = names(posGroups)[posGroups1]
  posInput <- rep(TRUE, numbSeqs)
  posInput[posGroups1 | posGermlines1] = FALSE

  matInput <- matrix(NA, nrow = sum(posInput), ncol = 2)
  rownames(matInput) = seqIDs[posInput]
  colnames(matInput) = c("Input","Germline")

  vecInputFASTA <- unlist(inputFASTA)
  matInput[,1] = vecInputFASTA[posInput]
  matInput[,2] = vecInputFASTA[ which( names(inputFASTA)%in%paste(">",names(posGermlines)[posInput],sep="") )[ posGermlines[posInput]] ]

  germlines <- posGermlines[posInput]
  groups <- posGroups[posInput]

  return( list("matInput"=matInput, "germlines"=germlines, "groups"=groups, "conservationDefinition"=consDef ))
}


# Replace leading and trailing dashes in the
sequence + replaceLeadingTrailingDashes <- function(x,readEnd){ + iiGap = unlist(gregexpr("-",x[1])) + ggGap = unlist(gregexpr("-",x[2])) + #posToChange = intersect(iiGap,ggGap) + + + seqIn = replaceLeadingTrailingDashesHelper(x[1]) + seqGL = replaceLeadingTrailingDashesHelper(x[2]) + seqTemplate = rep('N',readEnd) + seqIn <- c(seqIn,seqTemplate[(length(seqIn)+1):readEnd]) + seqGL <- c(seqGL,seqTemplate[(length(seqGL)+1):readEnd]) +# if(posToChange!=-1){ +# seqIn[posToChange] = "-" +# seqGL[posToChange] = "-" +# } + + seqIn = c2s(seqIn[1:readEnd]) + seqGL = c2s(seqGL[1:readEnd]) + + lenGL = nchar(seqGL) + if(lenGL seqLen ) + trimmedSeq = substr(seqToTrim,1, ( (getCodonPos(seqLen)[1])-1 ) ) + + return(trimmedSeq) + } + + # Given a nuclotide position, returns the pos of the 3 nucs that made the codon + # e.g. nuc 86 is part of nucs 85,86,87 + getCodonPos <- function(nucPos){ + codonNum = (ceiling(nucPos/3))*3 + return( (codonNum-2):codonNum) + } + + # Given a nuclotide position, returns the codon number + # e.g. 
nuc 86 = codon 29 + getCodonNumb <- function(nucPos){ + return( ceiling(nucPos/3) ) + } + + # Given a codon, returns all the nuc positions that make the codon + getCodonNucs <- function(codonNumb){ + getCodonPos(codonNumb*3) + } + + computeCodonTable <- function(testID=1){ + + if(testID<=4){ + # Pre-compute every codons + intCounter = 1 + for(pOne in NUCLEOTIDES){ + for(pTwo in NUCLEOTIDES){ + for(pThree in NUCLEOTIDES){ + codon = paste(pOne,pTwo,pThree,sep="") + colnames(CODON_TABLE)[intCounter] = codon + intCounter = intCounter + 1 + CODON_TABLE[,codon] = mutationTypeOptimized(cbind(permutateAllCodon(codon),rep(codon,12))) + } + } + } + chars = c("N","A","C","G","T", "-") + for(a in chars){ + for(b in chars){ + for(c in chars){ + if(a=="N" | b=="N" | c=="N"){ + #cat(paste(a,b,c),sep="","\n") + CODON_TABLE[,paste(a,b,c,sep="")] = rep(NA,12) + } + } + } + } + + chars = c("-","A","C","G","T") + for(a in chars){ + for(b in chars){ + for(c in chars){ + if(a=="-" | b=="-" | c=="-"){ + #cat(paste(a,b,c),sep="","\n") + CODON_TABLE[,paste(a,b,c,sep="")] = rep(NA,12) + } + } + } + } + CODON_TABLE <<- as.matrix(CODON_TABLE) + } + } + + collapseClone <- function(vecInputSeqs,glSeq,readEnd,nonTerminalOnly=0){ + #print(length(vecInputSeqs)) + vecInputSeqs = unique(vecInputSeqs) + if(length(vecInputSeqs)==1){ + return( list( c(vecInputSeqs,glSeq), F) ) + }else{ + charInputSeqs <- sapply(vecInputSeqs, function(x){ + s2c(x)[1:readEnd] + }) + charGLSeq <- s2c(glSeq) + matClone <- sapply(1:readEnd, function(i){ + posNucs = unique(charInputSeqs[i,]) + posGL = charGLSeq[i] + error = FALSE + if(posGL=="-" & sum(!(posNucs%in%c("-","N")))==0 ){ + return(c("-",error)) + } + if(length(posNucs)==1) + return(c(posNucs[1],error)) + else{ + if("N"%in%posNucs){ + error=TRUE + } + if(sum(!posNucs[posNucs!="N"]%in%posGL)==0){ + return( c(posGL,error) ) + }else{ + #return( c(sample(posNucs[posNucs!="N"],1),error) ) + if(nonTerminalOnly==0){ + return( 
c(sample(charInputSeqs[i,charInputSeqs[i,]!="N" & charInputSeqs[i,]!=posGL],1),error) ) + }else{ + posNucs = charInputSeqs[i,charInputSeqs[i,]!="N" & charInputSeqs[i,]!=posGL] + posNucsTable = table(posNucs) + if(sum(posNucsTable>1)==0){ + return( c(posGL,error) ) + }else{ + return( c(sample( posNucs[posNucs%in%names(posNucsTable)[posNucsTable>1]],1),error) ) + } + } + + } + } + }) + + + #print(length(vecInputSeqs)) + return(list(c(c2s(matClone[1,]),glSeq),"TRUE"%in%matClone[2,])) + } + } + + # Compute the expected for each sequence-germline pair + getExpectedIndividual <- function(matInput){ + if( any(grep("multicore",search())) ){ + facGL <- factor(matInput[,2]) + facLevels = levels(facGL) + LisGLs_MutabilityU = mclapply(1:length(facLevels), function(x){ + computeMutabilities(facLevels[x]) + }) + facIndex = match(facGL,facLevels) + + LisGLs_Mutability = mclapply(1:nrow(matInput), function(x){ + cInput = rep(NA,nchar(matInput[x,1])) + cInput[s2c(matInput[x,1])!="N"] = 1 + LisGLs_MutabilityU[[facIndex[x]]] * cInput + }) + + LisGLs_Targeting = mclapply(1:dim(matInput)[1], function(x){ + computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) + }) + + LisGLs_MutationTypes = mclapply(1:length(matInput[,2]),function(x){ + #print(x) + computeMutationTypes(matInput[x,2]) + }) + + LisGLs_Exp = mclapply(1:dim(matInput)[1], function(x){ + computeExpected(LisGLs_Targeting[[x]],LisGLs_MutationTypes[[x]]) + }) + + ul_LisGLs_Exp = unlist(LisGLs_Exp) + return(matrix(ul_LisGLs_Exp,ncol=4,nrow=(length(ul_LisGLs_Exp)/4),byrow=T)) + }else{ + facGL <- factor(matInput[,2]) + facLevels = levels(facGL) + LisGLs_MutabilityU = lapply(1:length(facLevels), function(x){ + computeMutabilities(facLevels[x]) + }) + facIndex = match(facGL,facLevels) + + LisGLs_Mutability = lapply(1:nrow(matInput), function(x){ + cInput = rep(NA,nchar(matInput[x,1])) + cInput[s2c(matInput[x,1])!="N"] = 1 + LisGLs_MutabilityU[[facIndex[x]]] * cInput + }) + + LisGLs_Targeting = lapply(1:dim(matInput)[1], 
function(x){ + computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) + }) + + LisGLs_MutationTypes = lapply(1:length(matInput[,2]),function(x){ + #print(x) + computeMutationTypes(matInput[x,2]) + }) + + LisGLs_Exp = lapply(1:dim(matInput)[1], function(x){ + computeExpected(LisGLs_Targeting[[x]],LisGLs_MutationTypes[[x]]) + }) + + ul_LisGLs_Exp = unlist(LisGLs_Exp) + return(matrix(ul_LisGLs_Exp,ncol=4,nrow=(length(ul_LisGLs_Exp)/4),byrow=T)) + + } + } + + # Compute mutabilities of sequence based on the tri-nucleotide model + computeMutabilities <- function(paramSeq){ + seqLen = nchar(paramSeq) + seqMutabilites = rep(NA,seqLen) + + gaplessSeq = gsub("-", "", paramSeq) + gaplessSeqLen = nchar(gaplessSeq) + gaplessSeqMutabilites = rep(NA,gaplessSeqLen) + + if(mutabilityModel!=5){ + pos<- 3:(gaplessSeqLen) + subSeq = substr(rep(gaplessSeq,gaplessSeqLen-2),(pos-2),(pos+2)) + gaplessSeqMutabilites[pos] = + tapply( c( + getMutability( substr(subSeq,1,3), 3) , + getMutability( substr(subSeq,2,4), 2), + getMutability( substr(subSeq,3,5), 1) + ),rep(1:(gaplessSeqLen-2),3),mean,na.rm=TRUE + ) + #Pos 1 + subSeq = substr(gaplessSeq,1,3) + gaplessSeqMutabilites[1] = getMutability(subSeq , 1) + #Pos 2 + subSeq = substr(gaplessSeq,1,4) + gaplessSeqMutabilites[2] = mean( c( + getMutability( substr(subSeq,1,3), 2) , + getMutability( substr(subSeq,2,4), 1) + ),na.rm=T + ) + seqMutabilites[which(s2c(paramSeq)!="-")]<- gaplessSeqMutabilites + return(seqMutabilites) + }else{ + + pos<- 3:(gaplessSeqLen) + subSeq = substr(rep(gaplessSeq,gaplessSeqLen-2),(pos-2),(pos+2)) + gaplessSeqMutabilites[pos] = sapply(subSeq,function(x){ getMutability5(x) }, simplify=T) + seqMutabilites[which(s2c(paramSeq)!="-")]<- gaplessSeqMutabilites + return(seqMutabilites) + } + + } + + # Returns the mutability of a triplet at a given position + getMutability <- function(codon, pos=1:3){ + triplets <- rownames(mutability) + mutability[ match(codon,triplets) ,pos] + } + + getMutability5 <- function(fivemer){ + 
return(mutability[fivemer]) + } + + # Returns the substitution probabilty + getTransistionProb <- function(nuc){ + substitution[nuc,] + } + + getTransistionProb5 <- function(fivemer){ + if(any(which(fivemer==colnames(substitution)))){ + return(substitution[,fivemer]) + }else{ + return(array(NA,4)) + } + } + + # Given a nuc, returns the other 3 nucs it can mutate to + canMutateTo <- function(nuc){ + NUCLEOTIDES[- which(NUCLEOTIDES==nuc)] + } + + # Given a nucleotide, returns the probabilty of other nucleotide it can mutate to + canMutateToProb <- function(nuc){ + substitution[nuc,canMutateTo(nuc)] + } + + # Compute targeting, based on precomputed mutatbility & substitution + computeTargeting <- function(param_strSeq,param_vecMutabilities){ + + if(substitutionModel!=5){ + vecSeq = s2c(param_strSeq) + matTargeting = sapply( 1:length(vecSeq), function(x) { param_vecMutabilities[x] * getTransistionProb(vecSeq[x]) } ) + #matTargeting = apply( rbind(vecSeq,param_vecMutabilities),2, function(x) { as.vector(as.numeric(x[2]) * getTransistionProb(x[1])) } ) + dimnames( matTargeting ) = list(NUCLEOTIDES,1:(length(vecSeq))) + return (matTargeting) + }else{ + + seqLen = nchar(param_strSeq) + seqsubstitution = matrix(NA,ncol=seqLen,nrow=4) + paramSeq <- param_strSeq + gaplessSeq = gsub("-", "", paramSeq) + gaplessSeqLen = nchar(gaplessSeq) + gaplessSeqSubstitution = matrix(NA,ncol=gaplessSeqLen,nrow=4) + + pos<- 3:(gaplessSeqLen) + subSeq = substr(rep(gaplessSeq,gaplessSeqLen-2),(pos-2),(pos+2)) + gaplessSeqSubstitution[,pos] = sapply(subSeq,function(x){ getTransistionProb5(x) }, simplify=T) + seqsubstitution[,which(s2c(paramSeq)!="-")]<- gaplessSeqSubstitution + #matTargeting <- param_vecMutabilities %*% seqsubstitution + matTargeting <- sweep(seqsubstitution,2,param_vecMutabilities,`*`) + dimnames( matTargeting ) = list(NUCLEOTIDES,1:(seqLen)) + return (matTargeting) + } + } + + # Compute the mutations types + computeMutationTypes <- function(param_strSeq){ + 
#cat(param_strSeq,"\n") + #vecSeq = trimToLastCodon(param_strSeq) + lenSeq = nchar(param_strSeq) + vecCodons = sapply({1:(lenSeq/3)}*3-2,function(x){substr(param_strSeq,x,x+2)}) + matMutationTypes = matrix( unlist(CODON_TABLE[,vecCodons]) ,ncol=lenSeq,nrow=4, byrow=F) + dimnames( matMutationTypes ) = list(NUCLEOTIDES,1:(ncol(matMutationTypes))) + return(matMutationTypes) + } + computeMutationTypesFast <- function(param_strSeq){ + matMutationTypes = matrix( CODON_TABLE[,param_strSeq] ,ncol=3,nrow=4, byrow=F) + #dimnames( matMutationTypes ) = list(NUCLEOTIDES,1:(length(vecSeq))) + return(matMutationTypes) + } + mutationTypeOptimized <- function( matOfCodons ){ + apply( matOfCodons,1,function(x){ mutationType(x[2],x[1]) } ) + } + + # Returns a vector of codons 1 mutation away from the given codon + permutateAllCodon <- function(codon){ + cCodon = s2c(codon) + matCodons = t(array(cCodon,dim=c(3,12))) + matCodons[1:4,1] = NUCLEOTIDES + matCodons[5:8,2] = NUCLEOTIDES + matCodons[9:12,3] = NUCLEOTIDES + apply(matCodons,1,c2s) + } + + # Given two codons, tells you if the mutation is R or S (based on your definition) + mutationType <- function(codonFrom,codonTo){ + if(testID==4){ + if( is.na(codonFrom) | is.na(codonTo) | is.na(translateCodonToAminoAcid(codonFrom)) | is.na(translateCodonToAminoAcid(codonTo)) ){ + return(NA) + }else{ + mutationType = "S" + if( translateAminoAcidToTraitChange(translateCodonToAminoAcid(codonFrom)) != translateAminoAcidToTraitChange(translateCodonToAminoAcid(codonTo)) ){ + mutationType = "R" + } + if(translateCodonToAminoAcid(codonTo)=="*" | translateCodonToAminoAcid(codonFrom)=="*"){ + mutationType = "Stop" + } + return(mutationType) + } + }else if(testID==5){ + if( is.na(codonFrom) | is.na(codonTo) | is.na(translateCodonToAminoAcid(codonFrom)) | is.na(translateCodonToAminoAcid(codonTo)) ){ + return(NA) + }else{ + if(codonFrom==codonTo){ + mutationType = "S" + }else{ + codonFrom = s2c(codonFrom) + codonTo = s2c(codonTo) + mutationType = "Stop" 
+ nucOfI = codonFrom[which(codonTo!=codonFrom)] + if(nucOfI=="C"){ + mutationType = "R" + }else if(nucOfI=="G"){ + mutationType = "S" + } + } + return(mutationType) + } + }else{ + if( is.na(codonFrom) | is.na(codonTo) | is.na(translateCodonToAminoAcid(codonFrom)) | is.na(translateCodonToAminoAcid(codonTo)) ){ + return(NA) + }else{ + mutationType = "S" + if( translateCodonToAminoAcid(codonFrom) != translateCodonToAminoAcid(codonTo) ){ + mutationType = "R" + } + if(translateCodonToAminoAcid(codonTo)=="*" | translateCodonToAminoAcid(codonFrom)=="*"){ + mutationType = "Stop" + } + return(mutationType) + } + } + } + + + #given a mat of targeting & it's corresponding mutationtypes returns + #a vector of Exp_RCDR,Exp_SCDR,Exp_RFWR,Exp_RFWR + computeExpected <- function(paramTargeting,paramMutationTypes){ + # Replacements + RPos = which(paramMutationTypes=="R") + #FWR + Exp_R_FWR = sum(paramTargeting[ RPos[which(FWR_Nuc_Mat[RPos]==T)] ],na.rm=T) + #CDR + Exp_R_CDR = sum(paramTargeting[ RPos[which(CDR_Nuc_Mat[RPos]==T)] ],na.rm=T) + # Silents + SPos = which(paramMutationTypes=="S") + #FWR + Exp_S_FWR = sum(paramTargeting[ SPos[which(FWR_Nuc_Mat[SPos]==T)] ],na.rm=T) + #CDR + Exp_S_CDR = sum(paramTargeting[ SPos[which(CDR_Nuc_Mat[SPos]==T)] ],na.rm=T) + + return(c(Exp_R_CDR,Exp_S_CDR,Exp_R_FWR,Exp_S_FWR)) + } + + # Count the mutations in a sequence + # each mutation is treated independently + analyzeMutations2NucUri_website <- function( rev_in_matrix ){ + paramGL = rev_in_matrix[2,] + paramSeq = rev_in_matrix[1,] + + #Fill seq with GL seq if gapped + #if( any(paramSeq=="-") ){ + # gapPos_Seq = which(paramSeq=="-") + # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "-"] + # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] + #} + + + #if( any(paramSeq=="N") ){ + # gapPos_Seq = which(paramSeq=="N") + # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "N"] + # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] + #} + + 
analyzeMutations2NucUri( matrix(c( paramGL, paramSeq ),2,length(paramGL),byrow=T) ) + + } + + #1 = GL + #2 = Seq + analyzeMutations2NucUri <- function( in_matrix=matrix(c(c("A","A","A","C","C","C"),c("A","G","G","C","C","A")),2,6,byrow=T) ){ + paramGL = in_matrix[2,] + paramSeq = in_matrix[1,] + paramSeqUri = paramGL + #mutations = apply(rbind(paramGL,paramSeq), 2, function(x){!x[1]==x[2]}) + mutations_val = paramGL != paramSeq + if(any(mutations_val)){ + mutationPos = {1:length(mutations_val)}[mutations_val] + mutationPos = mutationPos[sapply(mutationPos, function(x){!any(paramSeq[getCodonPos(x)]=="N")})] + length_mutations =length(mutationPos) + mutationInfo = rep(NA,length_mutations) + if(any(mutationPos)){ + + pos<- mutationPos + pos_array<-array(sapply(pos,getCodonPos)) + codonGL = paramGL[pos_array] + + codonSeq = sapply(pos,function(x){ + seqP = paramGL[getCodonPos(x)] + muCodonPos = {x-1}%%3+1 + seqP[muCodonPos] = paramSeq[x] + return(seqP) + }) + GLcodons = apply(matrix(codonGL,length_mutations,3,byrow=TRUE),1,c2s) + Seqcodons = apply(codonSeq,2,c2s) + mutationInfo = apply(rbind(GLcodons , Seqcodons),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) + names(mutationInfo) = mutationPos + } + if(any(!is.na(mutationInfo))){ + return(mutationInfo[!is.na(mutationInfo)]) + }else{ + return(NA) + } + + + }else{ + return (NA) + } + } + + processNucMutations2 <- function(mu){ + if(!is.na(mu)){ + #R + if(any(mu=="R")){ + Rs = mu[mu=="R"] + nucNumbs = as.numeric(names(Rs)) + R_CDR = sum(as.integer(CDR_Nuc[nucNumbs]),na.rm=T) + R_FWR = sum(as.integer(FWR_Nuc[nucNumbs]),na.rm=T) + }else{ + R_CDR = 0 + R_FWR = 0 + } + + #S + if(any(mu=="S")){ + Ss = mu[mu=="S"] + nucNumbs = as.numeric(names(Ss)) + S_CDR = sum(as.integer(CDR_Nuc[nucNumbs]),na.rm=T) + S_FWR = sum(as.integer(FWR_Nuc[nucNumbs]),na.rm=T) + }else{ + S_CDR = 0 + S_FWR = 0 + } + + + retVec = c(R_CDR,S_CDR,R_FWR,S_FWR) + retVec[is.na(retVec)]=0 + return(retVec) + }else{ + return(rep(0,4)) + } + } + + + ## Z-score 
Test + computeZScore <- function(mat, test="Focused"){ + matRes <- matrix(NA,ncol=2,nrow=(nrow(mat))) + if(test=="Focused"){ + #Z_Focused_CDR + #P_Denom = sum( mat[1,c(5,6,8)], na.rm=T ) + P = apply(mat[,c(5,6,8)],1,function(x){(x[1]/sum(x))}) + R_mean = apply(cbind(mat[,c(1,2,4)],P),1,function(x){x[4]*(sum(x[1:3]))}) + R_sd=sqrt(R_mean*(1-P)) + matRes[,1] = (mat[,1]-R_mean)/R_sd + + #Z_Focused_FWR + #P_Denom = sum( mat[1,c(7,6,8)], na.rm=T ) + P = apply(mat[,c(7,6,8)],1,function(x){(x[1]/sum(x))}) + R_mean = apply(cbind(mat[,c(3,2,4)],P),1,function(x){x[4]*(sum(x[1:3]))}) + R_sd=sqrt(R_mean*(1-P)) + matRes[,2] = (mat[,3]-R_mean)/R_sd + } + + if(test=="Local"){ + #Z_Focused_CDR + #P_Denom = sum( mat[1,c(5,6,8)], na.rm=T ) + P = apply(mat[,c(5,6)],1,function(x){(x[1]/sum(x))}) + R_mean = apply(cbind(mat[,c(1,2)],P),1,function(x){x[3]*(sum(x[1:2]))}) + R_sd=sqrt(R_mean*(1-P)) + matRes[,1] = (mat[,1]-R_mean)/R_sd + + #Z_Focused_FWR + #P_Denom = sum( mat[1,c(7,6,8)], na.rm=T ) + P = apply(mat[,c(7,8)],1,function(x){(x[1]/sum(x))}) + R_mean = apply(cbind(mat[,c(3,4)],P),1,function(x){x[3]*(sum(x[1:2]))}) + R_sd=sqrt(R_mean*(1-P)) + matRes[,2] = (mat[,3]-R_mean)/R_sd + } + + if(test=="Imbalanced"){ + #Z_Focused_CDR + #P_Denom = sum( mat[1,c(5,6,8)], na.rm=T ) + P = apply(mat[,5:8],1,function(x){((x[1]+x[2])/sum(x))}) + R_mean = apply(cbind(mat[,1:4],P),1,function(x){x[5]*(sum(x[1:4]))}) + R_sd=sqrt(R_mean*(1-P)) + matRes[,1] = (mat[,1]-R_mean)/R_sd + + #Z_Focused_FWR + #P_Denom = sum( mat[1,c(7,6,8)], na.rm=T ) + P = apply(mat[,5:8],1,function(x){((x[3]+x[4])/sum(x))}) + R_mean = apply(cbind(mat[,1:4],P),1,function(x){x[5]*(sum(x[1:4]))}) + R_sd=sqrt(R_mean*(1-P)) + matRes[,2] = (mat[,3]-R_mean)/R_sd + } + + matRes[is.nan(matRes)] = NA + return(matRes) + } + + # Return a p-value for a z-score + z2p <- function(z){ + p=NA + if( !is.nan(z) && !is.na(z)){ + if(z>0){ + p = (1 - pnorm(z,0,1)) + } else if(z<0){ + p = (-1 * pnorm(z,0,1)) + } else{ + p = 0.5 + } + }else{ + p = 
NA + } + return(p) + } + + + ## Bayesian Test + + # Fitted parameter for the bayesian framework +BAYESIAN_FITTED<-c(0.407277142798302, 0.554007336744485, 0.63777155771234, 0.693989162719009, 0.735450014674917, 0.767972534429806, 0.794557287143399, 0.816906816601605, 0.83606796225341, 0.852729446430296, 0.867370424541641, 0.880339760590323, 0.891900995024999, 0.902259181289864, 0.911577919359,0.919990301665853, 0.927606458124537, 0.934518806350661, 0.940805863754375, 0.946534836475715, 0.951763691199255, 0.95654428191308, 0.960920179487397, 0.964930893680829, 0.968611312149038, 0.971992459313836, 0.975102110004818, 0.977964943023096, 0.980603428208439, 0.983037660179428, 0.985285800977406, 0.987364285326685, 0.989288037855441, 0.991070478823525, 0.992723699729969, 0.994259575477392, 0.995687688867975, 0.997017365051493, 0.998257085153047, 0.999414558305388, 1.00049681357804, 1.00151036237481, 1.00246080204981, 1.00335370751909, 1.0041939329768, 1.0049859393417, 1.00573382091263, 1.00644127217376, 1.00711179729107, 1.00774845526417, 1.00835412715854, 1.00893143010366, 1.00948275846309, 1.01001030293661, 1.01051606798079, 1.01100188771288, 1.01146944044216, 1.01192026195449, 1.01235575766094, 1.01277721370986) + CONST_i <- sort(c(((2^(seq(-39,0,length.out=201)))/2)[1:200],(c(0:11,13:99)+0.5)/100,1-(2^(seq(-39,0,length.out=201)))/2)) + + # Given x, M & p, returns a pdf + calculate_bayes <- function ( x=3, N=10, p=0.33, + i=CONST_i, + max_sigma=20,length_sigma=4001 + ){ + if(!0%in%N){ + G <- max(length(x),length(N),length(p)) + x=array(x,dim=G) + N=array(N,dim=G) + p=array(p,dim=G) + sigma_s<-seq(-max_sigma,max_sigma,length.out=length_sigma) + sigma_1<-log({i/{1-i}}/{p/{1-p}}) + index<-min(N,60) + y<-dbeta(i,x+BAYESIAN_FITTED[index],N+BAYESIAN_FITTED[index]-x)*(1-p)*p*exp(sigma_1)/({1-p}^2+2*p*{1-p}*exp(sigma_1)+{p^2}*exp(2*sigma_1)) + if(!sum(is.na(y))){ + tmp<-approx(sigma_1,y,sigma_s)$y + tmp/sum(tmp)/{2*max_sigma/{length_sigma-1}} + }else{ + return(NA) + } + }else{ 
+ return(NA) + } + } + # Given a mat of observed & expected, return a list of CDR & FWR pdf for selection + computeBayesianScore <- function(mat, test="Focused", max_sigma=20,length_sigma=4001){ + flagOneSeq = F + if(nrow(mat)==1){ + mat=rbind(mat,mat) + flagOneSeq = T + } + if(test=="Focused"){ + #CDR + P = c(apply(mat[,c(5,6,8)],1,function(x){(x[1]/sum(x))}),0.5) + N = c(apply(mat[,c(1,2,4)],1,function(x){(sum(x))}),0) + X = c(mat[,1],0) + bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) + bayesCDR = bayesCDR[-length(bayesCDR)] + + #FWR + P = c(apply(mat[,c(7,6,8)],1,function(x){(x[1]/sum(x))}),0.5) + N = c(apply(mat[,c(3,2,4)],1,function(x){(sum(x))}),0) + X = c(mat[,3],0) + bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) + bayesFWR = bayesFWR[-length(bayesFWR)] + } + + if(test=="Local"){ + #CDR + P = c(apply(mat[,c(5,6)],1,function(x){(x[1]/sum(x))}),0.5) + N = c(apply(mat[,c(1,2)],1,function(x){(sum(x))}),0) + X = c(mat[,1],0) + bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) + bayesCDR = bayesCDR[-length(bayesCDR)] + + #FWR + P = c(apply(mat[,c(7,8)],1,function(x){(x[1]/sum(x))}),0.5) + N = c(apply(mat[,c(3,4)],1,function(x){(sum(x))}),0) + X = c(mat[,3],0) + bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) + bayesFWR = bayesFWR[-length(bayesFWR)] + } + + if(test=="Imbalanced"){ + #CDR + P = c(apply(mat[,c(5:8)],1,function(x){((x[1]+x[2])/sum(x))}),0.5) + N = c(apply(mat[,c(1:4)],1,function(x){(sum(x))}),0) + X = c(apply(mat[,c(1:2)],1,function(x){(sum(x))}),0) + bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) + bayesCDR = bayesCDR[-length(bayesCDR)] + + #FWR 
+ P = c(apply(mat[,c(5:8)],1,function(x){((x[3]+x[4])/sum(x))}),0.5) + N = c(apply(mat[,c(1:4)],1,function(x){(sum(x))}),0) + X = c(apply(mat[,c(3:4)],1,function(x){(sum(x))}),0) + bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) + bayesFWR = bayesFWR[-length(bayesFWR)] + } + + if(test=="ImbalancedSilent"){ + #CDR + P = c(apply(mat[,c(6,8)],1,function(x){((x[1])/sum(x))}),0.5) + N = c(apply(mat[,c(2,4)],1,function(x){(sum(x))}),0) + X = c(apply(mat[,c(2,4)],1,function(x){(x[1])}),0) + bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) + bayesCDR = bayesCDR[-length(bayesCDR)] + + #FWR + P = c(apply(mat[,c(6,8)],1,function(x){((x[2])/sum(x))}),0.5) + N = c(apply(mat[,c(2,4)],1,function(x){(sum(x))}),0) + X = c(apply(mat[,c(2,4)],1,function(x){(x[2])}),0) + bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) + bayesFWR = bayesFWR[-length(bayesFWR)] + } + + if(flagOneSeq==T){ + bayesCDR = bayesCDR[1] + bayesFWR = bayesFWR[1] + } + return( list("CDR"=bayesCDR, "FWR"=bayesFWR) ) + } + + ##Covolution + break2chunks<-function(G=1000){ + base<-2^round(log(sqrt(G),2),0) + return(c(rep(base,floor(G/base)-1),base+G-(floor(G/base)*base))) + } + + PowersOfTwo <- function(G=100){ + exponents <- array() + i = 0 + while(G > 0){ + i=i+1 + exponents[i] <- floor( log2(G) ) + G <- G-2^exponents[i] + } + return(exponents) + } + + convolutionPowersOfTwo <- function( cons, length_sigma=4001 ){ + G = ncol(cons) + if(G>1){ + for(gen in log(G,2):1){ + ll<-seq(from=2,to=2^gen,by=2) + sapply(ll,function(l){cons[,l/2]<<-weighted_conv(cons[,l],cons[,l-1],length_sigma=length_sigma)}) + } + } + return( cons[,1] ) + } + + convolutionPowersOfTwoByTwos <- function( cons, length_sigma=4001,G=1 ){ + if(length(ncol(cons))) G<-ncol(cons) + groups <- PowersOfTwo(G) + matG 
<- matrix(NA, ncol=length(groups), nrow=length(cons)/G ) + startIndex = 1 + for( i in 1:length(groups) ){ + stopIndex <- 2^groups[i] + startIndex - 1 + if(stopIndex!=startIndex){ + matG[,i] <- convolutionPowersOfTwo( cons[,startIndex:stopIndex], length_sigma=length_sigma ) + startIndex = stopIndex + 1 + } + else { + if(G>1) matG[,i] <- cons[,startIndex:stopIndex] + else matG[,i] <- cons + #startIndex = stopIndex + 1 + } + } + return( list( matG, groups ) ) + } + + weighted_conv<-function(x,y,w=1,m=100,length_sigma=4001){ + lx<-length(x) + ly<-length(y) + if({lx1){ + while( i1 & Length_Postrior<=Threshold){ + cons = matrix(unlist(listPosteriors),length(listPosteriors[[1]]),length(listPosteriors)) + listMatG <- convolutionPowersOfTwoByTwos(cons,length_sigma=length_sigma) + y<-calculate_bayesGHelper(listMatG,length_sigma=length_sigma) + return( y/sum(y)/(2*max_sigma/(length_sigma-1)) ) + }else if(Length_Postrior==1) return(listPosteriors[[1]]) + else if(Length_Postrior==0) return(NA) + else { + cons = matrix(unlist(listPosteriors),length(listPosteriors[[1]]),length(listPosteriors)) + y = fastConv(cons,max_sigma=max_sigma, length_sigma=length_sigma ) + return( y/sum(y)/(2*max_sigma/(length_sigma-1)) ) + } + } + + fastConv<-function(cons, max_sigma=20, length_sigma=4001){ + chunks<-break2chunks(G=ncol(cons)) + if(ncol(cons)==3) chunks<-2:1 + index_chunks_end <- cumsum(chunks) + index_chunks_start <- c(1,index_chunks_end[-length(index_chunks_end)]+1) + index_chunks <- cbind(index_chunks_start,index_chunks_end) + + case <- sum(chunks!=chunks[1]) + if(case==1) End <- max(1,((length(index_chunks)/2)-1)) + else End <- max(1,((length(index_chunks)/2))) + + firsts <- sapply(1:End,function(i){ + indexes<-index_chunks[i,1]:index_chunks[i,2] + convolutionPowersOfTwoByTwos(cons[ ,indexes])[[1]] + }) + if(case==0){ + result<-calculate_bayesGHelper( convolutionPowersOfTwoByTwos(firsts) ) + }else if(case==1){ + last<-list(calculate_bayesGHelper( + convolutionPowersOfTwoByTwos( cons[ 
,index_chunks[length(index_chunks)/2,1]:index_chunks[length(index_chunks)/2,2]] ) + ),0) + result_first<-calculate_bayesGHelper(convolutionPowersOfTwoByTwos(firsts)) + result<-calculate_bayesGHelper( + list( + cbind( + result_first,last[[1]]), + c(log(index_chunks_end[length(index_chunks)/2-1],2),log(index_chunks[length(index_chunks)/2,2]-index_chunks[length(index_chunks)/2,1]+1,2)) + ) + ) + } + return(as.vector(result)) + } + + # Computes the 95% CI for a pdf + calcBayesCI <- function(Pdf,low=0.025,up=0.975,max_sigma=20, length_sigma=4001){ + if(length(Pdf)!=length_sigma) return(NA) + sigma_s=seq(-max_sigma,max_sigma,length.out=length_sigma) + cdf = cumsum(Pdf) + cdf = cdf/cdf[length(cdf)] + return( c(sigma_s[findInterval(low,cdf)-1] , sigma_s[findInterval(up,cdf)]) ) + } + + # Computes a mean for a pdf + calcBayesMean <- function(Pdf,max_sigma=20,length_sigma=4001){ + if(length(Pdf)!=length_sigma) return(NA) + sigma_s=seq(-max_sigma,max_sigma,length.out=length_sigma) + norm = {length_sigma-1}/2/max_sigma + return( (Pdf%*%sigma_s/norm) ) + } + + # Returns the mean, and the 95% CI for a pdf + calcBayesOutputInfo <- function(Pdf,low=0.025,up=0.975,max_sigma=20, length_sigma=4001){ + if(is.na(Pdf)) + return(rep(NA,3)) + bCI = calcBayesCI(Pdf=Pdf,low=low,up=up,max_sigma=max_sigma,length_sigma=length_sigma) + bMean = calcBayesMean(Pdf=Pdf,max_sigma=max_sigma,length_sigma=length_sigma) + return(c(bMean, bCI)) + } + + # Computes the p-value of a pdf + computeSigmaP <- function(Pdf, length_sigma=4001, max_sigma=20){ + if(length(Pdf)>1){ + norm = {length_sigma-1}/2/max_sigma + pVal = {sum(Pdf[1:{{length_sigma-1}/2}]) + Pdf[{{length_sigma+1}/2}]/2}/norm + if(pVal>0.5){ + pVal = pVal-1 + } + return(pVal) + }else{ + return(NA) + } + } + + # Compute p-value of two distributions + compareTwoDistsFaster <-function(sigma_S=seq(-20,20,length.out=4001), N=10000, dens1=runif(4001,0,1), dens2=runif(4001,0,1)){ + #print(c(length(dens1),length(dens2))) + if(length(dens1)>1 & 
length(dens2)>1 ){ + dens1<-dens1/sum(dens1) + dens2<-dens2/sum(dens2) + cum2 <- cumsum(dens2)-dens2/2 + tmp<- sum(sapply(1:length(dens1),function(i)return(dens1[i]*cum2[i]))) + #print(tmp) + if(tmp>0.5)tmp<-tmp-1 + return( tmp ) + } + else { + return(NA) + } + #return (sum(sapply(1:N,function(i)(sample(sigma_S,1,prob=dens1)>sample(sigma_S,1,prob=dens2))))/N) + } + + # get number of seqeunces contributing to the sigma (i.e. seqeunces with mutations) + numberOfSeqsWithMutations <- function(matMutations,test=1){ + if(test==4)test=2 + cdrSeqs <- 0 + fwrSeqs <- 0 + if(test==1){#focused + cdrMutations <- apply(matMutations, 1, function(x){ sum(x[c(1,2,4)]) }) + fwrMutations <- apply(matMutations, 1, function(x){ sum(x[c(3,4,2)]) }) + if( any(which(cdrMutations>0)) ) cdrSeqs <- sum(cdrMutations>0) + if( any(which(fwrMutations>0)) ) fwrSeqs <- sum(fwrMutations>0) + } + if(test==2){#local + cdrMutations <- apply(matMutations, 1, function(x){ sum(x[c(1,2)]) }) + fwrMutations <- apply(matMutations, 1, function(x){ sum(x[c(3,4)]) }) + if( any(which(cdrMutations>0)) ) cdrSeqs <- sum(cdrMutations>0) + if( any(which(fwrMutations>0)) ) fwrSeqs <- sum(fwrMutations>0) + } + return(c("CDR"=cdrSeqs, "FWR"=fwrSeqs)) +} + + + +shadeColor <- function(sigmaVal=NA,pVal=NA){ + if(is.na(sigmaVal) & is.na(pVal)) return(NA) + if(is.na(sigmaVal) & !is.na(pVal)) sigmaVal=sign(pVal) + if(is.na(pVal) || pVal==1 || pVal==0){ + returnColor = "#FFFFFF"; + }else{ + colVal=abs(pVal); + + if(sigmaVal<0){ + if(colVal>0.1) + returnColor = "#CCFFCC"; + if(colVal<=0.1) + returnColor = "#99FF99"; + if(colVal<=0.050) + returnColor = "#66FF66"; + if(colVal<=0.010) + returnColor = "#33FF33"; + if(colVal<=0.005) + returnColor = "#00FF00"; + + }else{ + if(colVal>0.1) + returnColor = "#FFCCCC"; + if(colVal<=0.1) + returnColor = "#FF9999"; + if(colVal<=0.05) + returnColor = "#FF6666"; + if(colVal<=0.01) + returnColor = "#FF3333"; + if(colVal<0.005) + returnColor = "#FF0000"; + } + } + + return(returnColor) +} + + 
+ +plotHelp <- function(xfrac=0.05,yfrac=0.05,log=FALSE){ + if(!log){ + x = par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac + y = par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac + }else { + if(log==2){ + x = par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac + y = 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac) + } + if(log==1){ + x = 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac) + y = par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac + } + if(log==3){ + x = 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac) + y = 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac) + } + } + return(c("x"=x,"y"=y)) +} + +# SHMulation + + # Based on targeting, introduce a single mutation & then update the targeting + oneMutation <- function(){ + # Pick a postion + mutation + posMutation = sample(1:(seqGermlineLen*4),1,replace=F,prob=as.vector(seqTargeting)) + posNucNumb = ceiling(posMutation/4) # Nucleotide number + posNucKind = 4 - ( (posNucNumb*4) - posMutation ) # Nuc the position mutates to + + #mutate the simulation sequence + seqSimVec <- s2c(seqSim) + seqSimVec[posNucNumb] <- NUCLEOTIDES[posNucKind] + seqSim <<- c2s(seqSimVec) + + #update Mutability, Targeting & MutationsTypes + updateMutabilityNTargeting(posNucNumb) + + #return(c(posNucNumb,NUCLEOTIDES[posNucKind])) + return(posNucNumb) + } + + updateMutabilityNTargeting <- function(position){ + min_i<-max((position-2),1) + max_i<-min((position+2),nchar(seqSim)) + min_ii<-min(min_i,3) + + #mutability - update locally + seqMutability[(min_i):(max_i)] <<- computeMutabilities(substr(seqSim,position-4,position+4))[(min_ii):(max_i-min_i+min_ii)] + + + #targeting - compute locally + seqTargeting[,min_i:max_i] <<- computeTargeting(substr(seqSim,min_i,max_i),seqMutability[min_i:max_i]) + seqTargeting[is.na(seqTargeting)] <<- 0 + #mutCodonPos = getCodonPos(position) + mutCodonPos = seq(getCodonPos(min_i)[1],getCodonPos(max_i)[3]) + #cat(mutCodonPos,"\n") + mutTypeCodon = getCodonPos(position) + 
seqMutationTypes[,mutTypeCodon] <<- computeMutationTypesFast( substr(seqSim,mutTypeCodon[1],mutTypeCodon[3]) ) + # Stop = 0 + if(any(seqMutationTypes[,mutCodonPos]=="Stop",na.rm=T )){ + seqTargeting[,mutCodonPos][seqMutationTypes[,mutCodonPos]=="Stop"] <<- 0 + } + + + #Selection + selectedPos = (min_i*4-4)+(which(seqMutationTypes[,min_i:max_i]=="R")) + # CDR + selectedCDR = selectedPos[which(matCDR[selectedPos]==T)] + seqTargeting[selectedCDR] <<- seqTargeting[selectedCDR] * exp(selCDR) + seqTargeting[selectedCDR] <<- seqTargeting[selectedCDR]/baseLineCDR_K + + # FWR + selectedFWR = selectedPos[which(matFWR[selectedPos]==T)] + seqTargeting[selectedFWR] <<- seqTargeting[selectedFWR] * exp(selFWR) + seqTargeting[selectedFWR] <<- seqTargeting[selectedFWR]/baseLineFWR_K + + } + + + + # Validate the mutation: if the mutation has not been sampled before validate it, else discard it. + validateMutation <- function(){ + if( !(mutatedPos%in%mutatedPositions) ){ # if it's a new mutation + uniqueMutationsIntroduced <<- uniqueMutationsIntroduced + 1 + mutatedPositions[uniqueMutationsIntroduced] <<- mutatedPos + }else{ + if(substr(seqSim,mutatedPos,mutatedPos)==substr(seqGermline,mutatedPos,mutatedPos)){ # back to germline mutation + mutatedPositions <<- mutatedPositions[-which(mutatedPositions==mutatedPos)] + uniqueMutationsIntroduced <<- uniqueMutationsIntroduced - 1 + } + } + } + + + + # Places text (labels) at normalized coordinates + myaxis <- function(xfrac=0.05,yfrac=0.05,log=FALSE,w="text",cex=1,adj=1,thecol="black"){ + par(xpd=TRUE) + if(!log) + text(par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac,par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac,w,cex=cex,adj=adj,col=thecol) + else { + if(log==2) + text( + par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac, + 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac), + w,cex=cex,adj=adj,col=thecol) + if(log==1) + text( + 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac), + 
par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac, + w,cex=cex,adj=adj,col=thecol) + if(log==3) + text( + 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac), + 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac), + w,cex=cex,adj=adj,col=thecol) + } + par(xpd=FALSE) + } + + + + # Count the mutations in a sequence + analyzeMutations <- function( inputMatrixIndex, model = 0 , multipleMutation=0, seqWithStops=0){ + + paramGL = s2c(matInput[inputMatrixIndex,2]) + paramSeq = s2c(matInput[inputMatrixIndex,1]) + + #if( any(paramSeq=="N") ){ + # gapPos_Seq = which(paramSeq=="N") + # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "N"] + # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] + #} + mutations_val = paramGL != paramSeq + + if(any(mutations_val)){ + mutationPos = which(mutations_val)#{1:length(mutations_val)}[mutations_val] + length_mutations =length(mutationPos) + mutationInfo = rep(NA,length_mutations) + + pos<- mutationPos + pos_array<-array(sapply(pos,getCodonPos)) + codonGL = paramGL[pos_array] + codonSeqWhole = paramSeq[pos_array] + codonSeq = sapply(pos,function(x){ + seqP = paramGL[getCodonPos(x)] + muCodonPos = {x-1}%%3+1 + seqP[muCodonPos] = paramSeq[x] + return(seqP) + }) + GLcodons = apply(matrix(codonGL,length_mutations,3,byrow=TRUE),1,c2s) + SeqcodonsWhole = apply(matrix(codonSeqWhole,length_mutations,3,byrow=TRUE),1,c2s) + Seqcodons = apply(codonSeq,2,c2s) + + mutationInfo = apply(rbind(GLcodons , Seqcodons),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) + names(mutationInfo) = mutationPos + + mutationInfoWhole = apply(rbind(GLcodons , SeqcodonsWhole),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) + names(mutationInfoWhole) = mutationPos + + mutationInfo <- mutationInfo[!is.na(mutationInfo)] + mutationInfoWhole <- mutationInfoWhole[!is.na(mutationInfoWhole)] + + if(any(!is.na(mutationInfo))){ + + #Filter based on Stop (at the codon level) + if(seqWithStops==1){ + nucleotidesAtStopCodons = 
names(mutationInfoWhole[mutationInfoWhole!="Stop"]) + mutationInfo = mutationInfo[nucleotidesAtStopCodons] + mutationInfoWhole = mutationInfo[nucleotidesAtStopCodons] + }else{ + countStops = sum(mutationInfoWhole=="Stop") + if(seqWithStops==2 & countStops==0) mutationInfo = NA + if(seqWithStops==3 & countStops>0) mutationInfo = NA + } + + if(any(!is.na(mutationInfo))){ + #Filter mutations based on multipleMutation + if(multipleMutation==1 & !is.na(mutationInfo)){ + mutationCodons = getCodonNumb(as.numeric(names(mutationInfoWhole))) + tableMutationCodons <- table(mutationCodons) + codonsWithMultipleMutations <- as.numeric(names(tableMutationCodons[tableMutationCodons>1])) + if(any(codonsWithMultipleMutations)){ + #remove the nucleotide mutations in the codons with multiple mutations + mutationInfo <- mutationInfo[!(mutationCodons %in% codonsWithMultipleMutations)] + #replace those codons with Ns in the input sequence + paramSeq[unlist(lapply(codonsWithMultipleMutations, getCodonNucs))] = "N" + matInput[inputMatrixIndex,1] <<- c2s(paramSeq) + } + } + + #Filter mutations based on the model + if(any(mutationInfo)==T | is.na(any(mutationInfo))){ + + if(model==1 & !is.na(mutationInfo)){ + mutationInfo <- mutationInfo[mutationInfo=="S"] + } + if(any(mutationInfo)==T | is.na(any(mutationInfo))) return(mutationInfo) + else return(NA) + }else{ + return(NA) + } + }else{ + return(NA) + } + + + }else{ + return(NA) + } + + + }else{ + return (NA) + } + } + + analyzeMutationsFixed <- function( inputArray, model = 0 , multipleMutation=0, seqWithStops=0){ + + paramGL = s2c(inputArray[2]) + paramSeq = s2c(inputArray[1]) + inputSeq <- inputArray[1] + #if( any(paramSeq=="N") ){ + # gapPos_Seq = which(paramSeq=="N") + # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "N"] + # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] + #} + mutations_val = paramGL != paramSeq + + if(any(mutations_val)){ + mutationPos = 
which(mutations_val)#{1:length(mutations_val)}[mutations_val] + length_mutations =length(mutationPos) + mutationInfo = rep(NA,length_mutations) + + pos<- mutationPos + pos_array<-array(sapply(pos,getCodonPos)) + codonGL = paramGL[pos_array] + codonSeqWhole = paramSeq[pos_array] + codonSeq = sapply(pos,function(x){ + seqP = paramGL[getCodonPos(x)] + muCodonPos = {x-1}%%3+1 + seqP[muCodonPos] = paramSeq[x] + return(seqP) + }) + GLcodons = apply(matrix(codonGL,length_mutations,3,byrow=TRUE),1,c2s) + SeqcodonsWhole = apply(matrix(codonSeqWhole,length_mutations,3,byrow=TRUE),1,c2s) + Seqcodons = apply(codonSeq,2,c2s) + + mutationInfo = apply(rbind(GLcodons , Seqcodons),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) + names(mutationInfo) = mutationPos + + mutationInfoWhole = apply(rbind(GLcodons , SeqcodonsWhole),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) + names(mutationInfoWhole) = mutationPos + + mutationInfo <- mutationInfo[!is.na(mutationInfo)] + mutationInfoWhole <- mutationInfoWhole[!is.na(mutationInfoWhole)] + + if(any(!is.na(mutationInfo))){ + + #Filter based on Stop (at the codon level) + if(seqWithStops==1){ + nucleotidesAtStopCodons = names(mutationInfoWhole[mutationInfoWhole!="Stop"]) + mutationInfo = mutationInfo[nucleotidesAtStopCodons] + mutationInfoWhole = mutationInfo[nucleotidesAtStopCodons] + }else{ + countStops = sum(mutationInfoWhole=="Stop") + if(seqWithStops==2 & countStops==0) mutationInfo = NA + if(seqWithStops==3 & countStops>0) mutationInfo = NA + } + + if(any(!is.na(mutationInfo))){ + #Filter mutations based on multipleMutation + if(multipleMutation==1 & !is.na(mutationInfo)){ + mutationCodons = getCodonNumb(as.numeric(names(mutationInfoWhole))) + tableMutationCodons <- table(mutationCodons) + codonsWithMultipleMutations <- as.numeric(names(tableMutationCodons[tableMutationCodons>1])) + if(any(codonsWithMultipleMutations)){ + #remove the nucleotide mutations in the codons with multiple mutations + mutationInfo <- 
mutationInfo[!(mutationCodons %in% codonsWithMultipleMutations)] + #replace those codons with Ns in the input sequence + paramSeq[unlist(lapply(codonsWithMultipleMutations, getCodonNucs))] = "N" + #matInput[inputMatrixIndex,1] <<- c2s(paramSeq) + inputSeq <- c2s(paramSeq) + } + } + + #Filter mutations based on the model + if(any(mutationInfo)==T | is.na(any(mutationInfo))){ + + if(model==1 & !is.na(mutationInfo)){ + mutationInfo <- mutationInfo[mutationInfo=="S"] + } + if(any(mutationInfo)==T | is.na(any(mutationInfo))) return(list(mutationInfo,inputSeq)) + else return(list(NA,inputSeq)) + }else{ + return(list(NA,inputSeq)) + } + }else{ + return(list(NA,inputSeq)) + } + + + }else{ + return(list(NA,inputSeq)) + } + + + }else{ + return (list(NA,inputSeq)) + } + } + + # triMutability Background Count + buildMutabilityModel <- function( inputMatrixIndex, model=0 , multipleMutation=0, seqWithStops=0, stopMutations=0){ + + #rowOrigMatInput = matInput[inputMatrixIndex,] + seqGL = gsub("-", "", matInput[inputMatrixIndex,2]) + seqInput = gsub("-", "", matInput[inputMatrixIndex,1]) + #matInput[inputMatrixIndex,] <<- cbind(seqInput,seqGL) + tempInput <- cbind(seqInput,seqGL) + seqLength = nchar(seqGL) + list_analyzeMutationsFixed<- analyzeMutationsFixed(tempInput, model, multipleMutation, seqWithStops) + mutationCount <- list_analyzeMutationsFixed[[1]] + seqInput <- list_analyzeMutationsFixed[[2]] + BackgroundMatrix = mutabilityMatrix + MutationMatrix = mutabilityMatrix + MutationCountMatrix = mutabilityMatrix + if(!is.na(mutationCount)){ + if((stopMutations==0 & model==0) | (stopMutations==1 & (sum(mutationCount=="Stop")0)) ){ + + fivermerStartPos = 1:(seqLength-4) + fivemerLength <- length(fivermerStartPos) + fivemerGL <- substr(rep(seqGL,length(fivermerStartPos)),(fivermerStartPos),(fivermerStartPos+4)) + fivemerSeq <- substr(rep(seqInput,length(fivermerStartPos)),(fivermerStartPos),(fivermerStartPos+4)) + + #Background + for(fivemerIndex in 1:fivemerLength){ + fivemer = 
fivemerGL[fivemerIndex] + if(!any(grep("N",fivemer))){ + fivemerCodonPos = fivemerCodon(fivemerIndex) + fivemerReadingFrameCodon = substr(fivemer,fivemerCodonPos[1],fivemerCodonPos[3]) + fivemerReadingFrameCodonInputSeq = substr(fivemerSeq[fivemerIndex],fivemerCodonPos[1],fivemerCodonPos[3]) + + # All mutations model + #if(!any(grep("N",fivemerReadingFrameCodon))){ + if(model==0){ + if(stopMutations==0){ + if(!any(grep("N",fivemerReadingFrameCodonInputSeq))) + BackgroundMatrix[fivemer] <- (BackgroundMatrix[fivemer] + 1) + }else{ + if( !any(grep("N",fivemerReadingFrameCodonInputSeq)) & translateCodonToAminoAcid(fivemerReadingFrameCodon)!="*" ){ + positionWithinCodon = which(fivemerCodonPos==3)#positionsWithinCodon[(fivemerCodonPos[1]%%3)+1] + BackgroundMatrix[fivemer] <- (BackgroundMatrix[fivemer] + probNonStopMutations[fivemerReadingFrameCodon,positionWithinCodon]) + } + } + }else{ # Only silent mutations + if( !any(grep("N",fivemerReadingFrameCodonInputSeq)) & translateCodonToAminoAcid(fivemerReadingFrameCodon)!="*" & translateCodonToAminoAcid(fivemerReadingFrameCodonInputSeq)==translateCodonToAminoAcid(fivemerReadingFrameCodon)){ + positionWithinCodon = which(fivemerCodonPos==3) + BackgroundMatrix[fivemer] <- (BackgroundMatrix[fivemer] + probSMutations[fivemerReadingFrameCodon,positionWithinCodon]) + } + } + #} + } + } + + #Mutations + if(stopMutations==1) mutationCount = mutationCount[mutationCount!="Stop"] + if(model==1) mutationCount = mutationCount[mutationCount=="S"] + mutationPositions = as.numeric(names(mutationCount)) + mutationCount = mutationCount[mutationPositions>2 & mutationPositions<(seqLength-1)] + mutationPositions = mutationPositions[mutationPositions>2 & mutationPositions<(seqLength-1)] + countMutations = 0 + for(mutationPosition in mutationPositions){ + fivemerIndex = mutationPosition-2 + fivemer = fivemerSeq[fivemerIndex] + GLfivemer = fivemerGL[fivemerIndex] + fivemerCodonPos = fivemerCodon(fivemerIndex) + fivemerReadingFrameCodon = 
substr(fivemer,fivemerCodonPos[1],fivemerCodonPos[3]) + fivemerReadingFrameCodonGL = substr(GLfivemer,fivemerCodonPos[1],fivemerCodonPos[3]) + if(!any(grep("N",fivemer)) & !any(grep("N",GLfivemer))){ + if(model==0){ + countMutations = countMutations + 1 + MutationMatrix[GLfivemer] <- (MutationMatrix[GLfivemer] + 1) + MutationCountMatrix[GLfivemer] <- (MutationCountMatrix[GLfivemer] + 1) + }else{ + if( translateCodonToAminoAcid(fivemerReadingFrameCodonGL)!="*" ){ + countMutations = countMutations + 1 + positionWithinCodon = which(fivemerCodonPos==3) + glNuc = substr(fivemerReadingFrameCodonGL,positionWithinCodon,positionWithinCodon) + inputNuc = substr(fivemerReadingFrameCodon,positionWithinCodon,positionWithinCodon) + MutationMatrix[GLfivemer] <- (MutationMatrix[GLfivemer] + substitution[glNuc,inputNuc]) + MutationCountMatrix[GLfivemer] <- (MutationCountMatrix[GLfivemer] + 1) + } + } + } + } + + seqMutability = MutationMatrix/BackgroundMatrix + seqMutability = seqMutability/sum(seqMutability,na.rm=TRUE) + #cat(inputMatrixIndex,"\t",countMutations,"\n") + return(list("seqMutability" = seqMutability,"numbMutations" = countMutations,"seqMutabilityCount" = MutationCountMatrix, "BackgroundMatrix"=BackgroundMatrix)) + + } + } + + } + + #Returns the codon position containing the middle nucleotide + fivemerCodon <- function(fivemerIndex){ + codonPos = list(2:4,1:3,3:5) + fivemerType = fivemerIndex%%3 + return(codonPos[[fivemerType+1]]) + } + + #returns probability values for one mutation in codons resulting in R, S or Stop + probMutations <- function(typeOfMutation){ + matMutationProb <- matrix(0,ncol=3,nrow=125,dimnames=list(words(alphabet = c(NUCLEOTIDES,"N"), length=3),c(1:3))) + for(codon in rownames(matMutationProb)){ + if( !any(grep("N",codon)) ){ + for(muPos in 1:3){ + matCodon = matrix(rep(s2c(codon),3),nrow=3,ncol=3,byrow=T) + glNuc = matCodon[1,muPos] + matCodon[,muPos] = canMutateTo(glNuc) + substitutionRate = substitution[glNuc,matCodon[,muPos]] + 
# (tail of probMutations; the function starts on the preceding line)
          typeOfMutations = apply(rbind(rep(codon,3),apply(matCodon,1,c2s)),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))})
          matMutationProb[codon,muPos] <- sum(substitutionRate[typeOfMutations==typeOfMutation])
        }
      }
    }

    return(matMutationProb)
  }




# Mapping trinucleotides to fivemers.
#
# For every 5-mer word, adds up three entries of `triMutability`:
#   row = word[3:5], column 1
#   row = word[2:4], column 2
#   row = word[1:3], column 3
# (missing entries are ignored), then rescales the 1024 scores to sum to 1.
# Reads `triMutability_Names`, `NUCLEOTIDES` and `words()` from the calling
# environment; returns a numeric vector named by 5-mer.
mapTriToFivemer <- function(triMutability=triMutability_Literature_Human){
  rownames(triMutability) <- triMutability_Names

  # Named placeholder, one slot per 5-mer.
  placeholder <- rep(NA, 1024)
  names(placeholder) <- words(alphabet=NUCLEOTIDES, length=5)

  scoreWord <- function(Word) {
    contributions <- c(
      triMutability[substring(Word, 3, 5), 1],
      triMutability[substring(Word, 2, 4), 2],
      triMutability[substring(Word, 1, 3), 3]
    )
    sum(contributions, na.rm=TRUE)
  }

  scores <- sapply(names(placeholder), scoreWord)
  scores / sum(scores)
}

# Collapses 5-mer values to trinucleotides.
#
# Keeps the 5-mers whose centre (3rd) nucleotide equals `NUC`, groups them by
# the trinucleotide found at characters (4-position):(6-position) of the 5-mer
# name, and returns the `Weights`-weighted mean of `Fivemer` for each group.
collapseFivemerToTri<-function(Fivemer,Weights=MutabilityWeights,position=1,NUC="A"){
  centreMatches <- substring(names(Fivemer), 3, 3) == NUC
  triKeys <- substring(names(Fivemer[centreMatches]), (4 - position), (6 - position))
  tapply(
    which(centreMatches),
    triKeys,
    function(idx) weighted.mean(Fivemer[idx], Weights[idx], na.rm=TRUE)
  )
}



# Same grouping as collapseFivemerToTri, but returns the sum of `Weights`
# within each trinucleotide group instead of a weighted mean.
CountFivemerToTri<-function(Fivemer,Weights=MutabilityWeights,position=1,NUC="A"){
  centreMatches <- substring(names(Fivemer), 3, 3) == NUC
  triKeys <- substring(names(Fivemer[centreMatches]), (4 - position), (6 - position))
  tapply(
    which(centreMatches),
    triKeys,
    function(idx) sum(Weights[idx], na.rm=TRUE)
  )
}

# Uses the real counts of the mutated fivemers: same grouping again, summing
# `Counts` within each trinucleotide group.
CountFivemerToTri2<-function(Fivemer,Counts=MutabilityCounts,position=1,NUC="A"){
  centreMatches <- substring(names(Fivemer), 3, 3) == NUC
  triKeys <- substring(names(Fivemer[centreMatches]), (4 - position), (6 - position))
  tapply(
    which(centreMatches),
    triKeys,
    function(idx) sum(Counts[idx], na.rm=TRUE)
  )
}

# Multinomial resampling interval for category proportions.
#
# x     : vector of category counts
# M     : number of multinomial draws
# alpha : total tail mass, split over both tails and divided by length(x)-1
#
# Returns a 2 x length(x) matrix of lower/upper quantiles of the simulated
# proportions; an all-zero matrix when the counts sum to zero.
bootstrap<-function(x=c(33,12,21),M=10000,alpha=0.05){
  total <- sum(x)
  if (!total) {
    return(matrix(0, 2, length(x)))
  }

  probs <- x / total
  nComparisons <- length(x) - 1
  draws <- rmultinom(M, size = total, prob = probs)
  proportions <- apply(draws, 2, function(counts) counts / total)
  tailProbs <- c(alpha/2/nComparisons, 1 - alpha/2/nComparisons)
  (apply(proportions, 1, function(props) quantile(props, tailProbs)))
}




# (header of bootstrap2; its body continues on the following line)
bootstrap2<-function(x=c(33,12,21),n=10,M=10000,alpha=0.05){

# (body of bootstrap2; its header is on the preceding line)
N<-sum(x)
k<-length(x)
y<-rep(1:k,x)
tmp<-sapply(1:M,function(i)sample(y,n))
if(n>1)tmp_p<-sapply(1:M,function(j)sapply(1:k,function(i)sum(tmp[,j]==i)))/n
if(n==1)tmp_p<-sapply(1:M,function(j)sapply(1:k,function(i)sum(tmp[j]==i)))/n
(apply(tmp_p,1,function(z)quantile(z,c(alpha/2/(k-1),1-alpha/2/(k-1)))))
}



# Resampling p-values for observed category counts.
#
# x     : background counts per category; they define an urn that holds
#         category label i exactly x[i] times
# M     : number of simulated draws
# x_obs : observed counts per category (same length as x)
#
# Each simulation draws sum(x_obs) labels from the urn without replacement and
# tabulates them per category. For every category the function computes the
# fraction of simulations with a count >= the observed one (upper tail) and
# the fraction with a count <= the observed one (lower tail).
#
# Returns one value per category of `x`: the upper-tail fraction when it is
# the smaller of the two, otherwise the lower-tail fraction with a negative
# sign.
p_value<-function(x=c(33,12,21),M=100000,x_obs=c(2,5,3)){
  n <- sum(x_obs)
  k <- length(x)
  urn <- rep(seq_len(k), x)

  # Index the urn explicitly: sample(urn, n) would sample from 1:urn when the
  # urn holds a single element. tabulate() also covers n == 0 and n == 1
  # without special cases.
  simCounts <- vapply(
    seq_len(M),
    function(j) tabulate(urn[sample.int(length(urn), n)], nbins = k),
    integer(k)
  )
  # vapply() returns a plain vector when k == 1; force k rows x M columns.
  simCounts <- matrix(simCounts, nrow = k, ncol = M)

  # One entry per category of x (previously hard-coded to the first three).
  upperTail <- vapply(
    seq_len(k),
    function(i) sum(simCounts[i, ] >= x_obs[i]) / M,
    numeric(1)
  )
  lowerTail <- vapply(
    seq_len(k),
    function(i) sum(simCounts[i, ] <= x_obs[i]) / M,
    numeric(1)
  )

  ifelse(upperTail >= lowerTail, -lowerTail, upperTail)
}

#"D:\\Sequences\\IMGT Germlines\\Human_SNPless_IGHJ.FASTA"
# Remove SNPs from IMGT germline segment alleles
# (head of generateUnambiguousRepertoire; it continues on the following line)
generateUnambiguousRepertoire <- function(repertoireInFile,repertoireOutFile){
  repertoireIn <- read.fasta(repertoireInFile, seqtype="DNA",as.string=T,set.attributes=F,forceDNAtolower=F)
  alleleNames <- sapply(names(repertoireIn),function(x)strsplit(x,"|",fixed=TRUE)[[1]][2])
  SNPs <- tapply(repertoireIn,sapply(alleleNames,function(x)strsplit(x,"*",fixed=TRUE)[[1]][1]),function(x){
    Indices<-NULL
    for(i in 1:length(x)){
      firstSeq = s2c(x[[1]])
      iSeq = s2c(x[[i]])
      Indices<-c(Indices,which(firstSeq[1:320]!=iSeq[1:320] & firstSeq[1:320]!="." & iSeq[1:320]!="." 
)) + } + return(sort(unique(Indices))) + }) + repertoireOut <- repertoireIn + repertoireOut <- lapply(names(repertoireOut), function(repertoireName){ + alleleName <- strsplit(repertoireName,"|",fixed=TRUE)[[1]][2] + geneSegmentName <- strsplit(alleleName,"*",fixed=TRUE)[[1]][1] + alleleSeq <- s2c(repertoireOut[[repertoireName]]) + alleleSeq[as.numeric(unlist(SNPs[geneSegmentName]))] <- "N" + alleleSeq <- c2s(alleleSeq) + repertoireOut[[repertoireName]] <- alleleSeq + }) + names(repertoireOut) <- names(repertoireIn) + write.fasta(repertoireOut,names(repertoireOut),file.out=repertoireOutFile) + +} + + + + + + +############ +groupBayes2 = function(indexes, param_resultMat){ + + BayesGDist_Focused_CDR = calculate_bayesG( x=param_resultMat[indexes,1], N=apply(param_resultMat[indexes,c(1,2,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[1]/(x[1]+x[2]+x[4])})) + BayesGDist_Focused_FWR = calculate_bayesG( x=param_resultMat[indexes,3], N=apply(param_resultMat[indexes,c(3,2,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[3]/(x[3]+x[2]+x[4])})) + #BayesGDist_Local_CDR = calculate_bayesG( x=param_resultMat[indexes,1], N=apply(param_resultMat[indexes,c(1,2)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[1]/(x[1]+x[2])})) + #BayesGDist_Local_FWR = calculate_bayesG( x=param_resultMat[indexes,3], N=apply(param_resultMat[indexes,c(3,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[3]/(x[3]+x[4])})) + #BayesGDist_Global_CDR = calculate_bayesG( x=param_resultMat[indexes,1], N=apply(param_resultMat[indexes,c(1,2,3,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[1]/(x[1]+x[2]+x[3]+x[4])})) + #BayesGDist_Global_FWR = calculate_bayesG( x=param_resultMat[indexes,3], N=apply(param_resultMat[indexes,c(1,2,3,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[3]/(x[1]+x[2]+x[3]+x[4])})) + return ( list("BayesGDist_Focused_CDR"=BayesGDist_Focused_CDR, 
+ "BayesGDist_Focused_FWR"=BayesGDist_Focused_FWR) ) + #"BayesGDist_Local_CDR"=BayesGDist_Local_CDR, + #"BayesGDist_Local_FWR" = BayesGDist_Local_FWR)) +# "BayesGDist_Global_CDR" = BayesGDist_Global_CDR, +# "BayesGDist_Global_FWR" = BayesGDist_Global_FWR) ) + + +} + + +calculate_bayesG <- function( x=array(), N=array(), p=array(), max_sigma=20, length_sigma=4001){ + G <- max(length(x),length(N),length(p)) + x=array(x,dim=G) + N=array(N,dim=G) + p=array(p,dim=G) + + indexOfZero = N>0 & p>0 + N = N[indexOfZero] + x = x[indexOfZero] + p = p[indexOfZero] + G <- length(x) + + if(G){ + + cons<-array( dim=c(length_sigma,G) ) + if(G==1) { + return(calculate_bayes(x=x[G],N=N[G],p=p[G],max_sigma=max_sigma,length_sigma=length_sigma)) + } + else { + for(g in 1:G) cons[,g] <- calculate_bayes(x=x[g],N=N[g],p=p[g],max_sigma=max_sigma,length_sigma=length_sigma) + listMatG <- convolutionPowersOfTwoByTwos(cons,length_sigma=length_sigma) + y<-calculate_bayesGHelper(listMatG,length_sigma=length_sigma) + return( y/sum(y)/(2*max_sigma/(length_sigma-1)) ) + } + }else{ + return(NA) + } +} + + +calculate_bayesGHelper <- function( listMatG,length_sigma=4001 ){ + matG <- listMatG[[1]] + groups <- listMatG[[2]] + i = 1 + resConv <- matG[,i] + denom <- 2^groups[i] + if(length(groups)>1){ + while( i0)) ){ + +# ONEmerStartPos = 1:(seqLength) +# ONEmerLength <- length(ONEmerStartPos) + ONEmerGL <- s2c(seqGL) + ONEmerSeq <- s2c(seqInput) + + #Background + for(ONEmerIndex in 1:seqLength){ + ONEmer = ONEmerGL[ONEmerIndex] + if(ONEmer!="N"){ + ONEmerCodonPos = getCodonPos(ONEmerIndex) + ONEmerReadingFrameCodon = c2s(ONEmerGL[ONEmerCodonPos]) + ONEmerReadingFrameCodonInputSeq = c2s(ONEmerSeq[ONEmerCodonPos] ) + + # All mutations model + #if(!any(grep("N",ONEmerReadingFrameCodon))){ + if(model==0){ + if(stopMutations==0){ + if(!any(grep("N",ONEmerReadingFrameCodonInputSeq))) + BackgroundMatrix[ONEmer] <- (BackgroundMatrix[ONEmer] + 1) + }else{ + if( !any(grep("N",ONEmerReadingFrameCodonInputSeq)) & 
translateCodonToAminoAcid(ONEmerReadingFrameCodonInputSeq)!="*"){ + positionWithinCodon = which(ONEmerCodonPos==ONEmerIndex)#positionsWithinCodon[(ONEmerCodonPos[1]%%3)+1] + BackgroundMatrix[ONEmer] <- (BackgroundMatrix[ONEmer] + probNonStopMutations[ONEmerReadingFrameCodon,positionWithinCodon]) + } + } + }else{ # Only silent mutations + if( !any(grep("N",ONEmerReadingFrameCodonInputSeq)) & translateCodonToAminoAcid(ONEmerReadingFrameCodonInputSeq)!="*" & translateCodonToAminoAcid(ONEmerReadingFrameCodonInputSeq)==translateCodonToAminoAcid(ONEmerReadingFrameCodon) ){ + positionWithinCodon = which(ONEmerCodonPos==ONEmerIndex) + BackgroundMatrix[ONEmer] <- (BackgroundMatrix[ONEmer] + probSMutations[ONEmerReadingFrameCodon,positionWithinCodon]) + } + } + } + } + } + + #Mutations + if(stopMutations==1) mutationCount = mutationCount[mutationCount!="Stop"] + if(model==1) mutationCount = mutationCount[mutationCount=="S"] + mutationPositions = as.numeric(names(mutationCount)) + mutationCount = mutationCount[mutationPositions>2 & mutationPositions<(seqLength-1)] + mutationPositions = mutationPositions[mutationPositions>2 & mutationPositions<(seqLength-1)] + countMutations = 0 + for(mutationPosition in mutationPositions){ + ONEmerIndex = mutationPosition + ONEmer = ONEmerSeq[ONEmerIndex] + GLONEmer = ONEmerGL[ONEmerIndex] + ONEmerCodonPos = getCodonPos(ONEmerIndex) + ONEmerReadingFrameCodon = c2s(ONEmerSeq[ONEmerCodonPos]) + ONEmerReadingFrameCodonGL =c2s(ONEmerGL[ONEmerCodonPos]) + if(!any(grep("N",ONEmer)) & !any(grep("N",GLONEmer))){ + if(model==0){ + countMutations = countMutations + 1 + MutationMatrix[GLONEmer] <- (MutationMatrix[GLONEmer] + 1) + MutationCountMatrix[GLONEmer] <- (MutationCountMatrix[GLONEmer] + 1) + }else{ + if( translateCodonToAminoAcid(ONEmerReadingFrameCodonGL)!="*" ){ + countMutations = countMutations + 1 + positionWithinCodon = which(ONEmerCodonPos==ONEmerIndex) + glNuc = substr(ONEmerReadingFrameCodonGL,positionWithinCodon,positionWithinCodon) + 
# (tail of buildMutabilityModel_ONEmer; the function starts on earlier lines)
              inputNuc = substr(ONEmerReadingFrameCodon,positionWithinCodon,positionWithinCodon)
              MutationMatrix[GLONEmer] <- (MutationMatrix[GLONEmer] + substitution[glNuc,inputNuc])
              MutationCountMatrix[GLONEmer] <- (MutationCountMatrix[GLONEmer] + 1)
            }
          }
        }
      }

      seqMutability = MutationMatrix/BackgroundMatrix
      seqMutability = seqMutability/sum(seqMutability,na.rm=TRUE)
      #cat(inputMatrixIndex,"\t",countMutations,"\n")
      return(list("seqMutability" = seqMutability,"numbMutations" = countMutations,"seqMutabilityCount" = MutationCountMatrix, "BackgroundMatrix"=BackgroundMatrix))
#      tmp<-list("seqMutability" = seqMutability,"numbMutations" = countMutations,"seqMutabilityCount" = MutationCountMatrix)
    }
  }

################
# $Id: trim.R 989 2006-10-29 15:28:26Z ggorjan $

# S3 generic: strip leading and trailing spaces. Dispatches on the class of
# `s`; `recode.factor` and `...` are consumed by the factor method.
trim <- function(s, recode.factor=TRUE, ...) {
  UseMethod("trim", s)
}

# Fallback method: objects of any other class are returned untouched.
trim.default <- function(s, recode.factor=TRUE, ...) {
  s
}

# Character method: removes runs of space characters (only " ", no other
# whitespace) at the start and at the end of every element.
trim.character <- function(s, recode.factor=TRUE, ...) {
  withoutLeading <- sub(pattern="^ +", replacement="", x=s)
  sub(pattern=" +$", replacement="", x=withoutLeading)
}

# Factor method: trims the level labels. When `recode.factor` is true the
# factor is then passed to reorder.factor() (not defined in this file)
# together with `...`, with `sort` defaulting to base sort().
trim.factor <- function(s, recode.factor=TRUE, ...) {
  levels(s) <- trim(levels(s))
  if (!recode.factor) {
    return(s)
  }

  reorderArgs <- list(x=s, ...)
  if (is.null(reorderArgs$sort)) {
    reorderArgs$sort <- sort
  }
  do.call(what=reorder.factor, args=reorderArgs)
}

# List method: applies trim() to every element, forwarding all arguments.
trim.list <- function(s, recode.factor=TRUE, ...) {
  lapply(s, trim, recode.factor=recode.factor, ...)
}

# (head of trim.data.frame; it continues on the following line)
trim.data.frame <- function(s, recode.factor=TRUE, ...)
{
  s[] <- trim.list(s, recode.factor=recode.factor, ...)
+ s +} +####################################### +# Compute the expected for each sequence-germline pair by codon +getExpectedIndividualByCodon <- function(matInput){ +if( any(grep("multicore",search())) ){ + facGL <- factor(matInput[,2]) + facLevels = levels(facGL) + LisGLs_MutabilityU = mclapply(1:length(facLevels), function(x){ + computeMutabilities(facLevels[x]) + }) + facIndex = match(facGL,facLevels) + + LisGLs_Mutability = mclapply(1:nrow(matInput), function(x){ + cInput = rep(NA,nchar(matInput[x,1])) + cInput[s2c(matInput[x,1])!="N"] = 1 + LisGLs_MutabilityU[[facIndex[x]]] * cInput + }) + + LisGLs_Targeting = mclapply(1:dim(matInput)[1], function(x){ + computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) + }) + + LisGLs_MutationTypes = mclapply(1:length(matInput[,2]),function(x){ + #print(x) + computeMutationTypes(matInput[x,2]) + }) + + LisGLs_R_Exp = mclapply(1:nrow(matInput), function(x){ + Exp_R <- rollapply(as.zoo(1:readEnd),width=3,by=3, + function(codonNucs){ + RPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="R") + sum( LisGLs_Targeting[[x]][,codonNucs][RPos], na.rm=T ) + } + ) + }) + + LisGLs_S_Exp = mclapply(1:nrow(matInput), function(x){ + Exp_S <- rollapply(as.zoo(1:readEnd),width=3,by=3, + function(codonNucs){ + SPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="S") + sum( LisGLs_Targeting[[x]][,codonNucs][SPos], na.rm=T ) + } + ) + }) + + Exp_R = matrix(unlist(LisGLs_R_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) + Exp_S = matrix(unlist(LisGLs_S_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) + return( list( "Expected_R"=Exp_R, "Expected_S"=Exp_S) ) + }else{ + facGL <- factor(matInput[,2]) + facLevels = levels(facGL) + LisGLs_MutabilityU = lapply(1:length(facLevels), function(x){ + computeMutabilities(facLevels[x]) + }) + facIndex = match(facGL,facLevels) + + LisGLs_Mutability = lapply(1:nrow(matInput), function(x){ + cInput = rep(NA,nchar(matInput[x,1])) + cInput[s2c(matInput[x,1])!="N"] = 1 + LisGLs_MutabilityU[[facIndex[x]]] * cInput + }) + 
+ LisGLs_Targeting = lapply(1:dim(matInput)[1], function(x){ + computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) + }) + + LisGLs_MutationTypes = lapply(1:length(matInput[,2]),function(x){ + #print(x) + computeMutationTypes(matInput[x,2]) + }) + + LisGLs_R_Exp = lapply(1:nrow(matInput), function(x){ + Exp_R <- rollapply(as.zoo(1:readEnd),width=3,by=3, + function(codonNucs){ + RPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="R") + sum( LisGLs_Targeting[[x]][,codonNucs][RPos], na.rm=T ) + } + ) + }) + + LisGLs_S_Exp = lapply(1:nrow(matInput), function(x){ + Exp_S <- rollapply(as.zoo(1:readEnd),width=3,by=3, + function(codonNucs){ + SPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="S") + sum( LisGLs_Targeting[[x]][,codonNucs][SPos], na.rm=T ) + } + ) + }) + + Exp_R = matrix(unlist(LisGLs_R_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) + Exp_S = matrix(unlist(LisGLs_S_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) + return( list( "Expected_R"=Exp_R, "Expected_S"=Exp_S) ) + } +} + +# getObservedMutationsByCodon <- function(listMutations){ +# numbSeqs <- length(listMutations) +# obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3)))) +# obsMu_S <- obsMu_R +# temp <- mclapply(1:length(listMutations), function(i){ +# arrMutations = listMutations[[i]] +# RPos = as.numeric(names(arrMutations)[arrMutations=="R"]) +# RPos <- sapply(RPos,getCodonNumb) +# if(any(RPos)){ +# tabR <- table(RPos) +# obsMu_R[i,as.numeric(names(tabR))] <<- tabR +# } +# +# SPos = as.numeric(names(arrMutations)[arrMutations=="S"]) +# SPos <- sapply(SPos,getCodonNumb) +# if(any(SPos)){ +# tabS <- table(SPos) +# obsMu_S[i,names(tabS)] <<- tabS +# } +# } +# ) +# return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) +# } + +getObservedMutationsByCodon <- function(listMutations){ + numbSeqs <- length(listMutations) + obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3)))) + obsMu_S <- obsMu_R + temp <- 
lapply(1:length(listMutations), function(i){ + arrMutations = listMutations[[i]] + RPos = as.numeric(names(arrMutations)[arrMutations=="R"]) + RPos <- sapply(RPos,getCodonNumb) + if(any(RPos)){ + tabR <- table(RPos) + obsMu_R[i,as.numeric(names(tabR))] <<- tabR + } + + SPos = as.numeric(names(arrMutations)[arrMutations=="S"]) + SPos <- sapply(SPos,getCodonNumb) + if(any(SPos)){ + tabS <- table(SPos) + obsMu_S[i,names(tabS)] <<- tabS + } + } + ) + return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) +} + diff -r 000000000000 -r 8a5a2abbb870 baseline/Baseline_Main.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/Baseline_Main.r Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,388 @@ +######################################################################################### +# License Agreement +# +# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE +# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER +# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE +# OR COPYRIGHT LAW IS PROHIBITED. +# +# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE +# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED +# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN +# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS. 
+# +# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences +# Coded by: Mohamed Uduman & Gur Yaari +# Copyright 2012 Kleinstein Lab +# Version: 1.3 (01/23/2014) +######################################################################################### + +op <- options(); +options(showWarnCalls=FALSE, showErrorCalls=FALSE, warn=-1) +library('seqinr') +if( F & Sys.info()[1]=="Linux"){ + library("multicore") +} + +# Load functions and initialize global variables +source("Baseline_Functions.r") + +# Initialize parameters with user provided arguments + arg <- commandArgs(TRUE) + #arg = c(2,1,5,5,0,1,"1:26:38:55:65:104:116", "test.fasta","","sample") + #arg = c(1,1,5,5,0,1,"1:38:55:65:104:116:200", "test.fasta","","sample") + #arg = c(1,1,5,5,1,1,"1:26:38:55:65:104:116", "/home/mu37/Wu/Wu_Cloned_gapped_sequences_D-masked.fasta","/home/mu37/Wu/","Wu") + testID <- as.numeric(arg[1]) # 1 = Focused, 2 = Local + species <- as.numeric(arg[2]) # 1 = Human. 2 = Mouse + substitutionModel <- as.numeric(arg[3]) # 0 = Uniform substitution, 1 = Smith DS et al. 1996, 5 = FiveS + mutabilityModel <- as.numeric(arg[4]) # 0 = Uniform mutablity, 1 = Tri-nucleotide (Shapiro GS et al. 
2002) , 5 = FiveS + clonal <- as.numeric(arg[5]) # 0 = Independent sequences, 1 = Clonally related, 2 = Clonally related & only non-terminal mutations + fixIndels <- as.numeric(arg[6]) # 0 = Do nothing, 1 = Try and fix Indels + region <- as.numeric(strsplit(arg[7],":")[[1]]) # StartPos:LastNucleotideF1:C1:F2:C2:F3:C3 + inputFilePath <- arg[8] # Full path to input file + outputPath <- arg[9] # Full path to location of output files + outputID <- arg[10] # ID for session output + + + if(testID==5){ + traitChangeModel <- 1 + if( !is.na(any(arg[11])) ) traitChangeModel <- as.numeric(arg[11]) # 1 <- Chothia 1998 + initializeTraitChange(traitChangeModel) + } + +# Initialize other parameters/variables + + # Initialzie the codon table ( definitions of R/S ) + computeCodonTable(testID) + + # Initialize + # Test Name + testName<-"Focused" + if(testID==2) testName<-"Local" + if(testID==3) testName<-"Imbalanced" + if(testID==4) testName<-"ImbalancedSilent" + + # Indel placeholders initialization + indelPos <- NULL + delPos <- NULL + insPos <- NULL + + # Initialize in Tranistion & Mutability matrixes + substitution <- initializeSubstitutionMatrix(substitutionModel,species) + mutability <- initializeMutabilityMatrix(mutabilityModel,species) + + # FWR/CDR boundaries + flagTrim <- F + if( is.na(region[7])){ + flagTrim <- T + region[7]<-region[6] + } + readStart = min(region,na.rm=T) + readEnd = max(region,na.rm=T) + if(readStart>1){ + region = region - (readStart - 1) + } + region_Nuc = c( (region[1]*3-2) , (region[2:7]*3) ) + region_Cod = region + + readStart = (readStart*3)-2 + readEnd = (readEnd*3) + + FWR_Nuc <- c( rep(TRUE,(region_Nuc[2])), + rep(FALSE,(region_Nuc[3]-region_Nuc[2])), + rep(TRUE,(region_Nuc[4]-region_Nuc[3])), + rep(FALSE,(region_Nuc[5]-region_Nuc[4])), + rep(TRUE,(region_Nuc[6]-region_Nuc[5])), + rep(FALSE,(region_Nuc[7]-region_Nuc[6])) + ) + CDR_Nuc <- (1-FWR_Nuc) + CDR_Nuc <- as.logical(CDR_Nuc) + FWR_Nuc_Mat <- matrix( rep(FWR_Nuc,4), ncol=length(FWR_Nuc), 
nrow=4, byrow=T) + CDR_Nuc_Mat <- matrix( rep(CDR_Nuc,4), ncol=length(CDR_Nuc), nrow=4, byrow=T) + + FWR_Codon <- c( rep(TRUE,(region[2])), + rep(FALSE,(region[3]-region[2])), + rep(TRUE,(region[4]-region[3])), + rep(FALSE,(region[5]-region[4])), + rep(TRUE,(region[6]-region[5])), + rep(FALSE,(region[7]-region[6])) + ) + CDR_Codon <- (1-FWR_Codon) + CDR_Codon <- as.logical(CDR_Codon) + + +# Read input FASTA file + tryCatch( + inputFASTA <- baseline.read.fasta(inputFilePath, seqtype="DNA",as.string=T,set.attributes=F,forceDNAtolower=F) + , error = function(ex){ + cat("Error|Error reading input. Please enter or upload a valid FASTA file.\n") + q() + } + ) + + if (length(inputFASTA)==1) { + cat("Error|Error reading input. Please enter or upload a valid FASTA file.\n") + q() + } + + # Process sequence IDs/names + names(inputFASTA) <- sapply(names(inputFASTA),function(x){trim(x)}) + + # Convert non nucleotide characters to N + inputFASTA[length(inputFASTA)] = gsub("\t","",inputFASTA[length(inputFASTA)]) + inputFASTA <- lapply(inputFASTA,replaceNonFASTAChars) + + # Process the FASTA file and conver to Matrix[inputSequence, germlineSequence] + processedInput <- processInputAdvanced(inputFASTA) + matInput <- processedInput[[1]] + germlines <- processedInput[[2]] + lenGermlines = length(unique(germlines)) + groups <- processedInput[[3]] + lenGroups = length(unique(groups)) + rm(processedInput) + rm(inputFASTA) + +# # remove clones with less than 2 seqeunces +# tableGL <- table(germlines) +# singletons <- which(tableGL<8) +# rowsToRemove <- match(singletons,germlines) +# if(any(rowsToRemove)){ +# matInput <- matInput[-rowsToRemove,] +# germlines <- germlines[-rowsToRemove] +# groups <- groups[-rowsToRemove] +# } +# +# # remove unproductive seqs +# nonFuctionalSeqs <- sapply(rownames(matInput),function(x){any(grep("unproductive",x))}) +# if(any(nonFuctionalSeqs)){ +# if(sum(nonFuctionalSeqs)==length(germlines)){ +# 
write.table("Unproductive",file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T) +# q() +# } +# matInput <- matInput[-which(nonFuctionalSeqs),] +# germlines <- germlines[-which(nonFuctionalSeqs)] +# germlines[1:length(germlines)] <- 1:length(germlines) +# groups <- groups[-which(nonFuctionalSeqs)] +# } +# +# if(class(matInput)=="character"){ +# write.table("All unproductive seqs",file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T) +# q() +# } +# +# if(nrow(matInput)<10 | is.null(nrow(matInput))){ +# write.table(paste(nrow(matInput), "seqs only",sep=""),file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T) +# q() +# } + +# replace leading & trailing "-" with "N: + matInput <- t(apply(matInput,1,replaceLeadingTrailingDashes,readEnd)) + + # Trim (nucleotide) input sequences to the last codon + #matInput[,1] <- apply(matrix(matInput[,1]),1,trimToLastCodon) + +# # Check for Indels +# if(fixIndels){ +# delPos <- fixDeletions(matInput) +# insPos <- fixInsertions(matInput) +# }else{ +# # Check for indels +# indelPos <- checkForInDels(matInput) +# indelPos <- apply(cbind(indelPos[[1]],indelPos[[2]]),1,function(x){(x[1]==T & x[2]==T)}) +# } + + # If indels are present, remove mutations in the seqeunce & throw warning at end + #matInput[indelPos,] <- apply(matrix(matInput[indelPos,],nrow=sum(indelPos),ncol=2),1,function(x){x[1]=x[2]; return(x) }) + + colnames(matInput)=c("Input","Germline") + + # If seqeunces are clonal, create effective sequence for each clone & modify germline/group definitions + germlinesOriginal = NULL + if(clonal){ + germlinesOriginal <- germlines + collapseCloneResults <- tapply(1:nrow(matInput),germlines,function(i){ + collapseClone(matInput[i,1],matInput[i[1],2],readEnd,nonTerminalOnly=(clonal-1)) + }) + matInput = t(sapply(collapseCloneResults,function(x){return(x[[1]])})) + names_groups = tapply(groups,germlines,function(x){names(x[1])}) + groups 
= tapply(groups,germlines,function(x){array(x[1],dimnames=names(x[1]))}) + names(groups) = names_groups + + names_germlines = tapply(germlines,germlines,function(x){names(x[1])}) + germlines = tapply( germlines,germlines,function(x){array(x[1],dimnames=names(x[1]))} ) + names(germlines) = names_germlines + matInputErrors = sapply(collapseCloneResults,function(x){return(x[[2]])}) + } + + +# Selection Analysis + + +# if (length(germlines)>sequenceLimit) { +# # Code to parallelize processing goes here +# stop( paste("Error: Cannot process more than ", Upper_limit," sequences",sep="") ) +# } + +# if (length(germlines)1){ + groups <- c(groups,lenGroups+1) + names(groups)[length(groups)] = "All sequences combined" + bayesPDF_groups_cdr[[lenGroups+1]] = groupPosteriors(bayesPDF_groups_cdr,length_sigma=4001) + bayesPDF_groups_fwr[[lenGroups+1]] = groupPosteriors(bayesPDF_groups_fwr,length_sigma=4001) + } + + #Bayesian Outputs + bayes_cdr = t(sapply(bayesPDF_cdr,calcBayesOutputInfo)) + bayes_fwr = t(sapply(bayesPDF_fwr,calcBayesOutputInfo)) + bayes_germlines_cdr = t(sapply(bayesPDF_germlines_cdr,calcBayesOutputInfo)) + bayes_germlines_fwr = t(sapply(bayesPDF_germlines_fwr,calcBayesOutputInfo)) + bayes_groups_cdr = t(sapply(bayesPDF_groups_cdr,calcBayesOutputInfo)) + bayes_groups_fwr = t(sapply(bayesPDF_groups_fwr,calcBayesOutputInfo)) + + #P-values + simgaP_cdr = sapply(bayesPDF_cdr,computeSigmaP) + simgaP_fwr = sapply(bayesPDF_fwr,computeSigmaP) + + simgaP_germlines_cdr = sapply(bayesPDF_germlines_cdr,computeSigmaP) + simgaP_germlines_fwr = sapply(bayesPDF_germlines_fwr,computeSigmaP) + + simgaP_groups_cdr = sapply(bayesPDF_groups_cdr,computeSigmaP) + simgaP_groups_fwr = sapply(bayesPDF_groups_fwr,computeSigmaP) + + + #Format output + + # Round expected mutation frequencies to 3 decimal places + matMutationInfo[germlinesOriginal[indelPos],] = NA + if(nrow(matMutationInfo)==1){ + matMutationInfo[5:8] = round(matMutationInfo[,5:8]/sum(matMutationInfo[,5:8],na.rm=T),3) + 
}else{ + matMutationInfo[,5:8] = t(round(apply(matMutationInfo[,5:8],1,function(x){ return(x/sum(x,na.rm=T)) }),3)) + } + + listPDFs = list() + nRows = length(unique(groups)) + length(unique(germlines)) + length(groups) + + matOutput = matrix(NA,ncol=18,nrow=nRows) + rowNumb = 1 + for(G in unique(groups)){ + #print(G) + matOutput[rowNumb,c(1,2,11:18)] = c("Group",names(groups)[groups==G][1],bayes_groups_cdr[G,],bayes_groups_fwr[G,],simgaP_groups_cdr[G],simgaP_groups_fwr[G]) + listPDFs[[rowNumb]] = list("CDR"=bayesPDF_groups_cdr[[G]],"FWR"=bayesPDF_groups_fwr[[G]]) + names(listPDFs)[rowNumb] = names(groups[groups==paste(G)])[1] + #if(names(groups)[which(groups==G)[1]]!="All sequences combined"){ + gs = unique(germlines[groups==G]) + rowNumb = rowNumb+1 + if( !is.na(gs) ){ + for( g in gs ){ + matOutput[rowNumb,c(1,2,11:18)] = c("Germline",names(germlines)[germlines==g][1],bayes_germlines_cdr[g,],bayes_germlines_fwr[g,],simgaP_germlines_cdr[g],simgaP_germlines_fwr[g]) + listPDFs[[rowNumb]] = list("CDR"=bayesPDF_germlines_cdr[[g]],"FWR"=bayesPDF_germlines_fwr[[g]]) + names(listPDFs)[rowNumb] = names(germlines[germlines==paste(g)])[1] + rowNumb = rowNumb+1 + indexesOfInterest = which(germlines==g) + numbSeqsOfInterest = length(indexesOfInterest) + rowNumb = seq(rowNumb,rowNumb+(numbSeqsOfInterest-1)) + matOutput[rowNumb,] = matrix( c( rep("Sequence",numbSeqsOfInterest), + rownames(matInput)[indexesOfInterest], + c(matMutationInfo[indexesOfInterest,1:4]), + c(matMutationInfo[indexesOfInterest,5:8]), + c(bayes_cdr[indexesOfInterest,]), + c(bayes_fwr[indexesOfInterest,]), + c(simgaP_cdr[indexesOfInterest]), + c(simgaP_fwr[indexesOfInterest]) + ), ncol=18, nrow=numbSeqsOfInterest,byrow=F) + increment=0 + for( ioi in indexesOfInterest){ + listPDFs[[min(rowNumb)+increment]] = list("CDR"=bayesPDF_cdr[[ioi]] , "FWR"=bayesPDF_fwr[[ioi]]) + names(listPDFs)[min(rowNumb)+increment] = rownames(matInput)[ioi] + increment = increment + 1 + } + rowNumb=max(rowNumb)+1 + + } + } + } + 
colsToFormat = 11:18 + matOutput[,colsToFormat] = formatC( matrix(as.numeric(matOutput[,colsToFormat]), nrow=nrow(matOutput), ncol=length(colsToFormat)) , digits=3) + matOutput[matOutput== " NaN"] = NA + + + + colnames(matOutput) = c("Type", "ID", "Observed_CDR_R", "Observed_CDR_S", "Observed_FWR_R", "Observed_FWR_S", + "Expected_CDR_R", "Expected_CDR_S", "Expected_FWR_R", "Expected_FWR_S", + paste( rep(testName,6), rep(c("Sigma","CIlower","CIupper"),2),rep(c("CDR","FWR"),each=3), sep="_"), + paste( rep(testName,2), rep("P",2),c("CDR","FWR"), sep="_") + ) + fileName = paste(outputPath,outputID,".txt",sep="") + write.table(matOutput,file=fileName,quote=F,sep="\t",row.names=T,col.names=NA) + fileName = paste(outputPath,outputID,".RData",sep="") + save(listPDFs,file=fileName) + +indelWarning = FALSE +if(sum(indelPos)>0){ + indelWarning = "

Warning: The following sequences have either gaps and/or deletions, and have been ommited from the analysis."; + indelWarning = paste( indelWarning , "

    ", sep="" ) + for(indels in names(indelPos)[indelPos]){ + indelWarning = paste( indelWarning , "
  • ", indels, "
  • ", sep="" ) + } + indelWarning = paste( indelWarning , "

", sep="" ) +} + +cloneWarning = FALSE +if(clonal==1){ + if(sum(matInputErrors)>0){ + cloneWarning = "

Warning: The following clones have sequences of unequal length."; + cloneWarning = paste( cloneWarning , "

    ", sep="" ) + for(clone in names(matInputErrors)[matInputErrors]){ + cloneWarning = paste( cloneWarning , "
  • ", names(germlines)[as.numeric(clone)], "
  • ", sep="" ) + } + cloneWarning = paste( cloneWarning , "

", sep="" ) + } +} +cat(paste("Success",outputID,indelWarning,cloneWarning,sep="|")) diff -r 000000000000 -r 8a5a2abbb870 baseline/FiveS_Mutability.RData Binary file baseline/FiveS_Mutability.RData has changed diff -r 000000000000 -r 8a5a2abbb870 baseline/FiveS_Substitution.RData Binary file baseline/FiveS_Substitution.RData has changed diff -r 000000000000 -r 8a5a2abbb870 baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,703 @@ +>IGHV1-18*01 +caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga +>IGHV1-18*02 +caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctaagatctgacgacacggcc +>IGHV1-18*03 +caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacatggccgtgtattactgtgcgagaga +>IGHV1-18*04 +caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctacggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga +>IGHV1-2*01 
+caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccagtaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga +>IGHV1-2*02 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga +>IGHV1-2*03 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcttggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcnacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga +>IGHV1-2*04 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggctgggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga +>IGHV1-2*05 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga +>IGHV1-24*01 +caggtccagctggtacagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggtttccggatacaccctc............actgaattatccatgcactgggtgcgacaggctcctggaaaagggcttgagtggatgggaggttttgatcctgaa......gatggtgaaacaatctacgcacagaagttccag...ggcagagtcaccatgaccgaggacacatctacagacacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcaacaga 
+>IGHV1-3*01 +caggtccagcttgtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggatcaacgctggc......aatggtaacacaaaatattcacagaagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaagacacggctgtgtattactgtgcgagaga +>IGHV1-3*02 +caggttcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggagcaacgctggc......aatggtaacacaaaatattcacaggagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaggacatggctgtgtattactgtgcgagaga +>IGHV1-38-4*01 +caggtccagctggtgcagtcttgggct...gaggtgaggaagtctggggcctcagtgaaagtctcctgtagtttttctgggtttaccatc............accagctacggtatacattgggtgcaacagtcccctggacaagggcttgagtggatgggatggatcaaccctggc......aatggtagcccaagctatgccaagaagtttcag...ggcagattcaccatgaccagggacatgtccacaaccacagcctacacagacctgagcagcctgacatctgaggacatggctgtgtattactatgcaagaca +>IGHV1-45*01 +cagatgcagctggtgcagtctggggct...gaggtgaagaagactgggtcctcagtgaaggtttcctgcaaggcttccggatacaccttc............acctaccgctacctgcactgggtgcgacaggcccccggacaagcgcttgagtggatgggatggatcacacctttc......aatggtaacaccaactacgcacagaaattccag...gacagagtcaccattactagggacaggtctatgagcacagcctacatggagctgagcagcctgagatctgaggacacagccatgtattactgtgcaagana +>IGHV1-45*02 +cagatgcagctggtgcagtctggggct...gaggtgaagaagactgggtcctcagtgaaggtttcctgcaaggcttccggatacaccttc............acctaccgctacctgcactgggtgcgacaggcccccggacaagcgcttgagtggatgggatggatcacacctttc......aatggtaacaccaactacgcacagaaattccag...gacagagtcaccattaccagggacaggtctatgagcacagcctacatggagctgagcagcctgagatctgaggacacagccatgtattactgtgcaagata +>IGHV1-45*03 
+.....................................agaagactgggtcctcagtgaaggtttcctgcaaggcttccggatacaccttc............acctaccgctacctgcactgggtgcgacaggcccccagacaagcgcttgagtggatgggatggatcacacctttc......aatggtaacaccaactacgcacagaaattccag...gacagagtcaccattaccagggacaggtctatgagcacagcctacatggagctgagcagcctgagatctgaggacacagccatgtattactgtgcaaga +>IGHV1-46*01 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcatctggatacaccttc............accagctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggaataatcaaccctagt......ggtggtagcacaagctacgcacagaagttccag...ggcagagtcaccatgaccagggacacgtccacgagcacagtctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1-46*02 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcatctggatacaccttc............aacagctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggaataatcaaccctagt......ggtggtagcacaagctacgcacagaagttccag...ggcagagtcaccatgaccagggacacgtccacgagcacagtctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1-46*03 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcatctggatacaccttc............accagctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggaataatcaaccctagt......ggtggtagcacaagctacgcacagaagttccag...ggcagagtcaccatgaccagggacacgtccacgagcacagtctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgctagaga +>IGHV1-58*01 +caaatgcagctggtgcagtctgggcct...gaggtgaagaagcctgggacctcagtgaaggtctcctgcaaggcttctggattcaccttt............actagctctgctgtgcagtgggtgcgacaggctcgtggacaacgccttgagtggataggatggatcgtcgttggc......agtggtaacacaaactacgcacagaagttccag...gaaagagtcaccattaccagggacatgtccacaagcacagcctacatggagctgagcagcctgagatccgaggacacggccgtgtattactgtgcggcaga +>IGHV1-58*02 +caaatgcagctggtgcagtctgggcct...gaggtgaagaagcctgggacctcagtgaaggtctcctgcaaggcttctggattcaccttt............actagctctgctatgcagtgggtgcgacaggctcgtggacaacgccttgagtggataggatggatcgtcgttggc......agtggtaacacaaactacgcacagaagttccag...gaaagagtcaccattaccagggacatgtccacaagcacagcctacatggagctgagcagcctgagatccgaggacacggccgtgtattactgtgcggcaga 
+>IGHV1-68*01 +caggtgcagctggggcagtctgaggct...gaggtaaagaagcctggggcctcagtgaaggtctcctgcaaggcttccggatacaccttc............acttgctgctccttgcactggttgcaacaggcccctggacaagggcttgaaaggatgagatggatcacactttac......aatggtaacaccaactatgcaaagaagttccag...ggcagagtcaccattaccagggacatgtccctgaggacagcctacatagagctgagcagcctgagatctgaggactcggctgtgtattactgggcaagata +>IGHV1-69*01 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggagggatcatccctatc......tttggtacagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacgaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1-69*02 +caggtccagctggtgcaatctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatactatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggaaggatcatccctatc......cttggtatagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacaaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgaga +>IGHV1-69*03 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggagggatcatccctatc......tttggtacagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacgaatccacgagcacagcctacatggagctgagcagcctgagatctgatgacacggc +>IGHV1-69*04 +caggtccagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggaaggatcatccctatc......cttggtatagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacaaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1-69*05 +caggtccagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggagggatcatccctatc......tttggtacagcaaactacgcacagaagttccag...ggcagagtcacgattaccacggacgaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgaga 
+>IGHV1-69*06 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggagggatcatccctatc......tttggtacagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacaaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1-69*07 +.....................................agaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggaaggatcatccctatc......tttggtacagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacgaatccacgagcacagcctacatggagctgagcagcctgagatctgag +>IGHV1-69*08 +caggtccagctggtgcaatctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatactatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggaaggatcatccctatc......cttggtacagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacaaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1-69*09 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggaaggatcatccctatc......cttggtatagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacaaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1-69*10 +caggtccagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcagtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggagggatcatccctatc......cttggtatagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacaaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1-69*11 +caggtccagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggaaggatcatccctatc......cttggtacagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacgaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga 
+>IGHV1-69*12 +caggtccagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggagggatcatccctatc......tttggtacagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacgaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1-69*13 +caggtccagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcagtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggagggatcatccctatc......tttggtacagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacgaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1-69*14 +caggtccagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggagggatcatccctatc......tttggtacagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacaaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1-69-2*01 +gaggtccagctggtacagtctggggct...gaggtgaagaagcctggggctacagtgaaaatctcctgcaaggtttctggatacaccttc............accgactactacatgcactgggtgcaacaggcccctggaaaagggcttgagtggatgggacttgttgatcctgaa......gatggtgaaacaatatacgcagagaagttccag...ggcagagtcaccataaccgcggacacgtctacagacacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcaacaga +>IGHV1-69-2*02 +.....................................agaagcctggggctacagtgaaaatctcctgcaaggtttctggatacaccttc............accgactactacatgcactgggtgcaacaggcccctggaaaagggcttgagtggatgggacttgttgatcctgaa......gatggtgaaacaatatatgcagagaagttccag...ggcagagtcaccataaccgcggacacgtctacagacacagcctacatggagctgagcagcctgagatctgag +>IGHV1-69D*01 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttc............agcagctatgctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggagggatcatccctatc......tttggtacagcaaactacgcacagaagttccag...ggcagagtcacgattaccgcggacgaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga 
+>IGHV1-8*01 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accagttatgatatcaactgggtgcgacaggccactggacaagggcttgagtggatgggatggatgaaccctaac......agtggtaacacaggctatgcacagaagttccag...ggcagagtcaccatgaccaggaacacctccataagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagagg +>IGHV1-8*02 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accagctatgatatcaactgggtgcgacaggccactggacaagggcttgagtggatgggatggatgaaccctaac......agtggtaacacaggctatgcacagaagttccag...ggcagagtcaccatgaccaggaacacctccataagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagagg +>IGHV1-NL1*01 +caggttcagctgttgcagcctggggtc...caggtgaagaagcctgggtcctcagtgaaggtctcctgctaggcttccagatacaccttc............accaaatactttacacggtgggtgtgacaaagccctggacaagggcatnagtggatgggatgaatcaacccttac......aacgataacacacactacgcacagacgttctgg...ggcagagtcaccattaccagtgacaggtccatgagcacagcctacatggagctgagcngcctgagatccgaagacatggtcgtgtattactgtgtgagaga +>IGHV1/OR15-1*01 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacatcttc............accgactactatatgcactgggtgcgacaggcccctggacaagagcttgggtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagagtcaccatgaccagggacacgtccatcagcacagcctacacggagctgagcagcctgagatctgaggacacggccacgtattactgtgcgaga +>IGHV1/OR15-1*02 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacatcttc............accgactactatatgcactgggtgcgacaggcccctggacaagagcttgggtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagagtcaccatgaccagggacacgtccatcagcacagcctgcacggagctgagcagcctgagatctgaggacacggccacgtattactgtgcgagaga +>IGHV1/OR15-1*03 
+caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacatcttc............accgactactatatgcactgggtgcgacaggcccctggacaagagcttgggtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagagtcaccatgaccagggacacgtccatcagcacagcctacacggagctgagcagcctgagatctgaggacacagccacgtattactgtgcgagaga +>IGHV1/OR15-1*04 +caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacatcttc............accgactactatatgcactgggtgcgacaggcccctggacaagagcttgggtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagagtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcagcctgagatctgaggacacggccacgtattactgtgcgagaga +>IGHV1/OR15-2*01 +caggtgcagctggtgcagtctggagct...gaggtgaagaagcctagagcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctactatatgcactgggtgtgacaggcccctgaacaagggcttgagtggatgggatggatcaacacttac......aatggtaacacaaactacccacagaagctccag...ggcagagtcaccatgaccagagacacatccacgagcacagcctacatggagctgagcaggctgagatctgacgacatggccgtgtattactgtgcgagaga +>IGHV1/OR15-2*02 +caggtgcagctggtgcagtctggagct...gaggtgaagaagcctggagcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctactatatgcactgggtgtgacaggcccctgaacaagggcttgagtggatgggatggatcaacacttac......aatggtaacacaaactacccacagaagctccag...ggcagagtcaccatgaccagagacacatccacgagcacagcctacatggagctgagcagcctgagatctgacgacatggccgtgtattactgtgcgagaga +>IGHV1/OR15-2*03 +caggtgcagctggtgcagtctggagct...gaggtgaagaagcctagagcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctactatatgcactgggtgtgacaggcccctgaacaagggcttgagtggatgggatggatcaacacttac......aatggtaacacaaactacccacagaagctccag...ggcagagtcaccatgaccagagacacatccacgagcacagcctacatggagctgagcagcctgagatctgacgacatggccgtgtattactgtgcgagaga +>IGHV1/OR15-3*01 
+caggtccaactggtgtagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accgactactttatgaactggatgcgccaggcccctggacaaaggcttgagtggatgggatggatcaacgctggc......aatggtaacacaaaatattcacagaagctccag...ggcagagtcaccattaccagggacacatcttcgagcacagcctacatgcagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgaga +>IGHV1/OR15-3*02 +caggtccaactggtgtagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accgactactttatgaactggatgcgccaggcccctggacaaaggcttgagtggatgggatggatcaacgctggc......aatggtaacacaaaatattcacagaagctccag...ggcagagtcaccattaccagggacacatctgcgagcacagcctacatgcagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga +>IGHV1/OR15-3*03 +caggtccaactggtgtagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accagctactatatgaactggatgcgccaggcccctggacaaggcttcgagtggatgggatggatcaacgctggc......aatggtaacacaaagtattcacagaagctccag...ggcagagtcaccattaccagggacacatctgcgagcacagcctacatgcagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgaga +>IGHV1/OR15-4*01 +caggaccagttggtgcagtctggggct...gaggtgaagaagcctctgtcctcagtgaaggtctccttcaaggcttctggatacaccttc............accaacaactttatgcactgggtgtgacaggcccctggacaaggacttgagtggatgggatggatcaatgctggc......aatggtaacacaacatatgcacagaagttccag...ggcagagtcaccataaccagggacacgtccatgagcacagcctacacggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgaga +>IGHV1/OR15-5*01 +.....................................agaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accagctactgtatgcactgggtgcaccaggtccatgcacaagggcttgagtggatgggattggtgtgccctagt......gatggcagcacaagctatgcacagaagttccag...gccagagtcaccataaccagggacacatccatgagcacagcctacatggagctaagcagtctgagatctgaggacacggccatgtattactgtgtgaga +>IGHV1/OR15-5*02 
+caggtacagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accaactactgtatgcactgggtgcgccaggtccatgcacaagggcttgagtggatgggattggtgtgccctagt......gatggcagcacaagctatgcacaaaagttccag...gccagagtcaccataaccagggacacatccatgagcacagcctacatggagctaagcagtctgagatctgaggacacggccatgtattactgtgtgaga +>IGHV1/OR15-9*01 +caggtacagctgatgcagtctggggct...gaggtgaagaagcctggggcctcagtgaggatctcctgcaaggcttctggatacaccttc............accagctactgtatgcactgggtgtgccaggcccatgcacaagggcttgagtggatgggattggtgtgccctagt......gatggcagcacaagctatgcacagaagttccag...ggcagagtcaccataaccagggacacatccatgggcacagcctacatggagctaagcagcctgagatctgaggacacggccatgtattactgtgtgagaga +>IGHV1/OR21-1*01 +caggtacagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccatc............accagctactgtatgcactgggtgcaccaggtccatgcacaagggcttgagtggatgggattggtgtgccctagt......gatggcagcacaagctatgcacagaagttccag...gccagagtcaccataaccagggacacatccatgagcacagcctacatggagctaagcagtctgagatctgaggacacggccatgtattactgtgtgagaga +>IGHV2-10*01 +caggtcaccttgaaggagtctggtcct...gcactggtgaaacccacacagaccctcatgctgacctgcaccttctctgggttctcactcagc......acttctggaatgggtgtgggttagatctgtcagccctcagcaaaggccctggagtggcttgcacacatttattagaat.........gataataaatactacagcccatctctgaag...agtaggctcattatctccaaggacacctccaagaatgaagtggttctaacagtgatcaacatggacattgtggacacagccacacattactgtgcaaggagac +>IGHV2-26*01 +caggtcaccttgaaggagtctggtcct...gtgctggtgaaacccacagagaccctcacgctgacctgcaccgtctctgggttctcactcagc......aatgctagaatgggtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcacacattttttcgaat.........gacgaaaaatcctacagcacatctctgaag...agcaggctcaccatctccaaggacacctccaaaagccaggtggtccttaccatgaccaacatggaccctgtggacacagccacatattactgtgcacggatac +>IGHV2-5*01 
+cagatcaccttgaaggagtctggtcct...acgctggtgaaacccacacagaccctcacgctgacctgcaccttctctgggttctcactcagc......actagtggagtgggtgtgggctggatccgtcagcccccaggaaaggccctggagtggcttgcactcatttattggaat.........gatgataagcgctacagcccatctctgaag...agcaggctcaccatcaccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacatattactgtgcacacagac +>IGHV2-5*02 +cagatcaccttgaaggagtctggtcct...acgctggtgaaacccacacagaccctcacgctgacctgcaccttctctgggttctcactcagc......actagtggagtgggtgtgggctggatccgtcagcccccaggaaaggccctggagtggcttgcactcatttattgggat.........gatgataagcgctacagcccatctctgaag...agcaggctcaccatcaccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacatattactgtgcacacagac +>IGHV2-5*03 +................................gctggtgaaacccacacagaccctcacgctgacctgcaccttctctgggttctcactcagc......actagtggagtgggtgtgggctggatccgtcagcccccaggaaaggccctggagtggcttgcactcatttattgggat.........gatgataagcgctacagcccatctctgaag...agcaggctcaccattaccaaggacacctccaaaaaccaggt +>IGHV2-5*04| +cagatcaccttgaaggagtctggtcct...acgctggtgaaacccacacagaccctcacgctgacctgcaccttctctgggttctcactcagc......actagtggagtgggtgtgggctggatccgtcagcccccaggaaaggccctggagtggcttgcactcatttattggaat.........gatgataagcgctacagcccatctctgaag...agcaggctcaccatcaccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacaggcacatattactgtgtac +>IGHV2-5*05 +cagatcaccttgaaggagtctggtcct...acgctggtgaaacccacacagaccctcacgctgacctgcaccttctctgggttctcactcagc......actagtggagtgggtgtgggctggatccgtcagcccccaggaaaggccctggagtggcttgcactcatttattgggat.........gatgataagcgctacggcccatctctgaag...agcaggctcaccatcaccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacatattactgtgcacacagac +>IGHV2-5*06 +cagatcaccttgaaggagtctggtcct...acgctggtaaaacccacacagaccctcacgctgacctgcaccttctctgggttctcactcagc......actagtggagtgggtgtgggctggatccgtcagcccccaggaaaggccctggagtggcttgcactcatttattgggat.........gatgataagcgctacggcccatctctgaag...agcaggctcaccatcaccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacatattactgtgcacacaga +>IGHV2-5*08 
+caggtcaccttgaaggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgcgtgtgagctggatccgtcagcccccaggaaaggccctggagtggcttgcactcatttattgggat.........gatgataagcgctacagcccatctctgaag...agcaggctcaccatcaccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacatattactgtgcacacagac +>IGHV2-5*09 +caggtcaccttgaaggagtctggtcct...acgctggtgaaacccacacagaccctcacgctgacctgcaccttctctgggttctcactcagc......actagtggagtgggtgtgggctggatccgtcagcccccaggaaaggccctggagtggcttgcactcatttattgggat.........gatgataagcgctacggcccatctctgaag...agcaggctcaccatcaccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacatattactgtgcacacagac +>IGHV2-70*01 +caggtcaccttgagggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgtgtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcactcattgattgggat.........gatgataaatactacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacgtattactgtgcacggatac +>IGHV2-70*02 +caggtcaccttgagggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgtgtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcactcattgattgggat.........gatgataaatactacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacggccgtgtattactg +>IGHV2-70*03 +caggtcaccttgaaggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgcgtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcacgcattgattgggat.........gatgataaattctacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacggccgtgtattactg +>IGHV2-70*04 +caggtcaccttgaaggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgcgtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcacgcattgattgggat.........gatgataaattctacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacgtattac +>IGHV2-70*05 
+..........................t...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgcgtgcgagctggatccgtcagcccccagggaaggccctggagtggcttgcacgcattgattgggat.........gatgataaattctacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatgga +>IGHV2-70*06 +caggtcaccttgaaggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgcgtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcacgcattgattgggat.........gatgataaattctacagcacatccctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacggccgtgtattactg +>IGHV2-70*07 +caggtcaccttgagggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgtgtgtgagctggatccgtcagcccccggggaaggccctggagtggcttgcactcattgattgggat.........gatgataaatactacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacggccgtgtattactg +>IGHV2-70*08 +caggtcaccttgagggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcgccttctctgggttctcactcagc......actagtggaatgtgtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcacgcattgattgggat.........gatgataaatactacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacggccgtgtattactg +>IGHV2-70*09 +cagatcaccttgaaggagtctggtcct...acgctggtgaaacccacacagaccctcacgctgacccgcaccttctctgggttctcactcagc......actagtggaatgtgtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcactcattgattgggat.........gatgataaatactacagcacatctctgaac...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacaggcacatattactgtgtacgg +>IGHV2-70*10 +caggtcaccttgaaggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgcgtgtgagctggatccgtcagcccccagggaaggccctggagtggattgcacgcattgattgggat.........gatgataaatactacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacgtattactgtgcacggatac +>IGHV2-70*11 
+cgggtcaccttgagggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgtgtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcacgcattgattgggat.........gatgataaatactacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacgtattactgtgcacggatac +>IGHV2-70*12 +cagatcaccttgaaggagtctggtcct...acgctggtgaaacccacacagaccctcacgctgacctgcaccttctctgggttctcactcagc......actagtggaatgtgtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcactcattgattgggat.........gatgataaatactacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacatattactgtgcacacagac +>IGHV2-70*13 +caggtcaccttgagggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgtgtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcactcattgattgggat.........gatgataaatactacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacgtattattgtgcacggatac +>IGHV2-70D*04 +caggtcaccttgaaggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgcgtgtgagctggatccgtcagcccccagggaaggccctggagtggcttgcacgcattgattgggat.........gatgataaattctacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacgtattactgtgcacggatac +>IGHV2-70D*14 +caggtcaccttgaaggagtctggtcct...gcgctggtgaaacccacacagaccctcacactgacctgcaccttctctgggttctcactcagc......actagtggaatgcgtgtgagctggatccgtcagcccccaggtaaggccctggagtggcttgcacgcattgattgggat.........gatgataaattctacagcacatctctgaag...accaggctcaccatctccaaggacacctccaaaaaccaggtggtccttacaatgaccaacatggaccctgtggacacagccacgtattactgtgcacggatac +>IGHV2/OR16-5*01 
+caggtcaccttgaaggagtctggtcct...gcgctggtgaaacccacagagaccctcacgctgacctgcactctctctgggttctcactcagc......acttctggaatgggtatgagctggatccgtcagcccccagggaaggccctggagtggcttgctcacatttttttgaat.........gacaaaaaatcctacagcacgtctctgaag...aacaggctcatcatctccaaggacacctccaaaagccaggtggtccttaccatgaccaacatggaccctgtggacacagccacgtattactgtgcatggagag +>IGHV3-11*01 +caggtgcagctggtggagtctggggga...ggcttggtcaagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtgactactacatgagctggatccgccaggctccagggaaggggctggagtgggtttcatacattagtagtagt......ggtagtaccatatactacgcagactctgtgaag...ggccgattcaccatctccagggacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggccgtgtattactgtgcgagaga +>IGHV3-11*03 +caggtgcagctgttggagtctggggga...ggcttggtcaagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtgactactacatgagctggatccgccaggctccagggaaggggctggagtgggtttcatacattagtagtagt......agtagttacacaaactacgcagactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggccgtgtattactgtgcgaga +>IGHV3-11*04 +caggtgcagctggtggagtctggggga...ggcttggtcaagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtgactactacatgagctggatccgccaggctccagggaaggggctggagtgggtttcatacattagtagtagt......ggtagtaccatatactacgcagactctgtgaag...ggccgattcaccatctccagggacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-11*05 +caggtgcagctggtggagtctggggga...ggcttggtcaagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtgactactacatgagctggatccgccaggctccagggaaggggctggagtgggtttcatacattagtagtagt......agtagttacacaaactacgcagactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggccgtgtattactgtgcgagaga +>IGHV3-11*06 
+caggtgcagctggtggagtctggggga...ggcttggtcaagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtgactactacatgagctggatccgccaggctccagggaaggggctggagtgggtttcatacattagtagtagt......agtagttacacaaactacgcagactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-13*01 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctacgacatgcactgggtccgccaagctacaggaaaaggtctggagtgggtctcagctattggtactgct.........ggtgacacatactatccaggctccgtgaag...ggccgattcaccatctccagagaaaatgccaagaactccttgtatcttcaaatgaacagcctgagagccggggacacggctgtgtattactgtgcaagaga +>IGHV3-13*02 +gaggtgcatctggtggagtctggggga...ggcttggtacagcctgggggggccctgagactctcctgtgcagcctctggattcaccttc............agtaactacgacatgcactgggtccgccaagctacaggaaaaggtctggagtgggtctcagccaatggtactgct.........ggtgacacatactatccaggctccgtgaag...gggcgattcaccatctccagagaaaatgccaagaactccttgtatcttcaaatgaacagcctgagagccggggacacggctgtgtattactgtgcaagaga +>IGHV3-13*03 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctgtggattcaccttc............agtagctacgacatgcactgggtccgccaagctacaggaaaaggtctggagtgggtctcagctattggtactgct.........ggtgacacatactatccaggctccgtgaag...ggccaattcaccatctccagagaaaatgccaagaactccttgtatcttcaaatgaacagcctgagagccggggacacggctgtgtattactgtgcaaga +>IGHV3-13*04 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctacgacatgcactgggtccgccaagctacaggaaaaggtctggaatgggtctcagctattggtactgct.........ggtgacacatactatccaggctccgtgaag...ggccgattcaccatctccagagaaaatgccaagaactccttgtatcttcaaatgaacagcctgagagccggggacacggctgtgtattactgtgcaagaga +>IGHV3-13*05 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctacgacatgcactgggtccgccaagctacaggaaaaggtctggagtgggtctcagctattggtactgct.........ggtgacccatactatccaggctccgtgaag...ggccgattcaccatctccagagaaaatgccaagaactccttgtatcttcaaatgaacagcctgagagccggggacacggctgtgtattactgtgcaagaga 
+>IGHV3-15*01 +gaggtgcagctggtggagtctggggga...ggcttggtaaagcctggggggtcccttagactctcctgtgcagcctctggattcactttc............agtaacgcctggatgagctgggtccgccaggctccagggaaggggctggagtgggttggccgtattaaaagcaaaactgatggtgggacaacagactacgctgcacccgtgaaa...ggcagattcaccatctcaagagatgattcaaaaaacacgctgtatctgcaaatgaacagcctgaaaaccgaggacacagccgtgtattactgtaccacaga +>IGHV3-15*02 +gaggtgcagctggtggagtctggggga...gccttggtaaagcctggggggtcccttagactctcctgtgcagcctctggattcactttc............agtaacgcctggatgagctgggtccgccaggctccagggaaggggctggagtgggttggccgtattaaaagcaaaactgatggtgggacaacagactacgctgcacccgtgaaa...ggcagattcaccatctcaagagatgattcaaaaaacacgctgtatctgcaaatgaacagcctgaaaaccgaggacacagccgtgtattactgtaccacaga +>IGHV3-15*03 +gaggtgcagctggtggagtctgccgga...gccttggtacagcctggggggtcccttagactctcctgtgcagcctctggattcacttgc............agtaacgcctggatgagctgggtccgccaggctccagggaaggggctggagtgggttggccgtattaaaagcaaagctaatggtgggacaacagactacgctgcacctgtgaaa...ggcagattcaccatctcaagagttgattcaaaaaacacgctgtatctgcaaatgaacagcctgaaaaccgaggacacagccgtgtattactgtaccacaga +>IGHV3-15*04 +gaggtgcagctggtggagtctggggga...ggcttggtaaagcctggggggtcccttagactctcctgtgcagcctctggattcactttc............agtaacgcctggatgagctgggtccgccaggctccagggaaggggctggagtgggttggccgtattgaaagcaaaactgatggtgggacaacagactacgctgcacccgtgaaa...ggcagattcaccatctcaagagatgattcaaaaaacacgctgtatctgcaaatgaacagcctgaaaaccgaggacacagccgtgtattactgtaccacaga +>IGHV3-15*05 +gaggtgcagctggtggagtctggggga...ggcttggtaaagcctggggggtcccttagactctcctgtgcagcctctggattcactttc............agtaacgcctggatgagctgggtccgccaggctccagggaaggggctggagtgggttggccgtattaaaagcaaaactgatggtgggacaacagactacgctgcacccgtgaaa...ggcagattcaccatctcaagagatgattcaaaaaacacgctgtatctgcaaatgaacagtctgaaaaccgaggacacagccgtgtattactgtaccacaga +>IGHV3-15*06 
+gaggtgcagctggtggagtctggggga...ggcttggtaaagcctggggggtcccttagactctcctgtgcagcctctggattcactttc............agtaacgcctggatgagctgggtccgccaggctccagggaaggggctggagtgggtcggccgtattaaaagcaaaactgatggtgggacaacaaactacgctgcacccgtgaaa...ggcagattcaccatctcaagagatgattcaaaaaacacgctgtatctgcaaatgaacagcctgaaaaccgaggacacagccgtgtattactgtaccacaga +>IGHV3-15*07 +gaggtgcagctggtggagtctggggga...ggcttggtaaagcctggggggtcccttagactctcctgtgcagcctctggtttcactttc............agtaacgcctggatgaactgggtccgccaggctccagggaaggggctggagtgggtcggccgtattaaaagcaaaactgatggtgggacaacagactacgctgcacccgtgaaa...ggcagattcaccatctcaagagatgattcaaaaaacacgctgtatctgcaaatgaacagcctgaaaaccgaggacacagccgtgtattactgtaccacaga +>IGHV3-15*08 +gaggtgcagctggtggagtctgcggga...ggcttggtacagcctggggggtcccttagactctcctgtgcagcctctggattcacttgc............agtaacgcctggatgagctgggtccgccaggctccagggaaggggctggagtgggttggctgtattaaaagcaaagctaatggtgggacaacagactacgctgcacctgtgaaa...ggcagattcaccatctcaagagatgattcaaaaaacacgctgtatctgcaaatgatcagcctgaaaaccgaggacacggccgtgtattactgtaccacagg +>IGHV3-16*01 +gaggtacaactggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtaacagtgacatgaactgggcccgcaaggctccaggaaaggggctggagtgggtatcgggtgttagttggaat......ggcagtaggacgcactatgtggactccgtgaag...cgccgattcatcatctccagagacaattccaggaactccctgtatctgcaaaagaacagacggagagccgaggacatggctgtgtattactgtgtgagaaa +>IGHV3-16*02 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtaacagtgacatgaactgggcccgcaaggctccaggaaaggggctggagtgggtatcgggtgttagttggaat......ggcagtaggacgcactatgtggactccgtgaag...cgccgattcatcatctccagagacaattccaggaactccctgtatctgcaaaagaacagacggagagccgaggacatggctgtgtattactgtgtgagaaa +>IGHV3-19*01 
+acagtgcagctggtggagtctggggga...ggcttggtagagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtaacagtgacatgaactgggtccgccaggctccaggaaaggggctggagtgggtatcgggtgttagttggaat......ggcagtaggacgcactatgcagactctgtgaag...ggccgattcatcatctccagagacaattccaggaacttcctgtatcagcaaatgaacagcctgaggcccgaggacatggctgtgtattactgtgtgagaaa +>IGHV3-20*01 +gaggtgcagctggtggagtctggggga...ggtgtggtacggcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............gatgattatggcatgagctgggtccgccaagctccagggaaggggctggagtgggtctctggtattaattggaat......ggtggtagcacaggttatgcagactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactccctgtatctgcaaatgaacagtctgagagccgaggacacggccttgtatcactgtgcgagaga +>IGHV3-20*02 +gaggtgcagctggtggagtctggggga...ggtgtggtacggcctggggggtccctgagactctcctttgcagcctctggattcaccttt............gatgattatggcatgagctgggtccgccaagctccagggaaggggctggagtgggtctctggtattaattggaat......ggtggtagcacaggttatgcagactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactccctgtatctgcaaatgaacagtctgagagccgaggacacggccttgtatcactgtgcgagaga +>IGHV3-21*01 +gaggtgcagctggtggagtctggggga...ggcctggtcaagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatagcatgaactgggtccgccaggctccagggaaggggctggagtgggtctcatccattagtagtagt......agtagttacatatactacgcagactcagtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-21*02 +gaggtgcaactggtggagtctggggga...ggcctggtcaagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatagcatgaactgggtccgccaggctccagggaaggggctggagtgggtctcatccattagtagtagt......agtagttacatatactacgcagactcagtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-21*03 
+gaggtgcagctggtggagtctggggga...ggcctggtcaagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatagcatgaactgggtccgccaggctccagggaaggggctggagtgggtctcatccattagtagtagt......agtagttacatatactacgcagactcagtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacagctgtgtattactgtgcgagaga +>IGHV3-21*04 +gaggtgcagctggtggagtctggggga...ggcctggtcaagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatagcatgaactgggtccgccaggctccagggaaggggctggagtgggtctcatccattagtagtagt......agtagttacatatactacgcagactcagtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggccgtgtattactgtgcgagaga +>IGHV3-22*01 +gaggtgcatctggtggagtctggggga...gccttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agttactactacatgagcggggtccgccaggctcccgggaaggggctggaatgggtaggtttcattagaaacaaagctaatggtgggacaacagaatagaccacgtctgtgaaa...ggcagattcacaatctcaagagatgattccaaaagcatcacctatctgcaaatgaagagcctgaaaaccgaggacacggccgtgtattactgttccagaga +>IGHV3-22*02 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agttactactacatgagcggggtccgccaggctcccgggaaggggctggaatgggtaggtttcattagaaacaaagctaatggtgggacaacagaatagaccacgtctgtgaaa...ggcagattcacaatctcaagagatgattccaaaagcatcacctatctgcaaatgaagagcctgaaaaccgaggacacggccgtgtattactgttccagaga +>IGHV3-23*01 +gaggtgcagctgttggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............agcagctatgccatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagctattagtggtagt......ggtggtagcacatactacgcagactccgtgaag...ggccggttcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggccgtatattactgtgcgaaaga +>IGHV3-23*02 
+gaggtgcagctgttggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............agcagctatgccatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagctattagtggtagt......ggtggtagcacatactacggagactccgtgaag...ggccggttcaccatctcaagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggccgtatattactgtgcgaaaga +>IGHV3-23*03 +gaggtgcagctgttggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............agcagctatgccatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagttatttatagcggt......ggtagtagcacatactatgcagactccgtgaag...ggccggttcaccatctccagagataattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggccgtatattactgtgcgaaaga +>IGHV3-23*04 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............agcagctatgccatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagctattagtggtagt......ggtggtagcacatactacgcagactccgtgaag...ggccggttcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggccgtatattactgtgcgaaaga +>IGHV3-23*05 +gaggtgcagctgttggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............agcagctatgccatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagctatttatagcagt......ggtagtagcacatactatgcagactccgtgaag...ggccggttcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggccgtatattactgtgcgaaa +>IGHV3-23D*01 +gaggtgcagctgttggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............agcagctatgccatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagctattagtggtagt......ggtggtagcacatactacgcagactccgtgaag...ggccggttcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggccgtatattactgtgcgaaaga +>IGHV3-23D*02 
+gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............agcagctatgccatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagctattagtggtagt......ggtggtagcacatactacgcagactccgtgaag...ggccggttcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggccgtatattactgtgcgaaaga +>IGHV3-25*01 +gagatgcagctggtggagtctggggga...ggcttgcaaaagcctgcgtggtccccgagactctcctgtgcagcctctcaattcaccttc............agtagctactacatgaactgtgtccgccaggctccagggaatgggctggagttggtttgacaagttaatcctaat......gggggtagcacatacctcatagactccggtaag...gaccgattcaatacctccagagataacgccaagaacacacttcatctgcaaatgaacagcctgaaaaccgaggacacggccctctattagtgtaccagaga +>IGHV3-25*02 +gagatgcagctggtggagtctggggga...ggcttggcaaagcctgcgtggtccccgagactctcctgtgcagcctctcaattcaccttc............agtagctactacatgaactgtgtccgccaggctccagggaatgggctggagttggtttgacaagttaatcctaat......gggggtagcacatacctcatagactccggtaag...gaccgattcaatacctccagagataacgccaagaacacacttcatctgcaaatgaacagcctgaaaaccgaggacacggccctctattagtgtaccagaga +>IGHV3-25*03 +gagatgcagctggtggagtctggggga...ggcttggcaaagcctgcgtggtccccgagactctcctgtgcagcctctcaattcaccttc............agtagctactacatgaactgtgtccgccaggctccagggaatgggctggagttggttggacaagttaatcctaat......gggggtagcacatacctcatagactccggtaag...gaccgattcaatacctccagagataacgccaagaacacacttcatctgcaaatgaacagcctgaaaaccgaggacacggccctgtattagtgtaccaga +>IGHV3-25*04 +gagacgcagctggtggagtctggggga...ggcttggcaaagcctgggcggtccccgagactctcctgtgcagcctctcaattcaccttc............agtagctactacatgaactgtgtccgccaggctccagggaatgggctggagttggttggacaagttaatcctaat......gggggtagcacatacctcatagactccggtaag...gaccgattcaatacctccagagataacgccaagaacacacttcatctgcaaatgaacagcctgaaaaccgaggacacggccctgtattactgtaccagaga +>IGHV3-25*05 +gagatgcagctggtggagtctggggga...ggcttggcaaagcctgcgtggtccccgagactctcctgtgcagcctctcaattcaccttc............agtagctactacatgaactgtgtccgccaggctccagggaatgggctggagttggttggacaagttaatcctaat......gggggtagcacatacctcatagactccggtaag...gaccgattcaatacctccagagataacgccaagaacacacttcatctgcaaatgaacagcctgaaaaccgaggacacggccctctattagtgtaccagaga 
+>IGHV3-29*01 +gaggtggagctgatagagcccacagag...gacctgagacaacctgggaagttcctgagactctcctgtgtagcctctagattcgccttc............agtagcttctgaatgagcccagttcaccagtctgcaggcaaggggctggagtgagtaatagatataaaagatgat......ggaagtcagatacaccatgcagactctgtgaag...ggcagattctccatctccaaagacaatgctaagaactctctgtatctgcaaatgaacagtcagagaactgaggacatggctgtgtatggctgtacataaggtt +>IGHV3-30*01 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*02 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctggggggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcatttatacggtatgat......ggaagtaataaatactatgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgaaaga +>IGHV3-30*03 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatcatatgat......ggaagtaataaatactatgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*04 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*05 
+caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgagggcacggctgtgtattactgtgcgagaga +>IGHV3-30*06 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*07 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*08 +caggtgcagctggtggactctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctgcattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgaga +>IGHV3-30*09 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcgccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*10 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacacagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga 
+>IGHV3-30*11 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*12 +caggtgcagctggtggagtctgggggg...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*13 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacaggctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*14 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*15 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgagcagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*16 
+caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggccccaggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*17 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccgggcaaggggctagagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30*18 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatcatatgat......ggaagtaataaatactatgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgaaaga +>IGHV3-30*19 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30-2*01 +gaggtacagctcgtggagtccggagag...gacccaagacaacctgggggatccctgagactctcctgtgcagactctggattaaccttc............agtagctactgaaggaactcggtttcccaggctccagggaaggggctggagtgagtagtagatatacagtgtgat......ggaagtcagatatgttatgcataatctttgaag...agcaaattcaccatctccaaagaaaatgccaagaactcactgtatttgctaatgaacagtctgagagcagcgggcacagctgtgtgttactgtatgtgaggca +>IGHV3-30-22*01 
+gaggtggagctgatagagtccatagag...gacctgagacaacctgggaagttcctgagactctcctgtgtagcctctagattcgccttc............agtagcttctgaatgagccgagttcaccagtctccaggcaaggggctggagtgagtaatagatataaaagatgat......ggaagtcagatacaccatgcagactctgtgaag...ggcagattctccatctccaaagacaatgctaagaactctctgtatctgcaaatgaacagtcagagagctgaggacatggacgtgtatggctgtacataaggtc +>IGHV3-30-3*01 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatcatatgat......ggaagcaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30-3*02 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatcatatgat......ggaagcaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgaaaga +>IGHV3-30-3*03 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatcatatgat......ggaagtaataaatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-30-33*01 +gaggtacagctcgtggagtccggagag...gacccaagacaacctgggggatccctgagactctcctgtgcagactctggattaaccttc............agtagctactgaaggagctcggtttcccaggctccagggaaggggctggagtgagtagtagatatacagtgtgat......ggaagtcagatatgttatgcataatctttgaag...agcaaattcaccatctccaaagaaaatgccaagaactcactgtatttgctaatgaacagtctgagagcagagggcacagctgtgtgttactgtatgtgagg +>IGHV3-30-42*01 
+gaggtggagctgatagagcccacagag...gacctgagacaacctgggaagttcctgagactctcctgtgtagcctctagattcgccttc............agtagcttctgaatgagcccagttcaccagtctgcaggcaaggggctggagtgagtaatagatataaaagatgat......ggaagtcagatacaccatgcagactctgtgaag...ggcagattctccatctccaaagacaatgctaagaactctctgtatctgcaaatgaacagtcagagaactgaggacatggctgtgtatggctgtacataaggtt +>IGHV3-30-5*01 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatcatatgat......ggaagtaataaatactatgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgaaaga +>IGHV3-30-5*02 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctggggggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcatttatacggtatgat......ggaagtaataaatactatgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgaaaga +>IGHV3-30-52*01 +gaggtacagctcgtggagtccggagag...gacccaagacaacctgggggatccctgagactctcctgtgcagactctggattaaccttc............agtagctactgaaggaactcggtttcccaggctccagggaaggggctggagtgagtagtagatatacagtgtgat......ggaagtcagatatgttatgcataatctttgaag...agcaaattcaccatctccaaagaaaatgccaagaactcactgtatttgctaatgaacagtctgagagcagcgggcacagctgtgtgttactgtatgtgagg +>IGHV3-32*01 +gaggtggagctgatagagtccatagag...gacctgagacaacctgggaagttcctgagactctcctgtgtagcctctagattcgccttc............agtagcttctgaatgagccgagttcaccagtctccaggcaaggggctggagtgagtaatagatataaaagatgat......ggaagtcagatacaccatgcagactctgtgaag...ggcagattctccatctccaaagacaatgctaagaactctctgtatctgcaaatgaacactcagagagctgaggacgtggccgtgtatggctatacataaggtc +>AIGHV3-33*01 
+caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatggtatgat......ggaagtaataaatactatgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-33*02 +caggtacagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatggtatgat......ggaagtaataaatactatgcagactccgcgaag...ggccgattcaccatctccagagacaattccacgaacacgctgtttctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-33*03 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatggtatgat......ggaagtaataaatactatgcagactccgtgaag...ggccgattcaccatctccagagacaactccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgaaaga +>IGHV3-33*04 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctagagtgggtggcagttatatggtatgac......ggaagtaataaatactatgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-33*05 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatcatatgat......ggaagtaataaatactatgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-33*06 
+caggtgcagctggtggagtctggggga...ggcgtggtccagcctgggaggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtggcagttatatggtatgat......ggaagtaataaatactatgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgaaaga +>IGHV3-33-2*01 +gaggtacagctcgtggagtccggagag...gacccaagacaacctgggggatccttgagactctcctgtgcagactctggattaaccttc............agtagctactgaatgagctcggtttcccaggctccagggaaggggctggagtgagtagtagatatacagtgtgat......ggaagtcagatatgttatgcccaatctgtgaag...agcaaattcaccatctccaaagaaaatgccaagaactcactgtatttgcaaatgaacagtctgagagcagagggcacagctgtgtgttactgtatgtgaggca +>IGHV3-35*01 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctgggggatccctgagactctcctgtgcagcctctggattcaccttc............agtaacagtgacatgaactgggtccatcaggctccaggaaaggggctggagtgggtatcgggtgttagttggaat......ggcagtaggacgcactatgcagactctgtgaag...ggccgattcatcatctccagagacaattccaggaacaccctgtatctgcaaacgaatagcctgagggccgaggacacggctgtgtattactgtgtgagaaa +>IGHV3-38*01| +gaggtgcagctggtggagtctggggga...ggcttggtacagcctagggggtccctgagactctcctgtgcagcctctggattcaccgtc............agtagcaatgagatgagctggatccgccaggctccagggaaggggctggagtgggtctcatccattagtggt............ggtagcacatactacgcagactccaggaag...ggcagattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgaacaacctgagagctgagggcacggccgcgtattactgtgccagatata +>IGHV3-38*02 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctagggggtccctgagactctcctgtgcagcctctggattcaccgtc............agtagcaatgagatgagctggatccgccaggctccagggaaggggctggagtgggtctcatccattagtggt............ggtagcacatactacgcagactccaggaag...ggcagattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgaacaacctgagagctgagggcacggccgtgtattactgtgccagatata +>IGHV3-38*03 
+gaggtgcagctggtggagtctggggga...ggcttggtacagcctagggggtccctgagactctcctgtgcagcctctggattcaccgtc............agtagcaatgagatgagctggatccgccaggctccagggaagggtctggagtgggtctcatccattagtggt............ggtagcacatactacgcagactccaggaag...ggcagattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgaacaacctgagagctgagggcacggccgtgtattactgtgccagatata +>IGHV3-38-3*01 +gaggtgcagctggtggagtctcgggga...gtcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccgtc............agtagcaatgagatgagctgggtccgccaggctccagggaagggtctggagtgggtctcatccattagtggt............ggtagcacatactacgcagactccaggaag...ggcagattcaccatctccagagacaattccaagaacacgctgcatcttcaaatgaacagcctgagagctgaggacacggctgtgtattactgtaagaaaga +>IGHV3-43*01 +gaagtgcagctggtggagtctggggga...gtcgtggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............gatgattataccatgcactgggtccgtcaagctccggggaagggtctggagtgggtctctcttattagttgggat......ggtggtagcacatactatgcagactctgtgaag...ggccgattcaccatctccagagacaacagcaaaaactccctgtatctgcaaatgaacagtctgagaactgaggacaccgccttgtattactgtgcaaaagata +>IGHV3-43*02 +gaagtgcagctggtggagtctggggga...ggcgtggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............gatgattatgccatgcactgggtccgtcaagctccagggaagggtctggagtgggtctctcttattagtggggat......ggtggtagcacatactatgcagactctgtgaag...ggccgattcaccatctccagagacaacagcaaaaactccctgtatctgcaaatgaacagtctgagaactgaggacaccgccttgtattactgtgcaaaagata +>IGHV3-43D*01 +gaagtgcagctggtggagtctggggga...gtcgtggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............gatgattatgccatgcactgggtccgtcaagctccggggaagggtctggagtgggtctctcttattagttgggat......ggtggtagcacctactatgcagactctgtgaag...ggtcgattcaccatctccagagacaacagcaaaaactccctgtatctgcaaatgaacagtctgagagctgaggacaccgccttgtattactgtgcaaaagata +>IGHV3-47*01 
+gaggatcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgcgaccctcctgtgcagcctctggattcgccttc............agtagctatgctctgcactgggttcgccgggctccagggaagggtctggagtgggtatcagctattggtactggt.........ggtgatacatactatgcagactccgtgatg...ggccgattcaccatctccagagacaacgccaagaagtccttgtatcttcatatgaacagcctgatagctgaggacatggctgtgtattattgtgcaaga +>IGHV3-47*02 +gaggatcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagaccctcctgtgcagcctctggattcgccttc............agtagctatgttctgcactgggttcgccgggctccagggaagggtccggagtgggtatcagctattggtactggt.........ggtgatacatactatgcagactccgtgatg...ggccgattcaccatctccagagacaacgccaagaagtccttgtatcttcaaatgaacagcctgatagctgaggacatggctgtgtattattgtgcaagaga +>IGHV3-48*01 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatagcatgaactgggtccgccaggctccagggaaggggctggagtgggtttcatacattagtagtagt......agtagtaccatatactacgcagactctgtgaag...ggccgattcaccatctccagagacaatgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-48*02 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatagcatgaactgggtccgccaggctccagggaaggggctggagtgggtttcatacattagtagtagt......agtagtaccatatactacgcagactctgtgaag...ggccgattcaccatctccagagacaatgccaagaactcactgtatctgcaaatgaacagcctgagagacgaggacacggctgtgtattactgtgcgagaga +>IGHV3-48*03 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtagttatgaaatgaactgggtccgccaggctccagggaaggggctggagtgggtttcatacattagtagtagt......ggtagtaccatatactacgcagactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtttattactgtgcgagaga +>IGHV3-48*04 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatagcatgaactgggtccgccaggctccagggaaggggctggagtgggtttcatacattagtagtagt......agtagtaccatatactacgcagactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga 
+>IGHV3-49*01 +gaggtgcagctggtggagtctggggga...ggcttggtacagccagggcggtccctgagactctcctgtacagcttctggattcaccttt............ggtgattatgctatgagctggttccgccaggctccagggaaggggctggagtgggtaggtttcattagaagcaaagcttatggtgggacaacagaatacaccgcgtctgtgaaa...ggcagattcaccatctcaagagatggttccaaaagcatcgcctatctgcaaatgaacagcctgaaaaccgaggacacagccgtgtattactgtactagaga +>IGHV3-49*02 +gaggtgcagctggtggagtctggggga...ggcttggtacagccagggccgtccctgagactctcctgtacagcttctggattcaccttt............gggtattatcctatgagctgggtccgccaggctccagggaaggggctggagtgggtaggtttcattagaagcaaagcttatggtgggacaacagaatacgccgcgtctgtgaaa...ggcagattcaccatctcaagagatgattccaaaagcatcgcctatctgcaaatgaacagcctgaaaaccgaggacacagccgtgtattactgtactagaga +>IGHV3-49*03 +gaggtgcagctggtggagtctggggga...ggcttggtacagccagggcggtccctgagactctcctgtacagcttctggattcaccttt............ggtgattatgctatgagctggttccgccaggctccagggaaggggctggagtgggtaggtttcattagaagcaaagcttatggtgggacaacagaatacgccgcgtctgtgaaa...ggcagattcaccatctcaagagatgattccaaaagcatcgcctatctgcaaatgaacagcctgaaaaccgaggacacagccgtgtattactgtactagaga +>IGHV3-49*04 +gaggtgcagctggtggagtctggggga...ggcttggtacagccagggcggtccctgagactctcctgtacagcttctggattcaccttt............ggtgattatgctatgagctgggtccgccaggctccagggaaggggctggagtgggtaggtttcattagaagcaaagcttatggtgggacaacagaatacgccgcgtctgtgaaa...ggcagattcaccatctcaagagatgattccaaaagcatcgcctatctgcaaatgaacagcctgaaaaccgaggacacagccgtgtattactgtactagaga +>IGHV3-49*05 +gaggtgcagctggtggagtctggggga...ggcttggtaaagccagggcggtccctgagactctcctgtacagcttctggattcaccttt............ggtgattatgctatgagctggttccgccaggctccagggaaggggctggagtgggtaggtttcattagaagcaaagcttatggtgggacaacagaatacgccgcgtctgtgaaa...ggcagattcaccatctcaagagatgattccaaaagcatcgcctatctgcaaatgaacagcctgaaaaccgaggacacagccgtgtattactgtactagaga +>IGHV3-52*01 
+gaggtgcagctggtggagtctgggtga...ggcttggtacagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctcctggatgcactgggtctgccaggctccggagaaggggctggagtgggtggccgacataaagtgtgac......ggaagtgagaaatactatgtagactctgtgaag...ggccgattgaccatctccagagacaatgccaagaactccctctatctgcaagtgaacagcctgagagctgaggacatgaccgtgtattactgtgtgagagg +>IGHV3-52*02 +gaggtgcagctggtggagtctgggtga...ggcttggtacagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctcctggatgcactgggtctgccaggctccggagaaggggcaggagtgggtggccgacataaagtgtgac......ggaagtgagaaatactatgtagactctgtgaag...ggccgattgaccatctccagagacaatgccaagaactccctctatctgcaagtgaacagcctgagagctgaggacatgaccgtgtattactgtgtgaga +>IGHV3-52*03 +gaggtgcagctggtcgagtctgggtga...ggcttggtacagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctcctggatgcactgggtctgccaggctccggagaaggggctggagtgggtggccgacataaagtgtgac......ggaagtgagaaatactatgtagactctgtgaag...ggccgattgaccatctccagagacaatgccaagaactccctctatctgcaagtgaacagcctgagagctgaggacatgaccgtgtattactgtgtgaga +>IGHV3-53*01 +gaggtgcagctggtggagtctggagga...ggcttgatccagcctggggggtccctgagactctcctgtgcagcctctgggttcaccgtc............agtagcaactacatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagttatttatagcggt.........ggtagcacatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgaacagcctgagagccgaggacacggccgtgtattactgtgcgagaga +>IGHV3-53*02 +gaggtgcagctggtggagactggagga...ggcttgatccagcctggggggtccctgagactctcctgtgcagcctctgggttcaccgtc............agtagcaactacatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagttatttatagcggt.........ggtagcacatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgaacagcctgagagccgaggacacggccgtgtattactgtgcgagaga +>IGHV3-53*03 +gaggtgcagctggtggagtctggagga...ggcttgatccagcctggggggtccctgagactctcctgtgcagcctctgggttcaccgtc............agtagcaactacatgagctgggtccgccagcctccagggaaggggctggagtgggtctcagttatttatagcggt.........ggtagcacatactacgcagactctgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgaacagcctgagagccgaggacacggccgtgtattactgtgctaggga 
+>IGHV3-53*04 +gaggtgcagctggtggagtctggagga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctgggttcaccgtc............agtagcaactacatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagttatttatagcggt.........ggtagcacatactacgcagactccgtgaag...ggccgattcaccatctccagacacaattccaagaacacgctgtatcttcaaatgaacagcctgagagctgaggacacggccgtgtattactgtgcgagaga +>IGHV3-54*01 +gaggtacagctggtggagtctgaagaa...aaccaaagacaacttgggggatccctgagactctcctgtgcagactctggattaaccttc............agtagctactgaatgagctcagattcccaagctccagggaaggggctggagtgagtagtagatatatagtaggat......agaagtcagctatgttatgcacaatctgtgaag...agcagattcaccatctccaaagaaaatgccaagaactcactctgtttgcaaatgaacagtctgagagcagagggcacggccgtgtattactgtatgtgagt +>IGHV3-54*02 +gaggtacagctggtggagtctgaagaa...aaccaaagacaacttgggggatccctgagactctcctgtgcagactctggattaaccttc............agtagctactgaatgagctcagattcccaggctccagggaaggggctggagtgagtagtagatatatagtacgat......agaagtcagatatgttatgcacaatctgtgaag...agcagattcaccatctccaaagaaaatgccaagaactcactccgtttgcaaatgaacagtctgagagcagagggcacggccgtgtattactgtatgtgagg +>IGHV3-54*04 +gaggtacagctggtggagtctgaagaa...aaccaaagacaacttgggggatccctgagactctcctgtgcagactctggattaaccttc............agtagctactgaatgagctcagattcccaggctccagggaaggggctggagtgagtagtagatatatagtaggat......agaagtcagctatgttatgcacaatctgtgaag...agcagattcaccatctccaaagaaaatgccaagaactcactctgtttgcaaatgaacagtctgagagcagagggcacggccgtgtattactgtatgtgagt +>IGHV3-62*01 +gaggtgcagctggtggagtctggggaa...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctctgctatgcactgggtccgccaggctccaagaaagggtttgtagtgggtctcagttattagtacaagt......ggtgataccgtactctacacagactctgtgaag...ggccgattcaccatctccagagacaatgcccagaattcactgtctctgcaaatgaacagcctgagagccgagggcacagttgtgtactactgtgtgaaaga +>IGHV3-63*01 
+gaggtggagctgatagagtccatagag...ggcctgagacaacttgggaagttcctgagactctcctgtgtagcctctggattcaccttc............agtagctactgaatgagctgggtcaatgagactctagggaaggggctggagggagtaatagatgtaaaatatgat......ggaagtcagatataccatgcagactctgtgaag...ggcagattcaccatctccaaagacaatgctaagaactcaccgtatctccaaacgaacagtctgagagctgaggacatgaccatgcatggctgtacataaggtt +>IGHV3-63*02 +gaggtggagctgatagagtccatagag...ggcctgagacaacttgggaagttcctgagactctcctgtgtagcctctggattcaccttc............agtagctactgaatgagctgggtcaatgagactctagggaaggggctggagggagtaatagatgtaaaatatgat......ggaagtcagatataccatgcagactctgtgaag...ggcagattcaccatctccaaagacaatgctaagaactcaccgtatctgcaaacgaacagtctgagagctgaggacatgaccatgcatggctgtacataa +>IGHV3-64*01 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccagggaagggactggaatatgtttcagctattagtagtaat......gggggtagcacatattatgcaaactctgtgaag...ggcagattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgggcagcctgagagctgaggacatggctgtgtattactgtgcgagaga +>IGHV3-64*02 +gaggtgcagctggtggagtctggggaa...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccagggaagggactggaatatgtttcagctattagtagtaat......gggggtagcacatattatgcagactctgtgaag...ggcagattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgggcagcctgagagctgaggacatggctgtgtattactgtgcgagaga +>IGHV3-64*03 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgttcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccagggaagggactggaatatgtttcagctattagtagtaat......gggggtagcacatactacgcagactcagtgaag...ggcagattcaccatctccagagacaattccaagaacacgctgtatgtccaaatgagcagtctgagagctgaggacacggctgtgtattactgtgtgaaaga +>IGHV3-64*04 
+caggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgttcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccagggaagggactggaatatgtttcagctattagtagtaat......gggggtagcacatactacgcagactcagtgaag...ggcagattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-64*05 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgttcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccagggaagggactggaatatgtttcagctattagtagtaat......gggggtagcacatactacgcagactcagtgaag...ggcagattcaccatctccagagacaattccaagaacacgctgtatgttcaaatgagcagtctgagagctgaggacacggctgtgtattactgtgtgaaaga +>IGHV3-64D*06 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgttcagcctctggattcaccttc............agtagctatgctatgcactgggtccgccaggctccagggaagggactggaatatgtttcagctattagtagtaat......gggggtagcacatactacgcagactccgtgaag...ggcagattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgagcagtctgagagctgaggacacggctgtgtattactgtgtgaaaga +>IGHV3-66*01 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggattcaccgtc............agtagcaactacatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagttatttatagcggt.........ggtagcacatactacgcagactccgtgaag...ggcagattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-66*02 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggattcaccgtc............agtagcaactacatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagttatttatagcggt.........ggtagcacatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgaga +>IGHV3-66*03 
+gaggtgcagctggtggagtctggagga...ggcttgatccagcctggggggtccctgagactctcctgtgcagcctctgggttcaccgtc............agtagcaactacatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagttatttatagctgt.........ggtagcacatactacgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgagaga +>IGHV3-66*04 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggattcaccgtc............agtagcaactacatgagctgggtccgccaggctccagggaaggggctggagtgggtctcagttatttatagcggt.........ggtagcacatactacgcagactccgtgaag...ggcagattcaccatctccagagacaattccaagaacacgctgtatcttcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaca +>IGHV3-69-1*01 +gaggtgcagctggtggagtctggggga...ggcttggtaaagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtgactactacatgaactgggtccgccaggctccagggaaggggctggagtgggtctcatccattagtagtagt.........agtaccatatactacgcagactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-69-1*02 +gaggtgcagctggtggagtctggggga...ggcttggtaaagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtgactactacatgaactgggtccgccaggctccagggaaggggctggagtgggtctcatccattagtagtagt.........agtaccatatactacgcagactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtttattactgtgcgagaga +>IGHV3-7*01 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............agtagctattggatgagctgggtccgccaggctccagggaaggggctggagtgggtggccaacataaagcaagat......ggaagtgagaaatactatgtggactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-7*02 
+gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............agtagctattggatgagctgggtccgccaggctccagggaaagggctggagtgggtggccaacataaagcaagat......ggaagtgagaaatactatgtggactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgaga +>IGHV3-7*03 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggattcaccttt............agtagctattggatgagctgggtccgccaggctccagggaaggggctggagtgggtggccaacataaagcaagat......ggaagtgagaaatactatgtggactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggccgtgtattactgtgcgagaga +>IGHV3-71*01 +gaggtgcagctggtggagtccggggga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtgactactacatgagctgggtccgccaggctcccgggaaggggctggagtgggtaggtttcattagaaacaaagctaatggtgggacaacagaatagaccacgtctgtgaaa...ggcagattcacaatctcaagagatgattccaaaagcatcacctatctgcaaatgaacagcctgagagccgaggacacggccgtgtattactgtgcgagaga +>IGHV3-71*02 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtgactactacatgagctgggtccgccaggctcccgggaaggggctggagtgggtaggtttcattagaaacaaagctaatggtgggacaacagaatagaccacgtctgtgaaa...ggcagattcacaatctcaagagatgattccaaaagcatcacctatctgcaaatgaacagcctgagagccgaggacatggctgtgtattactgtgcgagaga +>IGHV3-71*03 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctggtttcaccttc............agtgactactacatgagctgggtccgccaggctcccgggaaggggctggagtgggtaggtttcattagaaacaaagctaatggtgggacaacagaatagaccacgtctgtgaaa...ggcagattcacaatctcaagagatgattccaaaagcatcacctatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgcgagaga +>IGHV3-72*01 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtgaccactacatggactgggtccgccaggctccagggaaggggctggagtgggttggccgtactagaaacaaagctaacagttacaccacagaatacgccgcgtctgtgaaa...ggcagattcaccatctcaagagatgattcaaagaactcactgtatctgcaaatgaacagcctgaaaaccgaggacacggccgtgtattactgtgctagaga 
+>IGHV3-72*02 +....................................................................................accttc............agtgaccactacatggactgggtccgccaggctccagggaaggggctggagtgggttggccgtactagaaacaaagctaacagctacaccacagaatacgccgcgtctgtgaaa...ggcagattcaccatctcaagagatgattcaaagaactcactgtat +>IGHV3-73*01 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgaaactctcctgtgcagcctctgggttcaccttc............agtggctctgctatgcactgggtccgccaggcttccgggaaagggctggagtgggttggccgtattagaagcaaagctaacagttacgcgacagcatatgctgcgtcggtgaaa...ggcaggttcaccatctccagagatgattcaaagaacacggcgtatctgcaaatgaacagcctgaaaaccgaggacacggccgtgtattactgtactagaca +>IGHV3-73*02 +gaggtgcagctggtggagtccggggga...ggcttggtccagcctggggggtccctgaaactctcctgtgcagcctctgggttcaccttc............agtggctctgctatgcactgggtccgccaggcttccgggaaagggctggagtgggttggccgtattagaagcaaagctaacagttacgcgacagcatatgctgcgtcggtgaaa...ggcaggttcaccatctccagagatgattcaaagaacacggcgtatctgcaaatgaacagcctgaaaaccgaggacacggccgtgtattactgtactagaca +>IGHV3-74*01 +gaggtgcagctggtggagtccggggga...ggcttagttcagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctactggatgcactgggtccgccaagctccagggaaggggctggtgtgggtctcacgtattaatagtgat......gggagtagcacaagctacgcggactccgtgaag...ggccgattcaccatctccagagacaacgccaagaacacgctgtatctgcaaatgaacagtctgagagccgaggacacggctgtgtattactgtgcaagaga +>IGHV3-74*02 +gaggtgcagctggtggagtctggggga...ggcttagttcagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctactggatgcactgggtccgccaagctccagggaaggggctggtgtgggtctcacgtattaatagtgat......gggagtagcacaagctacgcggactccgtgaag...ggccgattcaccatctccagagacaacgccaagaacacgctgtatctgcaaatgaacagtctgagagccgaggacacggctgtgtattactgtgcaaga +>IGHV3-74*03 +gaggtgcagctggtggagtccggggga...ggcttagttcagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctactggatgcactgggtccgccaagctccagggaaggggctggtgtgggtctcacgtattaatagtgat......gggagtagcacaacgtacgcggactccgtgaag...ggccgattcaccatctccagagacaacgccaagaacacgctgtatctgcaaatgaacagtctgagagccgaggacacggctgtgtattactgtgcaagaga +>IGHV3-9*01 
+gaagtgcagctggtggagtctggggga...ggcttggtacagcctggcaggtccctgagactctcctgtgcagcctctggattcaccttt............gatgattatgccatgcactgggtccggcaagctccagggaagggcctggagtgggtctcaggtattagttggaat......agtggtagcataggctatgcggactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactccctgtatctgcaaatgaacagtctgagagctgaggacacggccttgtattactgtgcaaaagata +>IGHV3-9*02 +gaagtgcagctggtggagtctggggga...ggcttggtacagcctggcaggtccctgagactctcctgtgcagcctctggattcacctct............gatgattatgccatgcactgggtccggcaagctccagggaagggcctggagtgggtctcaggtattagttggaat......agtggtagcataggctatgcggactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactccctgtatctgcaaatgaacagtctgagagctgaggacacggccttgtattactgtgcaaaagata +>IGHV3-9*03 +gaagtgcagctggtggagtctggggga...ggcttggtacagcctggcaggtccctgagactctcctgtgcagcctctggattcaccttt............gatgattatgccatgcactgggtccggcaagctccagggaagggcctggagtgggtctcaggtattagttggaat......agtggtagcataggctatgcggactctgtgaag...ggccgattcaccatctccagagacaacgccaagaactccctgtatctgcaaatgaacagtctgagagctgaggacatggccttgtattactgtgcaaaagata +>IGHV3-NL1*01 +caggtgcagctggtggagtctggggga...ggcgtggtccagcctggggggtccctgagactctcctgtgcagcgtctggattcaccttc............agtagctatggcatgcactgggtccgccaggctccaggcaaggggctggagtgggtctcagttatttatagcggt......ggtagtagcacatactatgcagactccgtgaag...ggccgattcaccatctccagagacaattccaagaacacgctgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgcgaaaga +>IGHV3/OR15-7*01 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctgggggttctctgagactctcatgtgcagcctctggattcaccttc............agtgaccactacatgagctgggtccgccaggctcaagggaaagggctagagttggtaggtttaataagaaacaaagctaacagttacacgacagaatatgctgcgtctgtgaaa...ggcagacttaccatctcaagagaggattcaaagaacacgatgtatctgcaaatgagcaacctgaaaaccgaggacttggccgtgtattactgtgctaga +>IGHV3/OR15-7*02 
+gaggtgcagctgttggagtctggggga...ggcttggtccagcctgggggttctctgagactctcatgtgctgcctctggattcaccttc............agtgaccactacatgagctgggtccgccaggctcaagggaaagggctagagttggtaggtttaataagaaacaaagctaacagttacacgacagaatatgctgcgtctgtgaaa...ggcagacttaccatctcaagagaggattcaaagaacacgctgtatctgcaaatgagcagcctgaaaaccgaggacttggccgtgtattactgtgctaga +>IGHV3/OR15-7*03 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctgggggttctctgagactctcatgtgcagcctctggattcaccttc............agtgaccactacatgagctgggtccgccaggctcaagggaaagggctagagttggtaggtttaataagaaacaaagctaacagttacacgacagaatatgctgcgtctgtgaaa...ggcagacttaccatctcaagagaggattcaaagaacacgctgtatctgcaaatgagcagcctgaaaaccgaggacttggccgtgtattactgtgctaga +>IGHV3/OR15-7*05 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctgggggttctctgagactctcatgtgcagcctctggattcaccttc............agtgaccactacatgagctgggtccgccaggctcaagggaaagggctagagttggtaggtttaataagaaacaaagctaacagttacacgacagaatatgctgcgtctgtgaaa...ggcagacttaccatctcaagagaggattcaaagaacacgctgtatctgcaaatgagcaacctgaaaaccgaggacttggccgtgtattactgtgctagaga +>IGHV3/OR16-10*01 +gaggttcagctggtgcagtctggggga...ggcttggtacatcctggggggtccctgagactctcctgtgcaggctctggattcaccttc............agtagctatgctatgcactgggttcgccaggctccaggaaaaggtctggagtgggtatcagctattggtactggt.........ggtggcacatactatgcagactccgtgaag...ggccgattcaccatctccagagacaatgccaagaactccttgtatcttcaaatgaacagcctgagagccgaggacatggctgtgtattactgtgcaaga +>IGHV3/OR16-10*02 +gaggttcagctggtgcagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcaggctctggattcaccttc............agtagctatgctatgcactgggttcgccaggctccaggaaaaggtctggagtgggtatcagctattggtactggt.........ggtggcacatactatgcagactccgtgaag...ggccgattcaccatctccagagacaatgccaagaactccttgtatcttcaaatgaacagcctgagagccgaggacatggctgtgtattactgtgcaaga +>IGHV3/OR16-10*03 
+gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactctcctgtgcaggctctggattcaccttc............agtagctatgctatgcactgggttcgccaggctccaggaaaaggtctggagtgggtatcagctattggtactggt.........ggtggcacatactatgcagactccgtgaag...ggccgattcaccatctccagagacaatgccaagaactccttgtatcttcaaatgaacagcctgagagccgaggacatggctgtgtattactgtgcaagaga +>IGHV3/OR16-12*01 +gaggtgcagctggtagagtctgggaga...ggcttggcccagcctggggggtacctaaaactctccggtgcagcctctggattcaccgtc............ggtagctggtacatgagctggatccaccaggctccagggaagggtctggagtgggtctcatacattagtagtagt......ggttgtagcacaaactacgcagactctgtgaag...ggcagattcaccatctccacagacaactcaaagaacacgctctacctgcaaatgaacagcctgagagtggaggacacggccgtgtattactgtgcaaga +>IGHV3/OR16-13*01 +gaggtgcagctggtggagtctggggga...ggcttagtacagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctactggatgcactgggtccgccaagctccagggaaggggctggtgtgggtctcacgtattaatagtgat......gggagtagcacaagctacgcagactccatgaag...ggccaattcaccatctccagagacaatgctaagaacacgctgtatctgcaaatgaacagtctgagagctgaggacatggctgtgtattactgtactaga +>IGHV3/OR16-14*01 +gaggtgcagctggaggagtctggggga...ggcttagtacagcctggagggtccctgagactctcctgtgcagcctctggattcaccttc............agtagctactggatgcactgggtccgccaatctccagggaaggggctggtgtgagtctcacgtattaatagtgat......gggagtagcacaagctacgcagactccttgaag...ggccaattcaccatctccagagacaatgctaagaacacgctgtatctgcaaatgaacagtctgagagctgaggacatggctgtgtattactgtactaga +>IGHV3/OR16-15*01 +gaagtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagactctcctgtgcagcctctgtattcaccttc............agtaacagtgacataaactgggtcctctaggctccaggaaaggggctggagtgggtctcgggtattagttggaat......ggcggtaagacgcactatgtggactccgtgaag...ggccaattttccatctccagagacaattccagcaagtccctgtatctgcaaaagaacagacagagagccaaggacatggccgtgtattactgtgtgagaaa +>IGHV3/OR16-15*02 
+gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagacactcctgtgcagcctctggattcaccttc............agtaacagtgacatgaactgggtcctctaggctccaggaaaggggctggagtgggtctcgggtattagttggaat......ggcggtaagacgcactatgtggactccgtgaag...ggccaatttaccatctccagagacaattccagcaagtccctgtatctgcaaaagaacagacagagagccaaagacatggccgtgtattactgtgtgaga +>IGHV3/OR16-16*01 +gaggtgcagctggtggagtctggggga...ggcttggtccagcctggggggtccctgagacactcctgtgcagcctctggattcaccttc............agtaacagtgacatgaactgggtcctctaggctccaggaaaggggctggagtgggtctcggatattagttggaat......ggcggtaagacgcactatgtggactccgtgaag...ggccaatttaccatctccagagacaattccagcaagtccctgtatctgcaaaagaacagacagagagccaaggacatggccgtgtattactgtgtgaga +>IGHV3/OR16-6*02 +gaggtgcagctggtggagtctgcggga...ggccttggtacagcctgggggtcccttagactctcctgtgcagcctctggattcacttgc............agtaacgcctggatgagctgggtccgccaggctccagggaaggggctggagtgggttggctgtattaaaagcaaagctaatggtgggacaacagactacgctgcacctgtgaaa...ggcagattcaccatctcaagagatgattcaaaaaacacgctgtatctgcaaatgatcagcctgaaaaccgaggacacggccgtgtattactgtaccacagg +>IGHV3/OR16-8*01 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactgtcctgtccagcctctggattcaccttc............agtaaccactacatgagctgggtccgccaggctccagggaagggactggagtgggtttcatacattagtggtgat......agtggttacacaaactacgcagactctgtgaag...ggccgattcaccatctccagggacaacgccaataactcaccgtatctgcaaatgaacagcctgagagctgaggacacggctgtgtattactgtgtgaaa +>IGHV3/OR16-8*02 +gaggtgcagctggtggagtctggggga...ggcttggtacagcctggggggtccctgagactgtcctgtccagactctggattcaccttc............agtaaccactacatgagctgggtccgccaggctccagggaagggactggagtggatttcatacattagtggtgat......agtggttacacaaactacgcagactctgtgaag...ggccgattcaccatctccagggacaacgccaataactcaccgtatctgcaaatgaacagcttgagagctgaggacacggctgtgtattactgtgtgaaaca +>IGHV3/OR16-9*01 
+gaggtgcagctggtggagtctggagga...ggcttggtacagcctggggggtccctgagactctcctgtgcagcctctggattcaccttc............agtaaccactacacgagctgggtccgccaggctccagggaagggactggagtgggtttcatacagtagtggtaat......agtggttacacaaactacgcagactctgtgaaa...ggccgattcaccatctccagggacaacgccaagaactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgtgaaa +>IGHV4-28*01 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggacaccctgtccctcacctgcgctgtctctggttactccatcagc.........agtagtaactggtggggctggatccggcagcccccagggaagggactggagtggattgggtacatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgtggacacggccgtgtattactgtgcgagaaa +>IGHV4-28*02 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcgctgtctctggttactccatcagc.........agtagtaactggtggggctggatccggcagcccccagggaagggactggagtggattgggtacatctattatagt.........gggagcatctactacaacccgtccctcaag...agtcgagtcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgtggacacggccgtgtattactgtgcgagaaa +>IGHV4-28*03 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggacaccctgtccctcacctgcgctgtctctggttactccatcagc.........agtagtaactggtggggctggatccggcagcccccagggaagggactggagtggattgggtacatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgtggacacggccgtgtattactgtgcgagaga +>IGHV4-28*04 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggacaccctgtccctcacctgcgctgtctctggttactccatcagc.........agtagtaactggtggggctggatccggcagcccccagggaagggactggagtggattgggtacatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgtggacaccggcgtgtattactgtgcgaga +>IGHV4-28*05 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggacaccctgtccctcacctgcgctgtctctggttactccatcagc.........agtagtaactggtggggctggatccggcagcccccagggaagggactggagtggattgggtacatctattatagt.........gggagcatctactacaacccgtccctcaag...agtcgagtcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgtggacacggccgtgtattactgtgcgagaaa 
+>IGHV4-28*06 +caggtgcagctacaggagtcgggccca...ggactggtgaagccttcggacaccctgtccctcacctgcgctgtctctggttactccatcagc.........agtagtaactggtggggctggatccggcagcccccagggaagggactggagtggattgggtacatctattatagt.........gggagcaccaactacaacccgtccctcaag...agtcgagtcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccttggacacggccgtgtattactgtgcgagaaa +>IGHV4-28*07 +caggtacagctgcaggagtcgggccca...ggactggtgaagccttcggacaccctgtccctcacctgcgctgtctctggttactccatcagc.........agtagtaactggtggggctggatccggcagcccccagggaagggactggagtggattgggtacatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgtggacacggccgtgtattactgtgcgagaaa +>IGHV4-30-2*01 +cagctgcagctgcaggagtccggctca...ggactggtgaagccttcacagaccctgtccctcacctgcgctgtctctggtggctccatcagc......agtggtggttactcctggagctggatccggcagccaccagggaagggcctggagtggattgggtacatctatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatcagtagacaggtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgccagaga +>IGHV4-30-2*02 +cagctgcagctgcaggagtccggctca...ggactggtgaagccttcacagaccctgtccctcacctgcgctgtctctggtggctccatcagc......agtggtggttactcctggagctggatccggcagccaccagggaagggcctggagtggattgggtacatctatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatcagtagacaggtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcg +>IGHV4-30-2*03 +cagctgcagctgcaggagtccggctca...ggactggtgaagccttcacagaccctgtccctcacctgcgctgtctctggtggctccatcagc......agtggtggttactcctggagctggatccggcagccaccagggaagggcctggagtggattgggagtatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatccgtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcagacacggctgtgtattactgtgcgagaca +>IGHV4-30-2*04 
+...........................................................................tctggtggctccatcagc......agtggtggttactcctggagctggatccggcagccaccagggaagggcctggagtggattgggtacatctatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcagacacggccgtgtattactgtgcgagaga +>IGHV4-30-2*05 +cagctgcagctgcaggagtccggctca...ggactggtgaagccttcacagaccctgtccctcacctgcgctgtctctggtggctccatcagc......agtggtggttactcctggagctggatccggcagccaccagggaagggcctggagtggattgggtacatctatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgactgccgcagacacggccgtgtattactgtgccagaga +>IGHV4-30-2*06 +cagctgcagctgcaggagtccggctca...ggactggtgaagccttcacagaccctgtccctcacctgcgctgtctctggtggctccatcagc......agtggtggttactcctggagctggatccggcagtcaccagggaagggcctggagtggattgggtacatctatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatcagtagacaggtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgccagaga +>IGHV4-30-4*01 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtgattactactggagttggatccgccagcccccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgactgccgcagacacggccgtgtattactgtgccagaga +>IGHV4-30-4*02 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggacaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtgattactactggagttggatccgccagcccccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgactgcagcagacacggccgtgtattactgtgccagaga +>IGHV4-30-4*03 
+caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtgattactactggagttggatccgccagcccccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactg +>XIGHV4-30-4*04 +caggtgcagctgcaggactcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtgattactactggagttggatccgccagcccccagggaagggcctggagtggattgggtacttctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgactgccgcagacacggccgtgtattactg +>IGHV4-30-4*05 +..........................................................................ctctggtggctccatcagc......agtggtgattactactggagttggatccgccagcncccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgactgccgcagacacggccgtgtattactgtgccagaga +>IGHV4-30-4*06 +...........................................................................tctggtggctccatcagc......agtggtgattactactggagttggatccgccagcacccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgactgccgcagacacggccgtgtattactgtgccagaga +>IGHV4-30-4*07 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcgctgtctctggtggctccatcagc......agtggtggttactcctggagctggatccggcagccaccagggaagggactggagtggattgggtatatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgccagaga +>IGHV4-31*01 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtggttactactggagctggatccgccagcacccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtctagttaccatatcagtagacacgtctaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactgtgcgagaga 
+>IGHV4-31*02 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgtactgtctctggtggctccatcagc......agtggtggttactactggagctggatccgccagcacccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtctaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactgtgcgagaga +>IGHV4-31*03 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtggttactactggagctggatccgccagcacccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtctaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactgtgcgagaga +>IGHV4-31*04 +caggtgcggctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtggttactactggagctggatccgccagcacccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtctaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactgtgcg +>IGHV4-31*05 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtggttactactggagctggatccgccagcacccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtctaagaaccagttctccctgaagctgagctctgtgacc...gcggacgcggccgtgtattactgtgcg +>IGHV4-31*06 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtagttactactggagctggatccgccagcacccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtctaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactg +>IGHV4-31*07 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggatccatcagc......agtggtggttactactggagctggatccgccagcacccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtctaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactg 
+>IGHV4-31*08 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtggttactactggagctggatccgccagcacccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatccgtagacacgtccaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactg +>IGHV4-31*09 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtggttactactggagctggatccgccagcacccagggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacaagtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactg +>IGHV4-31*10 +caggtgcagctgcaggagtcgggccca...ggactgttgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtggttactactggagctggatccgccagcacccagggaagggcctggagtggattgggtgcatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacccgtccaagaaccagttctccctgaagccgagctctgtgactgccgcggacacggccgtggattactgtgcgagaga +>IGHV4-34*01 +caggtgcagctacagcagtggggcgca...ggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttc............agtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagt.........ggaagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg +>IGHV4-34*02 +caggtgcagctacaacagtggggcgca...ggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttc............agtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagt.........ggaagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg +>IGHV4-34*03 +caggtgcagctacagcagtggggcgca...ggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttc............agtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagt.........ggaagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactg 
+>IGHV4-34*04 +caggtgcagctacagcagtggggcgca...ggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttc............agtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagt.........ggaagcaccaacaacaacccgtccctcaag...agtcgagccaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg +>IGHV4-34*05 +caggtgcagctacagcagtggggcgca...ggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttc............agtggttactactggtgctggatccgccagcccctagggaaggggctggagtggattggggaaatcaatcatagt.........ggaagcaccaacaacaacccgtccctcaag...agtcgagccaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg +>IGHV4-34*06 +caggtgcagctacagcagtggggcgca...ggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttc............agtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagt.........ggaagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgggctctgtgaccgccgcggacacggccgtgtattactg +>IGHV4-34*07 +caggtgcagctacagcagtggggcgca...ggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttc............agtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaaccatagt.........ggaagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactg +>IGHV4-34*08 +caggtgcagctacagcagtggggcgca...ggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggaccttc............agtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagt.........ggaagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcg +>IGHV4-34*09 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcgctgtctatggtgggtccttc............agtggttactactggagctggatccgccagcccccagggaagggactggagtggattggggaaatcaatcatagt.........ggaagcaccaactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtctaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactgtgcgagaga 
+>IGHV4-34*10 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttc............agtggttactactggagctggatccgccagcccccagggaagggactggagtggattggggaaatcaatcatagt.........ggaagcaccaactacaacccgtccctcaag...agtcgaatcaccatgtcagtagacacgtccaagaaccagttctacctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagata +>IGHV4-34*11 +caggtgcagctacagcagtggggcgca...ggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccgtc............agtggttactactggagctggatccggcagcccccagggaaggggctggagtggattgggtatatctattatagt.........gggagcaccaacaacaacccctccctcaag...agtcgagccaccatatcagtagacacgtccaagaaccagttctccctgaacctgagctctgtgaccgccgcggacacggccgtgtattgctgtgcgagaga +>IGHV4-34*12 +caggtgcagctacagcagtggggcgca...ggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttc............agtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcattcatagt.........ggaagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgaga +>IGHV4-34*13 +...........................................................................tatggtgggtccttc............agtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagt.........ggaagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg +>IGHV4-38-2*01 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcgctgtctctggttactccatcagc.........agtggttactactggggctggatccggcagcccccagggaaggggctggagtggattgggagtatctatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcagacacggccgtgtattactgtgcgaga +>IGHV4-38-2*02 
+caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggttactccatcagc.........agtggttactactggggctggatccggcagcccccagggaaggggctggagtggattgggagtatctatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcagacacggccgtgtattactgtgcgagaga +>IGHV4-39*01 +cagctgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtagtagttactactggggctggatccgccagcccccagggaaggggctggagtggattgggagtatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatccgtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcagacacggctgtgtattactgtgcgagaca +>IGHV4-39*02 +cagctgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtagtagttactactggggctggatccgccagcccccagggaaggggctggagtggattgggagtatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatccgtagacacgtccaagaaccacttctccctgaagctgagctctgtgaccgccgcagacacggctgtgtattactgtgcgagaga +>IGHV4-39*03 +cagctgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtagtagttactactggggctggatccgccagcccccagggaaggggctggagtggattgggagtatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatccgtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcagacacggccgtgtattactg +>IGHV4-39*04 +..................................................................................gctccatcagc......agtagtagttactactggggctggatccgccagcccccagggaaggggctggagtggattgggagtatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatccgtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacac +>IGHV4-39*05 +cagctgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccccgtccctcacctgcactgtctctggtggctccatcagc......agtagtagttactactggggctggatccgccagcccccagggaaggggctggagtggattgggagtatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatccgtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcagacacggctgtgtattactgtgcg +>IGHV4-39*06 
+cggctgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtagtagttactactggggctggatccgccagcccccagggaaggggctggagtggattgggagtatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttccccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagaga +>IGHV4-39*07 +cagctgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtagtagttactactggggctggatccgccagcccccagggaaggggctggagtggattgggagtatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagaga +>IGHV4-4*01 +caggtgcagctgcaggagtcgggccca...ggactggtgaagcctccggggaccctgtccctcacctgcgctgtctctggtggctccatcagc.........agtagtaactggtggagttgggtccgccagcccccagggaaggggctggagtggattggggaaatctatcatagt.........gggagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacaagtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattgctgtgcgagaga +>IGHV4-4*02 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggggaccctgtccctcacctgcgctgtctctggtggctccatcagc.........agtagtaactggtggagttgggtccgccagcccccagggaaggggctggagtggattggggaaatctatcatagt.........gggagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacaagtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagaga +>IGHV4-4*03 +caggtgcagctgcaggagtcgggccca...ggactggtgaagcctccggggaccctgtccctcacctgcgctgtctctggtggctccatcagc.........agtagtaactggtggagttgggtccgccagcccccagggaaggggctggagtggattggggaaatctatcatagt.........gggagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacaagtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactg +>IGHV4-4*04 +caggtgcagctgcaggagtcgggccca...ggactggtgaagcctccggggaccctgtccctcacctgcgctatctctggtggctccatcagc.........agtagtaactggtggagttgggtccgccagcccccagggaaggggctggagtggattggggaaatctatcatagt.........gggagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacaagtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactg +>IGHV4-4*05 
+caggtgcagctgcaggagttgggccca...ggactggtgaagcctccggggaccctgtccctcacctgcgctgtctctggtggctccatcagc.........agtagtaactggtggagttgggtccgccagcccccagggaaggggctggagtggattggggaaatctatcatagt.........gggagcaccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacaagtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactg +>IGHV4-4*06 +............................................................ +...............tctggtggctccatcagc.........agtagtaactggtggagttgggtccgccagcccccagggannnggctggagtggattggggaaatctatcatagt.........gggagcaccaactacaacccgtccctcaag...agtcgagtcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagaga +>IGHV4-4*07 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatc............agtagttactactggagctggatccggcagcccgccgggaagggactggagtggattgggcgtatctataccagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagaga +>IGHV4-4*08 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatc............agtagttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctataccagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatccgtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcagacacggccgtgtattactgtgcgagaga +>IGHV4-55*01 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcatctgcgctgtctctggtgactccatcagc.........agtggtaactggtgaatctgggtccgccagcccccagggaaggggctggagtggattggggaaatccatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgaatcaccatgtccgtagacacgtccaagaaccagttctacctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagata +>IGHV4-55*02 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcatctgcgctgtctctggtgactccatcagc.........agtggtaactggtgaatctgggtccgccagcccccagggaaggggctggagtggattggggaaatccatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgaatcaccatgtcagtagacacgtccaagaaccagttctacctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagata 
+>IGHV4-55*03 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcatctgcgctgtctctggtgactccatcagc.........agtggtaactggtgaatctgggtccgccagcccccagggaaggggctggagtggattggggaaatccatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgaatcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactg +>IGHV4-55*04 +caggtgcagctgcaggagtcgggccca...ggactggtgaagctttcggagaccctgtccctcatctgcgctgtctctggtgactccatcagc.........agtggtaactggtgaatctgggtccgccagcccccagggaaggggctggagtggattggggaaatccatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgaatcaccatgtcagtagacacgtccaagaaccagttctacctgaagctgagctctgtgaccgccgcggacacggccgtgtattactg +>IGHV4-55*05 +caggtgcagctgcaggagtcgggccca...ggactggtgaagctttcggagaccctgtccctcatctgcgctgtctctggtgactccatcagc.........agtggtaactggtgaatctgggtccgccagcccccagggaaggggctggagtggattggggaaatccatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgaatcaccatgtccgtagacacgtccaagaaccagttctacctgaagctgagctctgtgaccgccgcggacacggccgtgtattactg +>IGHV4-55*06 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcatctgcgctgtctctggtgactccatcagc.........agtggtaactggtgaatctgggtccgccagcccccagggaaggggctggagtggattggggaaatccatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgaatcaccatgtccgtagacacgtccaagaagcagttctacctgaagctgagctctgtgaccgctgcggacacggccgtgtattactg +>IGHV4-55*07 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcatctgcgctgtctctggtgactccatcagc.........agtggtaactggtgaatctgggtccgccagcccccagggaaggggctggagtggattggggaaatccatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgaatcaccatgtccgtagacacgtccaggaaccagttctccctgaagctgagctctgtgaccgccgcagacacggccgtgtattactg +>IGHV4-55*08 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcatctgcgctgtctctggtgactccatcagc.........agtggtaactggtgaatctgggtccgccagcccccagggaaggggctggagtggattggggaaatccatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgaatcaccatgtcagtagacacgtccaagaaccagttctacctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagaga +>IGHV4-55*09 
+caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcatctgcgctgtctctggtgactccatcagc.........agtggtaactggtgaatctgggtccgccagcccccagggaaggggctggagtggattggggaaatccatcatagt.........gggagcacctactacaacccgtccctcaag...agtcgaatcaccatgtccgtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgtggacacggccgtgtattactgtgcgagaaa +>IGHV4-59*01 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatc............agtagttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcgagaga +>IGHV4-59*02 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccgtc............agtagttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcgagaga +>IGHV4-59*03 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatc............agtagttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccaattctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcg +>IGHV4-59*04 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatc............agtagttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcagacacggctgtgtattactgtgcg +>IGHV4-59*05 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatc............agtagttactactggagctggatccggcagccgccggggaagggactggagtggattgggcgtatctattatagt.........gggagcacctactacaacccgtccctcaag...agtcgagtcaccatatccgtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcagacacggctgtgtattactgtgcg 
+>IGHV4-59*06 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtcactggtggctccatc............agtagttactactggagctggatccggcagcccgctgggaagggcctggagtggattgggtacatctattacagt.........gggagcacctactacaacccgtccctcaag...agtcgagttaccatatcagtagacacgtctaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactgtgcg +>IGHV4-59*07 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggacaccctgtccctcacctgcactgtctctggtggctccatc............agtagttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcgaga +>IGHV4-59*08 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatc............agtagttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcagacacggccgtgtattactgtgcgagaca +>IGHV4-59*09 +...........................................................................tctggtggctccatc............agtagttactactggagctggatccggcagcccccaggnannngactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcgagagg +>IGHV4-59*10 +caggtgcagctacagcagtggggcgca...ggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtggctccatc............agtagttactactggagctggatccggcagcccgccgggaaggggctggagtggattgggcgtatctataccagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatgtcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagata +>IGHV4-61*01 
+caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccgtcagc......agtggtagttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcgagaga +>IGHV4-61*02 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcacagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtggtagttactactggagctggatccggcagcccgccgggaagggactggagtggattgggcgtatctataccagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcagacacggccgtgtattactgtgcgagaga +>IGHV4-61*03 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccgtcagc......agtggtagttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccacttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcgagaga +>IGHV4-61*04 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccgtcagc......agtggtagttactactggagctggatccggcagcccccagggaagggactggagtggattggatatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgct...gacacggccgtgtattactg +>IGHV4-61*05 +cagctgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccatcagc......agtagtagttactactggggctggatccggcagcccccagggaagggactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacaagtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgaga +>IGHV4-61*06 +...........................................................................tctggtggctccgtcagc......agtggtagttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgccagaga 
+>IGHV4-61*07 +...........................................................................tctggtggctccgtcagc......agtggtagttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcgagaca +>IGHV4-61*08 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcactgtctctggtggctccgtcagc......agtggtggttactactggagctggatccggcagcccccagggaagggactggagtggattgggtatatctattacagt.........gggagcaccaactacaacccctccctcaag...agtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcgagaga +>IGHV4/OR15-8*01 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcgttgtctctggtggctccatcagc.........agtagtaactggtggagctgggtccgccagcccccagggaaggggctggagtggattggggaaatctatcatagt.........gggagccccaactacaacccgtccctcaag...agtcgagtcaccatatcagtagacaagtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagaga +>IGHV4/OR15-8*02 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcgttgtctctggtggctccatcagc.........agtagtaactggtggagctgggtccgccagcccccagggaaggggctggagtggattggggaaatctatcatagt.........gggaaccccaactacaacccgtccctcaag...agtcgagtcaccatatcaatagacaagtccaagaaccaattctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagaga +>IGHV4/OR15-8*03 +caggtgcagctgcaggagtcgggccca...ggactggtgaagccttcggagaccctgtccctcacctgcgttgtctctggtggctccatcagc.........agtagtaactggtggagctgggtccgccagcccccagggaaggggctggagtggattggggaaatctatcatagt.........gggagccccaactacaacccatccctcaag...agtcgagtcaccatatcagtagacaagtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggccgtgtattactgtgcgagaga +>IGHV5-10-1*01 
+gaagtgcagctggtgcagtctggagca...gaggtgaaaaagcccggggagtctctgaggatctcctgtaagggttctggatacagcttt............accagctactggatcagctgggtgcgccagatgcccgggaaaggcctggagtggatggggaggattgatcctagt......gactcttataccaactacagcccgtccttccaa...ggccacgtcaccatctcagctgacaagtccatcagcactgcctacctgcagtggagcagcctgaaggcctcggacaccgccatgtattactgtgcgaga +>IGHV5-10-1*02 +gaagtgcagctggtgcagtctggagca...gaggtgaaaaagcccggggagtctctgaggatctcctgtaagggttctggatacagcttt............accagctactggatcagctgggtgcgccagatgcccgggaaaggcttggagtggatggggaggattgatcctagt......gactcttataccaactacagcccgtccttccaa...ggccacgtcaccatctcagctgacaagtccatcagcactgcctacctgcagtggagcagcctgaaggc.tcggacaccgccatgtattactgtgcgagaca +>IGHV5-10-1*03 +gaagtgcagctggtgcagtccggagca...gaggtgaaaaagcccggggagtctctgaggatctcctgtaagggttctggatacagcttt............accagctactggatcagctgggtgcgccagatgcccgggaaaggcctggagtggatggggaggattgatcctagt......gactcttataccaactacagcccgtccttccaa...ggccacgtcaccatctcagctgacaagtccatcagcactgcctacctgcagtggagcagcctgaaggcctcggacaccgccatgtattactgtgcgaga +>IGHV5-10-1*04 +gaagtgcagctggtgcagtctggagca...gaggtgaaaaagcccggggagtctctgaggatctcctgtaagggttctggatacagcttt............accagctactggatcagctgggtgcgccagatgcccgggaaaggcctggagtggatggggaggattgatcctagt......gactcttataccaactacagcccgtccttccaa...ggccaggtcaccatctcagctgacaagtccatcagcactgcctacctgcagtggagcagcctgaaggcctcggacaccgccatgtattactgtgcgaga +>IGHV5-51*01 +gaggtgcagctggtgcagtctggagca...gaggtgaaaaagcccggggagtctctgaagatctcctgtaagggttctggatacagcttt............accagctactggatcggctgggtgcgccagatgcccgggaaaggcctggagtggatggggatcatctatcctggt......gactctgataccagatacagcccgtccttccaa...ggccaggtcaccatctcagccgacaagtccatcagcaccgcctacctgcagtggagcagcctgaaggcctcggacaccgccatgtattactgtgcgagaca +>IGHV5-51*02 
+gaggtgcagctggtgcagtctggagca...gaggtgaaaaagcccggggagtctctgaagatctcctgtaagggttctggatacagcttt............accagctactggaccggctgggtgcgccagatgcccgggaaaggcttggagtggatggggatcatctatcctggt......gactctgataccagatacagcccgtccttccaa...ggccaggtcaccatctcagccgacaagtccatcagcaccgcctacctgcagtggagcagcctgaaggcctcggacaccgccatgtattactgtgcgagaca +>IGHV5-51*03 +gaggtgcagctggtgcagtctggagca...gaggtgaaaaagccgggggagtctctgaagatctcctgtaagggttctggatacagcttt............accagctactggatcggctgggtgcgccagatgcccgggaaaggcctggagtggatggggatcatctatcctggt......gactctgataccagatacagcccgtccttccaa...ggccaggtcaccatctcagccgacaagtccatcagcaccgcctacctgcagtggagcagcctgaaggcctcggacaccgccatgtattactgtgcgaga +>IGHV5-51*04 +gaggtgcagctggtgcagtctggagca...gaggtgaaaaagccgggggagtctctgaagatctcctgtaagggttctggatacagcttt............accagctactggatcggctgggtgcgccagatgcccgggaaaggcctggagtggatggggatcatctatcctggt......gactctgataccagatacagcccgtccttccaa...ggccaggtcaccatctcagccgacaagcccatcagcaccgcctacctgcagtggagcagcctgaaggcctcggacaccgccatgtattactgtgcgaga +>IGHV5-51*05 +.....................................aaaagcccggggagtctctgaagatctcctgtaagggttctggatacagcttt............accagctactggatcggctgggtgcgccagatgcccaggaaaggcctggagtggatggggatcatctatcctggt......gactctgataccagatacagcccgtccttccaa...ggccaggtcaccatctcagccgacaagtccatcagcaccgcctacctgcagtggagcagcctgaaggcctcggacaccgccatg +>IGHV5-78*01 +gaggtgcagctgttgcagtctgcagca...gaggtgaaaagacccggggagtctctgaggatctcctgtaagacttctggatacagcttt............accagctactggatccactgggtgcgccagatgcccgggaaagaactggagtggatggggagcatctatcctggg......aactctgataccagatacagcccatccttccaa...ggccacgtcaccatctcagccgacagctccagcagcaccgcctacctgcagtggagcagcctgaaggcctcggacgccgccatgtattattgtgtgaga +>IGHV6-1*01 +caggtacagctgcagcagtcaggtcca...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga +>IGHV6-1*02 
+caggtacagctgcagcagtcaggtccg...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga +>IGHV7-34-1*01 +...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......actgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta +>IGHV7-34-1*02 +...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......aatgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta +>IGHV7-4-1*01 +caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatctgcagcctaaaggctgaggacactgccgtgtattactgtgcgaga +>IGHV7-4-1*02 +caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga +>IGHV7-4-1*03 +caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcacgctaaaggctgaggacactg +>IGHV7-4-1*04 
+caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga +>IGHV7-4-1*05 +caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtgttactgtgcgagaga +>AIGHV7-40*03| +ttttcaatagaaaagtcaaataatcta...agtgtcaatcagtggatgattagataaaatatgatatatgtaaatcatggaatactatgc............agccagtatggtatgaattcagtgtgaccagcccctggacaagggcttgagtggatgggatggatcatcacctac......actgggaacccaacatataccaacggcttcaca...ggacggtttctattctccatggacacctctgtcagcatggcgtatctgcagatcagcagcctaaaggctgaggacacggccgtgtatgactgtatgagaga +>IGHV7-81*01 +caggtgcagctggtgcagtctggccat...gaggtgaagcagcctggggcctcagtgaaggtctcctgcaaggcttctggttacagtttc............accacctatggtatgaattgggtgccacaggcccctggacaagggcttgagtggatgggatggttcaacacctac......actgggaacccaacatatgcccagggcttcaca...ggacggtttgtcttctccatggacacctctgccagcacagcatacctgcagatcagcagcctaaaggctgaggacatggccatgtattactgtgcgagata diff -r 000000000000 -r 8a5a2abbb870 baseline/comparePDFs.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/comparePDFs.r Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,225 @@ +options("warn"=-1) + +#from http://selection.med.yale.edu/baseline/Archive/Baseline%20Version%201.3/Baseline_Functions_Version1.3.r +# Compute p-value of two distributions +compareTwoDistsFaster <-function(sigma_S=seq(-20,20,length.out=4001), N=10000, dens1=runif(4001,0,1), dens2=runif(4001,0,1)){ +#print(c(length(dens1),length(dens2))) +if(length(dens1)>1 & length(dens2)>1 ){ + dens1<-dens1/sum(dens1) + dens2<-dens2/sum(dens2) + cum2 <- cumsum(dens2)-dens2/2 + tmp<- 
sum(sapply(1:length(dens1),function(i)return(dens1[i]*cum2[i]))) + #print(tmp) + if(tmp>0.5)tmp<-tmp-1 + return( tmp ) + } + else { + return(NA) + } + #return (sum(sapply(1:N,function(i)(sample(sigma_S,1,prob=dens1)>sample(sigma_S,1,prob=dens2))))/N) +} + + +require("grid") +arg <- commandArgs(TRUE) +#arg <- c("300143","4","5") +arg[!arg=="clonal"] +input <- arg[1] +output <- arg[2] +rowIDs <- as.numeric( sapply(arg[3:(max(3,length(arg)))],function(x){ gsub("chkbx","",x) } ) ) + +numbSeqs = length(rowIDs) + +if ( is.na(rowIDs[1]) | numbSeqs>10 ) { + stop( paste("Error: Please select between one and 10 seqeunces to compare.") ) +} + +#load( paste("output/",sessionID,".RData",sep="") ) +load( input ) +#input + +xMarks = seq(-20,20,length.out=4001) + +plot_grid_s<-function(pdf1,pdf2,Sample=100,cex=1,xlim=NULL,xMarks = seq(-20,20,length.out=4001)){ + yMax = max(c(abs(as.numeric(unlist(listPDFs[pdf1]))),abs(as.numeric(unlist(listPDFs[pdf2]))),0),na.rm=T) * 1.1 + + if(length(xlim==2)){ + xMin=xlim[1] + xMax=xlim[2] + } else { + xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1] + xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1] + xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])] + xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])] + + xMin_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][1] + xMin_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][1] + xMax_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001])] + xMax_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001])] + + xMin=min(c(xMin_CDR,xMin_FWR,xMin_CDR2,xMin_FWR2,0),na.rm=TRUE) + xMax=max(c(xMax_CDR,xMax_FWR,xMax_CDR2,xMax_FWR2,0),na.rm=TRUE) + } + + sigma<-approx(xMarks,xout=seq(xMin,xMax,length.out=Sample))$x + grid.rect(gp = gpar(col=gray(0.6),fill="white",cex=cex)) + x <- sigma + 
pushViewport(viewport(x=0.175,y=0.175,width=0.825,height=0.825,just=c("left","bottom"),default.units="npc")) + #pushViewport(plotViewport(c(1.8, 1.8, 0.25, 0.25)*cex)) + pushViewport(dataViewport(x, c(yMax,-yMax),gp = gpar(cex=cex),extension=c(0.05))) + grid.polygon(c(0,0,1,1),c(0,0.5,0.5,0),gp=gpar(col=grey(0.95),fill=grey(0.95)),default.units="npc") + grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.9),fill=grey(0.9)),default.units="npc") + grid.rect() + grid.xaxis(gp = gpar(cex=cex/1.1)) + yticks = pretty(c(-yMax,yMax),8) + yticks = yticks[yticks>(-yMax) & yticks<(yMax)] + grid.yaxis(at=yticks,label=abs(yticks),gp = gpar(cex=cex/1.1)) + if(length(listPDFs[pdf1][[1]][["CDR"]])>1){ + ycdr<-approx(xMarks,listPDFs[pdf1][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y + grid.lines(unit(x,"native"), unit(ycdr,"native"),gp=gpar(col=2,lwd=2)) + } + if(length(listPDFs[pdf1][[1]][["FWR"]])>1){ + yfwr<-approx(xMarks,listPDFs[pdf1][[1]][["FWR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y + grid.lines(unit(x,"native"), unit(-yfwr,"native"),gp=gpar(col=4,lwd=2)) + } + + if(length(listPDFs[pdf2][[1]][["CDR"]])>1){ + ycdr2<-approx(xMarks,listPDFs[pdf2][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y + grid.lines(unit(x,"native"), unit(ycdr2,"native"),gp=gpar(col=2,lwd=2,lty=2)) + } + if(length(listPDFs[pdf2][[1]][["FWR"]])>1){ + yfwr2<-approx(xMarks,listPDFs[pdf2][[1]][["FWR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y + grid.lines(unit(x,"native"), unit(-yfwr2,"native"),gp=gpar(col=4,lwd=2,lty=2)) + } + + grid.lines(unit(c(0,1),"npc"), unit(c(0.5,0.5),"npc"),gp=gpar(col=1)) + grid.lines(unit(c(0,0),"native"), unit(c(0,1),"npc"),gp=gpar(col=1,lwd=1,lty=3)) + + grid.text("Density", x = unit(-2.5, "lines"), rot = 90,gp = gpar(cex=cex)) + grid.text( expression(paste("Selection Strength (", Sigma, ")", sep="")) , y = unit(-2.5, "lines"),gp = gpar(cex=cex)) + + if(pdf1==pdf2 & 
length(listPDFs[pdf2][[1]][["FWR"]])>1 & length(listPDFs[pdf2][[1]][["CDR"]])>1 ){ + pCDRFWR = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens1=listPDFs[[pdf1]][["CDR"]], dens2=listPDFs[[pdf1]][["FWR"]]) + pval = formatC(as.numeric(pCDRFWR),digits=3) + grid.text( substitute(expression(paste(P[CDR/FWR], "=", x, sep="")),list(x=pval))[[2]] , x = unit(0.02, "npc"),y = unit(0.98, "npc"),just=c("left", "top"),gp = gpar(cex=cex*1.2)) + } + grid.text(paste("CDR"), x = unit(0.98, "npc"),y = unit(0.98, "npc"),just=c("right", "top"),gp = gpar(cex=cex*1.5)) + grid.text(paste("FWR"), x = unit(0.98, "npc"),y = unit(0.02, "npc"),just=c("right", "bottom"),gp = gpar(cex=cex*1.5)) + popViewport(2) +} +#plot_grid_s(1) + + +p2col<-function(p=0.01){ + breaks=c(-.51,-0.1,-.05,-0.01,-0.005,0,0.005,0.01,0.05,0.1,0.51) + i<-findInterval(p,breaks) + cols = c( rgb(0.8,1,0.8), rgb(0.6,1,0.6), rgb(0.4,1,0.4), rgb(0.2,1,0.2) , rgb(0,1,0), + rgb(1,0,0), rgb(1,.2,.2), rgb(1,.4,.4), rgb(1,.6,.6) , rgb(1,.8,.8) ) + return(cols[i]) +} + + +plot_pvals<-function(pdf1,pdf2,cex=1,upper=TRUE){ + if(upper){ + pCDR1FWR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens1=listPDFs[[pdf1]][["CDR"]], dens2=listPDFs[[pdf2]][["FWR"]]) + pFWR1FWR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens1=listPDFs[[pdf1]][["FWR"]], dens2=listPDFs[[pdf2]][["FWR"]]) + pFWR1CDR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens2=listPDFs[[pdf2]][["CDR"]], dens1=listPDFs[[pdf1]][["FWR"]]) + pCDR1CDR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens2=listPDFs[[pdf2]][["CDR"]], dens1=listPDFs[[pdf1]][["CDR"]]) + grid.polygon(c(0.5,0.5,1,1),c(0,0.5,0.5,0),gp=gpar(col=p2col(pFWR1FWR2),fill=p2col(pFWR1FWR2)),default.units="npc") + grid.polygon(c(0.5,0.5,1,1),c(1,0.5,0.5,1),gp=gpar(col=p2col(pCDR1FWR2),fill=p2col(pCDR1FWR2)),default.units="npc") + grid.polygon(c(0.5,0.5,0,0),c(1,0.5,0.5,1),gp=gpar(col=p2col(pCDR1CDR2),fill=p2col(pCDR1CDR2)),default.units="npc") + 
grid.polygon(c(0.5,0.5,0,0),c(0,0.5,0.5,0),gp=gpar(col=p2col(pFWR1CDR2),fill=p2col(pFWR1CDR2)),default.units="npc") + + grid.lines(c(0,1),0.5,gp=gpar(lty=2,col=gray(0.925))) + grid.lines(0.5,c(0,1),gp=gpar(lty=2,col=gray(0.925))) + + grid.text(formatC(as.numeric(pFWR1FWR2),digits=3), x = unit(0.75, "npc"),y = unit(0.25, "npc"),just=c("center", "center"),gp = gpar(cex=cex)) + grid.text(formatC(as.numeric(pCDR1FWR2),digits=3), x = unit(0.75, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex)) + grid.text(formatC(as.numeric(pCDR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex)) + grid.text(formatC(as.numeric(pFWR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.25, "npc"),just=c("center", "center"),gp = gpar(cex=cex)) + + + # grid.text(paste("P = ",formatC(pCDRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.98, "npc"),just=c("center", "top"),gp = gpar(cex=cex)) + # grid.text(paste("P = ",formatC(pFWRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.02, "npc"),just=c("center", "bottom"),gp = gpar(cex=cex)) + } + else{ + } +} + + +################################################################################## +################## The whole OCD's matrix ######################################## +################################################################################## + +#pdf(width=4*numbSeqs+1/3,height=4*numbSeqs+1/3) +pdf( output ,width=4*numbSeqs+1/3,height=4*numbSeqs+1/3) + +pushViewport(viewport(x=0.02,y=0.02,just = c("left", "bottom"),w =0.96,height=0.96,layout = grid.layout(numbSeqs+1,numbSeqs+1,widths=unit.c(unit(rep(1,numbSeqs),"null"),unit(4,"lines")),heights=unit.c(unit(4,"lines"),unit(rep(1,numbSeqs),"null"))))) + +for( seqOne in 1:numbSeqs+1){ + pushViewport(viewport(layout.pos.col = seqOne-1, layout.pos.row = 1)) + if(seqOne>2){ + grid.polygon(c(0,0,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc") + 
grid.polygon(c(1,1,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc") + grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.5)),default.units="npc") + + grid.text(y=.25,x=0.75,"FWR",gp = gpar(cex=1.5),just="center") + grid.text(y=.25,x=0.25,"CDR",gp = gpar(cex=1.5),just="center") + } + grid.rect(gp = gpar(col=grey(0.9))) + grid.text(y=.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),just="center") + popViewport(1) +} + +for( seqOne in 1:numbSeqs+1){ + pushViewport(viewport(layout.pos.row = seqOne, layout.pos.col = numbSeqs+1)) + if(seqOne<=numbSeqs){ + grid.polygon(c(0,0.5,0.5,0),c(0,0,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc") + grid.polygon(c(0,0.5,0.5,0),c(1,1,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc") + grid.polygon(c(1,0.5,0.5,1),c(0,0,1,1),gp=gpar(col=grey(0.5)),default.units="npc") + grid.text(x=.25,y=0.75,"CDR",gp = gpar(cex=1.5),just="center",rot=270) + grid.text(x=.25,y=0.25,"FWR",gp = gpar(cex=1.5),just="center",rot=270) + } + grid.rect(gp = gpar(col=grey(0.9))) + grid.text(x=0.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),rot=270,just="center") + popViewport(1) +} + +for( seqOne in 1:numbSeqs+1){ + for(seqTwo in 1:numbSeqs+1){ + pushViewport(viewport(layout.pos.col = seqTwo-1, layout.pos.row = seqOne)) + if(seqTwo>seqOne){ + plot_pvals(rowIDs[seqOne-1],rowIDs[seqTwo-1],cex=2) + grid.rect() + } + popViewport(1) + } +} + + +xMin=0 +xMax=0.01 +for(pdf1 in rowIDs){ + xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1] + xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1] + xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])] + xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])] + xMin=min(c(xMin_CDR,xMin_FWR,xMin),na.rm=TRUE) + xMax=max(c(xMax_CDR,xMax_FWR,xMax),na.rm=TRUE) +} + + + +for(i in 
1:numbSeqs+1){ + for(j in (i-1):numbSeqs){ + pushViewport(viewport(layout.pos.col = i-1, layout.pos.row = j+1)) + grid.rect() + plot_grid_s(rowIDs[i-1],rowIDs[j],cex=1) + popViewport(1) + } +} + +dev.off() + +cat("Success", paste(rowIDs,collapse="_"),sep=":") + diff -r 000000000000 -r 8a5a2abbb870 baseline/filter.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/filter.r Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,35 @@ +arg = commandArgs(TRUE) +summaryfile = arg[1] +gappedfile = arg[2] +selection = arg[3] +output = arg[4] +print(paste("selection = ", selection)) + + +summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F) +gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F) + +#dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T)) + +dat = cbind(gappeddat, summarydat$AA.JUNCTION) + +colnames(dat)[length(dat)] = "AA.JUNCTION" + +dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele) +dat$VGene = gsub("[*].*", "", dat$VGene) + +dat$DGene = gsub("^Homsap ", "", dat$D.GENE.and.allele) +dat$DGene = gsub("[*].*", "", dat$DGene) + +dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele) +dat$JGene = gsub("[*].*", "", dat$JGene) + +#print(str(dat)) + +dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":")) + +dat = dat[!duplicated(dat$past), ] + +dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",] + +write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T) diff -r 000000000000 -r 8a5a2abbb870 baseline/script_imgt.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/script_imgt.py Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,79 @@ +#import xlrd #avoid dep +import argparse +import re + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") 
+parser.add_argument("--ref", help="Reference file") +parser.add_argument("--output", help="Output file") +parser.add_argument("--id", help="ID to be used at the '>>>' line in the output") + +args = parser.parse_args() + +refdic = dict() +with open(args.ref, 'r') as ref: + currentSeq = "" + currentId = "" + for line in ref: + if line[0] is ">": + if currentSeq is not "" and currentId is not "": + refdic[currentId[1:]] = currentSeq + currentId = line.rstrip() + currentSeq = "" + else: + currentSeq += line.rstrip() + refdic[currentId[1:]] = currentSeq + + +vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#, +# r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", +# r"(IGKV[0-3]D?-[0-9]{1,2})", +# r"(IGLV[0-9]-[0-9]{1,2})", +# r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)", +# r"(TRGV[234589])", +# r"(TRDV[1-3])"] + +#vPattern = re.compile(r"|".join(vPattern)) +vPattern = re.compile("|".join(vPattern)) + +def filterGene(s, pattern): + if type(s) is not str: + return None + res = pattern.search(s) + if res: + return res.group(0) + return None + + + +currentSeq = "" +currentId = "" +first=True +with open(args.input, 'r') as i: + with open(args.output, 'a') as o: + o.write(">>>" + args.id + "\n") + outputdic = dict() + for line in i: + if first: + first = False + continue + linesplt = line.split("\t") + ref = filterGene(linesplt[1], vPattern) + if not ref or not linesplt[2].rstrip(): + continue + if ref in outputdic: + outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] + else: + outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] + #print outputdic + + for k in outputdic.keys(): + if k in refdic: + o.write(">>" + k + "\n") + o.write(refdic[k] + "\n") + for seq in outputdic[k]: + #print seq + o.write(">" + seq[0] + "\n") + o.write(seq[1] + "\n") + else: + print k + " not in reference, skipping " + k diff -r 000000000000 -r 8a5a2abbb870 baseline/script_xlsx.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/baseline/script_xlsx.py Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,58 @@ +import xlrd +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") +parser.add_argument("--ref", help="Reference file") +parser.add_argument("--output", help="Output file") + +args = parser.parse_args() + +gene_column = 6 +id_column = 7 +seq_column = 8 +LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"] + + +refdic = dict() +with open(args.ref, 'r') as ref: + currentSeq = "" + currentId = "" + for line in ref.readlines(): + if line[0] is ">": + if currentSeq is not "" and currentId is not "": + refdic[currentId[1:]] = currentSeq + currentId = line.rstrip() + currentSeq = "" + else: + currentSeq += line.rstrip() + refdic[currentId[1:]] = currentSeq + +currentSeq = "" +currentId = "" +with xlrd.open_workbook(args.input, 'r') as wb: + with open(args.output, 'a') as o: + for sheet in wb.sheets(): + if sheet.cell(1,gene_column).value.find("IGHV") < 0: + print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name + continue + o.write(">>>" + sheet.name + "\n") + outputdic = dict() + for rowindex in range(1, sheet.nrows): + ref = sheet.cell(rowindex, gene_column).value.replace(">", "") + if ref in outputdic: + outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)] + else: + outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)] + #print outputdic + + for k in outputdic.keys(): + if k in refdic: + o.write(">>" + k + "\n") + o.write(refdic[k] + "\n") + for seq in outputdic[k]: + #print seq + o.write(">" + seq[0] + "\n") + o.write(seq[1] + "\n") + else: + print k + " not in reference, skipping " + k diff -r 000000000000 -r 8a5a2abbb870 baseline/wrapper.sh --- /dev/null Thu Jan 01 
00:00:00 1970 +0000 +++ b/baseline/wrapper.sh Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,104 @@ +#!/bin/bash +dir="$(cd "$(dirname "$0")" && pwd)" + +testID=$1 +species=$2 +substitutionModel=$3 +mutabilityModel=$4 +clonal=$5 +fixIndels=$6 +region=$7 +inputs=$8 +inputs=($inputs) +IDs=$9 +IDs=($IDs) +ref=${10} +output=${11} +selection=${12} +output_table=${13} +outID="result" + +echo "$PWD" + +echo "testID = $testID" +echo "species = $species" +echo "substitutionModel = $substitutionModel" +echo "mutabilityModel = $mutabilityModel" +echo "clonal = $clonal" +echo "fixIndels = $fixIndels" +echo "region = $region" +echo "inputs = ${inputs[@]}" +echo "IDs = ${IDs[@]}" +echo "ref = $ref" +echo "output = $output" +echo "outID = $outID" + +fasta="$PWD/baseline.fasta" + + +count=0 +for current in ${inputs[@]} +do + f=$(file $current) + zipType="Zip archive" + if [[ "$f" == *"$zipType"* ]] || [[ "$f" == *"XZ compressed data"* ]] + then + id=${IDs[$count]} + echo "id=$id" + if [[ "$f" == *"Zip archive"* ]] ; then + echo "Zip archive" + echo "unzip $input -d $PWD/files/" + unzip $current -d "$PWD/$id/" + elif [[ "$f" == *"XZ compressed data"* ]] ; then + echo "ZX archive" + echo "tar -xJf $input -C $PWD/files/" + mkdir -p "$PWD/$id/files" + tar -xJf $current -C "$PWD/$id/files/" + fi + summaryfile="$PWD/summary_${id}.txt" + gappedfile="$PWD/gappednt_${id}.txt" + filtered="$PWD/filtered_${id}.txt" + filecount=`ls -l $PWD/$id/ | wc -l` + if [[ "$filecount" -eq "2" ]] + then + cat $PWD/$id/*/1_* > $summaryfile + cat $PWD/$id/*/2_* > $gappedfile + else + cat $PWD/$id/1_* > $summaryfile + cat $PWD/$id/2_* > $gappedfile + fi + Rscript $dir/filter.r $summaryfile $gappedfile "$selection" $filtered 2>&1 + + final="$PWD/final_${id}.txt" + cat $filtered | cut -f2,4,7 > $final + python $dir/script_imgt.py --input $final --ref $ref --output $fasta --id $id + else + python $dir/script_xlsx.py --input $current --ref $ref --output $fasta + fi + count=$((count+1)) +done + +if [[ $(wc -l < 
$fasta) -eq "1" ]]; then + echo "No sequences in the fasta file, exiting" + exit 0 +fi + +workdir="$PWD" +cd $dir +echo "file: ${inputs[0]}" +#Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region ${inputs[0]} $workdir/ $outID 2>&1 +Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region $fasta $workdir/ $outID 2>&1 + +echo "$workdir/${outID}.txt" + +rows=`tail -n +2 $workdir/${outID}.txt | grep -v "All sequences combined" | grep -n 'Group' | grep -Eoh '^[0-9]+' | tr '\n' ' '` +rows=($rows) +#unset rows[${#rows[@]}-1] + +cd $dir +Rscript --verbose $dir/comparePDFs.r $workdir/${outID}.RData $output ${rows[@]} 2>&1 +cp $workdir/result.txt ${output_table} + + + + diff -r 000000000000 -r 8a5a2abbb870 change_o/DefineClones.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/change_o/DefineClones.py Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,1052 @@ +#!/usr/bin/env python3 +""" +Assign Ig sequences into clones +""" +# Info +__author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman' +from changeo import __version__, __date__ + +# Imports +import os +import re +import sys +import numpy as np +from argparse import ArgumentParser +from collections import OrderedDict +from itertools import chain +from textwrap import dedent +from time import time +from Bio import pairwise2 +from Bio.Seq import translate + +# Presto and changeo imports +from presto.Defaults import default_out_args +from presto.IO import getFileType, getOutputHandle, printLog, printProgress +from presto.Multiprocessing import manageProcesses +from presto.Sequence import getDNAScoreDict +from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs +from changeo.Distance import getDNADistMatrix, getAADistMatrix, \ + hs1f_model, m1n_model, hs5f_model, \ + calcDistances, formClusters +from changeo.IO import getDbWriter, readDbFile, 
countDbFile +from changeo.Multiprocessing import DbData, DbResult + +# Defaults +default_translate = False +default_distance = 0.0 +default_bygroup_model = 'hs1f' +default_hclust_model = 'chen2010' +default_seq_field = 'JUNCTION' +default_norm = 'len' +default_sym = 'avg' +default_linkage = 'single' + +# TODO: should be in Distance, but need to be after function definitions +# Amino acid Hamming distance +aa_model = getAADistMatrix(mask_dist=1, gap_dist=0) + +# DNA Hamming distance +ham_model = getDNADistMatrix(mask_dist=0, gap_dist=0) + + +# TODO: this function is an abstraction to facilitate later cleanup +def getModelMatrix(model): + """ + Simple wrapper to get distance matrix from model name + + Arguments: + model = model name + + Return: + a pandas.DataFrame containing the character distance matrix + """ + if model == 'aa': + return(aa_model) + elif model == 'ham': + return(ham_model) + elif model == 'm1n': + return(m1n_model) + elif model == 'hs1f': + return(hs1f_model) + elif model == 'hs5f': + return(hs5f_model) + else: + sys.stderr.write('Unrecognized distance model: %s.\n' % model) + + +def indexJunctions(db_iter, fields=None, mode='gene', action='first'): + """ + Identifies preclonal groups by V, J and junction length + + Arguments: + db_iter = an iterator of IgRecords defined by readDbFile + fields = additional annotation fields to use to group preclones; + if None use only V, J and junction length + mode = specificity of alignment call to use for assigning preclones; + one of ('allele', 'gene') + action = how to handle multiple value fields when assigning preclones; + one of ('first', 'set') + + Returns: + a dictionary of {(V, J, junction length):[IgRecords]} + """ + # Define functions for grouping keys + if mode == 'allele' and fields is None: + def _get_key(rec, act): + return (rec.getVAllele(act), rec.getJAllele(act), + None if rec.junction is None else len(rec.junction)) + elif mode == 'gene' and fields is None: + def _get_key(rec, act): + return 
(rec.getVGene(act), rec.getJGene(act), + None if rec.junction is None else len(rec.junction)) + elif mode == 'allele' and fields is not None: + def _get_key(rec, act): + vdj = [rec.getVAllele(act), rec.getJAllele(act), + None if rec.junction is None else len(rec.junction)] + ann = [rec.toDict().get(k, None) for k in fields] + return tuple(chain(vdj, ann)) + elif mode == 'gene' and fields is not None: + def _get_key(rec, act): + vdj = [rec.getVGene(act), rec.getJGene(act), + None if rec.junction is None else len(rec.junction)] + ann = [rec.toDict().get(k, None) for k in fields] + return tuple(chain(vdj, ann)) + + start_time = time() + clone_index = {} + rec_count = 0 + for rec in db_iter: + key = _get_key(rec, action) + + # Print progress + if rec_count == 0: + print('PROGRESS> Grouping sequences') + + printProgress(rec_count, step=1000, start_time=start_time) + rec_count += 1 + + # Assigned passed preclone records to key and failed to index None + if all([k is not None and k != '' for k in key]): + #print key + # TODO: Has much slow. Should have less slow. 
+ if action == 'set': + + f_range = list(range(2, 3 + (len(fields) if fields else 0))) + vdj_range = list(range(2)) + + # Check for any keys that have matching columns and junction length and overlapping genes/alleles + to_remove = [] + if len(clone_index) > (1 if None in clone_index else 0) and key not in clone_index: + key = list(key) + for k in clone_index: + if k is not None and all([key[i] == k[i] for i in f_range]): + if all([not set(key[i]).isdisjoint(set(k[i])) for i in vdj_range]): + for i in vdj_range: key[i] = tuple(set(key[i]).union(set(k[i]))) + to_remove.append(k) + + # Remove original keys, replace with union of all genes/alleles and append values to new key + val = [rec] + val += list(chain(*(clone_index.pop(k) for k in to_remove))) + clone_index[tuple(key)] = clone_index.get(tuple(key),[]) + val + + elif action == 'first': + clone_index.setdefault(key, []).append(rec) + else: + clone_index.setdefault(None, []).append(rec) + + printProgress(rec_count, step=1000, start_time=start_time, end=True) + + return clone_index + + +def distanceClones(records, model=default_bygroup_model, distance=default_distance, + dist_mat=None, norm=default_norm, sym=default_sym, + linkage=default_linkage, seq_field=default_seq_field): + """ + Separates a set of IgRecords into clones + + Arguments: + records = an iterator of IgRecords + model = substitution model used to calculate distance + distance = the distance threshold to assign clonal groups + dist_mat = pandas DataFrame of pairwise nucleotide or amino acid distances + norm = normalization method + sym = symmetry method + linkage = type of linkage + seq_field = sequence field used to calculate distance between records + + Returns: + a dictionary of lists defining {clone number: [IgRecords clonal group]} + """ + # Get distance matrix if not provided + if dist_mat is None: dist_mat = getModelMatrix(model) + + # Determine length of n-mers + if model in ['hs1f', 'm1n', 'aa', 'ham']: + nmer_len = 1 + elif model in 
['hs5f']: + nmer_len = 5 + else: + sys.stderr.write('Unrecognized distance model: %s.\n' % model) + + # Define unique junction mapping + seq_map = {} + for ig in records: + seq = ig.getSeqField(seq_field) + # Check if sequence length is 0 + if len(seq) == 0: + return None + + seq = re.sub('[\.-]','N', str(seq)) + if model == 'aa': seq = translate(seq) + + seq_map.setdefault(seq, []).append(ig) + + # Process records + if len(seq_map) == 1: + return {1:records} + + # Define sequences + seqs = list(seq_map.keys()) + + # Calculate pairwise distance matrix + dists = calcDistances(seqs, nmer_len, dist_mat, norm, sym) + + # Perform hierarchical clustering + clusters = formClusters(dists, linkage, distance) + + # Turn clusters into clone dictionary + clone_dict = {} + for i, c in enumerate(clusters): + clone_dict.setdefault(c, []).extend(seq_map[seqs[i]]) + + return clone_dict + + +def distChen2010(records): + """ + Calculate pairwise distances as defined in Chen 2010 + + Arguments: + records = list of IgRecords where first is query to be compared to others in list + + Returns: + list of distances + """ + # Pull out query sequence and V/J information + query = records.popitem(last=False) + query_cdr3 = query.junction[3:-3] + query_v_allele = query.getVAllele() + query_v_gene = query.getVGene() + query_v_family = query.getVFamily() + query_j_allele = query.getJAllele() + query_j_gene = query.getJGene() + # Create alignment scoring dictionary + score_dict = getDNAScoreDict() + + scores = [0]*len(records) + for i in range(len(records)): + ld = pairwise2.align.globalds(query_cdr3, records[i].junction[3:-3], + score_dict, -1, -1, one_alignment_only=True) + # Check V similarity + if records[i].getVAllele() == query_v_allele: ld += 0 + elif records[i].getVGene() == query_v_gene: ld += 1 + elif records[i].getVFamily() == query_v_family: ld += 3 + else: ld += 5 + # Check J similarity + if records[i].getJAllele() == query_j_allele: ld += 0 + elif records[i].getJGene() == 
query_j_gene: ld += 1 + else: ld += 3 + # Divide by length + scores[i] = ld/max(len(records[i].junction[3:-3]), query_cdr3) + + return scores + + +def distAdemokun2011(records): + """ + Calculate pairwise distances as defined in Ademokun 2011 + + Arguments: + records = list of IgRecords where first is query to be compared to others in list + + Returns: + list of distances + """ + # Pull out query sequence and V family information + query = records.popitem(last=False) + query_cdr3 = query.junction[3:-3] + query_v_family = query.getVFamily() + # Create alignment scoring dictionary + score_dict = getDNAScoreDict() + + scores = [0]*len(records) + for i in range(len(records)): + + if abs(len(query_cdr3) - len(records[i].junction[3:-3])) > 10: + scores[i] = 1 + elif query_v_family != records[i].getVFamily(): + scores[i] = 1 + else: + ld = pairwise2.align.globalds(query_cdr3, records[i].junction[3:-3], + score_dict, -1, -1, one_alignment_only=True) + scores[i] = ld/min(len(records[i].junction[3:-3]), query_cdr3) + + return scores + + +def hierClust(dist_mat, method='chen2010'): + """ + Calculate hierarchical clustering + + Arguments: + dist_mat = square-formed distance matrix of pairwise CDR3 comparisons + + Returns: + list of cluster ids + """ + if method == 'chen2010': + clusters = formClusters(dist_mat, 'average', 0.32) + elif method == 'ademokun2011': + clusters = formClusters(dist_mat, 'complete', 0.25) + else: clusters = np.ones(dist_mat.shape[0]) + + return clusters + +# TODO: Merge duplicate feed, process and collect functions. 
+def feedQueue(alive, data_queue, db_file, group_func, group_args={}): + """ + Feeds the data queue with Ig records + + Arguments: + alive = a multiprocessing.Value boolean controlling whether processing continues + if False exit process + data_queue = a multiprocessing.Queue to hold data for processing + db_file = the Ig record database file + group_func = the function to use for assigning preclones + group_args = a dictionary of arguments to pass to group_func + + Returns: + None + """ + # Open input file and perform grouping + try: + # Iterate over Ig records and assign groups + db_iter = readDbFile(db_file) + clone_dict = group_func(db_iter, **group_args) + except: + #sys.stderr.write('Exception in feeder grouping step\n') + alive.value = False + raise + + # Add groups to data queue + try: + #print 'START FEED', alive.value + # Iterate over groups and feed data queue + clone_iter = iter(clone_dict.items()) + while alive.value: + # Get data from queue + if data_queue.full(): continue + else: data = next(clone_iter, None) + # Exit upon reaching end of iterator + if data is None: break + #print "FEED", alive.value, k + + # Feed queue + data_queue.put(DbData(*data)) + else: + sys.stderr.write('PID %s: Error in sibling process detected. 
Cleaning up.\n' \ + % os.getpid()) + return None + except: + #sys.stderr.write('Exception in feeder queue feeding step\n') + alive.value = False + raise + + return None + + +def feedQueueClust(alive, data_queue, db_file, group_func=None, group_args={}): + """ + Feeds the data queue with Ig records + + Arguments: + alive = a multiprocessing.Value boolean controlling whether processing continues + if False exit process + data_queue = a multiprocessing.Queue to hold data for processing + db_file = the Ig record database file + + Returns: + None + """ + # Open input file and perform grouping + try: + # Iterate over Ig records and order by junction length + records = {} + db_iter = readDbFile(db_file) + for rec in db_iter: + records[rec.id] = rec + records = OrderedDict(sorted(list(records.items()), key=lambda i: i[1].junction_length)) + dist_dict = {} + for __ in range(len(records)): + k,v = records.popitem(last=False) + dist_dict[k] = [v].append(list(records.values())) + except: + #sys.stderr.write('Exception in feeder grouping step\n') + alive.value = False + raise + + # Add groups to data queue + try: + # print 'START FEED', alive.value + # Iterate over groups and feed data queue + dist_iter = iter(dist_dict.items()) + while alive.value: + # Get data from queue + if data_queue.full(): continue + else: data = next(dist_iter, None) + # Exit upon reaching end of iterator + if data is None: break + #print "FEED", alive.value, k + + # Feed queue + data_queue.put(DbData(*data)) + else: + sys.stderr.write('PID %s: Error in sibling process detected. 
Cleaning up.\n' \ + % os.getpid()) + return None + except: + #sys.stderr.write('Exception in feeder queue feeding step\n') + alive.value = False + raise + + return None + + +def processQueue(alive, data_queue, result_queue, clone_func, clone_args): + """ + Pulls from data queue, performs calculations, and feeds results queue + + Arguments: + alive = a multiprocessing.Value boolean controlling whether processing continues + if False exit process + data_queue = a multiprocessing.Queue holding data to process + result_queue = a multiprocessing.Queue to hold processed results + clone_func = the function to call for clonal assignment + clone_args = a dictionary of arguments to pass to clone_func + + Returns: + None + """ + try: + # Iterator over data queue until sentinel object reached + while alive.value: + # Get data from queue + if data_queue.empty(): continue + else: data = data_queue.get() + # Exit upon reaching sentinel + if data is None: break + + # Define result object for iteration and get data records + records = data.data + result = DbResult(data.id, records) + + # Check for invalid data (due to failed indexing) and add failed result + if not data: + result_queue.put(result) + continue + + # Add V(D)J to log + result.log['ID'] = ','.join([str(x) for x in data.id]) + result.log['VALLELE'] = ','.join(set([(r.getVAllele() or '') for r in records])) + result.log['DALLELE'] = ','.join(set([(r.getDAllele() or '') for r in records])) + result.log['JALLELE'] = ','.join(set([(r.getJAllele() or '') for r in records])) + result.log['JUNCLEN'] = ','.join(set([(str(len(r.junction)) or '0') for r in records])) + result.log['SEQUENCES'] = len(records) + + # Checking for preclone failure and assign clones + clones = clone_func(records, **clone_args) if data else None + + # import cProfile + # prof = cProfile.Profile() + # clones = prof.runcall(clone_func, records, **clone_args) + # prof.dump_stats('worker-%d.prof' % os.getpid()) + + if clones is not None: + result.results = 
clones + result.valid = True + result.log['CLONES'] = len(clones) + else: + result.log['CLONES'] = 0 + + # Feed results to result queue + result_queue.put(result) + else: + sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \ + % os.getpid()) + return None + except: + #sys.stderr.write('Exception in worker\n') + alive.value = False + raise + + return None + + +def processQueueClust(alive, data_queue, result_queue, clone_func, clone_args): + """ + Pulls from data queue, performs calculations, and feeds results queue + + Arguments: + alive = a multiprocessing.Value boolean controlling whether processing continues + if False exit process + data_queue = a multiprocessing.Queue holding data to process + result_queue = a multiprocessing.Queue to hold processed results + clone_func = the function to call for calculating pairwise distances between sequences + clone_args = a dictionary of arguments to pass to clone_func + + Returns: + None + """ + + try: + # print 'START WORK', alive.value + # Iterator over data queue until sentinel object reached + while alive.value: + # Get data from queue + if data_queue.empty(): continue + else: data = data_queue.get() + # Exit upon reaching sentinel + if data is None: break + # print "WORK", alive.value, data['id'] + + # Define result object for iteration and get data records + records = data.data + result = DbResult(data.id, records) + + # Create row of distance matrix and check for error + dist_row = clone_func(records, **clone_args) if data else None + if dist_row is not None: + result.results = dist_row + result.valid = True + + # Feed results to result queue + result_queue.put(result) + else: + sys.stderr.write('PID %s: Error in sibling process detected. 
Cleaning up.\n' \ + % os.getpid()) + return None + except: + #sys.stderr.write('Exception in worker\n') + alive.value = False + raise + + return None + + +def collectQueue(alive, result_queue, collect_queue, db_file, out_args, cluster_func=None, cluster_args={}): + """ + Assembles results from a queue of individual sequence results and manages log/file I/O + + Arguments: + alive = a multiprocessing.Value boolean controlling whether processing continues + if False exit process + result_queue = a multiprocessing.Queue holding processQueue results + collect_queue = a multiprocessing.Queue to store collector return values + db_file = the input database file name + out_args = common output argument dictionary from parseCommonArgs + cluster_func = the function to call for carrying out clustering on distance matrix + cluster_args = a dictionary of arguments to pass to cluster_func + + Returns: + None + (adds 'log' and 'out_files' to collect_dict) + """ + # Open output files + try: + # Count records and define output format + out_type = getFileType(db_file) if out_args['out_type'] is None \ + else out_args['out_type'] + result_count = countDbFile(db_file) + + # Defined successful output handle + pass_handle = getOutputHandle(db_file, + out_label='clone-pass', + out_dir=out_args['out_dir'], + out_name=out_args['out_name'], + out_type=out_type) + pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE') + + # Defined failed alignment output handle + if out_args['failed']: + fail_handle = getOutputHandle(db_file, + out_label='clone-fail', + out_dir=out_args['out_dir'], + out_name=out_args['out_name'], + out_type=out_type) + fail_writer = getDbWriter(fail_handle, db_file) + else: + fail_handle = None + fail_writer = None + + # Define log handle + if out_args['log_file'] is None: + log_handle = None + else: + log_handle = open(out_args['log_file'], 'w') + except: + #sys.stderr.write('Exception in collector file opening step\n') + alive.value = False + raise + + # Get 
results from queue and write to files + try: + #print 'START COLLECT', alive.value + # Iterator over results queue until sentinel object reached + start_time = time() + rec_count = clone_count = pass_count = fail_count = 0 + while alive.value: + # Get result from queue + if result_queue.empty(): continue + else: result = result_queue.get() + # Exit upon reaching sentinel + if result is None: break + #print "COLLECT", alive.value, result['id'] + + # Print progress for previous iteration and update record count + if rec_count == 0: + print('PROGRESS> Assigning clones') + printProgress(rec_count, result_count, 0.05, start_time) + rec_count += len(result.data) + + # Write passed and failed records + if result: + for clone in result.results.values(): + clone_count += 1 + for i, rec in enumerate(clone): + rec.annotations['CLONE'] = clone_count + pass_writer.writerow(rec.toDict()) + pass_count += 1 + result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction) + + else: + for i, rec in enumerate(result.data): + if fail_writer is not None: fail_writer.writerow(rec.toDict()) + fail_count += 1 + result.log['CLONE0-%i' % (i + 1)] = str(rec.junction) + + # Write log + printLog(result.log, handle=log_handle) + else: + sys.stderr.write('PID %s: Error in sibling process detected. 
Cleaning up.\n' \ + % os.getpid()) + return None + + # Print total counts + printProgress(rec_count, result_count, 0.05, start_time) + + # Close file handles + pass_handle.close() + if fail_handle is not None: fail_handle.close() + if log_handle is not None: log_handle.close() + + # Update return list + log = OrderedDict() + log['OUTPUT'] = os.path.basename(pass_handle.name) + log['CLONES'] = clone_count + log['RECORDS'] = rec_count + log['PASS'] = pass_count + log['FAIL'] = fail_count + collect_dict = {'log':log, 'out_files': [pass_handle.name]} + collect_queue.put(collect_dict) + except: + #sys.stderr.write('Exception in collector result processing step\n') + alive.value = False + raise + + return None + + +def collectQueueClust(alive, result_queue, collect_queue, db_file, out_args, cluster_func, cluster_args): + """ + Assembles results from a queue of individual sequence results and manages log/file I/O + + Arguments: + alive = a multiprocessing.Value boolean controlling whether processing continues + if False exit process + result_queue = a multiprocessing.Queue holding processQueue results + collect_queue = a multiprocessing.Queue to store collector return values + db_file = the input database file name + out_args = common output argument dictionary from parseCommonArgs + cluster_func = the function to call for carrying out clustering on distance matrix + cluster_args = a dictionary of arguments to pass to cluster_func + + Returns: + None + (adds 'log' and 'out_files' to collect_dict) + """ + # Open output files + try: + + # Iterate over Ig records to count and order by junction length + result_count = 0 + records = {} + # print 'Reading file...' 
+ db_iter = readDbFile(db_file) + for rec in db_iter: + records[rec.id] = rec + result_count += 1 + records = OrderedDict(sorted(list(records.items()), key=lambda i: i[1].junction_length)) + + # Define empty matrix to store assembled results + dist_mat = np.zeros((result_count,result_count)) + + # Count records and define output format + out_type = getFileType(db_file) if out_args['out_type'] is None \ + else out_args['out_type'] + + # Defined successful output handle + pass_handle = getOutputHandle(db_file, + out_label='clone-pass', + out_dir=out_args['out_dir'], + out_name=out_args['out_name'], + out_type=out_type) + pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE') + + # Defined failed cloning output handle + if out_args['failed']: + fail_handle = getOutputHandle(db_file, + out_label='clone-fail', + out_dir=out_args['out_dir'], + out_name=out_args['out_name'], + out_type=out_type) + fail_writer = getDbWriter(fail_handle, db_file) + else: + fail_handle = None + fail_writer = None + + # Open log file + if out_args['log_file'] is None: + log_handle = None + else: + log_handle = open(out_args['log_file'], 'w') + except: + alive.value = False + raise + + try: + # Iterator over results queue until sentinel object reached + start_time = time() + row_count = rec_count = 0 + while alive.value: + # Get result from queue + if result_queue.empty(): continue + else: result = result_queue.get() + # Exit upon reaching sentinel + if result is None: break + + # Print progress for previous iteration + if row_count == 0: + print('PROGRESS> Assigning clones') + printProgress(row_count, result_count, 0.05, start_time) + + # Update counts for iteration + row_count += 1 + rec_count += len(result) + + # Add result row to distance matrix + if result: + dist_mat[list(range(result_count-len(result),result_count)),result_count-len(result)] = result.results + + else: + sys.stderr.write('PID %s: Error in sibling process detected. 
Cleaning up.\n' \ + % os.getpid()) + return None + + # Calculate linkage and carry out clustering + # print dist_mat + clusters = cluster_func(dist_mat, **cluster_args) if dist_mat is not None else None + clones = {} + # print clusters + for i, c in enumerate(clusters): + clones.setdefault(c, []).append(records[list(records.keys())[i]]) + + # Write passed and failed records + clone_count = pass_count = fail_count = 0 + if clones: + for clone in clones.values(): + clone_count += 1 + for i, rec in enumerate(clone): + rec.annotations['CLONE'] = clone_count + pass_writer.writerow(rec.toDict()) + pass_count += 1 + #result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction) + + else: + for i, rec in enumerate(result.data): + fail_writer.writerow(rec.toDict()) + fail_count += 1 + #result.log['CLONE0-%i' % (i + 1)] = str(rec.junction) + + # Print final progress + printProgress(row_count, result_count, 0.05, start_time) + + # Close file handles + pass_handle.close() + if fail_handle is not None: fail_handle.close() + if log_handle is not None: log_handle.close() + + # Update return list + log = OrderedDict() + log['OUTPUT'] = os.path.basename(pass_handle.name) + log['CLONES'] = clone_count + log['RECORDS'] = rec_count + log['PASS'] = pass_count + log['FAIL'] = fail_count + collect_dict = {'log':log, 'out_files': [pass_handle.name]} + collect_queue.put(collect_dict) + except: + alive.value = False + raise + + return None + + +def defineClones(db_file, feed_func, work_func, collect_func, clone_func, cluster_func=None, + group_func=None, group_args={}, clone_args={}, cluster_args={}, + out_args=default_out_args, nproc=None, queue_size=None): + """ + Define clonally related sequences + + Arguments: + db_file = filename of input database + feed_func = the function that feeds the queue + work_func = the worker function that will run on each CPU + collect_func = the function that collects results from the workers + group_func = the function to use for assigning preclones 
+ clone_func = the function to use for determining clones within preclonal groups + group_args = a dictionary of arguments to pass to group_func + clone_args = a dictionary of arguments to pass to clone_func + out_args = common output argument dictionary from parseCommonArgs + nproc = the number of processQueue processes; + if None defaults to the number of CPUs + queue_size = maximum size of the argument queue; + if None defaults to 2*nproc + + Returns: + a list of successful output file names + """ + # Print parameter info + log = OrderedDict() + log['START'] = 'DefineClones' + log['DB_FILE'] = os.path.basename(db_file) + if group_func is not None: + log['GROUP_FUNC'] = group_func.__name__ + log['GROUP_ARGS'] = group_args + log['CLONE_FUNC'] = clone_func.__name__ + + # TODO: this is yucky, but can be fixed by using a model class + clone_log = clone_args.copy() + if 'dist_mat' in clone_log: del clone_log['dist_mat'] + log['CLONE_ARGS'] = clone_log + + if cluster_func is not None: + log['CLUSTER_FUNC'] = cluster_func.__name__ + log['CLUSTER_ARGS'] = cluster_args + log['NPROC'] = nproc + printLog(log) + + # Define feeder function and arguments + feed_args = {'db_file': db_file, + 'group_func': group_func, + 'group_args': group_args} + # Define worker function and arguments + work_args = {'clone_func': clone_func, + 'clone_args': clone_args} + # Define collector function and arguments + collect_args = {'db_file': db_file, + 'out_args': out_args, + 'cluster_func': cluster_func, + 'cluster_args': cluster_args} + + # Call process manager + result = manageProcesses(feed_func, work_func, collect_func, + feed_args, work_args, collect_args, + nproc, queue_size) + + # Print log + result['log']['END'] = 'DefineClones' + printLog(result['log']) + + return result['out_files'] + + +def getArgParser(): + """ + Defines the ArgumentParser + + Arguments: + None + + Returns: + an ArgumentParser object + """ + # Define input and output fields + fields = dedent( + ''' + output files: + 
clone-pass + database with assigned clonal group numbers. + clone-fail + database with records failing clonal grouping. + + required fields: + SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION_LENGTH + + + sequence field specified by the --sf parameter + + output fields: + CLONE + ''') + + # Define ArgumentParser + parser = ArgumentParser(description=__doc__, epilog=fields, + formatter_class=CommonHelpFormatter) + parser.add_argument('--version', action='version', + version='%(prog)s:' + ' %s-%s' %(__version__, __date__)) + subparsers = parser.add_subparsers(title='subcommands', dest='command', metavar='', + help='Cloning method') + # TODO: This is a temporary fix for Python issue 9253 + subparsers.required = True + + # Parent parser + parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True, + multiproc=True) + + # Distance cloning method + parser_bygroup = subparsers.add_parser('bygroup', parents=[parser_parent], + formatter_class=CommonHelpFormatter, + help='''Defines clones as having same V assignment, + J assignment, and junction length with + specified substitution distance model.''') + parser_bygroup.add_argument('-f', nargs='+', action='store', dest='fields', default=None, + help='Additional fields to use for grouping clones (non VDJ)') + parser_bygroup.add_argument('--mode', action='store', dest='mode', + choices=('allele', 'gene'), default='gene', + help='''Specifies whether to use the V(D)J allele or gene for + initial grouping.''') + parser_bygroup.add_argument('--act', action='store', dest='action', default='set', + choices=('first', 'set'), + help='''Specifies how to handle multiple V(D)J assignments + for initial grouping.''') + parser_bygroup.add_argument('--model', action='store', dest='model', + choices=('aa', 'ham', 'm1n', 'hs1f', 'hs5f'), + default=default_bygroup_model, + help='''Specifies which substitution model to use for + calculating distance between sequences. 
Where m1n is the + mouse single nucleotide transition/trasversion model + of Smith et al, 1996; hs1f is the human single + nucleotide model derived from Yaari et al, 2013; hs5f + is the human S5F model of Yaari et al, 2013; ham is + nucleotide Hamming distance; and aa is amino acid + Hamming distance. The hs5f data should be + considered experimental.''') + parser_bygroup.add_argument('--dist', action='store', dest='distance', type=float, + default=default_distance, + help='The distance threshold for clonal grouping') + parser_bygroup.add_argument('--norm', action='store', dest='norm', + choices=('len', 'mut', 'none'), default=default_norm, + help='''Specifies how to normalize distances. One of none + (do not normalize), len (normalize by length), + or mut (normalize by number of mutations between sequences).''') + parser_bygroup.add_argument('--sym', action='store', dest='sym', + choices=('avg', 'min'), default=default_sym, + help='''Specifies how to combine asymmetric distances. One of avg + (average of A->B and B->A) or min (minimum of A->B and B->A).''') + parser_bygroup.add_argument('--link', action='store', dest='linkage', + choices=('single', 'average', 'complete'), default=default_linkage, + help='''Type of linkage to use for hierarchical clustering.''') + parser_bygroup.add_argument('--sf', action='store', dest='seq_field', + default=default_seq_field, + help='''The name of the field to be used to calculate + distance between records''') + parser_bygroup.set_defaults(feed_func=feedQueue) + parser_bygroup.set_defaults(work_func=processQueue) + parser_bygroup.set_defaults(collect_func=collectQueue) + parser_bygroup.set_defaults(group_func=indexJunctions) + parser_bygroup.set_defaults(clone_func=distanceClones) + + + # Hierarchical clustering cloning method + parser_hclust = subparsers.add_parser('hclust', parents=[parser_parent], + formatter_class=CommonHelpFormatter, + help='Defines clones by specified distance metric on CDR3s and \ + cutting of 
hierarchical clustering tree') +# parser_hclust.add_argument('-f', nargs='+', action='store', dest='fields', default=None, +# help='Fields to use for grouping clones (non VDJ)') + parser_hclust.add_argument('--method', action='store', dest='method', + choices=('chen2010', 'ademokun2011'), default=default_hclust_model, + help='Specifies which cloning method to use for calculating distance \ + between CDR3s, computing linkage, and cutting clusters') + parser_hclust.set_defaults(feed_func=feedQueueClust) + parser_hclust.set_defaults(work_func=processQueueClust) + parser_hclust.set_defaults(collect_func=collectQueueClust) + parser_hclust.set_defaults(cluster_func=hierClust) + + return parser + + +if __name__ == '__main__': + """ + Parses command line arguments and calls main function + """ + # Parse arguments + parser = getArgParser() + args = parser.parse_args() + args_dict = parseCommonArgs(args) + # Convert case of fields + if 'seq_field' in args_dict: + args_dict['seq_field'] = args_dict['seq_field'].upper() + if 'fields' in args_dict and args_dict['fields'] is not None: + args_dict['fields'] = [f.upper() for f in args_dict['fields']] + + # Define clone_args + if args.command == 'bygroup': + args_dict['group_args'] = {'fields': args_dict['fields'], + 'action': args_dict['action'], + 'mode':args_dict['mode']} + args_dict['clone_args'] = {'model': args_dict['model'], + 'distance': args_dict['distance'], + 'norm': args_dict['norm'], + 'sym': args_dict['sym'], + 'linkage': args_dict['linkage'], + 'seq_field': args_dict['seq_field']} + + # TODO: can be cleaned up with abstract model class + args_dict['clone_args']['dist_mat'] = getModelMatrix(args_dict['model']) + + del args_dict['fields'] + del args_dict['action'] + del args_dict['mode'] + del args_dict['model'] + del args_dict['distance'] + del args_dict['norm'] + del args_dict['sym'] + del args_dict['linkage'] + del args_dict['seq_field'] + + # Define clone_args + if args.command == 'hclust': + dist_funcs = 
{'chen2010':distChen2010, 'ademokun2011':distAdemokun2011} + args_dict['clone_func'] = dist_funcs[args_dict['method']] + args_dict['cluster_args'] = {'method': args_dict['method']} + #del args_dict['fields'] + del args_dict['method'] + + # Call defineClones + del args_dict['command'] + del args_dict['db_files'] + for f in args.__dict__['db_files']: + args_dict['db_file'] = f + defineClones(**args_dict) \ No newline at end of file diff -r 000000000000 -r 8a5a2abbb870 change_o/MakeDb.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/change_o/MakeDb.py Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,1025 @@ +#!/usr/bin/env python3 +""" +Create tab-delimited database file to store sequence alignment information +""" +# Info +__author__ = 'Namita Gupta, Jason Anthony Vander Heiden' +from changeo import __version__, __date__ + +# Imports +import csv +import os +import re +import sys +import pandas as pd +import tarfile +import zipfile +from argparse import ArgumentParser +from collections import OrderedDict +from itertools import groupby +from shutil import rmtree +from tempfile import mkdtemp +from textwrap import dedent +from time import time +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.Alphabet import IUPAC + +# Presto and changeo imports +from presto.Defaults import default_out_args +from presto.Annotation import parseAnnotation +from presto.IO import countSeqFile, printLog, printProgress +from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs +from changeo.IO import getDbWriter, countDbFile, getRepo +from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \ + j_allele_regex + +# Default parameters +default_delimiter = ('\t', ',', '-') + + +def gapV(ig_dict, repo_dict): + """ + Insert gaps into V region and update alignment information + + Arguments: + ig_dict : Dictionary of parsed IgBlast output + repo_dict : Dictionary of IMGT gapped germline sequences + + Returns: + dict : Updated with 
SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields + """ + + seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ'] + + # Find gapped germline V segment + vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first') + vkey = (vgene, ) + #TODO: Figure out else case + if vkey in repo_dict: + vgap = repo_dict[vkey] + # Iterate over gaps in the germline segment + gaps = re.finditer(r'\.', vgap) + gapcount = int(ig_dict['V_GERM_START_VDJ'])-1 + for gap in gaps: + i = gap.start() + # Break if gap begins after V region + if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount: + break + # Insert gap into IMGT sequence + seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:] + # Update gap counter + gapcount += 1 + ig_dict['SEQUENCE_IMGT'] = seq_imgt + # Update IMGT positioning information for V + ig_dict['V_GERM_START_IMGT'] = 1 + ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount + + return ig_dict + + +def getIMGTJunc(ig_dict, repo_dict): + """ + Identify junction region by IMGT definition + + Arguments: + ig_dict : Dictionary of parsed IgBlast output + repo_dict : Dictionary of IMGT gapped germline sequences + + Returns: + dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields + """ + # Find germline J segment + jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first') + jkey = (jgene, ) + #TODO: Figure out else case + if jkey in repo_dict: + # Get germline J sequence + jgerm = repo_dict[jkey] + jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1] + # Look for (F|W)GXG aa motif in nt sequence + motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm) + aa_end = len(ig_dict['SEQUENCE_IMGT']) + #TODO: Figure out else case + if motif: + # print('\n', motif.group()) + aa_end = motif.start() - len(jgerm) + 3 + # Add fields to dict + ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end] + ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION']) + + return ig_dict + + +def getRegions(ig_dict): + 
""" + Identify FWR and CDR regions by IMGT definition + + Arguments: + ig_dict : Dictionary of parsed alignment output + + Returns: + dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT, + CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields + """ + try: + seq_len = len(ig_dict['SEQUENCE_IMGT']) + ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)] + except (KeyError, IndexError): + return ig_dict + + try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)] + except (IndexError): return ig_dict + + try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)] + except (IndexError): return ig_dict + + try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)] + except (IndexError): return ig_dict + + try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)] + except (IndexError): return ig_dict + + try: + cdr3_end = 306 + ig_dict['JUNCTION_LENGTH'] + ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end] + ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:] + except (KeyError, IndexError): + return ig_dict + + return ig_dict + + +def getSeqforIgBlast(seq_file): + """ + Fetch input sequences for IgBlast queries + + Arguments: + seq_file = a fasta file of sequences input to IgBlast + + Returns: + a dictionary of {ID:Seq} + """ + + seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna) + + # Create a seq_dict ID translation using IDs truncate up to space or 50 chars + seqs = {} + for seq in seq_dict.values(): + seqs.update({seq.description:str(seq.seq)}) + + return seqs + + +def findLine(handle, query): + """ + Finds line with query string in file + + Arguments: + handle = file handle in which to search for line + query = query string for which to search in file + + Returns: + line from handle in which query string was found + """ + for line in handle: + if(re.match(query, line)): + return line + + +def extractIMGT(imgt_output): + """ + Extract necessary files from 
IMGT results, zipped or unzipped + + Arguments: + imgt_output = zipped file or unzipped folder output by IMGT + + Returns: + sorted list of filenames from which information will be read + """ + #file_ext = os.path.splitext(imgt_output)[1].lower() + imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction') + temp_dir = mkdtemp() + if zipfile.is_zipfile(imgt_output): + # Open zip file + imgt_zip = zipfile.ZipFile(imgt_output, 'r') + # Extract required files + imgt_files = sorted([n for n in imgt_zip.namelist() \ + if os.path.basename(n).startswith(imgt_flags)]) + imgt_zip.extractall(temp_dir, imgt_files) + # Define file list + imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] + elif os.path.isdir(imgt_output): + # Find required files in folder + folder_files = [] + for root, dirs, files in os.walk(imgt_output): + folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files]) + # Define file list + imgt_files = sorted([n for n in folder_files \ + if os.path.basename(n).startswith(imgt_flags)]) + elif tarfile.is_tarfile(imgt_output): + # Open zip file + imgt_tar = tarfile.open(imgt_output, 'r') + # Extract required files + imgt_files = sorted([n for n in imgt_tar.getnames() \ + if os.path.basename(n).startswith(imgt_flags)]) + imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files]) + # Define file list + imgt_files = [os.path.join(temp_dir, f) for f in imgt_files] + else: + sys.exit('ERROR: Unsupported IGMT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.') + + if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files + sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output) + elif len(imgt_files) < len(imgt_flags): + sys.exit('ERROR: Missing necessary file IMGT output %s.' 
% imgt_output) + + return temp_dir, imgt_files + + +# TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks +def readOneIgBlastResult(block): + """ + Parse a single IgBLAST query result + + Arguments: + block = itertools groupby object of single result + + Returns: + None if no results, otherwise list of DataFrames for each result block + """ + results = list() + i = 0 + for match, subblock in groupby(block, lambda l: l=='\n'): + if not match: + # Strip whitespace and comments + sub = [s.strip() for s in subblock if not s.startswith('#')] + + # Continue on empty block + if not sub: continue + else: i += 1 + + # Split by tabs + sub = [s.split('\t') for s in sub] + + # Append list for "V-(D)-J rearrangement summary" (i == 1) + # And "V-(D)-J junction details" (i == 2) + # Otherwise append DataFrame of subblock + if i == 1 or i == 2: + results.append(sub[0]) + else: + df = pd.DataFrame(sub) + if not df.empty: results.append(df) + + return results if results else None + + +# TODO: needs more speeds. pandas is probably to blame. +def readIgBlast(igblast_output, seq_dict, repo_dict, + score_fields=False, region_fields=False): + """ + Reads IgBlast output + + Arguments: + igblast_output = IgBlast output file (format 7) + seq_dict = a dictionary of {ID:Seq} from input fasta file + repo_dict = dictionary of IMGT gapped germline sequences + score_fields = if True parse alignment scores + region_fields = if True add FWR and CDR region fields + + Returns: + a generator of dictionaries containing alignment data + """ + + # Open IgBlast output file + with open(igblast_output) as f: + # Iterate over individual results (separated by # IGBLASTN) + for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)): + block = list(block) + if not k1: + # TODO: move query name extraction into block parser readOneIgBlastResult(). 
+ # Extract sequence ID + query_name = ' '.join(block[0].strip().split(' ')[2:]) + # Initialize db_gen to have ID and input sequence + db_gen = {'SEQUENCE_ID': query_name, + 'SEQUENCE_INPUT': seq_dict[query_name]} + + # Parse further sub-blocks + block_list = readOneIgBlastResult(block) + + # TODO: this is indented pretty far. should be a separate function. or several functions. + # If results exist, parse further to obtain full db_gen + if block_list is not None: + # Parse quality information + db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F' + db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F' + db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F' + if block_list[0][-1] == '-': + db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'], + IUPAC.ambiguous_dna).reverse_complement()) + + # Parse V, D, and J calls + call_str = ' '.join(block_list[0]) + v_call = parseAllele(call_str, v_allele_regex, action='list') + d_call = parseAllele(call_str, d_allele_regex, action='list') + j_call = parseAllele(call_str, j_allele_regex, action='list') + db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None' + db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None' + db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None' + + # Parse junction sequence + # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1])) + # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ']) + + # TODO: IgBLAST does a stupid and doesn't output block #3 sometimes. why? + # TODO: maybe we should fail these. they look craptastic. 
+ #pd.set_option('display.width', 500) + #print query_name, len(block_list), hit_idx + #for i, x in enumerate(block_list): + # print '[%i]' % i + # print x + + # Parse segment start and stop positions + hit_df = block_list[-1] + + # Alignment info block + # 0: segment + # 1: query id + # 2: subject id + # 3: % identity + # 4: alignment length + # 5: mismatches + # 6: gap opens + # 7: gaps + # 8: q. start + # 9: q. end + # 10: s. start + # 11: s. end + # 12: evalue + # 13: bit score + # 14: query seq + # 15: subject seq + # 16: btop + + # If V call exists, parse V alignment information + seq_vdj = '' + if v_call is not None: + v_align = hit_df[hit_df[0] == 'V'].iloc[0] + # Germline positions + db_gen['V_GERM_START_VDJ'] = int(v_align[10]) + db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1 + # Query sequence positions + db_gen['V_SEQ_START'] = int(v_align[8]) + db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1 + + if int(v_align[6]) == 0: + db_gen['INDELS'] = 'F' + else: + db_gen['INDELS'] = 'T' + # Set functional to none so record gets tossed (junction will be wrong) + # db_gen['FUNCTIONAL'] = None + + # V alignment scores + if score_fields: + try: db_gen['V_SCORE'] = float(v_align[13]) + except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' + + try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0 + except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' + + try: db_gen['V_EVALUE'] = float(v_align[12]) + except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None' + + try: db_gen['V_BTOP'] = v_align[16] + except (TypeError, ValueError): db_gen['V_BTOP'] = 'None' + + # Update VDJ sequence, removing insertions + start = 0 + for m in re.finditer(r'-', v_align[15]): + ins = m.start() + seq_vdj += v_align[14][start:ins] + start = ins + 1 + seq_vdj += v_align[14][start:] + + # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. 
+ # If D call exists, parse D alignment information + if d_call is not None: + d_align = hit_df[hit_df[0] == 'D'].iloc[0] + + # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. + # Determine N-region length and amount of J overlap with V or D alignment + overlap = 0 + if v_call is not None: + n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) + if n1_len < 0: + db_gen['N1_LENGTH'] = 0 + overlap = abs(n1_len) + else: + db_gen['N1_LENGTH'] = n1_len + n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1) + n1_end = int(d_align[8])-1 + seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] + + # Query sequence positions + db_gen['D_SEQ_START'] = int(d_align[8]) + overlap + db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0) + + # Germline positions + db_gen['D_GERM_START'] = int(d_align[10]) + overlap + db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0) + + # Update VDJ sequence, removing insertions + start = overlap + for m in re.finditer(r'-', d_align[15]): + ins = m.start() + seq_vdj += d_align[14][start:ins] + start = ins + 1 + seq_vdj += d_align[14][start:] + + # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them. + # If J call exists, parse J alignment information + if j_call is not None: + j_align = hit_df[hit_df[0] == 'J'].iloc[0] + + # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though. 
+ # Determine N-region length and amount of J overlap with V or D alignment + overlap = 0 + if d_call is not None: + n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH']) + if n2_len < 0: + db_gen['N2_LENGTH'] = 0 + overlap = abs(n2_len) + else: + db_gen['N2_LENGTH'] = n2_len + n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1) + n2_end = int(j_align[8])-1 + seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end] + elif v_call is not None: + n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']) + if n1_len < 0: + db_gen['N1_LENGTH'] = 0 + overlap = abs(n1_len) + else: + db_gen['N1_LENGTH'] = n1_len + n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1) + n1_end = int(j_align[8])-1 + seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end] + else: + db_gen['N1_LENGTH'] = 0 + + # Query positions + db_gen['J_SEQ_START'] = int(j_align[8]) + overlap + db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0) + + # Germline positions + db_gen['J_GERM_START'] = int(j_align[10]) + overlap + db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0) + + # J alignment scores + if score_fields: + try: db_gen['J_SCORE'] = float(j_align[13]) + except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' + + try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0 + except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' + + try: db_gen['J_EVALUE'] = float(j_align[12]) + except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None' + + try: db_gen['J_BTOP'] = j_align[16] + except (TypeError, ValueError): db_gen['J_BTOP'] = 'None' + + # Update VDJ sequence, removing insertions + start = overlap + for m in re.finditer(r'-', j_align[15]): + ins = m.start() + seq_vdj += j_align[14][start:ins] + start = ins + 1 + seq_vdj += j_align[14][start:] + + db_gen['SEQUENCE_VDJ'] = seq_vdj + + # Create IMGT-gapped sequence and infer IMGT junction + if v_call is not None: + db_gen = gapV(db_gen, 
repo_dict) + if j_call is not None: + db_gen = getIMGTJunc(db_gen, repo_dict) + + # FWR and CDR regions + if region_fields: getRegions(db_gen) + + yield IgRecord(db_gen) + + +# TODO: should be more readable +def readIMGT(imgt_files, score_fields=False, region_fields=False): + """ + Reads IMGT/HighV-Quest output + + Arguments: + imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6 + score_fields = if True parse alignment scores + region_fields = if True add FWR and CDR region fields + + Returns: + a generator of dictionaries containing alignment data + """ + imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files] + # Create a dictionary for each sequence alignment and yield its generator + for sm, gp, nt, jn in zip(*imgt_iters): + if len(set([sm['Sequence ID'], + gp['Sequence ID'], + nt['Sequence ID'], + jn['Sequence ID']])) != 1: + sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \ + % sm['Sequence ID']) + + db_gen = {'SEQUENCE_ID': sm['Sequence ID'], + 'SEQUENCE_INPUT': sm['Sequence']} + + if 'No results' not in sm['Functionality']: + db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) + + ('unprod' in sm['Functionality'])] + db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) + + ('out-of-frame' in sm['JUNCTION frame'])], + db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) + + ('unprod' in sm['Functionality'])] + db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'], + 'missing' in sm['V-REGION potential ins/del']))) + + ('unprod' in sm['Functionality'])] + db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'], + sm['V-REGION insertions'], + sm['V-REGION deletions']))] + + db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION'] + db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION'] + + db_gen['V_CALL'] = re.sub('\sor\s', 
',', re.sub(',', '', gp['V-GENE and allele'])) + db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele'])) + db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele'])) + + v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0 + db_gen['V_SEQ_START'] = nt['V-REGION start'] + db_gen['V_SEQ_LENGTH'] = v_seq_length + db_gen['V_GERM_START_IMGT'] = 1 + db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0 + + db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"], + jn['N-REGION-nt nb'], + jn['N1-REGION-nt nb'], + jn["P5'D-nt nb"]] if i) + db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length, + jn["P3'V-nt nb"], + jn['N-REGION-nt nb'], + jn['N1-REGION-nt nb'], + jn["P5'D-nt nb"]] if i) + db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) + db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1 + db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0) + db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"], + jn['N2-REGION-nt nb'], + jn["P5'J-nt nb"]] if i) + + db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length, + jn["P3'V-nt nb"], + jn['N-REGION-nt nb'], + jn['N1-REGION-nt nb'], + jn["P5'D-nt nb"], + jn["D-REGION-nt nb"], + jn["P3'D-nt nb"], + jn['N2-REGION-nt nb'], + jn["P5'J-nt nb"]] if i) + db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0 + db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1 + db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0 + + db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0 + db_gen['JUNCTION'] = jn['JUNCTION'] + + # Alignment scores + if score_fields: + try: db_gen['V_SCORE'] = float(sm['V-REGION score']) + except (TypeError, ValueError): db_gen['V_SCORE'] = 'None' + + try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0 + except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None' + + try: db_gen['J_SCORE'] = 
float(sm['J-REGION score']) + except (TypeError, ValueError): db_gen['J_SCORE'] = 'None' + + try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0 + except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None' + + # FWR and CDR regions + if region_fields: getRegions(db_gen) + else: + db_gen['V_CALL'] = 'None' + db_gen['D_CALL'] = 'None' + db_gen['J_CALL'] = 'None' + + yield IgRecord(db_gen) + + +def getIDforIMGT(seq_file): + """ + Create a sequence ID translation using IMGT truncation + + Arguments: + seq_file = a fasta file of sequences input to IMGT + + Returns: + a dictionary of {truncated ID: full seq description} + """ + + # Create a seq_dict ID translation using IDs truncate up to space or 50 chars + ids = {} + for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)): + if len(rec.description) <= 50: + id_key = rec.description + else: + id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50]) + ids.update({id_key:rec.description}) + + return ids + + +def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True, + score_fields=False, region_fields=False, out_args=default_out_args): + """ + Writes tab-delimited database file in output directory + + Arguments: + db_gen = a generator of IgRecord objects containing alignment data + file_prefix = directory and prefix for CLIP tab-delim file + total_count = number of records (for progress bar) + id_dict = a dictionary of {IMGT ID: full seq description} + no_parse = if ID is to be parsed for pRESTO output with default delimiters + score_fields = if True add alignment score fields to output file + region_fields = if True add FWR and CDR region fields to output file + out_args = common output argument dictionary from parseCommonArgs + + Returns: + None + """ + pass_file = "%s_db-pass.tab" % file_prefix + fail_file = "%s_db-fail.tab" % file_prefix + ordered_fields = ['SEQUENCE_ID', + 'SEQUENCE_INPUT', + 'FUNCTIONAL', + 'IN_FRAME', + 'STOP', + 'MUTATED_INVARIANT', + 
'INDELS', + 'V_CALL', + 'D_CALL', + 'J_CALL', + 'SEQUENCE_VDJ', + 'SEQUENCE_IMGT', + 'V_SEQ_START', + 'V_SEQ_LENGTH', + 'V_GERM_START_VDJ', + 'V_GERM_LENGTH_VDJ', + 'V_GERM_START_IMGT', + 'V_GERM_LENGTH_IMGT', + 'N1_LENGTH', + 'D_SEQ_START', + 'D_SEQ_LENGTH', + 'D_GERM_START', + 'D_GERM_LENGTH', + 'N2_LENGTH', + 'J_SEQ_START', + 'J_SEQ_LENGTH', + 'J_GERM_START', + 'J_GERM_LENGTH', + 'JUNCTION_LENGTH', + 'JUNCTION'] + + if score_fields: + ordered_fields.extend(['V_SCORE', + 'V_IDENTITY', + 'V_EVALUE', + 'V_BTOP', + 'J_SCORE', + 'J_IDENTITY', + 'J_EVALUE', + 'J_BTOP']) + + if region_fields: + ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT', + 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT']) + + + # TODO: This is not the best approach. should pass in output fields. + # Initiate passed handle + pass_handle = None + + # Open failed file + if out_args['failed']: + fail_handle = open(fail_file, 'wt') + fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT']) + else: + fail_handle = None + fail_writer = None + + # Initialize counters and file + pass_writer = None + start_time = time() + rec_count = pass_count = fail_count = 0 + for record in db_gen: + #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) + printProgress(rec_count, total_count, 0.05, start_time) + rec_count += 1 + + # Count pass or fail + if (record.v_call == 'None' and record.j_call == 'None') or \ + record.functional is None or \ + not record.seq_vdj or \ + not record.junction: + # print(record.v_call, record.j_call, record.functional, record.junction) + fail_count += 1 + if fail_writer is not None: fail_writer.writerow(record.toDict()) + continue + else: + pass_count += 1 + + # Build sample sequence description + if record.id in id_dict: + record.id = id_dict[record.id] + + # Parse sequence description into new columns + if not no_parse: + record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) + 
record.id = record.annotations['ID'] + del record.annotations['ID'] + + # TODO: This is not the best approach. should pass in output fields. + # If first sequence, use parsed description to create new columns and initialize writer + if pass_writer is None: + if not no_parse: ordered_fields.extend(list(record.annotations.keys())) + pass_handle = open(pass_file, 'wt') + pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields) + + # Write row to tab-delim CLIP file + pass_writer.writerow(record.toDict()) + + # Print log + #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time) + printProgress(rec_count, total_count, 0.05, start_time) + + log = OrderedDict() + log['OUTPUT'] = pass_file + log['PASS'] = pass_count + log['FAIL'] = fail_count + log['END'] = 'MakeDb' + printLog(log) + + if pass_handle is not None: pass_handle.close() + if fail_handle is not None: fail_handle.close() + + +# TODO: may be able to merge with parseIMGT +def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False, + region_fields=False, out_args=default_out_args): + """ + Main for IgBlast aligned sample sequences + + Arguments: + igblast_output = IgBlast output file to process + seq_file = fasta file input to IgBlast (from which to get sequence) + repo = folder with germline repertoire files + no_parse = if ID is to be parsed for pRESTO output with default delimiters + score_fields = if True add alignment score fields to output file + region_fields = if True add FWR and CDR region fields to output file + out_args = common output argument dictionary from parseCommonArgs + + Returns: + None + """ + # Print parameter info + log = OrderedDict() + log['START'] = 'MakeDB' + log['ALIGNER'] = 'IgBlast' + log['ALIGN_RESULTS'] = os.path.basename(igblast_output) + log['SEQ_FILE'] = os.path.basename(seq_file) + log['NO_PARSE'] = no_parse + log['SCORE_FIELDS'] = score_fields + log['REGION_FIELDS'] = region_fields + printLog(log) + + # Get input 
sequence dictionary
+    seq_dict = getSeqforIgBlast(seq_file)
+
+    # Formalize out_dir and file-prefix
+    if not out_args['out_dir']:
+        out_dir = os.path.split(igblast_output)[0]
+    else:
+        out_dir = os.path.abspath(out_args['out_dir'])
+        if not os.path.exists(out_dir):  os.mkdir(out_dir)
+    if out_args['out_name']:
+        file_prefix = out_args['out_name']
+    else:
+        file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
+    file_prefix = os.path.join(out_dir, file_prefix)
+
+    total_count = countSeqFile(seq_file)
+
+    # Create
+    repo_dict = getRepo(repo)
+    igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
+                               score_fields=score_fields, region_fields=region_fields)
+    writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
+            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+
+
+# TODO:  may be able to merge with parseIgBlast
+def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
+              region_fields=False, out_args=default_out_args):
+    """
+    Main for IMGT aligned sample sequences
+
+    Arguments:
+    imgt_output = zipped file or unzipped folder output by IMGT
+    seq_file = FASTA file input to IMGT (from which to get seqID)
+    no_parse = if ID is to be parsed for pRESTO output with default delimiters
+    score_fields = if True add alignment score fields to output file
+    region_fields = if True add FWR and CDR region fields to output file
+    out_args = common output argument dictionary from parseCommonArgs
+
+    Returns:
+    None
+    """
+    # Print parameter info
+    log = OrderedDict()
+    log['START'] = 'MakeDb'
+    log['ALIGNER'] = 'IMGT'
+    log['ALIGN_RESULTS'] = imgt_output
+    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
+    log['NO_PARSE'] = no_parse
+    log['SCORE_FIELDS'] = score_fields
+    log['REGION_FIELDS'] = region_fields
+    printLog(log)
+
+    # Get individual IMGT result files
+    temp_dir, imgt_files = extractIMGT(imgt_output)
+    try:  # all remaining work runs under try so temp_dir is removed even on failure
+        # Formalize out_dir and file-prefix
+        if not out_args['out_dir']:
+            out_dir = os.path.dirname(os.path.abspath(imgt_output))
+        else:
+            out_dir = os.path.abspath(out_args['out_dir'])
+            if not os.path.exists(out_dir):  os.mkdir(out_dir)
+        if out_args['out_name']:
+            file_prefix = out_args['out_name']
+        else:
+            file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0]
+        file_prefix = os.path.join(out_dir, file_prefix)
+
+        total_count = countDbFile(imgt_files[0])
+
+        # Get (parsed) IDs from fasta file submitted to IMGT
+        id_dict = getIDforIMGT(seq_file) if seq_file else {}
+
+        # Create
+        imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
+                             region_fields=region_fields)
+        writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
+                score_fields=score_fields, region_fields=region_fields, out_args=out_args)
+    finally:
+        # Delete temp directory (previously skipped whenever a step above raised)
+        rmtree(temp_dir)
+
+
+def getArgParser():
+    """
+    Defines the ArgumentParser
+
+    Arguments:
+    None
+
+    Returns:
+    an ArgumentParser object
+    """
+    fields = dedent(
+             '''
+             output files:
+                 db-pass
+                     database of parsed alignment records.
+                 db-fail
+                     database with records failing alignment.
+
+             output fields:
+                 SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT,
+                 INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT,
+                 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT,
+                 V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH,
+                 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH,
+                 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
+                 JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP,
+                 J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT,
+                 FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
+              ''')
+
+    # Define ArgumentParser
+    parser = ArgumentParser(description=__doc__, epilog=fields,
+                            formatter_class=CommonHelpFormatter)
+    parser.add_argument('--version', action='version',
+                        version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
+    subparsers = parser.add_subparsers(title='subcommands', dest='command',
+                                       help='Aligner used', metavar='')
+    # TODO:  This is a temporary fix for Python issue 9253
+    subparsers.required = True
+
+    # Parent parser
+    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
+
+    # IgBlast Aligner
+    parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output',
+                                           parents=[parser_parent],
+                                           formatter_class=CommonHelpFormatter)
+    parser_igblast.set_defaults(func=parseIgBlast)
+    parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+                                required=True,
+                                help='''IgBLAST output files in format 7 with query sequence
+                                     (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
+    parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
+                                help='''List of folders and/or fasta files containing
+                                     IMGT-gapped germline sequences corresponding to the
+                                     set of germlines used in the IgBLAST alignment.''')
+    parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
+                                required=True,
+                                help='List of input FASTA files containing sequences')
+    parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
+                                help='''Specify if input IDs should not be parsed to add
+                                     new columns to database.''')
+    parser_igblast.add_argument('--scores', action='store_true', dest='score_fields',
+                                help='''Specify if alignment score metrics should be
+                                     included in the output. Adds the V_SCORE, V_IDENTITY,
+                                     V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
+                                     J_BTOP, and J_EVALUE columns.''')
+    parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
+                                help='''Specify if IMGT framework and CDR regions should be
+                                     included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                     FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                     CDR3_IMGT columns.''')
+
+    # IMGT aligner
+    parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output',
+                                        parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter)
+    imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True)
+    imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+                                help='''Either zipped IMGT output files (.zip) or a folder
+                                     containing unzipped IMGT output files (which must
+                                     include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
+                                     and 6_Junction).''')
+    parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
+                             required=False,
+                             help='List of input FASTA files containing sequences')
+    parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
+                             help='''Specify if input IDs should not be parsed to add new
+                                  columns to database.''')
+    parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
+                             help='''Specify if alignment score metrics should be
+                                  included in the output. Adds the V_SCORE, V_IDENTITY,
+                                  J_SCORE and J_IDENTITY. Note, this will also add
+                                  the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
+                                  but they will be empty for IMGT output.''')
+    parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
+                             help='''Specify if IMGT framework and CDR regions should be
+                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                  CDR3_IMGT columns.''')
+    parser_imgt.set_defaults(func=parseIMGT)
+
+    return parser
+
+
+if __name__ == "__main__":
+    """
+    Parses command line arguments and calls main
+    """
+    parser = getArgParser()
+    args = parser.parse_args()
+    args_dict = parseCommonArgs(args, in_arg='aligner_files')
+
+    # Set no ID parsing if sequence files are not provided
+    if 'seq_files' in args_dict and not args_dict['seq_files']:
+        args_dict['no_parse'] = True
+
+    # Delete the parser-only keys so args_dict can be passed as keyword arguments
+    if 'seq_files' in args_dict: del args_dict['seq_files']
+    if 'aligner_files' in args_dict: del args_dict['aligner_files']
+    if 'command' in args_dict: del args_dict['command']
+    if 'func' in args_dict: del args_dict['func']
+
+    if args.command == 'imgt':
+        for i in range(len(args.__dict__['aligner_files'])):
+            args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
+            args_dict['seq_file'] = args.__dict__['seq_files'][i] \
+                                    if args.__dict__['seq_files'] else None
+            args.func(**args_dict)
+    elif args.command == 'igblast':
+        for i in range(len(args.__dict__['aligner_files'])):
+            args_dict['igblast_output'] = args.__dict__['aligner_files'][i]
+            args_dict['seq_file'] = args.__dict__['seq_files'][i]
+            args.func(**args_dict)
diff -r 000000000000 -r 8a5a2abbb870 change_o/define_clones.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/define_clones.r Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,15 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+input=args[1]
+output=args[2]
+
+change.o = read.table(input, header=T, sep="\t", quote="", stringsAsFactors=F)
+
+freq = data.frame(table(change.o$CLONE))
+freq2 = data.frame(table(freq$Freq))
+
+freq2$final = as.numeric(freq2$Freq) * as.numeric(as.character(freq2$Var1))
+
+names(freq2) = c("Clone size", "Nr of clones", "Nr of sequences")
+
+write.table(x=freq2, file=output, sep="\t",quote=F,row.names=F,col.names=T)
diff -r 000000000000 -r 8a5a2abbb870 change_o/define_clones.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/define_clones.sh Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,43 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+#define_clones.sh bygroup $input $mode $act $model $norm $sym $link $dist $output $output2 | <other> $input $method $output $output2
+
+type=$1
+input=$2
+
+mkdir -p "$PWD/outdir"
+
+cp "$input" "$PWD/input.tab" #file has to have a ".tab" extension
+
+if [ "bygroup" == "$type" ] ; then
+    mode=$3
+    act=$4
+    model=$5
+    norm=$6
+    sym=$7
+    link=$8
+    dist=$9
+    output=${10}
+    output2=${11}
+
+    python3 "$dir/DefineClones.py" bygroup -d "$PWD/input.tab" --nproc 4 --outdir "$PWD/outdir" --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+    #/data/users/david/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+    #/home/galaxy/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+
+    Rscript "$dir/define_clones.r" "$PWD/outdir/output_clone-pass.tab" "$output2" 2>&1
+else
+    method=$3
+    output=$4
+    output2=$5
+
+    python3 "$dir/DefineClones.py" hclust -d "$PWD/input.tab" --nproc 4 --outdir "$PWD/outdir" --outname output --method $method
+    #/data/users/david/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+    #/home/galaxy/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+
+    Rscript "$dir/define_clones.r" "$PWD/outdir/output_clone-pass.tab" "$output2" 2>&1
+fi
+
+cp "$PWD/outdir/output_clone-pass.tab" "$output"
+
+rm -rf "$PWD/outdir/"
diff -r 000000000000 -r 8a5a2abbb870 change_o/makedb.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/makedb.sh Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,38 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+input=$1
+noparse=$2
+scores=$3
+regions=$4
+output=$5
+
+if [ "true" == "$noparse" ] ; then
+    noparse="--noparse"
+else
+    noparse=""
+fi
+
+if [ "true" == "$scores" ] ; then
+    scores="--scores"
+else
+    scores=""
+fi
+
+if [ "true" == "$regions" ] ; then
+    regions="--regions"
+else
+    regions=""
+fi
+
+mkdir -p "$PWD/outdir"
+
+echo "makedb: $PWD/outdir"
+
+python3 "$dir/MakeDb.py" imgt -i "$input" --outdir "$PWD/outdir" --outname output $noparse $scores $regions # flags stay unquoted: an empty flag must expand to no argument
+#/data/users/david/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+#/home/galaxy/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+
+mv "$PWD/outdir/output_db-pass.tab" "$output"
+
+rm -rf "$PWD/outdir/"
diff -r 000000000000 -r 8a5a2abbb870 datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff -r 000000000000 -r 8a5a2abbb870 gene_identification.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_identification.py Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,220 @@
+import re
+import argparse
+import time
+starttime= int(time.time() * 1000)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file")
+parser.add_argument("--output", help="The annotated output file to be merged back with the summary file")
+
+args = parser.parse_args()
+
+infile = args.input
+#infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt"
+output = args.output
+#outfile = "identified.txt"
+
+dic = dict()
+total = 0
+
+
+first = True
+IDIndex = 0
+seqIndex = 0
+
+with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence
+    for line in f:
+        total += 1
+        linesplt = line.split("\t")
+        if first: #header row: look up the ID and sequence column positions by name
+            print "linesplt", linesplt
+            IDIndex = linesplt.index("Sequence ID")
+            seqIndex = linesplt.index("Sequence")
+            first = False
+            continue
+
+        ID = linesplt[IDIndex]
+        if len(linesplt) < 28: #weird rows without a sequence
+            dic[ID] = ""
+        else:
+            dic[ID] = linesplt[seqIndex]
+
+print "Number of input sequences:", len(dic)
+
+#old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc
+#old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag
+
+#lambda/kappa reference sequence
+searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",
+                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",
+                 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence
+
+compiledregex = {"ca": [],
+                 "cg": [],
+                 "cm": []}
+
+#lambda/kappa reference sequence variable nucleotides
+ca1 = {38: 't', 39: 'g', 48: 'a', 49: 'g', 51: 'c', 68: 'a', 73: 'c'}
+ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'}
+cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
+cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'}
+cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'}
+cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'}
+
+#remove last snp for shorter cg sequence --- note, also change varsInCG
+del cg1[132]
+del cg2[132]
+del cg3[132]
+del cg4[132]
+
+#reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap
+chunklength = 8
+
+#create the chunks of the reference sequence with regular expressions for the variable nucleotides
+for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength / 2):
+    pos = i
+    chunk = searchstrings["ca"][i:i+chunklength]
+    result = ""
+    varsInResult = 0
+    for c in chunk:
+        if pos in ca1.keys(): #variable position: accept either subclass nucleotide
+            varsInResult += 1
+            result += "[" + ca1[pos] + ca2[pos] + "]"
+        else:
+            result += c
+        pos += 1
+    compiledregex["ca"].append((re.compile(result), varsInResult))
+
+for i in range(0, len(searchstrings["cg"]) - chunklength, chunklength / 2):
+    pos = i
+    chunk = searchstrings["cg"][i:i+chunklength]
+    result = ""
+    varsInResult = 0
+    for c in chunk:
+        if pos in cg1.keys(): #variable position: accept any of the four subclass nucleotides
+            varsInResult += 1
+            result += "[" + "".join(set([cg1[pos], cg2[pos], cg3[pos], cg4[pos]])) + "]"
+        else:
+            result += c
+        pos += 1
+    compiledregex["cg"].append((re.compile(result), varsInResult))
+
+for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2):
+    compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False))
+
+
+
+def removeAndReturnMaxIndex(x): #returns the index of the maximum and zeroes it in place; simplifies a list comprehension
+    m = max(x)
+    index = x.index(m)
+    x[index] = 0
+    return index
+
+
+start_location = dict()
+hits = dict()
+alltotal = 0
+for key in compiledregex.keys(): #for ca/cg/cm
+    regularexpressions = compiledregex[key] #get the compiled regular expressions
+    for ID in dic.keys()[0:]: #for every ID
+        if ID not in hits.keys(): #ensure that the dictionary that keeps track of the hits for every gene exists
+            hits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0}
+        currentIDHits = hits[ID]
+        seq = dic[ID]
+        lastindex = 0
+        start_zero = len(searchstrings[key]) #allows the reference sequence to start before search sequence (start_locations of < 0)
+        start = [0] * (len(seq) + start_zero)
+        for i, regexp in enumerate(regularexpressions): #for every regular expression
+            relativeStartLocation = lastindex - (chunklength / 2) * i
+            if relativeStartLocation >= len(seq):
+                break
+            regex, hasVar = regexp
+            matches = regex.finditer(seq[lastindex:])
+            for match in matches: #for every match with the current regex, only uses the first hit
+                lastindex += match.start()
+                start[relativeStartLocation + start_zero] += 1
+                if hasVar: #if the regex has a variable nt in it
+                    chunkstart = chunklength / 2 * i #where in the reference does this chunk start
+                    chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end
+                    if key == "ca": #just calculate the variable nt score for 'ca', cheaper
+                        currentIDHits["ca1"] += len([1 for x in ca1 if chunkstart <= x < chunkend and ca1[x] == seq[lastindex + x - chunkstart]])
+                        currentIDHits["ca2"] += len([1 for x in ca2 if chunkstart <= x < chunkend and ca2[x] == seq[lastindex + x - chunkstart]])
+                    elif key == "cg": #just calculate the variable nt score for 'cg', cheaper
+                        currentIDHits["cg1"] += len([1 for x in cg1 if chunkstart <= x < chunkend and cg1[x] == seq[lastindex + x - chunkstart]])
+                        currentIDHits["cg2"] += len([1 for x in cg2 if chunkstart <= x < chunkend and cg2[x] == seq[lastindex + x - chunkstart]])
+                        currentIDHits["cg3"] += len([1 for x in cg3 if chunkstart <= x < chunkend and cg3[x] == seq[lastindex + x - chunkstart]])
+                        currentIDHits["cg4"] += len([1 for x in cg4 if chunkstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]])
+                    else: #key == "cm" #no variable regions in 'cm'
+                        pass
+                break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped
+            else: #only runs if there were no hits
+                continue
+            #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i]
+            currentIDHits[key + "_hits"] += 1
+        start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1 - start_zero) for x in range(5) if len(start) > 0 and max(start) > 1])
+        #start_location[ID + "_" + key] = str(start.index(max(start)))
+
+
+chunksInCA = len(compiledregex["ca"])
+chunksInCG = len(compiledregex["cg"])
+chunksInCM = len(compiledregex["cm"])
+requiredChunkPercentage = 0.7
+varsInCA = float(len(ca1.keys()) * 2)
+varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice
+varsInCM = 0
+
+
+
+first = True
+seq_write_count=0
+with open(infile, 'r') as f: #second pass over the summary file: write the best matching gene for every sequence
+    with open(output, 'w') as o:
+        for line in f:
+            total += 1
+            if first:
+                o.write("Sequence ID\tbest_match\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+                first = False
+                continue
+            linesplt = line.split("\t")
+            if linesplt[2] == "No results":
+                pass
+            ID = linesplt[IDIndex] #same ID column as the first pass (was hard-coded to 1)
+            currentIDHits = hits[ID]
+            possibleca = float(len(compiledregex["ca"]))
+            possiblecg = float(len(compiledregex["cg"]))
+            possiblecm = float(len(compiledregex["cm"]))
+            cahits = currentIDHits["ca_hits"]
+            cghits = currentIDHits["cg_hits"]
+            cmhits = currentIDHits["cm_hits"]
+            if cahits >= cghits and cahits >= cmhits: #its a ca gene
+                ca1hits = currentIDHits["ca1"]
+                ca2hits = currentIDHits["ca2"]
+                if ca1hits >= ca2hits:
+                    o.write(ID + "\tca1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+                else:
+                    o.write(ID + "\tca2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+            elif cghits >= cahits and cghits >= cmhits: #its a cg gene
+                cg1hits = currentIDHits["cg1"]
+                cg2hits = currentIDHits["cg2"]
+                cg3hits = currentIDHits["cg3"]
+                cg4hits = currentIDHits["cg4"]
+                if cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene
+                    o.write(ID + "\tcg1\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+                elif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene
+                    o.write(ID + "\tcg2\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+                elif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene
+                    o.write(ID + "\tcg3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+                else: #cg4 gene
+                    o.write(ID + "\tcg4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+            else: #its a cm gene, so report the cm start locations (previously the cg ones were written)
+                o.write(ID + "\tcm\t100\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n")
+            seq_write_count += 1
+
+print "Time: %i" % (int(time.time() * 1000) - starttime)
+
+print "Number of sequences written to file:", seq_write_count
+
+
+
+
+
diff -r 000000000000 -r 8a5a2abbb870 imgt_loader.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/imgt_loader.r Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,82 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+summ.file = args[1]
+aa.file = args[2]
+junction.file = args[3]
+out.file = args[4]
+
+summ = read.table(summ.file, sep="\t", header=T, quote="", fill=T)
+aa = read.table(aa.file, sep="\t", header=T, quote="", fill=T)
+junction = read.table(junction.file, sep="\t", header=T, quote="", fill=T)
+
+old_summary_columns=c('Sequence.ID','JUNCTION.frame','V.GENE.and.allele','D.GENE.and.allele','J.GENE.and.allele','CDR1.IMGT.length','CDR2.IMGT.length','CDR3.IMGT.length','Orientation')
+old_sequence_columns=c('CDR1.IMGT','CDR2.IMGT','CDR3.IMGT')
+old_junction_columns=c('JUNCTION')
+
+added_summary_columns=c('Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence')
+added_sequence_columns=c('FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT')
+
+added_junction_columns=c('P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION')
+added_junction_columns=c(added_junction_columns, 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D3.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb') # P3.D3 was a duplicated P3.D2
+
+out=summ[,c("Sequence.ID","JUNCTION.frame","V.GENE.and.allele","D.GENE.and.allele","J.GENE.and.allele")]
+
+out[,"CDR1.Seq"] = aa[,"CDR1.IMGT"]
+out[,"CDR1.Length"] = summ[,"CDR1.IMGT.length"]
+
+out[,"CDR2.Seq"] = aa[,"CDR2.IMGT"]
+out[,"CDR2.Length"] = summ[,"CDR2.IMGT.length"]
+
+out[,"CDR3.Seq"] = aa[,"CDR3.IMGT"]
+out[,"CDR3.Length"] = summ[,"CDR3.IMGT.length"]
+
+out[,"CDR3.Seq.DNA"] = junction[,"JUNCTION"]
+out[,"CDR3.Length.DNA"] = nchar(as.character(junction[,"JUNCTION"]))
+out[,"Strand"] = summ[,"Orientation"]
+out[,"CDR3.Found.How"] = "a"
+
+out[,added_summary_columns] = summ[,added_summary_columns]
+
+out[,added_sequence_columns] = aa[,added_sequence_columns]
+
+out[,added_junction_columns] = junction[,added_junction_columns]
+
+out[,"Top V Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"V.GENE.and.allele"])) # strip the allele suffix, then everything up to the last space
+out[,"Top D Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"D.GENE.and.allele"]))
+out[,"Top J Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"J.GENE.and.allele"]))
+
+out = out[,c('Sequence.ID','JUNCTION.frame','Top V Gene','Top D Gene','Top J Gene','CDR1.Seq','CDR1.Length','CDR2.Seq','CDR2.Length','CDR3.Seq','CDR3.Length','CDR3.Seq.DNA','CDR3.Length.DNA','Strand','CDR3.Found.How','Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence','FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT','P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D3.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')]
+
+names(out) = c('ID','VDJ Frame','Top V Gene','Top D Gene','Top J Gene','CDR1 Seq','CDR1 Length','CDR2 Seq','CDR2 Length','CDR3 Seq','CDR3 Length','CDR3 Seq DNA','CDR3 Length DNA','Strand','CDR3 Found How','Functionality','V-REGION identity %','V-REGION identity nt','D-REGION reading frame','AA JUNCTION','Functionality comment','Sequence','FR1-IMGT','FR2-IMGT','FR3-IMGT','CDR3-IMGT','JUNCTION','J-REGION','FR4-IMGT','P3V-nt nb','N-REGION-nt nb','N1-REGION-nt nb','P5D-nt nb','P3D-nt nb','N2-REGION-nt nb','P5J-nt nb','3V-REGION trimmed-nt nb','5D-REGION trimmed-nt nb','3D-REGION trimmed-nt nb','5J-REGION trimmed-nt nb','N-REGION','N1-REGION','N2-REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D3.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')
+
+out[,"VDJ Frame"] = as.character(out[,"VDJ Frame"])
+
+fltr = out[,"VDJ Frame"] == "in-frame"
+if(any(fltr)){
+    out[fltr, "VDJ Frame"] = "In-frame"
+}
+
+fltr = out[,"VDJ Frame"] == "null"
+if(any(fltr)){
+    out[fltr, "VDJ Frame"] = "Out-of-frame"
+}
+
+fltr = out[,"VDJ Frame"] == "out-of-frame"
+if(any(fltr)){
+    out[fltr, "VDJ Frame"] = "Out-of-frame"
+}
+
+fltr = out[,"VDJ Frame"] == ""
+if(any(fltr)){
+    out[fltr, "VDJ Frame"] = "Out-of-frame"
+}
+
+for(col in c('Top V Gene','Top D Gene','Top J Gene')){
+    out[,col] = as.character(out[,col])
+    fltr = out[,col] == ""
+    if(any(fltr)){
+        out[fltr,col] = "NA" # the literal string "NA", not a missing value
+    }
+}
+
+write.table(out, out.file, sep="\t", quote=F, row.names=F, col.names=T)
diff -r 000000000000 -r 8a5a2abbb870 merge.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/merge.r Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,27 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+input.1 = args[1]
+input.2 = args[2]
+
+fields.1 = args[3] # comma separated column names to keep from input.1, or "all"
+fields.2 = args[4] # comma separated column names to keep from input.2, or "all"
+
+field.1 = args[5] # join column in input.1
+field.2 = args[6] # join column in input.2
+
+output = args[7]
+
+dat1 = read.table(input.1, header=T, sep="\t", quote="", stringsAsFactors=F, fill=T, row.names=NULL)
+if(fields.1 != "all"){
+    fields.1 = unlist(strsplit(fields.1, ","))
+    dat1 = dat1[,fields.1, drop=FALSE] # drop=FALSE keeps a data frame when only one field is selected
+}
+dat2 = read.table(input.2, header=T, sep="\t", quote="", stringsAsFactors=F, fill=T, row.names=NULL)
+if(fields.2 != "all"){
+    fields.2 = unlist(strsplit(fields.2, ","))
+    dat2 = dat2[,fields.2, drop=FALSE] # drop=FALSE keeps a data frame when only one field is selected
+}
+
+dat3 = merge(dat1, dat2, by.x=field.1, by.y=field.2)
+
+write.table(dat3, output, sep="\t",quote=F,row.names=F,col.names=T)
diff -r 000000000000 -r 8a5a2abbb870 merge_and_filter.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_and_filter.r Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,209 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+
+summaryfile = args[1]
+sequencesfile = args[2]
+mutationanalysisfile = args[3]
+mutationstatsfile = args[4]
+hotspotsfile = args[5]
+gene_identification_file= args[6] +output = args[7] +before.unique.file = args[8] +unmatchedfile = args[9] +method=args[10] +functionality=args[11] +unique.type=args[12] +filter.unique=args[13] +class.filter=args[14] +empty.region.filter=args[15] + +summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") +sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") +mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") +mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") +hotspots = read.table(hotspotsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") +gene_identification = read.table(gene_identification_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") + +if(method == "blastn"){ + "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" + gene_identification = gene_identification[!duplicated(gene_identification$qseqid),] + ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52)) + gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T) + gene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100 + gene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")] + colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match") + +} + +input.sequence.count = nrow(summ) +print(paste("Number of sequences in summary file:", input.sequence.count)) + +filtering.steps = data.frame(character(0), numeric(0)) + +filtering.steps = rbind(filtering.steps, c("Input", input.sequence.count)) + +filtering.steps[,1] = as.character(filtering.steps[,1]) +filtering.steps[,2] = 
as.character(filtering.steps[,2]) +#filtering.steps[,3] = as.numeric(filtering.steps[,3]) + +summ = merge(summ, gene_identification, by="Sequence.ID") + +summ = summ[summ$Functionality != "No results",] + +print(paste("Number of sequences after 'No results' filter:", nrow(summ))) + +filtering.steps = rbind(filtering.steps, c("After 'No results' filter", nrow(summ))) + +if(functionality == "productive"){ + summ = summ[summ$Functionality == "productive (see comment)" | summ$Functionality == "productive",] +} else if (functionality == "unproductive"){ + summ = summ[summ$Functionality == "unproductive (see comment)" | summ$Functionality == "unproductive",] +} else if (functionality == "remove_unknown"){ + summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",] +} + +print(paste("Number of sequences after productive filter:", nrow(summ))) + +filtering.steps = rbind(filtering.steps, c("After productive filter", nrow(summ))) + +splt = strsplit(class.filter, "_")[[1]] +chunk_hit_threshold = as.numeric(splt[1]) +nt_hit_threshold = as.numeric(splt[2]) + +higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold) + +unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] + +if(!all(higher_than, na.rm=T)){ #check for 'not all' because that would mean the unmatched set is empty + unmatched = summ[!higher_than,] + unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] + unmatched$best_match = paste("unmatched,", unmatched$best_match) + summ[!higher_than,"best_match"] = paste("unmatched,", summ[!higher_than,"best_match"]) +} + +if(any(higher_than, na.rm=T)){ + #summ = summ[higher_than,] +} + +if(nrow(summ) == 0){ + stop("No data remaining after filter") +} + +result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% 
names(summ)[-1])], by="Sequence.ID")
+# Each merge below only pulls in columns `result` does not already have (Sequence.ID is the join key).
+print(paste("Number of sequences after merging with mutation analysis file:", nrow(result)))
+
+result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID")
+
+print(paste("Number of sequences after merging with mutation stats file:", nrow(result)))
+
+result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID")
+
+print(paste("Number of sequences after merging with hotspots file:", nrow(result)))
+# Attach the per-region nucleotide sequences under *.seq names; all.x=T keeps rows without a sequence (as NA).
+sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")]
+names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq")
+result = merge(result, sequences, by="Sequence.ID", all.x=T)
+
+print(paste("Number of sequences in result after merging with sequences:", nrow(result)))
+# Gene names without the "Homsap " prefix and without the allele suffix ("*..." onwards).
+result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele)
+result$VGene = gsub("[*].*", "", result$VGene)
+result$DGene = gsub("^Homsap ", "", result$D.GENE.and.allele)
+result$DGene = gsub("[*].*", "", result$DGene)
+result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
+result$JGene = gsub("[*].*", "", result$JGene)
+# Duplicate filter: rows are keyed on the comma-separated columns named in unique.type, joined with ":".
+result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))
+
+result = result[!(duplicated(result$past)), ]
+
+result = result[,!(names(result) %in% c("past"))]
+
+print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result)))
+
+filtering.steps = rbind(filtering.steps, c("After duplicate filter", nrow(result)))
+# na.rm: sequences missing after the left join are NA and must not turn the counts into NA.
+print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T)))
+print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T)))
+# Empty-region filter. which() drops rows whose condition is NA; a bare logical index would emit all-NA rows.
+if(empty.region.filter == "FR1"){
+  result = result[which(result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != ""), ]
+  print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result)))
+  filtering.steps = rbind(filtering.steps, c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result)))
+} else if(empty.region.filter == "CDR1"){
+  result = result[which(result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != ""), ]
+  print(paste("Number of sequences after empty FR2, CDR2 and FR3 column filter:", nrow(result)))
+  filtering.steps = rbind(filtering.steps, c("After empty FR2, CDR2, FR3 filter", nrow(result)))
+} else if(empty.region.filter == "FR2"){
+  result = result[which(result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != ""), ]
+  print(paste("Number of sequences after empty CDR2 and FR3 column filter:", nrow(result)))
+  filtering.steps = rbind(filtering.steps, c("After empty CDR2, FR3 filter", nrow(result)))
+}
+# N filter: drop every sequence that has an "n"/"N" in FR2, FR3, CDR1, CDR2 or CDR3.
+result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]
+
+print(paste("Number of sequences in result after n filtering:", nrow(result)))
+filtering.steps = rbind(filtering.steps, c("After N filter", nrow(result)))
+# Mutation-count columns: strip the parenthesised part and convert to numeric.
+cleanup_columns = c("FR1.IMGT.Nb.of.mutations",
+                    "CDR1.IMGT.Nb.of.mutations",
+                    "FR2.IMGT.Nb.of.mutations",
+                    "CDR2.IMGT.Nb.of.mutations",
+                    "FR3.IMGT.Nb.of.mutations")
+
+for(col in cleanup_columns){
+  result[,col] = gsub("\\(.*\\)", "", result[,col])
+  result[,col] = as.numeric(result[,col])
+  result[is.na(result[,col]),col] = 0 # only this column; `[rows,] = 0` wiped the whole row, Sequence.ID included
+}
+
+write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
+
+if(filter.unique != "no"){
+  clmns = names(result)
+
+  if(grepl("_c", filter.unique)){
+    result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, 
result$CDR3.IMGT.seq, result$best_match) + } else { + result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) + } + + #fltr = result$unique.def %in% result.filtered$unique.def + + if(grepl("keep", filter.unique)){ + result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes + result = result[!duplicated(result$unique.def),] + } else { + result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] + result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes + result = result[!duplicated(result$unique.def),] + } + + #result = result[,clmns] + + #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T) +} + +print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result))) +print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),]))) + +filtering.steps = rbind(filtering.steps, c("After unique filter", nrow(result))) + +print(paste("Number of rows in result:", nrow(result))) +print(paste("Number of rows in unmatched:", nrow(unmatched))) + +matched.sequences.count = sum(!grepl("^unmatched", result$best_match)) +unmatched.sequences.count = sum(grepl("^unmatched", result$best_match)) + +filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count)) +filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count)) +filtering.steps[,2] = as.numeric(filtering.steps[,2]) +filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2) + +write.table(x=filtering.steps, file=gsub("unmatched", "filtering_steps", unmatchedfile), sep="\t",quote=F,row.names=F,col.names=F) + +write.table(x=result, file=output, 
sep="\t",quote=F,row.names=F,col.names=T) +write.table(x=unmatched, file=unmatchedfile, sep="\t",quote=F,row.names=F,col.names=T) diff -r 000000000000 -r 8a5a2abbb870 mutation_analysis.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.py Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,290 @@ +from __future__ import division +from collections import defaultdict +import re +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--input", + help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") +parser.add_argument("--genes", help="The genes available in the 'best_match' column") +parser.add_argument("--includefr1", help="Should the mutation/nucleotides in the FR1 region be included?") +parser.add_argument("--output", help="Output file") + +args = parser.parse_args() + +infile = args.input +genes = str(args.genes).split(",") +print "includefr1 =", args.includefr1 +include_fr1 = True if args.includefr1 == "yes" else False +outfile = args.output + +genedic = dict() + +mutationdic = dict() +mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") +NAMatchResult = (None, None, None, None, None, None, '') +linecount = 0 + +IDIndex = 0 +best_matchIndex = 0 +fr1Index = 0 +cdr1Index = 0 +fr2Index = 0 +cdr2Index = 0 +fr3Index = 0 +first = True +IDlist = [] +mutationList = [] +mutationListByID = {} +cdr1LengthDic = {} +cdr2LengthDic = {} + +with open(infile, 'r') as i: + for line in i: + if first: + linesplt = line.split("\t") + IDIndex = linesplt.index("Sequence.ID") + best_matchIndex = linesplt.index("best_match") + fr1Index = linesplt.index("FR1.IMGT") + cdr1Index = linesplt.index("CDR1.IMGT") + fr2Index = linesplt.index("FR2.IMGT") + cdr2Index = linesplt.index("CDR2.IMGT") + fr3Index = linesplt.index("FR3.IMGT") + cdr1LengthIndex = linesplt.index("CDR1.IMGT.length") + cdr2LengthIndex = linesplt.index("CDR2.IMGT.length") + first = 
False
+            continue
+        linecount += 1
+        linesplt = line.split("\t")
+        ID = linesplt[IDIndex]
+        genedic[ID] = linesplt[best_matchIndex]
+        try:
+            if linesplt[fr1Index] != "NA":
+                mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else []
+            else:
+                mutationdic[ID + "_FR1"] = []
+            mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if linesplt[cdr1Index] != "NA" else []
+            mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if linesplt[fr2Index] != "NA" else []
+            mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if linesplt[cdr2Index] != "NA" else []
+            mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]
+            mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []
+        except Exception:  # was `except e:`, which raised NameError instead of reporting the offending row
+            print linesplt
+            print linecount
+            raise  # re-raise the real error; the statements below would otherwise fail on the missing keys
+        mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
+        mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
+        # CDR lengths; "X" is stored as 0
+        cdr1Length = linesplt[cdr1LengthIndex]
+        cdr2Length = linesplt[cdr2LengthIndex]
+
+        cdr1LengthDic[ID] = int(cdr1Length) if cdr1Length != "X" else 0
+        cdr2LengthDic[ID] = int(cdr2Length) if cdr2Length != "X" else 0
+
+        IDlist += [ID]
+# Highest AA mutation position + 1. `or [0]` keeps this working when there are no (non-silent) mutations at all.
+AALength = max([int(m[4]) for m in mutationList if m[4]] or [0]) + 1  # [4] is the position of the AA mutation, None if silent
+if AALength < 60:
+    AALength = 64
+
+AA_mutation = [0] * AALength
+AA_mutation_dic = {"ca": AA_mutation[:], "cg": AA_mutation[:], "cm": AA_mutation[:], "un": AA_mutation[:]}
+AA_mutation_empty = AA_mutation[:]
+# Per-sequence AA mutation counts: list index p is written under header column p (index 0 is skipped).
+aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt"
+with open(aa_mutations_by_id_file, 'w') as o:
+    o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
+    for ID in mutationListByID.keys():
+        AA_mutation_for_ID = AA_mutation_empty[:]
+        for mutation in mutationListByID[ID]:
+            if mutation[4]:
+                AA_mutation_position = int(mutation[4])
+                AA_mutation[AA_mutation_position] += 1
+                AA_mutation_for_ID[AA_mutation_position] += 1
+                clss = genedic[ID][:2]
+                AA_mutation_dic.setdefault(clss, AA_mutation_empty[:])[AA_mutation_position] += 1  # no KeyError for other class prefixes
+        o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n")
+
+# The tables below map a CDR length to the list indices that are absent for that length.
+# These index the (AALength-1)-long lists directly, so index c is written under header column c+1.
+#absent AA stuff
+absentAACDR1Dic = defaultdict(list)
+absentAACDR1Dic[5] = range(29,36)
+absentAACDR1Dic[6] = range(29,35)
+absentAACDR1Dic[7] = range(30,35)
+absentAACDR1Dic[8] = range(30,34)
+absentAACDR1Dic[9] = range(31,34)
+absentAACDR1Dic[10] = range(31,33)
+absentAACDR1Dic[11] = [32]
+
+absentAACDR2Dic = defaultdict(list)
+absentAACDR2Dic[0] = range(55,65)
+absentAACDR2Dic[1] = range(56,65)
+absentAACDR2Dic[2] = range(56,64)
+absentAACDR2Dic[3] = range(57,64)
+absentAACDR2Dic[4] = range(57,63)
+absentAACDR2Dic[5] = range(58,63)
+absentAACDR2Dic[6] = range(58,62)
+absentAACDR2Dic[7] = range(59,62)
+absentAACDR2Dic[8] = range(59,61)
+absentAACDR2Dic[9] = [60]
+# Indices beyond the table width are skipped: with AALength == 64 the lists hold 63 entries but CDR2 indices reach 64.
+absentAA = [len(IDlist)] * (AALength-1)
+for k, cdr1Length in cdr1LengthDic.iteritems():
+    for c in [p for p in absentAACDR1Dic[cdr1Length] if p < len(absentAA)]:
+        absentAA[c] -= 1
+
+for k, cdr2Length in cdr2LengthDic.iteritems():
+    for c in [p for p in absentAACDR2Dic[cdr2Length] if p < len(absentAA)]:
+        absentAA[c] -= 1
+
+# Per-sequence presence table: 1 = position present, 0 = absent for this sequence's CDR1/CDR2 length.
+aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt"
+with open(aa_mutations_by_id_file, 'w') as o:
+    o.write("ID\tcdr1length\tcdr2length\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n")
+    for ID in IDlist:
+        absentAAbyID = [1] * (AALength-1)
+        cdr1Length = cdr1LengthDic[ID]
+        for c in [p for p in absentAACDR1Dic[cdr1Length] if p < len(absentAAbyID)]:
+            absentAAbyID[c] -= 1
+
+        cdr2Length = cdr2LengthDic[ID]
+        for c in [p for p in absentAACDR2Dic[cdr2Length] if p < len(absentAAbyID)]:
+            absentAAbyID[c] -= 1
+        o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n")
+
+if linecount == 0:
+    print "No data, exiting"
+    with open(outfile, 'w') as o:
+        o.write("RGYW (%)," + ("0,0,0\n" * len(genes)))
+        o.write("WRCY (%)," + ("0,0,0\n" * len(genes)))
+        o.write("WA (%)," + ("0,0,0\n" * len(genes)))
+        o.write("TW (%)," + ("0,0,0\n" * len(genes)))
+    import sys
+
+    sys.exit()
+# Second pass over the input: count, per sequence, the mutations that fall inside each hotspot motif.
+hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)")
+RGYWCount = {}
+WRCYCount = {}
+WACount = {}
+TWCount = {}
+
+#IDIndex = 0
+ataIndex = 0
+tatIndex = 0
+aggctatIndex = 0
+atagcctIndex = 0
+first = True
+with open(infile, 'r') as i:
+    for line in i:
+        if first:
+            linesplt = line.split("\t")
+            ataIndex = linesplt.index("X.a.t.a")
+            tatIndex = linesplt.index("t.a.t.")
+            aggctatIndex = linesplt.index("X.a.g.g.c.t..a.t.")
+            atagcctIndex = linesplt.index("X.a.t..a.g.c.c.t.")
+            first = False
+            continue
+        linesplt = line.split("\t")
+        gene = linesplt[best_matchIndex]
+        ID = linesplt[IDIndex]
+        # Each motif column holds "|"-separated "<bases>,<start>-<end>(<region>)" entries, parsed into
+        # (start, end, region) tuples. The hard-coded debug print for ID "ca2" that sat here was dropped.
+        RGYW = [(int(x), int(y), z) for (x, y, z) in
+                [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]
+        WRCY = [(int(x), int(y), z) for (x, y, z) in
+                [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]]
+        WA = [(int(x), int(y), z) for (x, y, z) in
+              [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]
+        TW = [(int(x), int(y), z) for (x, y, z) in
+              [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]
+        RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0
+
+        mutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[
+            ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
+        for mutation in mutationList:
+            frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
+            mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW])
+            mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY])
+            mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA])
+            mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW])
+
+            in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW])
+            # a mutation inside several motifs is split evenly between them
+            if in_how_many_motifs > 0:
+                RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs
+                WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs
+                WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs
+                TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs
+
+# mean(lst): arithmetic mean, 0.0 for an empty list
+def mean(lst):
+    return (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0
+
+# median(lst): middle value (average of the two middle values for an even count), 0 for an empty list
+def median(lst):
+    lst = sorted(lst)
+    l = len(lst)
+    if l == 0:
+        return 0
+    if l == 1:
+        return lst[0]
+
+    l = int(l / 2)
+
+    if len(lst) % 2 == 0:
+        return float(lst[l] + lst[(l - 1)]) / 2.0
+    else:
+        return lst[l]
+
+funcs = {"mean": mean, "median": median, "sum": sum}
+
+directory = outfile[:outfile.rfind("/") + 1]
+value = 0
+valuedic = dict()
+# Denominators: one "<gene>_<fname>_value.txt" per gene and function, plus "all_<fname>_value.txt" for the total.
+for fname in funcs.keys():
+    for gene in genes:
+        with open(directory + gene + "_" + fname + "_value.txt", 'r') as v:
+            valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())
+    with open(directory + "all_" + fname + "_value.txt", 'r') as v:
+        valuedic["total_" + fname] = float(v.readlines()[0].rstrip())
+
+# get_xyz: (rounded f(lst), denominator from valuedic, percentage or "0" when the denominator is 0), all as strings
+def get_xyz(lst, gene, f, fname):
+    x = int(round(f(lst)))
+    y = valuedic[gene + "_" + fname]
+    z = str(round(x / float(y) * 100, 1)) if y != 0 else "0"
+    return (str(x), str(y), z)
+
+dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}
+arr = ["RGYW", "WRCY", "WA", "TW"]
+
+geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}
+# One hotspot_analysis_<fname>.txt per function: a row per motif, "x,y,z" per gene, then the total of matched sequences.
+for fname in funcs.keys():
+    func = funcs[fname]
+    foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"
+    with open(foutfile, 'w') as o:
+        for typ in arr:
+            o.write(typ + " (%)")
+            curr = dic[typ]
+            for gene in genes:
+                geneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop....
+                if valuedic[gene + "_" + fname] == 0:  # was `is 0`: never true, because valuedic holds floats
+                    o.write(",0,0,0")
+                else:
+                    x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)
+                    o.write("," + x + "," + y + "," + z)
+
+            x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)
+            o.write("," + x + "," + y + "," + z + "\n")
+
+
+# for testing
+seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt"
+with open(seq_motif_file, 'w') as o:
+    o.write("ID\tRGYWC\tWRCY\tWA\tTW\n")
+    for ID in IDlist:
+        o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n")
diff -r 000000000000 -r 8a5a2abbb870 mutation_analysis.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mutation_analysis.r Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,477 @@
+library(data.table)
+library(ggplot2)
+library(reshape2)
+
+args <- commandArgs(trailingOnly = TRUE)
+# args: input table, comma-separated genes, output directory, "yes"/other for including FR1
+input = args[1]
+genes = unlist(strsplit(args[2], ","))
+outputdir = args[3]
+include_fr1 = ifelse(args[4] == "yes", T, F)
+setwd(outputdir)
+
+dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
+# No sequences: write placeholder result files and stop.
+if(length(dat$Sequence.ID) == 0){
+  setwd(outputdir)
+  result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))
+  row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")
+  write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)
+  transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))
+  row.names(transitionTable) = c("A", "C", 
"G", "T") + transitionTable["A","A"] = NA + transitionTable["C","C"] = NA + transitionTable["G","G"] = NA + transitionTable["T","T"] = NA + write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) + cat("0", file="n.txt") + stop("No data") +} + +cleanup_columns = c("FR1.IMGT.c.a", + "FR2.IMGT.g.t", + "CDR1.IMGT.Nb.of.nucleotides", + "CDR2.IMGT.t.a", + "FR1.IMGT.c.g", + "CDR1.IMGT.c.t", + "FR2.IMGT.a.c", + "FR2.IMGT.Nb.of.mutations", + "FR2.IMGT.g.c", + "FR2.IMGT.a.g", + "FR3.IMGT.t.a", + "FR3.IMGT.t.c", + "FR2.IMGT.g.a", + "FR3.IMGT.c.g", + "FR1.IMGT.Nb.of.mutations", + "CDR1.IMGT.g.a", + "CDR1.IMGT.t.g", + "CDR1.IMGT.g.c", + "CDR2.IMGT.Nb.of.nucleotides", + "FR2.IMGT.a.t", + "CDR1.IMGT.Nb.of.mutations", + "CDR3.IMGT.Nb.of.nucleotides", + "CDR1.IMGT.a.g", + "FR3.IMGT.a.c", + "FR1.IMGT.g.a", + "FR3.IMGT.a.g", + "FR1.IMGT.a.t", + "CDR2.IMGT.a.g", + "CDR2.IMGT.Nb.of.mutations", + "CDR2.IMGT.g.t", + "CDR2.IMGT.a.c", + "CDR1.IMGT.t.c", + "FR3.IMGT.g.c", + "FR1.IMGT.g.t", + "FR3.IMGT.g.t", + "CDR1.IMGT.a.t", + "FR1.IMGT.a.g", + "FR3.IMGT.a.t", + "FR3.IMGT.Nb.of.nucleotides", + "FR2.IMGT.t.c", + "CDR2.IMGT.g.a", + "FR2.IMGT.t.a", + "CDR1.IMGT.t.a", + "FR2.IMGT.t.g", + "FR3.IMGT.t.g", + "FR2.IMGT.Nb.of.nucleotides", + "FR1.IMGT.t.a", + "FR1.IMGT.t.g", + "FR3.IMGT.c.t", + "FR1.IMGT.t.c", + "CDR2.IMGT.a.t", + "FR2.IMGT.c.t", + "CDR1.IMGT.g.t", + "CDR2.IMGT.t.g", + "FR1.IMGT.Nb.of.nucleotides", + "CDR1.IMGT.c.g", + "CDR2.IMGT.t.c", + "FR3.IMGT.g.a", + "CDR1.IMGT.a.c", + "FR2.IMGT.c.a", + "FR3.IMGT.Nb.of.mutations", + "FR2.IMGT.c.g", + "CDR2.IMGT.g.c", + "FR1.IMGT.g.c", + "CDR2.IMGT.c.t", + "FR3.IMGT.c.a", + "CDR1.IMGT.c.a", + "CDR2.IMGT.c.g", + "CDR2.IMGT.c.a", + "FR1.IMGT.c.t", + "FR1.IMGT.Nb.of.silent.mutations", + "FR2.IMGT.Nb.of.silent.mutations", + "FR3.IMGT.Nb.of.silent.mutations", + "FR1.IMGT.Nb.of.nonsilent.mutations", + "FR2.IMGT.Nb.of.nonsilent.mutations", + "FR3.IMGT.Nb.of.nonsilent.mutations") + + +print("Cleaning up 
columns") +for(col in cleanup_columns){ + dat[,col] = gsub("\\(.*\\)", "", dat[,col]) + #dat[dat[,col] == "",] = "0" + dat[,col] = as.numeric(dat[,col]) + dat[is.na(dat[,col]),col] = 0 +} + +regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3") +if(!include_fr1){ + regions = c("CDR1", "FR2", "CDR2", "FR3") +} + +sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) } + +print("aggregating data into new columns") + +VRegionMutations_columns = paste(regions, ".IMGT.Nb.of.mutations", sep="") +dat$VRegionMutations = apply(dat, FUN=sum_by_row, 1, columns=VRegionMutations_columns) + +VRegionNucleotides_columns = paste(regions, ".IMGT.Nb.of.nucleotides", sep="") +dat$FR3.IMGT.Nb.of.nucleotides = nchar(dat$FR3.IMGT.seq) +dat$VRegionNucleotides = apply(dat, FUN=sum_by_row, 1, columns=VRegionNucleotides_columns) + +transitionMutations_columns = paste(rep(regions, each=4), c(".IMGT.a.g", ".IMGT.g.a", ".IMGT.c.t", ".IMGT.t.c"), sep="") +dat$transitionMutations = apply(dat, FUN=sum_by_row, 1, columns=transitionMutations_columns) + +transversionMutations_columns = paste(rep(regions, each=8), c(".IMGT.a.c",".IMGT.c.a",".IMGT.a.t",".IMGT.t.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t",".IMGT.t.g"), sep="") +dat$transversionMutations = apply(dat, FUN=sum_by_row, 1, columns=transversionMutations_columns) + + +transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="") +dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns) + + +totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.c.g",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.g.a",".IMGT.g.t"), sep="") +#totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.c.g",".IMGT.g.t"), sep="") +dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns) + +transitionMutationsAtAT_columns = paste(rep(regions, each=2), c(".IMGT.a.g",".IMGT.t.c"), sep="") 
+dat$transitionMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtAT_columns) + +totalMutationsAtAT_columns = paste(rep(regions, each=6), c(".IMGT.a.g",".IMGT.a.c",".IMGT.a.t",".IMGT.t.g",".IMGT.t.c",".IMGT.t.a"), sep="") +#totalMutationsAtAT_columns = paste(rep(regions, each=5), c(".IMGT.a.g",".IMGT.t.c",".IMGT.a.c",".IMGT.g.c",".IMGT.t.g"), sep="") +dat$totalMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtAT_columns) + + +FRRegions = regions[grepl("FR", regions)] +CDRRegions = regions[grepl("CDR", regions)] + +FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="") +dat$silentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_silentMutations_columns) + +CDR_silentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.silent.mutations", sep="") +dat$silentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_silentMutations_columns) + +FR_nonSilentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="") +dat$nonSilentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_nonSilentMutations_columns) + +CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="") +dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns) + +mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "transitionMutationsAtAT", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR") + +write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T) + +setwd(outputdir) + +base.order = data.frame(base=c("A", "T", "C", "G"), order=1:4) + +calculate_result = function(i, gene, dat, matrx, f, fname, name){ + tmp = dat[grepl(paste("^", gene, ".*", sep=""), dat$best_match),] + + j = i - 1 + x = (j * 3) + 1 + y = (j * 3) + 2 + z = (j * 3) + 3 + + 
if(nrow(tmp) > 0){ + + if(fname == "sum"){ + matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) + matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) + matrx[1,z] = round(f(matrx[1,x] / matrx[1,y]) * 100, digits=1) + } else { + matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) + matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) + matrx[1,z] = round(f(tmp$VRegionMutations / tmp$VRegionNucleotides) * 100, digits=1) + } + + matrx[2,x] = round(f(tmp$transitionMutations, na.rm=T), digits=1) + matrx[2,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) + matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) + + matrx[3,x] = round(f(tmp$transversionMutations, na.rm=T), digits=1) + matrx[3,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) + matrx[3,z] = round(matrx[3,x] / matrx[3,y] * 100, digits=1) + + matrx[4,x] = round(f(tmp$transitionMutationsAtGC, na.rm=T), digits=1) + matrx[4,y] = round(f(tmp$totalMutationsAtGC, na.rm=T), digits=1) + matrx[4,z] = round(matrx[4,x] / matrx[4,y] * 100, digits=1) + + matrx[5,x] = round(f(tmp$totalMutationsAtGC, na.rm=T), digits=1) + matrx[5,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) + matrx[5,z] = round(matrx[5,x] / matrx[5,y] * 100, digits=1) + + matrx[6,x] = round(f(tmp$transitionMutationsAtAT, na.rm=T), digits=1) + matrx[6,y] = round(f(tmp$totalMutationsAtAT, na.rm=T), digits=1) + matrx[6,z] = round(matrx[6,x] / matrx[6,y] * 100, digits=1) + + matrx[7,x] = round(f(tmp$totalMutationsAtAT, na.rm=T), digits=1) + matrx[7,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) + matrx[7,z] = round(matrx[7,x] / matrx[7,y] * 100, digits=1) + + matrx[8,x] = round(f(tmp$nonSilentMutationsFR, na.rm=T), digits=1) + matrx[8,y] = round(f(tmp$silentMutationsFR, na.rm=T), digits=1) + matrx[8,z] = round(matrx[8,x] / matrx[8,y], digits=1) + + matrx[9,x] = round(f(tmp$nonSilentMutationsCDR, na.rm=T), digits=1) + matrx[9,y] = round(f(tmp$silentMutationsCDR, na.rm=T), digits=1) 
+ matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1) + + if(fname == "sum"){ + matrx[10,x] = round(f(rowSums(tmp[,c("FR2.IMGT.Nb.of.nucleotides", "FR3.IMGT.Nb.of.nucleotides")], na.rm=T)), digits=1) + matrx[10,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) + matrx[10,z] = round(matrx[10,x] / matrx[10,y] * 100, digits=1) + + matrx[11,x] = round(f(rowSums(tmp[,c("CDR1.IMGT.Nb.of.nucleotides", "CDR2.IMGT.Nb.of.nucleotides")], na.rm=T)), digits=1) + matrx[11,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) + matrx[11,z] = round(matrx[11,x] / matrx[11,y] * 100, digits=1) + } + } + + transitionTable = data.frame(A=zeros,C=zeros,G=zeros,T=zeros) + row.names(transitionTable) = c("A", "C", "G", "T") + transitionTable["A","A"] = NA + transitionTable["C","C"] = NA + transitionTable["G","G"] = NA + transitionTable["T","T"] = NA + + if(nrow(tmp) > 0){ + for(nt1 in nts){ + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + if(include_fr1){ + transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)]) + } else { + transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)]) + } + } + } + transition = transitionTable + transition$id = names(transition) + + transition2 = melt(transition, id.vars="id") + + transition2 = merge(transition2, base.order, by.x="id", by.y="base") + transition2 = merge(transition2, base.order, by.x="variable", by.y="base") + + transition2[is.na(transition2$value),]$value = 0 + + if(!all(transition2$value == 0)){ #having rows of data but a transition table filled with 0 is bad + + print("Plotting stacked transition") + + png(filename=paste("transitions_stacked_", name, ".png", sep="")) + p = ggplot(transition2, 
aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity") #stacked bar + p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL)) + print(p) + dev.off() + + print("Plotting heatmap transition") + + png(filename=paste("transitions_heatmap_", name, ".png", sep="")) + p = ggplot(transition2, aes(factor(reorder(id, order.x)), factor(reorder(variable, order.y)))) + geom_tile(aes(fill = value), colour="white") + scale_fill_gradient(low="white", high="steelblue") #heatmap + p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + print(p) + dev.off() + } else { + print("No data to plot") + } + } + + #print(paste("writing value file: ", name, "_", fname, "_value.txt" ,sep="")) + + write.table(x=transitionTable, file=paste("transitions_", name ,"_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) + write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", name , "_", fname, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + + cat(matrx[1,x], file=paste(name, "_", fname, "_value.txt" ,sep="")) + cat(nrow(tmp), file=paste(name, "_", fname, "_n.txt" ,sep="")) + + #print(paste(fname, name, nrow(tmp))) + + matrx +} + +nts = c("a", "c", "g", "t") +zeros=rep(0, 4) + +funcs = c(median, sum, mean) +fnames = c("median", "sum", "mean") + +print("Creating result tables") + +for(i in 1:length(funcs)){ + func = funcs[[i]] + fname = fnames[[i]] + + rows = 9 + if(fname == "sum"){ + rows = 11 + } + matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows) + + for(i in 1:length(genes)){ + print(paste("Creating table for", fname, genes[i])) + matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i]) + } + + matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", 
dat$best_match),], matrx, func, fname, name="all") + + result = data.frame(matrx) + if(fname == "sum"){ + row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR") + } else { + row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") + } + + write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F) +} + +print("Adding median number of mutations to sum table") + +sum.table = read.table("mutations_sum.txt", sep=",", header=F) +median.table = read.table("mutations_median.txt", sep=",", header=F) + +new.table = sum.table[1,] +new.table[2,] = median.table[1,] +new.table[3:12,] = sum.table[2:11,] +new.table[,1] = as.character(new.table[,1]) +new.table[2,1] = "Median of Number of Mutations (%)" + +#sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),] + +write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F) + + +print("Plotting ca piechart") + +dat = dat[!grepl("^unmatched", dat$best_match),] + +#blegh +genesForPlot = dat[grepl("ca", dat$best_match),]$best_match +if(length(genesForPlot) > 0){ + genesForPlot = data.frame(table(genesForPlot)) + colnames(genesForPlot) = c("Gene","Freq") + genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) + + pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) + pc = pc + geom_bar(width = 1, stat = "identity") + pc = pc + 
coord_polar(theta="y") + pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA subclasses", "( n =", sum(genesForPlot$Freq), ")")) + write.table(genesForPlot, "ca.txt", sep="\t",quote=F,row.names=F,col.names=T) + + png(filename="ca.png") + print(pc) + dev.off() +} + +print("Plotting cg piechart") + +genesForPlot = dat[grepl("cg", dat$best_match),]$best_match +if(length(genesForPlot) > 0){ + genesForPlot = data.frame(table(genesForPlot)) + colnames(genesForPlot) = c("Gene","Freq") + genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) + + pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label)) + pc = pc + geom_bar(width = 1, stat = "identity") + pc = pc + coord_polar(theta="y") + pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG subclasses", "( n =", sum(genesForPlot$Freq), ")")) + write.table(genesForPlot, "cg.txt", sep="\t",quote=F,row.names=F,col.names=T) + + png(filename="cg.png") + print(pc) + dev.off() +} + + +print("Plotting scatterplot") + +dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) + +p = ggplot(dat, aes(best_match, percentage_mutations)) +p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) +p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + +png(filename="scatter.png") +print(p) +dev.off() + +write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) + +write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T) + + +print("Plotting frequency ranges plot") + +dat$best_match_class = substr(dat$best_match, 0, 2) +freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20") +dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels) + +frequency_bins_data = data.frame(data.table(dat)[, 
list(frequency_count=.N), by=c("best_match_class", "frequency_bins")]) + +p = ggplot(frequency_bins_data, aes(frequency_bins, frequency_count)) +p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") +p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + +png(filename="frequency_ranges.png") +print(p) +dev.off() + +frequency_bins_data_by_class = frequency_bins_data + +write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\t",quote=F,row.names=F,col.names=T) + +frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "frequency_bins")]) + +write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T) + + +#frequency_bins_data_by_class +#frequency_ranges_subclasses.txt + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 8a5a2abbb870 mutation_analysis.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.xml Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,101 @@ + + + + wrapper.sh $in_file $method $out_file $out_file.files_path ${in_file.name} ${include_fr1} $functionality $unique $naive_output_ca $naive_output_cg $naive_output_cm $filter_uniques $class_filter $empty_region_filter + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + naive_output_cond['naive_output'] == "yes" + + + naive_output_cond['naive_output'] == "yes" + + + naive_output_cond['naive_output'] == "yes" + + + + 10.1093/nar/gks457 + 10.1093/bioinformatics/btv359 + 10.1186/1471-2105-10-421 + + + Takes an IMGT zip (http://www.imgt.org/HighV-QUEST/search.action) file and creates a summarization of the mutation analysis. 
+ + +--------------------------+ + | unique filter | + +--------+--------+--------+ + | values | remove | keep | + +--------+--------+--------+ + | A | A | A | + +--------+--------+--------+ + | A | B | B | + +--------+--------+--------+ + | B | D | C | + +--------+--------+--------+ + | B | | D | + +--------+--------+--------+ + | C | | | + +--------+--------+--------+ + | D | | | + +--------+--------+--------+ + | D | | | + +--------+--------+--------+ + + + + blastn + + diff -r 000000000000 -r 8a5a2abbb870 naive_output.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/naive_output.r Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,45 @@ +args <- commandArgs(trailingOnly = TRUE) + +naive.file = args[1] +shm.file = args[2] +output.file.ca = args[3] +output.file.cg = args[4] +output.file.cm = args[5] + +naive = read.table(naive.file, sep="\t", header=T, quote="", fill=T) +shm.merge = read.table(shm.file, sep="\t", header=T, quote="", fill=T) + + +final = merge(naive, shm.merge[,c("Sequence.ID", "best_match")], by.x="ID", by.y="Sequence.ID") +print(paste("nrow final:", nrow(final))) +names(final)[names(final) == "best_match"] = "Sample" +final.numeric = final[,sapply(final, is.numeric)] +final.numeric[is.na(final.numeric)] = 0 +final[,sapply(final, is.numeric)] = final.numeric + +final.ca = final[grepl("^ca", final$Sample),] +final.cg = final[grepl("^cg", final$Sample),] +final.cm = final[grepl("^cm", final$Sample),] + +if(nrow(final.ca) > 0){ + final.ca$Replicate = 1 +} + +if(nrow(final.cg) > 0){ + final.cg$Replicate = 1 +} + +if(nrow(final.cm) > 0){ + final.cm$Replicate = 1 +} + +#print(paste("nrow final:", nrow(final))) +#final2 = final +#final2$Sample = gsub("[0-9]", "", final2$Sample) +#final = rbind(final, final2) +#final$Replicate = 1 + +write.table(final.ca, output.file.ca, quote=F, sep="\t", row.names=F, col.names=T) +write.table(final.cg, output.file.cg, quote=F, sep="\t", row.names=F, col.names=T) +write.table(final.cm, output.file.cm, quote=F, sep="\t", 
row.names=F, col.names=T) + diff -r 000000000000 -r 8a5a2abbb870 new_imgt.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/new_imgt.r Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,27 @@ +args <- commandArgs(trailingOnly = TRUE) #usage: new_imgt.r <imgt dir> <merged file> <gene prefix or "-">; rewrites every .txt table in the dir, in place, keeping only rows whose "Sequence ID" is in the (filtered) merged file + +imgt.dir = args[1] +merged.file = args[2] +gene = args[3] + +merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") #quote="" like the other readers of this file, so apostrophes/quotes in a field cannot swallow columns + +if(gene != "-"){ + merged = merged[grepl(gene, merged$best_match),] +} + +merged = merged[!grepl("unmatched", merged$best_match),] + +for(f in list.files(imgt.dir, pattern="\\.txt$")){ #pattern is a regular expression, not a glob + #print(paste("filtering", f)) + path = file.path(imgt.dir, f) #works whether or not imgt.dir ends with a "/" + dat = read.table(path, header=T, sep="\t", fill=T, quote="", stringsAsFactors=F, check.names=FALSE) + + dat = dat[dat[,"Sequence ID"] %in% merged$Sequence.ID,] + + if(nrow(dat) > 0 && grepl("^8_", f)){ #change the FR1 columns to 0 in the "8_..." file + dat[,grepl("^FR1", names(dat))] = 0 + } + + write.table(dat, path, quote=F, sep="\t", row.names=F, col.names=T, na="") +} diff -r 000000000000 -r 8a5a2abbb870 pattern_plots.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pattern_plots.r Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,139 @@ +library(ggplot2) +library(reshape2) +library(scales) + +args <- commandArgs(trailingOnly = TRUE) + +input.file = args[1] #the data that's get turned into the "SHM overview" table in the html report "data_sum.txt" + +plot1.path = args[2] +plot1.png = paste(plot1.path, ".png", sep="") +plot1.txt = paste(plot1.path, ".txt", sep="") + +plot2.path = args[3] +plot2.png = paste(plot2.path, ".png", sep="") +plot2.txt = paste(plot2.path, ".txt", sep="") + +plot3.path = args[4] +plot3.png = paste(plot3.path, ".png", sep="") +plot3.txt = paste(plot3.path, ".txt", sep="") + +dat = read.table(input.file, header=F, sep=",", quote="", stringsAsFactors=F, fill=T, row.names=1) + + + +classes = c("ca", "ca1", "ca2", "cg", "cg1", "cg2", "cg3", "cg4", "cm") +xyz = c("x", "y", "z") +new.names = c(paste(rep(classes, each=3), xyz, 
sep="."), paste("un", xyz, sep="."), paste("all", xyz, sep=".")) + +names(dat) = new.names + +dat["RGYW.WRCY",] = colSums(dat[c(13,14),]) +dat["TW.WA",] = colSums(dat[c(15,16),]) + +data1 = dat[c("RGYW.WRCY", "TW.WA"),] + +data1 = data1[,names(data1)[grepl(".z", names(data1))]] +names(data1) = gsub("\\..*", "", names(data1)) + +data1 = melt(t(data1)) + +names(data1) = c("Class", "Type", "value") + +write.table(data1, plot1.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) + +p = ggplot(data1, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge") + ylab("% of mutations") + guides(fill=guide_legend(title=NULL)) +png(filename=plot1.png) +print(p) +dev.off() + +data2 = dat[5:8,] + +data2["sum",] = colSums(data2) + +data2 = data2[,names(data2)[grepl("\\.x", names(data2))]] +names(data2) = gsub(".x", "", names(data2)) + +data2["A/T",] = round(colSums(data2[3:4,]) / data2["sum",] * 100, 1) +data2["A/T",is.nan(unlist(data2["A/T",]))] = 0 + +data2["G/C transversions",] = round(data2[2,] / data2["sum",] * 100, 1) +data2["G/C transitions",] = round(data2[1,] / data2["sum",] * 100, 1) + + +data2["G/C transversions",is.nan(unlist(data2["G/C transversions",]))] = 0 +data2["G/C transversions",is.infinite(unlist(data2["G/C transversions",]))] = 0 +data2["G/C transitions",is.nan(unlist(data2["G/C transitions",]))] = 0 +data2["G/C transitions",is.infinite(unlist(data2["G/C transitions",]))] = 0 + +data2 = melt(t(data2[6:8,])) + +names(data2) = c("Class", "Type", "value") + +write.table(data2, plot2.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) + +p = ggplot(data2, aes(x=Class, y=value, fill=Type)) + geom_bar(position="fill", stat="identity") + scale_y_continuous(labels=percent_format()) + guides(fill=guide_legend(title=NULL)) + ylab("% of mutations") +png(filename=plot2.png) +print(p) +dev.off() + +data3 = dat[c(5, 6, 8, 17:20),] +data3 = data3[,names(data3)[grepl("\\.x", names(data3))]] +names(data3) = gsub(".x", "", names(data3)) + 
+data3["G/C transitions",] = round(data3[1,] / (data3[5,] + data3[7,]) * 100, 1) + +data3["G/C transversions",] = round(data3[2,] / (data3[5,] + data3[7,]) * 100, 1) + +data3["A/T",] = round(data3[3,] / (data3[4,] + data3[6,]) * 100, 1) + +data3["G/C transitions",is.nan(unlist(data3["G/C transitions",]))] = 0 +data3["G/C transitions",is.infinite(unlist(data3["G/C transitions",]))] = 0 + +data3["G/C transversions",is.nan(unlist(data3["G/C transversions",]))] = 0 +data3["G/C transversions",is.infinite(unlist(data3["G/C transversions",]))] = 0 + +data3["A/T",is.nan(unlist(data3["A/T",]))] = 0 +data3["A/T",is.infinite(unlist(data3["A/T",]))] = 0 + +data3 = melt(t(data3[8:10,])) +names(data3) = c("Class", "Type", "value") + +write.table(data3, plot3.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) + +p = ggplot(data3, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge") + ylab("% of nucleotides") + guides(fill=guide_legend(title=NULL)) +png(filename=plot3.png) +print(p) +dev.off() + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 8a5a2abbb870 sequence_overview.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sequence_overview.r Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,315 @@ +library(reshape2) + +args <- commandArgs(trailingOnly = TRUE) + +before.unique.file = args[1] +merged.file = args[2] +outputdir = args[3] +gene.classes = unlist(strsplit(args[4], ",")) +hotspot.analysis.sum.file = args[5] +NToverview.file = paste(outputdir, "ntoverview.txt", sep="/") +NTsum.file = paste(outputdir, "ntsum.txt", sep="/") +main.html = "index.html" + +setwd(outputdir) + +before.unique = read.table(before.unique.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") +merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") +hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="") + +#before.unique = 
before.unique[!grepl("unmatched", before.unique$best_match),] + +before.unique$seq_conc = paste(before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq) + +IDs = before.unique[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")] +IDs$best_match = as.character(IDs$best_match) + +#dat = data.frame(data.table(dat)[, list(freq=.N), by=c("best_match", "seq_conc")]) + +dat = data.frame(table(before.unique$seq_conc)) +#dat = data.frame(table(merged$seq_conc, merged$Functionality)) + +#dat = dat[dat$Freq > 1,] + +#names(dat) = c("seq_conc", "Functionality", "Freq") +names(dat) = c("seq_conc", "Freq") + +dat$seq_conc = factor(dat$seq_conc) + +dat = dat[order(as.character(dat$seq_conc)),] + +#writing html from R... +get.bg.color = function(val){ + if(val %in% c("TRUE", "FALSE", "T", "F")){ #if its a logical value, give the background a green/red color + return(ifelse(val,"#eafaf1","#f9ebea")) + } else if (!is.na(as.numeric(val))) { #if its a numerical value, give it a grey tint if its >0 + return(ifelse(val > 0,"#eaecee","white")) + } else { + return("white") + } +} +td = function(val) { + return(paste("", val, "", sep="")) +} +tr = function(val) { + return(paste(c("", sapply(val, td), ""), collapse="")) +} + +make.link = function(id, clss, val) { + paste("", val, "", sep="") +} +tbl = function(df) { + res = "" + for(i in 1:nrow(df)){ + res = paste(res, tr(df[i,]), sep="") + } + res = paste(res, "
") +} + +cat("", file=main.html, append=F) +cat("", file=main.html, append=T) +cat("", file=main.html, append=T) +cat("", file=main.html, append=T) +cat("", file=main.html, append=T) +cat("", file=main.html, append=T) +cat("", file=main.html, append=T) +cat("", file=main.html, append=T) + + + +single.sequences=0 #sequence only found once, skipped +in.multiple=0 #same sequence across multiple subclasses +multiple.in.one=0 #same sequence multiple times in one subclass +unmatched=0 #all of the sequences are unmatched +some.unmatched=0 #one or more sequences in a clone are unmatched +matched=0 #should be the same als matched sequences + +sequence.id.page="by_id.html" + +for(i in 1:nrow(dat)){ + + ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca1", IDs$best_match),] + ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca2", IDs$best_match),] + + cg1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg1", IDs$best_match),] + cg2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg2", IDs$best_match),] + cg3 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg3", IDs$best_match),] + cg4 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg4", IDs$best_match),] + + cm = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cm", IDs$best_match),] + + un = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^unmatched", IDs$best_match),] + allc = rbind(ca1, ca2, cg1, cg2, cg3, cg4, cm, un) + + ca1.n = nrow(ca1) + ca2.n = nrow(ca2) + + cg1.n = nrow(cg1) + cg2.n = nrow(cg2) + cg3.n = nrow(cg3) + cg4.n = nrow(cg4) + + cm.n = nrow(cm) + + un.n = nrow(un) + + classes = c(ca1.n, ca2.n, cg1.n, cg2.n, cg3.n, cg4.n, cm.n, un.n) + + classes.sum = sum(classes) + + if(classes.sum == 1){ + single.sequences = single.sequences + 1 + next + } + + if(un.n == classes.sum){ + unmatched = unmatched + 1 + next + } + + in.classes = sum(classes > 0) + + matched = matched + in.classes #count in how many subclasses the sequence occurs. 
+ + if(any(classes == classes.sum)){ + multiple.in.one = multiple.in.one + 1 + } else if (un.n > 0) { + some.unmatched = some.unmatched + 1 + } else { + in.multiple = in.multiple + 1 + } + + id = as.numeric(dat[i,"seq_conc"]) + + functionality = paste(unique(allc[,"Functionality"]), collapse=",") + + by.id.row = c() + + if(ca1.n > 0){ + cat(tbl(ca1), file=paste("ca1_", id, ".html", sep="")) + } + + if(ca2.n > 0){ + cat(tbl(ca2), file=paste("ca2_", id, ".html", sep="")) + } + + if(cg1.n > 0){ + cat(tbl(cg1), file=paste("cg1_", id, ".html", sep="")) + } + + if(cg2.n > 0){ + cat(tbl(cg2), file=paste("cg2_", id, ".html", sep="")) + } + + if(cg3.n > 0){ + cat(tbl(cg3), file=paste("cg3_", id, ".html", sep="")) + } + + if(cg4.n > 0){ + cat(tbl(cg4), file=paste("cg4_", id, ".html", sep="")) + } + + if(cm.n > 0){ + cat(tbl(cm), file=paste("cm_", id, ".html", sep="")) + } + + if(un.n > 0){ + cat(tbl(un), file=paste("un_", id, ".html", sep="")) + } + + ca1.html = make.link(id, "ca1", ca1.n) + ca2.html = make.link(id, "ca2", ca2.n) + + cg1.html = make.link(id, "cg1", cg1.n) + cg2.html = make.link(id, "cg2", cg2.n) + cg3.html = make.link(id, "cg3", cg3.n) + cg4.html = make.link(id, "cg4", cg4.n) + + cm.html = make.link(id, "cm", cm.n) + + un.html = make.link(id, "un", un.n) + + #extra columns + ca.n = ca1.n + ca2.n + + cg.n = cg1.n + cg2.n + cg3.n + cg4.n + + #in.classes + + in.ca.cg = (ca.n > 0 & cg.n > 0) + + in.ca1.ca2 = (ca1.n > 0 & ca2.n > 0) + + in.cg1.cg2 = (cg1.n > 0 & cg2.n > 0) + in.cg1.cg3 = (cg1.n > 0 & cg3.n > 0) + in.cg1.cg4 = (cg1.n > 0 & cg4.n > 0) + in.cg2.cg3 = (cg2.n > 0 & cg3.n > 0) + in.cg2.cg4 = (cg2.n > 0 & cg4.n > 0) + in.cg3.cg4 = (cg3.n > 0 & cg4.n > 0) + + in.cg1.cg2.cg3 = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0) + in.cg2.cg3.cg4 = (cg2.n > 0 & cg3.n > 0 & cg4.n > 0) + in.cg1.cg2.cg4 = (cg1.n > 0 & cg2.n > 0 & cg4.n > 0) + in.cg1.cg3.cg4 = (cg1.n > 0 & cg3.n > 0 & cg4.n > 0) + + in.cg.all = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0 & cg4.n > 0) + + + + + #rw = 
c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html) + rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html) + rw = c(rw, ca.n, cg.n, in.classes, in.ca.cg, in.ca1.ca2, in.cg1.cg2, in.cg1.cg3, in.cg1.cg4, in.cg2.cg3, in.cg2.cg4, in.cg3.cg4, in.cg1.cg2.cg3, in.cg2.cg3.cg4, in.cg1.cg2.cg4, in.cg1.cg3.cg4, in.cg.all) + + cat(tr(rw), file=main.html, append=T) + + + for(i in 1:nrow(allc)){ #generate html by id + html = make.link(id, allc[i,"best_match"], allc[i,"Sequence.ID"]) + cat(paste(html, "
"), file=sequence.id.page, append=T) + } +} + +cat("
CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once
SequenceFunctionalityca1ca2cg1cg2cg3cg4cmuntotal CAtotal CGnumber of subclassespresent in both Ca and CgCa1+Ca2Cg1+Cg2Cg1+Cg3Cg1+Cg4Cg2+Cg3Cg2+Cg4Cg3+Cg4Cg1+Cg2+Cg3Cg2+Cg3+Cg4Cg1+Cg2+Cg4Cg1+Cg3+Cg4Cg1+Cg2+Cg3+Cg4
", file=main.html, append=T) + +print(paste("Single sequences:", single.sequences)) +print(paste("Sequences in multiple subclasses:", in.multiple)) +print(paste("Multiple sequences in one subclass:", multiple.in.one)) +print(paste("Matched with unmatched:", some.unmatched)) +print(paste("Count that should match 'matched' sequences:", matched)) + +#ACGT overview + +NToverview = merged[!grepl("^unmatched", merged$best_match),] + +NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq, sep="_") + +NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq)) +NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq)) +NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq)) +NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq)) + +#Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T)) + +#NToverview = rbind(NToverview, NTsum) + +NTresult = data.frame(nt=c("A", "C", "T", "G")) + +for(clazz in gene.classes){ + NToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),] + new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G)) + new.col.y = sum(new.col.x) + new.col.z = round(new.col.x / new.col.y * 100, 2) + + tmp = names(NTresult) + NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z)) + names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep="")) +} + +write.table(NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")], NToverview.file, quote=F, sep="\t", row.names=F, col.names=T) + +NToverview = NToverview[!grepl("unmatched", NToverview$best_match),] + +new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G)) +new.col.y = sum(new.col.x) +new.col.z = round(new.col.x / new.col.y * 100, 2) + +tmp = names(NTresult) +NTresult = cbind(NTresult, data.frame(new.col.x, 
new.col.y, new.col.z)) +names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep="")) + +names(hotspot.analysis.sum) = names(NTresult) + +hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult) + +write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0") + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 8a5a2abbb870 style.tar.gz Binary file style.tar.gz has changed diff -r 000000000000 -r 8a5a2abbb870 subclass_definition.db.nhr Binary file subclass_definition.db.nhr has changed diff -r 000000000000 -r 8a5a2abbb870 subclass_definition.db.nin Binary file subclass_definition.db.nin has changed diff -r 000000000000 -r 8a5a2abbb870 subclass_definition.db.nsq Binary file subclass_definition.db.nsq has changed diff -r 000000000000 -r 8a5a2abbb870 summary_to_fasta.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/summary_to_fasta.py Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,42 @@ +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="The 1_Summary file of an IMGT zip file") +parser.add_argument("--fasta", help="The output fasta file") + +args = parser.parse_args() + +infile = args.input +fasta = args.fasta + +with open(infile, 'r') as i, open(fasta, 'w') as o: + first = True + id_col = 0 + seq_col = 0 + no_results = 0 + no_seqs = 0 + passed = 0 + for line in i: + splt = line.split("\t") + if first: + id_col = splt.index("Sequence ID") + seq_col = splt.index("Sequence") + first = False + continue + if len(splt) < 5: + no_results += 1 + continue + + ID = splt[id_col] + seq = splt[seq_col] + + if not len(seq) > 0: + no_seqs += 1 + continue + + o.write(">" + ID + "\n" + seq + "\n") + passed += 1 + + print "No results:", no_results + print "No sequences:", no_seqs + print "Written to fasta file:", passed diff -r 000000000000 -r 8a5a2abbb870 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Aug 
29 05:36:10 2016 -0400 @@ -0,0 +1,24 @@ + + + + + + + + + ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.30/ncbi-blast-2.2.30+-x64-linux.tar.gz + + bin/blastn + $INSTALL_DIR + + + $INSTALL_DIR + + + + + downloads blast (ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.30/ncbi-blast-2.2.30+-x64-linux.tar.gz) and keeps the blastn executable + + + + diff -r 000000000000 -r 8a5a2abbb870 wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wrapper.sh Mon Aug 29 05:36:10 2016 -0400 @@ -0,0 +1,603 @@ +#!/bin/bash +#set -e +dir="$(cd "$(dirname "$0")" && pwd)" +input=$1 +method=$2 +log=$3 #becomes the main html page at the end +outdir=$4 +output="$outdir/index.html" #copied to $log location at the end +title=$5 +include_fr1=$6 +functionality=$7 +unique=$8 +naive_output_ca=$9 +naive_output_cg=${10} +naive_output_cm=${11} +filter_unique=${12} +class_filter=${13} +empty_region_filter=${14} +mkdir $outdir + +tar -xzf $dir/style.tar.gz -C $outdir + +echo "---------------- read parameters ----------------" +echo "---------------- read parameters ----------------
" > $log + +echo "unpacking IMGT file" + +type="`file $input`" +if [[ "$type" == *"Zip archive"* ]] ; then + echo "Zip archive" + echo "unzip $input -d $PWD/files/" + unzip $input -d $PWD/files/ +elif [[ "$type" == *"XZ compressed data"* ]] ; then + echo "ZX archive" + echo "tar -xJf $input -C $PWD/files/" + mkdir -p $PWD/files/$title + tar -xJf $input -C $PWD/files/$title +fi + +cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt +cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt +cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt +cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt +cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt +cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt +cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt + +if [[ ${#BLASTN_DIR} -ge 5 ]] ; then + echo "On server, using BLASTN_DIR env: ${BLASTN_DIR}" +else + BLASTN_DIR="/home/galaxy/Downloads/ncbi-blast-2.4.0+/bin" + echo "Dev Galaxy set BLASTN_DIR to: ${BLASTN_DIR}" +fi + +echo "---------------- identification ($method) ----------------" +echo "---------------- identification ($method) ----------------
" >> $log + +if [[ "${method}" == "custom" ]] ; then + python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt +else + echo "---------------- summary_to_fasta.py ----------------" + echo "---------------- summary_to_fasta.py ----------------
" >> $log + + python $dir/summary_to_fasta.py --input $PWD/summary.txt --fasta $PWD/sequences.fasta + + echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt + ${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt +fi + +echo "---------------- merge_and_filter.r ----------------" +echo "---------------- merge_and_filter.r ----------------
" >> $log + +Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} ${empty_region_filter} 2>&1 + +echo "---------------- creating new IMGT zip ----------------" +echo "---------------- creating new IMGT zip ----------------
" >> $log + +mkdir $outdir/new_IMGT + +cat `find $PWD/files/ -name "1_*"` > "$outdir/new_IMGT/1_Summary.txt" +cat `find $PWD/files/ -name "2_*"` > "$outdir/new_IMGT/2_IMGT-gapped-nt-sequences.txt" +cat `find $PWD/files/ -name "3_*"` > "$outdir/new_IMGT/3_Nt-sequences.txt" +cat `find $PWD/files/ -name "4_*"` > "$outdir/new_IMGT/4_IMGT-gapped-AA-sequences.txt" +cat `find $PWD/files/ -name "5_*"` > "$outdir/new_IMGT/5_AA-sequences.txt" +cat `find $PWD/files/ -name "6_*"` > "$outdir/new_IMGT/6_Junction.txt" +cat `find $PWD/files/ -name "7_*"` > "$outdir/new_IMGT/7_V-REGION-mutation-and-AA-change-table.txt" +cat `find $PWD/files/ -name "8_*"` > "$outdir/new_IMGT/8_V-REGION-nt-mutation-statistics.txt" +cat `find $PWD/files/ -name "9_*"` > "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt" +cat `find $PWD/files/ -name "10_*"` > "$outdir/new_IMGT/10_V-REGION-mutation-hotspots.txt" + +mkdir $outdir/new_IMGT_ca +cp $outdir/new_IMGT/* $outdir/new_IMGT_ca + +mkdir $outdir/new_IMGT_ca1 +cp $outdir/new_IMGT/* $outdir/new_IMGT_ca1 + +mkdir $outdir/new_IMGT_ca2 +cp $outdir/new_IMGT/* $outdir/new_IMGT_ca2 + +mkdir $outdir/new_IMGT_cg +cp $outdir/new_IMGT/* $outdir/new_IMGT_cg + +mkdir $outdir/new_IMGT_cg1 +cp $outdir/new_IMGT/* $outdir/new_IMGT_cg1 + +mkdir $outdir/new_IMGT_cg2 +cp $outdir/new_IMGT/* $outdir/new_IMGT_cg2 + +mkdir $outdir/new_IMGT_cg3 +cp $outdir/new_IMGT/* $outdir/new_IMGT_cg3 + +mkdir $outdir/new_IMGT_cg4 +cp $outdir/new_IMGT/* $outdir/new_IMGT_cg4 + +mkdir $outdir/new_IMGT_cm +cp $outdir/new_IMGT/* $outdir/new_IMGT_cm + +Rscript $dir/new_imgt.r $outdir/new_IMGT/ $outdir/merged.txt "-" 2>&1 + +Rscript $dir/new_imgt.r $outdir/new_IMGT_ca/ $outdir/merged.txt "ca" 2>&1 +Rscript $dir/new_imgt.r $outdir/new_IMGT_ca1/ $outdir/merged.txt "ca1" 2>&1 +Rscript $dir/new_imgt.r $outdir/new_IMGT_ca2/ $outdir/merged.txt "ca2" 2>&1 + +Rscript $dir/new_imgt.r $outdir/new_IMGT_cg/ $outdir/merged.txt "cg" 2>&1 +Rscript $dir/new_imgt.r $outdir/new_IMGT_cg1/ $outdir/merged.txt 
"cg1" 2>&1 +Rscript $dir/new_imgt.r $outdir/new_IMGT_cg2/ $outdir/merged.txt "cg2" 2>&1 +Rscript $dir/new_imgt.r $outdir/new_IMGT_cg3/ $outdir/merged.txt "cg3" 2>&1 +Rscript $dir/new_imgt.r $outdir/new_IMGT_cg4/ $outdir/merged.txt "cg4" 2>&1 + +Rscript $dir/new_imgt.r $outdir/new_IMGT_cm/ $outdir/merged.txt "cm" 2>&1 + + +tmp="$PWD" +cd $outdir/new_IMGT/ #tar weirdness... +tar -cJf ../new_IMGT.txz * + +cd $outdir/new_IMGT_ca/ +tar -cJf ../new_IMGT_ca.txz * + +cd $outdir/new_IMGT_ca1/ +tar -cJf ../new_IMGT_ca1.txz * + +cd $outdir/new_IMGT_ca2/ +tar -cJf ../new_IMGT_ca2.txz * + +cd $outdir/new_IMGT_cg/ +tar -cJf ../new_IMGT_cg.txz * + +cd $outdir/new_IMGT_cg1/ +tar -cJf ../new_IMGT_cg1.txz * + +cd $outdir/new_IMGT_cg2/ +tar -cJf ../new_IMGT_cg2.txz * + +cd $outdir/new_IMGT_cg3/ +tar -cJf ../new_IMGT_cg3.txz * + +cd $outdir/new_IMGT_cg4/ +tar -cJf ../new_IMGT_cg4.txz * + +cd $outdir/new_IMGT_cm/ +tar -cJf ../new_IMGT_cm.txz * + +cd $tmp + +echo "---------------- mutation_analysis.r ----------------" +echo "---------------- mutation_analysis.r ----------------
" >> $log + +classes="ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm,unmatched" +echo "R mutation analysis" +Rscript $dir/mutation_analysis.r $outdir/merged.txt $classes $outdir ${include_fr1} 2>&1 + + +echo "---------------- mutation_analysis.py ----------------" +echo "---------------- mutation_analysis.py ----------------
" >> $log + +python $dir/mutation_analysis.py --input $outdir/merged.txt --genes $classes --includefr1 "${include_fr1}" --output $outdir/hotspot_analysis.txt + +echo "---------------- aa_histogram.r ----------------" +echo "---------------- aa_histogram.r ----------------
" >> $log + +Rscript $dir/aa_histogram.r $outdir/aa_id_mutations.txt $outdir/absent_aa_id.txt "ca,cg,cm" $outdir/ 2>&1 +if [ -e "$outdir/aa_histogram_.png" ]; then + mv $outdir/aa_histogram_.png $outdir/aa_histogram.png + mv $outdir/aa_histogram_.txt $outdir/aa_histogram.txt +fi + +genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm) + +funcs=(sum mean median) +funcs=(sum) + +echo "---------------- sequence_overview.r ----------------" +echo "---------------- sequence_overview.r ----------------
" >> $log + +mkdir $outdir/sequence_overview + +Rscript $dir/sequence_overview.r $outdir/before_unique_filter.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1 + +echo "" > $outdir/base_overview.html + +while IFS=$'\t' read ID class seq A C G T +do + echo "" >> $outdir/base_overview.html +done < $outdir/sequence_overview/ntoverview.txt + +echo "

$title

" > $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output + +matched_count="`cat $outdir/merged.txt | grep -v 'unmatched' | tail -n +2 | wc -l`" +unmatched_count="`cat $outdir/unmatched.txt | tail -n +2 | wc -l`" +total_count=$((matched_count + unmatched_count)) +perc_count=$((unmatched_count / total_count * 100)) +perc_count=`bc -l <<< "scale=2; ${unmatched_count} / ${total_count} * 100"` +perc_count=`bc -l <<< "scale=2; (${unmatched_count} / ${total_count} * 100 ) / 1"` + +echo "

Total: ${total_count}

" >> $output +echo "

Matched: ${matched_count} Unmatched: ${unmatched_count}

" >> $output +echo "

Percentage unmatched: ${perc_count}

" >> $output + +echo "---------------- main tables ----------------" +echo "---------------- main tables ----------------
" >> $log + +echo "
" >> $output +echo "
" >> $output + +for func in ${funcs[@]} +do + + echo "---------------- $func table ----------------" + echo "---------------- $func table ----------------
" >> $log + + cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt + + echo "---------------- pattern_plots.r ----------------" + echo "---------------- pattern_plots.r ----------------
" >> $log + + Rscript $dir/pattern_plots.r $outdir/data_${func}.txt $outdir/plot1 $outdir/plot2 $outdir/plot3 2>&1 + + echo "
$ID$seq$class$A$C$G$T
" >> $output + echo "" >> $output + for gene in ${genes[@]} + do + tmp=`cat $outdir/${gene}_${func}_n.txt` + echo "" >> $output + done + + tmp=`cat $outdir/all_${func}_n.txt` + echo "" >> $output + tmp=`cat $outdir/unmatched_${func}_n.txt` + echo "" >> $output + + while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz unx uny unz allx ally allz + do + if [ "$name" == "FR S/R (ratio)" ] || [ "$name" == "CDR S/R (ratio)" ] ; then #meh + echo "" >> $output + else + echo "" >> $output + fi + done < $outdir/data_${func}.txt + echo "
info${gene} (N = $tmp)all (N = $tmp)unmatched (N = ${unmatched_count})
$name${cax}/${cay} (${caz})${ca1x}/${ca1y} (${ca1z})${ca2x}/${ca2y} (${ca2z})${cgx}/${cgy} (${cgz})${cg1x}/${cg1y} (${cg1z})${cg2x}/${cg2y} (${cg2z})${cg3x}/${cg3y} (${cg3z})${cg4x}/${cg4y} (${cg4z})${cmx}/${cmy} (${cmz})${allx}/${ally} (${allz})
$name${cax}/${cay} (${caz}%)${ca1x}/${ca1y} (${ca1z}%)${ca2x}/${ca2y} (${ca2z}%)${cgx}/${cgy} (${cgz}%)${cg1x}/${cg1y} (${cg1z}%)${cg2x}/${cg2y} (${cg2z}%)${cg3x}/${cg3y} (${cg3z}%)${cg4x}/${cg4y} (${cg4z}%)${cmx}/${cmy} (${cmz}%)${allx}/${ally} (${allz}%)${unx}/${uny} (${unz}%)
" >> $output + #echo "Download data" >> $output +done + +echo "
" >> $output +echo "
" >> $output +echo "
" >> $output + +echo "" >> $output #SHM overview tab end + +echo "---------------- images ----------------" +echo "---------------- images ----------------
" >> $log + +echo "
" >> $output + +if [ -a $outdir/scatter.png ] +then + echo "
" >> $output + echo "download data
" >> $output +fi +if [ -a $outdir/frequency_ranges.png ] +then + echo "
" >> $output + echo "download class data
" >> $output + echo "download subclass data
" >> $output +fi + +echo "
" >> $output #SHM frequency tab end + +echo "
" >> $output + +echo "" >> $output + +for gene in ${genes[@]} +do + echo "" >> $output + echo "" >> $output + echo "" >> $output + echo "" >> $output + echo "" >> $output + + echo "" >> $output +done + +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output + +echo "" >> $output + +echo "

${gene}

" >> $output + while IFS=, read from a c g t + do + echo "" >> $output + done < $outdir/transitions_${gene}_sum.txt + echo "
$from$a$c$g$t

All

" >> $output +while IFS=, read from a c g t + do + echo "" >> $output +done < $outdir/transitions_all_sum.txt +echo "
$from$a$c$g$t
" >> $output + +echo "
" >> $output #transition tables tab end + +echo "
" >> $output + +if [ -a $outdir/aa_histogram.png ] +then + echo "
" >> $output + echo "download data
" >> $output + echo "
" >> $output + echo "download data
" >> $output + echo "
" >> $output + echo "download data
" >> $output + echo "
" >> $output + echo "download data
" >> $output +fi + +echo "" >> $output +echo "" >> $output +echo "" >> $output + +echo "
" >> $output #antigen selection tab end + +echo "
" >> $output #CSR tab + +if [ -a $outdir/ca.png ] +then + echo "
" >> $output + echo "download data
" >> $output +fi +if [ -a $outdir/cg.png ] +then + echo "
" >> $output + echo "download data
" >> $output +fi + +echo "
" >> $output #CSR tab end + +echo "---------------- change-o MakeDB ----------------" + +mkdir $outdir/change_o + +tmp="$PWD" + +cd $outdir/change_o + +bash $dir/change_o/makedb.sh $input false false false $outdir/change_o/change-o-db.txt +bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones.txt $outdir/change_o/change-o-defined_clones-summary.txt + +Rscript $dir/merge.r $outdir/change_o/change-o-db-defined_clones.txt $outdir/merged.txt "all" "Sequence.ID,best_match" "SEQUENCE_ID" "Sequence.ID" $outdir/change_o/change-o-db-defined_clones.txt 2>&1 + +echo "Rscript $dir/merge.r $outdir/change_o/change-o-db-defined_clones.txt $outdir/$outdir/merged.txt 'all' 'Sequence.ID,best_match' 'Sequence.ID' 'Sequence.ID' '\t' $outdir/change_o/change-o-db-defined_clones.txt 2>&1" + +if [[ $(wc -l < $outdir/new_IMGT_ca/1_Summary.txt) -gt "1" ]]; then + bash $dir/change_o/makedb.sh $outdir/new_IMGT_ca.txz false false false $outdir/change_o/change-o-db-ca.txt + bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db-ca.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones-ca.txt $outdir/change_o/change-o-defined_clones-summary-ca.txt +else + echo "No ca sequences" > "$outdir/change_o/change-o-db-defined_clones-ca.txt" + echo "No ca sequences" > "$outdir/change_o/change-o-defined_clones-summary-ca.txt" +fi + +if [[ $(wc -l < $outdir/new_IMGT_cg/1_Summary.txt) -gt "1" ]]; then + bash $dir/change_o/makedb.sh $outdir/new_IMGT_cg.txz false false false $outdir/change_o/change-o-db-cg.txt + bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db-cg.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones-cg.txt $outdir/change_o/change-o-defined_clones-summary-cg.txt +else + echo "No cg sequences" > "$outdir/change_o/change-o-db-defined_clones-cg.txt" + echo "No cg sequences" > 
"$outdir/change_o/change-o-defined_clones-summary-cg.txt" +fi + +if [[ $(wc -l < $outdir/new_IMGT_cm/1_Summary.txt) -gt "1" ]]; then + bash $dir/change_o/makedb.sh $outdir/new_IMGT_cm.txz false false false $outdir/change_o/change-o-db-cm.txt + bash $dir/change_o/define_clones.sh bygroup $outdir/change_o/change-o-db-cm.txt gene first ham none min complete 3.0 $outdir/change_o/change-o-db-defined_clones-cm.txt $outdir/change_o/change-o-defined_clones-summary-cm.txt +else + echo "No cm sequences" > "$outdir/change_o/change-o-db-defined_clones-cm.txt" + echo "No cm sequences" > "$outdir/change_o/change-o-defined_clones-summary-cm.txt" +fi + +PWD="$tmp" + +echo "
" >> $output #clonality tab + +function clonality_table { # usage: clonality_table INFILE OUTFILE; appends to OUTFILE, writing one entry per line of INFILE after the first + local infile=$1 + local outfile=$2 + + echo "" >> "$outfile" + echo "" >> "$outfile" + + local first='true' # keep the skip-first-line flag out of the global scope + + while read -r size clones seqs # -r: take backslashes in INFILE literally + do + if [[ "$first" == "true" ]]; then # skip the first line of INFILE + first="false" + continue + fi + echo "" >> "$outfile" + done < "$infile" + + echo "
Clone sizeNr of clonesNr of sequences
$size$clones$seqs
" >> "$outfile" +} +echo "
" >> $output + +echo "
" >> $output +clonality_table $outdir/change_o/change-o-defined_clones-summary.txt $output +echo "
" >> $output + +echo "
" >> $output +clonality_table $outdir/change_o/change-o-defined_clones-summary-ca.txt $output +echo "
" >> $output + +echo "
" >> $output +clonality_table $outdir/change_o/change-o-defined_clones-summary-cg.txt $output +echo "
" >> $output + +echo "
" >> $output +clonality_table $outdir/change_o/change-o-defined_clones-summary-cm.txt $output +echo "
" >> $output + +echo "
" >> $output #clonality tabber end + +echo "
" >> $output #clonality tab end + +echo "
" >> $output + +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output + +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output + +echo "" >> $output + +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output + +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output + +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output + +echo "
infolink
The complete datasetDownload
The SHM Overview table as a datasetDownload
The data used to generate the first SHM Overview plotDownload
The data used to generate the second SHM Overview plotDownload
The data used to generate the third SHM Overview plotDownload
The alignment info on the unmatched sequencesDownload
Motif data per sequence IDDownload
Mutation data per sequence IDDownload
AA mutation data per sequence IDDownload
Absent AA location data per sequence IDDownload
CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than onceDownload
Base count for every sequenceDownload
Baseline PDF (http://selection.med.yale.edu/baseline/)Download
Baseline dataDownload
Baseline ca PDFDownload
Baseline ca dataDownload
Baseline cg PDFDownload
Baseline cg dataDownload
Baseline cm PDFDownload
Baseline cm dataDownload
An IMGT archive with just the matched and filtered sequencesDownload
An IMGT archive with just the matched and filtered ca sequencesDownload
An IMGT archive with just the matched and filtered ca1 sequencesDownload
An IMGT archive with just the matched and filtered ca2 sequencesDownload
An IMGT archive with just the matched and filtered cg sequencesDownload
An IMGT archive with just the matched and filtered cg1 sequencesDownload
An IMGT archive with just the matched and filtered cg2 sequencesDownload
An IMGT archive with just the matched and filtered cg3 sequencesDownload
An IMGT archive with just the matched and filtered cg4 sequencesDownload
An IMGT archive with just the matched and filtered cm sequencesDownload
The Change-O DB file with defined clones and subclass annotationDownload
The Change-O DB defined clones summary fileDownload
The Change-O DB file with defined clones of caDownload
The Change-O DB defined clones summary file of caDownload
The Change-O DB file with defined clones of cgDownload
The Change-O DB defined clones summary file of cgDownload
The Change-O DB file with defined clones of cmDownload
The Change-O DB defined clones summary file of cmDownload
" >> $output + +echo "
" >> $output #downloads tab end + +echo "" >> $output #tabs end + +echo "" >> $output + +echo "---------------- baseline ----------------" +echo "---------------- baseline ----------------
" >> $log +tmp="$PWD" + +mkdir $outdir/baseline + + +mkdir $outdir/baseline/ca_cg_cm +if [[ $(wc -l < $outdir/new_IMGT/1_Summary.txt) -gt "1" ]]; then + cd $outdir/baseline/ca_cg_cm + bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT.txz "ca_cg_cm" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt" +else + echo "No sequences" > "$outdir/baseline.txt" +fi + +mkdir $outdir/baseline/ca +if [[ $(wc -l < $outdir/new_IMGT_ca/1_Summary.txt) -gt "1" ]]; then + cd $outdir/baseline/ca + bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_ca.txz "ca" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_ca.pdf" "Sequence.ID" "$outdir/baseline_ca.txt" +else + echo "No ca sequences" > "$outdir/baseline_ca.txt" +fi + +mkdir $outdir/baseline/cg +if [[ $(wc -l < $outdir/new_IMGT_cg/1_Summary.txt) -gt "1" ]]; then + cd $outdir/baseline/cg + bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cg.txz "cg" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cg.pdf" "Sequence.ID" "$outdir/baseline_cg.txt" +else + echo "No cg sequences" > "$outdir/baseline_cg.txt" +fi + +mkdir $outdir/baseline/cm +if [[ $(wc -l < $outdir/new_IMGT_cm/1_Summary.txt) -gt "1" ]]; then + cd $outdir/baseline/cm + bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cm.txz "cm" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cm.pdf" "Sequence.ID" "$outdir/baseline_cm.txt" +else + echo "No cm sequences" > "$outdir/baseline_cm.txt" +fi + +cd $tmp + +echo "---------------- naive_output.r ----------------" +echo "---------------- naive_output.r ----------------
" >> $log + +if [[ "$naive_output" != "None" ]] +then + cp $outdir/new_IMGT_ca.txz ${naive_output_ca} + cp $outdir/new_IMGT_cg.txz ${naive_output_cg} + cp $outdir/new_IMGT_cm.txz ${naive_output_cm} +fi + +echo "" >> $outdir/base_overview.html + +mv $log $outdir/log.html + +echo "

Click here for the results

Tip: Open it in a new tab (middle mouse button or right mouse button -> 'open in new tab' on the link above)
" > $log +echo "" >> $log +echo "" >> $log +tIFS="$IFS" # save the current IFS so the later IFS="$tIFS" really restores it +IFS=$'\t' # split each line of filtering_steps.txt on tabs only +while read step seq perc + do + echo "" >> $log + echo "" >> $log + echo "" >> $log + echo "" >> $log + echo "" >> $log +done < $outdir/filtering_steps.txt +echo "
InfoSequencesPercentage
$step$seq${perc}%
" >> $log + +IFS="$tIFS" + + +echo "---------------- Done! ----------------" +echo "---------------- Done! ----------------
" >> $outdir/log.html + + + + + + + + + + + + + + + + + + + + +