Commit message:
Uploaded
added:
aa_histogram.r
baseline/Baseline_Functions.r
baseline/Baseline_Main.r
baseline/FiveS_Mutability.RData
baseline/FiveS_Substitution.RData
baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
baseline/comparePDFs.r
baseline/filter.r
baseline/script_imgt.py
baseline/script_xlsx.py
baseline/wrapper.sh
change_o/DefineClones.py
change_o/MakeDb.py
change_o/define_clones.r
change_o/define_clones.sh
change_o/makedb.sh
datatypes_conf.xml
gene_identification.py
imgt_loader.r
merge.r
merge_and_filter.r
mutation_analysis.py
mutation_analysis.r
mutation_analysis.xml
naive_output.r
new_imgt.r
pattern_plots.r
sequence_overview.r
style.tar.gz
subclass_definition.db.nhr
subclass_definition.db.nin
subclass_definition.db.nsq
summary_to_fasta.py
tool_dependencies.xml
wrapper.sh
diff -r 000000000000 -r 8a5a2abbb870 aa_histogram.r
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/aa_histogram.r	Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,61 @@
+library(ggplot2)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+mutations.by.id.file = args[1]
+absent.aa.by.id.file = args[2]
+genes = strsplit(args[3], ",")[[1]]
+genes = c(genes, "")
+outdir = args[4]
+
+
+print("---------------- read input ----------------")
+
+mutations.by.id = read.table(mutations.by.id.file, sep="\t", fill=T, header=T, quote="")
+absent.aa.by.id = read.table(absent.aa.by.id.file, sep="\t", fill=T, header=T, quote="")
+
+for(gene in genes){
+
+  if(gene == ""){
+    mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),]
+    absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),]
+  } else {
+    mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),]
+    absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),]
+  }
+  print(paste("nrow", gene, nrow(absent.aa.by.id.gene)))
+  if(nrow(mutations.by.id.gene) == 0){
+    next
+  }
+
+  mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)])
+  aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)])
+
+  dat_freq = mutations.at.position / aa.at.position
+  dat_freq[is.na(dat_freq)] = 0
+  dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
+
+  print("---------------- plot ----------------")
+
+  m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
+  m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i)
+  m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
+  m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")
+  m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2")
+  m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2")
+  m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3")
+  m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(paste(gene, "AA mutation frequency"))
+
+  print("---------------- write/print ----------------")
+
+  png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720)
+  print(m)
+  dev.off()
+
+  dat.sums = data.frame(index=1:length(mutations.at.position), mutations.at.position=mutations.at.position, aa.at.position=aa.at.position)
+
+  write.table(dat.sums, paste(outdir, "/aa_histogram_sum_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+  write.table(mutations.by.id.gene, paste(outdir, "/aa_histogram_count_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+  write.table(absent.aa.by.id.gene, paste(outdir, "/aa_histogram_absent_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+  write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+}
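A minimal usage sketch for aa_histogram.r (all four positional arguments are read via commandArgs above; the file names and the gene prefixes matched against the best_match column are hypothetical):

    Rscript aa_histogram.r mutations_by_id.txt absent_aa_by_id.txt IGA,IGG,IGM outdir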
diff -r 000000000000 -r 8a5a2abbb870 baseline/Baseline_Functions.r
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/Baseline_Functions.r	Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,2287 @@
+#########################################################################################
+# License Agreement
+#
+# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE
+# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER
+# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE
+# OR COPYRIGHT LAW IS PROHIBITED.
+#
+# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE
+# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED
+# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN
+# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
+#
+# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences
+# Coded by: Mohamed Uduman & Gur Yaari
+# Copyright 2012 Kleinstein Lab
+# Version: 1.3 (01/23/2014)
+#########################################################################################
+
+# Global variables
+
+ FILTER_BY_MUTATIONS = 1000
+
+ # Nucleotides
+ NUCLEOTIDES = c("A","C","G","T")
+
+ # Amino Acids
+ AMINO_ACIDS <- c("F", "F", "L", "L", "S", "S", "S", "S", "Y", "Y", "*", "*", "C", "C", "*", "W", "L", "L", "L", "L", "P", "P", "P", "P", "H", "H", "Q", "Q", "R", "R", "R", "R", "I", "I", "I", "M", "T", "T", "T", "T", "N", "N", "K", "K", "S", "S", "R", "R", "V", "V", "V", "V", "A", "A", "A", "A", "D", "D", "E", "E", "G", "G", "G", "G")
+ names(AMINO_ACIDS) <- c("TTT", "TTC", "TTA", "TTG", "TCT", "TCC", "TCA", "TCG", "TAT", "TAC", "TAA", "TAG", "TGT", "TGC", "TGA", "TGG", "CTT", "CTC", "CTA", "CTG", "CCT", "CCC", "CCA", "CCG", "CAT", "CAC", "CAA", "CAG", "CGT", "CGC", "CGA", "CGG", "ATT", "ATC", "ATA", "ATG", "ACT", "ACC", "ACA", "ACG", "AAT", "AAC", "AAA", "AAG", "AGT", "AGC", "AGA", "AGG", "GTT", "GTC", "GTA", "GTG", "GCT", "GCC", "GCA", "GCG", "GAT", "GAC", "GAA", "GAG", "GGT", "GGC", "GGA", "GGG")
+ names(AMINO_ACIDS) <- names(AMINO_ACIDS)
+
+ # Amino Acid Traits
+ # "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V" "W" "Y"
+ # B = "Hydrophobic/Buried", N = "Intermediate/Neutral", S = "Hydrophilic/Surface"
+ TRAITS_AMINO_ACIDS_CHOTHIA98 <- c("*","N","B","S","S","B","N","N","B","S","B","B","S","N","S","S","N","N","B","B","N")
+ names(TRAITS_AMINO_ACIDS_CHOTHIA98) <- sort(unique(AMINO_ACIDS))
+ TRAITS_AMINO_ACIDS <- array(NA,21)
+
+ # Codon Table
+ CODON_TABLE <- as.data.frame(matrix(NA,ncol=64,nrow=12))
+
+ # Substitution Model: Smith DS et al. 1996
+ substitution_Literature_Mouse <- matrix(c(0, 0.156222928, 0.601501588, 0.242275484, 0.172506739, 0, 0.241239892, 0.586253369, 0.54636291, 0.255795364, 0, 0.197841727, 0.290240811, 0.467680608, 0.24207858, 0),nrow=4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))
+ substitution_Flu_Human <- matrix(c(0,0.2795596,0.5026927,0.2177477,0.1693210,0,0.3264723,0.5042067,0.4983549,0.3328321,0,0.1688130,0.2021079,0.4696077,0.3282844,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))
+ substitution_Flu25_Human <- matrix(c(0,0.2580641,0.5163685,0.2255674,0.1541125,0,0.3210224,0.5248651,0.5239281,0.3101292,0,0.1659427,0.1997207,0.4579444,0.3423350,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))
+ load("FiveS_Substitution.RData")
+
+ # Mutability Models: Shapiro GS et al. 2002
+ triMutability_Literature_Human <- matrix(c(0.24, 1.2, 0.96, 0.43, 2.14, 2, 1.11, 1.9, 0.85, 1.83, 2.36, 1.31, 0.82, 0.52, 0.89, 1.33, 1.4, 0.82, 1.83, 0.73, 1.83, 1.62, 1.53, 0.57, 0.92, 0.42, 0.42, 1.47, 3.44, 2.58, 1.18, 0.47, 0.39, 1.12, 1.8, 0.68, 0.47, 2.19, 2.35, 2.19, 1.05, 1.84, 1.26, 0.28, 0.98, 2.37, 0.66, 1.58, 0.67, 0.92, 1.76, 0.83, 0.97, 0.56, 0.75, 0.62, 2.26, 0.62, 0.74, 1.11, 1.16, 0.61, 0.88, 0.67, 0.37, 0.07, 1.08, 0.46, 0.31, 0.94, 0.62, 0.57, 0.29, NA, 1.44, 0.46, 0.69, 0.57, 0.24, 0.37, 1.1, 0.99, 1.39, 0.6, 2.26, 1.24, 1.36, 0.52, 0.33, 0.26, 1.25, 0.37, 0.58, 1.03, 1.2,
[... middle of the hunk elided by the changeset viewer ...]
+  LisGLs_MutabilityU = lapply(1:length(facLevels), function(x){
+    computeMutabilities(facLevels[x])
+  })
+  facIndex = match(facGL,facLevels)
+
+  LisGLs_Mutability = lapply(1:nrow(matInput), function(x){
+    cInput = rep(NA,nchar(matInput[x,1]))
+    cInput[s2c(matInput[x,1])!="N"] = 1
+    LisGLs_MutabilityU[[facIndex[x]]] * cInput
+  })
+
+  LisGLs_Targeting = lapply(1:dim(matInput)[1], function(x){
+    computeTargeting(matInput[x,2],LisGLs_Mutability[[x]])
+  })
+
+  LisGLs_MutationTypes = lapply(1:length(matInput[,2]),function(x){
+    #print(x)
+    computeMutationTypes(matInput[x,2])
+  })
+
+  LisGLs_R_Exp = lapply(1:nrow(matInput), function(x){
+    Exp_R <- rollapply(as.zoo(1:readEnd),width=3,by=3,
+      function(codonNucs){
+        RPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="R")
+        sum( LisGLs_Targeting[[x]][,codonNucs][RPos], na.rm=T )
+      }
+    )
+  })
+
+  LisGLs_S_Exp = lapply(1:nrow(matInput), function(x){
+    Exp_S <- rollapply(as.zoo(1:readEnd),width=3,by=3,
+      function(codonNucs){
+        SPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="S")
+        sum( LisGLs_Targeting[[x]][,codonNucs][SPos], na.rm=T )
+      }
+    )
+  })
+
+  Exp_R = matrix(unlist(LisGLs_R_Exp),nrow=nrow(matInput),ncol=readEnd/3,T)
+  Exp_S = matrix(unlist(LisGLs_S_Exp),nrow=nrow(matInput),ncol=readEnd/3,T)
+  return( list( "Expected_R"=Exp_R, "Expected_S"=Exp_S) )
+ }
+}
+
+# getObservedMutationsByCodon <- function(listMutations){
+#   numbSeqs <- length(listMutations)
+#   obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3))))
+#   obsMu_S <- obsMu_R
+#   temp <- mclapply(1:length(listMutations), function(i){
+#     arrMutations = listMutations[[i]]
+#     RPos = as.numeric(names(arrMutations)[arrMutations=="R"])
+#     RPos <- sapply(RPos,getCodonNumb)
+#     if(any(RPos)){
+#       tabR <- table(RPos)
+#       obsMu_R[i,as.numeric(names(tabR))] <<- tabR
+#     }
+#
+#     SPos = as.numeric(names(arrMutations)[arrMutations=="S"])
+#     SPos <- sapply(SPos,getCodonNumb)
+#     if(any(SPos)){
+#       tabS <- table(SPos)
+#       obsMu_S[i,names(tabS)] <<- tabS
+#     }
+#   }
+#   )
+#   return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) )
+# }
+
+getObservedMutationsByCodon <- function(listMutations){
+  numbSeqs <- length(listMutations)
+  obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3))))
+  obsMu_S <- obsMu_R
+  temp <- lapply(1:length(listMutations), function(i){
+    arrMutations = listMutations[[i]]
+    RPos = as.numeric(names(arrMutations)[arrMutations=="R"])
+    RPos <- sapply(RPos,getCodonNumb)
+    if(any(RPos)){
+      tabR <- table(RPos)
+      obsMu_R[i,as.numeric(names(tabR))] <<- tabR
+    }
+
+    SPos = as.numeric(names(arrMutations)[arrMutations=="S"])
+    SPos <- sapply(SPos,getCodonNumb)
+    if(any(SPos)){
+      tabS <- table(SPos)
+      obsMu_S[i,names(tabS)] <<- tabS
+    }
+  }
+  )
+  return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) )
+}
diff -r 000000000000 -r 8a5a2abbb870 baseline/Baseline_Main.r
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/Baseline_Main.r	Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,388 @@
+#########################################################################################
+# License Agreement
+#
+# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE
+# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER
+# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE
+# OR COPYRIGHT LAW IS PROHIBITED.
+#
+# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE
+# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED
+# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN
+# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
+#
+# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences
+# Coded by: Mohamed Uduman & Gur Yaari
+# Copyright 2012 Kleinstein Lab
+# Version: 1.3 (01/23/2014)
+#########################################################################################
+
+op <- options();
+options(showWarnCalls=FALSE, showErrorCalls=FALSE, warn=-1)
+library('seqinr')
+if( F & Sys.info()[1]=="Linux"){
+  library("multicore")
+}
+
+# Load functions and initialize global variables
+source("Baseline_Functions.r")
+
+# Initialize parameters with user provided arguments
+ arg <- commandArgs(TRUE)
+ #arg = c(2,1,5,5,0,1,"1:26:38:55:65:104:116", "test.fasta","","sample")
+ #arg = c(1,1,5,5,0,1,"1:38:55:65:104:116:200", "test.fasta","","sample")
+ #arg = c(1,1,5,5,1,1,"1:26:38:55:65:104:116", "/home/mu37/Wu/Wu_Cloned_gapped_sequences_D-masked.fasta","/home/mu37/Wu/","Wu")
+ testID <- as.numeric(arg[1])                    # 1 = Focused, 2 = Local
+ species <- as.numeric(arg[2])                   # 1 = Human, 2 = Mouse
+ substitutionModel <- as.numeric(arg[3])         # 0 = Uniform substitution, 1 = Smith DS et al. 1996, 5 = FiveS
+ mutabilityModel <- as.numeric(arg[4])           # 0 = Uniform mutability, 1 = Tri-nucleotide (Shapiro GS et al. 2002), 5 = FiveS
+ clonal <- as.numeric(arg[5])                    # 0 = Independent sequences, 1 = Clonally related, 2 = Clonally related & only non-terminal mutations
+ fixIndels <- as.numeric(arg[6])                 # 0 = Do nothing, 1 = Try and fix Indels
+ region <- as.numeric(strsplit(arg[7],":")[[1]]) # StartPos:LastNucleotideF1:C1:F2:C2:F3:C3
+ inputFilePath <- arg[8]                         # Full path to input file
+ outputPath <- arg[9]                            # Full path to location of output files
+ outputID <- arg[10]                             # ID for session output
+
+
+ if(testID==5){
+   traitChangeModel <- 1
+   if( !is.na(any(arg[11])) ) traitChangeModel <- as.numeric(arg[11]) # 1 <- Chothia 1998
+   initializeTraitChange(traitChangeModel)
+ }
+
+# Initialize other parameters/variables
+
+ # Initialize the codon table ( definitions of R/S )
+ computeCodonTable(testID)
+
+ # Initialize
+ # Test Name
+ testName<-"Focused"
+ if(testID==2) testName<-"Local"
+ if(testID==3) testName<-"Imbalanced"
+ if(testID==4) testName<-"ImbalancedSilent"
+
+ # Indel placeholders initialization
+ indelPos <- NULL
+ delPos <- NULL
+ insPos <- NULL
+
+ # Initialize Transition & Mutability matrices
+ substitution <- initializeSubstitutionMatrix(substitutionModel,species)
+ mutability <- initializeMutabilityMatrix(mutabilityModel,species)
+
+ # FWR/CDR boundaries
+ flagTrim <- F
+ if( is.na(region[7])){
+   flagTrim <- T
+   region[7]<-region[6]
+ }
+ readStart = min(region,na.rm=T)
+ readEnd = max(region,na.rm=T)
+ if(readStart>1){
+   region = region - (readStart - 1)
+ }
+ region_Nuc = c( (region[1]*3-2) , (region[2:7]*3) )
+ region_Cod = region
+
+ readStart = (readStart*3)-2
+ readEnd = (readEnd*3)
+
+ FWR_Nuc <- c( rep(TRUE,(region_Nuc[2])),
[... middle of the hunk elided by the changeset viewer ...]
+    listPDFs[[rowNumb]] = list("CDR"=bayesPDF_groups_cdr[[G]],"FWR"=bayesPDF_groups_fwr[[G]])
+    names(listPDFs)[rowNumb] = names(groups[groups==paste(G)])[1]
+    #if(names(groups)[which(groups==G)[1]]!="All sequences combined"){
+    gs = unique(germlines[groups==G])
+    rowNumb = rowNumb+1
+    if( !is.na(gs) ){
+      for( g in gs ){
+        matOutput[rowNumb,c(1,2,11:18)] = c("Germline",names(germlines)[germlines==g][1],bayes_germlines_cdr[g,],bayes_germlines_fwr[g,],simgaP_germlines_cdr[g],simgaP_germlines_fwr[g])
+        listPDFs[[rowNumb]] = list("CDR"=bayesPDF_germlines_cdr[[g]],"FWR"=bayesPDF_germlines_fwr[[g]])
+        names(listPDFs)[rowNumb] = names(germlines[germlines==paste(g)])[1]
+        rowNumb = rowNumb+1
+        indexesOfInterest = which(germlines==g)
+        numbSeqsOfInterest = length(indexesOfInterest)
+        rowNumb = seq(rowNumb,rowNumb+(numbSeqsOfInterest-1))
+        matOutput[rowNumb,] = matrix( c( rep("Sequence",numbSeqsOfInterest),
+                                         rownames(matInput)[indexesOfInterest],
+                                         c(matMutationInfo[indexesOfInterest,1:4]),
+                                         c(matMutationInfo[indexesOfInterest,5:8]),
+                                         c(bayes_cdr[indexesOfInterest,]),
+                                         c(bayes_fwr[indexesOfInterest,]),
+                                         c(simgaP_cdr[indexesOfInterest]),
+                                         c(simgaP_fwr[indexesOfInterest])
+                                       ), ncol=18, nrow=numbSeqsOfInterest,byrow=F)
+        increment=0
+        for( ioi in indexesOfInterest){
+          listPDFs[[min(rowNumb)+increment]] = list("CDR"=bayesPDF_cdr[[ioi]] , "FWR"=bayesPDF_fwr[[ioi]])
+          names(listPDFs)[min(rowNumb)+increment] = rownames(matInput)[ioi]
+          increment = increment + 1
+        }
+        rowNumb=max(rowNumb)+1
+
+      }
+    }
+  }
+  colsToFormat = 11:18
+  matOutput[,colsToFormat] = formatC( matrix(as.numeric(matOutput[,colsToFormat]), nrow=nrow(matOutput), ncol=length(colsToFormat)) , digits=3)
+  matOutput[matOutput== " NaN"] = NA
+
+
+
+  colnames(matOutput) = c("Type", "ID", "Observed_CDR_R", "Observed_CDR_S", "Observed_FWR_R", "Observed_FWR_S",
+                          "Expected_CDR_R", "Expected_CDR_S", "Expected_FWR_R", "Expected_FWR_S",
+                          paste( rep(testName,6), rep(c("Sigma","CIlower","CIupper"),2),rep(c("CDR","FWR"),each=3), sep="_"),
+                          paste( rep(testName,2), rep("P",2),c("CDR","FWR"), sep="_")
+                        )
+  fileName = paste(outputPath,outputID,".txt",sep="")
+  write.table(matOutput,file=fileName,quote=F,sep="\t",row.names=T,col.names=NA)
+  fileName = paste(outputPath,outputID,".RData",sep="")
+  save(listPDFs,file=fileName)
+
+indelWarning = FALSE
+if(sum(indelPos)>0){
+  indelWarning = "<P>Warning: The following sequences have either gaps and/or deletions, and have been omitted from the analysis.";
+  indelWarning = paste( indelWarning , "<UL>", sep="" )
+  for(indels in names(indelPos)[indelPos]){
+    indelWarning = paste( indelWarning , "<LI>", indels, "</LI>", sep="" )
+  }
+  indelWarning = paste( indelWarning , "</UL></P>", sep="" )
+}
+
+cloneWarning = FALSE
+if(clonal==1){
+  if(sum(matInputErrors)>0){
+    cloneWarning = "<P>Warning: The following clones have sequences of unequal length.";
+    cloneWarning = paste( cloneWarning , "<UL>", sep="" )
+    for(clone in names(matInputErrors)[matInputErrors]){
+      cloneWarning = paste( cloneWarning , "<LI>", names(germlines)[as.numeric(clone)], "</LI>", sep="" )
+    }
+    cloneWarning = paste( cloneWarning , "</UL></P>", sep="" )
+  }
+}
+cat(paste("Success",outputID,indelWarning,cloneWarning,sep="|"))
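A usage sketch mirroring the commented-out example argument vectors at the top of the script; it must run from the baseline/ directory because Baseline_Functions.r and the FiveS .RData files are loaded by relative path:

    cd baseline
    Rscript Baseline_Main.r 1 1 5 5 0 1 1:26:38:55:65:104:116 test.fasta ./ sample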
diff -r 000000000000 -r 8a5a2abbb870 baseline/FiveS_Mutability.RData |
Binary file baseline/FiveS_Mutability.RData has changed |
diff -r 000000000000 -r 8a5a2abbb870 baseline/FiveS_Substitution.RData |
Binary file baseline/FiveS_Substitution.RData has changed |
diff -r 000000000000 -r 8a5a2abbb870 baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa	Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,703 @@
+>IGHV1-18*01
+caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga
+>IGHV1-18*02
+caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctaagatctgacgacacggcc
+>IGHV1-18*03
+caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacatggccgtgtattactgtgcgagaga
+>IGHV1-18*04
+caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctacggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga
+>IGHV1-2*01
+caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccagtaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga
+>IGHV1-2*02
+caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga
+>IGHV1-2*03
+caggtgcagctggtgcagtctggggct...gaggtgaagaagcttggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcnacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga
+>IGHV1-2*04
+caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggctgggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga
+>IGHV1-2*05
+caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga
+>IGHV1-24*01
+caggtccagctggtacagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggtttccggatacaccctc............actgaattatccatgcactgggtgcgacaggctcctggaaaagggcttgagtggatgggaggttttgatcctgaa......gatggtgaaacaatctacgcacagaagttccag...ggcagagtcaccatgaccgaggacacatctacagacacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcaacaga
+>IGHV1-3*01
+caggtccagcttgtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggatcaacgctggc......aatggtaacacaaaatattcacagaagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaagacacggctgtgtattactgtgcgagaga
+>IGHV1-3*02
+caggttcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggagcaacgctggc......aatggtaacacaaaatattcacaggagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaggacatg
[... middle of the file elided by the changeset viewer; text resumes mid-sequence ...]
+aggtgcagctgttgcagtctgcagca...gaggtgaaaagacccggggagtctctgaggatctcctgtaagacttctggatacagcttt............accagctactggatccactgggtgcgccagatgcccgggaaagaactggagtggatggggagcatctatcctggg......aactctgataccagatacagcccatccttccaa...ggccacgtcaccatctcagccgacagctccagcagcaccgcctacctgcagtggagcagcctgaaggcctcggacgccgccatgtattattgtgtgaga
+>IGHV6-1*01
+caggtacagctgcagcagtcaggtcca...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga
+>IGHV6-1*02
+caggtacagctgcagcagtcaggtccg...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga
+>IGHV7-34-1*01
+...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......actgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta
+>IGHV7-34-1*02
+...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......aatgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta
+>IGHV7-4-1*01
+caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatctgcagcctaaaggctgaggacactgccgtgtattactgtgcgaga
+>IGHV7-4-1*02
+caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga
+>IGHV7-4-1*03
+caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcacgctaaaggctgaggacactg
+>IGHV7-4-1*04
+caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga
+>IGHV7-4-1*05
+caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtgttactgtgcgagaga
+>AIGHV7-40*03|
+ttttcaatagaaaagtcaaataatcta...agtgtcaatcagtggatgattagataaaatatgatatatgtaaatcatggaatactatgc............agccagtatggtatgaattcagtgtgaccagcccctggacaagggcttgagtggatgggatggatcatcacctac......actgggaacccaacatataccaacggcttcaca...ggacggtttctattctccatggacacctctgtcagcatggcgtatctgcagatcagcagcctaaaggctgaggacacggccgtgtatgactgtatgagaga
+>IGHV7-81*01
+caggtgcagctggtgcagtctggccat...gaggtgaagcagcctggggcctcagtgaaggtctcctgcaaggcttctggttacagtttc............accacctatggtatgaattgggtgccacaggcccctggacaagggcttgagtggatgggatggttcaacacctac......actgggaacccaacatatgcccagggcttcaca...ggacggtttgtcttctccatggacacctctgccagcacagcatacctgcagatcagcagcctaaaggctgaggacatggccatgtattactgtgcgagata
diff -r 000000000000 -r 8a5a2abbb870 baseline/comparePDFs.r
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/comparePDFs.r	Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,225 @@
+options("warn"=-1)
+
+#from http://selection.med.yale.edu/baseline/Archive/Baseline%20Version%201.3/Baseline_Functions_Version1.3.r
+# Compute p-value of two distributions
+compareTwoDistsFaster <-function(sigma_S=seq(-20,20,length.out=4001), N=10000, dens1=runif(4001,0,1), dens2=runif(4001,0,1)){
+  #print(c(length(dens1),length(dens2)))
+  if(length(dens1)>1 & length(dens2)>1 ){
+    dens1<-dens1/sum(dens1)
+    dens2<-dens2/sum(dens2)
+    cum2 <- cumsum(dens2)-dens2/2
+    tmp<- sum(sapply(1:length(dens1),function(i)return(dens1[i]*cum2[i])))
+    #print(tmp)
+    if(tmp>0.5)tmp<-tmp-1
+    return( tmp )
+  }
+  else {
+    return(NA)
+  }
+  #return (sum(sapply(1:N,function(i)(sample(sigma_S,1,prob=dens1)>sample(sigma_S,1,prob=dens2))))/N)
+}
+
+
+require("grid")
+arg <- commandArgs(TRUE)
+#arg <- c("300143","4","5")
+arg[!arg=="clonal"]
+input <- arg[1]
+output <- arg[2]
+rowIDs <- as.numeric( sapply(arg[3:(max(3,length(arg)))],function(x){ gsub("chkbx","",x) } ) )
+
+numbSeqs = length(rowIDs)
+
+if ( is.na(rowIDs[1]) | numbSeqs>10 ) {
+  stop( paste("Error: Please select between one and 10 sequences to compare.") )
+}
+
+#load( paste("output/",sessionID,".RData",sep="") )
+load( input )
+#input
+
+xMarks = seq(-20,20,length.out=4001)
+
+plot_grid_s<-function(pdf1,pdf2,Sample=100,cex=1,xlim=NULL,xMarks = seq(-20,20,length.out=4001)){
+  yMax = max(c(abs(as.numeric(unlist(listPDFs[pdf1]))),abs(as.numeric(unlist(listPDFs[pdf2]))),0),na.rm=T) * 1.1
+
+  if(length(xlim==2)){
+    xMin=xlim[1]
+    xMax=xlim[2]
+  } else {
+    xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]
+    xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]
+    xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]
+    xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]
+
+    xMin_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][1]
+    xMin_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][1]
+    xMax_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001])]
+    xMax_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001])]
+
+    xMin=min(c(xMin_CDR,xMin_FWR,xMin_CDR2,xMin_FWR2,0),na.rm=TRUE)
+    xMax=max(c(xMax_CDR,xMax_FWR,xMax_CDR2,xMax_FWR2,0),na.rm=TRUE)
+  }
+
+  sigma<-approx(xMarks,xout=seq(xMin,xMax,length.out=Sample))$x
+  grid.rect(gp = gpar(col=gray(0.6),fill="white",cex=cex))
+  x <- sigma
+  pushViewport(viewport(x=0.175,y=0.175,width=0.825,height=0.825,just=c("left","bottom"),default.units="npc"))
+  #pushViewport(plotViewport(c(1.8, 1.8, 0.25, 0.25)*cex))
+  pushViewport(dataViewport(x, c(yMax,-yMax),gp = gpar(cex=cex),extension=c(0.05)))
+  grid.polygon(c(0,0,1,1),c(0,0.5,0.5,0),gp=gpar(col=grey(0.95),fill=grey(0.95)),default.units="npc")
+  grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.9),fill=grey(0.9)),default.units="npc")
+  grid.rect()
+  grid.xaxis(gp = gpar(cex=cex/1.1))
+  yticks = pretty(c(-yMax,yMax),8)
+  yticks = yticks[yticks>(-yMax) & yticks<(yMax)]
+  grid.yaxis(at=yticks,label=abs(yticks),gp = gpar(cex=cex/1.1))
+  if(length(listPDFs[pdf1][[1]][["CDR"]])>1){
+    ycdr<-approx(xMarks,listPDFs[pdf1][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y
+    grid.lines(unit(x,"native"), unit(ycdr,"native"),gp=gpar(col=2,lwd=2))
+  }
+  if(length(listPDFs[pdf1][[1]][["FWR"]])>1){
+    yfwr<-approx(xMarks,listPDFs[pdf1][[1]][["FWR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y
+    grid.lines(unit(x,"native"), unit(-yfwr,"native"),gp=gpar(col=4,lwd=2))
+  }
+
+  if(length(listPDFs[pdf2][[1]][["CDR"]])>1){
+    ycdr2<-approx(xMarks,listPDFs[pdf2][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y
+    grid.lines(unit(x,"native"), unit(ycdr2,"native"),gp=gpar(col=2,lwd
[... middle of the hunk elided by the changeset viewer; text resumes mid-line ...]
+                                                             npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))
+    grid.text(formatC(as.numeric(pCDR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))
+    grid.text(formatC(as.numeric(pFWR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.25, "npc"),just=c("center", "center"),gp = gpar(cex=cex))
+
+
+    # grid.text(paste("P = ",formatC(pCDRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.98, "npc"),just=c("center", "top"),gp = gpar(cex=cex))
+    # grid.text(paste("P = ",formatC(pFWRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.02, "npc"),just=c("center", "bottom"),gp = gpar(cex=cex))
+  }
+  else{
+  }
+}
+
+
+##################################################################################
+################## The whole OCD's matrix ########################################
+##################################################################################
+
+#pdf(width=4*numbSeqs+1/3,height=4*numbSeqs+1/3)
+pdf( output ,width=4*numbSeqs+1/3,height=4*numbSeqs+1/3)
+
+pushViewport(viewport(x=0.02,y=0.02,just = c("left", "bottom"),w =0.96,height=0.96,layout = grid.layout(numbSeqs+1,numbSeqs+1,widths=unit.c(unit(rep(1,numbSeqs),"null"),unit(4,"lines")),heights=unit.c(unit(4,"lines"),unit(rep(1,numbSeqs),"null")))))
+
+for( seqOne in 1:numbSeqs+1){
+  pushViewport(viewport(layout.pos.col = seqOne-1, layout.pos.row = 1))
+  if(seqOne>2){
+    grid.polygon(c(0,0,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")
+    grid.polygon(c(1,1,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")
+    grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.5)),default.units="npc")
+
+    grid.text(y=.25,x=0.75,"FWR",gp = gpar(cex=1.5),just="center")
+    grid.text(y=.25,x=0.25,"CDR",gp = gpar(cex=1.5),just="center")
+  }
+  grid.rect(gp = gpar(col=grey(0.9)))
+  grid.text(y=.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),just="center")
+  popViewport(1)
+}
+
+for( seqOne in 1:numbSeqs+1){
+  pushViewport(viewport(layout.pos.row = seqOne, layout.pos.col = numbSeqs+1))
+  if(seqOne<=numbSeqs){
+    grid.polygon(c(0,0.5,0.5,0),c(0,0,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")
+    grid.polygon(c(0,0.5,0.5,0),c(1,1,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")
+    grid.polygon(c(1,0.5,0.5,1),c(0,0,1,1),gp=gpar(col=grey(0.5)),default.units="npc")
+    grid.text(x=.25,y=0.75,"CDR",gp = gpar(cex=1.5),just="center",rot=270)
+    grid.text(x=.25,y=0.25,"FWR",gp = gpar(cex=1.5),just="center",rot=270)
+  }
+  grid.rect(gp = gpar(col=grey(0.9)))
+  grid.text(x=0.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),rot=270,just="center")
+  popViewport(1)
+}
+
+for( seqOne in 1:numbSeqs+1){
+  for(seqTwo in 1:numbSeqs+1){
+    pushViewport(viewport(layout.pos.col = seqTwo-1, layout.pos.row = seqOne))
+    if(seqTwo>seqOne){
+      plot_pvals(rowIDs[seqOne-1],rowIDs[seqTwo-1],cex=2)
+      grid.rect()
+    }
+    popViewport(1)
+  }
+}
+
+
+xMin=0
+xMax=0.01
+for(pdf1 in rowIDs){
+  xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]
+  xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]
+  xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]
+  xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]
+  xMin=min(c(xMin_CDR,xMin_FWR,xMin),na.rm=TRUE)
+  xMax=max(c(xMax_CDR,xMax_FWR,xMax),na.rm=TRUE)
+}
+
+
+
+for(i in 1:numbSeqs+1){
+  for(j in (i-1):numbSeqs){
+    pushViewport(viewport(layout.pos.col = i-1, layout.pos.row = j+1))
+    grid.rect()
+    plot_grid_s(rowIDs[i-1],rowIDs[j],cex=1)
+    popViewport(1)
+  }
+}
+
+dev.off()
+
+cat("Success", paste(rowIDs,collapse="_"),sep=":")
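A usage sketch matching the call in baseline/wrapper.sh: the first argument is the .RData file saved by Baseline_Main.r, the second the output PDF, followed by up to ten row IDs (a leading "chkbx" prefix is stripped); file names are hypothetical:

    Rscript comparePDFs.r result.RData result.pdf 1 2 3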
diff -r 000000000000 -r 8a5a2abbb870 baseline/filter.r
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/filter.r	Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,35 @@
+arg = commandArgs(TRUE)
+summaryfile = arg[1]
+gappedfile = arg[2]
+selection = arg[3]
+output = arg[4]
+print(paste("selection = ", selection))
+
+
+summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
+gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
+
+#dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))
+
+dat = cbind(gappeddat, summarydat$AA.JUNCTION)
+
+colnames(dat)[length(dat)] = "AA.JUNCTION"
+
+dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele)
+dat$VGene = gsub("[*].*", "", dat$VGene)
+
+dat$DGene = gsub("^Homsap ", "", dat$D.GENE.and.allele)
+dat$DGene = gsub("[*].*", "", dat$DGene)
+
+dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
+dat$JGene = gsub("[*].*", "", dat$JGene)
+
+#print(str(dat))
+
+dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":"))
+
+dat = dat[!duplicated(dat$past), ]
+
+dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",]
+
+write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)
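A usage sketch for filter.r; the third argument is a comma-separated list of column names whose pasted values define duplicates. The columns shown are an assumption based on the ones the script itself derives:

    Rscript filter.r summary.txt gapped.txt "VGene,DGene,JGene,AA.JUNCTION" filtered.txt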
diff -r 000000000000 -r 8a5a2abbb870 baseline/script_imgt.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/script_imgt.py	Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,79 @@
+#import xlrd #avoid dep
+import argparse
+import re
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+parser.add_argument("--ref", help="Reference file")
+parser.add_argument("--output", help="Output file")
+parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
+
+args = parser.parse_args()
+
+refdic = dict()
+with open(args.ref, 'r') as ref:
+    currentSeq = ""
+    currentId = ""
+    for line in ref:
+        if line[0] == ">":
+            if currentSeq != "" and currentId != "":
+                refdic[currentId[1:]] = currentSeq
+            currentId = line.rstrip()
+            currentSeq = ""
+        else:
+            currentSeq += line.rstrip()
+    refdic[currentId[1:]] = currentSeq
+
+
+vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
+#            r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
+#            r"(IGKV[0-3]D?-[0-9]{1,2})",
+#            r"(IGLV[0-9]-[0-9]{1,2})",
+#            r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
+#            r"(TRGV[234589])",
+#            r"(TRDV[1-3])"]
+
+#vPattern = re.compile(r"|".join(vPattern))
+vPattern = re.compile("|".join(vPattern))
+
+def filterGene(s, pattern):
+    if type(s) is not str:
+        return None
+    res = pattern.search(s)
+    if res:
+        return res.group(0)
+    return None
+
+
+
+currentSeq = ""
+currentId = ""
+first=True
+with open(args.input, 'r') as i:
+    with open(args.output, 'a') as o:
+        o.write(">>>" + args.id + "\n")
+        outputdic = dict()
+        for line in i:
+            if first:
+                first = False
+                continue
+            linesplt = line.split("\t")
+            ref = filterGene(linesplt[1], vPattern)
+            if not ref or not linesplt[2].rstrip():
+                continue
+            if ref in outputdic:
+                outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+            else:
+                outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+        #print outputdic
+
+        for k in outputdic.keys():
+            if k in refdic:
+                o.write(">>" + k + "\n")
+                o.write(refdic[k] + "\n")
+                for seq in outputdic[k]:
+                    #print seq
+                    o.write(">" + seq[0] + "\n")
+                    o.write(seq[1] + "\n")
+            else:
+                print k + " not in reference, skipping " + k
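Usage sketch, matching the call in baseline/wrapper.sh (input file name hypothetical):

    python script_imgt.py --input final_sample.txt --ref IMGT-reference-seqs-IGHV-2015-11-05.fa --output baseline.fasta --id sample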
diff -r 000000000000 -r 8a5a2abbb870 baseline/script_xlsx.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/script_xlsx.py	Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,58 @@
+import xlrd
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+parser.add_argument("--ref", help="Reference file")
+parser.add_argument("--output", help="Output file")
+
+args = parser.parse_args()
+
+gene_column = 6
+id_column = 7
+seq_column = 8
+LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
+
+
+refdic = dict()
+with open(args.ref, 'r') as ref:
+    currentSeq = ""
+    currentId = ""
+    for line in ref.readlines():
+        if line[0] == ">":
+            if currentSeq != "" and currentId != "":
+                refdic[currentId[1:]] = currentSeq
+            currentId = line.rstrip()
+            currentSeq = ""
+        else:
+            currentSeq += line.rstrip()
+    refdic[currentId[1:]] = currentSeq
+
+currentSeq = ""
+currentId = ""
+with xlrd.open_workbook(args.input, 'r') as wb:
+    with open(args.output, 'a') as o:
+        for sheet in wb.sheets():
+            if sheet.cell(1, gene_column).value.find("IGHV") < 0:
+                print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name
+                continue
+            o.write(">>>" + sheet.name + "\n")
+            outputdic = dict()
+            for rowindex in range(1, sheet.nrows):
+                ref = sheet.cell(rowindex, gene_column).value.replace(">", "")
+                if ref in outputdic:
+                    outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
+                else:
+                    outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
+            #print outputdic
+
+            for k in outputdic.keys():
+                if k in refdic:
+                    o.write(">>" + k + "\n")
+                    o.write(refdic[k] + "\n")
+                    for seq in outputdic[k]:
+                        #print seq
+                        o.write(">" + seq[0] + "\n")
+                        o.write(seq[1] + "\n")
+                else:
+                    print k + " not in reference, skipping " + k
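Usage sketch, matching the call in baseline/wrapper.sh (workbook name hypothetical):

    python script_xlsx.py --input sequences.xlsx --ref IMGT-reference-seqs-IGHV-2015-11-05.fa --output baseline.fasta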
diff -r 000000000000 -r 8a5a2abbb870 baseline/wrapper.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/wrapper.sh	Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,104 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+testID=$1
+species=$2
+substitutionModel=$3
+mutabilityModel=$4
+clonal=$5
+fixIndels=$6
+region=$7
+inputs=$8
+inputs=($inputs)
+IDs=$9
+IDs=($IDs)
+ref=${10}
+output=${11}
+selection=${12}
+output_table=${13}
+outID="result"
+
+echo "$PWD"
+
+echo "testID = $testID"
+echo "species = $species"
+echo "substitutionModel = $substitutionModel"
+echo "mutabilityModel = $mutabilityModel"
+echo "clonal = $clonal"
+echo "fixIndels = $fixIndels"
+echo "region = $region"
+echo "inputs = ${inputs[@]}"
+echo "IDs = ${IDs[@]}"
+echo "ref = $ref"
+echo "output = $output"
+echo "outID = $outID"
+
+fasta="$PWD/baseline.fasta"
+
+
+count=0
+for current in ${inputs[@]}
+do
+	f=$(file $current)
+	zipType="Zip archive"
+	if [[ "$f" == *"$zipType"* ]] || [[ "$f" == *"XZ compressed data"* ]]
+	then
+		id=${IDs[$count]}
+		echo "id=$id"
+		if [[ "$f" == *"Zip archive"* ]] ; then
+			echo "Zip archive"
+			echo "unzip $input -d $PWD/files/"
+			unzip $current -d "$PWD/$id/"
+		elif [[ "$f" == *"XZ compressed data"* ]] ; then
+			echo "XZ archive"
+			echo "tar -xJf $input -C $PWD/files/"
+			mkdir -p "$PWD/$id/files"
+			tar -xJf $current -C "$PWD/$id/files/"
+		fi
+		summaryfile="$PWD/summary_${id}.txt"
+		gappedfile="$PWD/gappednt_${id}.txt"
+		filtered="$PWD/filtered_${id}.txt"
+		filecount=`ls -l $PWD/$id/ | wc -l`
+		if [[ "$filecount" -eq "2" ]]
+		then
+			cat $PWD/$id/*/1_* > $summaryfile
+			cat $PWD/$id/*/2_* > $gappedfile
+		else
+			cat $PWD/$id/1_* > $summaryfile
+			cat $PWD/$id/2_* > $gappedfile
+		fi
+		Rscript $dir/filter.r $summaryfile $gappedfile "$selection" $filtered 2>&1
+
+		final="$PWD/final_${id}.txt"
+		cat $filtered | cut -f2,4,7 > $final
+		python $dir/script_imgt.py --input $final --ref $ref --output $fasta --id $id
+	else
+		python $dir/script_xlsx.py --input $current --ref $ref --output $fasta
+	fi
+	count=$((count+1))
+done
+
+if [[ $(wc -l < $fasta) -eq "1" ]]; then
+	echo "No sequences in the fasta file, exiting"
+	exit 0
+fi
+
+workdir="$PWD"
+cd $dir
+echo "file: ${inputs[0]}"
+#Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region ${inputs[0]} $workdir/ $outID 2>&1
+Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region $fasta $workdir/ $outID 2>&1
+
+echo "$workdir/${outID}.txt"
+
+rows=`tail -n +2 $workdir/${outID}.txt | grep -v "All sequences combined" | grep -n 'Group' | grep -Eoh '^[0-9]+' | tr '\n' ' '`
+rows=($rows)
+#unset rows[${#rows[@]}-1]
+
+cd $dir
+Rscript --verbose $dir/comparePDFs.r $workdir/${outID}.RData $output ${rows[@]} 2>&1
+cp $workdir/result.txt ${output_table}
+
+
+
+
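A sketch of the thirteen positional arguments in the order read above; every value is illustrative:

    bash wrapper.sh 1 1 5 5 0 1 1:26:38:55:65:104:116 "sample1.zip" "sample1" \
        IMGT-reference-seqs-IGHV-2015-11-05.fa result.pdf "VGene,DGene,JGene,AA.JUNCTION" result_table.txt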
diff -r 000000000000 -r 8a5a2abbb870 change_o/DefineClones.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/DefineClones.py	Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,1052 @@
+#!/usr/bin/env python3
+"""
+Assign Ig sequences into clones
+"""
+# Info
+__author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman'
+from changeo import __version__, __date__
+
+# Imports
+import os
+import re
+import sys
+import numpy as np
+from argparse import ArgumentParser
+from collections import OrderedDict
+from itertools import chain
+from textwrap import dedent
+from time import time
+from Bio import pairwise2
+from Bio.Seq import translate
+
+# Presto and changeo imports
+from presto.Defaults import default_out_args
+from presto.IO import getFileType, getOutputHandle, printLog, printProgress
+from presto.Multiprocessing import manageProcesses
+from presto.Sequence import getDNAScoreDict
+from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
+from changeo.Distance import getDNADistMatrix, getAADistMatrix, \
+                             hs1f_model, m1n_model, hs5f_model, \
+                             calcDistances, formClusters
+from changeo.IO import getDbWriter, readDbFile, countDbFile
+from changeo.Multiprocessing import DbData, DbResult
+
+# Defaults
+default_translate = False
+default_distance = 0.0
+default_bygroup_model = 'hs1f'
+default_hclust_model = 'chen2010'
+default_seq_field = 'JUNCTION'
+default_norm = 'len'
+default_sym = 'avg'
+default_linkage = 'single'
+
+# TODO: should be in Distance, but need to be after function definitions
+# Amino acid Hamming distance
+aa_model = getAADistMatrix(mask_dist=1, gap_dist=0)
+
+# DNA Hamming distance
+ham_model = getDNADistMatrix(mask_dist=0, gap_dist=0)
+
+
+# TODO: this function is an abstraction to facilitate later cleanup
+def getModelMatrix(model):
+    """
+    Simple wrapper to get distance matrix from model name
+
+    Arguments:
+    model = model name
+
+    Return:
+    a pandas.DataFrame containing the character distance matrix
+    """
+    if model == 'aa':
+        return(aa_model)
+    elif model == 'ham':
+        return(ham_model)
+    elif model == 'm1n':
+        return(m1n_model)
+    elif model == 'hs1f':
+        return(hs1f_model)
+    elif model == 'hs5f':
+        return(hs5f_model)
+    else:
+        sys.stderr.write('Unrecognized distance model: %s.\n' % model)
+
+
+def indexJunctions(db_iter, fields=None, mode='gene', action='first'):
+    """
+    Identifies preclonal groups by V, J and junction length
+
+    Arguments:
+    db_iter = an iterator of IgRecords defined by readDbFile
+    fields = additional annotation fields to use to group preclones;
+             if None use only V, J and junction length
+    mode = specificity of alignment call to use for assigning preclones;
+           one of ('allele', 'gene')
+    action = how to handle multiple value fields when assigning preclones;
+             one of ('first', 'set')
+
+    Returns:
+    a dictionary of {(V, J, junction length):[IgRecords]}
+    """
+    # Define functions for grouping keys
+    if mode == 'allele' and fields is None:
+        def _get_key(rec, act):
+            return (rec.getVAllele(act), rec.getJAllele(act),
+                    None if rec.junction is None else len(rec.junction))
+    elif mode == 'gene' and fields is None:
+        def _get_key(rec, act):
+            return (rec.getVGene(act), rec.getJGene(act),
+                    None if rec.junction is None else len(rec.junction))
+    elif mode == 'allele' and fields is not None:
+        def _get_key(rec, act):
+            vdj = [rec.getVAllele(act), rec.getJAllele(act),
+                   None if rec.junction is None else len(rec.junction)]
+            ann = [rec.toDict().get(k, None) for k in fields]
+            return tuple(chain(vdj, ann))
+    elif mode == 'gene' and fields is not None:
+        def _get_key(rec, act):
+            vdj = [rec.getVGene(act), rec.getJGene(act),
+                   None if rec.junction is None else len(rec.junction)]
+            ann = [rec.toDict().get(k, None
[... middle of the hunk elided by the changeset viewer ...]
+    parser_bygroup.add_argument('--sf', action='store', dest='seq_field',
+                                default=default_seq_field,
+                                help='''The name of the field to be used to calculate
+                                     distance between records''')
+    parser_bygroup.set_defaults(feed_func=feedQueue)
+    parser_bygroup.set_defaults(work_func=processQueue)
+    parser_bygroup.set_defaults(collect_func=collectQueue)
+    parser_bygroup.set_defaults(group_func=indexJunctions)
+    parser_bygroup.set_defaults(clone_func=distanceClones)
+
+
+    # Hierarchical clustering cloning method
+    parser_hclust = subparsers.add_parser('hclust', parents=[parser_parent],
+                                          formatter_class=CommonHelpFormatter,
+                                          help='Defines clones by specified distance metric on CDR3s and \
+                                                cutting of hierarchical clustering tree')
+#    parser_hclust.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
+#                               help='Fields to use for grouping clones (non VDJ)')
+    parser_hclust.add_argument('--method', action='store', dest='method',
+                               choices=('chen2010', 'ademokun2011'), default=default_hclust_model,
+                               help='Specifies which cloning method to use for calculating distance \
+                                     between CDR3s, computing linkage, and cutting clusters')
+    parser_hclust.set_defaults(feed_func=feedQueueClust)
+    parser_hclust.set_defaults(work_func=processQueueClust)
+    parser_hclust.set_defaults(collect_func=collectQueueClust)
+    parser_hclust.set_defaults(cluster_func=hierClust)
+
+    return parser
+
+
+if __name__ == '__main__':
+    """
+    Parses command line arguments and calls main function
+    """
+    # Parse arguments
+    parser = getArgParser()
+    args = parser.parse_args()
+    args_dict = parseCommonArgs(args)
+    # Convert case of fields
+    if 'seq_field' in args_dict:
+        args_dict['seq_field'] = args_dict['seq_field'].upper()
+    if 'fields' in args_dict and args_dict['fields'] is not None:
+        args_dict['fields'] = [f.upper() for f in args_dict['fields']]
+
+    # Define clone_args
+    if args.command == 'bygroup':
+        args_dict['group_args'] = {'fields': args_dict['fields'],
+                                   'action': args_dict['action'],
+                                   'mode':args_dict['mode']}
+        args_dict['clone_args'] = {'model':  args_dict['model'],
+                                   'distance':  args_dict['distance'],
+                                   'norm': args_dict['norm'],
+                                   'sym': args_dict['sym'],
+                                   'linkage': args_dict['linkage'],
+                                   'seq_field':  args_dict['seq_field']}
+
+        # TODO: can be cleaned up with abstract model class
+        args_dict['clone_args']['dist_mat'] = getModelMatrix(args_dict['model'])
+
+        del args_dict['fields']
+        del args_dict['action']
+        del args_dict['mode']
+        del args_dict['model']
+        del args_dict['distance']
+        del args_dict['norm']
+        del args_dict['sym']
+        del args_dict['linkage']
+        del args_dict['seq_field']
+
+    # Define clone_args
+    if args.command == 'hclust':
+        dist_funcs = {'chen2010':distChen2010, 'ademokun2011':distAdemokun2011}
+        args_dict['clone_func'] = dist_funcs[args_dict['method']]
+        args_dict['cluster_args'] = {'method':  args_dict['method']}
+        #del args_dict['fields']
+        del args_dict['method']
+
+    # Call defineClones
+    del args_dict['command']
+    del args_dict['db_files']
+    for f in args.__dict__['db_files']:
+        args_dict['db_file'] = f
+        defineClones(**args_dict)
\ No newline at end of file
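Usage sketch for the bygroup subcommand, using the option names passed by change_o/define_clones.sh and the defaults defined above (input file name hypothetical):

    python3 DefineClones.py bygroup -d input.tab --mode gene --act first --model hs1f \
        --dist 0.0 --norm len --sym avg --link single --nproc 4 --outdir outdir --outname output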
diff -r 000000000000 -r 8a5a2abbb870 change_o/MakeDb.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/MakeDb.py	Mon Aug 29 05:36:10 2016 -0400
@@ -0,0 +1,1025 @@
+#!/usr/bin/env python3
+"""
+Create tab-delimited database file to store sequence alignment information
+"""
+# Info
+__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
+from changeo import __version__, __date__
+
+# Imports
+import csv
+import os
+import re
+import sys
+import pandas as pd
+import tarfile
+import zipfile
+from argparse import ArgumentParser
+from collections import OrderedDict
+from itertools import groupby
+from shutil import rmtree
+from tempfile import mkdtemp
+from textwrap import dedent
+from time import time
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+
+# Presto and changeo imports
+from presto.Defaults import default_out_args
+from presto.Annotation import parseAnnotation
+from presto.IO import countSeqFile, printLog, printProgress
+from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
+from changeo.IO import getDbWriter, countDbFile, getRepo
+from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \
+                             j_allele_regex
+
+# Default parameters
+default_delimiter = ('\t', ',', '-')
+
+
+def gapV(ig_dict, repo_dict):
+    """
+    Insert gaps into V region and update alignment information
+
+    Arguments:
+    ig_dict : Dictionary of parsed IgBlast output
+    repo_dict : Dictionary of IMGT gapped germline sequences
+
+    Returns:
+    dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields
+    """
+
+    seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ']
+
+    # Find gapped germline V segment
+    vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first')
+    vkey = (vgene, )
+    #TODO: Figure out else case
+    if vkey in repo_dict:
+        vgap = repo_dict[vkey]
+        # Iterate over gaps in the germline segment
+        gaps = re.finditer(r'\.', vgap)
+        gapcount = int(ig_dict['V_GERM_START_VDJ'])-1
+        for gap in gaps:
+            i = gap.start()
+            # Break if gap begins after V region
+            if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount:
+                break
+            # Insert gap into IMGT sequence
+            seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
+            # Update gap counter
+            gapcount += 1
+        ig_dict['SEQUENCE_IMGT'] = seq_imgt
+        # Update IMGT positioning information for V
+        ig_dict['V_GERM_START_IMGT'] = 1
+        ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount
+
+    return ig_dict
+
+
+def getIMGTJunc(ig_dict, repo_dict):
+    """
+    Identify junction region by IMGT definition
+
+    Arguments:
+    ig_dict : Dictionary of parsed IgBlast output
+    repo_dict : Dictionary of IMGT gapped germline sequences
+
+    Returns:
+    dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields
+    """
+    # Find germline J segment
+    jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first')
+    jkey = (jgene, )
+    #TODO: Figure out else case
+    if jkey in repo_dict:
+        # Get germline J sequence
+        jgerm = repo_dict[jkey]
+        jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1]
+        # Look for (F|W)GXG aa motif in nt sequence
+        motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm)
+        aa_end = len(ig_dict['SEQUENCE_IMGT'])
+        #TODO: Figure out else case
+        if motif:
+            # print('\n', motif.group())
+            aa_end = motif.start() - len(jgerm) + 3
+        # Add fields to dict
+        ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end]
+        ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION'])
+
+    return ig_dict
+
+
+def getRegions(ig_dict):
+    """
+    Identify FWR and CDR regions by IMGT definition
+
+    Arguments:
+    ig_dict : Dictionary of parsed alignment output
+
+    Returns:
+    dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMG
[... middle of the hunk elided by the changeset viewer; text resumes mid-line ...]
+                                     P, J_SCORE, J_IDENTITY,
+                                     J_BTOP, and J_EVALUE columns.''')
+    parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
+                                help='''Specify if IMGT framework and CDR regions should be
+                                     included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                     FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                     CDR3_IMGT columns.''')
+
+    # IMGT aligner
+    parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output',
+                                        parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter)
+    imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True)
+    imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+                                help='''Either zipped IMGT output files (.zip) or a folder
+                                     containing unzipped IMGT output files (which must
+                                     include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
+                                     and 6_Junction).''')
+    parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
+                             required=False,
+                             help='List of input FASTA files containing sequences')
+    parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
+                             help='''Specify if input IDs should not be parsed to add new
+                                  columns to database.''')
+    parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
+                             help='''Specify if alignment score metrics should be
+                                  included in the output. Adds the V_SCORE, V_IDENTITY,
+                                  J_SCORE and J_IDENTITY. Note, this will also add
+                                  the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
+                                  but they will be empty for IMGT output.''')
+    parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
+                             help='''Specify if IMGT framework and CDR regions should be
+                                  included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
+                                  FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
+                                  CDR3_IMGT columns.''')
+    parser_imgt.set_defaults(func=parseIMGT)
+
+    return parser
+
+
+if __name__ == "__main__":
+    """
+    Parses command line arguments and calls main
+    """
+    parser = getArgParser()
+    args = parser.parse_args()
+    args_dict = parseCommonArgs(args, in_arg='aligner_files')
+
+    # Set no ID parsing if sequence files are not provided
+    if 'seq_files' in args_dict and not args_dict['seq_files']:
+        args_dict['no_parse'] = True
+
+    # Delete
+    if 'seq_files' in args_dict: del args_dict['seq_files']
+    if 'aligner_files' in args_dict: del args_dict['aligner_files']
+    if 'command' in args_dict: del args_dict['command']
+    if 'func' in args_dict: del args_dict['func']
+
+    if args.command == 'imgt':
+        for i in range(len(args.__dict__['aligner_files'])):
+            args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
+            args_dict['seq_file'] = args.__dict__['seq_files'][i] \
+                                    if args.__dict__['seq_files'] else None
+            args.func(**args_dict)
+    elif args.command == 'igblast':
+        for i in range(len(args.__dict__['aligner_files'])):
+            args_dict['igblast_output'] = args.__dict__['aligner_files'][i]
+            args_dict['seq_file'] = args.__dict__['seq_files'][i]
+            args.func(**args_dict)
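Usage sketch for the imgt subcommand, as invoked by change_o/makedb.sh (archive name hypothetical):

    python3 MakeDb.py imgt -i imgt_output.zip --outdir outdir --outname output --scores --regions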
b |
diff -r 000000000000 -r 8a5a2abbb870 change_o/define_clones.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/change_o/define_clones.r Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,16 @@ +#summarise the DefineClones output into a clone size distribution +args <- commandArgs(trailingOnly = TRUE) + +input=args[1] #clone-pass table produced by DefineClones.py +output=args[2] + +change.o = read.table(input, header=T, sep="\t", quote="", stringsAsFactors=F) + +freq = data.frame(table(change.o$CLONE)) #number of sequences per clone +freq2 = data.frame(table(freq$Freq)) #number of clones per clone size + +freq2$final = as.numeric(freq2$Freq) * as.numeric(as.character(freq2$Var1)) #number of sequences per clone size + +names(freq2) = c("Clone size", "Nr of clones", "Nr of sequences") + +write.table(x=freq2, file=output, sep="\t",quote=F,row.names=F,col.names=T) |
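In other words, the script tabulates twice: once to get each clone's size, once to count how often each size occurs. A minimal sketch of a standalone run, with hypothetical file names:

    # input must contain the CLONE column produced by DefineClones.py
    Rscript define_clones.r output_clone-pass.tab clone_sizes.tab
    # output columns: "Clone size", "Nr of clones", "Nr of sequences"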
b |
diff -r 000000000000 -r 8a5a2abbb870 change_o/define_clones.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/change_o/define_clones.sh Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,44 @@ +#!/bin/bash +dir="$(cd "$(dirname "$0")" && pwd)" + +#usage (bygroup): define_clones.sh bygroup $input $mode $act $model $norm $sym $link $dist $output $output2 +#usage (hclust): define_clones.sh hclust $input $method $output $output2 + +type=$1 +input=$2 + +mkdir -p $PWD/outdir + +cp $input $PWD/input.tab #file has to have a ".tab" extension + +if [ "bygroup" == "$type" ] ; then + mode=$3 + act=$4 + model=$5 + norm=$6 + sym=$7 + link=$8 + dist=$9 + output=${10} + output2=${11} + + python3 $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link + #/data/users/david/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link + #/home/galaxy/anaconda3/bin/python $dir/DefineClones.py bygroup -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link + + Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1 +else + method=$3 + output=$4 + output2=$5 + + python3 $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method + #/data/users/david/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method + #/home/galaxy/anaconda3/bin/python $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method + + Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1 +fi + +cp $PWD/outdir/output_clone-pass.tab $output + +rm -rf $PWD/outdir/ |
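A concrete call makes the positional interface easier to read. A sketch for both branches; the file names are hypothetical and the option values are plausible DefineClones.py settings, not values required by this wrapper:

    # bygroup: type input mode act model norm sym link dist output output2
    bash define_clones.sh bygroup db-pass.tab gene first ham len min single 0.15 clones.tab clone_sizes.tab
    # hclust: type input method output output2
    bash define_clones.sh hclust db-pass.tab chen2010 clones.tab clone_sizes.tab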
b |
diff -r 000000000000 -r 8a5a2abbb870 change_o/makedb.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/change_o/makedb.sh Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,38 @@ +#!/bin/bash +dir="$(cd "$(dirname "$0")" && pwd)" + +input=$1 +noparse=$2 +scores=$3 +regions=$4 +output=$5 + +if [ "true" == "$noparse" ] ; then + noparse="--noparse" +else + noparse="" +fi + +if [ "true" == "$scores" ] ; then + scores="--scores" +else + scores="" +fi + +if [ "true" == "$regions" ] ; then + regions="--regions" +else + regions="" +fi + +mkdir -p $PWD/outdir #-p: do not fail if outdir already exists + +echo "makedb: $PWD/outdir" + +python3 $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions +#/data/users/david/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions +#/home/galaxy/anaconda3/bin/python $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions + +mv $PWD/outdir/output_db-pass.tab $output + +rm -rf $PWD/outdir/ |
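The wrapper expects the literal strings "true"/"false" for the three flags, matching how the Galaxy XML passes booleans. A minimal sketch with hypothetical file names:

    # args: input noparse scores regions output
    bash makedb.sh sample_imgt.zip false true true sample_db-pass.tab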
b |
diff -r 000000000000 -r 8a5a2abbb870 datatypes_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Mon Aug 29 05:36:10 2016 -0400 |
b |
@@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<datatypes> + <registration> + <datatype extension="imgt_archive" type="galaxy.datatypes.binary:CompressedArchive" display_in_upload="True" subclass="True"/> + </registration> +</datatypes> |
b |
diff -r 000000000000 -r 8a5a2abbb870 gene_identification.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_identification.py Mon Aug 29 05:36:10 2016 -0400 |
[ |
b'@@ -0,0 +1,220 @@\n+import re\n+import argparse\n+import time\n+starttime= int(time.time() * 1000)\n+\n+parser = argparse.ArgumentParser()\n+parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file")\n+parser.add_argument("--output", help="The annotated output file to be merged back with the summary file")\n+\n+args = parser.parse_args()\n+\n+infile = args.input\n+#infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt"\n+output = args.output\n+#outfile = "identified.txt"\n+\n+dic = dict()\n+total = 0\n+\n+\n+first = True\n+IDIndex = 0\n+seqIndex = 0\n+\n+with open(infile, \'r\') as f: #read all sequences into a dictionary as key = ID, value = sequence\n+\tfor line in f:\n+\t\ttotal += 1\n+\t\tlinesplt = line.split("\\t")\n+\t\tif first:\n+\t\t\tprint "linesplt", linesplt\n+\t\t\tIDIndex = linesplt.index("Sequence ID")\n+\t\t\tseqIndex = linesplt.index("Sequence")\n+\t\t\tfirst = False\n+\t\t\tcontinue\n+\t\t\n+\t\tID = linesplt[IDIndex]\n+\t\tif len(linesplt) < 28: #weird rows without a sequence\n+\t\t\tdic[ID] = ""\n+\t\telse:\n+\t\t\tdic[ID] = linesplt[seqIndex]\n+\t\t\t\n+print "Number of input sequences:", len(dic)\n+\n+#old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc\n+#old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag\n+\n+#lambda/kappa reference sequence\n+searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",\n+ "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",\n+ "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence\n+\n+compiledregex = {"ca": [],\n+ "cg": [],\n+ "cm": []}\n+\n+#lambda/kappa reference sequence variable nucleotides\n+ca1 = {38: \'t\', 39: \'g\', 48: \'a\', 49: \'g\', 51: \'c\', 68: \'a\', 73: \'c\'}\n+ca2 = {38: \'g\', 39: \'a\', 48: \'c\', 49: \'c\', 51: \'a\', 68: \'g\', 73: \'a\'}\n+cg1 = {0: \'c\', 33: \'a\', 38: \'c\', 44: \'a\', 54: \'t\', 56: \'g\', 58: \'g\', 66: \'g\', 132: \'c\'}\n+cg2 = {0: \'c\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'c\', 56: \'a\', 58: \'a\', 66: \'g\', 132: \'t\'}\n+cg3 = {0: \'t\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'t\', 56: \'g\', 58: \'g\', 66: \'g\', 132: \'c\'}\n+cg4 = {0: \'t\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'c\', 56: \'a\', 58: \'a\', 66: \'c\', 132: \'c\'}\n+\n+#remove last snp for shorter cg sequence --- note, also change varsInCG\n+del cg1[132]\n+del cg2[132]\n+del cg3[132]\n+del cg4[132]\n+\n+#reference sequences are cut into smaller parts of \'chunklength\' length, and with \'chunklength\' / 2 overlap\n+chunklength = 8\n+\n+#create the chunks of the reference sequence with regular expressions for the variable nucleotides\n+for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength / 2):\n+ pos = i\n+ chunk = searchstrings["ca"][i:i+chunklength]\n+ result = ""\n+ varsInResult = 0\n+ for c in chunk:\n+ if pos in ca1.keys():\n+ varsInResult += 1\n+ result += "[" + ca1[pos] + ca2[pos] + "]"\n+ else:\n+ result += c\n+ pos += 1\n+ compiledregex["ca"].append((re.compile(result), varsInResult))\n+\n+for i in range(0, len(searchstrings["cg"]) - chunklength, chunklength / 2):\n+ pos = i\n+ chunk = searchstrings["cg"][i:i+chunklength]\n+ result = ""\n+ varsInResult = 0\n+ for c in chunk:\n+ if pos in cg1.keys():\n+ varsInResult += 1\n+ result += "[" + "".join(set([cg1[pos], cg2[pos], cg3[pos], cg4[pos]])) + "]"\n+ else:\n+ result += c\n+ pos += 1\n+ 
compiledregex["cg"].append((re.compile(result), varsInResult))\n+\n+for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2):\n+ compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False))\n+\n+\n+\n+def removeAndReturnMaxIndex(x): #simplifies a list comprehension\n+ m = max(x)\n+ index = x.index(m)\n+ x[index] = 0\n+ return index\n+ \n+\n+start_location = dict()\n+hits = dict()\n+alltotal = 0\n+for key in compiledregex.keys(): #for ca/cg/cm\n+\tregularexpressions = compiledregex[key] #get the compiled regular expressions\n+\tfor ID in dic.key'..b'ts["cg2"] += len([1 for x in cg2 if chunkstart <= x < chunkend and cg2[x] == seq[lastindex + x - chunkstart]])\n+\t\t\t\t\t\tcurrentIDHits["cg3"] += len([1 for x in cg3 if chunkstart <= x < chunkend and cg3[x] == seq[lastindex + x - chunkstart]])\n+\t\t\t\t\t\tcurrentIDHits["cg4"] += len([1 for x in cg4 if chunkstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]])\n+\t\t\t\t\telse: #key == "cm" #no variable regions in \'cm\'\n+\t\t\t\t\t\tpass\n+\t\t\t\tbreak #this only breaks when there was a match with the regex, breaking means the \'else:\' clause is skipped\n+\t\t\telse: #only runs if there were no hits\n+\t\t\t\tcontinue\n+\t\t\t#print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it\'s now:", start[lastindex - chunklength / 2 * i]\n+\t\t\tcurrentIDHits[key + "_hits"] += 1\n+\t\tstart_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1 - start_zero) for x in range(5) if len(start) > 0 and max(start) > 1])\n+\t\t#start_location[ID + "_" + key] = str(start.index(max(start)))\n+\n+\n+chunksInCA = len(compiledregex["ca"])\n+chunksInCG = len(compiledregex["cg"])\n+chunksInCM = len(compiledregex["cm"])\n+requiredChunkPercentage = 0.7\n+varsInCA = float(len(ca1.keys()) * 2)\n+varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn\'t hit the first and last nt twice\n+varsInCM = 0\n+\n+\n+\n+first = True\n+seq_write_count=0\n+with open(infile, \'r\') as f: #read all sequences into a dictionary as key = ID, value = sequence\n+\twith open(output, \'w\') as o:\n+\t\tfor line in f:\n+\t\t\ttotal += 1\n+\t\t\tif first:\n+\t\t\t\to.write("Sequence ID\\tbest_match\\tnt_hit_percentage\\tchunk_hit_percentage\\tstart_locations\\n")\n+\t\t\t\tfirst = False\n+\t\t\t\tcontinue\n+\t\t\tlinesplt = line.split("\\t")\n+\t\t\tif linesplt[2] == "No results":\n+\t\t\t\tpass\n+\t\t\tID = linesplt[1]\n+\t\t\tcurrentIDHits = hits[ID]\n+\t\t\tpossibleca = float(len(compiledregex["ca"]))\n+\t\t\tpossiblecg = float(len(compiledregex["cg"]))\n+\t\t\tpossiblecm = float(len(compiledregex["cm"]))\n+\t\t\tcahits = currentIDHits["ca_hits"]\n+\t\t\tcghits = currentIDHits["cg_hits"]\n+\t\t\tcmhits = currentIDHits["cm_hits"]\n+\t\t\tif cahits >= cghits and cahits >= cmhits: #its a ca gene\n+\t\t\t\tca1hits = currentIDHits["ca1"]\n+\t\t\t\tca2hits = currentIDHits["ca2"]\n+\t\t\t\tif ca1hits >= ca2hits:\n+\t\t\t\t\to.write(ID + "\\tca1\\t" + str(int(ca1hits / varsInCA * 100)) + "\\t" + str(int(cahits / possibleca * 100)) + "\\t" + start_location[ID + "_ca"] + "\\n")\n+\t\t\t\telse:\n+\t\t\t\t\to.write(ID + "\\tca2\\t" + str(int(ca2hits / varsInCA * 100)) + "\\t" + str(int(cahits / possibleca * 100)) + "\\t" + start_location[ID + "_ca"] + "\\n")\n+\t\t\telif cghits >= cahits and cghits >= cmhits: #its a cg gene\n+\t\t\t\tcg1hits = currentIDHits["cg1"]\n+\t\t\t\tcg2hits = 
currentIDHits["cg2"]\n+\t\t\t\tcg3hits = currentIDHits["cg3"]\n+\t\t\t\tcg4hits = currentIDHits["cg4"]\n+\t\t\t\tif cg1hits >= cg2hits and cg1hits >= cg3hits and cg1hits >= cg4hits: #cg1 gene\n+\t\t\t\t\to.write(ID + "\\tcg1\\t" + str(int(cg1hits / varsInCG * 100)) + "\\t" + str(int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\t\telif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene\n+\t\t\t\t\to.write(ID + "\\tcg2\\t" + str(int(cg2hits / varsInCG * 100)) + "\\t" + str(int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\t\telif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene\n+\t\t\t\t\to.write(ID + "\\tcg3\\t" + str(int(cg3hits / varsInCG * 100)) + "\\t" + str(int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\t\telse: #cg4 gene\n+\t\t\t\t\to.write(ID + "\\tcg4\\t" + str(int(cg4hits / varsInCG * 100)) + "\\t" + str(int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\telse: #its a cm gene\n+\t\t\t\to.write(ID + "\\tcm\\t100\\t" + str(int(cmhits / possiblecm * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\tseq_write_count += 1\n+\n+print "Time: %i" % (int(time.time() * 1000) - starttime)\n+\n+print "Number of sequences written to file:", seq_write_count\n+\n+\n+\n+\n+\n' |
b |
diff -r 000000000000 -r 8a5a2abbb870 imgt_loader.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/imgt_loader.r Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,82 @@ +args <- commandArgs(trailingOnly = TRUE) + +summ.file = args[1] +aa.file = args[2] +junction.file = args[3] +out.file = args[4] + +summ = read.table(summ.file, sep="\t", header=T, quote="", fill=T) +aa = read.table(aa.file, sep="\t", header=T, quote="", fill=T) +junction = read.table(junction.file, sep="\t", header=T, quote="", fill=T) + +old_summary_columns=c('Sequence.ID','JUNCTION.frame','V.GENE.and.allele','D.GENE.and.allele','J.GENE.and.allele','CDR1.IMGT.length','CDR2.IMGT.length','CDR3.IMGT.length','Orientation') +old_sequence_columns=c('CDR1.IMGT','CDR2.IMGT','CDR3.IMGT') +old_junction_columns=c('JUNCTION') + +added_summary_columns=c('Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence') +added_sequence_columns=c('FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT') + +added_junction_columns=c('P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION') +added_junction_columns=c(added_junction_columns, 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb') + +out=summ[,c("Sequence.ID","JUNCTION.frame","V.GENE.and.allele","D.GENE.and.allele","J.GENE.and.allele")] + +out[,"CDR1.Seq"] = aa[,"CDR1.IMGT"] +out[,"CDR1.Length"] = summ[,"CDR1.IMGT.length"] + +out[,"CDR2.Seq"] = aa[,"CDR2.IMGT"] +out[,"CDR2.Length"] = summ[,"CDR2.IMGT.length"] + +out[,"CDR3.Seq"] = aa[,"CDR3.IMGT"] +out[,"CDR3.Length"] = summ[,"CDR3.IMGT.length"] + +out[,"CDR3.Seq.DNA"] = junction[,"JUNCTION"] +out[,"CDR3.Length.DNA"] = nchar(as.character(junction[,"JUNCTION"])) +out[,"Strand"] = summ[,"Orientation"] +out[,"CDR3.Found.How"] = "a" + +out[,added_summary_columns] = summ[,added_summary_columns] + +out[,added_sequence_columns] = aa[,added_sequence_columns] + +out[,added_junction_columns] = junction[,added_junction_columns] + +out[,"Top V Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"V.GENE.and.allele"])) +out[,"Top D Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"D.GENE.and.allele"])) +out[,"Top J Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"J.GENE.and.allele"])) + +out = out[,c('Sequence.ID','JUNCTION.frame','Top V Gene','Top D Gene','Top J Gene','CDR1.Seq','CDR1.Length','CDR2.Seq','CDR2.Length','CDR3.Seq','CDR3.Length','CDR3.Seq.DNA','CDR3.Length.DNA','Strand','CDR3.Found.How','Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence','FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT','P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 
'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')] + +names(out) = c('ID','VDJ Frame','Top V Gene','Top D Gene','Top J Gene','CDR1 Seq','CDR1 Length','CDR2 Seq','CDR2 Length','CDR3 Seq','CDR3 Length','CDR3 Seq DNA','CDR3 Length DNA','Strand','CDR3 Found How','Functionality','V-REGION identity %','V-REGION identity nt','D-REGION reading frame','AA JUNCTION','Functionality comment','Sequence','FR1-IMGT','FR2-IMGT','FR3-IMGT','CDR3-IMGT','JUNCTION','J-REGION','FR4-IMGT','P3V-nt nb','N-REGION-nt nb','N1-REGION-nt nb','P5D-nt nb','P3D-nt nb','N2-REGION-nt nb','P5J-nt nb','3V-REGION trimmed-nt nb','5D-REGION trimmed-nt nb','3D-REGION trimmed-nt nb','5J-REGION trimmed-nt nb','N-REGION','N1-REGION','N2-REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb') + +out[,"VDJ Frame"] = as.character(out[,"VDJ Frame"]) + +fltr = out[,"VDJ Frame"] == "in-frame" +if(any(fltr)){ + out[fltr, "VDJ Frame"] = "In-frame" +} + +fltr = out[,"VDJ Frame"] == "null" +if(any(fltr)){ + out[fltr, "VDJ Frame"] = "Out-of-frame" +} + +fltr = out[,"VDJ Frame"] == "out-of-frame" +if(any(fltr)){ + out[fltr, "VDJ Frame"] = "Out-of-frame" +} + +fltr = out[,"VDJ Frame"] == "" +if(any(fltr)){ + out[fltr, "VDJ Frame"] = "Out-of-frame" +} + +for(col in c('Top V Gene','Top D Gene','Top J Gene')){ + out[,col] = as.character(out[,col]) + fltr = out[,col] == "" + if(any(fltr)){ + out[fltr,col] = "NA" + } +} + +write.table(out, out.file, sep="\t", quote=F, row.names=F, col.names=T) |
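imgt_loader.r flattens three IMGT/HighV-QUEST tables into the single tab-separated layout used by the rest of the pipeline. A minimal sketch; the exact IMGT file names vary per export, so these are placeholders for the summary, AA-sequences and junction tables:

    # args: summary file, AA-sequences file, junction file, output
    Rscript imgt_loader.r 1_Summary.txt 5_AA-sequences.txt 6_Junction.txt loaded.tab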
b |
diff -r 000000000000 -r 8a5a2abbb870 merge.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/merge.r Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,27 @@ +args <- commandArgs(trailingOnly = TRUE) + +input.1 = args[1] +input.2 = args[2] + +fields.1 = args[3] +fields.2 = args[4] + +field.1 = args[5] +field.2 = args[6] + +output = args[7] + +dat1 = read.table(input.1, header=T, sep="\t", quote="", stringsAsFactors=F, fill=T, row.names=NULL) +if(fields.1 != "all"){ + fields.1 = unlist(strsplit(fields.1, ",")) + dat1 = dat1[,fields.1] +} +dat2 = read.table(input.2, header=T, sep="\t", quote="", stringsAsFactors=F, fill=T, row.names=NULL) +if(fields.2 != "all"){ + fields.2 = unlist(strsplit(fields.2, ",")) + dat2 = dat2[,fields.2] +} + +dat3 = merge(dat1, dat2, by.x=field.1, by.y=field.2) + +write.table(dat3, output, sep="\t",quote=F,row.names=F,col.names=T) |
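merge.r is a small generic join: "all" keeps every column of a table, otherwise a comma-separated list selects columns before merging on the two key fields. For example (hypothetical files), annotating the IMGT summary with the class calls:

    # args: input1 input2 fields1 fields2 key1 key2 output
    Rscript merge.r summary.tab identified.txt all all Sequence.ID Sequence.ID merged.tab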
b |
diff -r 000000000000 -r 8a5a2abbb870 merge_and_filter.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/merge_and_filter.r Mon Aug 29 05:36:10 2016 -0400 |
[ |
b'@@ -0,0 +1,209 @@\n+args <- commandArgs(trailingOnly = TRUE)\r\n+\r\n+\r\n+summaryfile = args[1]\r\n+sequencesfile = args[2]\r\n+mutationanalysisfile = args[3]\r\n+mutationstatsfile = args[4]\r\n+hotspotsfile = args[5]\r\n+gene_identification_file= args[6]\r\n+output = args[7]\r\n+before.unique.file = args[8]\r\n+unmatchedfile = args[9]\r\n+method=args[10]\r\n+functionality=args[11]\r\n+unique.type=args[12]\r\n+filter.unique=args[13]\r\n+class.filter=args[14]\r\n+empty.region.filter=args[15]\r\n+\r\n+summ = read.table(summaryfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+sequences = read.table(sequencesfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+mutationstats = read.table(mutationstatsfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+hotspots = read.table(hotspotsfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+gene_identification = read.table(gene_identification_file, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+\r\n+if(method == "blastn"){\r\n+\t"qseqid\\tsseqid\\tpident\\tlength\\tmismatch\\tgapopen\\tqstart\\tqend\\tsstart\\tsend\\tevalue\\tbitscore"\r\n+\tgene_identification = gene_identification[!duplicated(gene_identification$qseqid),]\r\n+\tref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52))\r\n+\tgene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T)\r\n+\tgene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100\r\n+\tgene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")]\r\n+\tcolnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")\r\n+\t\r\n+}\r\n+\r\n+input.sequence.count = nrow(summ)\r\n+print(paste("Number of sequences in summary file:", input.sequence.count))\r\n+\r\n+filtering.steps = data.frame(character(0), numeric(0))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("Input", input.sequence.count))\r\n+\r\n+filtering.steps[,1] = as.character(filtering.steps[,1])\r\n+filtering.steps[,2] = as.character(filtering.steps[,2])\r\n+#filtering.steps[,3] = as.numeric(filtering.steps[,3])\r\n+\r\n+summ = merge(summ, gene_identification, by="Sequence.ID")\r\n+\r\n+summ = summ[summ$Functionality != "No results",]\r\n+\r\n+print(paste("Number of sequences after \'No results\' filter:", nrow(summ)))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("After \'No results\' filter", nrow(summ)))\r\n+\r\n+if(functionality == "productive"){\r\n+\tsumm = summ[summ$Functionality == "productive (see comment)" | summ$Functionality == "productive",]\r\n+} else if (functionality == "unproductive"){\r\n+\tsumm = summ[summ$Functionality == "unproductive (see comment)" | summ$Functionality == "unproductive",]\r\n+} else if (functionality == "remove_unknown"){\r\n+\tsumm = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",]\r\n+}\r\n+\r\n+print(paste("Number of sequences after productive filter:", nrow(summ)))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("After productive filter", nrow(summ)))\r\n+\r\n+splt = strsplit(class.filter, "_")[[1]]\r\n+chunk_hit_threshold = as.numeric(splt[1])\r\n+nt_hit_threshold = 
as.numeric(splt[2])\r\n+\r\n+higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold)\r\n+\r\n+unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]\r\n+\r\n+if(!all(higher_than, na.rm=T)){ #check for \'not all\' because that would mean the unmatched set is empty\r\n+\tunmatched = summ[!higher_than,]\r\n+\tunmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]\r\n+\tunmatched$best_match = paste("unmatched,", unmatched$best_match)\r\n+\tsumm[!higher_than,"best_match"] = paste("unmatche'..b'.steps = rbind(filtering.steps, c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result)))\r\n+} else if(empty.region.filter == "CDR1"){\r\n+\tresult = result[result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]\r\n+\tprint(paste("Number of sequences after empty FR2, CDR2 and FR3 column filter:", nrow(result)))\r\n+\tfiltering.steps = rbind(filtering.steps, c("After empty FR2, CDR2, FR3 filter", nrow(result)))\r\n+} else if(empty.region.filter == "FR2"){\r\n+\tresult = result[result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]\r\n+\tprint(paste("Number of sequences after empty CDR2 and FR3 column filter:", nrow(result)))\r\n+\tfiltering.steps = rbind(filtering.steps, c("After empty CDR2, FR3 filter", nrow(result)))\r\n+}\r\n+\r\n+result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]\r\n+\r\n+print(paste("Number of sequences in result after n filtering:", nrow(result)))\r\n+filtering.steps = rbind(filtering.steps, c("After N filter", nrow(result)))\r\n+\r\n+cleanup_columns = c("FR1.IMGT.Nb.of.mutations", \r\n+ "CDR1.IMGT.Nb.of.mutations", \r\n+ "FR2.IMGT.Nb.of.mutations", \r\n+ "CDR2.IMGT.Nb.of.mutations", \r\n+ "FR3.IMGT.Nb.of.mutations")\r\n+\r\n+for(col in cleanup_columns){\r\n+ result[,col] = gsub("\\\\(.*\\\\)", "", result[,col])\r\n+ result[,col] = as.numeric(result[,col])\r\n+ result[is.na(result[,col]),] = 0\r\n+}\r\n+\r\n+write.table(result, before.unique.file, sep="\\t", quote=F,row.names=F,col.names=T)\r\n+\r\n+if(filter.unique != "no"){\r\n+\tclmns = names(result)\r\n+\t\r\n+\tif(grepl("_c", filter.unique)){\r\n+\t\tresult$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq, result$best_match)\r\n+\t} else {\r\n+\t\tresult$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\r\n+\t}\r\n+\r\n+\t#fltr = result$unique.def %in% result.filtered$unique.def\r\n+\t\t\r\n+\tif(grepl("keep", filter.unique)){\r\n+\t\tresult$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes\r\n+\t\tresult = result[!duplicated(result$unique.def),]\r\n+\t} else {\r\n+\t\tresult = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]\r\n+\t\tresult$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes\r\n+\t\tresult = result[!duplicated(result$unique.def),]\r\n+\t}\r\n+\t\r\n+\t#result = result[,clmns]\r\n+\t\r\n+\t#write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)\r\n+}\r\n+\r\n+print(paste("Number of sequences in result after 
CDR/FR filtering:", nrow(result)))\r\n+print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),])))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("After unique filter", nrow(result)))\r\n+\r\n+print(paste("Number of rows in result:", nrow(result)))\r\n+print(paste("Number of rows in unmatched:", nrow(unmatched)))\r\n+\r\n+matched.sequences.count = sum(!grepl("^unmatched", result$best_match))\r\n+unmatched.sequences.count = sum(grepl("^unmatched", result$best_match))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))\r\n+filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))\r\n+filtering.steps[,2] = as.numeric(filtering.steps[,2])\r\n+filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2)\r\n+\r\n+write.table(x=filtering.steps, file=gsub("unmatched", "filtering_steps", unmatchedfile), sep="\\t",quote=F,row.names=F,col.names=F)\r\n+\r\n+write.table(x=result, file=output, sep="\\t",quote=F,row.names=F,col.names=T)\r\n+write.table(x=unmatched, file=unmatchedfile, sep="\\t",quote=F,row.names=F,col.names=T)\r\n' |
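The fifteen positional arguments are easiest to see spelled out. A sketch with hypothetical file names for the numbered IMGT export tables; the trailing option values (custom, productive, the duplicate-filter columns, no, 70_70, FR1) correspond to defaults offered in mutation_analysis.xml below:

    Rscript merge_and_filter.r 1_Summary.txt 3_Nt-sequences.txt 7_V-REGION-mutation-and-AA-change-table.txt 8_V-REGION-nt-mutation-statistics.txt 10_V-REGION-mutation-hotspots.txt identified.txt merged.tab before_unique.tab unmatched.tab custom productive VGene,AA.JUNCTION,best_match no 70_70 FR1
    # '70_70' is split on '_' into the chunk-hit and nt-hit thresholds (both >= 70%)
    # 'FR1' means the reads start at FR1, so rows with an empty CDR1, FR2, CDR2 or FR3 are dropped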
b |
diff -r 000000000000 -r 8a5a2abbb870 mutation_analysis.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.py Mon Aug 29 05:36:10 2016 -0400 |
[ |
b'@@ -0,0 +1,290 @@\n+from __future__ import division\n+from collections import defaultdict\n+import re\n+import argparse\n+\n+parser = argparse.ArgumentParser()\n+parser.add_argument("--input",\n+\t\t\t\t\thelp="The \'7_V-REGION-mutation-and-AA-change-table\' and \'10_V-REGION-mutation-hotspots\' merged together, with an added \'best_match\' annotation")\n+parser.add_argument("--genes", help="The genes available in the \'best_match\' column")\n+parser.add_argument("--includefr1", help="Should the mutation/nucleotides in the FR1 region be included?")\n+parser.add_argument("--output", help="Output file")\n+\n+args = parser.parse_args()\n+\n+infile = args.input\n+genes = str(args.genes).split(",")\n+print "includefr1 =", args.includefr1\n+include_fr1 = True if args.includefr1 == "yes" else False\n+outfile = args.output\n+\n+genedic = dict()\n+\n+mutationdic = dict()\n+mutationMatcher = re.compile("^(.)(\\d+).(.),?(.)?(\\d+)?.?(.)?(.?.?.?.?.?)?")\n+NAMatchResult = (None, None, None, None, None, None, \'\')\n+linecount = 0\n+\n+IDIndex = 0\n+best_matchIndex = 0\n+fr1Index = 0\n+cdr1Index = 0\n+fr2Index = 0\n+cdr2Index = 0\n+fr3Index = 0\n+first = True\n+IDlist = []\n+mutationList = []\n+mutationListByID = {}\n+cdr1LengthDic = {}\n+cdr2LengthDic = {}\n+\n+with open(infile, \'r\') as i:\n+\tfor line in i:\n+\t\tif first:\n+\t\t\tlinesplt = line.split("\\t")\n+\t\t\tIDIndex = linesplt.index("Sequence.ID")\n+\t\t\tbest_matchIndex = linesplt.index("best_match")\n+\t\t\tfr1Index = linesplt.index("FR1.IMGT")\n+\t\t\tcdr1Index = linesplt.index("CDR1.IMGT")\n+\t\t\tfr2Index = linesplt.index("FR2.IMGT")\n+\t\t\tcdr2Index = linesplt.index("CDR2.IMGT")\n+\t\t\tfr3Index = linesplt.index("FR3.IMGT")\n+\t\t\tcdr1LengthIndex = linesplt.index("CDR1.IMGT.length")\n+\t\t\tcdr2LengthIndex = linesplt.index("CDR2.IMGT.length")\n+\t\t\tfirst = False\n+\t\t\tcontinue\n+\t\tlinecount += 1\n+\t\tlinesplt = line.split("\\t")\n+\t\tID = linesplt[IDIndex]\n+\t\tgenedic[ID] = linesplt[best_matchIndex]\n+\t\ttry:\n+\t\t\tif linesplt[fr1Index] != "NA":\n+\t\t\t\tmutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if include_fr1 else []\n+\t\t\telse:\n+\t\t\t\tmutationdic[ID + "_FR1"] = []\n+\t\t\tmutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if linesplt[cdr1Index] != "NA" else []\n+\t\t\tmutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if linesplt[fr2Index] != "NA" else []\n+\t\t\tmutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if linesplt[cdr2Index] != "NA" else []\n+\t\t\tmutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]\n+\t\t\tmutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else []\n+\t\texcept e:\n+\t\t\tprint linesplt\n+\t\t\tprint linecount\n+\t\t\tprint e\n+\t\tmutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]\n+\t\tmutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]\n+\n+\t\tcdr1Length = linesplt[cdr1LengthIndex]\n+\t\tcdr2Length = linesplt[cdr2LengthIndex]\n+\n+\t\tcdr1LengthDic[ID] = int(cdr1Length) if cdr1Length != "X" else 0\n+\t\tcdr2LengthDic[ID] = 
int(cdr2Length) if cdr2Length != "X" else 0\n+\t\t\n+\t\tIDlist += [ID]\n+\n+AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent\n+if AALength < 60:\n+\tAALength = 64\n+\n+AA_mutation = [0] * AALength\n+AA_mutation_dic = {"ca": AA_mutation[:], "cg": AA_mutation[:], "cm": AA_mutation[:], "un": AA_mutation[:]}\n+AA_mutation_empty = AA_mutation[:]\n+\n+aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt"\n+with open(aa_mutations_by_id_file, \'w\') as o:\n+\to.write("ID\\tbest_match\\t" + "\\t".join([str(x) for x in range(1,AALength)]) + "\\n")\n+\tfor ID in mutat'..b'ndex]\n+\t\tID = linesplt[IDIndex]\n+\t\tif ID == "ca2":\n+\t\t\tprint linesplt\n+\t\tRGYW = [(int(x), int(y), z) for (x, y, z) in\n+\t\t\t\t[hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]]\n+\t\tWRCY = [(int(x), int(y), z) for (x, y, z) in\n+\t\t\t\t[hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]]\n+\t\tWA = [(int(x), int(y), z) for (x, y, z) in\n+\t\t\t [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]]\n+\t\tTW = [(int(x), int(y), z) for (x, y, z) in\n+\t\t\t [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]]\n+\t\tRGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0\n+\n+\t\tmutationList = (mutationdic[ID + "_FR1"] if include_fr1 else []) + mutationdic[ID + "_CDR1"] + mutationdic[\n+\t\t\tID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]\n+\t\tfor mutation in mutationList:\n+\t\t\tfrm, where, to, AAfrm, AAwhere, AAto, junk = mutation\n+\t\t\tmutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW])\n+\t\t\tmutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY])\n+\t\t\tmutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA])\n+\t\t\tmutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW])\n+\n+\t\t\tin_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW])\n+\n+\t\t\tif in_how_many_motifs > 0:\n+\t\t\t\tRGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs\n+\t\t\t\tWRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs\n+\t\t\t\tWACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs\n+\t\t\t\tTWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs\n+\n+\n+def mean(lst):\n+\treturn (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0\n+\n+\n+def median(lst):\n+\tlst = sorted(lst)\n+\tl = len(lst)\n+\tif l == 0:\n+\t\treturn 0\n+\tif l == 1:\n+\t\treturn lst[0]\n+\t\t\n+\tl = int(l / 2)\n+\t\n+\tif len(lst) % 2 == 0:\n+\t\treturn float(lst[l] + lst[(l - 1)]) / 2.0\n+\telse:\n+\t\treturn lst[l]\n+\n+funcs = {"mean": mean, "median": median, "sum": sum}\n+\n+directory = outfile[:outfile.rfind("/") + 1]\n+value = 0\n+valuedic = dict()\n+\n+for fname in funcs.keys():\n+\tfor gene in genes:\n+\t\twith open(directory + gene + "_" + fname + "_value.txt", \'r\') as v:\n+\t\t\tvaluedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())\n+\twith open(directory + "all_" + fname + "_value.txt", \'r\') as v:\n+\t\tvaluedic["total_" + fname] = float(v.readlines()[0].rstrip())\n+\t\n+\n+def get_xyz(lst, gene, f, fname):\n+\tx = int(round(f(lst)))\n+\ty = valuedic[gene + "_" + fname]\n+\tz = str(round(x / float(y) * 100, 1)) if y != 0 else "0"\n+\treturn (str(x), str(y), 
z)\n+\n+dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}\n+arr = ["RGYW", "WRCY", "WA", "TW"]\n+\n+geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}\n+\n+for fname in funcs.keys():\n+\tfunc = funcs[fname]\n+\tfoutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"\n+\twith open(foutfile, \'w\') as o:\n+\t\tfor typ in arr:\n+\t\t\to.write(typ + " (%)")\n+\t\t\tcurr = dic[typ]\n+\t\t\tfor gene in genes:\n+\t\t\t\tgeneMatcher = geneMatchers[gene] #re.compile("^" + gene + ".*") #recompile every loop....\n+\t\t\t\tif valuedic[gene + "_" + fname] is 0:\n+\t\t\t\t\to.write(",0,0,0")\n+\t\t\t\telse:\n+\t\t\t\t\tx, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)\n+\t\t\t\t\to.write("," + x + "," + y + "," + z)\n+\n+\t\t\tx, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)\n+\t\t\to.write("," + x + "," + y + "," + z + "\\n")\n+\n+\n+# for testing\n+seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt"\n+with open(seq_motif_file, \'w\') as o:\n+\to.write("ID\\tRGYWC\\tWRCY\\tWA\\tTW\\n")\n+\tfor ID in IDlist:\n+\t\to.write(ID + "\\t" + str(round(RGYWCount[ID], 2)) + "\\t" + str(round(WRCYCount[ID], 2)) + "\\t" + str(round(WACount[ID], 2)) + "\\t" + str(round(TWCount[ID], 2)) + "\\n")\n' |
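mutation_analysis.py is Python 2 as well. Besides --output it writes side files (aa_id_mutations.txt, the hotspot_analysis_{mean,median,sum}.txt files, motif_per_seq.txt) into the directory of --output, where it also expects the <gene>_ and all_{mean,median,sum}_value.txt files from an earlier pipeline step to already exist. A sketch with hypothetical paths and a plausible gene list:

    python2 mutation_analysis.py --input merged.tab --genes ca,ca1,ca2,cg,cg1,cg2,cg3,cg4,cm --includefr1 no --output outdir/mutations.txt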
b |
diff -r 000000000000 -r 8a5a2abbb870 mutation_analysis.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.r Mon Aug 29 05:36:10 2016 -0400 |
[ |
b'@@ -0,0 +1,477 @@\n+library(data.table)\r\n+library(ggplot2)\r\n+library(reshape2)\r\n+\r\n+args <- commandArgs(trailingOnly = TRUE)\r\n+\r\n+input = args[1]\r\n+genes = unlist(strsplit(args[2], ","))\r\n+outputdir = args[3]\r\n+include_fr1 = ifelse(args[4] == "yes", T, F)\r\n+setwd(outputdir)\r\n+\r\n+dat = read.table(input, header=T, sep="\\t", fill=T, stringsAsFactors=F)\r\n+\r\n+if(length(dat$Sequence.ID) == 0){\r\n+ setwd(outputdir)\r\n+ result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))\r\n+ row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)")\r\n+ write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)\r\n+ transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))\r\n+ row.names(transitionTable) = c("A", "C", "G", "T")\r\n+ transitionTable["A","A"] = NA\r\n+ transitionTable["C","C"] = NA\r\n+ transitionTable["G","G"] = NA\r\n+ transitionTable["T","T"] = NA\r\n+ write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)\r\n+ cat("0", file="n.txt")\r\n+ stop("No data")\r\n+}\r\n+\r\n+cleanup_columns = c("FR1.IMGT.c.a",\r\n+ "FR2.IMGT.g.t",\r\n+ "CDR1.IMGT.Nb.of.nucleotides",\r\n+ "CDR2.IMGT.t.a",\r\n+ "FR1.IMGT.c.g",\r\n+ "CDR1.IMGT.c.t",\r\n+ "FR2.IMGT.a.c",\r\n+ "FR2.IMGT.Nb.of.mutations",\r\n+ "FR2.IMGT.g.c",\r\n+ "FR2.IMGT.a.g",\r\n+ "FR3.IMGT.t.a",\r\n+ "FR3.IMGT.t.c",\r\n+ "FR2.IMGT.g.a",\r\n+ "FR3.IMGT.c.g",\r\n+ "FR1.IMGT.Nb.of.mutations",\r\n+ "CDR1.IMGT.g.a",\r\n+ "CDR1.IMGT.t.g",\r\n+ "CDR1.IMGT.g.c",\r\n+ "CDR2.IMGT.Nb.of.nucleotides",\r\n+ "FR2.IMGT.a.t",\r\n+ "CDR1.IMGT.Nb.of.mutations",\r\n+ "CDR3.IMGT.Nb.of.nucleotides",\r\n+ "CDR1.IMGT.a.g",\r\n+ "FR3.IMGT.a.c",\r\n+ "FR1.IMGT.g.a",\r\n+ "FR3.IMGT.a.g",\r\n+ "FR1.IMGT.a.t",\r\n+ "CDR2.IMGT.a.g",\r\n+ "CDR2.IMGT.Nb.of.mutations",\r\n+ "CDR2.IMGT.g.t",\r\n+ "CDR2.IMGT.a.c",\r\n+ "CDR1.IMGT.t.c",\r\n+ "FR3.IMGT.g.c",\r\n+ "FR1.IMGT.g.t",\r\n+ "FR3.IMGT.g.t",\r\n+ "CDR1.IMGT.a.t",\r\n+ "FR1.IMGT.a.g",\r\n+ "FR3.IMGT.a.t",\r\n+ "FR3.IMGT.Nb.of.nucleotides",\r\n+ "FR2.IMGT.t.c",\r\n+ "CDR2.IMGT.g.a",\r\n+ "FR2.IMGT.t.a",\r\n+ "CDR1.IMGT.t.a",\r\n+ "FR2.IMGT.t.g",\r\n+ "FR3.IMGT.t.g",\r\n+ "FR2.IMGT.Nb.of.nucleotides",\r\n+ "FR1.IMGT.t.a",\r\n+ "FR1.IMGT.t.g",\r\n+ "FR3.IMGT.c.t",\r\n+ "FR1.IMGT.t.c",\r\n+ "CDR2.IMGT.a.t",\r\n+ "FR2.IMGT.c.t",\r\n+ "CDR1.IMGT.g.t",\r\n+ "CDR2.IMGT.t.g",\r\n+ "FR1.IMGT.Nb.of.nucleotides",\r\n+ "CDR1.IMGT.c.g",\r\n+ "CDR2.IMGT.t.c",\r\n+ "FR3.IMGT.g.a",\r\n+ "CDR1.IMGT.a.c",\r\n+ "FR2.IMGT.c.a",\r\n+ "FR3.IMGT.Nb.of.mutations",\r\n+ "FR2.IMGT.c.g",\r\n+ "CDR2.IMGT.g.c",\r\n+ "FR1.IMGT.g.c",\r\n+ "CDR2.IMGT.c.t",\r\n+ "FR3.IMGT.c.a",\r\n+ "CDR1.IMGT.c.a",\r\n+ "CDR2.IMGT.c.g",\r\n+ "CDR2.IMGT.c.a",\r\n+ "FR1.IMGT.c.t",\r\n+ "FR1'..b'ble[,1] = as.character(new.table[,1])\r\n+new.table[2,1] = "Median of Number of Mutations (%)"\r\n+\r\n+#sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),]\r\n+\r\n+write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F)\r\n+\r\n+\r\n+print("Plotting ca piechart")\r\n+\r\n+dat = dat[!grepl("^unmatched", dat$best_match),]\r\n+\r\n+#blegh\r\n+genesForPlot = dat[grepl("ca", 
dat$best_match),]$best_match\r\n+if(length(genesForPlot) > 0){\r\n+\tgenesForPlot = data.frame(table(genesForPlot))\r\n+\tcolnames(genesForPlot) = c("Gene","Freq")\r\n+\tgenesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)\r\n+\r\n+\tpc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))\r\n+\tpc = pc + geom_bar(width = 1, stat = "identity")\r\n+\tpc = pc + coord_polar(theta="y")\r\n+\tpc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgA subclasses", "( n =", sum(genesForPlot$Freq), ")"))\r\n+\twrite.table(genesForPlot, "ca.txt", sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+\tpng(filename="ca.png")\r\n+\tprint(pc)\r\n+\tdev.off()\r\n+}\r\n+\r\n+print("Plotting cg piechart")\r\n+\r\n+genesForPlot = dat[grepl("cg", dat$best_match),]$best_match\r\n+if(length(genesForPlot) > 0){\r\n+\tgenesForPlot = data.frame(table(genesForPlot))\r\n+\tcolnames(genesForPlot) = c("Gene","Freq")\r\n+\tgenesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)\r\n+\r\n+\tpc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=label))\r\n+\tpc = pc + geom_bar(width = 1, stat = "identity")\r\n+\tpc = pc + coord_polar(theta="y")\r\n+\tpc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IgG subclasses", "( n =", sum(genesForPlot$Freq), ")"))\r\n+\twrite.table(genesForPlot, "cg.txt", sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+\tpng(filename="cg.png")\r\n+\tprint(pc)\r\n+\tdev.off()\r\n+}\r\n+\r\n+\r\n+print("Plotting scatterplot")\r\n+\r\n+dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)\r\n+\r\n+p = ggplot(dat, aes(best_match, percentage_mutations))\r\n+p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA)\r\n+p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot")\r\n+\r\n+png(filename="scatter.png")\r\n+print(p)\r\n+dev.off()\r\n+\r\n+write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+write.table(dat, input, sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+\r\n+print("Plotting frequency ranges plot")\r\n+\r\n+dat$best_match_class = substr(dat$best_match, 0, 2)\r\n+freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")\r\n+dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)\r\n+\r\n+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])\r\n+\r\n+p = ggplot(frequency_bins_data, aes(frequency_bins, frequency_count))\r\n+p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge")\r\n+p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class")\r\n+\r\n+png(filename="frequency_ranges.png")\r\n+print(p)\r\n+dev.off()\r\n+\r\n+frequency_bins_data_by_class = frequency_bins_data\r\n+\r\n+write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "frequency_bins")])\r\n+\r\n+write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", 
sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+\r\n+#frequency_bins_data_by_class\r\n+#frequency_ranges_subclasses.txt\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n+\r\n' |
b |
diff -r 000000000000 -r 8a5a2abbb870 mutation_analysis.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.xml Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,101 @@ +<tool id="mutation_analysis_shm" name="Mutation Analysis" version="1.0"> + <description></description> + <command interpreter="bash"> + wrapper.sh $in_file $method $out_file $out_file.files_path ${in_file.name} ${include_fr1} $functionality $unique $naive_output_ca $naive_output_cg $naive_output_cm $filter_uniques $class_filter $empty_region_filter + </command> + <inputs> + <param name="in_file" type="data" label="IMGT zip file to be analysed" /> + <param name="method" type="select" label="Method of identification of C region" help="" > + <option value="custom" selected="true">custom</option> + <option value="blastn">blastn</option> + </param> + <param name="include_fr1" type="select" label="Include mutations in FR1 region" help="" > + <option value="yes">yes</option> + <option value="no" selected="true">no</option> + </param> + <param name="functionality" type="select" label="Functionality filter" help="" > + <option value="productive" selected="true">Productive: Keep "productive" and "productive (see comment)"</option> + <option value="unproductive">Unproductive: Keep "unproductive" and "unproductive (see comment)"</option> + <option value="remove_unknown">Remove "unknown" and "unknown (see comment)"</option> + <option value="dont_filter">Don't filter</option> + </param> + <param name="filter_uniques" type="select" label="Filter unique sequences" help="See below for an example."> + <option value="remove">Remove uniques (Based on nucleotide sequence + C)</option> + <option value="keep">Keep uniques (Based on nucleotide sequence + C)</option> + <option value="no" selected="true">No</option> + </param> + <param name="unique" type="select" label="Remove duplicates based on" help="" > + <option value="VGene,AA.JUNCTION,best_match" selected="true">Top.V.Gene, CDR3.AA.Seq, C region</option> + <option value="VGene,AA.JUNCTION">Top.V.Gene, CDR3.AA.Seq</option> + <option value="AA.JUNCTION,best_match">CDR3.AA.Seq, C region</option> + <option value="AA.JUNCTION">CDR3.AA.Seq</option> + + <option value="VGene,CDR3.IMGT.seq,best_match" selected="true">Top.V.Gene, CDR3.nt.Seq, C region</option> + <option value="VGene,CDR3.IMGT.seq">Top.V.Gene, CDR3.nt.Seq</option> + <option value="CDR3.IMGT.seq,best_match">CDR3.nt.Seq, C region</option> + <option value="CDR3.IMGT.seq">CDR3.nt.Seq</option> + <option value="Sequence.ID">Don't remove duplicates</option> + </param> + <param name="class_filter" type="select" label="Class/Subclass filter" help="" > + <option value="70_70" selected="true">>70% class and >70% subclass</option> + <option value="60_55">>60% class and >55% subclass</option> + <option value="70_0">>70% class</option> + <option value="60_0">>60% class</option> + </param> + <param name="empty_region_filter" type="select" label="Sequence starts at" help="" > + <option value="FR1" selected="true">FR1: exclude empty CDR1,FR2,CDR2,FR3</option> + <option value="CDR1">CDR1: exclude empty FR2,CDR2,FR3</option> + <option value="FR2">FR2: exclude empty CDR2,FR3</option> + </param> + <conditional name="naive_output_cond"> + <param name="naive_output" type="select" label="Output new IMGT archives per class into your history?"> + <option value="yes">Yes</option> + <option value="no" selected="true">No</option> + </param> + </conditional> + </inputs> + <outputs> + <data format="html" name="out_file" label = "Mutation analysis on ${in_file.name}"/> + <data format="imgt_archive" name="naive_output_ca" label = "Naive CA input data from ${in_file.name}" > + 
<filter>naive_output_cond['naive_output'] == "yes"</filter> + </data> + <data format="imgt_archive" name="naive_output_cg" label = "Naive CG input data from ${in_file.name}" > + <filter>naive_output_cond['naive_output'] == "yes"</filter> + </data> + <data format="imgt_archive" name="naive_output_cm" label = "Naive CM input data from ${in_file.name}" > + <filter>naive_output_cond['naive_output'] == "yes"</filter> + </data> + </outputs> + <citations> + <citation type="doi">10.1093/nar/gks457</citation> + <citation type="doi">10.1093/bioinformatics/btv359</citation> + <citation type="doi">10.1186/1471-2105-10-421</citation> + </citations> + <help> + Takes an IMGT zip (http://www.imgt.org/HighV-QUEST/search.action) file and creates a summarization of the mutation analysis. + + +--------------------------+ + | unique filter | + +--------+--------+--------+ + | values | remove | keep | + +--------+--------+--------+ + | A | A | A | + +--------+--------+--------+ + | A | B | B | + +--------+--------+--------+ + | B | D | C | + +--------+--------+--------+ + | B | | D | + +--------+--------+--------+ + | C | | | + +--------+--------+--------+ + | D | | | + +--------+--------+--------+ + | D | | | + +--------+--------+--------+ + + </help> + <requirements> + <requirement type="package" version="1.0">blastn</requirement> + </requirements> +</tool> |
b |
diff -r 000000000000 -r 8a5a2abbb870 naive_output.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/naive_output.r Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,45 @@ +args <- commandArgs(trailingOnly = TRUE) + +naive.file = args[1] +shm.file = args[2] +output.file.ca = args[3] +output.file.cg = args[4] +output.file.cm = args[5] + +naive = read.table(naive.file, sep="\t", header=T, quote="", fill=T) +shm.merge = read.table(shm.file, sep="\t", header=T, quote="", fill=T) + + +final = merge(naive, shm.merge[,c("Sequence.ID", "best_match")], by.x="ID", by.y="Sequence.ID") +print(paste("nrow final:", nrow(final))) +names(final)[names(final) == "best_match"] = "Sample" +final.numeric = final[,sapply(final, is.numeric)] +final.numeric[is.na(final.numeric)] = 0 +final[,sapply(final, is.numeric)] = final.numeric + +final.ca = final[grepl("^ca", final$Sample),] +final.cg = final[grepl("^cg", final$Sample),] +final.cm = final[grepl("^cm", final$Sample),] + +if(nrow(final.ca) > 0){ + final.ca$Replicate = 1 +} + +if(nrow(final.cg) > 0){ + final.cg$Replicate = 1 +} + +if(nrow(final.cm) > 0){ + final.cm$Replicate = 1 +} + +#print(paste("nrow final:", nrow(final))) +#final2 = final +#final2$Sample = gsub("[0-9]", "", final2$Sample) +#final = rbind(final, final2) +#final$Replicate = 1 + +write.table(final.ca, output.file.ca, quote=F, sep="\t", row.names=F, col.names=T) +write.table(final.cg, output.file.cg, quote=F, sep="\t", row.names=F, col.names=T) +write.table(final.cm, output.file.cm, quote=F, sep="\t", row.names=F, col.names=T) + |
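naive_output.r attaches the class call to the naive (imgt_loader) table as a Sample column and splits it into one table per class, each with a constant Replicate of 1. A minimal sketch with hypothetical file names:

    # args: naive table, merged SHM table, then the ca, cg and cm outputs
    Rscript naive_output.r loaded.tab merged.tab naive_ca.tab naive_cg.tab naive_cm.tab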
b |
diff -r 000000000000 -r 8a5a2abbb870 new_imgt.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/new_imgt.r Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,27 @@ +args <- commandArgs(trailingOnly = TRUE) + +imgt.dir = args[1] +merged.file = args[2] +gene = args[3] + +merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F) + +if(gene != "-"){ + merged = merged[grepl(gene, merged$best_match),] +} + +merged = merged[!grepl("unmatched", merged$best_match),] + +for(f in list.files(imgt.dir, pattern="*.txt$")){ + #print(paste("filtering", f)) + path = paste(imgt.dir, f, sep="") + dat = read.table(path, header=T, sep="\t", fill=T, quote="", stringsAsFactors=F, check.names=FALSE) + + dat = dat[dat[,"Sequence ID"] %in% merged$Sequence.ID,] + + if(nrow(dat) > 0 & grepl("^8_", f)){ #change the FR1 columns to 0 in the "8_..." file + dat[,grepl("^FR1", names(dat))] = 0 + } + + write.table(dat, path, quote=F, sep="\t", row.names=F, col.names=T, na="") +} |
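new_imgt.r filters an unpacked IMGT export in place so that only the sequences present in the merged table (optionally restricted to one class) remain. A minimal sketch; the directory argument needs a trailing slash because paths are built with paste(imgt.dir, f, sep=""):

    # pass '-' as the gene to keep all classes
    Rscript new_imgt.r imgt_unpacked/ merged.tab cg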
b |
diff -r 000000000000 -r 8a5a2abbb870 pattern_plots.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pattern_plots.r Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,139 @@ +library(ggplot2) +library(reshape2) +library(scales) + +args <- commandArgs(trailingOnly = TRUE) + +input.file = args[1] #the data that's get turned into the "SHM overview" table in the html report "data_sum.txt" + +plot1.path = args[2] +plot1.png = paste(plot1.path, ".png", sep="") +plot1.txt = paste(plot1.path, ".txt", sep="") + +plot2.path = args[3] +plot2.png = paste(plot2.path, ".png", sep="") +plot2.txt = paste(plot2.path, ".txt", sep="") + +plot3.path = args[4] +plot3.png = paste(plot3.path, ".png", sep="") +plot3.txt = paste(plot3.path, ".txt", sep="") + +dat = read.table(input.file, header=F, sep=",", quote="", stringsAsFactors=F, fill=T, row.names=1) + + + +classes = c("ca", "ca1", "ca2", "cg", "cg1", "cg2", "cg3", "cg4", "cm") +xyz = c("x", "y", "z") +new.names = c(paste(rep(classes, each=3), xyz, sep="."), paste("un", xyz, sep="."), paste("all", xyz, sep=".")) + +names(dat) = new.names + +dat["RGYW.WRCY",] = colSums(dat[c(13,14),]) +dat["TW.WA",] = colSums(dat[c(15,16),]) + +data1 = dat[c("RGYW.WRCY", "TW.WA"),] + +data1 = data1[,names(data1)[grepl(".z", names(data1))]] +names(data1) = gsub("\\..*", "", names(data1)) + +data1 = melt(t(data1)) + +names(data1) = c("Class", "Type", "value") + +write.table(data1, plot1.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) + +p = ggplot(data1, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge") + ylab("% of mutations") + guides(fill=guide_legend(title=NULL)) +png(filename=plot1.png) +print(p) +dev.off() + +data2 = dat[5:8,] + +data2["sum",] = colSums(data2) + +data2 = data2[,names(data2)[grepl("\\.x", names(data2))]] +names(data2) = gsub(".x", "", names(data2)) + +data2["A/T",] = round(colSums(data2[3:4,]) / data2["sum",] * 100, 1) +data2["A/T",is.nan(unlist(data2["A/T",]))] = 0 + +data2["G/C transversions",] = round(data2[2,] / data2["sum",] * 100, 1) +data2["G/C transitions",] = round(data2[1,] / data2["sum",] * 100, 1) + + +data2["G/C transversions",is.nan(unlist(data2["G/C transversions",]))] = 0 +data2["G/C transversions",is.infinite(unlist(data2["G/C transversions",]))] = 0 +data2["G/C transitions",is.nan(unlist(data2["G/C transitions",]))] = 0 +data2["G/C transitions",is.infinite(unlist(data2["G/C transitions",]))] = 0 + +data2 = melt(t(data2[6:8,])) + +names(data2) = c("Class", "Type", "value") + +write.table(data2, plot2.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) + +p = ggplot(data2, aes(x=Class, y=value, fill=Type)) + geom_bar(position="fill", stat="identity") + scale_y_continuous(labels=percent_format()) + guides(fill=guide_legend(title=NULL)) + ylab("% of mutations") +png(filename=plot2.png) +print(p) +dev.off() + +data3 = dat[c(5, 6, 8, 17:20),] +data3 = data3[,names(data3)[grepl("\\.x", names(data3))]] +names(data3) = gsub(".x", "", names(data3)) + +data3["G/C transitions",] = round(data3[1,] / (data3[5,] + data3[7,]) * 100, 1) + +data3["G/C transversions",] = round(data3[2,] / (data3[5,] + data3[7,]) * 100, 1) + +data3["A/T",] = round(data3[3,] / (data3[4,] + data3[6,]) * 100, 1) + +data3["G/C transitions",is.nan(unlist(data3["G/C transitions",]))] = 0 +data3["G/C transitions",is.infinite(unlist(data3["G/C transitions",]))] = 0 + +data3["G/C transversions",is.nan(unlist(data3["G/C transversions",]))] = 0 +data3["G/C transversions",is.infinite(unlist(data3["G/C transversions",]))] = 0 + +data3["A/T",is.nan(unlist(data3["A/T",]))] = 0 +data3["A/T",is.infinite(unlist(data3["A/T",]))] = 0 + +data3 = melt(t(data3[8:10,])) +names(data3) = c("Class", "Type", 
"value") + +write.table(data3, plot3.txt, quote=F, sep="\t", na="", row.names=F, col.names=T) + +p = ggplot(data3, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge") + ylab("% of nucleotides") + guides(fill=guide_legend(title=NULL)) +png(filename=plot3.png) +print(p) +dev.off() + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + |
b |
diff -r 000000000000 -r 8a5a2abbb870 sequence_overview.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sequence_overview.r Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,315 @@
+library(reshape2)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+before.unique.file = args[1]
+merged.file = args[2]
+outputdir = args[3]
+gene.classes = unlist(strsplit(args[4], ","))
+hotspot.analysis.sum.file = args[5]
+NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")
+NTsum.file = paste(outputdir, "ntsum.txt", sep="/")
+main.html = "index.html"
+
+setwd(outputdir)
+
+before.unique = read.table(before.unique.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
+merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
+hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="")
+
+#before.unique = before.unique[!grepl("unmatched", before.unique$best_match),]
+
+before.unique$seq_conc = paste(before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)
+
+IDs = before.unique[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")]
+IDs$best_match = as.character(IDs$best_match)
+
+#dat = data.frame(data.table(dat)[, list(freq=.N), by=c("best_match", "seq_conc")])
+
+dat = data.frame(table(before.unique$seq_conc))
+#dat = data.frame(table(merged$seq_conc, merged$Functionality))
+
+#dat = dat[dat$Freq > 1,]
+
+#names(dat) = c("seq_conc", "Functionality", "Freq")
+names(dat) = c("seq_conc", "Freq")
+
+dat$seq_conc = factor(dat$seq_conc)
+
+dat = dat[order(as.character(dat$seq_conc)),]
+
+#writing html from R...
+get.bg.color = function(val){
+	if(val %in% c("TRUE", "FALSE", "T", "F")){ #if it's a logical value, give the background a green/red color
+		return(ifelse(val,"#eafaf1","#f9ebea"))
+	} else if (!is.na(as.numeric(val))) { #if it's a numerical value, give it a grey tint if it's >0
+		return(ifelse(val > 0,"#eaecee","white"))
+	} else {
+		return("white")
+	}
+}
+td = function(val) {
+	return(paste("<td bgcolor='", get.bg.color(val), "'>", val, "</td>", sep=""))
+}
+tr = function(val) {
+	return(paste(c("<tr>", sapply(val, td), "</tr>"), collapse=""))
+}
+
+make.link = function(id, clss, val) {
+	paste("<a href='", clss, "_", id, ".html'>", val, "</a>", sep="")
+}
+tbl = function(df) {
+	res = "<table border='1'>"
+	for(i in 1:nrow(df)){
+		res = paste(res, tr(df[i,]), sep="")
+	}
+	res = paste(res, "</table>")
+}
+
+cat("<table border='1'>", file=main.html, append=F)
+cat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)
+cat("<tr>", file=main.html, append=T)
+cat("<th>Sequence</th><th>Functionality</th><th>ca1</th><th>ca2</th><th>cg1</th><th>cg2</th><th>cg3</th><th>cg4</th><th>cm</th><th>un</th>", file=main.html, append=T)
+cat("<th>total CA</th><th>total CG</th><th>number of subclasses</th><th>present in both Ca and Cg</th><th>Ca1+Ca2</th>", file=main.html, append=T)
+cat("<th>Cg1+Cg2</th><th>Cg1+Cg3</th><th>Cg1+Cg4</th><th>Cg2+Cg3</th><th>Cg2+Cg4</th><th>Cg3+Cg4</th>", file=main.html, append=T)
+cat("<th>Cg1+Cg2+Cg3</th><th>Cg2+Cg3+Cg4</th><th>Cg1+Cg2+Cg4</th><th>Cg1+Cg3+Cg4</th><th>Cg1+Cg2+Cg3+Cg4</th>", file=main.html, append=T)
+cat("</tr>", file=main.html, append=T)
+
+single.sequences=0 #sequence only found once, skipped
+in.multiple=0 #same sequence across multiple subclasses
+multiple.in.one=0 #same sequence multiple times in one subclass
+unmatched=0 #all of the sequences are unmatched
+some.unmatched=0 #one or more sequences in a clone are unmatched
+matched=0 #should be the same as matched sequences
+
+sequence.id.page="by_id.html"
+
+for(i in 1:nrow(dat)){
+
+	ca1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca1", IDs$best_match),]
+	ca2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^ca2", IDs$best_match),]
+
+	cg1 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg1", IDs$best_match),]
+	cg2 = IDs[IDs$seq_conc == dat[i,c("seq_conc")] & grepl("^cg2", IDs$best_match),]
+	cg3 = IDs[IDs$seq_conc == dat[i,c("
[... diff truncated by the repository viewer ...]
d, "cg2", cg2.n)
+	cg3.html = make.link(id, "cg3", cg3.n)
+	cg4.html = make.link(id, "cg4", cg4.n)
+
+	cm.html = make.link(id, "cm", cm.n)
+
+	un.html = make.link(id, "un", un.n)
+
+	#extra columns
+	ca.n = ca1.n + ca2.n
+
+	cg.n = cg1.n + cg2.n + cg3.n + cg4.n
+
+	#in.classes
+
+	in.ca.cg = (ca.n > 0 & cg.n > 0)
+
+	in.ca1.ca2 = (ca1.n > 0 & ca2.n > 0)
+
+	in.cg1.cg2 = (cg1.n > 0 & cg2.n > 0)
+	in.cg1.cg3 = (cg1.n > 0 & cg3.n > 0)
+	in.cg1.cg4 = (cg1.n > 0 & cg4.n > 0)
+	in.cg2.cg3 = (cg2.n > 0 & cg3.n > 0)
+	in.cg2.cg4 = (cg2.n > 0 & cg4.n > 0)
+	in.cg3.cg4 = (cg3.n > 0 & cg4.n > 0)
+
+	in.cg1.cg2.cg3 = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0)
+	in.cg2.cg3.cg4 = (cg2.n > 0 & cg3.n > 0 & cg4.n > 0)
+	in.cg1.cg2.cg4 = (cg1.n > 0 & cg2.n > 0 & cg4.n > 0)
+	in.cg1.cg3.cg4 = (cg1.n > 0 & cg3.n > 0 & cg4.n > 0)
+
+	in.cg.all = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0 & cg4.n > 0)
+
+	#rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html)
+	rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html)
+	rw = c(rw, ca.n, cg.n, in.classes, in.ca.cg, in.ca1.ca2, in.cg1.cg2, in.cg1.cg3, in.cg1.cg4, in.cg2.cg3, in.cg2.cg4, in.cg3.cg4, in.cg1.cg2.cg3, in.cg2.cg3.cg4, in.cg1.cg2.cg4, in.cg1.cg3.cg4, in.cg.all)
+
+	cat(tr(rw), file=main.html, append=T)
+
+	for(i in 1:nrow(allc)){ #generate html by id
+		html = make.link(id, allc[i,"best_match"], allc[i,"Sequence.ID"])
+		cat(paste(html, "<br />"), file=sequence.id.page, append=T)
+	}
+}
+
+cat("</table>", file=main.html, append=T)
+
+print(paste("Single sequences:", single.sequences))
+print(paste("Sequences in multiple subclasses:", in.multiple))
+print(paste("Multiple sequences in one subclass:", multiple.in.one))
+print(paste("Matched with unmatched:", some.unmatched))
+print(paste("Count that should match 'matched' sequences:", matched))
+
+#ACGT overview
+
+NToverview = merged[!grepl("^unmatched", merged$best_match),]
+
+NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq, sep="_")
+
+NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))
+NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))
+NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))
+NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))
+
+#Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))
+
+#NToverview = rbind(NToverview, NTsum)
+
+NTresult = data.frame(nt=c("A", "C", "T", "G"))
+
+for(clazz in gene.classes){
+	NToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),]
+	new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))
+	new.col.y = sum(new.col.x)
+	new.col.z = round(new.col.x / new.col.y * 100, 2)
+
+	tmp = names(NTresult)
+	NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
+	names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))
+}
+
+write.table(NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")], NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
+
+NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]
+
+new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))
+new.col.y = sum(new.col.x)
+new.col.z = round(new.col.x / new.col.y * 100, 2)
+
+tmp = names(NTresult)
+NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
+names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep=""))
+
+names(hotspot.analysis.sum) = names(NTresult)
+
+hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)
+
+write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")
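For quick reference, sequence_overview.r reads five positional arguments in the order shown at the top of the script; a minimal standalone invocation might look like the sketch below, where every file name is a placeholder rather than part of the commit:

    # Hypothetical run; argument order: before-unique table, merged table,
    # output directory, comma-separated gene classes, hotspot summary CSV.
    Rscript sequence_overview.r before_unique_filter.txt merged.txt \
        results/ ca,cg,cm hotspot_analysis_sum.txt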
b |
diff -r 000000000000 -r 8a5a2abbb870 style.tar.gz |
b |
Binary file style.tar.gz has changed |
b |
diff -r 000000000000 -r 8a5a2abbb870 subclass_definition.db.nhr |
b |
Binary file subclass_definition.db.nhr has changed |
b |
diff -r 000000000000 -r 8a5a2abbb870 subclass_definition.db.nin |
b |
Binary file subclass_definition.db.nin has changed |
b |
diff -r 000000000000 -r 8a5a2abbb870 subclass_definition.db.nsq |
b |
Binary file subclass_definition.db.nsq has changed |
b |
diff -r 000000000000 -r 8a5a2abbb870 summary_to_fasta.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/summary_to_fasta.py Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,42 @@
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="The 1_Summary file of an IMGT zip file")
+parser.add_argument("--fasta", help="The output fasta file")
+
+args = parser.parse_args()
+
+infile = args.input
+fasta = args.fasta
+
+with open(infile, 'r') as i, open(fasta, 'w') as o:
+	first = True
+	id_col = 0
+	seq_col = 0
+	no_results = 0
+	no_seqs = 0
+	passed = 0
+	for line in i:
+		splt = line.split("\t")
+		if first:
+			id_col = splt.index("Sequence ID")
+			seq_col = splt.index("Sequence")
+			first = False
+			continue
+		if len(splt) < 5:
+			no_results += 1
+			continue
+
+		ID = splt[id_col]
+		seq = splt[seq_col]
+
+		if not len(seq) > 0:
+			no_seqs += 1
+			continue
+
+		o.write(">" + ID + "\n" + seq + "\n")
+		passed += 1
+
+	print "No results:", no_results
+	print "No sequences:", no_seqs
+	print "Written to fasta file:", passed
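summary_to_fasta.py is driven from wrapper.sh further down; a direct call follows the same pattern, as in this sketch (the input path is a placeholder for any IMGT 1_Summary file):

    # Hypothetical direct invocation; writes one FASTA record per summary row
    # that has both results and a non-empty sequence.
    python summary_to_fasta.py --input 1_Summary.txt --fasta sequences.fasta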
b |
diff -r 000000000000 -r 8a5a2abbb870 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Aug 29 05:36:10 2016 -0400 |
b |
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <set_environment version="1.0">
+    </set_environment>
+
+    <package name="blastn" version="1.0">
+        <install version="1.0">
+            <actions>
+                <action type="download_by_url">ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.30/ncbi-blast-2.2.30+-x64-linux.tar.gz</action>
+                <action type="move_file">
+                    <source>bin/blastn</source>
+                    <destination>$INSTALL_DIR</destination>
+                </action>
+                <action type="set_environment">
+                    <environment_variable name="BLASTN_DIR" action="set_to">$INSTALL_DIR</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme>
+        downloads blast (ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.30/ncbi-blast-2.2.30+-x64-linux.tar.gz) and keeps the blastn executable
+        </readme>
+    </package>
+</tool_dependency>
+
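After installation, Galaxy exports BLASTN_DIR pointing at the directory holding the blastn binary (wrapper.sh below falls back to a hard-coded path when it is unset); a quick sanity check from a shell might be:

    # Assumes BLASTN_DIR was set by the dependency above.
    "$BLASTN_DIR/blastn" -version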
b |
diff -r 000000000000 -r 8a5a2abbb870 wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wrapper.sh Mon Aug 29 05:36:10 2016 -0400 |
[ |
@@ -0,0 +1,603 @@
+#!/bin/bash
+#set -e
+dir="$(cd "$(dirname "$0")" && pwd)"
+input=$1
+method=$2
+log=$3 #becomes the main html page at the end
+outdir=$4
+output="$outdir/index.html" #copied to $log location at the end
+title=$5
+include_fr1=$6
+functionality=$7
+unique=$8
+naive_output_ca=$9
+naive_output_cg=${10}
+naive_output_cm=${11}
+filter_unique=${12}
+class_filter=${13}
+empty_region_filter=${14}
+mkdir $outdir
+
+tar -xzf $dir/style.tar.gz -C $outdir
+
+echo "---------------- read parameters ----------------"
+echo "---------------- read parameters ----------------<br />" > $log
+
+echo "unpacking IMGT file"
+
+type="`file $input`"
+if [[ "$type" == *"Zip archive"* ]] ; then
+	echo "Zip archive"
+	echo "unzip $input -d $PWD/files/"
+	unzip $input -d $PWD/files/
+elif [[ "$type" == *"XZ compressed data"* ]] ; then
+	echo "XZ archive"
+	echo "tar -xJf $input -C $PWD/files/"
+	mkdir -p $PWD/files/$title
+	tar -xJf $input -C $PWD/files/$title
+fi
+
+cat `find $PWD/files/ -name "1_*"` > $PWD/summary.txt
+cat `find $PWD/files/ -name "3_*"` > $PWD/sequences.txt
+cat `find $PWD/files/ -name "5_*"` > $PWD/aa.txt
+cat `find $PWD/files/ -name "6_*"` > $PWD/junction.txt
+cat `find $PWD/files/ -name "7_*"` > $PWD/mutationanalysis.txt
+cat `find $PWD/files/ -name "8_*"` > $PWD/mutationstats.txt
+cat `find $PWD/files/ -name "10_*"` > $PWD/hotspots.txt
+
+if [[ ${#BLASTN_DIR} -ge 5 ]] ; then
+	echo "On server, using BLASTN_DIR env: ${BLASTN_DIR}"
+else
+	BLASTN_DIR="/home/galaxy/Downloads/ncbi-blast-2.4.0+/bin"
+	echo "Dev Galaxy set BLASTN_DIR to: ${BLASTN_DIR}"
+fi
+
+echo "---------------- identification ($method) ----------------"
+echo "---------------- identification ($method) ----------------<br />" >> $log
+
+if [[ "${method}" == "custom" ]] ; then
+	python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt
+else
+	echo "---------------- summary_to_fasta.py ----------------"
+	echo "---------------- summary_to_fasta.py ----------------<br />" >> $log
+
+	python $dir/summary_to_fasta.py --input $PWD/summary.txt --fasta $PWD/sequences.fasta
+
+	echo -e "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" > $outdir/identified_genes.txt
+	${BLASTN_DIR}/blastn -task blastn -db $dir/subclass_definition.db -query $PWD/sequences.fasta -outfmt 6 >> $outdir/identified_genes.txt
+fi
+
+echo "---------------- merge_and_filter.r ----------------"
+echo "---------------- merge_and_filter.r ----------------<br />" >> $log
+
+Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} ${empty_region_filter} 2>&1
+
+echo "---------------- creating new IMGT zip ----------------"
+echo "---------------- creating new IMGT zip ----------------<br />" >> $log
+
+mkdir $outdir/new_IMGT
+
+cat `find $PWD/files/ -name "1_*"` > "$outdir/new_IMGT/1_Summary.txt"
+cat `find $PWD/files/ -name "2_*"` > "$outdir/new_IMGT/2_IMGT-gapped-nt-sequences.txt"
+cat `find $PWD/files/ -name "3_*"` > "$outdir/new_IMGT/3_Nt-sequences.txt"
+cat `find $PWD/files/ -name "4_*"` > "$outdir/new_IMGT/4_IMGT-gapped-AA-sequences.txt"
+cat `find $PWD/files/ -name "5_*"` > "$outdir/new_IMGT/5_AA-sequences.txt"
+cat `find $PWD/files/ -name "6_*"` > "$outdir/new_IMGT/6_Junction.txt"
+cat `find $PWD/files/ -name "7_*"` > "$outdir/new_IMGT/7_V-REGION-mutation-and-AA-change-table.txt"
+cat `find $PWD/files/ -name "8_*"` > "$outdir/new_IMGT/8_V-REGION-nt-mutation-statistics.txt"
+cat `find $PWD/files/ -name "9_*"` > "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt"
+cat `find $PWD/files/ -name "10_*"` > "$outdir/new_IMGT/10_V-REGION-mutation-hotspots.txt"
+
+mkdir $outdir/new_IMGT_ca
+cp $outdir/new_IMGT/* $outdir/new_IMGT_ca
+
+mkdir $outdir/new_IMGT_ca
[... diff truncated by the repository viewer ...]
ed_clones-ca.txt'>Download</a></td></tr>" >> $output
+echo "<tr><td>The Change-O DB defined clones summary file of ca</td><td><a href='change_o/change-o-defined_clones-summary-ca.txt'>Download</a></td></tr>" >> $output
+echo "<tr><td>The Change-O DB file with defined clones of cg</td><td><a href='change_o/change-o-db-defined_clones-cg.txt'>Download</a></td></tr>" >> $output
+echo "<tr><td>The Change-O DB defined clones summary file of cg</td><td><a href='change_o/change-o-defined_clones-summary-cg.txt'>Download</a></td></tr>" >> $output
+echo "<tr><td>The Change-O DB file with defined clones of cm</td><td><a href='change_o/change-o-db-defined_clones-cm.txt'>Download</a></td></tr>" >> $output
+echo "<tr><td>The Change-O DB defined clones summary file of cm</td><td><a href='change_o/change-o-defined_clones-summary-cm.txt'>Download</a></td></tr>" >> $output
+
+echo "</table>" >> $output
+
+echo "</div>" >> $output #downloads tab end
+
+echo "</div>" >> $output #tabs end
+
+echo "</html>" >> $output
+
+echo "---------------- baseline ----------------"
+echo "---------------- baseline ----------------<br />" >> $log
+tmp="$PWD"
+
+mkdir $outdir/baseline
+
+mkdir $outdir/baseline/ca_cg_cm
+if [[ $(wc -l < $outdir/new_IMGT/1_Summary.txt) -gt "1" ]]; then
+	cd $outdir/baseline/ca_cg_cm
+	bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT.txz "ca_cg_cm" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt"
+else
+	echo "No sequences" > "$outdir/baseline.txt"
+fi
+
+mkdir $outdir/baseline/ca
+if [[ $(wc -l < $outdir/new_IMGT_ca/1_Summary.txt) -gt "1" ]]; then
+	cd $outdir/baseline/ca
+	bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_ca.txz "ca" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_ca.pdf" "Sequence.ID" "$outdir/baseline_ca.txt"
+else
+	echo "No ca sequences" > "$outdir/baseline_ca.txt"
+fi
+
+mkdir $outdir/baseline/cg
+if [[ $(wc -l < $outdir/new_IMGT_cg/1_Summary.txt) -gt "1" ]]; then
+	cd $outdir/baseline/cg
+	bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cg.txz "cg" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cg.pdf" "Sequence.ID" "$outdir/baseline_cg.txt"
+else
+	echo "No cg sequences" > "$outdir/baseline_cg.txt"
+fi
+
+mkdir $outdir/baseline/cm
+if [[ $(wc -l < $outdir/new_IMGT_cm/1_Summary.txt) -gt "1" ]]; then
+	cd $outdir/baseline/cm
+	bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_cm.txz "cm" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_cm.pdf" "Sequence.ID" "$outdir/baseline_cm.txt"
+else
+	echo "No cm sequences" > "$outdir/baseline_cm.txt"
+fi
+
+cd $tmp
+
+echo "---------------- naive_output.r ----------------"
+echo "---------------- naive_output.r ----------------<br />" >> $log
+
+if [[ "$naive_output" != "None" ]]
+then
+	cp $outdir/new_IMGT_ca.txz ${naive_output_ca}
+	cp $outdir/new_IMGT_cg.txz ${naive_output_cg}
+	cp $outdir/new_IMGT_cm.txz ${naive_output_cm}
+fi
+
+echo "</table>" >> $outdir/base_overview.html
+
+mv $log $outdir/log.html
+
+echo "<html><center><h1><a href='index.html'>Click here for the results</a></h1>Tip: Open it in a new tab (middle mouse button or right mouse button -> 'open in new tab' on the link above)<br />" > $log
+echo "<table border = 1>" >> $log
+echo "<thead><tr><th>Info</th><th>Sequences</th><th>Percentage</th></tr></thead>" >> $log
+tIFS="$IFS"
+IFS=$'\t'
+while read step seq perc
+	do
+		echo "<tr>" >> $log
+		echo "<td>$step</td>" >> $log
+		echo "<td>$seq</td>" >> $log
+		echo "<td>${perc}%</td>" >> $log
+		echo "</tr>" >> $log
+done < $outdir/filtering_steps.txt
+echo "</table></center></html>" >> $log
+
+IFS="$tIFS"
+
+echo "---------------- Done! ----------------"
+echo "---------------- Done! ----------------<br />" >> $outdir/log.html