Repository 'shm_csr'
hg clone https://toolshed.g2.bx.psu.edu/repos/davidvanzessen/shm_csr

Changeset 5:012a738edf5a (2016-11-01)
Previous changeset 4:477e95b098fd (2016-10-31) Next changeset 6:2ddb9a21f635 (2016-11-01)
Commit message:
Uploaded
modified:
gene_identification.py
merge_and_filter.r
pattern_plots.r
shm_csr.r
shm_csr.xml
wrapper.sh
b
diff -r 477e95b098fd -r 012a738edf5a gene_identification.py
--- a/gene_identification.py Mon Oct 31 05:05:26 2016 -0400
+++ b/gene_identification.py Tue Nov 01 10:15:37 2016 -0400
[
@@ -47,10 +47,12 @@
 #lambda/kappa reference sequence
 searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",
                  "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",
+                 "ce": "gcctccacacagagcccatccgtcttccccttgacccgctgctgcaaaaacattccctcc",
                  "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence
 
 compiledregex = {"ca": [],
                  "cg": [],
+                 "ce": [],
                  "cm": []}
 
 #lambda/kappa reference sequence variable nucleotides
@@ -102,6 +104,8 @@
 for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2):
   compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False))
 
+for i in range(0, len(searchstrings["ce"]) - chunklength, chunklength / 2):
+  compiledregex["ce"].append((re.compile(searchstrings["ce"][i:i+chunklength]), False))
 
 
 def removeAndReturnMaxIndex(x): #simplifies a list comprehension
@@ -114,11 +118,11 @@
 start_location = dict()
 hits = dict()
 alltotal = 0
-for key in compiledregex.keys(): #for ca/cg/cm
+for key in compiledregex.keys(): #for ca/cg/cm/ce
  regularexpressions = compiledregex[key] #get the compiled regular expressions
  for ID in dic.keys()[0:]: #for every ID
  if ID not in hits.keys(): #ensure that the dictionairy that keeps track of the hits for every gene exists
- hits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0}
+ hits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ce_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0}
  currentIDHits = hits[ID]
  seq = dic[ID]
  lastindex = 0
@@ -130,7 +134,7 @@
  break
  regex, hasVar = regexp
  matches = regex.finditer(seq[lastindex:])
- for match in matches: #for every match with the current regex, only uses the first hit
+ for match in matches: #for every match with the current regex, only uses the first hit because of the break at the end of this loop
  lastindex += match.start()
  start[relativeStartLocation + start_zero] += 1
  if hasVar: #if the regex has a variable nt in it
@@ -144,7 +148,7 @@
  currentIDHits["cg2"] += len([1 for x in cg2 if chunkstart <= x < chunkend and cg2[x] == seq[lastindex + x - chunkstart]])
  currentIDHits["cg3"] += len([1 for x in cg3 if chunkstart <= x < chunkend and cg3[x] == seq[lastindex + x - chunkstart]])
  currentIDHits["cg4"] += len([1 for x in cg4 if chunkstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]])
- else: #key == "cm" #no variable regions in 'cm'
+ else: #key == "cm" #no variable regions in 'cm' or 'ce'
  pass
  break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped
  else: #only runs if there were no hits
@@ -155,13 +159,10 @@
  #start_location[ID + "_" + key] = str(start.index(max(start)))
 
 
-chunksInCA = len(compiledregex["ca"])
-chunksInCG = len(compiledregex["cg"])
-chunksInCM = len(compiledregex["cm"])
-requiredChunkPercentage = 0.7
 varsInCA = float(len(ca1.keys()) * 2)
 varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn't hit the first and last nt twice
 varsInCM = 0
+varsInCE = 0
 
 
 
@@ -183,17 +184,19 @@
  possibleca = float(len(compiledregex["ca"]))
  possiblecg = float(len(compiledregex["cg"]))
  possiblecm = float(len(compiledregex["cm"]))
+ possiblece = float(len(compiledregex["ce"]))
  cahits = currentIDHits["ca_hits"]
  cghits = currentIDHits["cg_hits"]
  cmhits = currentIDHits["cm_hits"]
- if cahits >= cghits and cahits >= cmhits: #its a ca gene
+ cehits = currentIDHits["ce_hits"]
+ if cahits >= cghits and cahits >= cmhits and cahits >= cehits: #its a ca gene
  ca1hits = currentIDHits["ca1"]
  ca2hits = currentIDHits["ca2"]
  if ca1hits >= ca2hits:
  o.write(ID + "\tIGA1\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
  else:
  o.write(ID + "\tIGA2\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
- elif cghits >= cahits and cghits >= cmhits: #its a cg gene
+ elif cghits >= cahits and cghits >= cmhits and cghits >= cehits: #its a cg gene
  cg1hits = currentIDHits["cg1"]
  cg2hits = currentIDHits["cg2"]
  cg3hits = currentIDHits["cg3"]
@@ -206,8 +209,11 @@
  o.write(ID + "\tIGG3\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
  else: #cg4 gene
  o.write(ID + "\tIGG4\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
- else: #its a cm gene
- o.write(ID + "\tIGM\t100\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+ else: #its a cm or ce gene
+ if cmhits >= cehits:
+ o.write(ID + "\tIGM\t100\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n")
+ else:
+ o.write(ID + "\tIGE\t100\t" + str(int(cehits / possiblece * 100)) + "\t" + start_location[ID + "_ce"] + "\n")
  seq_write_count += 1
 
 print "Time: %i" % (int(time.time() * 1000) - starttime)
b
diff -r 477e95b098fd -r 012a738edf5a merge_and_filter.r
--- a/merge_and_filter.r Mon Oct 31 05:05:26 2016 -0400
+++ b/merge_and_filter.r Tue Nov 01 10:15:37 2016 -0400
[
@@ -141,44 +141,6 @@
   result[is.na(result[,col]),] = 0
 }
 
-write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
-
-if(filter.unique != "no"){
- clmns = names(result)
-
- if(empty.region.filter == "leader"){
- result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
- } else if(empty.region.filter == "FR1"){
- result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
- } else if(empty.region.filter == "CDR1"){
- rresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
- } else if(empty.region.filter == "FR2"){
- result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
- }
-
- if(grepl("_c", filter.unique)){
- result$unique.def = paste(result$unique.def, result$best_match)
- }
-
- #fltr = result$unique.def %in% result.filtered$unique.def
-
- if(grepl("keep", filter.unique)){
- result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
- result = result[!duplicated(result$unique.def),]
- } else {
- result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
- result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
- result = result[!duplicated(result$unique.def),]
- }
-
- #result = result[,clmns]
-
- #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)
-}
-
-filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))
-
-
 splt = strsplit(class.filter, "_")[[1]]
 chunk_hit_threshold = as.numeric(splt[1])
 nt_hit_threshold = as.numeric(splt[2])
@@ -198,10 +160,39 @@
  result$best_match = "all"
 }
 
-if(any(higher_than, na.rm=T)){
- #summ = summ[higher_than,]
+write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
+
+if(filter.unique != "no"){
+ clmns = names(result)
+
+ if(empty.region.filter == "leader"){
+ result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+ } else if(empty.region.filter == "FR1"){
+ result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+ } else if(empty.region.filter == "CDR1"){
+ result$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+ } else if(empty.region.filter == "FR2"){
+ result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
+ }
+
+ if(grepl("_c", filter.unique)){
+ result$unique.def = paste(result$unique.def, result$best_match)
+ }
+
+ if(grepl("keep", filter.unique)){
+ result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
+ result = result[!duplicated(result$unique.def),]
+ } else {
+ result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
+ result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
+ result = result[!duplicated(result$unique.def),]
+ }
 }
 
+write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\t", quote=F,row.names=F,col.names=T)
+
+filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))
+
 if(nrow(summ) == 0){
  stop("No data remaining after filter")
 }
b
diff -r 477e95b098fd -r 012a738edf5a pattern_plots.r
--- a/pattern_plots.r Mon Oct 31 05:05:26 2016 -0400
+++ b/pattern_plots.r Tue Nov 01 10:15:37 2016 -0400
b
@@ -22,7 +22,7 @@
 
 
 
-classes = c("IGA", "IGA1", "IGA2", "IGG", "IGG1", "IGG2", "IGG3", "IGG4", "IGM")
+classes = c("IGA", "IGA1", "IGA2", "IGG", "IGG1", "IGG2", "IGG3", "IGG4", "IGM", "IGE")
 xyz = c("x", "y", "z")
 new.names = c(paste(rep(classes, each=3), xyz, sep="."), paste("un", xyz, sep="."), paste("all", xyz, sep="."))
 
@@ -45,7 +45,7 @@
 write.table(data1, plot1.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
 
 p = ggplot(data1, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of mutations") + guides(fill=guide_legend(title=NULL))
-p = p + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("RGYW.WRCY" = "white", "TW.WA" = "blue4"))
+p = p + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("RGYW.WRCY" = "white", "TW.WA" = "blue4"))
 #p = p + scale_colour_manual(values=c("RGYW.WRCY" = "black", "TW.WA" = "blue4"))
 png(filename=plot1.png, width=480, height=300)
 print(p)
@@ -79,7 +79,7 @@
 write.table(data2, plot2.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
 
 p = ggplot(data2, aes(x=Class, y=value, fill=Type)) + geom_bar(position="fill", stat="identity", colour = "black") + scale_y_continuous(labels=percent_format()) + guides(fill=guide_legend(title=NULL)) + ylab("% of mutations")
-p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
+p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
 #p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black"))
 png(filename=plot2.png, width=480, height=300)
 print(p)
@@ -115,7 +115,7 @@
 write.table(data3, plot3.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
 
 p = ggplot(data3, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of nucleotides") + guides(fill=guide_legend(title=NULL))
-p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
+p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
 #p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black"))
 png(filename=plot3.png, width=480, height=300)
 print(p)
b
diff -r 477e95b098fd -r 012a738edf5a shm_csr.r
--- a/shm_csr.r Mon Oct 31 05:05:26 2016 -0400
+++ b/shm_csr.r Tue Nov 01 10:15:37 2016 -0400
[
@@ -280,15 +280,13 @@
  transition2 = merge(transition2, base.order, by.x="variable", by.y="base")
 
  transition2[is.na(transition2$value),]$value = 0
-
- print(transition2)
 
  if(any(transition2$value != 0)){ #having rows of data but a transition table filled with 0 is bad
  print("Plotting stacked transition")
  png(filename=paste("transitions_stacked_", name, ".png", sep=""))
  p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity", colour="black") #stacked bar
  p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL))
- p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("A" = "blue4", "G" = "lightblue1", "C" = "olivedrab3", "T" = "olivedrab4"))
+ p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black")) + scale_fill_manual(values=c("A" = "blue4", "G" = "lightblue1", "C" = "olivedrab3", "T" = "olivedrab4"))
  #p = p + scale_colour_manual(values=c("A" = "black", "G" = "black", "C" = "black", "T" = "black"))
  print(p)
  dev.off()
@@ -374,7 +372,7 @@
  pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene))
  pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGA1" = "lightblue1", "IGA2" = "blue4"))
  pc = pc + coord_polar(theta="y") + scale_y_continuous(breaks=NULL)
- pc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))
+ pc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"), axis.title=element_blank(), axis.text=element_blank(), axis.ticks=element_blank())
  pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGA subclasses", "( n =", sum(genesForPlot$Freq), ")"))
  write.table(genesForPlot, "IGA_pie.txt", sep="\t",quote=F,row.names=F,col.names=T)
 
@@ -395,7 +393,7 @@
  pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene))
  pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred"))
  pc = pc + coord_polar(theta="y") + scale_y_continuous(breaks=NULL)
- pc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))
+ pc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"), axis.title=element_blank(), axis.text=element_blank(), axis.ticks=element_blank())
  pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGG subclasses", "( n =", sum(genesForPlot$Freq), ")"))
  write.table(genesForPlot, "IGG_pie.txt", sep="\t",quote=F,row.names=F,col.names=T)
 
@@ -415,7 +413,7 @@
 
 p = ggplot(dat.clss, aes(best_match, percentage_mutations))
 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA)
-p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))
+p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"))
 p = p + scale_fill_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet", "all" = "blue4"))
 p = p + scale_colour_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet", "all" = "blue4"))
 
@@ -442,7 +440,7 @@
 frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)
 
 p = ggplot(frequency_bins_data, aes(frequency_bins, frequency))
-p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))
+p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"))
 p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + scale_fill_manual(values=c("IGA" = "blue4", "IGG" = "olivedrab3", "IGM" = "black", "all" = "blue4"))
 
 png(filename="frequency_ranges.png")
b
diff -r 477e95b098fd -r 012a738edf5a shm_csr.xml
--- a/shm_csr.xml Mon Oct 31 05:05:26 2016 -0400
+++ b/shm_csr.xml Tue Nov 01 10:15:37 2016 -0400
b
@@ -1,7 +1,7 @@
 <tool id="shm_csr" name="SHM &amp; CSR pipeline" version="1.0">
  <description></description>
  <command interpreter="bash">
- wrapper.sh $in_file custom $out_file $out_file.files_path ${in_file.name} "-" $functionality $unique $naive_output_ca $naive_output_cg $naive_output_cm $filter_uniques $class_filter $empty_region_filter
+ wrapper.sh $in_file custom $out_file $out_file.files_path ${in_file.name} "-" $functionality $unique $naive_output_ca $naive_output_cg $naive_output_cm $filter_uniques $class_filter $empty_region_filter $fast
  </command>
  <inputs>
  <param name="in_file" type="data" label="IMGT zip file to be analysed" />
@@ -33,7 +33,7 @@
  <option value="CDR3.IMGT.seq">CDR3 (nt)</option>
  <option value="Sequence.ID">Don't remove duplicates</option>
  </param>
- <param name="class_filter" type="select" label="Class/Subclass filter" help="" >
+ <param name="class_filter" type="select" label="Human Class/Subclass filter" help="" >
  <option value="70_70" selected="true">>70% class and >70% subclass</option>
  <option value="60_55">>60% class and >55% subclass</option>
  <option value="70_0">>70% class</option>
@@ -46,6 +46,14 @@
  <option value="no" selected="true">No</option>
  </param>
  </conditional>
+
+
+ <param name="fast" type="select" label="Fast" help="Skips generating the new ZIP files and Change-O/Baseline" >
+ <option value="yes">Yes</option>
+ <option value="no" selected="true">No</option>
+ </param>
+
+
  </inputs>
  <outputs>
  <data format="html" name="out_file" label = "SHM &amp; CSR on ${in_file.name}"/>
b
diff -r 477e95b098fd -r 012a738edf5a wrapper.sh
--- a/wrapper.sh Mon Oct 31 05:05:26 2016 -0400
+++ b/wrapper.sh Tue Nov 01 10:15:37 2016 -0400
[
b'@@ -16,6 +16,7 @@\n filter_unique=${12}\n class_filter=${13}\n empty_region_filter=${14}\n+fast=${15}\n mkdir $outdir\n \n tar -xzf $dir/style.tar.gz -C $outdir\n@@ -62,101 +63,112 @@\n \n Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} ${empty_region_filter} 2>&1\n \n-echo "---------------- creating new IMGT zips ----------------"\n-echo "---------------- creating new IMGT zips ----------------<br />" >> $log\n+if [[ "$fast" == "no" ]] ; then\n \n-mkdir $outdir/new_IMGT\n+\techo "---------------- creating new IMGT zips ----------------"\n+\techo "---------------- creating new IMGT zips ----------------<br />" >> $log\n+\n+\tmkdir $outdir/new_IMGT\n \n-cat `find $PWD/files/ -name "1_*"` > "$outdir/new_IMGT/1_Summary.txt"\n-cat `find $PWD/files/ -name "2_*"` > "$outdir/new_IMGT/2_IMGT-gapped-nt-sequences.txt"\n-cat `find $PWD/files/ -name "3_*"` > "$outdir/new_IMGT/3_Nt-sequences.txt"\n-cat `find $PWD/files/ -name "4_*"` > "$outdir/new_IMGT/4_IMGT-gapped-AA-sequences.txt"\n-cat `find $PWD/files/ -name "5_*"` > "$outdir/new_IMGT/5_AA-sequences.txt"\n-cat `find $PWD/files/ -name "6_*"` > "$outdir/new_IMGT/6_Junction.txt"\n-cat `find $PWD/files/ -name "7_*"` > "$outdir/new_IMGT/7_V-REGION-mutation-and-AA-change-table.txt"\n-cat `find $PWD/files/ -name "8_*"` > "$outdir/new_IMGT/8_V-REGION-nt-mutation-statistics.txt"\n-cat `find $PWD/files/ -name "9_*"` > "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt"\n-cat `find $PWD/files/ -name "10_*"` > "$outdir/new_IMGT/10_V-REGION-mutation-hotspots.txt"\n+\tcat `find $PWD/files/ -name "1_*"` > "$outdir/new_IMGT/1_Summary.txt"\n+\tcat `find $PWD/files/ -name "2_*"` > "$outdir/new_IMGT/2_IMGT-gapped-nt-sequences.txt"\n+\tcat `find $PWD/files/ -name "3_*"` > "$outdir/new_IMGT/3_Nt-sequences.txt"\n+\tcat `find $PWD/files/ -name "4_*"` > "$outdir/new_IMGT/4_IMGT-gapped-AA-sequences.txt"\n+\tcat `find $PWD/files/ -name "5_*"` > "$outdir/new_IMGT/5_AA-sequences.txt"\n+\tcat `find $PWD/files/ -name "6_*"` > "$outdir/new_IMGT/6_Junction.txt"\n+\tcat `find $PWD/files/ -name "7_*"` > "$outdir/new_IMGT/7_V-REGION-mutation-and-AA-change-table.txt"\n+\tcat `find $PWD/files/ -name "8_*"` > "$outdir/new_IMGT/8_V-REGION-nt-mutation-statistics.txt"\n+\tcat `find $PWD/files/ -name "9_*"` > "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt"\n+\tcat `find $PWD/files/ -name "10_*"` > "$outdir/new_IMGT/10_V-REGION-mutation-hotspots.txt"\n \n-mkdir $outdir/new_IMGT_IGA\n-cp $outdir/new_IMGT/* $outdir/new_IMGT_IGA\n+\tmkdir $outdir/new_IMGT_IGA\n+\tcp $outdir/new_IMGT/* $outdir/new_IMGT_IGA\n \n-mkdir $outdir/new_IMGT_IGA1\n-cp $outdir/new_IMGT/* $outdir/new_IMGT_IGA1\n+\tmkdir $outdir/new_IMGT_IGA1\n+\tcp $outdir/new_IMGT/* $outdir/new_IMGT_IGA1\n \n-mkdir $outdir/new_IMGT_IGA2\n-cp $outdir/new_IMGT/* $outdir/new_IMGT_IGA2\n+\tmkdir $outdir/new_IMGT_IGA2\n+\tcp $outdir/new_IMGT/* $outdir/new_IMGT_IGA2\n \n-mkdir $outdir/new_IMGT_IGG\n-cp $outdir/new_IMGT/* $outdir/new_IMGT_IGG\n+\tmkdir $outdir/new_IMGT_IGG\n+\tcp $outdir/new_IMGT/* $outdir/new_IMGT_IGG\n \n-mkdir $outdir/new_IMGT_IGG1\n-cp $outdir/new_IMGT/* $outdir/new_IMGT_IGG1\n+\tmkdir $outdir/new_IMGT_IGG1\n+\tcp $outdir/new_IMGT/* $outdir/new_IMGT_IGG1\n \n-mkdir $outdir/new_IMGT_IGG2\n-cp $outdir/new_IMGT/* $outdir/new_IMGT_IGG2\n+\tmkdir $outdir/new_IMGT_IGG2\n+\tcp $outdir/new_IMGT/* $outdir/new_IMGT_IGG2\n \n-mkdir $outdir/new_IMGT_IGG3\n-cp $outdir/new_IMGT/* $outdir/new_IMGT_IGG3\n+\tmkdir $outdir/new_IMGT_IGG3\n+\tcp $outdir/new_IMGT/* $outdir/new_IMGT_IGG3\n \n-mkdir $outdir/new_IMGT_IGG4\n-cp $outdir/new_IMGT/* $outdir/new_IMGT_IGG4\n+\tmkdir $outdir/new_IMGT_IGG4\n+\tcp $outdir/new_IMGT/* $outdir/new_IMGT_IGG4\n+\n+\tmkdir $outdir/new_IMGT_IGM\n+\tcp $outdir/new_IMGT/* $outdir/new_IMGT_IGM\n \n-mkdir $outdir/new_IMGT_IGM\n-cp $outdir/new_IMGT/* $outdir/new_IMGT_IGM\n+\tmkdir $outdir/new_IMGT_IGE\n+\tcp $out'..b'seline\n+\techo "---------------- baseline ----------------"\n+\techo "---------------- baseline ----------------<br />" >> $log\n+\ttmp="$PWD"\n+\n+\tmkdir $outdir/baseline\n \n \n-mkdir $outdir/baseline/IGA_IGG_IGM\n-if [[ $(wc -l < $outdir/new_IMGT/1_Summary.txt) -gt "1" ]]; then\n-\tcd $outdir/baseline/IGA_IGG_IGM\n-\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT.txz "IGA_IGG_IGM" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt"\t\n-else\n-\techo "No sequences" > "$outdir/baseline.txt"\n-fi\n+\tmkdir $outdir/baseline/IGA_IGG_IGM\n+\tif [[ $(wc -l < $outdir/new_IMGT/1_Summary.txt) -gt "1" ]]; then\n+\t\tcd $outdir/baseline/IGA_IGG_IGM\n+\t\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT.txz "IGA_IGG_IGM" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt"\t\n+\telse\n+\t\techo "No sequences" > "$outdir/baseline.txt"\n+\tfi\n \n-mkdir $outdir/baseline/IGA\n-if [[ $(wc -l < $outdir/new_IMGT_IGA/1_Summary.txt) -gt "1" ]]; then\n-\tcd $outdir/baseline/IGA\n-\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_IGA.txz "IGA" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_IGA.pdf" "Sequence.ID" "$outdir/baseline_IGA.txt"\n-else\n-\techo "No IGA sequences" > "$outdir/baseline_IGA.txt"\n-fi\n+\tmkdir $outdir/baseline/IGA\n+\tif [[ $(wc -l < $outdir/new_IMGT_IGA/1_Summary.txt) -gt "1" ]]; then\n+\t\tcd $outdir/baseline/IGA\n+\t\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_IGA.txz "IGA" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_IGA.pdf" "Sequence.ID" "$outdir/baseline_IGA.txt"\n+\telse\n+\t\techo "No IGA sequences" > "$outdir/baseline_IGA.txt"\n+\tfi\n \n-mkdir $outdir/baseline/IGG\n-if [[ $(wc -l < $outdir/new_IMGT_IGG/1_Summary.txt) -gt "1" ]]; then\n-\tcd $outdir/baseline/IGG\n-\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_IGG.txz "cg" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_IGG.pdf" "Sequence.ID" "$outdir/baseline_IGG.txt"\n-else\n-\techo "No IGG sequences" > "$outdir/baseline_IGG.txt"\n-fi\n+\tmkdir $outdir/baseline/IGG\n+\tif [[ $(wc -l < $outdir/new_IMGT_IGG/1_Summary.txt) -gt "1" ]]; then\n+\t\tcd $outdir/baseline/IGG\n+\t\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_IGG.txz "cg" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_IGG.pdf" "Sequence.ID" "$outdir/baseline_IGG.txt"\n+\telse\n+\t\techo "No IGG sequences" > "$outdir/baseline_IGG.txt"\n+\tfi\n \n-mkdir $outdir/baseline/IGM\n-if [[ $(wc -l < $outdir/new_IMGT_IGM/1_Summary.txt) -gt "1" ]]; then\n-\tcd $outdir/baseline/IGM\n-\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_IGM.txz "IGM" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_IGM.pdf" "Sequence.ID" "$outdir/baseline_IGM.txt"\n-else\n-\techo "No IGM sequences" > "$outdir/baseline_IGM.txt"\n+\tmkdir $outdir/baseline/IGM\n+\tif [[ $(wc -l < $outdir/new_IMGT_IGM/1_Summary.txt) -gt "1" ]]; then\n+\t\tcd $outdir/baseline/IGM\n+\t\tbash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "25:26:38:55:65:104:-" $outdir/new_IMGT_IGM.txz "IGM" "$dir/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa" "$outdir/baseline_IGM.pdf" "Sequence.ID" "$outdir/baseline_IGM.txt"\n+\telse\n+\t\techo "No IGM sequences" > "$outdir/baseline_IGM.txt"\n+\tfi\n+\n+\tcd $tmp\n+\n+\techo "Cleaning up *.RData files"\n+\tfind $outdir/baseline -name "*.RData" -type f -delete\n+\n fi\n \n-cd $tmp\n-\n-echo "Cleaning up *.RData files"\n-find $outdir/baseline -name "*.RData" -type f -delete\n-\n echo "---------------- naive_output.r ----------------"\n echo "---------------- naive_output.r ----------------<br />" >> $log\n \n-if [[ "$naive_output" != "None" ]]\n+if [[ "$naive_output" == "yes" ]]\n then\n \tcp $outdir/new_IMGT_IGA.txz ${naive_output_ca}\n \tcp $outdir/new_IMGT_IGG.txz ${naive_output_cg}\n'