Repository 'argalaxy_tools'
hg clone https://toolshed.g2.bx.psu.edu/repos/davidvanzessen/argalaxy_tools

Changeset 24:d5d203d38c8a (2017-02-01)
Previous changeset 23:e2fbdfacec1d (2017-01-27) Next changeset 25:94765af0db1f (2017-02-09)
Commit message:
Uploaded
modified:
complete.sh
report_clonality/RScript.r
report_clonality/r_wrapper.sh
b
diff -r e2fbdfacec1d -r d5d203d38c8a complete.sh
--- a/complete.sh Fri Jan 27 04:29:43 2017 -0500
+++ b/complete.sh Wed Feb 01 09:48:38 2017 -0500
[
@@ -19,6 +19,10 @@
 #unzip $dir/database.zip -d $PWD/igblastdatabase/
 #export IGDATA=$PWD/igblastdatabase/
 
+echo "python: `which python`"
+echo "R: `which R`"
+echo "Rscript: `which Rscript`"
+
 id=""
 forwardSlash="/"
 mergerInput=()
@@ -39,6 +43,7 @@
  f=$(file $current)
  zipType="Zip archive"
  zxType="XZ compressed data"
+ echo "filetype of ${id}: $f"
    if [[ "$f" == *"$zipType"* ]] || [[ "$f" == *"$zxType"* ]]
  then
  echo "<tr><td>Sample $count of patient $id is an archive file, using IMGT Loader</td></tr>" >> $html
b
diff -r e2fbdfacec1d -r d5d203d38c8a report_clonality/RScript.r
--- a/report_clonality/RScript.r Fri Jan 27 04:29:43 2017 -0500
+++ b/report_clonality/RScript.r Wed Feb 01 09:48:38 2017 -0500
[
b'@@ -49,6 +49,7 @@\n \n inputdata = read.table(infile, sep="\\t", header=TRUE, fill=T, comment.char="", stringsAsFactors=F)\n \n+\n print(paste("nrows: ", nrow(inputdata)))\n \n setwd(outdir)\n@@ -121,9 +122,6 @@\n   clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ]\n }\n \n-print("SAMPLE TABLE:")\n-print(table(PRODF$Sample))\n-\n prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")])\n prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")])\n \n@@ -162,6 +160,9 @@\n #write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T)\n write.table(UNPROD, "allUnproductive.csv", sep=",",quote=F,row.names=F,col.names=T)\n \n+print("SAMPLE TABLE:")\n+print(table(PRODF$Sample))\n+\n #write the samples to a file\n sampleFile <- file("samples.txt")\n un = unique(inputdata$Sample)\n@@ -383,7 +384,7 @@\n png("CDR3LengthPlot.png",width = 1280, height = 720)\n CDR3LengthPlot\n dev.off()\n-write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T)\n+write.table(x=CDR3Length, file="CDR3LengthPlot.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n \n # ---------------------- Plot the heatmaps ----------------------\n \n@@ -412,7 +413,7 @@\n     png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name)))\n     print(img)\n     dev.off()\n-    write.table(x=acast(dat, Top.V.Gene~Top.D.Gene, value.var="Length"), file=paste("HeatmapVD_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA)\n+    write.table(x=acast(dat, Top.V.Gene~Top.D.Gene, value.var="Length"), file=paste("HeatmapVD_", unique(dat[3])[1,1], ".txt", sep=""), sep="\\t",quote=F,row.names=T,col.names=NA)\n   }\n   \n   VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")])\n@@ -462,7 +463,7 @@\n   png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name)))\n   print(img)\n   dev.off()\n-  write.table(x=acast(dat, Top.V.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapVJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA)\n+  write.table(x=acast(dat, Top.V.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapVJ_", unique(dat[3])[1,1], ".txt", sep=""), sep="\\t",quote=F,row.names=T,col.names=NA)\n }\n \n VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")])\n@@ -511,7 +512,7 @@\n     png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name)))\n     print(img)\n     dev.off()\n-    write.table(x=acast(dat, Top.D.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapDJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA)\n+    write.table(x=acast(dat, Top.D.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapDJ_", unique(dat[3])[1,1], ".txt", sep=""), sep="\\t",quote=F,row.names=T,col.names=NA)\n   }\n   \n   \n@@ -701,17 +702,7 @@\n if(all(imgtcolumns %in% colnames(inputdata)))\n {\n   print("found IMGT columns, running junction analysis")\n-  \n-  if(locus %in% c("IGK","IGL", "TRA", "TRG")){\n-\t  print("VJ recombination, no filtering on absent D")\n-  } else {\n-\t  print("VDJ recombination, using N column for junction analysis")\n-\t  fltr = nchar(PRODF$Top.D.Gene) < 4\n-\t  print(paste("Removing", sum(fltr), "sequences without a identified D"))\n-\t  PRODF = PRODF[!fltr,]\n-  }\n-  \n-  \n+    \n   #ensure certain columns are in the data (files generated with older versions of IMGT Loader)\n   col.checks = c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb")\n   for(col.check in col.checks){\n@@ -730,9 +721,12 @@\n \t  }\n   }\n   \n+  PRODF.with.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) > 2,]\n+  PRODF.no.D '..b'                                  by=c("Sample")])\n+  newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)\n+  write.table(newData, "junctionAnalysisUnProd_mean_nD.txt" , sep="\\t",quote=F,na="-",row.names=F,col.names=F)\n+  \n+    newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, \n+                                                VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),\n+                                                P1=num_median(.SD$P3V.nt.nb, na.rm=T),\n+                                                N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),\n+                                                P2=num_median(.SD$P5J.nt.nb, na.rm=T),\n+                                                DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),\n+                                                Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),\n+                                                Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),\n+                                                Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),\n+                                                Median.CDR3.l=as.double(median(.SD$CDR3.Length))),\n+\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tby=c("Sample")])\n+  newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)\n+  write.table(newData, "junctionAnalysisUnProd_median_nD.txt" , sep="\\t",quote=F,na="-",row.names=F,col.names=F)\n }\n \n PRODF = bak\n@@ -822,12 +874,18 @@\n \tD.REGION.reading.frame[chck,"D.REGION.reading.frame"] = "No D"\n }\n \n-D.REGION.reading.frame = data.frame(data.table(D.REGION.reading.frame)[, list(Freq=.N), by=c("Sample", "D.REGION.reading.frame")])\n+D.REGION.reading.frame.1 = data.frame(data.table(D.REGION.reading.frame)[, list(Freq=.N), by=c("Sample", "D.REGION.reading.frame")])\n+\n+D.REGION.reading.frame.2 = data.frame(data.table(D.REGION.reading.frame)[, list(sample.sum=sum(as.numeric(.SD$D.REGION.reading.frame), na.rm=T)), by=c("Sample")])\n \n-write.table(D.REGION.reading.frame, "DReadingFrame.csv" , sep="\\t",quote=F,row.names=F,col.names=T)\n+D.REGION.reading.frame = merge(D.REGION.reading.frame.1, D.REGION.reading.frame.2, by="Sample")\n+\n+D.REGION.reading.frame$percentage = round(D.REGION.reading.frame$Freq / D.REGION.reading.frame$sample.sum * 100, 1)\n+\n+write.table(D.REGION.reading.frame, "DReadingFrame.txt" , sep="\\t",quote=F,row.names=F,col.names=T)\n \n D.REGION.reading.frame = ggplot(D.REGION.reading.frame)\n-D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq, fill=Sample), stat=\'identity\', position=\'dodge\' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame")\n+D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = percentage, fill=Sample), stat=\'identity\', position=\'dodge\' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame")\n D.REGION.reading.frame = D.REGION.reading.frame + scale_fill_manual(values=sample.colors)\n D.REGION.reading.frame = D.REGION.reading.frame + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())\n \n@@ -878,6 +936,6 @@\n \n # ---------------------- AA median CDR3 length ----------------------\n \n-median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(.SD$CDR3.Length))), by=c("Sample")])\n-write.table(median.aa.l, "AAMedianBySample.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F)\n+median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(as.numeric(.SD$CDR3.Length, na.rm=T), na.rm=T))), by=c("Sample")])\n+write.table(median.aa.l, "AAMedianBySample.txt" , sep="\\t",quote=F,na="-",row.names=F,col.names=F)\n \n'
b
diff -r e2fbdfacec1d -r d5d203d38c8a report_clonality/r_wrapper.sh
--- a/report_clonality/r_wrapper.sh Fri Jan 27 04:29:43 2017 -0500
+++ b/report_clonality/r_wrapper.sh Wed Feb 01 09:48:38 2017 -0500
[
b'@@ -72,7 +72,7 @@\n \t\tCIRCOSDIR="/data/galaxy/galaxy-dist/toolsheddependencies/circos/0.64/saskia-hiltemann/cg_circos_plots/bbfdd52d64fd/bin/"\n \tfi\n \t\n-\tif [ -d "/home/galaxy/Anaconda3/bin" ]; then #hopefully temporary fix\n+\tif [ -d "/home/galaxy/Anaconda3/bin" ]; then #hopefully temporary fix #or not\n \t\tUSECIRCOS="yes"\n \t\tCIRCOSTOOLS="/home/galaxy/circos/circos-tools-0.22/tools"\n \t\tCIRCOSDIR="/home/galaxy/Anaconda3/bin"\n@@ -126,7 +126,9 @@\n if [[ "$useD" == "true" ]] ; then\n \techo "<img src=\'DPlot.png\'/>" >> $outputFile\n fi\n-echo "<img src=\'JPlot.png\'/>" >> $outputFile\n+echo "<img src=\'JPlot.png\'/> <br />" >> $outputFile\n+\n+echo "<img src=\'DReadingFrame.png\'/>" >> $outputFile\n \n cat $dir/naive_gene_freq.htm >> $outputFile\n \n@@ -135,14 +137,14 @@\n echo "<div class=\'tabbertab\' title=\'CDR3 Characteristics\'>" >> $outputFile\n echo "<img src=\'CDR3LengthPlot.png\'/><br />" >> $outputFile\n echo "<img src=\'AAComposition.png\'/>" >> $outputFile\n-echo "<img src=\'DReadingFrame.png\'/>" >> $outputFile\n+\n \n echo "<table class=\'pure-table pure-table-striped\'>" >> $outputFile\n echo "<thead><tr><th>Donor</th><th>Median CDR3 Length</th></tr></thead>" >> $outputFile\n-while IFS=, read Sample median\n+while read Sample median\n do\n \techo "<tr><td>$Sample</td><td>$median</td></tr>" >> $outputFile\n-done < $outputDir/AAMedianBySample.csv\n+done < $outputDir/AAMedianBySample.txt\n echo "</table>" >> $outputFile\n \n cat $dir/naive_cdr3_char.htm >> $outputFile\n@@ -278,36 +280,67 @@\n #hasJunctionData="$(if head -n 1 $inputFile | grep -qE \'3V.REGION.trimmed.nt.nb\'; then echo \'Yes\'; else echo \'No\'; fi)"\n \n #if [[ "$hasJunctionData" == "Yes" ]] ; then\n-if [ -a "$outputDir/junctionAnalysisProd_mean.csv" ] ; then\n+if [ -a "$outputDir/junctionAnalysisProd_mean_wD.txt" ] ; then\n \techo "<div class=\'tabbertab\' title=\'Junction Analysis\'>" >> $outputFile\n \techo "<img src=\'IGH_junctie_analyse.png\' />" >> $outputFile\n \t\n+\techo "<center><p style=\'font-size: 20;\'>Unique rearrangements with a V, D and J gene assigned</p></center>" >> $outputFile\n \techo "<table class=\'pure-table pure-table-striped\' id=\'junction_table\'> <caption>Productive mean</caption><thead><tr><th>Donor</th><th>Number of sequences</th><th>V.DEL</th><th>P1</th><th>N1</th><th>P2</th><th>DEL.D</th><th>D.DEL</th><th>P3</th><th>N2</th><th>P4</th><th>DEL.J</th><th>Total.Del</th><th>Total.N</th><th>Total.P</th><th>Median.CDR3</th><thead></tr><tbody>" >> $outputFile\n-\twhile IFS=, read Sample unique VDEL P1 N1 P2 DELD DDEL P3 N2 P4 DELJ TotalDel TotalN TotalP median\n+\twhile read Sample unique VDEL P1 N1 P2 DELD DDEL P3 N2 P4 DELJ TotalDel TotalN TotalP median\n \tdo\n \t\techo "<tr><td>$Sample</td><td>$unique</td><td>$VDEL</td><td>$P1</td><td>$N1</td><td>$P2</td><td>$DELD</td><td>$DDEL</td><td>$P3</td><td>$N2</td><td>$P4</td><td>$DELJ</td><td>$TotalDel</td><td>$TotalN</td><td>$TotalP</td><td>$median</td></tr>" >> $outputFile\n-\tdone < $outputDir/junctionAnalysisProd_mean.csv\n+\tdone < $outputDir/junctionAnalysisProd_mean_wD.txt\n \techo "</tbody></table>" >> $outputFile\n \t\n \techo "<table class=\'pure-table pure-table-striped\' id=\'junction_table\'> <caption>Unproductive mean</caption><thead><tr><th>Donor</th><th>Number of sequences</th><th>V.DEL</th><th>P1</th><th>N1</th><th>P2</th><th>DEL.D</th><th>D.DEL</th><th>P3</th><th>N2</th><th>P4</th><th>DEL.J</th><th>Total.Del</th><th>Total.N</th><th>Total.P</th><th>Median.CDR3</th><thead></tr><tbody>" >> $outputFile\n-\twhile IFS=, read Sample unique VDEL P1 N1 P2 DELD DDEL P3 N2 P4 DELJ TotalDel TotalN TotalP median\n+\twhile read Sample unique VDEL P1 N1 P2 DELD DDEL P3 N2 P4 DELJ TotalDel TotalN TotalP median\n \tdo\n \t\techo "<tr><td>$Sample</td><td>$unique</td><td>$VDEL</td><td>$P1</td><td>$N1</td><td>$P2</td><td>$DELD</td><td>$DDEL</td><td>$P3</td><td>$N2</td><td>$P4</td><td>$DELJ</td><td>$TotalDel</td><td>$TotalN</td><td>$TotalP</td><td>$median</td></tr>" >> $outputFile\n-\tdone < $outputDir/junctionAnalysisUnProd_mean.csv\n+\tdone < $outputDir/junctionAnalysisUnProd_mean_wD.txt\n \techo "</t'..b'wnload</a></td></tr>" >> $outputFile\n-echo "<tr><td>The dataset used to calculate clonality score (Unique based on clonaltype, $clonalType)</td><td><a href=\'clonalityComplete.txt\'>Download</a></td></tr>" >> $outputFile\n-\n-echo "<tr><td>The dataset used to generate the CDR3 length frequency graph</td><td><a href=\'CDR3LengthPlot.csv\'>Download</a></td></tr>" >> $outputFile\n+echo "<tr><td colspan=\'2\' style=\'background-color:#E0E0E0;\'>Gene frequencies</td></tr>" >> $outputFile\n \n echo "<tr><td>The dataset used to generate the distribution of V gene families graph</td><td><a href=\'VFFrequency.txt\'>Download</a></td></tr>" >> $outputFile\n if [[ "$useD" == "true" ]] ; then\n@@ -333,19 +364,38 @@\n \techo "<tr><td>The dataset used to generate the relative frequency of D gene usage graph</td><td><a href=\'DFrequency.txt\'>Download</a></td></tr>" >> $outputFile\n fi\n echo "<tr><td>The dataset used to generate the relative frequency of J gene usage graph</td><td><a href=\'JFrequency.txt\'>Download</a></td></tr>" >> $outputFile\n+echo "<tr><td>The dataset used to generate the relative frequency of the D reading frame graph</td><td><a href=\'DReadingFrame.txt\'>Download</a></td></tr>" >> $outputFile\n+\n+echo "<tr><td colspan=\'2\' style=\'background-color:#E0E0E0;\'>CDR3 Characteristics</td></tr>" >> $outputFile\n+echo "<tr><td>The dataset used to generate the CDR3 length frequency graph</td><td><a href=\'CDR3LengthPlot.txt\'>Download</a></td></tr>" >> $outputFile\n echo "<tr><td>The dataset used to generate the Amino Acid Composition in the CDR3 graph</td><td><a href=\'AAComposition.txt\'>Download</a></td></tr>" >> $outputFile\n \n+echo "<tr><td colspan=\'2\' style=\'background-color:#E0E0E0;\'>Heatmaps</td></tr>" >> $outputFile\n for sample in $samples; do\n \tif [[ "$useD" == "true" ]] ; then\n-\t\techo "<tr><td>The data used to generate the VD heatmap for $sample.</td><td><a href=\'HeatmapVD_$sample.csv\'>Download</a></td></tr>" >> $outputFile\n+\t\techo "<tr><td>The data used to generate the VD heatmap for $sample.</td><td><a href=\'HeatmapVD_$sample.txt\'>Download</a></td></tr>" >> $outputFile\n \tfi\n-\techo "<tr><td>The data used to generate the VJ heatmap for $sample.</td><td><a href=\'HeatmapVJ_$sample.csv\'>Download</a></td></tr>" >> $outputFile\n+\techo "<tr><td>The data used to generate the VJ heatmap for $sample.</td><td><a href=\'HeatmapVJ_$sample.txt\'>Download</a></td></tr>" >> $outputFile\n \tif [[ "$useD" == "true" ]] ; then\n-\t\techo "<tr><td>The data used to generate the DJ heatmap for $sample.</td><td><a href=\'HeatmapDJ_$sample.csv\'>Download</a></td></tr>" >> $outputFile\n+\t\techo "<tr><td>The data used to generate the DJ heatmap for $sample.</td><td><a href=\'HeatmapDJ_$sample.txt\'>Download</a></td></tr>" >> $outputFile\n \tfi\n done\n \n-echo "<tr><td>A frequency count of V Gene + J Gene + CDR3</td><td><a href=\'VJCDR3_count.txt\'>Download</a></td></tr>" >> $outputFile\n+echo "<tr><td colspan=\'2\' style=\'background-color:#E0E0E0;\'>Circos</td></tr>" >> $outputFile\n+for sample in $samples; do\n+\tif [[ "$useD" == "true" ]] ; then\n+\t\techo "<tr><td>The data used to generate the VD Circos plots for $sample.</td><td><a href=\'${sample}_VD_circos.txt\'>Download</a></td></tr>" >> $outputFile\n+\tfi\n+\techo "<tr><td>The data used to generate the VJ Circos plots for $sample.</td><td><a href=\'${sample}_VJ_circos.txt\'>Download</a></td></tr>" >> $outputFile\n+\tif [[ "$useD" == "true" ]] ; then\n+\t\techo "<tr><td>The data used to generate the DJ Circos plots for $sample.</td><td><a href=\'${sample}_DJ_circos.txt\'>Download</a></td></tr>" >> $outputFile\n+\tfi\n+done\n+\n+#echo "<tr><td>A frequency count of V Gene + J Gene + CDR3</td><td><a href=\'VJCDR3_count.txt\'>Download</a></td></tr>" >> $outputFile\n+\n+echo "<tr><td colspan=\'2\' style=\'background-color:#E0E0E0;\'>Clonality</td></tr>" >> $outputFile\n+echo "<tr><td>The dataset used to calculate clonality score (Unique based on clonaltype, $clonalType)</td><td><a href=\'clonalityComplete.txt\'>Download</a></td></tr>" >> $outputFile\n \n echo "</table>" >> $outputFile\n \n'