Repository 'shm_csr'
hg clone https://toolshed.g2.bx.psu.edu/repos/davidvanzessen/shm_csr

Changeset 56:ee807645b224 (2017-07-17)
Previous changeset 55:6cd12c71c3d3 (2017-06-14) Next changeset 57:cb779a45537b (2017-07-18)
Commit message:
Uploaded
modified:
merge_and_filter.r
shm_csr.xml
wrapper.sh
added:
check_unique_id.r
b
diff -r 6cd12c71c3d3 -r ee807645b224 check_unique_id.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/check_unique_id.r Mon Jul 17 10:44:40 2017 -0400
[
@@ -0,0 +1,25 @@
+args <- commandArgs(trailingOnly = TRUE) #first argument must be the summary file so it can grab the 
+
+current_file = args[1]
+
+current = read.table(current_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="", check.names=F)
+
+if(!("Sequence number" %in% names(current))){
+ stop("First argument doesn't contain the 'Sequence number' column")
+}
+
+tbl = table(current$Sequence.ID)
+l_tbl = length(tbl)
+check = any(tbl > 1)
+
+#if(l_tbl != nrow(current)){ # non unique IDs?
+if(check){
+ print("Sequence.ID is not unique for every sequence, adding sequence number to IDs")
+ for(i in 1:length(args)){
+ current_file = args[i]
+ print(paste("Appending 'Sequence number' column to 'Sequence ID' column in", current_file))
+ current = read.table(current_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="", check.names=F)
+ current[,"Sequence ID"] = paste(current[,"Sequence ID"], current[,"Sequence number"], sep="_")
+ write.table(x = current, file = current_file, quote = F, sep = "\t", na = "", row.names = F, col.names = T)
+ }
+}
b
diff -r 6cd12c71c3d3 -r ee807645b224 merge_and_filter.r
--- a/merge_and_filter.r Wed Jun 14 11:14:00 2017 -0400
+++ b/merge_and_filter.r Mon Jul 17 10:44:40 2017 -0400
[
@@ -41,6 +41,11 @@
     return(df)
 }
 
+fix_non_unique_ids = function(df){
+ df$Sequence.ID = paste(df$Sequence.ID, 1:nrow(df))
+ return(df)
+}
+
 summ = fix_column_names(summ)
 sequences = fix_column_names(sequences)
 mutationanalysis = fix_column_names(mutationanalysis)
@@ -79,6 +84,8 @@
 
 summ = merge(summ, gene_identification, by="Sequence.ID")
 
+print(paste("Number of sequences after merging with gene identification:", nrow(summ)))
+
 summ = summ[summ$Functionality != "No results",]
 
 print(paste("Number of sequences after 'No results' filter:", nrow(summ)))
@@ -99,15 +106,21 @@
 
 if(F){ #to speed up debugging
     set.seed(1)
-    summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),]
-    print(paste("Number of sequences after sampling 5%:", nrow(summ)))
+    summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.03)),]
+    print(paste("Number of sequences after sampling 3%:", nrow(summ)))
 
-    filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ)))
+    filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 3%", nrow(summ)))
 }
 
 print("mutation analysis files columns")
 print(names(mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])]))
 
+print(head(summ$Sequence.ID))
+
+print("_-------------------------------------")
+
+print(head(mutationanalysis$Sequence.ID))
+
 result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID")
 
 print(paste("Number of sequences after merging with mutation analysis file:", nrow(result)))
b
diff -r 6cd12c71c3d3 -r ee807645b224 shm_csr.xml
--- a/shm_csr.xml Wed Jun 14 11:14:00 2017 -0400
+++ b/shm_csr.xml Mon Jul 17 10:44:40 2017 -0400
[
@@ -1,5 +1,12 @@
 <tool id="shm_csr" name="SHM &amp; CSR pipeline" version="1.0">
  <description></description>
+ <requirements>
+ <requirement type="package" version="3.1_3">r-seqinr</requirement>
+ <requirement type="package" version="2.2.0">r-ggplot2</requirement>
+ <requirement type="package" version="1.4.2">r-reshape2</requirement>
+ <requirement type="package" version="0.4.1">r-scales</requirement>
+ <requirement type="package" version="1.10.0">r-data.table</requirement>
+ </requirements>
  <command interpreter="bash">
  #if str ( $filter_unique.filter_unique_select ) == "remove":
  wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select $filter_unique.filter_unique_clone_count $class_filter_cond.class_filter $empty_region_filter $fast
@@ -8,7 +15,7 @@
  #end if
  </command>
  <inputs>
- <param name="in_file" type="data" label="IMGT zip file to be analysed" />
+ <param name="in_file" type="data" format="data" label="IMGT zip file to be analysed" />
  <param name="empty_region_filter" type="select" label="Sequence starts at" help="" >
  <option value="leader" selected="true">Leader: include FR1, CDR1, FR2, CDR2, FR3 in filters</option>
  <option value="FR1" selected="true">FR1: include CDR1,FR2,CDR2,FR3 in filters</option>
@@ -29,6 +36,8 @@
  <when value="remove">
  <param name="filter_unique_clone_count" size="4" type="integer" label="How many sequences should be in a group to keep 1 of them" value="2" min="2"/>
  </when>
+ <when value="keep"></when>
+ <when value="no"></when>
  </conditional>
  <param name="unique" type="select" label="Remove duplicates based on" help="" >
  <option value="VGene,CDR3.IMGT.AA,best_match_class">Top.V.Gene, CDR3 (AA), C region</option>
@@ -51,12 +60,20 @@
  <option value="19_0">>19% class</option>
  <option value="101_101">Do not assign (sub)class</option>
  </param>
+ <when value="70_70"></when>
+ <when value="60_55"></when>
+ <when value="70_0"></when>
+ <when value="60_0"></when>
+ <when value="19_0"></when>
+ <when value="101_101"></when>
  </conditional>
  <conditional name="naive_output_cond">
  <param name="naive_output" type="select" label="Output new IMGT archives per class into your history?">
  <option value="yes">Yes</option>
  <option value="no" selected="true">No</option>
  </param>
+ <when value="yes"></when>
+ <when value="no"></when>
  </conditional>
  <param name="fast" type="select" label="Fast" help="Skips generating the new ZIP files and Change-O/Baseline" >
  <option value="yes">Yes</option>
@@ -86,10 +103,12 @@
      <filter>class_filter_cond['class_filter'] == "101_101"</filter>
  </data>
  </outputs>
- <citations>
- <citation type="doi">10.1093/nar/gks457</citation>
- <citation type="doi">10.1093/bioinformatics/btv359</citation>
- </citations>
+ <tests>
+ <test>
+ <param name="fast" value="yes"/>
+ <output name="out_file" file="test1.html"/>
+ </test>
+ </tests>
  <help>
 <![CDATA[
 **References**
@@ -210,4 +229,8 @@
 
 ]]>
  </help>
+ <citations>
+ <citation type="doi">10.1093/nar/gks457</citation>
+ <citation type="doi">10.1093/bioinformatics/btv359</citation>
+ </citations>
 </tool>
b
diff -r 6cd12c71c3d3 -r ee807645b224 wrapper.sh
--- a/wrapper.sh Wed Jun 14 11:14:00 2017 -0400
+++ b/wrapper.sh Mon Jul 17 10:44:40 2017 -0400
[
@@ -41,6 +41,10 @@
  echo "tar -xJf $input -C $PWD/files/"
  mkdir -p "$PWD/files/$title"
  tar -xJf $input -C "$PWD/files/$title"
+else
+ echo "Unrecognized format $type"
+ echo "Unrecognized format $type" > $log
+ exit 1
 fi
 
 cat "`find $PWD/files/ -name "1_*"`" > $PWD/summary.txt
@@ -52,6 +56,10 @@
 cat "`find $PWD/files/ -name "8_*"`" > $PWD/mutationstats.txt
 cat "`find $PWD/files/ -name "10_*"`" > $PWD/hotspots.txt
 
+echo "---------------- unique id check ----------------"
+
+Rscript $dir/check_unique_id.r $PWD/summary.txt $PWD/sequences.txt $PWD/gapped_aa.txt $PWD/aa.txt $PWD/junction.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt
+
 if [[ ${#BLASTN_DIR} -ge 5 ]] ; then
  echo "On server, using BLASTN_DIR env: ${BLASTN_DIR}"
 else
@@ -69,7 +77,7 @@
 
 Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt "$PWD/gapped_aa.txt" $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${filter_unique_count} ${class_filter} ${empty_region_filter} 2>&1
 
-if [[ "$fast" == "no" ]] ; then
+if [[ "${naive_output}" == "yes" ]] ; then
 
  echo "---------------- creating new IMGT zips ----------------"
  echo "---------------- creating new IMGT zips ----------------<br />" >> $log