Mercurial > repos > davidvanzessen > shm_csr
changeset 49:f5fe63533c58 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 11 May 2017 10:21:39 -0400 |
parents | c5295dd10dfc |
children | 75ee66a691a0 |
files | merge_and_filter.r shm_csr.xml wrapper.sh |
diffstat | 3 files changed, 33 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- a/merge_and_filter.r Mon May 08 09:27:27 2017 -0400 +++ b/merge_and_filter.r Thu May 11 10:21:39 2017 -0400 @@ -15,8 +15,11 @@ functionality=args[12] unique.type=args[13] filter.unique=args[14] -class.filter=args[15] -empty.region.filter=args[16] +filter.unique.count=as.numeric(args[15]) +class.filter=args[16] +empty.region.filter=args[17] + +print(paste("filter.unique.count:", filter.unique.count)) summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") @@ -96,7 +99,7 @@ if(FALSE){ #to speed up debugging set.seed(1) - summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),] + summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.1)),] print(paste("Number of sequences after sampling 5%:", nrow(summ))) filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ))) @@ -225,6 +228,12 @@ result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it + if(filter.unique == "remove"){ + unique.defs = data.frame(table(result$unique.def)) + unique.defs = unique.defs[unique.defs$Freq >= filter.unique.count,] + result = result[result$unique.def %in% unique.defs$Var1,] + } + result = result[!duplicated(result$unique.def),] }
--- a/shm_csr.xml Mon May 08 09:27:27 2017 -0400 +++ b/shm_csr.xml Thu May 11 10:21:39 2017 -0400 @@ -1,7 +1,11 @@ <tool id="shm_csr" name="SHM & CSR pipeline" version="1.0"> <description></description> <command interpreter="bash"> - wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_uniques $class_filter_cond.class_filter $empty_region_filter $fast + #if str ( $filter_unique.filter_unique_select ) == "remove": + wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select $filter_unique.filter_unique_clone_count $class_filter_cond.class_filter $empty_region_filter $fast + #else: + wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select 2 $class_filter_cond.class_filter $empty_region_filter $fast + #end if </command> <inputs> <param name="in_file" type="data" label="IMGT zip file to be analysed" /> @@ -16,11 +20,16 @@ <option value="unproductive">Unproductive (Unproductive and Unproductive see comment)</option> <option value="remove_unknown">Productive and Unproductive (Productive, Productive see comment, Unproductive, Unproductive and Unproductive see comment)</option> </param> - <param name="filter_uniques" type="select" label="Filter unique sequences" help="See below for an example."> - <option value="remove" selected="true">Remove uniques (Based on nucleotide sequence + C)</option> - <option value="keep">Keep uniques (Based on nucleotide sequence + C)</option> - <option value="no">No</option> - </param> + <conditional name="filter_unique"> + <param name="filter_unique_select" type="select" label="Filter unique sequences" help="See below for an example."> + <option value="remove" selected="true">Remove uniques (Based on nucleotide sequence + C)</option> + <option value="keep">Keep uniques (Based on nucleotide sequence + C)</option> + <option value="no">No</option> + </param> + <when value="remove"> + <param name="filter_unique_clone_count" size="4" type="integer" label="How many sequences should be in a group to keep 1 of them" value="2" min="2"/> + </when> + </conditional> <param name="unique" type="select" label="Remove duplicates based on" help="" > <option value="VGene,CDR3.IMGT.AA,best_match_class">Top.V.Gene, CDR3 (AA), C region</option> <option value="VGene,CDR3.IMGT.AA">Top.V.Gene, CDR3 (AA)</option>
--- a/wrapper.sh Mon May 08 09:27:27 2017 -0400 +++ b/wrapper.sh Thu May 11 10:21:39 2017 -0400 @@ -17,9 +17,11 @@ naive_output_ce=${13} naive_output_all=${14} filter_unique=${15} -class_filter=${16} -empty_region_filter=${17} -fast=${18} +filter_unique_count=${16} +class_filter=${17} +empty_region_filter=${18} +fast=${19} + mkdir $outdir tar -xzf $dir/style.tar.gz -C $outdir @@ -65,7 +67,7 @@ echo "---------------- merge_and_filter.r ----------------" echo "---------------- merge_and_filter.r ----------------<br />" >> $log -Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt "$PWD/gapped_aa.txt" $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${class_filter} ${empty_region_filter} 2>&1 +Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt "$PWD/gapped_aa.txt" $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${filter_unique_count} ${class_filter} ${empty_region_filter} 2>&1 if [[ "$fast" == "no" ]] ; then