changeset 10:4b695ca65213 draft

Uploaded
author davidvanzessen
date Wed, 09 Nov 2016 09:45:32 -0500 (2016-11-09)
parents 372ccdcf0b2d
children c4ab5034c4d4
files merge_and_filter.r shm_csr.xml
diffstat 2 files changed, 7 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/merge_and_filter.r	Tue Nov 08 07:32:54 2016 -0500
+++ b/merge_and_filter.r	Wed Nov 09 09:45:32 2016 -0500
@@ -25,7 +25,7 @@
 gene_identification = read.table(gene_identification_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
 
 if(method == "blastn"){
-	"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
+	#"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
 	gene_identification = gene_identification[!duplicated(gene_identification$qseqid),]
 	ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52))
 	gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T)
@@ -144,12 +144,7 @@
 
 higher_than=(result$chunk_hit_percentage >= chunk_hit_threshold & result$nt_hit_percentage >= nt_hit_threshold)
 
-unmatched=result[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
-
 if(!all(higher_than, na.rm=T)){ #check for no unmatched
-	unmatched = result[!higher_than,]
-	unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
-	unmatched$best_match = paste("unmatched,", unmatched$best_match)
 	result[!higher_than,"best_match"] = paste("unmatched,", result[!higher_than,"best_match"])
 }
 
@@ -200,6 +195,8 @@
 
 filtering.steps = rbind(filtering.steps, c("After remove duplicates based on filter", nrow(result)))
 
+unmatched = result[grepl("^unmatched", result$best_match),c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
+
 print(paste("Number of rows in result:", nrow(result)))
 print(paste("Number of rows in unmatched:", nrow(unmatched)))
 
--- a/shm_csr.xml	Tue Nov 08 07:32:54 2016 -0500
+++ b/shm_csr.xml	Wed Nov 09 09:45:32 2016 -0500
@@ -17,12 +17,12 @@
 			<option value="remove_unknown">Productive and Unproductive (Productive, Productive see comment, Unproductive, Unproductive and Unproductive see comment)</option>
 		</param>
 		<param name="filter_uniques" type="select" label="Filter unique sequences" help="See below for an example.">
-			<option value="remove">Remove uniques (Based on nucleotide sequence + C)</option>
+			<option value="remove" selected="true">Remove uniques (Based on nucleotide sequence + C)</option>
 			<option value="keep">Keep uniques (Based on nucleotide sequence + C)</option>
-			<option value="no" selected="true">No</option>
+			<option value="no">No</option>
 		</param>
 		<param name="unique" type="select" label="Remove duplicates based on" help="" >
-			<option value="VGene,AA.JUNCTION,best_match" selected="true">Top.V.Gene, CDR3 (AA), C region</option>
+			<option value="VGene,AA.JUNCTION,best_match">Top.V.Gene, CDR3 (AA), C region</option>
 			<option value="VGene,AA.JUNCTION">Top.V.Gene, CDR3 (AA)</option>
 			<option value="AA.JUNCTION,best_match">CDR3 (AA), C region</option>
 			<option value="AA.JUNCTION">CDR3 (AA)</option>
@@ -31,7 +31,7 @@
 			<option value="VGene,CDR3.IMGT.seq">Top.V.Gene, CDR3 (nt)</option>
 			<option value="CDR3.IMGT.seq,best_match">CDR3 (nt), C region</option>
 			<option value="CDR3.IMGT.seq">CDR3 (nt)</option>
-			<option value="Sequence.ID">Don't remove duplicates</option>
+			<option value="Sequence.ID" selected="true">Don't remove duplicates</option>
 		</param>
 		<param name="class_filter" type="select" label="Human Class/Subclass filter" help="" >
 			<option value="70_70" selected="true">>70% class and >70% subclass</option>