changeset 12:6b66c1c57f22 draft

Uploaded
author davidvanzessen
date Thu, 10 Nov 2016 08:36:18 -0500
parents c4ab5034c4d4
children 933fb21568ce
files merge_and_filter.r shm_csr.xml
diffstat 1 files changed, 19 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/merge_and_filter.r	Wed Nov 09 10:42:25 2016 -0500
+++ b/merge_and_filter.r	Thu Nov 10 08:36:18 2016 -0500
@@ -90,6 +90,22 @@
 result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
 result$JGene = gsub("[*].*", "", result$JGene)
 
+splt = strsplit(class.filter, "_")[[1]]
+chunk_hit_threshold = as.numeric(splt[1])
+nt_hit_threshold = as.numeric(splt[2])
+
+higher_than=(result$chunk_hit_percentage >= chunk_hit_threshold & result$nt_hit_percentage >= nt_hit_threshold)
+
+if(!all(higher_than, na.rm=T)){ #check for no unmatched
+	result[!higher_than,"best_match"] = paste("unmatched,", result[!higher_than,"best_match"])
+}
+
+if(class.filter == "101_101"){
+	result$best_match = "all"
+}
+
+write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T)
+
 print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "")))
 print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "")))
 print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "")))
@@ -138,20 +154,6 @@
   result[is.na(result[,col]),] = 0
 }
 
-splt = strsplit(class.filter, "_")[[1]]
-chunk_hit_threshold = as.numeric(splt[1])
-nt_hit_threshold = as.numeric(splt[2])
-
-higher_than=(result$chunk_hit_percentage >= chunk_hit_threshold & result$nt_hit_percentage >= nt_hit_threshold)
-
-if(!all(higher_than, na.rm=T)){ #check for no unmatched
-	result[!higher_than,"best_match"] = paste("unmatched,", result[!higher_than,"best_match"])
-}
-
-if(class.filter == "101_101"){
-	result$best_match = "all"
-}
-
 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
 
 if(filter.unique != "no"){
@@ -166,13 +168,13 @@
 	} else if(empty.region.filter == "FR2"){
 		result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
 	}
-
+	
 	if(grepl("keep", filter.unique)){
-		result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
+		result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes
 		result = result[!duplicated(result$unique.def),]
 	} else {
 		result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]
-		result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes
+		result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it
 		result = result[!duplicated(result$unique.def),]
 	}
 }