Mercurial > repos > davidvanzessen > shm_csr
changeset 12:6b66c1c57f22 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 10 Nov 2016 08:36:18 -0500 |
parents | c4ab5034c4d4 |
children | 933fb21568ce |
files | merge_and_filter.r shm_csr.xml |
diffstat | 1 files changed, 19 insertions(+), 17 deletions(-) [+] |
line wrap: on
line diff
--- a/merge_and_filter.r Wed Nov 09 10:42:25 2016 -0500 +++ b/merge_and_filter.r Thu Nov 10 08:36:18 2016 -0500 @@ -90,6 +90,22 @@ result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) result$JGene = gsub("[*].*", "", result$JGene) +splt = strsplit(class.filter, "_")[[1]] +chunk_hit_threshold = as.numeric(splt[1]) +nt_hit_threshold = as.numeric(splt[2]) + +higher_than=(result$chunk_hit_percentage >= chunk_hit_threshold & result$nt_hit_percentage >= nt_hit_threshold) + +if(!all(higher_than, na.rm=T)){ #check for no unmatched + result[!higher_than,"best_match"] = paste("unmatched,", result[!higher_than,"best_match"]) +} + +if(class.filter == "101_101"){ + result$best_match = "all" +} + +write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T) + print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == ""))) print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == ""))) print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == ""))) @@ -138,20 +154,6 @@ result[is.na(result[,col]),] = 0 } -splt = strsplit(class.filter, "_")[[1]] -chunk_hit_threshold = as.numeric(splt[1]) -nt_hit_threshold = as.numeric(splt[2]) - -higher_than=(result$chunk_hit_percentage >= chunk_hit_threshold & result$nt_hit_percentage >= nt_hit_threshold) - -if(!all(higher_than, na.rm=T)){ #check for no unmatched - result[!higher_than,"best_match"] = paste("unmatched,", result[!higher_than,"best_match"]) -} - -if(class.filter == "101_101"){ - result$best_match = "all" -} - write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) if(filter.unique != "no"){ @@ -166,13 +168,13 @@ } else if(empty.region.filter == "FR2"){ result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) } - + if(grepl("keep", filter.unique)){ - result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes + result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes result = result[!duplicated(result$unique.def),] } else { result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] - result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes + result$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don't have a class after it result = result[!duplicated(result$unique.def),] } }