Mercurial > repos > iuc > hgvsparser
view parseHGVS.R @ 0:c12a4d187121 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hgvsparser/ commit f9deb29cdbd2d2a5f2f4fbd470b1078431a36ae0
author | iuc |
---|---|
date | Fri, 07 Jun 2024 15:21:07 +0000 |
parents | |
children |
line wrap: on
line source
# Copyright (C) 2018 Jochen Weile, Roth Lab # # This file is part of hgvsParseR. # # hgvsParseR is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # hgvsParseR is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with hgvsParseR. If not, see <https://www.gnu.org/licenses/>. #' HGVS Parser #' #' Parses HGVS strings #' @param strings A character vector containing the HGVS strings #' @param aacode allowed values: 1, 3, or NA. Determines whether 1-letter codes or 3-letter codes should be forced. NA uses input format. #' @return A \code{data.frame} with the following columns: #' @keywords HGVS parsing #' @export #' @examples #' result <- parseHGVS(c("g.1318G>T","c.123_125inv","p.R123_L152del")) parseHGVS <- function(strings,aacode=c(NA,1,3)) { #Check that parameters are valid if (!is.character(strings)) { stop("Input for 'parse' function must be a character vector! Found '",class(strings),"' instead.") } aacode <- aacode[[1]] if (!is.na(aacode) && !(aacode %in% c(1,3))) { warning("Invalid aacode parameter, defaulting to NA!") aacode <- NA } #Helper function: turns a list of lists (lol) in to a dataframe to.df <- function(lol) { colnames <- unique(do.call(c,lapply(lol,names))) columns <- lapply(colnames,function(cn) sapply(lol,function(row) { if (cn %in% names(row)) row[[cn]] else NA })) names(columns) <- colnames empty <- which(sapply(columns,function(xs)all(is.na(xs)))) columns[empty] <- NULL do.call(data.frame,columns) } # ### # # Binds matrices of same size together to a 3D matrix, analogously # # to cbind and rbind. # # # zbind <- function(...) { # x <- list(...) # y <- array(0,dim=c(nrow(x[[1]]),ncol(x[[1]]),length(x)),dimnames=dimnames(x[[1]])) # for (i in 1:length(x)) y[,,i] <- x[[i]] # y # } ### # Function to *locally* excise regex groups from string vectors. # I.e. only extract the first occurrence of each group within each string. # x = string vector # re = regular expression with groups # extract.groups <- function(x, re) { matches <- regexpr(re,x,perl=TRUE) start <- attr(matches,"capture.start") end <- start + attr(matches,"capture.length") - 1 do.call(cbind,lapply(1:ncol(start), function(i) { sapply(1:nrow(start),function(j){ if (start[j,i] > -1) substr(x[[j]],start[j,i],end[j,i]) else NA }) })) } # ### # # Function to *globally* excise regex groups from string vectors. # # x = string vector # # re = regular expression with groups # # # global.extract.groups <- function(x,re) { # all.matches <- gregexpr(re,x,perl=TRUE) # mapply(function(matches,x) { # start <- attr(matches,"capture.start") # end <- start + attr(matches,"capture.length") - 1 # apply(zbind(start,end),c(1,2),function(pos) substr(x,pos[[1]],pos[[2]]) ) # },matches=all.matches,x=x,SIMPLIFY=FALSE) # } ### # Helper function to split multi-mutant bodies into their individual # elements. Returns a vector of strings containing these elements. # An attribute "multi" is attached to the vector, detailing the type # of multi-mutant # splitMulti <- function(body) { if (regexpr("\\[.+\\];\\[.+\\]",body) > 0) { out <- strsplit(substr(body,2,nchar(body)-1),"\\];\\[")[[1]] attr(out,"multi") <- "trans" } else if (regexpr("\\[.+\\(;\\).+\\]",body) > 0) { out <- strsplit(substr(body,2,nchar(body)-1),"\\(;\\)")[[1]] attr(out,"multi") <- "unknown" } else if (regexpr("\\[.+;.+\\]",body) > 0) { out <- strsplit(substr(body,2,nchar(body)-1),";")[[1]] attr(out,"multi") <- "cis" } else { out <- body attr(out,"multi") <- "single" } return(out) } ### # Helper function: # Given an HGVS body and a list of regexes corresponding to types, # find the (first) matching type. findType <- function(body,types) { i <- 0 found <- done <- FALSE while (!found && !done) { found <- regexpr(types[[i <- i+1]],body) > 0 done <- i >= length(types) } if (found) { return(names(types)[[i]]) } else { return("invalid") } } out <- lapply(strings,function(s) { if (regexpr("^[gcnmrp]\\.",s) < 1) { return(list(list(hgvs=s,subject="invalid",type="invalid"))) } body <- substr(s,3,nchar(s)) subbodies <- splitMulti(body) subjects <- c( g="genomic",c="coding",n="noncoding", m="mitochondrial",r="rna",p="protein" ) subject <- subjects[[substr(s,1,1)]] if (subject=="genomic") { types <- c( substitution="\\d+[ACGT]>[ACGT]", singledeletion="^\\d+del$", deletion="\\d+_\\d+del$",inversion="\\d+_\\d+inv", duplication="\\d+_\\d+dup",insertion="\\d+_\\d+ins[ATCG]+", conversion="\\d+_\\d+con\\d+_\\d+",delins="\\d+_\\d+delins[ATCG]+", amplification="\\d+_\\d+\\[\\d+\\]" ) phasing <- attr(subbodies,"multi") isMulti <- length(subbodies) > 1 lapply(1:length(subbodies), function(i.multi) { body <- subbodies[[i.multi]] type <- findType(body,types) if (type == "substitution") { groups <- extract.groups(body,"(\\d+)([ACGT])>([ACGT])")[1,] position <- as.integer(groups[[1]]) ancestral <- groups[[2]] variant <- groups[[3]] if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=position,ancestral=ancestral,variant=variant)) } else { return(list(hgvs=s,subject=subject,type=type,start=position, ancestral=ancestral,variant=variant)) } } else if (type == "singledeletion") { groups <- extract.groups(body,"(\\d+)del")[1,] position <- as.integer(groups[[1]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=position)) } else { return(list(hgvs=s,subject=subject,type=type,start=position)) } } else if (type == "deletion") { groups <- extract.groups(body,"(\\d+)_(\\d+)del") start <- as.integer(groups[[1]]) end <- as.integer(groups[[2]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,end=end)) } else { return(list(hgvs=s,subject=subject,type=type,start=start,end=end)) } } else if (type == "inversion") { groups <- extract.groups(body,"(\\d+)_(\\d+)inv") start <- as.integer(groups[[1]]) end <- as.integer(groups[[2]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,end=end)) } else { return(list(hgvs=s,subject=subject,type=type,start=start,end=end)) } } else if (type == "duplication") { groups <- extract.groups(body,"(\\d+)_(\\d+)dup") start <- as.integer(groups[[1]]) end <- as.integer(groups[[2]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,end=end)) } else { return(list(hgvs=s,subject=subject,type=type,start=start,end=end)) } } else if (type == "insertion") { groups <- extract.groups(body,"(\\d+)_(\\d+)ins([ATCG]+)") start <- as.integer(groups[[1]]) end <- as.integer(groups[[2]]) if (abs(end-start)!=1) { warning("Invalid insertion definition: Start and end positions must be adjacent!") } variant <- groups[[3]] if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,end=end,variant=variant)) } else { return(list(hgvs=s,subject=subject,type=type,start=start,end=end, variant=variant)) } } else if (type == "conversion") { groups <- extract.groups(body,"(\\d+)_(\\d+)con(\\d+)_(\\d+)") start <- as.integer(groups[[1]]) end <- as.integer(groups[[2]]) tStart <- as.integer(groups[[3]]) tEnd <- as.integer(groups[[4]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,end=end, templateStart=tStart,templateEnd=tEnd)) } else { return(list(hgvs=s,subject=subject,type=type,start=start,end=end, templateStart=tStart,templateEnd=tEnd)) } } else if (type == "delins") { groups <- extract.groups(body,"(\\d+)_(\\d+)delins([ATCG]+)") start <- as.integer(groups[[1]]) end <- as.integer(groups[[2]]) variant <- groups[[3]] if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,end=end,variant=variant)) } else { return(list(hgvs=s,subject=subject,type=type,start=start,end=end, variant=variant)) } } else if (type == "amplification") { groups <- extract.groups(body,"(\\d+)_(\\d+)\\[(\\d+)\\]") start <- as.integer(groups[[1]]) end <- as.integer(groups[[2]]) copies <- as.integer(groups[[3]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,end=end,copies=copies)) } else { return(list(hgvs=s,subject=subject,type=type, start=start,end=end,copies=copies)) } } else { if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,type="invalid")) } else { return(list(hgvs=s,subject=subject,type="invalid")) } } }) } else if (subject=="coding") { #coding needs to be handled separately from genomic, as the syntax may differ #e.g. it allows for offset descriptions relative to exon-intron borders types <- c( substitution="\\d+([+-]\\d+)?[ACGT]>[ACGT]", singledeletion="^\\d+([+-]\\d+)?del$", deletion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?del$", inversion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?inv", duplication="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?dup", insertion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?ins[ATCG]+", conversion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?con\\d+([+-]\\d+)?_\\d+([+-]\\d+)?", delins="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?delins[ATCG]+", amplification="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?\\[\\d+\\]" ) phasing <- attr(subbodies,"multi") isMulti <- length(subbodies) > 1 lapply(1:length(subbodies), function(i.multi) { body <- subbodies[[i.multi]] type <- findType(body,types) if (type == "substitution") { groups <- extract.groups(body,"(\\d+)([+-]\\d+)?([ACGT])>([ACGT])")[1,] position <- as.integer(groups[[1]]) intronOffset <- as.integer(groups[[2]]) ancestral <- groups[[3]] variant <- groups[[4]] if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=position,startIntron=intronOffset,ancestral=ancestral, variant=variant)) } else { return(list(hgvs=s,subject=subject,type=type,start=position, startIntron=intronOffset,ancestral=ancestral,variant=variant)) } } else if (type == "singledeletion") { groups <- extract.groups(body,"(\\d+)([+-]\\d+)?del")[1,] position <- as.integer(groups[[1]]) intronOffset <- as.integer(groups[[2]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=position,startIntron=intronOffset)) } else { return(list(hgvs=s,subject=subject,type=type,start=position, startIntron=intronOffset)) } } else if (type == "deletion") { groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?del") start <- as.integer(groups[[1]]) intronOffset <- as.integer(groups[[2]]) end <- as.integer(groups[[3]]) intronOffset2 <- as.integer(groups[[4]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,startIntron=intronOffset, end=end,endIntron=intronOffset2)) } else { return(list(hgvs=s,subject=subject,type=type,start=start, startIntron=intronOffset,end=end,endIntron=intronOffset2)) } } else if (type == "inversion") { groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?inv") start <- as.integer(groups[[1]]) intronOffset <- as.integer(groups[[2]]) end <- as.integer(groups[[3]]) intronOffset2 <- as.integer(groups[[4]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,startIntron=intronOffset, end=end,endIntron=intronOffset2)) } else { return(list(hgvs=s,subject=subject,type=type,start=start, startIntron=intronOffset,end=end,endIntron=intronOffset2)) } } else if (type == "duplication") { groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?dup") start <- as.integer(groups[[1]]) intronOffset <- as.integer(groups[[2]]) end <- as.integer(groups[[3]]) intronOffset2 <- as.integer(groups[[4]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,startIntron=intronOffset, end=end,endIntron=intronOffset2)) } else { return(list(hgvs=s,subject=subject,type=type,start=start, startIntron=intronOffset,end=end,endIntron=intronOffset2)) } } else if (type == "insertion") { groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?ins([ATCG]+)") start <- as.integer(groups[[1]]) intronOffset <- as.integer(groups[[2]]) end <- as.integer(groups[[3]]) intronOffset2 <- as.integer(groups[[4]]) variant <- groups[[5]] if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,startIntron=intronOffset, end=end,endIntron=intronOffset2,variant=variant)) } else { return(list(hgvs=s,subject=subject,type=type,start=start, startIntron=intronOffset,end=end,endIntron=intronOffset2,variant=variant)) } } else if (type == "conversion") { groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?con(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?") start <- as.integer(groups[[1]]) intronOffset <- as.integer(groups[[2]]) end <- as.integer(groups[[3]]) intronOffset2 <- as.integer(groups[[4]]) tStart <- as.integer(groups[[5]]) intronOffset3 <- as.integer(groups[[6]]) tEnd <- as.integer(groups[[7]]) intronOffset4 <- as.integer(groups[[8]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,startIntron=intronOffset, end=end,endIntron=intronOffset2,templateStart=tStart, templateStartIntron=intronOffset3,templateEnd=tEnd, templateEndIntron=intronOffset4)) } else { return(list(hgvs=s,subject=subject,type=type, start=start,startIntron=intronOffset, end=end,endIntron=intronOffset2,templateStart=tStart, templateStartIntron=intronOffset3,templateEnd=tEnd, templateEndIntron=intronOffset4)) } } else if (type == "delins") { groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?delins([ATCG]+)") start <- as.integer(groups[[1]]) intronOffset <- as.integer(groups[[2]]) end <- as.integer(groups[[3]]) intronOffset2 <- as.integer(groups[[4]]) variant <- groups[[5]] if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,startIntron=intronOffset, end=end,endIntron=intronOffset2,variant=variant)) } else { return(list(hgvs=s,subject=subject,type=type,start=start, startIntron=intronOffset,end=end,endIntron=intronOffset2,variant=variant)) } } else if (type == "amplification") { groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?\\[(\\d+)\\]") start <- as.integer(groups[[1]]) intronOffset <- as.integer(groups[[2]]) end <- as.integer(groups[[3]]) intronOffset2 <- as.integer(groups[[4]]) copies <- as.integer(groups[[3]]) if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=start,startIntron=intronOffset, end=end,endIntron=intronOffset2,copies=copies)) } else { return(list(hgvs=s,subject=subject,type=type,start=start, startIntron=intronOffset,end=end,endIntron=intronOffset2,copies=copies)) } } else { if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type="invalid")) } else { return(list(hgvs=s,subject=subject,type="invalid")) } } }) } else if (subject=="protein") { one2three <- c(A="Ala",C="Cys",D="Asp",E="Glu",F="Phe",G="Gly",H="His", I="Ile",K="Lys",L="Leu",M="Met",N="Asn",P="Pro",Q="Gln",R="Arg", S="Ser",T="Thr",V="Val",W="Trp",Y="Tyr",`*`="Ter") three2one <- c(Ala="A",Arg="R",Asn="N",Asp="D",Cys="C",Gln="Q",Glu="E", Gly="G",His="H",Ile="I",Leu="L",Lys="K",Met="M",Phe="F",Pro="P", Ser="S",Thr="T",Trp="W",Tyr="Y",Val="V",Ter="*") codes <- paste(c(one2three,three2one[-21],"\\*"),collapse="|") types <- list( synonymous1="^=$", synonymous2=paste0("^(",codes,")(\\d+)=$"), substitution=paste0("^(",codes,")(\\d+)(",codes,")$"), singledeletion=paste0("^(",codes,")(\\d+)del$"), deletion=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)del$"), duplication=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)dup$"), insertion=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)ins((",codes,")+)$"), delins=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)delins((",codes,")+)$"), frameshift1=paste0("^(",codes,")(\\d+)fs$"), frameshift2=paste0("^(",codes,")(\\d+)(",codes,")fs(Ter|\\*)(\\d+)$") ) phasing <- attr(subbodies,"multi") isMulti <- length(subbodies) > 1 lapply(1:length(subbodies), function(i.multi) { body <- subbodies[[i.multi]] type <- findType(body,types) if (type == "synonymous1") { if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,type=type)) } else { return(list(hgvs=s,subject=subject,type="synonymous")) } } else if (type == "synonymous2") { groups <- extract.groups(body,types$synonymous2) aa1 <- groups[[1]] pos <- as.integer(groups[[2]]) if (aa1 %in% c(one2three,three2one)) { if (is.na(aacode)) { #do nothing } else if (aacode == 1) { if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] } else if (aacode ==3) { if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] } else { #this should never happen, as it's supposed to be detected at start of function stop("Invalid aacode. If you see this, report this as a bug!") } if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type="synonymous",start=pos,ancestral=aa1)) } else { return(list(hgvs=s,subject=subject,type="synonymous",start=pos, ancestral=aa1)) } } else {#not valid amino acid if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type="invalid")) } else { return(list(hgvs=s,subject=subject,type="invalid")) } } } else if (type == "substitution") { groups <- extract.groups(body,types$substitution) aa1 <- groups[[1]] pos <- as.integer(groups[[2]]) aa2 <- groups[[3]] if (aa1 %in% c(one2three,three2one) && aa2 %in% c(one2three,three2one)) { if (is.na(aacode)) { #do nothing } else if (aacode == 1) { if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] if (nchar(aa2) == 3) aa2 <- three2one[[aa2]] } else if (aacode ==3) { if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] if (nchar(aa2) == 1) aa2 <- one2three[[aa2]] } else { #this should never happen, as it's supposed to be detected at start of function stop("Invalid aacode. If you see this, report this as a bug!") } if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=pos,ancestral=aa1,variant=aa2)) } else { return(list(hgvs=s,subject=subject,type=type,start=pos, ancestral=aa1,variant=aa2)) } } else {#not valid amino acid if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type="invalid")) } else { return(list(hgvs=s,subject=subject,type="invalid")) } } } else if (type == "singledeletion") { groups <- extract.groups(body,types$singledeletion) aa1 <- groups[[1]] pos <- as.integer(groups[[2]]) if (is.na(aacode)) { #do nothing } else if (aacode == 1) { if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] } else if (aacode == 3) { if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] } else { #this should never happen, as it's supposed to be detected at start of function stop("Invalid aacode. If you see this, report this as a bug!") } if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=pos,ancestral=aa1)) } else { return(list(hgvs=s,subject=subject,type=type,start=pos,ancestral=aa1)) } } else if (type == "deletion") { groups <- extract.groups(body,types$deletion) aa1 <- groups[[1]] pos <- as.integer(groups[[2]]) aa2 <- groups[[3]] pos2 <- as.integer(groups[[4]]) if (is.na(aacode)) { #do nothing } else if (aacode == 1) { if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] if (nchar(aa2) == 3) aa2 <- three2one[[aa2]] } else if (aacode == 3) { if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] if (nchar(aa2) == 1) aa2 <- one2three[[aa2]] } else { #this should never happen, as it's supposed to be detected at start of function stop("Invalid aacode. If you see this, report this as a bug!") } if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2)) } else { return(list(hgvs=s,subject=subject,type=type,start=pos, ancestral=aa1,end=pos2,ancestral2=aa2)) } } else if (type == "duplication") { groups <- extract.groups(body,types$duplication) aa1 <- groups[[1]] pos <- as.integer(groups[[2]]) aa2 <- groups[[3]] pos2 <- as.integer(groups[[4]]) if (is.na(aacode)) { #do nothing } else if (aacode == 1) { if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] if (nchar(aa2) == 3) aa2 <- three2one[[aa2]] } else if (aacode == 3) { if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] if (nchar(aa2) == 1) aa2 <- one2three[[aa2]] } else { #this should never happen, as it's supposed to be detected at start of function stop("Invalid aacode. If you see this, report this as a bug!") } if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2)) } else { return(list(hgvs=s,subject=subject,type=type,start=pos, ancestral=aa1,end=pos2,ancestral2=aa2)) } } else if (type == "insertion") { groups <- extract.groups(body,types$insertion) aa1 <- groups[[1]] pos <- as.integer(groups[[2]]) aa2 <- groups[[3]] pos2 <- as.integer(groups[[4]]) insert <- groups[[5]] #TODO: Implement code conversion if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) } else { return(list(hgvs=s,subject=subject,type=type,start=pos, ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) } } else if (type == "delins") { groups <- extract.groups(body,types$delins) aa1 <- groups[[1]] pos <- as.integer(groups[[2]]) aa2 <- groups[[3]] pos2 <- as.integer(groups[[4]]) insert <- groups[[5]] #TODO: Implement code conversion if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) } else { return(list(hgvs=s,subject=subject,type=type,start=pos, ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) } } else if (type == "frameshift1") { groups <- extract.groups(body,types$frameshift1) aa1 <- groups[[1]] pos <- as.integer(groups[[2]]) #TODO: Implement code conversion if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type="frameshift",start=pos,ancestral=aa1)) } else { return(list(hgvs=s,subject=subject,type="frameshift",start=pos, ancestral=aa1)) } } else if (type == "frameshift2") { groups <- extract.groups(body,types$frameshift2) aa1 <- groups[[1]] pos <- as.integer(groups[[2]]) aa2 <- groups[[3]] term <- as.integer(groups[[5]]) #TODO: Implement code conversion if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type="frameshift",start=pos,ancestral=aa1,variant=aa2,end=term)) } else { return(list(hgvs=s,subject=subject,type="frameshift",start=pos, ancestral=aa1,variant=aa2,end=term)) } } else {#unmatched type if (isMulti) { return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, type="invalid")) } else { return(list(hgvs=s,subject=subject,type="invalid")) } } }) } else if (subject=="noncoding") { #FIXME: These need to be list of lists to match postprocessing return(list(list(hgvs=s,subject="not_implemented",type="not_implemented"))) } else if (subject=="mitochondrial") { return(list(list(hgvs=s,subject="not_implemented",type="not_implemented"))) } else if (subject=="rna") { return(list(list(hgvs=s,subject="not_implemented",type="not_implemented"))) } else {#unmatched subject, shouldn't happen stop("Unmatched subject! If you see this, report it as a bug!") } }) #demote multimutants to enforce simple list structure multiLengths <- sapply(out,length) ids <- do.call(c,lapply(1:length(multiLengths), function(i) if (multiLengths[[i]]==1) as.character(i) else paste0(i,".",1:multiLengths[[i]]) )) out2 <- do.call(c,out) names(out2) <- ids to.df(out2) }