pubmed_by_queries.R @ 0:34ed44f3f85c draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
| author | dlalgroup |
|---|---|
| date | Thu, 24 Sep 2020 02:17:05 +0000 |
#!/usr/bin/env Rscript
# tool: pubmed_by_queries
#
# This tool uses a set of search queries to download a defined number of abstracts or PMIDs per search query from PubMed.
# PubMed's search rules and syntax apply.
#
# Input: Tab-delimited table with search queries in a column whose name starts with "ID_", e.g. "ID_gene" if the search queries are genes.
#
# Output: Input table with additional columns containing PMIDs or abstracts (--abstract) from PubMed.
#
# Usage: $ pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY]
#
# optional arguments:
#   -h, --help                  show this help message and exit
#   -i INPUT, --input INPUT     input file name. add path if file is not in working directory
#   -o OUTPUT, --output OUTPUT  output file name [default "pubmed_by_queries_output"]
#   -n NUMBER, --number NUMBER  number of PMIDs or abstracts to save per ID [default "5"]
#   -a, --abstract              retrieve abstracts instead of PMIDs
#   -k KEY, --key KEY           if an NCBI API key is available, add it to speed up the fetching of PubMed data

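# Example call using the flags documented above (the input/output file names here are hypothetical):
# $ pubmed_by_queries.R -i genes.tsv -o genes_with_pubmed -n 10 --abstract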
if ('--install_packages' %in% commandArgs()) {
  print('Installing packages')
  if (!require('argparse')) install.packages('argparse', repos = "http://cran.rstudio.com/")
  if (!require('easyPubMed')) install.packages('easyPubMed', repos = "http://cran.rstudio.com/")
}

suppressPackageStartupMessages(library("argparse"))
suppressPackageStartupMessages(library("easyPubMed"))

parser <- ArgumentParser()
parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "pubmed_by_queries_output",
                    help = "output file name [default \"%(default)s\"]")
parser$add_argument("-n", "--number", type = "integer", default = 5,
                    help = "number of PMIDs (or abstracts) to save per ID [default \"%(default)s\"]")
parser$add_argument("-a", "--abstract", action = "store_true", default = FALSE,
                    help = "retrieve abstracts instead of PMIDs")
parser$add_argument("-k", "--key", type = "character",
                    help = "if an NCBI API key is available, add it to speed up the download of PubMed data")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "auto-install missing required packages")
args <- parser$parse_args()

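# upper bound on retries for a failing NCBI E-utilities request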
MAX_WEB_TRIES <- 100

data <- read.delim(args$input, stringsAsFactors = FALSE)

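# locate the query column (its name starts with "ID_")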
id_col_index <- grep("^ID_", names(data))

pubmed_data_in_table <- function(data, row, query, number, key, abstract) {
  if (is.null(query)) { print(data) }
  pubmed_search <- get_pubmed_ids(query, api_key = key)

  if (as.numeric(pubmed_search$Count) == 0) {
    cat("No PubMed result for the following query: ", query, "\n")
    return(data)

  } else if (abstract == FALSE) { # fetch PMIDs

    myPubmedURL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
                         "db=pubmed&retmax=", number, "&term=", pubmed_search$OriginalQuery, "&usehistory=n", sep = "")
    # get ids; retry with an increasing pause if the connection fails
    idXML <- c()
    for (i in 1:MAX_WEB_TRIES) {
      tryCatch({
        IDconnect <- suppressWarnings(url(myPubmedURL, open = "rb", encoding = "UTF8"))
        idXML <- suppressWarnings(readLines(IDconnect, warn = FALSE, encoding = "UTF8"))
        suppressWarnings(close(IDconnect))
        break
      }, error = function(e) {
        print(paste('Error getting URL, sleeping', 2 * i, 'seconds.'))
        print(e)
        Sys.sleep(time = 2 * i)
      })
    }

    PMIDs <- c()

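    # walk the XML lines and pull each PMID out of its <Id> tag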
    for (i in seq_along(idXML)) {
      if (grepl("^<Id>", idXML[i])) {
        pmid <- custom_grep(idXML[i], tag = "Id", format = "char")
        PMIDs <- c(PMIDs, as.character(pmid[1]))
      }
    }

    if (length(PMIDs) > 0) {
      data[row, sapply(1:length(PMIDs), function(i) { paste0("PMID_", i) })] <- PMIDs
      cat(length(PMIDs), " PMIDs for ", query, " added to the table.", "\n")
    }

    return(data)

  } else if (abstract == TRUE) { # fetch abstracts and title text

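    # build the efetch URL, reusing the WebEnv/QueryKey of the esearch above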
    efetch_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?",
                        "db=pubmed&WebEnv=", pubmed_search$WebEnv, "&query_key=", pubmed_search$QueryKey,
                        "&retstart=", 0, "&retmax=", number,
                        "&rettype=", "null", "&retmode=", "xml", sep = "")

    api_key <- pubmed_search$APIkey
    if (!is.null(api_key)) {
      efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "")
    }

    # initialize
    out.data <- NULL
    try_num <- 1
    t_0 <- Sys.time()

    # try to fetch the results, retrying until data arrive or the time limit is hit
    while (is.null(out.data)) {

      # wait before each retry
      if (try_num > 1) {
        Sys.sleep(time = 2 * try_num)
        cat("Problem receiving PubMed data, or an error was returned. Please wait. Try number:", try_num, "\n")
      }

      t_1 <- Sys.time()

      # timing check: kill the request after 3 minutes
      if (as.numeric(difftime(t_1, t_0, units = "mins")) > 3) {
        message("Killing the request! Something is not working. Please, try again later", "\n")
        return(data)
      }

      # connect to the ENTREZ server
      out.data <- tryCatch({
        tmpConnect <- suppressWarnings(url(efetch_url, open = "rb", encoding = "UTF8"))
        suppressWarnings(readLines(tmpConnect, warn = FALSE, encoding = "UTF8"))
      }, error = function(e) {
        print(e)
      }, finally = {
        try(suppressWarnings(close(tmpConnect)), silent = TRUE)
      })

      # if the server answered with an error, discard the response and retry
      if (!is.null(out.data) &&
          is.character(out.data) &&
          grepl("<ERROR>", substr(paste(utils::head(out.data, n = 100), collapse = ""), 1, 250))) {
        out.data <- NULL
      }
      try_num <- try_num + 1
    }

    if (is.null(out.data)) {
      message("Killing the request! Something is not working. Please, try again later", "\n")
      return(data)
    }

    # process xml data
    xml_data <- paste(out.data, collapse = "")

    # articles to list
    xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
    xml_data <- sapply(xml_data, function(x) {
      # trim extra stuff at the end of the record
      if (!grepl("</PubmedArticle>$", x))
        x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
      # rebuild the XML structure and proceed
      x <- paste("<PubmedArticle>", x)
      gsub("[[:space:]]{2,}", " ", x)},
      USE.NAMES = FALSE, simplify = TRUE)

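    # titles and abstracts may still carry inline markup (<i>, <b>, <sub>, <exp>), which is stripped below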
    # titles
    titles <- sapply(xml_data, function(x) {
      x <- custom_grep(x, tag = "ArticleTitle", format = "char")
      x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
      x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
      x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
      x <- gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
      if (length(x) > 1) {
        x <- paste(x, collapse = " ", sep = " ")
      } else if (length(x) < 1) {
        x <- NA
      }
      x
    },
    USE.NAMES = FALSE, simplify = TRUE)

    # abstracts
    abstract.text <- sapply(xml_data, function(x) {
      custom_grep(x, tag = "AbstractText", format = "char")},
      USE.NAMES = FALSE, simplify = TRUE)

    abstracts <- sapply(abstract.text, function(x) {
      if (length(x) > 1) {
        x <- paste(x, collapse = " ", sep = " ")
        x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
      } else if (length(x) < 1) {
        x <- NA
      } else {
        x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
      }
      x
    },
    USE.NAMES = FALSE, simplify = TRUE)

    # prepend the title to each abstract
    if (length(titles) == length(abstracts)) {
      abstracts <- paste(titles, abstracts)
    }

    # add abstracts to the data frame
    if (length(abstracts) > 0) {
      data[row, sapply(1:length(abstracts), function(i) { paste0("ABSTRACT_", i) })] <- abstracts
      cat(length(abstracts), " abstracts for ", query, " added to the table.", "\n")
    }

    return(data)
  }
}

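# query PubMed once per input row, using the value in the ID_ column as the search term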
for (i in 1:nrow(data)) {
  data <- tryCatch(pubmed_data_in_table(data = data,
                                        row = i,
                                        query = data[i, id_col_index],
                                        number = args$number,
                                        key = args$key,
                                        abstract = args$abstract),
                   error = function(e) {
                     print('main error')
                     print(e)
                     Sys.sleep(5)
                     data # return the table unchanged so a failed query does not wipe it out
                   })
}

write.table(data, args$output, append = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)
