pubmed_by_queries.R @ 0:34ed44f3f85c draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
| author | dlalgroup |
|---|---|
| date | Thu, 24 Sep 2020 02:17:05 +0000 |
#!/usr/bin/env Rscript
# tool: pubmed_by_queries
#
# This tool uses a set of search queries to download a defined number of abstracts or PMIDs per search query from PubMed.
# PubMed's search rules and syntax apply.
#
# Input: Tab-delimited table with search queries in a column whose name starts with "ID_", e.g. "ID_gene" if the search queries are genes.
#
# Output: Input table with additional columns containing PMIDs or abstracts (--abstract) from PubMed.
#
# Usage: $ pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY]
#
# optional arguments:
#   -h, --help                  show this help message and exit
#   -i INPUT, --input INPUT     input file name. add path if file is not in working directory
#   -o OUTPUT, --output OUTPUT  output file name [default "pubmed_by_queries_output"]
#   -n NUMBER, --number NUMBER  number of PMIDs or abstracts to save per ID [default "5"]
#   -a, --abstract              retrieve abstracts instead of PMIDs
#   -k KEY, --key KEY           if an NCBI API key is available, add it to speed up the fetching of PubMed data

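# Example call using the flags documented above (the input/output file names here are hypothetical):
# $ pubmed_by_queries.R -i genes.tsv -o genes_with_pubmed -n 10 --abstract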
if ('--install_packages' %in% commandArgs()) {
  print('Installing packages')
  if (!require('argparse')) install.packages('argparse', repos = "http://cran.rstudio.com/")
  if (!require('easyPubMed')) install.packages('easyPubMed', repos = "http://cran.rstudio.com/")
}

suppressPackageStartupMessages(library("argparse"))
suppressPackageStartupMessages(library("easyPubMed"))

parser <- ArgumentParser()
parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "pubmed_by_queries_output",
                    help = "output file name [default \"%(default)s\"]")
parser$add_argument("-n", "--number", type = "integer", default = 5,
                    help = "number of PMIDs (or abstracts) to save per ID [default \"%(default)s\"]")
parser$add_argument("-a", "--abstract", action = "store_true", default = FALSE,
                    help = "retrieve abstracts instead of PMIDs")
parser$add_argument("-k", "--key", type = "character",
                    help = "if an NCBI API key is available, add it to speed up the download of PubMed data")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "auto-install missing required packages")
args <- parser$parse_args()

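# upper bound on retries for a failing NCBI E-utilities request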
MAX_WEB_TRIES <- 100

data <- read.delim(args$input, stringsAsFactors = FALSE)

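# locate the query column (its name starts with "ID_")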
id_col_index <- grep("^ID_", names(data))

pubmed_data_in_table <- function(data, row, query, number, key, abstract) {
  if (is.null(query)) { print(data) }
  pubmed_search <- get_pubmed_ids(query, api_key = key)

  if (as.numeric(pubmed_search$Count) == 0) {
    cat("No PubMed result for the following query: ", query, "\n")
    return(data)

  } else if (abstract == FALSE) { # fetch PMIDs

    myPubmedURL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
                         "db=pubmed&retmax=", number, "&term=", pubmed_search$OriginalQuery, "&usehistory=n", sep = "")
    # get ids; retry with an increasing pause if the connection fails
    idXML <- c()
    for (i in 1:MAX_WEB_TRIES) {
      tryCatch({
        IDconnect <- suppressWarnings(url(myPubmedURL, open = "rb", encoding = "UTF8"))
        idXML <- suppressWarnings(readLines(IDconnect, warn = FALSE, encoding = "UTF8"))
        suppressWarnings(close(IDconnect))
        break
      }, error = function(e) {
        print(paste('Error getting URL, sleeping', 2 * i, 'seconds.'))
        print(e)
        Sys.sleep(time = 2 * i)
      })
    }

    PMIDs <- c()

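    # walk the XML lines and pull each PMID out of its <Id> tag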
    for (i in seq_along(idXML)) {
      if (grepl("^<Id>", idXML[i])) {
        pmid <- custom_grep(idXML[i], tag = "Id", format = "char")
        PMIDs <- c(PMIDs, as.character(pmid[1]))
      }
    }

    if (length(PMIDs) > 0) {
      data[row, sapply(1:length(PMIDs), function(i) { paste0("PMID_", i) })] <- PMIDs
      cat(length(PMIDs), " PMIDs for ", query, " added to the table.", "\n")
    }

    return(data)

  } else if (abstract == TRUE) { # fetch abstracts and title text

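    # build the efetch URL, reusing the WebEnv/QueryKey of the esearch above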
    efetch_url <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?",
                        "db=pubmed&WebEnv=", pubmed_search$WebEnv, "&query_key=", pubmed_search$QueryKey,
                        "&retstart=", 0, "&retmax=", number,
                        "&rettype=", "null", "&retmode=", "xml", sep = "")

    api_key <- pubmed_search$APIkey
    if (!is.null(api_key)) {
      efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "")
    }

    # initialize
    out.data <- NULL
    try_num <- 1
    t_0 <- Sys.time()

    # try to fetch the results, retrying until data arrive or the time limit is hit
    while (is.null(out.data)) {

      # wait before each retry
      if (try_num > 1) {
        Sys.sleep(time = 2 * try_num)
        cat("Problem receiving PubMed data, or an error was returned. Please wait. Try number:", try_num, "\n")
      }

      t_1 <- Sys.time()

      # timing check: kill the request after 3 minutes
      if (as.numeric(difftime(t_1, t_0, units = "mins")) > 3) {
        message("Killing the request! Something is not working. Please, try again later", "\n")
        return(data)
      }

      # connect to the ENTREZ server
      out.data <- tryCatch({
        tmpConnect <- suppressWarnings(url(efetch_url, open = "rb", encoding = "UTF8"))
        suppressWarnings(readLines(tmpConnect, warn = FALSE, encoding = "UTF8"))
      }, error = function(e) {
        print(e)
      }, finally = {
        try(suppressWarnings(close(tmpConnect)), silent = TRUE)
      })

      # if the server answered with an error, discard the response and retry
      if (!is.null(out.data) &&
          is.character(out.data) &&
          grepl("<ERROR>", substr(paste(utils::head(out.data, n = 100), collapse = ""), 1, 250))) {
        out.data <- NULL
      }
      try_num <- try_num + 1
    }

    if (is.null(out.data)) {
      message("Killing the request! Something is not working. Please, try again later", "\n")
      return(data)
    }

    # process xml data
    xml_data <- paste(out.data, collapse = "")

    # articles to list
    xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
    xml_data <- sapply(xml_data, function(x) {
      # trim extra stuff at the end of the record
      if (!grepl("</PubmedArticle>$", x))
        x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
      # rebuild the XML structure and proceed
      x <- paste("<PubmedArticle>", x)
      gsub("[[:space:]]{2,}", " ", x)},
      USE.NAMES = FALSE, simplify = TRUE)

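    # titles and abstracts may still carry inline markup (<i>, <b>, <sub>, <exp>), which is stripped below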
    # titles
    titles <- sapply(xml_data, function(x) {
      x <- custom_grep(x, tag = "ArticleTitle", format = "char")
      x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
      x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
      x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
      x <- gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
      if (length(x) > 1) {
        x <- paste(x, collapse = " ", sep = " ")
      } else if (length(x) < 1) {
        x <- NA
      }
      x
    },
    USE.NAMES = FALSE, simplify = TRUE)

    # abstracts
    abstract.text <- sapply(xml_data, function(x) {
      custom_grep(x, tag = "AbstractText", format = "char")},
      USE.NAMES = FALSE, simplify = TRUE)

    abstracts <- sapply(abstract.text, function(x) {
      if (length(x) > 1) {
        x <- paste(x, collapse = " ", sep = " ")
        x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
      } else if (length(x) < 1) {
        x <- NA
      } else {
        x <- gsub("</{0,1}i>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}b>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}sub>", "", x, ignore.case = TRUE)
        x <- gsub("</{0,1}exp>", "", x, ignore.case = TRUE)
      }
      x
    },
    USE.NAMES = FALSE, simplify = TRUE)

    # prepend the title to each abstract
    if (length(titles) == length(abstracts)) {
      abstracts <- paste(titles, abstracts)
    }

    # add abstracts to the data frame
    if (length(abstracts) > 0) {
      data[row, sapply(1:length(abstracts), function(i) { paste0("ABSTRACT_", i) })] <- abstracts
      cat(length(abstracts), " abstracts for ", query, " added to the table.", "\n")
    }

    return(data)
  }
}

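# query PubMed once per input row, using the value in the ID_ column as the search term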
for (i in 1:nrow(data)) {
  data <- tryCatch(pubmed_data_in_table(data = data,
                                        row = i,
                                        query = data[i, id_col_index],
                                        number = args$number,
                                        key = args$key,
                                        abstract = args$abstract),
                   error = function(e) {
                     print('main error')
                     print(e)
                     Sys.sleep(5)
                     data # return the table unchanged so a failed query does not wipe it out
                   })
}

write.table(data, args$output, append = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)
