Mercurial > repos > dlalgroup > pmids_to_pubtator_matrix

diff pubmed_by_queries.R @ 0:3f4adc85ba5d draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author: dlalgroup
date: Thu, 24 Sep 2020 02:01:50 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pubmed_by_queries.R	Thu Sep 24 02:01:50 2020 +0000
@@ -0,0 +1,236 @@
+#!/usr/bin/env Rscript
+#tool: pubmed_by_queries
+#
+#This tool uses a set of search queries to download a defined number of abstracts or PMIDs for search query from PubMed. 
+#PubMed's search rules and syntax apply.
+# 
+#Input: Tab-delimited table with search queries in a column starting with "ID_", e.g. "ID_gene" if search queries are genes. 
+#
+# Output: Input table with additional columns with PMIDs or abstracts (--abstracts) from PubMed.
+#
+#Usage: $ pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY]
+# 
+# optional arguments:
+# -h, --help                  show this help message and exit
+# -i INPUT, --input INPUT     input file name. add path if file is not in working directory
+# -o OUTPUT, --output OUTPUT  output file name. [default "pubmed_by_queries_output"]
+# -n NUMBER, --number NUMBER  number of PMIDs or abstracts to save per ID [default "5"]
+# -a, --abstract              if abstracts instead of PMIDs should be retrieved use --abstracts 
+# -k KEY, --key KEY           if NCBI API key is available, add it to speed up the fetching of pubmed data
+
+if ( '--install_packages' %in% commandArgs()) {
+  print('Installing packages')
+  if (!require('argparse')) install.packages('argparse',repo="http://cran.rstudio.com/");
+  if (!require('easyPubMed')) install.packages('easyPubMed',repo="http://cran.rstudio.com/");
+}
+
+suppressPackageStartupMessages(library("argparse"))
+suppressPackageStartupMessages(library("easyPubMed"))
+
+parser <- ArgumentParser()
+parser$add_argument("-i", "--input", 
+                    help = "input fie name. add path if file is not in working directory")
+parser$add_argument("-o", "--output", default="pubmed_by_queries_output",
+                    help = "output file name. [default \"%(default)s\"]")
+parser$add_argument("-n", "--number", type="integer", default=5, 
+                    help="Number of PMIDs (or abstracts) to save per  ID. [default \"%(default)s\"]")
+parser$add_argument("-a", "--abstract", action="store_true", default=FALSE,
+                    help="if abstracts instead of PMIDs should be retrieved use --abstracts ")
+parser$add_argument("-k", "--key", type="character", 
+                    help="if ncbi API key is available, add it to speed up the download of pubmed data")
+parser$add_argument("--install_packages", action="store_true", default=FALSE,
+                    help="If you want to auto install missing required packages.")
+args <- parser$parse_args()
+
+MAX_WEB_TRIES = 100
+
+data = read.delim(args$input, stringsAsFactors=FALSE)
+
+id_col_index <- grep("ID_", names(data))
+
+pubmed_data_in_table <- function(data, row, query, number, key, abstract){
+if (is.null(query)){print(data)}
+    pubmed_search <- get_pubmed_ids(query, api_key = key)
+
+    if(as.numeric(pubmed_search$Count) == 0){
+      cat("No PubMed result for the following query: ", query, "\n")
+      return(data)
+      
+    } else if (abstract == FALSE) { # fetch PMIDs
+          
+            myPubmedURL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?", 
+                                 "db=pubmed&retmax=", number, "&term=", pubmed_search$OriginalQuery, "&usehistory=n", sep = "")
+            # get ids
+            idXML <- c()
+            for (i in 1:MAX_WEB_TRIES){
+              tryCatch({
+                IDconnect <- suppressWarnings(url(myPubmedURL, open = "rb", encoding = "UTF8"))
+                idXML <- suppressWarnings(readLines(IDconnect, warn = FALSE, encoding = "UTF8"))
+                suppressWarnings(close(IDconnect))
+                break
+              }, error = function(e) {
+                print(paste('Error getting URL, sleeping',2*i,'seconds.'))
+                print(e)
+                Sys.sleep(time = 2*i)
+              })
+          }
+
+            PMIDs = c()
+            
+            for (i in 1:length(idXML)) {
+              if (grepl("^<Id>", idXML[i])) {
+                pmid <- custom_grep(idXML[i], tag = "Id", format = "char")
+                PMIDs <- c(PMIDs, as.character(pmid[1]))
+              }
+            }
+          
+
+            if(length(PMIDs)>0){
+              data[row,sapply(1:length(PMIDs),function(i){paste0("PMID_",i)})] <- PMIDs
+              cat(length(PMIDs)," PMIDs for ",query, " are added in the table.",  "\n")
+             }
+
+            return(data) 
+      
+    } else if (abstract == TRUE) { # fetch abstracts and title text
+          
+          efetch_url = paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?", 
+                             "db=pubmed&WebEnv=", pubmed_search$WebEnv, "&query_key=", pubmed_search$QueryKey, 
+                             "&retstart=", 0, "&retmax=", number, 
+                             "&rettype=", "null","&retmode=", "xml", sep = "")
+
+          api_key <- pubmed_search$APIkey
+          if (!is.null(api_key)) {
+            efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "")
+          }
+
+          # initialize
+          out.data <- NULL
+          try_num <- 1
+          t_0 <- Sys.time()
+          
+          # Try to fetch results
+          while(is.null(out.data)) {
+            
+              # Timing check: kill at 3 min
+              if (try_num > 1){
+                Sys.sleep(time = 2*try_num)
+                cat("Problem to receive PubMed data or error is received. Please wait. Try number:",try_num,"\n") 
+                }
+
+              t_1 <- Sys.time()
+              
+              if(as.numeric(difftime(t_1, t_0, units = "mins")) > 3){
+                message("Killing the request! Something is not working. Please, try again later","\n")
+                return(data)
+              }
+              
+              # ENTREZ server connect
+              out.data <- tryCatch({    
+                tmpConnect <- suppressWarnings(url(efetch_url, open = "rb", encoding = "UTF8"))
+                suppressWarnings(readLines(tmpConnect, warn = FALSE, encoding = "UTF8"))
+              }, error = function(e) {
+                print(e)
+              }, finally = {
+                try(suppressWarnings(close(tmpConnect)), silent = TRUE)
+              })  
+              
+              # Check if error
+              if (!is.null(out.data) && 
+                  class(out.data) == "character" &&
+                  grepl("<ERROR>", substr(paste(utils::head(out.data, n = 100), collapse = ""), 1, 250))) {
+                  out.data <- NULL
+              }
+              try_num <- try_num + 1
+          }
+          
+          if (is.null(out.data)) {
+            message("Killing the request! Something is not working. Please, try again later","\n")
+            return(data)
+          } 
+      
+          # process xml data
+          xml_data <- paste(out.data, collapse = "")
+          
+          # articles to list
+          xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
+          xml_data <- sapply(xml_data, function(x) {
+                #trim extra stuff at the end of the record
+                if (!grepl("</PubmedArticle>$", x))
+                  x <- sub("(^.*</PubmedArticle>).*$", "\\1", x) 
+                # Rebuid XML structure and proceed
+                x <- paste("<PubmedArticle>", x)
+                gsub("[[:space:]]{2,}", " ", x)}, 
+                USE.NAMES = FALSE, simplify = TRUE)
+          
+          #titles
+          titles = sapply(xml_data, function(x){
+              x = custom_grep(x, tag="ArticleTitle", format="char")
+              x <- gsub("</{0,1}i>", "", x, ignore.case = T)
+              x <- gsub("</{0,1}b>", "", x, ignore.case = T)
+              x <- gsub("</{0,1}sub>", "", x, ignore.case = T)
+              x <- gsub("</{0,1}exp>", "", x, ignore.case = T)
+              if (length(x) > 1){
+                x <- paste(x, collapse = " ", sep = " ")
+              } else if (length(x) < 1) {
+                x <- NA
+              }
+              x
+            }, 
+            USE.NAMES = FALSE, simplify = TRUE)
+
+          # abstracts
+          abstract.text = sapply(xml_data, function(x){
+            custom_grep(x, tag="AbstractText", format="char")}, 
+            USE.NAMES = FALSE, simplify = TRUE)
+          
+          abstracts <- sapply(abstract.text, function(x){
+                  if (length(x) > 1){
+                    x <- paste(x, collapse = " ", sep = " ")
+                    x <- gsub("</{0,1}i>", "", x, ignore.case = T)
+                    x <- gsub("</{0,1}b>", "", x, ignore.case = T)
+                    x <- gsub("</{0,1}sub>", "", x, ignore.case = T)
+                    x <- gsub("</{0,1}exp>", "", x, ignore.case = T)
+                  } else if (length(x) < 1) {
+                    x <- NA
+                  } else {
+                    x <- gsub("</{0,1}i>", "", x, ignore.case = T)
+                    x <- gsub("</{0,1}b>", "", x, ignore.case = T)
+                    x <- gsub("</{0,1}sub>", "", x, ignore.case = T)
+                    x <- gsub("</{0,1}exp>", "", x, ignore.case = T)
+                  }
+              x
+          }, 
+          USE.NAMES = FALSE, simplify = TRUE)
+          
+          #add title to abstracts
+          if (length(titles) == length(abstracts)){
+            abstracts = paste(titles,  abstracts)
+          }
+
+          #add abstracts to data frame
+          if(length(abstracts)>0){
+            data[row,sapply(1:length(abstracts),function(i){paste0("ABSTRACT_",i)})] <- abstracts
+            cat(length(abstracts)," abstracts for ",query, " are added in the table.",  "\n")
+          }
+          
+          return(data)
+        }
+    }
+
+for(i in 1:nrow(data)){
+    data = tryCatch(pubmed_data_in_table(data= data, 
+                           row= i,
+                           query= data[i,id_col_index],
+                           number= args$number,
+                           key= args$key,
+                           abstract= args$abstract), error=function(e){
+                             print('main error')
+                             print(e)
+                             Sys.sleep(5)
+                             })
+    }
+
+write.table(data, args$output, append = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)
+
+
author	dlalgroup
date	Thu, 24 Sep 2020 02:01:50 +0000
parents
children