Mercurial > repos > iuc > text_to_wordmatrix

diff abstracts_by_pmids.R @ 0:0692d11af909 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
author: iuc
date: Wed, 24 Mar 2021 08:33:25 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/abstracts_by_pmids.R	Wed Mar 24 08:33:25 2021 +0000
@@ -0,0 +1,142 @@
+#!/usr/bin/env Rscript
+#TOOL2 abstracts_by_pmids
+#
+#This tool retrieves for all PMIDs in each row of a table the according abstracts and saves them in additional columns.
+#
+#Input: Tab-delimited table with columns containing PMIDs. The names of the PMID columns should start with “PMID”, e.g. “PMID_1”, “PMID_2” etc.
+#
+#Output: Input table with additional columns containing abstracts corresponding to the PMIDs from PubMed.
+#The abstract columns are called "ABSTRACT_1", "ABSTARCT_2" etc.
+#
+# Usage: $ T2_abstracts_by_pmid.R [-h] [-i INPUT] [-o OUTPUT]
+#
+# optional arguments:
+# -h, --help                 show help message
+# -i INPUT, --input INPUT    input file name. add path if file is not in working directory
+# -o OUTPUT, --output OUTPUT output file name. [default "T2_output"]
+
+
+if ("--install_packages" %in% commandArgs()) {
+  print("Installing packages")
+  if (!require("argparse")) install.packages("argparse", repo = "http://cran.rstudio.com/");
+  if (!require("reutils")) install.packages("reutils", repo = "http://cran.rstudio.com/");
+  if (!require("easyPubMed")) install.packages("easyPubMed", repo = "http://cran.rstudio.com/");
+  if (!require("textclean")) install.packages("textclean", repo = "http://cran.rstudio.com/");
+}
+
+suppressPackageStartupMessages(library("argparse"))
+library("reutils")
+suppressPackageStartupMessages(library("easyPubMed"))
+suppressPackageStartupMessages(library("textclean"))
+
+parser <- ArgumentParser()
+parser$add_argument("-i", "--input",
+                    help = "input fie name. add path if file is not in workind directory")
+parser$add_argument("-o", "--output", default = "abstracts_by_pmids_output",
+                    help = "output file name. [default \"%(default)s\"]")
+parser$add_argument("--install_packages", action = "store_true", default = FALSE,
+                    help = "If you want to auto install missing required packages.")
+
+args <- parser$parse_args()
+
+data <- read.delim(args$input, stringsAsFactors = FALSE, header = TRUE, sep = "\t")
+pmids_cols_index <- grep("PMID", names(data))
+
+fetch_abstracts <- function(pmids, row) {
+
+  efetch_result <- NULL
+  try_num <- 1
+  t_0 <- Sys.time()
+
+  while (is.null(efetch_result)) {
+
+    # Timing check: kill at 3 min
+    if (try_num > 1) {
+      Sys.sleep(time = 1 * try_num)
+      cat("Problem to receive PubMed data or error is received. Please wait. Try number: ", try_num, "\n")
+    }
+
+    t_1 <- Sys.time()
+
+    if (as.numeric(difftime(t_1, t_0, units = "mins")) > 3) {
+      message("Killing the request! Something is not working. Please, try again later", "\n")
+      return(data)
+    }
+
+    efetch_result <- tryCatch({
+      suppressWarnings(efetch(uid = pmids, db = "pubmed", retmode = "xml"))
+    }, error = function(e) {
+      NULL
+    })
+
+    if (!is.null(as.list(efetch_result$errors)$error)) {
+      if (as.list(efetch_result$errors)$error == "HTTP error: Status 400; Bad Request") {
+        efetch_result <- NULL
+      }
+    }
+
+    try_num <- try_num + 1
+
+  } #while loop end
+
+  # articles to list
+  xml_data <- strsplit(efetch_result$content, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
+  xml_data <- sapply(xml_data, function(x) {
+    #trim extra stuff at the end of the record
+    if (!grepl("</PubmedArticle>$", x))
+      x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
+    # Rebuid XML structure and proceed
+    x <- paste("<PubmedArticle>", x)
+    gsub("[[:space:]]{2,}", " ", x)},
+    USE.NAMES = FALSE, simplify = TRUE)
+
+  abstract_text <- sapply(xml_data, function(x) {
+    custom_grep(x, tag = "AbstractText", format = "char")},
+    USE.NAMES = FALSE, simplify = TRUE)
+
+  abstracts <- sapply(abstract_text, function(x) {
+    if (length(x) > 1) {
+      x <- paste(x, collapse = " ", sep = " ")
+      x <- gsub("</{0,1}i>", "", x, ignore.case = T)
+      x <- gsub("</{0,1}b>", "", x, ignore.case = T)
+      x <- gsub("</{0,1}sub>", "", x, ignore.case = T)
+      x <- gsub("</{0,1}exp>", "", x, ignore.case = T)
+    } else if (length(x) < 1) {
+      x <- NA
+    } else {
+      x <- gsub("</{0,1}i>", "", x, ignore.case = T)
+      x <- gsub("</{0,1}b>", "", x, ignore.case = T)
+      x <- gsub("</{0,1}sub>", "", x, ignore.case = T)
+      x <- gsub("</{0,1}exp>", "", x, ignore.case = T)
+    }
+    x
+  },
+  USE.NAMES = FALSE, simplify = TRUE)
+
+  abstracts <- as.character(abstracts)
+
+  if (length(abstracts) > 0) {
+    data[row, sapply(seq(length(abstracts)), function(i) {
+      paste0("ABSTRACT_", i)
+      })] <- abstracts
+    cat(length(abstracts), " abstracts for PMIDs of row ", row, " are added in the table.", "\n")
+  }
+
+  return(data)
+}
+    
+
+for (row in seq(nrow(data))) {
+  pmids <-  as.character(unique(data[row, pmids_cols_index]))
+  pmids <- pmids[!pmids == "NA"]
+
+  if (length(pmids) > 0) {
+    data <- tryCatch(fetch_abstracts(pmids, row),
+                    error = function(e) {
+                      Sys.sleep(3)
+                      })
+  } else {
+    print(paste("No PMIDs in row", row))
+  }
+}
+write.table(data, args$output, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)