simtext_app: abstracts_by_pmids.R comparison

comparison abstracts_by_pmids.R @ 0:34ed44f3f85c draft

"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"

author	dlalgroup
date	Thu, 24 Sep 2020 02:17:05 +0000
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:34ed44f3f85c
+#!/usr/bin/env Rscript
+#TOOL2 abstracts_by_pmids
+#
+#This tool retrieves the corresponding abstracts for all PMIDs in each row of a table and saves them in additional columns.
+#
+#Input: Tab-delimited table with columns containing PMIDs. The names of the PMID columns should start with “PMID”, e.g. “PMID_1”, “PMID_2” etc.
+#
+#Output: Input table with additional columns containing abstracts corresponding to the PMIDs from PubMed.
+#The abstract columns are called "ABSTRACT_1", "ABSTRACT_2" etc.
+#
+# Usage: $ abstracts_by_pmids.R [-h] [-i INPUT] [-o OUTPUT] [--install_packages]
+#
+# optional arguments:
+# -h, --help                 show help message
+# -i INPUT, --input INPUT    input file name. add path if file is not in working directory
+# -o OUTPUT, --output OUTPUT output file name. [default "abstracts_by_pmids_output"]
+# --install_packages         install missing required packages before running
+# ---- Script setup ----
+# Optional bootstrap: when the script is called with --install_packages, every
+# required package that cannot be loaded is installed from CRAN. The flag is
+# looked up in the raw command line because argparse itself may not be
+# installed yet at this point.
+if ("--install_packages" %in% commandArgs()) {
+  print('Installing packages')
+  for (pkg in c("argparse", "reutils", "easyPubMed", "textclean")) {
+    if (!require(pkg, character.only = TRUE)) {
+      # `repos` is spelled out in full instead of relying on partial matching.
+      install.packages(pkg, repos = "http://cran.rstudio.com/")
+    }
+  }
+}
+suppressPackageStartupMessages(library("argparse"))
+library("reutils")
+suppressPackageStartupMessages(library("easyPubMed"))
+suppressPackageStartupMessages(library("textclean"))
+# Command line interface: -i/--input, -o/--output and --install_packages.
+parser <- ArgumentParser()
+parser$add_argument("-i", "--input",
+                    help = "input file name. add path if file is not in working directory")
+parser$add_argument("-o", "--output", default = "abstracts_by_pmids_output",
+                    help = "output file name. [default \"%(default)s\"]")
+parser$add_argument("--install_packages", action = "store_true", default = FALSE,
+                    help = "If you want to auto install missing required packages.")
+args <- parser$parse_args()
+# Fail early with a readable message instead of letting read.delim() choke on
+# a NULL file name when -i/--input was not supplied.
+if (is.null(args$input)) {
+  stop("No input file given. Use -i/--input to pass a tab-delimited table.",
+       call. = FALSE)
+}
+# Input table: tab-delimited with a header row; strings are kept as character.
+data <- read.delim(args$input, stringsAsFactors = FALSE, header = TRUE, sep = "\t")
+# Indices of all columns whose name contains "PMID".
+pmids_cols_index <- grep("PMID", names(data))
+# Fetch the PubMed abstracts for one table row and store them in that row.
+#
+# Args:
+#   PMIDs: character vector of PubMed IDs, passed to efetch() as `uid`.
+#   row:   index of the row of `data` the abstracts belong to.
+#
+# Returns:
+#   A copy of the script-level table `data` in which the columns
+#   ABSTRACT_1 ... ABSTRACT_k of `row` hold the k abstracts that were parsed
+#   from the efetch response (NA where an article has no AbstractText). If no
+#   usable response arrives within 3 minutes, or no article is found, `data`
+#   is returned unchanged.
+fetch_abstracts <- function(PMIDs, row) {
+  efetch_result <- NULL
+  try_num <- 1
+  t_0 <- Sys.time()
+  # Retry until a usable response is available; the wait grows with each try.
+  while (is.null(efetch_result)) {
+    if (try_num > 1) {
+      Sys.sleep(time = 1 * try_num)
+      cat("Problem to receive PubMed data or error is received. Please wait. Try number: ", try_num, "\n")
+    }
+    # Timing check: give up after 3 minutes and hand back the table untouched.
+    if (as.numeric(difftime(Sys.time(), t_0, units = "mins")) > 3) {
+      message("Killing the request! Something is not working. Please, try again later", "\n")
+      return(data)
+    }
+    efetch_result <- tryCatch(
+      suppressWarnings(efetch(uid = PMIDs, db = "pubmed", retmode = "xml")),
+      error = function(e) NULL
+    )
+    # A "400 Bad Request" answer is treated like a failed call and retried.
+    # isTRUE() keeps the check scalar even if the error field is empty or NA.
+    fetch_error <- as.list(efetch_result$errors)$error
+    if (isTRUE(fetch_error == "HTTP error: Status 400; Bad Request")) {
+      efetch_result <- NULL
+    }
+    try_num <- try_num + 1
+  }
+  # Split the response into one string per <PubmedArticle> record; the first
+  # piece is whatever precedes the first record and is dropped.
+  articles <- strsplit(efetch_result$content, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
+  articles <- vapply(articles, function(x) {
+    # Trim anything that follows the closing tag of the record.
+    if (!grepl("</PubmedArticle>$", x)) {
+      x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
+    }
+    # Rebuild the XML structure and squeeze runs of whitespace.
+    x <- paste("<PubmedArticle>", x)
+    gsub("[[:space:]]{2,}", " ", x)
+  }, character(1), USE.NAMES = FALSE)
+  # lapply() (not sapply()) on purpose: when every article has the same number
+  # (> 1) of AbstractText sections, sapply() would simplify the result to a
+  # matrix and the sections would later be counted as separate abstracts.
+  abstract_parts <- lapply(articles, function(x) {
+    custom_grep(x, tag = "AbstractText", format = "char")
+  })
+  # Inline formatting tags to remove. "exp" is kept from the previous version;
+  # "sup" is added because superscripts are presumably tagged <sup> in PubMed
+  # XML - NOTE(review): confirm against the PubMed element descriptions.
+  markup_tags <- c("i", "b", "sub", "exp", "sup")
+  abstracts <- vapply(abstract_parts, function(parts) {
+    if (length(parts) < 1) {
+      return(NA_character_)
+    }
+    # Structured abstracts come as several sections; join them with a space.
+    if (length(parts) > 1) {
+      parts <- paste(parts, collapse = " ")
+    }
+    for (tag in markup_tags) {
+      parts <- gsub(paste0("</{0,1}", tag, ">"), "", parts, ignore.case = TRUE)
+    }
+    parts
+  }, character(1), USE.NAMES = FALSE)
+  if (length(abstracts) > 0) {
+    # Assigning to not-yet-existing ABSTRACT_i columns creates them.
+    data[row, paste0("ABSTRACT_", seq_along(abstracts))] <- abstracts
+    cat(length(abstracts), " abstracts for PMIDs of row ", row, " are added in the table.", "\n")
+  }
+  data
+}
+# ---- Main loop ----
+# For every row: collect its PMIDs, fetch the abstracts and store them in the
+# table. seq_len() keeps the loop from running on an empty table.
+for (row in seq_len(nrow(data))) {
+  # Convert each PMID cell of the row to character on its own, then dedupe.
+  # (unique() on the one-row data.frame would compare rows, not PMIDs.)
+  PMIDs <- unique(vapply(data[row, pmids_cols_index, drop = FALSE],
+                         as.character, character(1), USE.NAMES = FALSE))
+  PMIDs <- PMIDs[!is.na(PMIDs) & PMIDs != "NA"]
+  if (length(PMIDs) > 0) {
+    data <- tryCatch(
+      fetch_abstracts(PMIDs, row),
+      error = function(e) {
+        message("Could not fetch abstracts for row ", row, ": ", conditionMessage(e))
+        Sys.sleep(3)
+        # Return the table as it was; the handler's value replaces `data`, so
+        # returning nothing here would wipe out all rows collected so far.
+        data
+      }
+    )
+  } else {
+    print(paste("No PMIDs in row", row))
+  }
+}
+# Write the input table plus the ABSTRACT_* columns as a tab-delimited file.
+write.table(data, args$output, sep = '\t', row.names = FALSE, col.names = TRUE)

Mercurial > repos > dlalgroup > simtext_app

comparison abstracts_by_pmids.R @ 0:34ed44f3f85c draft