Mercurial > repos > dlalgroup > simtext_app
comparison abstracts_by_pmids.R @ 0:34ed44f3f85c draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
| author | dlalgroup |
|---|---|
| date | Thu, 24 Sep 2020 02:17:05 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:34ed44f3f85c |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 #TOOL2 abstracts_by_pmids | |
| 3 # | |
| 4 #This tool retrieves for all PMIDs in each row of a table the according abstracts and saves them in additional columns. | |
| 5 # | |
| 6 #Input: Tab-delimited table with columns containing PMIDs. The names of the PMID columns should start with “PMID”, e.g. “PMID_1”, “PMID_2” etc. | |
| 7 # | |
| 8 #Output: Input table with additional columns containing abstracts corresponding to the PMIDs from PubMed. | |
| 9 #The abstract columns are called "ABSTRACT_1", "ABSTRACT_2" etc. | |
| 10 # | |
| 11 # Usage: $ abstracts_by_pmids.R [-h] [-i INPUT] [-o OUTPUT] [--install_packages] | |
| 12 # | |
| 13 # optional arguments: | |
| 14 # -h, --help show help message | |
| 15 # -i INPUT, --input INPUT input file name. add path if file is not in working directory | |
| 16 # -o OUTPUT, --output OUTPUT output file name. [default "abstracts_by_pmids_output"] | |
| 17 | |
| 18 | |
# Optional bootstrap: when the script is started with --install_packages,
# install every required package that is not yet available.
# requireNamespace() is used as the "is it installed?" probe because it neither
# attaches the package nor emits a warning when it is missing; the packages are
# attached afterwards by the library() calls that follow this block.
if ("--install_packages" %in% commandArgs()) {
  print("Installing packages")
  required_packages <- c("argparse", "reutils", "easyPubMed", "textclean")
  for (pkg in required_packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      # Spell out `repos` (the original relied on partial matching of `repo`)
      # and download over HTTPS rather than plain HTTP.
      install.packages(pkg, repos = "https://cran.rstudio.com/")
    }
  }
}
| 26 | |
| 27 suppressPackageStartupMessages(library("argparse")) | |
| 28 library("reutils") | |
| 29 suppressPackageStartupMessages(library("easyPubMed")) | |
| 30 suppressPackageStartupMessages(library("textclean")) | |
| 31 | |
# Command-line interface ----
# -i/--input           tab-delimited table containing the PMID columns
# -o/--output          name of the table written at the end of the script
# --install_packages   handled before the libraries are loaded; declared here
#                      so that argparse accepts the flag
parser <- ArgumentParser()
parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "abstracts_by_pmids_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "If you want to auto install missing required packages.")

args <- parser$parse_args()

# Fail early with a readable message instead of an opaque read.delim() error.
if (is.null(args$input)) {
  stop("No input file given. Use -i/--input to specify the table with the PMID columns.",
       call. = FALSE)
}

# Input table; every column whose name contains "PMID" is treated as a PMID column.
data <- read.delim(args$input, stringsAsFactors = FALSE, header = TRUE, sep = "\t")
pmids_cols_index <- grep("PMID", names(data))

if (length(pmids_cols_index) == 0) {
  warning("No column name containing \"PMID\" was found in the input table; ",
          "no abstracts will be added.", call. = FALSE)
}
| 44 | |
# Fetch the PubMed abstracts for the PMIDs of one table row.
#
# The PubMed records are requested with efetch() (package reutils). Failed
# requests and "400 Bad Request" answers are retried with a growing pause until
# `max_minutes` have elapsed. The returned XML is split into one chunk per
# <PubmedArticle>, the AbstractText elements of each article are extracted with
# custom_grep() (package easyPubMed), joined into one string per article and
# written to the columns "ABSTRACT_1", "ABSTRACT_2", ... of the given row.
#
# NOTE: the function reads the table `data` from the enclosing (global)
# environment and returns a modified copy of it; the caller has to assign the
# result back.
#
# Args:
#   PMIDs:       character vector of PubMed IDs to look up.
#   row:         index of the row of `data` the abstracts belong to.
#   max_minutes: time budget in minutes for the retry loop (default 3).
#
# Returns:
#   The table `data`, with the abstract columns of `row` filled in when at
#   least one article was returned; otherwise `data` unchanged.
fetch_abstracts <- function(PMIDs, row, max_minutes = 3) {

  bad_request <- "HTTP error: Status 400; Bad Request"
  efetch_result <- NULL
  try_num <- 1
  t_0 <- Sys.time()

  while (is.null(efetch_result)) {

    # Back off a little longer before every further attempt.
    if (try_num > 1) {
      Sys.sleep(time = 1 * try_num)
      cat("Problem to receive PubMed data or error is received. Please wait. Try number: ", try_num, "\n")
    }

    # Timing check: give up once the time budget is used up.
    if (as.numeric(difftime(Sys.time(), t_0, units = "mins")) > max_minutes) {
      message("Killing the request! Something is not working. Please, try again later", "\n")
      return(data)
    }

    efetch_result <- tryCatch(
      suppressWarnings(efetch(uid = PMIDs, db = "pubmed", retmode = "xml")),
      error = function(e) NULL
    )

    # A "Bad Request" answer is treated like a failed request and retried.
    # isTRUE() keeps the check safe when the error field is NULL, NA or empty.
    if (!is.null(efetch_result)) {
      request_error <- as.list(efetch_result$errors)$error
      if (isTRUE(request_error[1] == bad_request)) {
        efetch_result <- NULL
      }
    }

    try_num <- try_num + 1
  }

  xml_content <- efetch_result$content
  if (!is.character(xml_content) || length(xml_content) < 1) {
    message("No readable PubMed response for the PMIDs of row ", row, ".")
    return(data)
  }

  # Split the response into one chunk per article; the first piece is the
  # XML preamble in front of the first <PubmedArticle> and is dropped.
  xml_data <- strsplit(xml_content, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
  xml_data <- vapply(xml_data, function(x) {
    # Trim anything that follows the closing tag of the record.
    if (!grepl("</PubmedArticle>$", x)) {
      x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
    }
    # Rebuild the XML structure and collapse runs of whitespace.
    x <- paste("<PubmedArticle>", x)
    gsub("[[:space:]]{2,}", " ", x)
  }, character(1), USE.NAMES = FALSE)

  # lapply() always yields one list element per article. sapply() would return
  # a matrix when all articles have the same number (> 1) of AbstractText
  # sections, which made every section count as a separate abstract.
  abstract_text <- lapply(xml_data, function(x) {
    custom_grep(x, tag = "AbstractText", format = "char")
  })

  abstracts <- vapply(abstract_text, function(x) {
    if (length(x) < 1) {
      return(NA_character_)  # article without an abstract
    }
    # Join the sections of a structured abstract and strip inline markup.
    # "exp" is kept for backward compatibility; "sup" is the real HTML tag.
    x <- paste(x, collapse = " ")
    gsub("</?(i|b|sub|sup|exp)>", "", x, ignore.case = TRUE)
  }, character(1), USE.NAMES = FALSE)

  if (length(abstracts) > 0) {
    data[row, paste0("ABSTRACT_", seq_along(abstracts))] <- abstracts
    cat(length(abstracts), " abstracts for PMIDs of row ", row, " are added in the table.", "\n")
  }

  data
}
| 125 | |
| 126 | |
# Main loop: look up the abstracts row by row and write the extended table.
for (row in seq_len(nrow(data))) {
  # Flatten the PMID cells of this row into a plain character vector first,
  # so that unique() removes duplicates across the columns (on a one-row data
  # frame it would only compare whole rows).
  PMIDs <- unique(as.character(
    unlist(data[row, pmids_cols_index, drop = FALSE], use.names = FALSE)
  ))
  # Drop missing entries: real NA values, the string "NA" and empty cells.
  PMIDs <- PMIDs[!is.na(PMIDs) & PMIDs != "NA" & nzchar(trimws(PMIDs))]

  if (length(PMIDs) > 0) {
    updated <- tryCatch(
      fetch_abstracts(PMIDs, row),
      error = function(e) {
        message("Could not retrieve abstracts for row ", row, ": ", conditionMessage(e))
        Sys.sleep(3)
        NULL
      }
    )
    # Only take over a valid result. Assigning the handler's return value
    # directly would replace the whole table with NULL after one failed row.
    if (is.data.frame(updated)) {
      data <- updated
    }
  } else {
    print(paste("No PMIDs in row", row))
  }
}

write.table(data, args$output, sep = "\t", row.names = FALSE, col.names = TRUE)
| 142 |
