Mercurial > repos > dlalgroup > simtext_app
annotate abstracts_by_pmids.R @ 2:d7b190591e63 draft default tip
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
| author | dlalgroup | 
|---|---|
| date | Thu, 24 Sep 2020 05:44:58 +0000 | 
| parents | 34ed44f3f85c | 
| children | 
| rev | line source | 
|---|---|
| 0 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 1 #!/usr/bin/env Rscript | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 2 #TOOL2 abstracts_by_pmids | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 3 # | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 4 #This tool retrieves for all PMIDs in each row of a table the according abstracts and saves them in additional columns. | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 5 # | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 6 #Input: Tab-delimited table with columns containing PMIDs. The names of the PMID columns should start with “PMID”, e.g. “PMID_1”, “PMID_2” etc. | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 7 # | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 8 #Output: Input table with additional columns containing abstracts corresponding to the PMIDs from PubMed. | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 9 #The abstract columns are called "ABSTRACT_1", "ABSTRACT_2" etc. | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 10 # | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 11 # Usage: $ abstracts_by_pmids.R [-h] [-i INPUT] [-o OUTPUT] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 12 # | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 13 # optional arguments: | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 14 # -h, --help show help message | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 15 # -i INPUT, --input INPUT input file name. add path if file is not in working directory | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 16 # -o OUTPUT, --output OUTPUT output file name. [default "abstracts_by_pmids_output"] | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 17 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 18 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
# Optional bootstrap: when the script is started with --install_packages,
# install every required package that is not yet available.
# The flag is read from commandArgs() directly because argparse itself may be
# one of the packages that still has to be installed at this point.
if ("--install_packages" %in% commandArgs()) {
  print("Installing packages")
  required_packages <- c("argparse", "reutils", "easyPubMed", "textclean")
  for (pkg in required_packages) {
    # requireNamespace() only checks availability; the packages are attached
    # by the library() calls that follow this block.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      # Spell out `repos` (the original `repo` relied on partial argument
      # matching) and download over HTTPS.
      install.packages(pkg, repos = "https://cran.rstudio.com/")
    }
  }
}
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 26 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 27 suppressPackageStartupMessages(library("argparse")) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 28 library("reutils") | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 29 suppressPackageStartupMessages(library("easyPubMed")) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 30 suppressPackageStartupMessages(library("textclean")) | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 31 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
# Command-line interface: -i/--input (tab-delimited table), -o/--output
# (result file) and --install_packages (handled earlier via commandArgs();
# declared here so that argparse accepts the flag).
parser <- ArgumentParser()
parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "abstracts_by_pmids_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "If you want to auto install missing required packages.")

args <- parser$parse_args()

# Fail with a clear message instead of the cryptic read.delim() error that a
# missing --input would otherwise produce.
if (is.null(args$input)) {
  stop("No input file given. Use -i/--input to specify the tab-delimited table.",
       call. = FALSE)
}

# Read the input table and locate every column whose name contains "PMID".
data <- read.delim(args$input, stringsAsFactors = FALSE, header = TRUE, sep = "\t")
pmids_cols_index <- grep("PMID", names(data))
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 44 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
# Join the <AbstractText> sections of one article into a single string and
# strip inline formatting tags.
#
# sections: character vector with the AbstractText contents of one article
#           (NULL or length 0 when the article has no abstract).
# Returns a single string, or NA_character_ when there is no abstract.
collapse_abstract_sections <- function(sections) {
  if (length(sections) < 1) {
    return(NA_character_)
  }
  text <- sections
  if (length(text) > 1) {
    text <- paste(text, collapse = " ")
  }
  # Tag list is unchanged from the previous implementation.
  # NOTE(review): "exp" may have been meant as "sup" -- confirm before changing.
  for (tag_pattern in c("</{0,1}i>", "</{0,1}b>", "</{0,1}sub>", "</{0,1}exp>")) {
    text <- gsub(tag_pattern, "", text, ignore.case = TRUE)
  }
  text
}

# Fetch the PubMed records for a set of PMIDs and store their abstracts in the
# columns ABSTRACT_1, ABSTRACT_2, ... of the given row of the global table.
#
# PMIDs: character vector of PubMed IDs passed to efetch() as `uid`.
# row:   index of the row of `data` that receives the abstracts.
#
# Reads the global `data` and returns a modified copy; the caller is expected
# to assign the result back. If no usable response is obtained within
# 3 minutes, `data` is returned unchanged.
fetch_abstracts <- function(PMIDs, row) {

  efetch_result <- NULL
  try_num <- 1
  t_0 <- Sys.time()

  while (is.null(efetch_result)) {

    # Back off a little longer on every retry.
    if (try_num > 1) {
      Sys.sleep(time = 1 * try_num)
      cat("Problem to receive PubMed data or error is received. Please wait. Try number: ", try_num, "\n")
    }

    # Timing check: give up after 3 minutes.
    t_1 <- Sys.time()
    if (as.numeric(difftime(t_1, t_0, units = "mins")) > 3) {
      message("Killing the request! Something is not working. Please, try again later", "\n")
      return(data)
    }

    # A failing request yields NULL, which keeps the loop going.
    efetch_result <- tryCatch(
      suppressWarnings(efetch(uid = PMIDs, db = "pubmed", retmode = "xml")),
      error = function(e) NULL
    )

    # An HTTP 400 response is treated like a failed request and retried.
    fetch_error <- as.list(efetch_result$errors)$error
    if (!is.null(fetch_error) &&
        isTRUE(fetch_error == "HTTP error: Status 400; Bad Request")) {
      efetch_result <- NULL
    }

    try_num <- try_num + 1
  }

  # Split the response into one string per <PubmedArticle> record.
  xml_data <- strsplit(efetch_result$content, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1]
  xml_data <- vapply(xml_data, function(x) {
    # Trim anything that follows the closing tag of the record.
    if (!grepl("</PubmedArticle>$", x)) {
      x <- sub("(^.*</PubmedArticle>).*$", "\\1", x)
    }
    # Rebuild the XML structure and squeeze runs of whitespace.
    x <- paste("<PubmedArticle>", x)
    gsub("[[:space:]]{2,}", " ", x)
  }, character(1), USE.NAMES = FALSE)

  # lapply() (not sapply()) keeps exactly one list element per article.
  # sapply() simplified the result to a matrix whenever all articles had the
  # same number (> 1) of AbstractText sections, so every section ended up in
  # its own ABSTRACT_n column instead of being collapsed per article.
  abstract_sections <- lapply(xml_data, function(x) {
    custom_grep(x, tag = "AbstractText", format = "char")
  })

  abstracts <- vapply(abstract_sections, collapse_abstract_sections,
                      character(1), USE.NAMES = FALSE)

  if (length(abstracts) > 0) {
    data[row, paste0("ABSTRACT_", seq_along(abstracts))] <- abstracts
    cat(length(abstracts), " abstracts for PMIDs of row ", row, " are added in the table.", "\n")
  }

  data
}
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 125 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 126 | 
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
# Main loop: for every row collect its PMIDs, fetch the abstracts and add them
# to the table. seq_len() keeps the loop from running on an empty table.
for (row in seq_len(nrow(data))) {
  # unlist() flattens the one-row slice so that unique() de-duplicates the
  # PMID values; unique() on a data.frame only removes duplicated rows.
  PMIDs <- unlist(data[row, pmids_cols_index, drop = FALSE], use.names = FALSE)
  PMIDs <- unique(as.character(PMIDs))
  # Drop missing values, literal "NA" strings and blank cells.
  PMIDs <- PMIDs[!is.na(PMIDs) & PMIDs != "NA" & nzchar(trimws(PMIDs))]

  if (length(PMIDs) > 0) {
    data <- tryCatch(
      fetch_abstracts(PMIDs, row),
      error = function(e) {
        message("Could not fetch abstracts for row ", row, ": ",
                conditionMessage(e))
        Sys.sleep(3)
        # Keep the table as it is. Returning the value of Sys.sleep() (NULL)
        # here would overwrite `data` and lose every row collected so far.
        data
      }
    )
  } else {
    print(paste("No PMIDs in row", row))
  }
}

# Write the table, including the ABSTRACT_n columns, as a tab-delimited file.
write.table(data, args$output, sep = "\t", row.names = FALSE, col.names = TRUE)
| 
34ed44f3f85c
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 dlalgroup parents: diff
changeset | 142 | 
