Mercurial > repos > dlalgroup > pmids_to_pubtator_matrix
annotate pubmed_by_queries.R @ 0:3f4adc85ba5d draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
author | dlalgroup |
---|---|
date | Thu, 24 Sep 2020 02:01:50 +0000 |
parents | |
children |
rev | line source |
---|---|
0
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
1 #!/usr/bin/env Rscript |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
2 #tool: pubmed_by_queries |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
3 # |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
4 #This tool uses a set of search queries to download a defined number of abstracts or PMIDs per search query from PubMed. |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
5 #PubMed's search rules and syntax apply. |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
6 # |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
7 #Input: Tab-delimited table with search queries in a column starting with "ID_", e.g. "ID_gene" if search queries are genes. |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
8 # |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
9 # Output: Input table with additional columns with PMIDs or abstracts (--abstract) from PubMed. |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
10 # |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
11 #Usage: $ pubmed_by_queries.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-a] [-k KEY] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
12 # |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
13 # optional arguments: |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
14 # -h, --help show this help message and exit |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
15 # -i INPUT, --input INPUT input file name. add path if file is not in working directory |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
16 # -o OUTPUT, --output OUTPUT output file name. [default "pubmed_by_queries_output"] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
17 # -n NUMBER, --number NUMBER number of PMIDs or abstracts to save per ID [default "5"] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
18 # -a, --abstract if abstracts instead of PMIDs should be retrieved use --abstract |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
19 # -k KEY, --key KEY if NCBI API key is available, add it to speed up the fetching of pubmed data |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
20 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
21 if ( '--install_packages' %in% commandArgs()) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
22 print('Installing packages') |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
23 if (!require('argparse')) install.packages('argparse',repo="http://cran.rstudio.com/"); |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
24 if (!require('easyPubMed')) install.packages('easyPubMed',repo="http://cran.rstudio.com/"); |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
25 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
26 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
27 suppressPackageStartupMessages(library("argparse")) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
28 suppressPackageStartupMessages(library("easyPubMed")) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
29 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
30 parser <- ArgumentParser() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
31 parser$add_argument("-i", "--input", |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
32 help = "input file name. add path if file is not in working directory") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
33 parser$add_argument("-o", "--output", default="pubmed_by_queries_output", |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
34 help = "output file name. [default \"%(default)s\"]") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
35 parser$add_argument("-n", "--number", type="integer", default=5, |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
36 help="Number of PMIDs (or abstracts) to save per ID. [default \"%(default)s\"]") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
37 parser$add_argument("-a", "--abstract", action="store_true", default=FALSE, |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
38 help="if abstracts instead of PMIDs should be retrieved use --abstract") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
39 parser$add_argument("-k", "--key", type="character", |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
40 help="if ncbi API key is available, add it to speed up the download of pubmed data") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
41 parser$add_argument("--install_packages", action="store_true", default=FALSE, |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
42 help="If you want to auto install missing required packages.") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
43 args <- parser$parse_args() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
44 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
45 MAX_WEB_TRIES = 100 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
46 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
47 data = read.delim(args$input, stringsAsFactors=FALSE) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
48 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
49 id_col_index <- grep("ID_", names(data)) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
50 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
51 pubmed_data_in_table <- function(data, row, query, number, key, abstract){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
52 if (is.null(query)){print(data)} |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
53 pubmed_search <- get_pubmed_ids(query, api_key = key) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
54 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
55 if(as.numeric(pubmed_search$Count) == 0){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
56 cat("No PubMed result for the following query: ", query, "\n") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
57 return(data) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
58 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
59 } else if (abstract == FALSE) { # fetch PMIDs |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
60 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
61 myPubmedURL <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?", |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
62 "db=pubmed&retmax=", number, "&term=", pubmed_search$OriginalQuery, "&usehistory=n", sep = "") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
63 # get ids |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
64 idXML <- c() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
65 for (i in 1:MAX_WEB_TRIES){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
66 tryCatch({ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
67 IDconnect <- suppressWarnings(url(myPubmedURL, open = "rb", encoding = "UTF8")) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
68 idXML <- suppressWarnings(readLines(IDconnect, warn = FALSE, encoding = "UTF8")) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
69 suppressWarnings(close(IDconnect)) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
70 break |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
71 }, error = function(e) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
72 print(paste('Error getting URL, sleeping',2*i,'seconds.')) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
73 print(e) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
74 Sys.sleep(time = 2*i) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
75 }) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
76 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
77 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
78 PMIDs = c() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
79 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
80 for (i in 1:length(idXML)) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
81 if (grepl("^<Id>", idXML[i])) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
82 pmid <- custom_grep(idXML[i], tag = "Id", format = "char") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
83 PMIDs <- c(PMIDs, as.character(pmid[1])) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
84 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
85 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
86 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
87 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
88 if(length(PMIDs)>0){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
89 data[row,sapply(1:length(PMIDs),function(i){paste0("PMID_",i)})] <- PMIDs |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
90 cat(length(PMIDs)," PMIDs for ",query, " are added in the table.", "\n") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
91 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
92 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
93 return(data) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
94 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
95 } else if (abstract == TRUE) { # fetch abstracts and title text |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
96 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
97 efetch_url = paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?", |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
98 "db=pubmed&WebEnv=", pubmed_search$WebEnv, "&query_key=", pubmed_search$QueryKey, |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
99 "&retstart=", 0, "&retmax=", number, |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
100 "&rettype=", "null","&retmode=", "xml", sep = "") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
101 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
102 api_key <- pubmed_search$APIkey |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
103 if (!is.null(api_key)) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
104 efetch_url <- paste(efetch_url, "&api_key=", api_key, sep = "") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
105 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
106 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
107 # initialize |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
108 out.data <- NULL |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
109 try_num <- 1 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
110 t_0 <- Sys.time() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
111 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
112 # Try to fetch results |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
113 while(is.null(out.data)) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
114 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
115 # Timing check: kill at 3 min |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
116 if (try_num > 1){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
117 Sys.sleep(time = 2*try_num) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
118 cat("Problem to receive PubMed data or error is received. Please wait. Try number:",try_num,"\n") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
119 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
120 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
121 t_1 <- Sys.time() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
122 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
123 if(as.numeric(difftime(t_1, t_0, units = "mins")) > 3){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
124 message("Killing the request! Something is not working. Please, try again later","\n") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
125 return(data) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
126 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
127 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
128 # ENTREZ server connect |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
129 out.data <- tryCatch({ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
130 tmpConnect <- suppressWarnings(url(efetch_url, open = "rb", encoding = "UTF8")) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
131 suppressWarnings(readLines(tmpConnect, warn = FALSE, encoding = "UTF8")) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
132 }, error = function(e) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
133 print(e) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
134 }, finally = { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
135 try(suppressWarnings(close(tmpConnect)), silent = TRUE) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
136 }) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
137 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
138 # Check if error |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
139 if (!is.null(out.data) && |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
140 class(out.data) == "character" && |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
141 grepl("<ERROR>", substr(paste(utils::head(out.data, n = 100), collapse = ""), 1, 250))) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
142 out.data <- NULL |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
143 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
144 try_num <- try_num + 1 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
145 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
146 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
147 if (is.null(out.data)) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
148 message("Killing the request! Something is not working. Please, try again later","\n") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
149 return(data) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
150 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
151 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
152 # process xml data |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
153 xml_data <- paste(out.data, collapse = "") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
154 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
155 # articles to list |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
156 xml_data <- strsplit(xml_data, "<PubmedArticle(>|[[:space:]]+?.*>)")[[1]][-1] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
157 xml_data <- sapply(xml_data, function(x) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
158 #trim extra stuff at the end of the record |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
159 if (!grepl("</PubmedArticle>$", x)) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
160 x <- sub("(^.*</PubmedArticle>).*$", "\\1", x) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
161 # Rebuild XML structure and proceed |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
162 x <- paste("<PubmedArticle>", x) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
163 gsub("[[:space:]]{2,}", " ", x)}, |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
164 USE.NAMES = FALSE, simplify = TRUE) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
165 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
166 #titles |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
167 titles = sapply(xml_data, function(x){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
168 x = custom_grep(x, tag="ArticleTitle", format="char") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
169 x <- gsub("</{0,1}i>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
170 x <- gsub("</{0,1}b>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
171 x <- gsub("</{0,1}sub>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
172 x <- gsub("</{0,1}exp>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
173 if (length(x) > 1){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
174 x <- paste(x, collapse = " ", sep = " ") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
175 } else if (length(x) < 1) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
176 x <- NA |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
177 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
178 x |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
179 }, |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
180 USE.NAMES = FALSE, simplify = TRUE) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
181 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
182 # abstracts |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
183 abstract.text = sapply(xml_data, function(x){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
184 custom_grep(x, tag="AbstractText", format="char")}, |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
185 USE.NAMES = FALSE, simplify = TRUE) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
186 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
187 abstracts <- sapply(abstract.text, function(x){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
188 if (length(x) > 1){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
189 x <- paste(x, collapse = " ", sep = " ") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
190 x <- gsub("</{0,1}i>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
191 x <- gsub("</{0,1}b>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
192 x <- gsub("</{0,1}sub>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
193 x <- gsub("</{0,1}exp>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
194 } else if (length(x) < 1) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
195 x <- NA |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
196 } else { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
197 x <- gsub("</{0,1}i>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
198 x <- gsub("</{0,1}b>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
199 x <- gsub("</{0,1}sub>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
200 x <- gsub("</{0,1}exp>", "", x, ignore.case = T) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
201 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
202 x |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
203 }, |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
204 USE.NAMES = FALSE, simplify = TRUE) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
205 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
206 #add title to abstracts |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
207 if (length(titles) == length(abstracts)){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
208 abstracts = paste(titles, abstracts) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
209 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
210 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
211 #add abstracts to data frame |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
212 if(length(abstracts)>0){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
213 data[row,sapply(1:length(abstracts),function(i){paste0("ABSTRACT_",i)})] <- abstracts |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
214 cat(length(abstracts)," abstracts for ",query, " are added in the table.", "\n") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
215 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
216 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
217 return(data) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
218 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
219 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
220 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
# Main driver: run the PubMed lookup for every row of `data` and write the
# resulting table to the output file.
#
# pubmed_data_in_table() returns the updated table for one query row. If it
# raises an error, the handler logs it, waits briefly, and returns the table
# unchanged. The handler must return `data`: its value is what gets assigned,
# and ending on Sys.sleep() would assign NULL, discarding everything
# collected so far.
#
# seq_len() is used so that a table with zero rows results in zero iterations
# (1:nrow(data) would iterate over c(1, 0)).
for (i in seq_len(nrow(data))) {
  data <- tryCatch(
    pubmed_data_in_table(data = data,
                         row = i,
                         query = data[i, id_col_index],
                         number = args$number,
                         key = args$key,
                         abstract = args$abstract),
    error = function(e) {
      print('main error')
      print(e)
      # Pause before moving on to the next query.
      Sys.sleep(5)
      # Keep the table as it was before the failed call.
      data
    }
  )
}

# Write the table as tab-separated text with a header row and no row names.
write.table(data, args$output, append = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
235 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
236 |