Mercurial > repos > dlalgroup > pubmed_by_queries
annotate pmids_to_pubtator_matrix.R @ 0:f40606281050 draft
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
| author | dlalgroup | 
|---|---|
| date | Thu, 24 Sep 2020 03:01:43 +0000 | 
| parents | |
| children | 
| rev | line source | 
|---|---|
| 
0
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
1 #!/usr/bin/env Rscript | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
2 #tool: pmids_to_pubtator_matrix | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
3 # | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
4 #The tool uses all PMIDs per row and extracts "Gene", "Disease", "Mutation", "Chemical" and "Species" terms of the | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
5 #corresponding abstracts, using PubTator annotations. The user can choose from which categories terms should be extracted. | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
6 #The extracted terms are united in one large binary matrix, with 0= term not present in abstracts of that row and 1= term | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
7 #present in abstracts of that row. The user can decide if the scientific terms should be extracted and used as | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
8 #they are or if they should be grouped by their geneIDs/ meshIDs (several terms can often be grouped into one ID). | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
9 #Also, by default all terms are extracted, otherwise the user can specify a number of most frequent words to be extracted per row. | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
10 # | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
11 #Input: Output of abstracts_by_pmids or tab-delimited table with columns containing PMIDs. | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
12 #The names of the PMID columns should start with "PMID", e.g. "PMID_1", "PMID_2" etc. | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
13 # | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
14 #Output: Binary matrix in which each column represents one of the extracted terms. | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
15 # | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
16 # usage: $ pmids_to_pubtator_matrix.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
17 # [-c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...]] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
18 # | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
19 # optional arguments: | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
20 # -h, --help show help message | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
21 # -i INPUT, --input INPUT input file name. add path if file is not in working directory | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
22 # -n NUMBER, --number NUMBER Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted. | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
23 # -o OUTPUT, --output OUTPUT output file name. [default "pmids_to_pubtator_matrix_output"] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
24 # -c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...], --categories {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
25 # Pubtator categories that should be considered. [default "('Gene', 'Disease', 'Mutation','Chemical')"] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
26 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
27 if ( '--install_packages' %in% commandArgs()) { | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
28 print('Installing packages') | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
29 if (!require('argparse')) install.packages('argparse',repo="http://cran.rstudio.com/"); | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
30 if (!require('stringr')) install.packages('stringr',repo="http://cran.rstudio.com/"); | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
31 if (!require('RCurl')) install.packages('RCurl',repo="http://cran.rstudio.com/"); | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
32 if (!require('stringi')) install.packages('stringi',repo="http://cran.rstudio.com/"); | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
33 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
34 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
35 suppressPackageStartupMessages(library("argparse")) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
36 library('stringr') | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
37 library('stringi') | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
38 library('RCurl') | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
39 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
40 parser <- ArgumentParser() | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
41 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
42 parser$add_argument("-i", "--input", | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
43 help = "input fie name. add path if file is not in workind directory") | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
44 parser$add_argument("-o", "--output", default="pmids_to_pubtator_matrix_output", | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
45 help = "output file name. [default \"%(default)s\"]") | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
46 parser$add_argument("-c", "--categories", choices=c("Gene", "Disease", "Mutation", "Chemical", "Species"), nargs="+", | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
47 default= c("Gene", "Disease", "Mutation", "Chemical"), | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
48 help = "Pubtator categories that should be considered. [default \"%(default)s\"]") | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
49 parser$add_argument("-b", "--byid", action="store_true", default=FALSE, | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
50 help="If you want to find common gene IDs / mesh IDs instead of scientific terms.") | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
51 parser$add_argument("-n", "--number", default=NULL, type="integer", | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
52 help="Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted.") | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
53 parser$add_argument("--install_packages", action="store_true", default=FALSE, | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
54 help="If you want to auto install missing required packages.") | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
55 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
56 args <- parser$parse_args() | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
57 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
58 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
59 data = read.delim(args$input, stringsAsFactors=FALSE, header = TRUE, sep='\t') | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
60 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
61 pmid_cols_index <- grep(c("PMID"), names(data)) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
62 word_matrix = data.frame() | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
63 dict.table = data.frame() | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
64 pmids_count <- 0 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
65 pubtator_max_ids = 100 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
66 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
67 get_pubtator_terms = function(pmids, categories){ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
68 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
69 table = NULL | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
70 for (pmid_split in split(pmids, ceiling(seq_along(pmids)/pubtator_max_ids))){ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
71 out.data = NULL | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
72 try_num <- 1 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
73 t_0 <- Sys.time() | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
74 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
75 while(TRUE) { | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
76 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
77 # Timing check: kill at 3 min | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
78 if (try_num > 1){ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
79 cat("Connection problem. Please wait. Try number:",try_num,"\n") | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
80 Sys.sleep(time = 2*try_num) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
81 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
82 try_num <- try_num + 1 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
83 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
84 t_1 <- Sys.time() | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
85 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
86 if(as.numeric(difftime(t_1, t_0, units = "mins")) > 3){ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
87 message("Killing the request! Something is not working. Please, try again later","\n") | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
88 return(table) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
89 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
90 out.data <- tryCatch({ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
91 getURL(paste("https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids=", | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
92 paste(pmid_split, collapse=","), sep = "")) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
93 }, error = function(e) { | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
94 print(e) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
95 next | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
96 }, finally = { | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
97 Sys.sleep(0) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
98 }) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
99 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
100 if(!is.null(out.data)){ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
101 out.data = unlist(strsplit(out.data, "\n", fixed = T)) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
102 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
103 # skip first few lines, is this needed? | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
104 for (i in 3:length(out.data)) { | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
105 temps = unlist(strsplit(out.data[i], "\t", fixed = T)) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
106 if (length(temps) == 5) { | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
107 # make 5 be 6 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
108 temps = c(temps, NA) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
109 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
110 if (length(temps) == 6) { | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
111 # keep only 6 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
112 table = rbind(table, temps) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
113 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
114 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
115 break | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
116 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
117 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
118 } #end while loop | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
119 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
120 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
121 index.categories = c() | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
122 categories = as.character(unlist(categories)) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
123 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
124 if(ncol(table) == 6){ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
125 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
126 for(i in categories){ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
127 tmp.index = grep(TRUE, i == as.character(table[,5])) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
128 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
129 if(length(tmp.index) > 0){ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
130 index.categories = c(index.categories,tmp.index) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
131 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
132 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
133 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
134 table = as.data.frame(table, stringsAsFactors=FALSE) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
135 table = table[index.categories,c(4,6)] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
136 table = table[!is.na(table[,2]),] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
137 table = table[!(table[,2] == "NA"),] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
138 table = table[!(table[,1] == "NA"),] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
139 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
140 if(args$byid){ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
141 if(!is.null(args$number)){ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
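142 #retrieve top X mesh.ids -- NOTE(review): the two statements after colnames() reorder and truncate `table` with row positions computed from `table.mesh`; `table.mesh` itself is never sorted or truncated, so the %in% filter below keeps every id. Presumably `table.mesh` was meant to be ordered and cut to the top X -- confirm. | 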
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
143 table.mesh = as.data.frame(table(table[,2])) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
144 colnames(table.mesh)[1] = "mesh.id" | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
145 table = table[order(table.mesh$Freq, decreasing = TRUE),] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
146 table = table[1:min(args$number, nrow(table.mesh)),] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
147 table.mesh$mesh.id = as.character(table.mesh$mesh.id) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
148 #subset table for top X mesh.ids | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
149 table = table[which(as.character(table$V6) %in% as.character(table.mesh$mesh.id)),] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
150 table = table[!duplicated(table[,2]),] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
151 }else{ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
152 table = table[!duplicated(table[,2]),] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
153 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
154 } else { | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
155 if(!is.null(args$number)){ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
156 table[,1] = tolower(as.character(table[,1])) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
157 table = as.data.frame(table(table[,1])) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
158 colnames(table)[1] = "term" | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
159 table = table[order(table$Freq, decreasing = TRUE),] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
160 table = table[1:min(args$number, nrow(table)),] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
161 table$term = as.character(table$term) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
162 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
163 }else{ | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
164 table[,1] = tolower(as.character(table[,1])) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
165 table = table[!duplicated(table[,1]),] | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
166 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
167 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
168 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
169 return(table) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
170 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
171 } else { | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
172 return(NULL) | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
173 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
174 } | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
175 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
176 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
#For every row of the input, collect the PubTator terms of its PMIDs and flag them in the matrix.
#Uses objects created earlier in the script: data, pmid_cols_index, pmids_count, word_matrix,
#dict.table, args and get_pubtator_terms().
#seq_len() iterates zero times for an empty input, whereas 1:nrow(data) would visit rows 1 and 0.
for (i in seq_len(nrow(data))){

  pmids = as.character(data[i,pmid_cols_index])
  #drop missing PMIDs, whether they arrive as real NA values or as the string "NA"
  #(a bare `pmids == "NA"` comparison yields NA for real NAs and would keep them)
  pmids = pmids[!is.na(pmids) & pmids != "NA"]

  #pause once more than 10000 PMIDs have been requested since the last break
  if (pmids_count > 10000){
    cat("Break (10s) to avoid killing of requests. Please wait.",'\n')
    Sys.sleep(10)
    pmids_count = 0
  }

  pmids_count = pmids_count + length(pmids)

  #get pubtator terms with get_pubtator_terms function
  if (length(pmids) > 0){
    table = get_pubtator_terms(pmids, args$categories)

    if (!is.null(table)){

      colnames(table) = c("term","mesh.id")

      # add data in binary matrix
      if (args$byid){
        #columns are keyed by ID here; they are renamed to terms after the loop
        mesh.ids = as.character(table$mesh.id)
        if (length(mesh.ids) > 0){
          word_matrix[i,mesh.ids] <- 1
          cat(length(mesh.ids), " IDs for PMIDs of row", i," were added",'\n')
          # add data in dictionary (one entry per ID, first term seen wins)
          dict.table = rbind(dict.table, table)
          dict.table = dict.table[!duplicated(as.character(dict.table[,2])),]
        }
      } else {
        terms = as.character(table[,1])
        if (length(terms) > 0){
          word_matrix[i,terms] <- 1
          cat(length(terms), " terms for PMIDs of row", i," were added.",'\n')
        }
      }
    }

  } else {
    #reached when the row has no usable PMIDs
    cat("No terms for PMIDs of row", i," were found.",'\n')
  }
}
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
223 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
#Post-process the matrix filled by the loop and write it to args$output.
if (args$byid){
  #change column names of matrix: exchange mesh ids/ids with term
  index_names = match(names(word_matrix), as.character(dict.table[[2]]))
  names(word_matrix) = dict.table[index_names,1]
}

#strip non-printable characters and double quotes from the column names
colnames(word_matrix) = gsub("[^[:print:]]","",colnames(word_matrix))
colnames(word_matrix) = gsub('\"', "", colnames(word_matrix), fixed = TRUE)

#merge duplicated columns
#na.rm = TRUE: a cell that is NA in one duplicate must not wipe out a 1 held by
#another duplicate of the same name (the NA sum would be turned into 0 below)
word_matrix = as.data.frame(do.call(cbind, by(t(word_matrix),
                                              INDICES=names(word_matrix),
                                              FUN=function(block) colSums(block, na.rm = TRUE))))

#save binary matrix
word_matrix <- as.matrix(word_matrix)
word_matrix[is.na(word_matrix)] <- 0
#merged duplicates can sum to more than 1; clamp so the matrix stays binary (0/1)
word_matrix[word_matrix > 1] <- 1
cat("Matrix with ",nrow(word_matrix)," rows and ",ncol(word_matrix)," columns generated.","\n")
write.table(word_matrix, args$output, row.names = FALSE, sep = '\t')
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
242 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
243 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
244 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
245 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
246 | 
| 
 
f40606281050
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
 
dlalgroup 
parents:  
diff
changeset
 | 
247 | 
